diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 69192299a3f..00000000000 --- a/.flake8 +++ /dev/null @@ -1,6 +0,0 @@ -[flake8] -# Ignore some errors, since we autoformat them away already wherever possible -# from https://github.com/psf/black/blob/main/.flake8 -# E302 is ignored to support jupytext files -ignore = E203, E266, E501, W503, E302 -exclude = .ipynb_checkpoints,*_cookiecutter,cookiecutter,etl/steps/archive,etl/snapshots/archive diff --git a/.vscode/settings.json b/.vscode/settings.json index 666dc3fa9a5..cb06b055745 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -37,17 +37,11 @@ "**/docs/architecture/*.md" ], "files.exclude": { - "etl/steps/archive": true, - "snapshots/archive": true, "**/dataset_*_config.json": true, "**/dataset_*_values.json": true, "**/dataset_*.json.dvc": true, "**/dataset_*.feather.dvc": true }, - "search.exclude": { - "etl/steps/archive": true, - "snapshots/archive": true - }, "yaml.format.printWidth": 999, "ruff.path": [ ".venv/bin/ruff" diff --git a/apps/wizard/app_pages/chart_diff/chart_diff.py b/apps/wizard/app_pages/chart_diff/chart_diff.py index af74d1fe8f2..3ed8b71f3a0 100644 --- a/apps/wizard/app_pages/chart_diff/chart_diff.py +++ b/apps/wizard/app_pages/chart_diff/chart_diff.py @@ -1,5 +1,6 @@ import datetime as dt import difflib +import json import pprint from typing import Any, Dict, List, Optional @@ -653,6 +654,7 @@ def _modified_chart_configs_on_staging( select c.id as chartId, MD5(cc.full) as chartChecksum, + cc.full as chartConfig, c.lastEditedByUserId as chartLastEditedByUserId, c.publishedByUserId as chartPublishedByUserId, c.lastEditedAt as chartLastEditedAt @@ -699,6 +701,20 @@ def _modified_chart_configs_on_staging( diff = source_df.copy() diff["configEdited"] = source_df["chartChecksum"] != target_df["chartChecksum"] + # Go through edited configs and do a more detailed comparison + ix = diff["configEdited"] & target_df["chartChecksum"].notnull() + equal_configs = [] + for chart_id, row in diff.loc[ix].iterrows(): + source_config = json.loads(row["chartConfig"]) + target_config = json.loads(target_df.loc[chart_id, "chartConfig"]) + + # Compare configs + if configs_are_equal(source_config, target_config): + equal_configs.append(chart_id) + + # Exclude configs that have different chartChecksum, but are actually the same (e.g. 
have just different version) + diff = diff[~diff.index.isin(equal_configs)] + # Add flag 'edited in staging' diff["chartEditedInStaging"] = True diff --git a/dag/archive/artificial_intelligence.yml b/dag/archive/artificial_intelligence.yml index f15060fc02a..a69ed71a8d4 100644 --- a/dag/archive/artificial_intelligence.yml +++ b/dag/archive/artificial_intelligence.yml @@ -1,6 +1,7 @@ steps: ############################################################################################################## # EPOCH archive (monthly updates) + # Artificial Intelligence (EPOCH) data://meadow/artificial_intelligence/latest/epoch: - snapshot://artificial_intelligence/latest/epoch.csv @@ -236,6 +237,53 @@ steps: data://grapher/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain: - data://garden/artificial_intelligence/2024-10-01/epoch_compute_intensive_domain + # Main EPOCH dataset + data://meadow/artificial_intelligence/2024-11-03/epoch: + - snapshot://artificial_intelligence/2024-11-03/epoch.csv + data://garden/artificial_intelligence/2024-11-03/epoch: + - data://meadow/artificial_intelligence/2024-11-03/epoch + data://grapher/artificial_intelligence/2024-11-03/epoch: + - data://garden/artificial_intelligence/2024-11-03/epoch + + # Main EPOCH dataset regression lines + data://garden/artificial_intelligence/2024-11-03/epoch_regressions: + - data://garden/artificial_intelligence/2024-11-03/epoch + data://grapher/artificial_intelligence/2024-11-03/epoch_regressions: + - data://garden/artificial_intelligence/2024-11-03/epoch_regressions + + # EPOCH aggregates by domain + data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain: + - data://meadow/artificial_intelligence/2024-11-03/epoch + data://grapher/artificial_intelligence/2024-11-03/epoch_aggregates_domain: + - data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain + + # EPOCH aggregates by researcher affiliaiton + data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-11-03/epoch + data://grapher/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation + + # EPOCH dataset on Compute Intensive AI + data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive: + - snapshot://artificial_intelligence/2024-11-03/epoch_compute_intensive.csv + data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive: + - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive + + + # EPOCH dataset on Compute Intensive AI, aggregates by country + data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries: + - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries: + - data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries + + # EPOCH dataset on Compute Intensive AI, aggregates by domain + data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain: + - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain: + - data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain + + + ############################################################################################################## # AI Incidents diff --git 
a/dag/artificial_intelligence.yml b/dag/artificial_intelligence.yml index a18f9550a83..868a40ad852 100644 --- a/dag/artificial_intelligence.yml +++ b/dag/artificial_intelligence.yml @@ -1,49 +1,49 @@ steps: ########### UPDATED MONTHLY ############################################################################# # Main EPOCH dataset - data://meadow/artificial_intelligence/2024-11-03/epoch: - - snapshot://artificial_intelligence/2024-11-03/epoch.csv - data://garden/artificial_intelligence/2024-11-03/epoch: - - data://meadow/artificial_intelligence/2024-11-03/epoch - data://grapher/artificial_intelligence/2024-11-03/epoch: - - data://garden/artificial_intelligence/2024-11-03/epoch + data://meadow/artificial_intelligence/2024-12-05/epoch: + - snapshot://artificial_intelligence/2024-12-05/epoch.csv + data://garden/artificial_intelligence/2024-12-05/epoch: + - data://meadow/artificial_intelligence/2024-12-05/epoch + data://grapher/artificial_intelligence/2024-12-05/epoch: + - data://garden/artificial_intelligence/2024-12-05/epoch # Main EPOCH dataset regression lines - data://garden/artificial_intelligence/2024-11-03/epoch_regressions: - - data://garden/artificial_intelligence/2024-11-03/epoch - data://grapher/artificial_intelligence/2024-11-03/epoch_regressions: - - data://garden/artificial_intelligence/2024-11-03/epoch_regressions + data://garden/artificial_intelligence/2024-12-05/epoch_regressions: + - data://garden/artificial_intelligence/2024-12-05/epoch + data://grapher/artificial_intelligence/2024-12-05/epoch_regressions: + - data://garden/artificial_intelligence/2024-12-05/epoch_regressions # EPOCH aggregates by domain - data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain: - - data://meadow/artificial_intelligence/2024-11-03/epoch - data://grapher/artificial_intelligence/2024-11-03/epoch_aggregates_domain: - - data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_domain + data://garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain: + - data://meadow/artificial_intelligence/2024-12-05/epoch + data://grapher/artificial_intelligence/2024-12-05/epoch_aggregates_domain: + - data://garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain # EPOCH aggregates by researcher affiliaiton - data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation: - - data://garden/artificial_intelligence/2024-11-03/epoch - data://grapher/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation: - - data://garden/artificial_intelligence/2024-11-03/epoch_aggregates_affiliation + data://garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-12-05/epoch + data://grapher/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation: + - data://garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation # EPOCH dataset on Compute Intensive AI - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive: - - snapshot://artificial_intelligence/2024-11-03/epoch_compute_intensive.csv - data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive: - - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive + data://meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive: + - snapshot://artificial_intelligence/2024-12-05/epoch_compute_intensive.csv + data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive: + - data://meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive # EPOCH dataset on 
Compute Intensive AI, aggregates by country - data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries: - - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive - data://grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries: - - data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_countries + data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries: + - data://meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries: + - data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries # EPOCH dataset on Compute Intensive AI, aggregates by domain - data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain: - - data://meadow/artificial_intelligence/2024-11-03/epoch_compute_intensive - data://grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain: - - data://garden/artificial_intelligence/2024-11-03/epoch_compute_intensive_domain + data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain: + - data://meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive + data://grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain: + - data://garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain ############### OTHERS ##################################################################################### diff --git a/dag/demography.yml b/dag/demography.yml index 7068ccce776..f4b1aab0583 100644 --- a/dag/demography.yml +++ b/dag/demography.yml @@ -117,9 +117,9 @@ steps: data://garden/ggdc/2024-01-19/maddison_federico_paper: - data://meadow/ggdc/2024-01-19/maddison_federico_paper - # UN WPP experiments + # UN WPP largest age-group per country data://garden/un/2024-03-14/un_wpp_most: - - data://garden/un/2022-07-11/un_wpp + - data://garden/un/2024-07-12/un_wpp data://grapher/un/2024-03-14/un_wpp_most: - data://garden/un/2024-03-14/un_wpp_most diff --git a/dag/fasttrack.yml b/dag/fasttrack.yml index a8f4ae3be6a..2c0f8c90876 100644 --- a/dag/fasttrack.yml +++ b/dag/fasttrack.yml @@ -74,8 +74,6 @@ steps: - snapshot-private://fasttrack/latest/pain_hours_hen_systems.csv data-private://grapher/fasttrack/latest/antibiotic_usage_livestock: - snapshot-private://fasttrack/latest/antibiotic_usage_livestock.csv - data-private://grapher/fasttrack/latest/antimicrobial_usage_livestock: - - snapshot-private://fasttrack/latest/antimicrobial_usage_livestock.csv data://grapher/fasttrack/2023-08-07/pain_hours_days_hen_systems: - snapshot://fasttrack/2023-08-07/pain_hours_days_hen_systems.csv data-private://grapher/fasttrack/latest/historical_france_mortality_cause: @@ -240,3 +238,5 @@ steps: - snapshot://fasttrack/latest/useful_energy_cost_way.csv data://grapher/fasttrack/2023-06-19/world_population_comparison: - snapshot://fasttrack/2023-06-19/world_population_comparison.csv + data://grapher/fasttrack/latest/antimicrobial_usage_livestock: + - snapshot://fasttrack/latest/antimicrobial_usage_livestock.csv diff --git a/dag/urbanization.yml b/dag/urbanization.yml index c4c4fdc49f7..ecff7c1f0a8 100644 --- a/dag/urbanization.yml +++ b/dag/urbanization.yml @@ -45,17 +45,6 @@ steps: data://grapher/un/2024-01-17/urban_agglomerations_definition_count: - data://garden/un/2024-01-17/urban_agglomerations_definition_count # - # GHSL degree of urbanization. 
- # - data://meadow/urbanization/2024-01-26/ghsl_degree_of_urbanisation: - - snapshot://urbanization/2024-01-26/ghsl_degree_of_urbanisation.zip - data://garden/urbanization/2024-01-26/ghsl_degree_of_urbanisation: - - data://meadow/urbanization/2024-01-26/ghsl_degree_of_urbanisation - - data://garden/wb/2023-04-30/income_groups - - data://garden/regions/2023-01-01/regions - data://grapher/urbanization/2024-01-26/ghsl_degree_of_urbanisation: - - data://garden/urbanization/2024-01-26/ghsl_degree_of_urbanisation - # # UN SDG indicators related to urbanization. # data://meadow/un/2024-02-14/sdgs_urbanization: @@ -76,3 +65,13 @@ steps: - data://garden/regions/2023-01-01/regions data://grapher/urbanization/2024-10-14/ghsl_degree_of_urbanisation: - data://garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation + + # GHSL urban centers. + data://meadow/urbanization/2024-12-02/ghsl_urban_centers: + - snapshot://urbanization/2024-12-02/ghsl_urban_centers.xlsx + data://garden/urbanization/2024-12-02/ghsl_urban_centers: + - data://meadow/urbanization/2024-12-02/ghsl_urban_centers + - data://garden/wb/2024-07-29/income_groups + - data://garden/regions/2023-01-01/regions + data://grapher/urbanization/2024-12-02/ghsl_urban_centers: + - data://garden/urbanization/2024-12-02/ghsl_urban_centers diff --git a/etl/paths.py b/etl/paths.py index f5a52647586..f55d81d3c71 100644 --- a/etl/paths.py +++ b/etl/paths.py @@ -23,7 +23,6 @@ # Snapshots SNAPSHOTS_DIR = BASE_DIR / "snapshots" -SNAPSHOTS_DIR_ARCHIVE = BASE_DIR / "snapshots_archive" # ETL library ETL_DIR = BASE_DIR / "etl" @@ -32,7 +31,6 @@ STEPS_MEADOW_DIR = STEPS_DATA_DIR / "meadow" STEPS_GARDEN_DIR = STEPS_DATA_DIR / "garden" STEPS_GRAPHER_DIR = STEPS_DATA_DIR / "grapher" -STEP_DIR_ARCHIVE = STEP_DIR / "archive" # Apps APPS_DIR = BASE_DIR / "apps" diff --git a/etl/snapshot.py b/etl/snapshot.py index 5e69867ea47..6c6a83ebdc2 100644 --- a/etl/snapshot.py +++ b/etl/snapshot.py @@ -60,11 +60,7 @@ def path(self) -> Path: @property def metadata_path(self) -> Path: """Path to metadata file.""" - archive_path = Path(f"{paths.SNAPSHOTS_DIR_ARCHIVE / self.uri}.dvc") - if archive_path.exists(): - return archive_path - else: - return Path(f"{paths.SNAPSHOTS_DIR / self.uri}.dvc") + return Path(f"{paths.SNAPSHOTS_DIR / self.uri}.dvc") def _download_dvc_file(self, md5: str) -> None: """Download file from remote to self.path.""" diff --git a/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.py b/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.py index 379ae3e3c4b..591d82b7d91 100644 --- a/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.py +++ b/etl/steps/data/garden/antibiotics/2024-11-15/testing_coverage.py @@ -11,7 +11,7 @@ WHO_REGION_MEMBERS = { "African Region (WHO)": 47, "World": 194, - "Eastern Mediterranean (WHO)": 21, + "Eastern Mediterranean (WHO)": 22, "European Region (WHO)": 53, "Region of the Americas (WHO)": 35, "South-East Asia Region (WHO)": 11, diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.meta.yml new file mode 100644 index 00000000000..c4764e0418e --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.meta.yml @@ -0,0 +1,98 @@ +# NOTE: To learn more about the fields, hover over their names. 
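For context on the etl/snapshot.py change above: with the archive fallback removed, every snapshot's DVC metadata file is expected to live under snapshots/. A minimal illustration of where metadata_path now resolves, using plain pathlib and a URI taken from the DAG above (BASE_DIR here stands in for the repository root and is an assumption, not the real constant):

```python
from pathlib import Path

BASE_DIR = Path(".")  # assumption: the repository root
SNAPSHOTS_DIR = BASE_DIR / "snapshots"

# Snapshot URI from the DAG above; mirrors the simplified metadata_path property.
uri = "artificial_intelligence/2024-12-05/epoch.csv"
metadata_path = Path(f"{SNAPSHOTS_DIR / uri}.dvc")
print(metadata_path)  # snapshots/artificial_intelligence/2024-12-05/epoch.csv.dvc
```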
+definitions: + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch: + variables: + domain: + title: Domain + unit: '' + short_unit: '' + description_short: Refers to the specific area, application, or field in which an AI model is designed to operate. + description_processing: |- + In cases where multiple domains were associated with a model, we consolidated these entries under the label "Multiple domains". We also identified domains associated with fewer than 20 notable systems and grouped these under the category 'Other'. + display: + zeroDay: '1949-01-01' + yearIsDay: true + + organization_categorization: + title: Researcher affiliation + unit: '' + short_unit: '' + description_short: Describes the sector where the authors of an AI model have their primary affiliations. + description_from_producer: |- + Systems are categorized as “Industry” if their authors are affiliated with private sector organizations, “Academia” if the authors are affiliated with universities or academic institutions, or “Industry - Academia Collaboration” when at least 30% of the authors are from each. + + parameters: + title: Number of parameters + unit: '' + description_short: Total number of learnable variables or weights that the model contains. Parameters are adjusted during the training process to optimize the model's performance. + description_key: + - Parameters are internal variables that machine learning models adjust during their training process to improve their ability to make accurate predictions. They act as the model's "knobs" that are fine-tuned based on the provided data. In deep learning, a subset of artificial intelligence (AI), parameters primarily consist of the weights assigned to the connections between the small processing units called neurons. Picture a vast network of interconnected neurons where the strength of each connection represents a parameter. + + - The total number of parameters in a model is influenced by various factors. The model's structure and the number of “layers” of neurons play a significant role. Generally, more complex models with additional layers tend to have a higher number of parameters. Special components of specific deep learning architectures can further contribute to the overall parameter count. + + - Understanding the number of parameters in a model is crucial to design effective models. More parameters can help the model understand complex data patterns, potentially leading to higher accuracy. However, there's a fine balance to strike. If a model has too many parameters, it risks memorizing the specific examples in its training data rather than learning their underlying patterns. Consequently, it may perform poorly when presented with new, unseen data. Achieving the right balance of parameters is a critical consideration in model development. + + - In recent times, the AI community has witnessed the emergence of what are often referred to as "giant models." These models boast an astounding number of parameters, reaching into the billions or even trillions. While these huge models have achieved remarkable performance, they have a significant computational cost. 
Effectively managing and training such large-scale models has become a prominent and active area of research and discussion within the AI field. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_dataset_size__datapoints: + title: Training dataset size + unit: 'datapoints' + description_short: The number of examples provided to train an AI model. Typically, more data results in a more comprehensive understanding by the model. + description_key: + - Training data size refers to the volume of data employed to train an artificial intelligence (AI) model effectively. It's a representation of the number of examples that the model learns from during its training process. It is a fundamental measure of the scope of the data used in the model's learning phase. + + - To grasp the concept of training data size, imagine teaching a friend the art of distinguishing different types of birds. In this analogy, each bird picture presented to your friend corresponds to an individual piece of training data. If you showed them 100 unique bird photos, then the training data size in this scenario would be quantified as 100. + + - Training data size is an essential indicator in AI and machine learning. First and foremost, it directly impacts the depth of learning achieved by the model. The more extensive the dataset, the more profound and comprehensive the model's understanding of the subject matter becomes. Additionally, a large training data size contributes significantly to improved recognition capabilities. By exposing the model to a diverse array of examples, it becomes adept at identifying subtle nuances, much like how it becomes skilled at distinguishing various bird species through exposure to a large variety of bird images. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_computation_petaflop: + title: Training computation (petaFLOP) + unit: 'petaFLOP' + description_short: Computation is measured in total petaFLOP, which is 10¹⁵ [floating-point operations](#dod:flop) estimated from AI literature, albeit with some uncertainty. + description_key: + - In the context of artificial intelligence (AI), training computation is predominantly measured using floating-point operations or “FLOP”. One FLOP represents a single arithmetic operation involving floating-point numbers, such as addition, subtraction, multiplication, or division. To adapt to the vast computational demands of AI systems, the measurement unit of petaFLOP is commonly used. One petaFLOP stands as a staggering one quadrillion FLOPs, underscoring the magnitude of computational operations within AI. + + - Modern AI systems are rooted in machine learning and deep learning techniques. These methodologies are notorious for their computational intensity, involving complex mathematical processes and algorithms. During the training phase, AI models process large volumes of data, while continuously adapting and refining their parameters to optimize performance, rendering the training process computationally intensive. + + - Many factors influence the magnitude of training computation within AI systems. Notably, the size of the dataset employed for training significantly impacts the computational load. Larger datasets necessitate more processing power. The complexity of the model's architecture also plays a pivotal role; more intricate models lead to more computations. Parallel processing, involving the simultaneous use of multiple processors, also has a substantial effect. 
Beyond these factors, specific design choices and other variables further contribute to the complexity and scale of training computation within AI. + + description_processing: Training computation was converted from its original measurement in FLOPs (floating-point operations) to a more manageable unit known as petaFLOPs. This conversion is performed by dividing the original training compute value by 1e15, which represents one quadrillion (10^15). The purpose of this conversion is to provide a more human-readable and practical representation of the immense computational efforts involved in training AI systems. By expressing the training computation in petaFLOPs, it becomes easier to grasp the scale and magnitude of the computational resources required for training these systems, especially when dealing with large datasets and complex architectures. + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + presentation: + grapher_config: + title: Training computation + + publication_date: + title: Publication date + unit: '' + description_short: The date when the AI model was first published. + description_from_producer: The publication, announcement, or release date of the model, in YYYY-MM-DD format. If the year and month are known but the day is unknown, the day is filled in as YYYY-MM-15. If the year is known but the month and day are unknown, the month and day are filled in as YYYY-07-01. + + + diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.py new file mode 100644 index 00000000000..1f489c23c58 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch.py @@ -0,0 +1,144 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_meadow["epoch"] + tb = tb.reset_index() + + # + # Process data. + # + # Filter notable systems by selecting rows where 'notability_criteria' is not nan + tb = tb[tb["notability_criteria"].notna()].reset_index(drop=True) + tb = tb.drop("notability_criteria", axis=1) + + # Convert relevant columns to string type + columns = ["model", "domain", "organization_categorization"] + tb[columns] = tb[columns].astype(str) + + def simplify_entry(entry): + """ + Simplifies an entry of organization categories which can include many entries of Industry, Academia etc. + Removes duplicates, ensures all words except the first one start with a lower case letter,and joins the categories with ", " and " and " before the last one. 
+ """ + # Check for "nan" + if entry == "nan": + return "Not specified" + + # Split the entry into categories, convert to set to remove duplicates + categories = sorted(set(entry.split(","))) + + # Make sure all words except the first one start with a lower case letter + categories = [categories[0]] + [category.lower() for category in categories[1:]] + + # Join the categories with ", " and " and " before the last one + if len(categories) > 1: + simplified_entry = ", ".join(categories[:-1]) + " and " + categories[-1] + " collaboration" + else: + simplified_entry = categories[0] + + return simplified_entry + + tb["organization_categorization"] = tb["organization_categorization"].apply(simplify_entry) + + # Get the unique values in the organization_categorization column and compare them to expected affiliations + unique_values = set(tb["organization_categorization"]) + expected_values = { + "Industry", + "Academia", + "Government", + "Academia and industry collaboration", + "Academia and research collective collaboration", + "Industry and research collective collaboration", + "Academia, industry and research collective collaboration", + "Government and industry collaboration", + "Research collective", + "Academia, government and industry collaboration", + "Academia and government collaboration", + "Academia, government, industry and research collective collaboration", + "Not specified", + } + assert unique_values == expected_values, "Unexpected affiliations in organization_categorization column" + + # Replace affiliation of researchers with less than 20 systems with 'Other' + affiliation_counts = tb["organization_categorization"].value_counts() + + tb["organization_categorization"] = tb["organization_categorization"].where( + tb["organization_categorization"].map(affiliation_counts) >= 20, "Other" + ) + # Get the organizations that were reclassified to 'Other' + reclassified_organizations = affiliation_counts[affiliation_counts < 20].index.tolist() + + paths.log.info( + f"Affiliations of researchers with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_organizations)}" + ) + + # Replace nans with Unspecified in each column to avoid issues when calculating sume of notable systems + columns = ["organization_categorization", "domain", "organization"] + tb[columns] = tb[columns].replace("nan", "Not specified") + + # Check for multiple entries in 'domain' separated by comma + multiple_domains = tb["domain"].str.contains(",") + # Replace entries in 'domain' that contain a comma with 'Multiple Domains' + tb.loc[multiple_domains, "domain"] = "Multiple domains" + + # Replace domains with less than 20 systems with 'Other' + domain_counts = tb["domain"].value_counts() + + tb["domain"] = tb["domain"].where(tb["domain"].map(domain_counts) >= 20, "Other") + # Get the domains that were reclassified to 'Other' + reclassified_domains = domain_counts[domain_counts < 20].index.tolist() + + paths.log.info( + f"Domains with less than 20 notable systems that were reclassified to 'Other': {', '.join(reclassified_domains)}" + ) + # Convert FLOP to petaFLOP and remove the column with FLOPs (along with training time in hours) + tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15 + + # Convert publication date to a datetime objects + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + + # Calculate 'days_since_1949' + tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days.astype("Int64") + tb = 
tb.dropna(subset=["days_since_1949"]) + + tb = tb.reset_index(drop=True) + + assert not tb[["model", "days_since_1949"]].isnull().any().any(), "Index columns should not have NaN values" + + # Drop columns that are not needed + tb = tb.drop( + ["training_compute__flop", "organization", "authors", "country__from_organization"], + axis=1, + ) + tb = tb.format(["days_since_1949", "model"]) + + # Add metadata to the publication date column + tb["publication_date"].metadata.origins = tb["domain"].metadata.origins + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.meta.yml new file mode 100644 index 00000000000..1bf9422d84a --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.meta.yml @@ -0,0 +1,35 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {date_accessed}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + description_short: Describes the sector where the authors of a notable AI system have their primary affiliations. {definitions.desc_update} + description_from_producer: |- + The distinction is documented in [Academia and Industry](https://docs.google.com/document/d/1wyJmDOWDEKItg0QhO5cpsNAgHq4aHOxQQZnTfzm34gI/edit). + Systems are categorized as “Industry” if their authors are affiliated with private sector organizations, “Academia” if the authors are affiliated with universities or academic institutions, or “Industry - Academia Collaboration” when at least 30% of the authors are from each. + Possible values: Industry, Research Collective, Academia, Industry - Academia Collaboration (Industry leaning), Industry - Academia Collaboration (Academia leaning), Non-profit + unit: 'AI systems' + short_unit: '' + display: + numDecimalPlaces: 0 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + title: Notable AI systems by researcher affiliation + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_aggregates_affiliation: + variables: + yearly_count: + title: Annual number of AI systems by researcher affiliation + + cumulative_count: + title: Cumulative number of AI systems by researcher affiliation diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py new file mode 100644 index 00000000000..7bcbf76a4d8 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py @@ -0,0 +1,75 @@ +"""Generate aggregated table for total yearly and cumulative number of notable AI systems in each category of researcher affiliation.""" + +import datetime as dt + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_aggregates_affiliation.start") + + # + # Load inputs. + # + # Load the the garden dataset without aggregations. + ds_garden = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_garden["epoch"] + tb = tb.reset_index() + + # + # Process data. + # + # Store the origins metadata for later use + origins = tb["organization_categorization"].metadata.origins + + # Define the columns that are not needed + unused_columns = [ + "days_since_1949", + "parameters", + "training_dataset_size__datapoints", + "domain", + "training_computation_petaflop", + ] + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Ensure 'publication_date' column type is datetime64 + assert tb["publication_date"].dtype == "datetime64[ns]", "publication_date column is not of type datetime64" + + # Extract the year from the 'publication_date' column + tb["year"] = tb["publication_date"].dt.year + + # Group by year and country and count the number of systems + tb_agg = tb.groupby(["year", "organization_categorization"], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count + tb_agg["cumulative_count"] = tb_agg.groupby("organization_categorization", observed=False)["yearly_count"].cumsum() + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = paths.short_name + + # Set the index to year and country + tb_agg = tb_agg.format(["year", "organization_categorization"]) + + date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed + + # + # Save outputs. + # + ds_garden = create_dataset( + dest_dir, + tables=[tb_agg], + yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")}, + ) + ds_garden.save() + + paths.log.info("epoch_aggregates_affiliation.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.meta.yml new file mode 100644 index 00000000000..b1cde5bb5d5 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.meta.yml @@ -0,0 +1,53 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {date_accessed}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + description_short: Describes the specific area, application, or field in which an AI system is designed to operate. An AI system can operate in more than one domain, thus contributing to the count for multiple domains. {definitions.desc_update} + description_key: + - Game systems are specifically designed for games and excel in understanding and strategizing gameplay. For instance, AlphaGo, developed by DeepMind, defeated the world champion in the game of Go. Such systems use complex algorithms to compete effectively, even against skilled human players. + + - Language systems are tailored to process language, focusing on understanding, translating, and interacting with human languages. Examples include chatbots, machine translation tools like Google Translate, and sentiment analysis algorithms that can detect emotions in text. 
+ + - Multimodal systems are artificial intelligence frameworks that integrate and interpret more than one type of data input, such as text, images, and audio. ChatGPT-4 is an example of a multimodal system, as it has the capability to process and generate responses based on both textual and visual inputs. + + - Vision systems focus on processing visual information, playing a pivotal role in image recognition and related areas. For example, Facebook's photo tagging system uses vision AI to identify faces. + + - Speech systems are dedicated to handling spoken language, serving as the backbone of voice assistants and similar applications. They recognize, interpret, and generate spoken language to interact with users. + + - Recommendation systems offer suggestions based on user preferences, prominently seen in online shopping and media streaming. For instance, Netflix's movie suggestions or Amazon's product recommendations are powered by algorithms that analyze users' preferences and past behaviors. + + - Audio systems process and generate sound, with applications in music composition, signal processing, and sound recognition. + + - Biology systems analyze biological data and simulate biological processes, aiding in drug discovery and genetic research. + + - Image generation systems create visual content from text descriptions or other inputs, used in graphic design and content creation. + + - Robotics systems combine AI with mechanical engineering to create autonomous robots for various industries. + + - Video systems analyze and generate video content, aiding in editing, surveillance, and content creation. + description_processing: The count of notable AI systems per domain is derived by tallying the instances of machine learning models classified under each domain category. It's important to note that a single machine learning model can fall under multiple domains. The classification into domains is determined by the specific area, application, or field that the AI system is primarily designed to operate within. System domains with less than 10 systems are grouped under "Other." + description_from_producer: A foreign key field categorizing the system’s domain of machine learning. This field links to the [ML Domains table](https://airtable.com/appDFXXgaG1xLtXGL/shrhzolGiQCVnwOY5/tbleYEsZORsiYRVTM), and domains are selected from the options in that table. 
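To make the description_processing note above concrete, here is a small illustrative pandas sketch (toy data and a toy threshold, not the ETL step itself) of how per-domain yearly and cumulative counts with an "Other" bucket can be derived when a model may list several comma-separated domains:

```python
import pandas as pd

# Toy data: one row per model; a model may list several domains separated by commas.
tb = pd.DataFrame(
    {
        "year": [2021, 2022, 2022, 2023],
        "model": ["A", "B", "C", "D"],
        "domain": ["Language", "Language,Vision", "Vision", "Games"],
    }
)

# One row per (model, domain) pair, so multi-domain models count towards each domain.
exploded = tb.assign(domain=tb["domain"].str.split(",")).explode("domain")
exploded = exploded.drop_duplicates(subset=["year", "model", "domain"])

# Group rare domains under "Other" (the real step uses a threshold of 10 systems; 2 here).
counts = exploded["domain"].value_counts()
exploded["domain"] = exploded["domain"].where(exploded["domain"].map(counts) >= 2, "Other")

# Yearly and cumulative counts per domain.
yearly = exploded.groupby(["year", "domain"]).size().reset_index(name="yearly_count")
yearly["cumulative_count"] = yearly.groupby("domain")["yearly_count"].cumsum()
print(yearly)
```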
+ unit: 'AI systems' + short_unit: '' + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + title: Notable AI systems by domain type + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_aggregates_domain: + variables: + yearly_count: + title: Annual number of AI systems by domain + + cumulative_count: + title: Cumulative number of AI systems by domain diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py new file mode 100644 index 00000000000..2a4e84e2673 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py @@ -0,0 +1,107 @@ +"""Generate aggregated table for total yearly and cumulative number of notable AI systems for each domain.""" + +import datetime as dt + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_aggregates_domain.start") + + # + # Load inputs. + # + # Load the ds_meadow dataset. + ds_meadow = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_meadow["epoch"] + tb = tb.reset_index() + + # + # Process data. + # + + # Store the origins metadata for later use + origins = tb["domain"].metadata.origins + + # Select the rows where the 'notability_criteria' column is not null (only consider notable systems) + tb = tb[tb["notability_criteria"].notna()].reset_index(drop=True) + + # Define the columns that are not needed + unused_columns = [ + "authors", + "country__from_organization", + "organization", + "organization_categorization", + "parameters", + "training_compute__flop", + "training_dataset_size__datapoints", + "notability_criteria", + ] + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Convert the 'publication_date' column to datetime format and extract the year + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + tb["year"] = tb["publication_date"].dt.year + + # Split the column to be aggregated by comma (several countries/domains can exist in each cell) + tb["domain"] = tb["domain"].str.split(",") + + # Explode the table to create separate rows for each country or domain + tb_exploded = tb.explode("domain") + + # Drop duplicates where the year, model and country/domain are the same + tb_unique = tb_exploded.drop_duplicates(subset=["year", "model", "domain"]) + + # Replace domains with less than 10 systems with 'Other' + domain_counts = tb_unique["domain"].value_counts() + + tb_unique["domain"] = tb_unique["domain"].where(tb_unique["domain"].map(domain_counts) >= 10, "Other") + # Get the domains that were reclassified to 'Other' + reclassified_domains = domain_counts[domain_counts < 10].index.tolist() + domain_counts = tb_unique["domain"].value_counts() + + paths.log.info( + f"Domains with less than 10 notable systems that were reclassified to 'Other': {', '.join(reclassified_domains)}" + ) + # Convert the column to category type so that the missing values will be considered as 0 + tb_unique["domain"] = tb_unique["domain"].astype("category") + + # Group by year and country/domain and count the number of systems (consider all categories which will assume 0 for missing values) + tb_agg 
= tb_unique.groupby(["year", "domain"], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count (consider all categories which will assume 0 for missing values) + tb_agg["cumulative_count"] = tb_agg.groupby("domain", observed=False)["yearly_count"].cumsum() + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = paths.short_name + # Set the index to year and domain + tb_agg = tb_agg.format(["year", "domain"]) + + date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb_agg], + yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")}, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch_aggregates_domain.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.meta.yml new file mode 100644 index 00000000000..5f95f506c67 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.meta.yml @@ -0,0 +1,91 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + grapher_config: + note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP). +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_compute_intensive: + variables: + domain: + title: Domain + unit: '' + short_unit: '' + description_short: Refers to the specific area, application, or field in which an AI model is designed to operate. + display: + zeroDay: '1949-01-01' + yearIsDay: true + + + parameters: + title: Number of parameters + unit: '' + description_short: Total number of learnable variables or weights that the model contains. Parameters are adjusted during the training process to optimize the model's performance. + description_key: + - Parameters are internal variables that machine learning models adjust during their training process to improve their ability to make accurate predictions. They act as the model's "knobs" that are fine-tuned based on the provided data. In deep learning, a subset of artificial intelligence (AI), parameters primarily consist of the weights assigned to the connections between the small processing units called neurons. Picture a vast network of interconnected neurons where the strength of each connection represents a parameter. + + - The total number of parameters in a model is influenced by various factors. The model's structure and the number of “layers” of neurons play a significant role. Generally, more complex models with additional layers tend to have a higher number of parameters. Special components of specific deep learning architectures can further contribute to the overall parameter count. 
+ + - Understanding the number of parameters in a model is crucial to design effective models. More parameters can help the model understand complex data patterns, potentially leading to higher accuracy. However, there's a fine balance to strike. If a model has too many parameters, it risks memorizing the specific examples in its training data rather than learning their underlying patterns. Consequently, it may perform poorly when presented with new, unseen data. Achieving the right balance of parameters is a critical consideration in model development. + + - In recent times, the AI community has witnessed the emergence of what are often referred to as "giant models." These models boast an astounding number of parameters, reaching into the billions or even trillions. While these huge models have achieved remarkable performance, they have a significant computational cost. Effectively managing and training such large-scale models has become a prominent and active area of research and discussion within the AI field. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_dataset_size__datapoints: + title: Training dataset size + unit: 'datapoints' + description_short: The number of examples provided to train an AI model. Typically, more data results in a more comprehensive understanding by the model. + description_key: + - Training data size refers to the volume of data employed to train an artificial intelligence (AI) model effectively. It's a representation of the number of examples that the model learns from during its training process. It is a fundamental measure of the scope of the data used in the model's learning phase. + + - To grasp the concept of training data size, imagine teaching a friend the art of distinguishing different types of birds. In this analogy, each bird picture presented to your friend corresponds to an individual piece of training data. If you showed them 100 unique bird photos, then the training data size in this scenario would be quantified as 100. + + - Training data size is an essential indicator in AI and machine learning. First and foremost, it directly impacts the depth of learning achieved by the model. The more extensive the dataset, the more profound and comprehensive the model's understanding of the subject matter becomes. Additionally, a large training data size contributes significantly to improved recognition capabilities. By exposing the model to a diverse array of examples, it becomes adept at identifying subtle nuances, much like how it becomes skilled at distinguishing various bird species through exposure to a large variety of bird images. + + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + + training_computation_petaflop: + title: Training computation (petaFLOP) + unit: 'petaFLOP' + description_short: Computation is measured in total petaFLOP, which is 10¹⁵ [floating-point operations](#dod:flop) estimated from AI literature, albeit with some uncertainty. + description_key: + - In the context of artificial intelligence (AI), training computation is predominantly measured using floating-point operations or “FLOP”. One FLOP represents a single arithmetic operation involving floating-point numbers, such as addition, subtraction, multiplication, or division. To adapt to the vast computational demands of AI systems, the measurement unit of petaFLOP is commonly used. One petaFLOP stands as a staggering one quadrillion FLOPs, underscoring the magnitude of computational operations within AI. 
+ + - Modern AI systems are rooted in machine learning and deep learning techniques. These methodologies are notorious for their computational intensity, involving complex mathematical processes and algorithms. During the training phase, AI models process large volumes of data, while continuously adapting and refining their parameters to optimize performance, rendering the training process computationally intensive. + + - Many factors influence the magnitude of training computation within AI systems. Notably, the size of the dataset employed for training significantly impacts the computational load. Larger datasets necessitate more processing power. The complexity of the model's architecture also plays a pivotal role; more intricate models lead to more computations. Parallel processing, involving the simultaneous use of multiple processors, also has a substantial effect. Beyond these factors, specific design choices and other variables further contribute to the complexity and scale of training computation within AI. + + description_processing: Training computation was converted from its original measurement in FLOPs (floating-point operations) to a more manageable unit known as petaFLOPs. This conversion is performed by dividing the original training compute value by 1e15, which represents one quadrillion (10^15). The purpose of this conversion is to provide a more human-readable and practical representation of the immense computational efforts involved in training AI systems. By expressing the training computation in petaFLOPs, it becomes easier to grasp the scale and magnitude of the computational resources required for training these systems, especially when dealing with large datasets and complex architectures. + display: + numDecimalPlaces: 0 + zeroDay: '1949-01-01' + yearIsDay: true + presentation: + grapher_config: + title: Training computation + + publication_date: + title: Publication date + unit: '' + description_short: The date when the AI model was first published. + description_from_producer: The publication, announcement, or release date of the model, in YYYY-MM-DD format. If the year and month are known but the day is unknown, the day is filled in as YYYY-MM-15. If the year is known but the month and day are unknown, the month and day are filled in as YYYY-07-01. + + + diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.py new file mode 100644 index 00000000000..b9b431c4ef0 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive.py @@ -0,0 +1,60 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("epoch_compute_intensive") + + # Read table from meadow dataset. + tb = ds_meadow["epoch_compute_intensive"] + tb = tb.reset_index() + + # + # Process data. 
+ # + # Convert FLOP to petaFLOP and remove the column with FLOPs (along with training time in hours) + tb["training_computation_petaflop"] = tb["training_compute__flop"] / 1e15 + + # Convert publication date to a datetime objects + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + + # Calculate 'days_since_1949' + tb["days_since_1949"] = (tb["publication_date"] - pd.to_datetime("1949-01-01")).dt.days.astype("Int64") + tb = tb.dropna(subset=["days_since_1949"]) + + tb = tb.reset_index(drop=True) + + assert not tb[["model", "days_since_1949"]].isnull().any().any(), "Index columns should not have NaN values" + + # Drop columns that are not needed + tb = tb.drop( + ["training_compute__flop", "organization", "authors", "country__from_organization"], + axis=1, + ) + tb = tb.format(["days_since_1949", "model"]) + + # Add metadata to the publication date column + tb["publication_date"].metadata.origins = tb["domain"].metadata.origins + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch_compute_intensive.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.countries.json b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.countries.json new file mode 100644 index 00000000000..ddfda66807a --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.countries.json @@ -0,0 +1,18 @@ +{ + "Canada": "Canada", + "China": "China", + "Germany": "Germany", + "Israel": "Israel", + "Singapore": "Singapore", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United States of America": "United States", + "Korea (Republic of)": "South Korea", + "Multinational": "Multinational", + "Russia": "Russia", + "Japan": "Japan", + "France": "France", + "Finland": "Finland", + "Total": "Total", + "Hong Kong": "Hong Kong" +} \ No newline at end of file diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.meta.yml new file mode 100644 index 00000000000..3f97637a89b --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.meta.yml @@ -0,0 +1,31 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {date_accessed}. + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + grapher_config: + note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP). + + unit: 'AI systems' + short_unit: '' + description_short: Refers to the location of the primary organization with which the authors of a large-scale AI systems are affiliated. {definitions.desc_update} + description_processing: The number of large-scale AI systems by country is determined by tallying the number of machine learning models that are associated with the geographical location of the researchers' affiliated institutions. 
It's important to note that a single model can have multiple authors, each potentially affiliated with different institutions, thus contributing to the count for multiple countries. +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ + +dataset: + update_period_days: 31 + title: Large-scale AI systems by country +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_compute_intensive_countries: + variables: + yearly_count: + title: Annual number of large-scale AI systems by country + + cumulative_count: + title: Cumulative number of large-scale AI systems by country diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py new file mode 100644 index 00000000000..69bc951a631 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py @@ -0,0 +1,67 @@ +"""Generate aggregated table for total yearly and cumulative number of compute intensive AI systems in each country.""" + +import datetime as dt + +import shared as sh + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_compute_intensive_countries.start") + + # + # Load inputs. + # + # Load the ds_meadow dataset. + ds_meadow = paths.load_dataset("epoch_compute_intensive") + + # Read table from meadow dataset. + tb = ds_meadow["epoch_compute_intensive"] + tb = tb.reset_index() + + # + # Process data. + # + # Define the columns that are not needed + unused_columns = [ + "domain", + "authors", + "organization", + "parameters", + "training_compute__flop", + "training_dataset_size__datapoints", + ] + + # Aggregate the data by country + tb_agg = sh.calculate_aggregates(tb, "country__from_organization", paths.short_name, unused_columns) + + # Rename the 'country__from_organization' column to 'country' + tb_agg = tb_agg.rename(columns={"country__from_organization": "country"}) + + # Harmonize the country names + tb_agg = geo.harmonize_countries(df=tb_agg, countries_file=paths.country_mapping_path) + + # Set the index to year and country + tb_agg = tb_agg.format(["year", "country"]) + + date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb_agg], + yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")}, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch_compute_intensive_countries.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.meta.yml new file mode 100644 index 00000000000..4d6697e7541 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.meta.yml @@ -0,0 +1,48 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + desc_update: The 2024 data is incomplete and was last updated {date_accessed}. 
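The epoch_compute_intensive_countries and epoch_compute_intensive_domain steps call sh.calculate_aggregates from a shared.py module that is not included in this diff. The sketch below is only a rough guess at what such a helper does, inferred from the call sites: the signature matches how it is called, but the body and the plain-DataFrame return type are assumptions, not the actual implementation.

```python
import pandas as pd


def calculate_aggregates(tb: pd.DataFrame, agg_column: str, short_name: str, unused_columns: list) -> pd.DataFrame:
    """Yearly and cumulative counts of systems per category in agg_column (illustrative only)."""
    tb = tb.drop(columns=unused_columns)

    # Extract the year from the publication date.
    tb["year"] = pd.to_datetime(tb["publication_date"]).dt.year

    # Cells can hold several comma-separated values (e.g. countries); count each once per model.
    tb[agg_column] = tb[agg_column].astype(str).str.split(",")
    tb = tb.explode(agg_column).drop_duplicates(subset=["year", "model", agg_column])

    # Yearly and cumulative counts per category.
    tb_agg = tb.groupby(["year", agg_column]).size().reset_index(name="yearly_count")
    tb_agg["cumulative_count"] = tb_agg.groupby(agg_column)["yearly_count"].cumsum()

    # The real helper presumably returns an owid Table carrying metadata (short_name, origins);
    # a plain DataFrame is used here to keep the sketch self-contained.
    tb_agg.attrs["short_name"] = short_name
    return tb_agg
```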
+ common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + grapher_config: + note: Confirmed large-scale AI models are those where the training compute exceeds 10²³ floating-point operations (FLOP). + description_short: Describes the specific area, application, or field in which a large-scale AI model is designed to operate. {definitions.desc_update} + description_key: + - Game systems are specifically designed for games and excel in understanding and strategizing gameplay. For instance, AlphaGo, developed by DeepMind, defeated the world champion in the game of Go. Such systems use complex algorithms to compete effectively, even against skilled human players. + + - Language systems are tailored to process language, focusing on understanding, translating, and interacting with human languages. Examples include chatbots, machine translation tools like Google Translate, and sentiment analysis algorithms that can detect emotions in text. + + - Multimodal systems are artificial intelligence frameworks that integrate and interpret more than one type of data input, such as text, images, and audio. ChatGPT-4 is an example of a multimodal model, as it has the capability to process and generate responses based on both textual and visual inputs. + + - Vision systems focus on processing visual information, playing a pivotal role in image recognition and related areas. For example, Facebook's photo tagging model uses vision AI to identify faces. + + - Speech systems are dedicated to handling spoken language, serving as the backbone of voice assistants and similar applications. They recognize, interpret, and generate spoken language to interact with users. + + - Biology systems analyze biological data and simulate biological processes, aiding in drug discovery and genetic research. + + - Image generation systems create visual content from text descriptions or other inputs, used in graphic design and content creation. + + description_processing: The count of large-scale AI systems per domain is derived by tallying the instances of machine learning models classified under each domain category. It's important to note that a single machine learning model can fall under multiple domains. The classification into domains is determined by the specific area, application, or field that the AI model is primarily designed to operate within. + description_from_producer: A foreign key field categorizing the system’s domain of machine learning. This field links to the [ML Domains table](https://airtable.com/appDFXXgaG1xLtXGL/shrhzolGiQCVnwOY5/tbleYEsZORsiYRVTM), and domains are selected from the options in that table.
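Editor's note: the tallying described in description_processing above is implemented by the shared calculate_aggregates helper later in this diff. A minimal pandas-only sketch of the core idea with made-up rows; the real helper also adds a "Total" row, de-duplicates per model, and carries metadata:

    import pandas as pd

    df = pd.DataFrame(
        {
            "model": ["A", "B", "C"],
            "year": [2023, 2023, 2024],
            "domain": ["Language", "Language,Vision", "Vision"],
        }
    )

    # One row per (model, domain): a model listing several domains counts once in each of them.
    exploded = df.assign(domain=df["domain"].str.split(",")).explode("domain")

    # Yearly counts per domain, plus a running cumulative count per domain.
    yearly = exploded.groupby(["year", "domain"]).size().reset_index(name="yearly_count")
    yearly["cumulative_count"] = yearly.groupby("domain")["yearly_count"].cumsum()
    print(yearly)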
+ unit: 'AI systems' + short_unit: '' + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 31 + title: Large-scale AI systems by domain type + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + epoch_compute_intensive_domain: + variables: + yearly_count: + title: Annual number of large-scale AI models by domain + + cumulative_count: + title: Cumulative number of large-scale AI models by domain diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py new file mode 100644 index 00000000000..e832677a43d --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py @@ -0,0 +1,60 @@ +"""Generate aggregated table for total yearly and cumulative number of compute intensive AI systems for each domain.""" + +import datetime as dt + +import shared as sh + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch_compute_intensive_domain.start") + + # + # Load inputs. + # + # Load the ds_meadow dataset. + ds_meadow = paths.load_dataset("epoch_compute_intensive") + + # Read table from meadow dataset. + tb = ds_meadow["epoch_compute_intensive"] + tb = tb.reset_index() + + # + # Process data. + # + # Define the columns that are not needed + unused_columns = [ + "authors", + "country__from_organization", + "organization", + "parameters", + "training_compute__flop", + "training_dataset_size__datapoints", + ] + + # Aggregate the data by domain + tb_agg = sh.calculate_aggregates(tb, "domain", paths.short_name, unused_columns) + + # Set the index to year and domain + tb_agg = tb_agg.format(["year", "domain"]) + + date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb_agg], + yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")}, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch_compute_intensive_domain.end") diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.meta.yml new file mode 100644 index 00000000000..8bffd4fdf09 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.meta.yml @@ -0,0 +1,12 @@ + +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Artificial Intelligence + description_processing: |- + We performed a regression analysis, fitting exponential models to the data for both the pre-deep learning (before 2010) and deep learning eras (after 2010), using the code provided by researchers from Epoch. 
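Editor's note: the regression described above fits a straight line to log10 of each metric against fractional year, so the slope is in orders of magnitude per year. A small worked example with illustrative numbers (not Epoch data) of how that slope becomes the "x/year" figure logged by the step further down:

    import numpy as np
    from sklearn.linear_model import LinearRegression

    # Illustrative data: a metric growing by 0.5 orders of magnitude per year.
    years = np.array([[2012.0], [2014.0], [2016.0], [2018.0], [2020.0]])
    log10_metric = np.array([1.0, 2.0, 3.0, 4.0, 5.0])  # already log10-transformed

    reg = LinearRegression().fit(years, log10_metric)
    oom_per_year = reg.coef_[0]          # slope: orders of magnitude per year (0.5 here)
    growth_factor = 10 ** oom_per_year   # multiplicative growth per year
    print(f"{growth_factor:.1f}x/year")  # -> "3.2x/year"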
+dataset: + title: Parameter, Compute and Data Trends in Machine Learning - Regressions + diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.py new file mode 100644 index 00000000000..8968e2c76a4 --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/epoch_regressions.py @@ -0,0 +1,145 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import numpy as np +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table +from sklearn.linear_model import LinearRegression + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Constants for defining the time periods +DL_ERA_START = 2010 +START_DATE = 1950 +END_DATE = 2025.2 + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("epoch") + + # Read table from meadow dataset. + tb = ds_meadow["epoch"].reset_index() + + # Run regression analysis and concatenate results + tb_trend = run_regression(tb) + tb = tb.drop("frac_year", axis=1) + tb = pr.concat([tb_trend, tb]) + + # Format the table + tb = tb.format(["days_since_1949", "model"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata) + + # Save changes in the new garden dataset. + ds_garden.save() + + paths.log.info("epoch.end") + + +def fit_exponential(models, metric): + """Fit an exponential model to the given metric data. Code provided by Epoch AI team.""" + x = models["frac_year"].values.reshape(-1, 1) + y = models[metric] + + # Filter out non-positive values + positive_mask = y > 0 + x = x[positive_mask] + y = y[positive_mask] + + # Apply log10 transformation + y = np.log10(y) + + # Filter out infinite and extremely large values + finite_mask = np.isfinite(y) & (y < np.finfo(np.float32).max) + x = x[finite_mask] + y = y[finite_mask] + + # Fit linear regression model + reg = LinearRegression().fit(x, y) + return reg.intercept_, reg.coef_[0] + + +def run_regression(tb): + """Run regression analysis on the given table and return the updated table.""" + # Add fractional year for sorting and processing + publication_dates = tb["publication_date"] + tb.loc[:, "frac_year"] = ( + publication_dates.dt.year + (publication_dates.dt.month - 1) / 12 + (publication_dates.dt.day - 1) / 365 + ) + tb = tb.sort_values(by="frac_year") + + # Define periods dynamically + periods = { + f"{START_DATE}–{DL_ERA_START}": (tb["frac_year"] < DL_ERA_START), + f"{DL_ERA_START}–{int(END_DATE)}": ((tb["frac_year"] >= DL_ERA_START) & (tb["frac_year"] < END_DATE)), + } + # Define year grids dynamically + year_grids = { + f"{START_DATE}–{DL_ERA_START}": np.array([START_DATE, DL_ERA_START]), + f"{DL_ERA_START}–{int(END_DATE)}": np.array([DL_ERA_START, END_DATE]), + } + + metrics = ["training_computation_petaflop", "parameters", "training_dataset_size__datapoints"] + new_tables = [] + + for metric in metrics: + # Filter out models without the metric information + tb_metric = tb[pd.notnull(tb[metric])] + dfs = [] + + for period_name, condition in periods.items(): + # Subset data for the current period + period_data = tb_metric[condition] + + # Fit exponential model + fit = fit_exponential(period_data, metric) + oom_per_year = 
fit[1] + info = f"{10**oom_per_year:.1f}x/year" + + # Log the results + paths.log.info(f"{period_name} ({metric}): {info}") + + # Calculate the regression line for the current period + year_grid = year_grids[period_name] + line = 10 ** (fit[0] + year_grid * fit[1]) + + # Create DataFrame for the current period + df = pd.DataFrame( + { + "days_since_1949": [ + period_data["days_since_1949"].min(), + period_data["days_since_1949"].max(), + ], + f"{metric}": [line[0], line[-1]], + "model": [f"{info} between {period_name}"] * 2, + } + ) + dfs.append(df) + + # Combine the DataFrames for all periods for the current metric + df_combined = pd.concat(dfs, ignore_index=True) + new_tables.append(df_combined) + + # Merge all the new DataFrames + tb_new = new_tables[0] + for tb_m in new_tables[1:]: + tb_new = pd.merge(tb_new, tb_m, on=["model", "days_since_1949"], how="outer") + + # Convert to OWID Table and add metadata + tb_new = Table(tb_new, short_name=paths.short_name) + for column in tb_new.columns: + tb_new[column].metadata.origins = tb["publication_date"].metadata.origins + + return tb_new diff --git a/etl/steps/data/garden/artificial_intelligence/2024-12-05/shared.py b/etl/steps/data/garden/artificial_intelligence/2024-12-05/shared.py new file mode 100644 index 00000000000..016e6812e4d --- /dev/null +++ b/etl/steps/data/garden/artificial_intelligence/2024-12-05/shared.py @@ -0,0 +1,74 @@ +from typing import List + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def calculate_aggregates(tb: Table, agg_column: str, short_name: str, unused_columns: List[str]) -> Table: + """ + This function calculates aggregates for a given column in a Table. It is used to calculate the total yearly and cumulative number of notable AI systems for each domain or country. + + Parameters: + tb (Table): The input Table. + agg_column (str): The column to aggregate on. + short_name (str): The short name to set for the table. + unused_columns (List[str]): The list of columns to drop from the table. + + Returns: + Table: The output Table with calculated aggregates. 
+ """ + + # Store the origins metadata for later use + origins = tb[agg_column].metadata.origins + + # Drop the unused columns + tb = tb.drop(unused_columns, axis=1) + + # Convert the 'publication_date' column to datetime format and extract the year + tb["publication_date"] = pd.to_datetime(tb["publication_date"]) + tb["year"] = tb["publication_date"].dt.year + + # Convert the column to category type so that the missing values will be considered as 0 + tb[agg_column] = tb[agg_column].astype("category") + + # Group total yearly counts and calculate cumulative count for total number of systems + tb_total = tb.groupby(["year"]).size().reset_index(name="yearly_count") + total_counts = tb_total.groupby("year")["yearly_count"].sum().reset_index() + total_counts[agg_column] = "Total" + total_counts["cumulative_count"] = total_counts["yearly_count"].cumsum() + + # Split the column to be aggregated by comma (several countries/domains can exist in each cell) + tb[agg_column] = tb[agg_column].str.split(",") + + # Explode the table to create separate rows for each country or domain + tb_exploded = tb.explode(agg_column) + + # Convert the column to category type so that the missing values will be considered as 0 + tb_exploded[agg_column] = tb_exploded[agg_column].astype("category") + + # Drop duplicates where the year, model and country/domain are the same + tb_unique = tb_exploded.drop_duplicates(subset=["year", "model", agg_column]) + + # Group by year and country/domain and count the number of systems (consider all categories which will assume 0 for missing values) + tb_agg = tb_unique.groupby(["year", agg_column], observed=False).size().reset_index(name="yearly_count") + + # Calculate the cumulative count (consider all categories which will assume 0 for missing values) + tb_agg["cumulative_count"] = tb_agg.groupby(agg_column, observed=False)["yearly_count"].cumsum() + + # Combine aggregated data with total counts + tb_agg = pr.concat([tb_agg, total_counts], ignore_index=True) + + # Add the origins metadata to the columns + for col in ["yearly_count", "cumulative_count"]: + tb_agg[col].metadata.origins = origins + + # Set the short_name metadata of the table + tb_agg.metadata.short_name = short_name + + return tb_agg diff --git a/etl/steps/data/garden/covid/latest/countries_reporting.meta.yml b/etl/steps/data/garden/covid/latest/countries_reporting.meta.yml index c9cc581a0ec..3ab26130309 100644 --- a/etl/steps/data/garden/covid/latest/countries_reporting.meta.yml +++ b/etl/steps/data/garden/covid/latest/countries_reporting.meta.yml @@ -71,9 +71,9 @@ tables: country_reporting_delay: variables: num_days_delay_in_reporting: - title: Number of days needed to first report data on COVID-19 << type >> since the first vaccine was administered + title: Number of days needed to first report data on COVID-19 << type | default('') >> since the first vaccine was administered description_short: |- - Number of days needed to first report data on COVID-19 << type >> since the first vaccine was administered. Some countries may have started vaccinating before they reported it, or may have started reporting it before they started vaccinating. + Number of days needed to first report data on COVID-19 << type | default('') >> since the first vaccine was administered. Some countries may have started vaccinating before they reported it, or may have started reporting it before they started vaccinating. 
description_processing: *processing_2 description_key: *description_key_2 unit: "days" diff --git a/etl/steps/data/garden/covid/latest/sequence.meta.yml b/etl/steps/data/garden/covid/latest/sequence.meta.yml index acebff8b8e4..1f79931b216 100644 --- a/etl/steps/data/garden/covid/latest/sequence.meta.yml +++ b/etl/steps/data/garden/covid/latest/sequence.meta.yml @@ -21,13 +21,13 @@ tables: num_sequences: title: "Number of sequenced COVID-19 genomes - Variant: << variant >>" description_short: |- - <% set mapping = dict( - non_who="The number of analyzed sequences in the preceding two weeks that correspond to non-relevant variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.", - other="The number of analyzed sequences in the preceding two weeks that correspond to non-categorised variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced.", - else="The number of analyzed sequences in the preceding two weeks that correspond to variant group '<< variant >>'. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced." - ) %> - - << mapping.get(variant, mapping['else']) >> + <% if variant == 'non_who' %> + The number of analyzed sequences in the preceding two weeks that correspond to non-relevant variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced. + <% elif variant == 'other' %> + The number of analyzed sequences in the preceding two weeks that correspond to non-categorised variant groups. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced. + <% else %> + The number of analyzed sequences in the preceding two weeks that correspond to variant group '<< variant >>'. This number may not reflect the complete breakdown of cases since only a fraction of all cases are sequenced. 
+ <%- endif -%> unit: "sequenced genomes" display: tolerance: 28 diff --git a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_300k.py b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_300k.py index 92d6d4886a3..1b93e5e9c5a 100644 --- a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_300k.py +++ b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_300k.py @@ -50,15 +50,15 @@ def run(dest_dir: str) -> None: } ) - # Create two new dataframes to separate data into estimates and projections (pre-2019 and post-2019) - past_estimates = tb_average[tb_average["year"] < 2019].copy() - future_projections = tb_average[tb_average["year"] >= 2019].copy() + # Create two new dataframes to separate data into estimates and projections + past_estimates = tb_average[tb_average["year"] <= 2015].copy() + future_projections = tb_average[tb_average["year"] >= 2015].copy() # Now, for each column in the original dataframe, split it into two for col in tb_average.columns: if col not in ["country", "year"]: - past_estimates[f"{col}_estimates"] = tb_average.loc[tb_average["year"] < 2019, col] - future_projections[f"{col}_projections"] = tb_average.loc[tb_average["year"] >= 2019, col] + past_estimates[f"{col}_estimates"] = tb_average.loc[tb_average["year"] <= 2015, col] + future_projections[f"{col}_projections"] = tb_average.loc[tb_average["year"] >= 2015, col] past_estimates = past_estimates.drop(columns=[col]) future_projections = future_projections.drop(columns=[col]) diff --git a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_size_class.py b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_size_class.py index 6aa4daf3d05..e44531c7a8a 100644 --- a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_size_class.py +++ b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_size_class.py @@ -38,15 +38,16 @@ def run(dest_dir: str) -> None: tb_pivot[col] = tb_pivot[col] * 1000 tb_pivot = tb_pivot.reset_index() - # Create two new dataframes to separate data into estimates and projections (pre-2019 and post-2019) - past_estimates = tb_pivot[tb_pivot["year"] < 2019].copy() - future_projections = tb_pivot[tb_pivot["year"] >= 2019].copy() + # Create two new dataframes to separate data into estimates and projections + + past_estimates = tb_pivot[tb_pivot["year"] <= 2015].copy() + future_projections = tb_pivot[tb_pivot["year"] >= 2015].copy() # Now, for each column in the original dataframe, split it into two (projections and estimates) for col in tb_pivot.columns: if col not in ["country", "year"]: - past_estimates[f"{col}_estimates"] = tb_pivot.loc[tb_pivot["year"] < 2019, col] - future_projections[f"{col}_projections"] = tb_pivot.loc[tb_pivot["year"] >= 2019, col] + past_estimates[f"{col}_estimates"] = tb_pivot.loc[tb_pivot["year"] <= 2015, col] + future_projections[f"{col}_projections"] = tb_pivot.loc[tb_pivot["year"] >= 2015, col] past_estimates = past_estimates.drop(columns=[col]) future_projections = future_projections.drop(columns=[col]) diff --git a/etl/steps/data/garden/un/2024-01-17/urbanization_urban_rural.py b/etl/steps/data/garden/un/2024-01-17/urbanization_urban_rural.py index c22b5c9aef1..9189fee2081 100644 --- a/etl/steps/data/garden/un/2024-01-17/urbanization_urban_rural.py +++ b/etl/steps/data/garden/un/2024-01-17/urbanization_urban_rural.py @@ -48,15 +48,15 @@ def run(dest_dir: str) -> None: # Remove 'thousands' from column name tb.rename(columns={col: col.replace("__thousands", "")}, inplace=True) - # Create two new dataframes to separate data into 
estimates and projections (pre-2019 and post-2019) - past_estimates = tb[tb["year"] < 2019].copy() - future_projections = tb[tb["year"] >= 2019].copy() + # Create two new dataframes to separate data into estimates and projections (pre-2015 and post-2015) + past_estimates = tb[tb["year"] <= 2015].copy() + future_projections = tb[tb["year"] >= 2015].copy() # Now, for each column in the original dataframe, split it into two (projections and estimates) for col in tb.columns: if col not in ["country", "year"]: - past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] < 2019, col] - future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= 2019, col] + past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] <= 2015, col] + future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= 2015, col] past_estimates = past_estimates.drop(columns=[col]) future_projections = future_projections.drop(columns=[col]) diff --git a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.meta.yml b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.meta.yml index c1e97dd6da2..c7793d9cbb2 100644 --- a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.meta.yml +++ b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.meta.yml @@ -1,13 +1,21 @@ +definitions: + common: + presentation: + grapher_config: + subtitle: "" + originUrl: "https://ourworldindata.org/population-growth" + note: "" + tables: population_5_year_age_groups: variables: - age: + age_group_five: title: Five year age-group with the highest population unit: "" description_short: |- Five-year age group with the highest population. type: ordinal - sort: + sort: # May need additional groups when data is updated - 0-4 - 5-9 - 10-14 @@ -23,22 +31,18 @@ tables: - 60-64 - 65-69 - 70-74 - value: - title: Population of the most populous five-year age group - unit: "people" - description_short: |- - Population of the most populous five-year age-group. - display: - numDecimalPlaces: 0 + - 75-79 + presentation: + title_public: Five year age-group with the highest population population_10_year_age_groups: variables: - age_group: + age_group_ten: title: Ten year age-group with the highest population unit: "" description_short: |- Ten-year age group with the highest population. type: ordinal - sort: + sort: # May need additional groups when data is updated - 0-9 - 10-19 - 20-29 @@ -46,10 +50,6 @@ tables: - 40-49 - 50-59 - 60-69 - value: - title: Population of the most populous ten year age group - unit: "people" - description_short: |- - Population of the most populous ten-year age-group. - display: - numDecimalPlaces: 0 + - 70-79 + presentation: + title_public: Ten year age-group with the highest population diff --git a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py index c067d01d25e..0b0220c589f 100644 --- a/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py +++ b/etl/steps/data/garden/un/2024-03-14/un_wpp_most.py @@ -1,10 +1,14 @@ +from typing import Any + from owid.catalog import Table from owid.catalog import processing as pr +from structlog import get_logger from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +log = get_logger() def run(dest_dir: str) -> None: @@ -12,27 +16,31 @@ def run(dest_dir: str) -> None: # Load inputs. 
ds_garden = paths.load_dataset("un_wpp") tb_pop = ds_garden["population"].reset_index() - + origins = tb_pop["population"].metadata.origins[0] age_group_size = [5, 10] tb_list = [] tb_pop_filter = Table() for age_group in age_group_size: + log.info(f"Creating population table for {age_group} year age groups") # filter data for just sex = all, metrics = population, variant = estimates if age_group == 5: - tb_pop_filter = create_five_year_age_groups(tb_pop) + tb_pop_filter = create_five_year_age_groups(tb_pop, origins) if age_group == 10: - tb_pop_filter = create_ten_year_age_groups(tb_pop) + tb_pop_filter = create_ten_year_age_groups(tb_pop, origins) # Group by country and year, and apply the custom function - tb_pop_filter = tb_pop_filter.groupby(["location", "year"], observed=False).apply(get_largest_age_group) - # The function above creates NAs for some locations that don't appear to be in the table e.g. Vatican, Melanesia, so dropping here - tb_pop_filter = tb_pop_filter.dropna() - tb_pop_filter = tb_pop_filter.reset_index(drop=True) - tb_pop_filter = tb_pop_filter.set_index(["location", "year"], verify_integrity=True) - tb_pop_filter = tb_pop_filter.copy_metadata(tb_pop) + tb_pop_filter = ( + tb_pop_filter.groupby(["country", "year"], group_keys=False) + .apply(get_largest_age_group) + .reset_index(drop=True) # Reset index to have a clean DataFrame + ) + # The function above creates NAs for some countries that don't appear to be in the table e.g. Vatican, Melanesia, so dropping here + + tb_pop_filter = tb_pop_filter.drop(columns=["population"]) + tb_pop_filter = tb_pop_filter.set_index(["country", "year"], verify_integrity=True) tb_pop_filter.metadata.short_name = f"population_{age_group}_year_age_groups" tb_list.append(tb_pop_filter) # Save outputs. - # + # Create a new garden dataset with the same metadata as the meadow dataset.
ds_garden = create_dataset(dest_dir, tables=tb_list, default_metadata=ds_garden.metadata) @@ -40,7 +48,7 @@ def run(dest_dir: str) -> None: ds_garden.save() -def create_ten_year_age_groups(tb: Table) -> Table: +def create_ten_year_age_groups(tb: Table, origins: Any) -> Table: # Initialize an empty list to hold the age bands age_bands = [] # Loop through a range with a step of 5, stopping before 100 @@ -49,28 +57,29 @@ def create_ten_year_age_groups(tb: Table) -> Table: # Add the "100+" group at the end and 0-4 and 5-9 as 0-9 is not a group in the dataset age_bands = age_bands + ["100+", "0-4", "5-9", "10-14", "15-19"] # Filter the table to only include the age bands we want - tb = tb[(tb.sex == "all") & (tb.metric == "population") & (tb.variant == "estimates") & (tb.age.isin(age_bands))] + tb = tb[(tb.sex == "all") & (tb.variant == "estimates") & (tb.age.isin(age_bands))] assert tb["age"].nunique() == len(age_bands), "Age groups are not as expected" - tb = tb.drop(columns=["metric", "sex", "variant"]) + tb = tb.drop(columns=["sex", "variant", "population_change", "population_density"]) # Create the 0-9 and 10-19 age groups tb_0_9 = tb[(tb.age == "0-4") | (tb.age == "5-9")] - tb_0_9 = tb_0_9.groupby(["location", "year"], observed=False)["value"].sum().reset_index() + tb_0_9 = tb_0_9.groupby(["country", "year"], observed=False)["population"].sum().reset_index() tb_0_9["age"] = "0-9" tb_10_19 = tb[(tb.age == "10-14") | (tb.age == "15-19")] - tb_10_19 = tb_10_19.groupby(["location", "year"], observed=False)["value"].sum().reset_index() + tb_10_19 = tb_10_19.groupby(["country", "year"], observed=False)["population"].sum().reset_index() tb_10_19["age"] = "10-19" # Drop the 0-4, 5-9, 10-14 and 15-19 age groups tb = tb[(tb.age != "0-4") & (tb.age != "5-9") & (tb.age != "10-14") & (tb.age != "15-19")] # Concatenate the 0-9 and 10-19 age groups with the original table tb = pr.concat([tb, tb_0_9, tb_10_19]) - tb = tb.rename(columns={"age": "age_group"}) + tb = tb.rename(columns={"age": "age_group_ten"}) + tb["age_group_ten"].metadata.origins = [origins] tb = tb.reset_index(drop=True) return tb -def create_five_year_age_groups(tb: Table) -> Table: +def create_five_year_age_groups(tb: Table, origins: Any) -> Table: # Initialize an empty list to hold the age bands age_bands = [] # Loop through a range with a step of 5, stopping before 100 @@ -79,13 +88,15 @@ def create_five_year_age_groups(tb: Table) -> Table: # Add the "100+" group at the end age_bands.append("100+") # Filter the table to only include the age bands we want - tb = tb[(tb.sex == "all") & (tb.metric == "population") & (tb.variant == "estimates") & (tb.age.isin(age_bands))] + tb = tb[(tb.sex == "all") & (tb.variant == "estimates") & (tb.age.isin(age_bands))] assert tb["age"].nunique() == len(age_bands), "Age groups are not as expected" - tb = tb.drop(columns=["metric", "sex", "variant"]) + tb = tb.drop(columns=["sex", "variant", "population_change", "population_density"]) + tb = tb.rename(columns={"age": "age_group_five"}) + tb["age_group_five"].metadata.origins = [origins] tb = tb.reset_index(drop=True) return tb # Function to apply to each group to find the age group with the largest population def get_largest_age_group(group): - return group.loc[group["value"].idxmax()] + return group.loc[group["population"].idxmax()] diff --git a/etl/steps/data/garden/un/2024-07-11/un_wpp.meta.yml b/etl/steps/data/garden/un/2024-07-11/un_wpp.meta.yml index 8097420beb4..1344ff48672 100644 --- a/etl/steps/data/garden/un/2024-07-11/un_wpp.meta.yml +++ 
b/etl/steps/data/garden/un/2024-07-11/un_wpp.meta.yml @@ -1,7 +1,7 @@ definitions: global: projections: - <%- if variant != 'estimates' -%> + <%- if (variant is defined) and (variant != 'estimates') -%> Projections from 2024 onwards are based on the UN's << variant >> scenario. <%- endif -%> dimensions: @@ -39,6 +39,7 @@ definitions: {definitions.global.projections} description_short_births: |- + <%- if sex is defined -%> <%- if not (sex == 'all' and age == 'all') -%> This only includes <%- if sex != 'all' -%> @@ -48,6 +49,7 @@ definitions: mothers aged << age >> <%- endif -%>. <%- endif -%> + <%- endif -%> {definitions.global.projections} diff --git a/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml b/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml index c62b1eac1fe..3d59ea4fcb8 100644 --- a/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml +++ b/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml @@ -1,7 +1,7 @@ definitions: global: projections: - <%- if variant != 'estimates' -%> + <%- if (variant is defined) and (variant != 'estimates') -%> Projections from 2024 onwards are based on the UN's << variant >> scenario. <%- endif -%> dimensions: @@ -43,6 +43,7 @@ definitions: {definitions.global.projections} description_short_births: |- + <%- if sex is defined -%> <%-if not (sex == 'all' and age == 'all') -%> This only includes <%- if sex != 'all' -%> @@ -51,7 +52,8 @@ definitions: <%- if age != 'all' -%> mothers aged << age >> <%- endif -%>. - <%-endif -%> + <%- endif -%> + <%- endif -%> {definitions.global.projections} diff --git a/etl/steps/data/garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation.meta.yml b/etl/steps/data/garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation.meta.yml index b12d8afc8f2..7d7c77d2f91 100644 --- a/etl/steps/data/garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation.meta.yml +++ b/etl/steps/data/garden/urbanization/2024-10-14/ghsl_degree_of_urbanisation.meta.yml @@ -9,13 +9,13 @@ definitions: - |- **The Degree of Urbanisation (DEGURBA)** is a method for capturing the urban-rural divide, designed for international comparisons. Developed by six organizations and endorsed by the UN, it uses a two-level classification. - The first level divides areas into cities, towns and semi-dense areas, and rural areas, distinguishing between urban (cities, towns, suburbs) and rural regions. The second level adds detail, splitting towns and rural areas further. + The first level divides areas into cities, towns, and villages, distinguishing between urban (cities, towns, suburbs) and rural regions. The second level adds detail, splitting towns and villages further. This classification is based on 1 km² grid cells, grouped into urban centers, urban clusters, and rural cells. These grids are then used to classify smaller areas, typically using residential population grids from censuses or registers. If detailed data isn't available, a disaggregation grid estimates population distribution. To predict future urbanization (2025 and 2030), both static (land features) and dynamic (past satellite images) components are used to project growth. DEGURBA defines cities by population, not administrative borders, aligning with UN guidelines, though fixed thresholds may not always capture local differences. 
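Editor's note: the "(variant is defined)" and "(sex is defined)" guards added to the un_wpp metadata above serve a similar purpose for block tags: the template renders to an empty string instead of failing when a variable lacks that dimension. A minimal standalone Jinja2 sketch of the pattern, assuming the same <% %> and << >> delimiters as the ETL's templating:

    from jinja2 import Environment, StrictUndefined

    env = Environment(
        block_start_string="<%",
        block_end_string="%>",
        variable_start_string="<<",
        variable_end_string=">>",
        undefined=StrictUndefined,
    )
    tpl = env.from_string(
        "<%- if (variant is defined) and (variant != 'estimates') -%>"
        "Projections from 2024 onwards are based on the UN's << variant >> scenario."
        "<%- endif -%>"
    )

    print(tpl.render(variant="medium"))  # -> "Projections from 2024 onwards are based on the UN's medium scenario."
    print(tpl.render())                  # -> "" (the guard short-circuits, so no undefined-variable error)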
- description_short: The European Commission combines satellite imagery with national census data to identify [cities](#dod:cities-degurba), [towns and semi-dense areas](#dod:towns-suburbs-degurba), and [rural areas](#dod:rural-areas-degurba) and estimate their respective populations. + description_short: The European Commission combines satellite imagery with national census data to identify [cities](#dod:cities-degurba), [towns](#dod:towns-degurba), and [villages](#dod:villages-degurba) and estimate their respective populations. # Learn more about the available fields: @@ -30,13 +30,13 @@ tables: value: title: |- <% if location_type == "rural_total" and attribute == 'area' and type == 'estimates' %> - Land covered by rural areas + Land covered by villages <% elif location_type == "rural_total" and attribute == 'population' and type == 'estimates' %> - Population living in rural areas + Population living in villages <% elif location_type == "rural_total" and attribute == 'share' and type == 'estimates' %> - Share of land covered by rural areas + Share of land covered by villages <% elif location_type == "rural_total" and attribute == 'popshare' and type == 'estimates' %> - Share of population living in rural areas + Share of population living in villages <% elif location_type == "urban_centre" and attribute == 'area' and type == 'estimates' %> Land covered by cities @@ -48,13 +48,13 @@ tables: Share of population living in cities <% elif location_type == "urban_cluster" and attribute == 'area' and type == 'estimates' %> - Land covered by towns and semi-dense areas + Land covered by towns <% elif location_type == "urban_cluster" and attribute == 'population' and type == 'estimates' %> - Population living in towns and semi-dense areas + Population living in towns <% elif location_type == "urban_cluster" and attribute == 'share' and type == 'estimates' %> - Share of land covered by towns and semi-dense areas + Share of land covered by towns <% elif location_type == "urban_cluster" and attribute == 'popshare' and type == 'estimates' %> - Share of population living in towns and semi-dense areas + Share of population living in towns <% elif location_type == "urban_total" and attribute == 'area' and type == 'estimates' %> Land covered by urban areas @@ -66,13 +66,13 @@ tables: Share of population living in urban areas <% elif location_type == "rural_total" and attribute == 'area' and type == 'projections' %> - Projected land covered by rural areas + Projected land covered by villages <% elif location_type == "rural_total" and attribute == 'population' and type == 'projections' %> - Projected population living in rural areas + Projected population living in villages <% elif location_type == "rural_total" and attribute == 'share' and type == 'projections' %> - Projected share of land covered by rural areas + Projected share of land covered by villages <% elif location_type == "rural_total" and attribute == 'popshare' and type == 'projections' %> - Projected share of population living in rural areas + Projected share of population living in villages <% elif location_type == "urban_centre" and attribute == 'area' and type == 'projections' %> Projected land covered by cities @@ -84,13 +84,13 @@ tables: Projected share of population living in cities <% elif location_type == "urban_cluster" and attribute == 'area' and type == 'projections' %> - Projected land covered by towns and semi-dense areas + Projected land covered by towns <% elif location_type == "urban_cluster" and attribute == 'population' 
and type == 'projections' %> - Projected population living in towns and semi-dense areas + Projected population living in towns <% elif location_type == "urban_cluster" and attribute == 'share' and type == 'projections' %> - Projected share of land covered by towns and semi-dense areas + Projected share of land covered by towns <% elif location_type == "urban_cluster" and attribute == 'popshare' and type == 'projections' %> - Projected share of population living in towns and semi-dense areas + Projected share of population living in towns <% elif location_type == "urban_total" and attribute == 'area' and type == 'projections' %> Projected land covered by urban areas @@ -107,7 +107,7 @@ tables: <% elif location_type == "semi_dense" and attribute == 'number' and type == 'estimates' %> Number of semi-dense areas <% elif location_type == "rural_total" and attribute == 'number' and type == 'estimates' %> - Number of rural areas + Number of villages <% elif location_type == "urban_centre" and attribute == 'number' and type == 'estimates' %> Number of cities @@ -117,7 +117,7 @@ tables: <% elif location_type == "semi_dense" and attribute == 'number' and type == 'projections' %> Projected number of semi-dense areas <% elif location_type == "rural_total" and attribute == 'number' and type == 'projections' %> - Projected number of rural areas + Projected number of villages <% elif location_type == "urban_centre" and attribute == 'number' and type == 'projections' %> Projected number of cities @@ -127,14 +127,14 @@ tables: Projected population density in cities <% elif location_type == "urban_cluster" and attribute == 'density' and type == 'estimates' %> - Population density in towns and semi-dense areas + Population density in towns <% elif location_type == "urban_cluster" and attribute == 'density' and type == 'projections' %> - Projected population density in towns and semi-dense areas + Projected population density in towns <% elif location_type == "rural_total" and attribute == 'density' and type == 'estimates' %> - Population density in rural areas + Population density in villages <% elif location_type == "rural_total" and attribute == 'density' and type == 'projections' %> - Projected population density in rural areas + Projected population density in villages <% endif %> unit: @@ -160,7 +160,7 @@ tables: <%- endif -%> description_processing: <% if attribute == 'share' or attribute == 'popshare' %> - The share of total area or population for each urbanization level was calculated by dividing the area or population of each level (cities, towns and semi-dense areas, rural areas) by the overall total, providing a percentage representation for each category. + The share of total area or population for each urbanization level was calculated by dividing the area or population of each level (cities, towns, villages) by the overall total, providing a percentage representation for each category. <% elif attribute == 'density' %> Population density was calculated by dividing the population of cities by the total area it covers, providing a measure of the number of people living in each km². 
<%- endif -%> diff --git a/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.countries.json b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.countries.json new file mode 100644 index 00000000000..bdb892107bf --- /dev/null +++ b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.countries.json @@ -0,0 +1,185 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Angola": "Angola", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Benin": "Benin", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Gabon": "Gabon", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Guatemala": "Guatemala", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jersey": "Jersey", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Northern Cyprus": "Northern Cyprus", + "Norway": "Norway", + "Oman": "Oman", + 
"Pakistan": "Pakistan", + "Palestine": "Palestine", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "R\u00e9union": "Reunion", + "Samoa": "Samoa", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Korea": "South Korea", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "M\u00e9xico": "Mexico", + "Republic of the Congo": "Congo", + "S\u00e3o Tom\u00e9 and Pr\u00edncipe": "Sao Tome and Principe" +} \ No newline at end of file diff --git a/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.meta.yml b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.meta.yml new file mode 100644 index 00000000000..a8d7bc66382 --- /dev/null +++ b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.meta.yml @@ -0,0 +1,290 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Urbanization + display: + numDecimalPlaces: + 0 + + processing_level: minor + description_key: + - |- + The European Commission integrates satellite imagery with national census data to delineate the boundaries of capital cities and estimate their populations. + To predict future urbanization (2025 and 2030), both static (land features) and dynamic (past satellite images) components are used to project growth. DEGURBA defines cities by population, not administrative borders, aligning with UN guidelines, though fixed thresholds may not always capture local differences. + + desc_short_density_capital: &desc_short_density_capital |- + The number of people per km² of land area for the country's capital city. + desc_short_density_top_100: &desc_short_density_top_100 |- + The number of people per km² of land area for cities ranked among the top 100 most populous in 2020. + + desc_short_pop_capital: &desc_short_pop_capital |- + The total number of people living in the country's capital city. + desc_short_pop_top_100: &desc_short_pop_top_100 |- + The total number of people living in cities ranked among the top 100 most populous in 2020. + desc_short_pop_1mil: &desc_short_pop_1mil |- + The total number of people living in cities with more than 1 million inhabitants. 
+ + desc_short_1m_total : &desc_short_1m_total |- + The percentage of the total population living in cities with more than 1 million inhabitants. + desc_short_1m_urb: &desc_short_1m_urb |- + The percentage of the urban population living in cities with more than 1 million inhabitants. + + desc_processing_density: &desc_processing_density |- + Population density was calculated by dividing the population of the city by the total area it covers, providing a measure of the number of people living in each km². + + entityAnnotationsMapCapitals: &entityAnnotationsMapCapitals |- + Afghanistan: Kabul + Albania: Tirana + Algeria: Algiers + Angola: Luanda + Argentina: Buenos Aires + Armenia: Yerevan + Aruba: Oranjestad + Australia: North Canberra [Canberra] + Austria: Vienna + Azerbaijan: Baku + Bahamas: Nassau + Bahrain: Manama + Bangladesh: Dhaka + Barbados: Bridgetown + Belarus: Minsk + Belgium: Brussels + Benin: Porto-Novo + Bolivia: La Paz + Bosnia and Herzegovina: Sarajevo + Botswana: Gaborone + Brazil: Brasilia + Brunei: Bandar Seri Begawan + Bulgaria: Sofia + Burkina Faso: Ouagadougou + Burundi: Gitega + Cambodia: Phnom Penh + Cameroon: Yaoundé + Canada: Ottawa + Central African Republic: Bangui + Chad: N'Djamena + Chile: Santiago + China: Beijing + Colombia: Bogota + Comoros: Moroni + Costa Rica: San José + Croatia: Zagreb + Cuba: Havana + Curacao: Willemstad + Cyprus: Strovolos [Nicosia] + Czechia: Prague + Cote d'Ivoire: Yamoussoukro + Democratic Republic of Congo: Kinshasa + Denmark: Copenhagen + Djibouti: Djibouti + Dominican Republic: Santo Domingo + Ecuador: Quito + Egypt: Cairo + El Salvador: San Salvador + Equatorial Guinea: Malabo + Eritrea: Asmara + Estonia: Tallinn + Ethiopia: Addis Ababa + Fiji: Suva + Finland: Helsinki + France: Paris + French Guiana: Cayenne + French Polynesia: Papeete + Gabon: Libreville + Georgia: Tbilisi + Germany: Berlin + Ghana: Accra + Greece: Athens + Guatemala: Guatemala City + Guinea-Bissau: Bissau + Guyana: Georgetown + Haiti: Port-au-Prince + Honduras: Tegucigalpa + Hungary: Budapest + Iceland: Reykjavik + India: New Delhi + Indonesia: Jakarta + Iran: Tehran + Iraq: Baghdad + Ireland: Dublin + Israel: Jerusalem + Italy: Rome + Jamaica: Kingston + Japan: Tokyo + Jersey: St. 
Helier + Jordan: Amman + Kazakhstan: Astana + Kenya: Nairobi + Kosovo: Pristina + Kuwait: Kuwait City + Kyrgyzstan: Bishkek + Laos: Vientiane + Latvia: Riga + Lebanon: Beirut + Lesotho: Maseru + Liberia: Monrovia + Libya: Tripoli + Lithuania: Vilnius + Luxembourg: Luxembourg + Madagascar: Antananarivo + Malawi: Lilongwe + Malaysia: Kuala Lumpur + Maldives: Malé + Mali: Bamako + Malta: Valletta + Mauritania: Nouakchott + Mauritius: Port Louis + Mayotte: Mamoudzou + Moldova: Chișinău + Mongolia: Ulaanbaatar + Montenegro: Podgorica + Morocco: Rabat + Mozambique: Maputo + Myanmar: Pyinmana [Nay Pyi Taw] + Mexico: Mexico City + Namibia: Windhoek + Nepal: Kathmandu + Netherlands: Amsterdam + New Caledonia: Nouméa + New Zealand: Wellington + Nicaragua: Managua + Niger: Niamey + Nigeria: Abuja + North Korea: P'yŏngyang + North Macedonia: Skopje + Northern Cyprus: Nicosia + Norway: Oslo + Oman: Muscat + Pakistan: Islamabad + Palestine: Ramallah + Panama: Panama City + Papua New Guinea: Port Moresby + Paraguay: Asuncion + Peru: Lima + Philippines: Manila + Poland: Warsaw + Portugal: Lisbon + Puerto Rico: Bayamón [San Juan] + Qatar: Doha + Congo: Brazzaville + Romania: Bucharest + Russia: Moscow + Rwanda: Kigali + Reunion: Saint-Denis + Samoa: Apia + Saudi Arabia: Riyadh + Senegal: Dakar + Serbia: Belgrade + Sierra Leone: Freetown + Singapore: Singapore + Slovakia: Bratislava + Slovenia: Ljubljana + Solomon Islands: Honiara + Somalia: Mogadishu + South Africa: Cape Town + South Korea: Seoul + South Sudan: Juba + Spain: Madrid + Sri Lanka: Colombo [Sri Jayawardenepura Kotte] + Sudan: Khartoum + Suriname: Paramaribo + Sweden: Stockholm + Switzerland: Bern + Syria: Damascus + Sao Tome and Principe: São Tomé + Taiwan: Taipei + Tajikistan: Dushanbe + Tanzania: Dodoma + Thailand: Bangkok + East Timor: Dili + Togo: Lomé + Tonga: Nuku'alofa + Trinidad and Tobago: Port of Spain + Tunisia: Tunis + Turkey: Ankara + Turkmenistan: Ashgabat + Uganda: Kampala + Ukraine: Kyiv + United Arab Emirates: Abu Dhabi + United Kingdom: London + United States: Washington + Uruguay: Montevideo + Uzbekistan: Tashkent + Vanuatu: Port Vila + Venezuela: Caracas + Vietnam: Hanoi + Yemen: Sana'a + Zambia: Lusaka + Zimbabwe: Harare + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + + +tables: + ghsl_urban_centers: + variables: + + urban_pop_projections: + title: Population of the capital city (projected) + unit: 'people' + description_short: *desc_short_pop_capital + display: + entityAnnotationsMap: *entityAnnotationsMapCapitals + isProjection: true + + urban_density_projections: + title: Population density of the capital city (projected) + unit: 'people/km²' + description_short: *desc_short_density_capital + description_processing: *desc_processing_density + display: + entityAnnotationsMap: *entityAnnotationsMapCapitals + isProjection: true + + urban_pop_estimates: + title: Population of the capital city + unit: 'people' + description_short: *desc_short_pop_capital + display: + entityAnnotationsMap: *entityAnnotationsMapCapitals + + urban_density_estimates: + title: Population density of the capital city + unit: 'people/km²' + description_short: *desc_short_density_capital + description_processing: *desc_processing_density + display: + entityAnnotationsMap: *entityAnnotationsMapCapitals + + urban_density_top_100_projections: + title: Population density of the top 100 most populous cities (projected) + unit: 'people/km²' + 
description_short: *desc_short_density_top_100 + description_processing: *desc_processing_density + display: + isProjection: true + + urban_density_top_100_estimates: + title: Population density of the top 100 most populous cities + unit: 'people/km²' + description_short: *desc_short_density_top_100 + description_processing: *desc_processing_density + + urban_pop_top_100_estimates: + title: Population of the top 100 most populous cities + unit: 'people' + description_short: *desc_short_pop_top_100 + + urban_pop_top_100_projections: + title: Population of the top 100 most populous cities (projected) + unit: 'people' + description_short: *desc_short_pop_top_100 + display: + isProjection: true diff --git a/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.py b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.py new file mode 100644 index 00000000000..7107170111a --- /dev/null +++ b/etl/steps/data/garden/urbanization/2024-12-02/ghsl_urban_centers.py @@ -0,0 +1,86 @@ +"""Load a meadow dataset and create a garden dataset.""" +import owid.catalog.processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +START_OF_PROJECTIONS = 2025 + +# Regions for which aggregates will be created. +REGIONS = [ + "North America", + "South America", + "Europe", + "Africa", + "Asia", + "Oceania", + "Low-income countries", + "Upper-middle-income countries", + "Lower-middle-income countries", + "High-income countries", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("ghsl_urban_centers") + # Read table from meadow dataset. + tb = ds_meadow.read("ghsl_urban_centers") + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # + # Process data. + # + + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb = tb.drop(columns=["urban_center_name", "urban_area"]) + + # Add region aggregates. + tb = geo.add_regions_to_table( + tb, + aggregations={"urban_pop": "sum"}, + regions=REGIONS, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + min_num_values_per_year=1, + ) + + # Split data into estimates and projections. + past_estimates = tb[tb["year"] < START_OF_PROJECTIONS].copy() + future_projections = tb[tb["year"] >= START_OF_PROJECTIONS - 5].copy() + + # Now, for each column, split it into two (projections and estimates). + for col in ["urban_pop", "urban_density", "urban_density_top_100", "urban_pop_top_100"]: + if col not in ["country", "year"]: + past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] < START_OF_PROJECTIONS, col] + future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= START_OF_PROJECTIONS - 5, col] + past_estimates = past_estimates.drop(columns=[col]) + future_projections = future_projections.drop(columns=[col]) + + # Merge past estimates and future projections + tb = pr.merge(past_estimates, future_projections, on=["country", "year"], how="outer") + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/wb/2024-10-07/world_bank_pip.meta.yml b/etl/steps/data/garden/wb/2024-10-07/world_bank_pip.meta.yml index a4186d774c3..6a971fa5882 100644 --- a/etl/steps/data/garden/wb/2024-10-07/world_bank_pip.meta.yml +++ b/etl/steps/data/garden/wb/2024-10-07/world_bank_pip.meta.yml @@ -406,6 +406,12 @@ tables: - Bangladesh originUrl: https://ourworldindata.org/poverty + poverty_gap_index_215: + presentation: + title_public: Poverty gap index at $2.15 per day + topic_tags: + - Poverty + gini: presentation: title_public: Gini Coefficient diff --git a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml index fd736fb1874..f3f04bbd41d 100644 --- a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml +++ b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.meta.yml @@ -29,26 +29,28 @@ definitions: Non-market sources of income, including food grown by subsistence farmers for their own consumption, are taken into account. description_key_scenarios: |- - <% if scenario == "Historical" %> + <% if scenario == "Historical estimates" %> Estimates are based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts. For more details about the methodology, please refer to the [World Bank PIP documentation](https://datanalytics.worldbank.org/PIP-Methodology/lineupestimates.html#nowcasts). - <% elif scenario == "Current forecast + historical growth" %> + <% elif scenario == "Current forecast + historical growth projections" %> This data is a projection of the estimates based on GDP growth projections from the World Bank's Global Economic Prospects and the Macro Poverty Outlook, together with IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019. - <% elif scenario == "2% growth" %> + <% elif scenario == "Historical estimates + projections" %> + This data combines estimates based on household surveys or extrapolated up until the year of the data release using GDP growth estimates and forecasts, with projections based on GDP growth projections from the World Bank's Global Economic Prospects and the Macro Poverty Outlook, together with IMF's World Economic Outlook, in the period 2025-2029. For the period 2030-2050, the data is projected using the average annual historical GDP per capita growth over 2010-2019. + <% elif scenario == "2% growth projections" %> This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while keeping income inequality constant. - <% elif scenario == "2% growth + Gini reduction 1%" %> + <% elif scenario == "2% growth + Gini reduction 1% projections" %> This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while reducing income inequality by 1% of the Gini coefficient per year. - <% elif scenario == "2% growth + Gini reduction 2%" %> + <% elif scenario == "2% growth + Gini reduction 2% projections" %> This data is a projection of the estimates based on a scenario of 2% average GDP per capita growth, while reducing income inequality by 2% of the Gini coefficient per year.
- <% elif scenario == "4% growth" %> + <% elif scenario == "4% growth projections" %> This data is a projection of the estimates based on a scenario of 4% average GDP per capita growth, while keeping income inequality constant. - <% elif scenario == "6% growth" %> + <% elif scenario == "6% growth projections" %> This data is a projection of the estimates based on a scenario of 6% average GDP per capita growth, while keeping income inequality constant. - <% elif scenario == "8% growth" %> + <% elif scenario == "8% growth projections" %> This data is a projection of the estimates based on a scenario of 8% average GDP per capita growth, while keeping income inequality constant. <%- endif -%> isprojection_by_scenario: |- - <% if scenario == "Historical" %> + <% if scenario == "Historical estimates" or scenario == "Historical estimates + projections" %> false <% else %> true diff --git a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py index fa62ff97730..66e637c2fcd 100644 --- a/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py +++ b/etl/steps/data/garden/wb/2024-12-03/poverty_projections.py @@ -18,14 +18,14 @@ # Define scenarios and new names SCENARIOS = { - "historical": "Historical", - "current_forecast": "Current forecast + historical growth", - "2pct": "2% growth", - "2pct_gini1": "2% growth + Gini reduction 1%", - "2pct_gini2": "2% growth + Gini reduction 2%", - "4pct": "4% growth", - "6pct": "6% growth", - "8pct": "8% growth", + "historical": "Historical estimates", + "current_forecast": "Current forecast + historical growth projections", + "2pct": "2% growth projections", + "2pct_gini1": "2% growth + Gini reduction 1% projections", + "2pct_gini2": "2% growth + Gini reduction 2% projections", + "4pct": "4% growth projections", + "6pct": "6% growth projections", + "8pct": "8% growth projections", } # Define index columns @@ -92,6 +92,10 @@ def connect_estimates_with_projections(tb: Table) -> Table: tb = tb.copy() + # Save tb_historical and tb_current_forecast, by filtering scenario in historical and current_forecast + tb_historical = tb[tb["scenario"] == "historical"].copy().reset_index(drop=True) + tb_current_forecast = tb[tb["scenario"] == "current_forecast"].copy().reset_index(drop=True) + # Make table wider, by using scenario as columns tb = tb.pivot(index=["country", "year", "povertyline"], columns="scenario", values=INDICATOR_COLUMNS) @@ -116,4 +120,16 @@ def connect_estimates_with_projections(tb: Table) -> Table: for indicator in INDICATOR_COLUMNS: tb[indicator] = tb[indicator].copy_metadata(tb["country"]) + # Combine historical and current_forecast, by concatenating tb_historical and tb_current_forecast + tb_connected = pr.concat([tb_historical, tb_current_forecast], ignore_index=True) + + # Rename scenario column to "Historical + current forecast + historical growth" + tb_connected["scenario"] = "Historical estimates + projections" + + # Keep only the columns in INDEX_COLUMNS and INDICATOR_COLUMNS + tb_connected = tb_connected[INDEX_COLUMNS + INDICATOR_COLUMNS] + + # Concatenate tb and tb_connected + tb = pr.concat([tb, tb_connected], ignore_index=True) + return tb diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.py index 82ece67e6f9..e8c91465a15 100644 --- a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.py +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch.py @@ -37,13 
+37,13 @@ def run(dest_dir: str) -> None: # Update metadata for col in ["max_compute", "max_parameters", "max_data"]: - tb[col].metadata.origins = tb["system"].metadata.origins + tb[col].metadata.origins = tb["model"].metadata.origins # Drop year as we don't need it anymore tb = tb.drop("year", axis=1) # Rename for plotting model name as country in grapher - tb = tb.rename(columns={"system": "country", "days_since_1949": "year"}) + tb = tb.rename(columns={"model": "country", "days_since_1949": "year"}) tb = tb.format(["country", "year"]) # diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive.py index 323e67bd023..f6df4df4e55 100644 --- a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive.py +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_compute_intensive.py @@ -20,7 +20,7 @@ def run(dest_dir: str) -> None: # Process data. # # Rename for plotting model name as country in grapher - tb = tb.rename_index_names({"system": "country", "days_since_1949": "year"}) + tb = tb.rename_index_names({"model": "country", "days_since_1949": "year"}) # # Save outputs. # diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_regressions.py b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_regressions.py index e3d4b17b89d..8c21dfbbc5a 100644 --- a/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_regressions.py +++ b/etl/steps/data/grapher/artificial_intelligence/2024-11-03/epoch_regressions.py @@ -15,7 +15,7 @@ def run(dest_dir: str) -> None: # Read table from garden dataset. tb = ds_garden["epoch_regressions"] - tb = tb.rename_index_names({"system": "country", "days_since_1949": "year"}) + tb = tb.rename_index_names({"model": "country", "days_since_1949": "year"}) # # Save outputs. # diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.meta.yml b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.meta.yml new file mode 100644 index 00000000000..af50f790b40 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.meta.yml @@ -0,0 +1,18 @@ +definitions: + common: + unit: '' + short_unit: '' + display: + zeroDay: '1949-01-01' + yearIsDay: true + +tables: + epoch: + variables: + max_compute: + title: Maximum compute + max_data: + title: Maximum data + max_parameters: + title: Maximum parameters + diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.py new file mode 100644 index 00000000000..6db83e94816 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch.py @@ -0,0 +1,102 @@ +"""Load a garden dataset and create a grapher dataset.""" + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch") + + # Read table from garden dataset. + tb = ds_garden["epoch"].reset_index() + # + # Process data. 
+ # + # Extract year from 'publication_date' and create a new 'year' column + tb["year"] = tb["publication_date"].dt.year + + # For visualization purposes I am adding the rows with the maximum values of compute, data, and parameters in each year to the table as a separate "model". I don't want to do this in garden as it'd affect other datasets that depend on this one. + columns = { + "training_computation_petaflop": "compute", + "training_dataset_size__datapoints": "data", + "parameters": "parameters", + } + # Find maximum values for a given column (compute, data, params) per year, label them, and add summary rows. + for column, label in columns.items(): + tb = find_max_label_and_concat(tb, column, label) + + # Update metadata + for col in ["max_compute", "max_parameters", "max_data"]: + tb[col].metadata.origins = tb["model"].metadata.origins + + # Drop year as we don't need it anymore + tb = tb.drop("year", axis=1) + + # Rename for plotting model name as country in grapher + tb = tb.rename(columns={"model": "country", "days_since_1949": "year"}) + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() + + +def find_max_label_and_concat(tb, column, label): + """ + Find maximum values for a given column per year, label them, and add summary rows. + + This function: + 1. Identifies rows with maximum values for the specified column in each year. + 2. Labels these maximum value rows in a new column using their original model names. + 3. Creates new summary rows for these maximum values. + 4. Adds these new summary rows to the original table. + + Note: + - Creates a new column named f"max_{label}" to indicate maximum values. + - Preserves original data and model names. + - Adds new summary rows with "model" set to f"Maximum {label}". + """ + tb = tb.sort_values(by=["year"]) # Ensure the DataFrame is sorted by year + max_value = -float("inf") + rows_to_keep = [] + + for _, row in tb.iterrows(): + if not pd.isna(row[column]) and row[column] > max_value: + max_value = row[column] + rows_to_keep.append(row) + + tb_filtered = Table(rows_to_keep) + + idx = tb_filtered[[column, "year"]].fillna(0).groupby("year")[column].idxmax() + + tb_filtered[f"max_{label}"] = "Other" + tb_filtered.loc[idx, f"max_{label}"] = f"Maximum {label}" + + max_rows = tb_filtered.loc[idx].copy() + max_rows["model"] = f"Maximum {label}" + + tb = pr.concat([tb, max_rows], ignore_index=True) + + return tb diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py new file mode 100644 index 00000000000..6582a86db80 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_affiliation.py @@ -0,0 +1,41 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_affiliation") + + # Read table from garden dataset. 
+ tb = ds_garden["epoch_aggregates_affiliation"] + + # + # Process data. + # + # Rename for plotting research affiliation as country in grapher + tb = tb.rename_index_names( + { + "organization_categorization": "country", + } + ) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_countries.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_countries.py new file mode 100644 index 00000000000..658d7982804 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_countries.py @@ -0,0 +1,30 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_countries") + + # Read table from garden dataset. + tb_garden = ds_garden["epoch_aggregates_countries"] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py new file mode 100644 index 00000000000..fb2fa66d43b --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_domain.py @@ -0,0 +1,39 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_domain") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_domain"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "domain": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_organizations.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_organizations.py new file mode 100644 index 00000000000..f479f165881 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_aggregates_organizations.py @@ -0,0 +1,38 @@ +"""Load a garden dataset and create a grapher dataset.""" +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_aggregates_organizations") + + # Read table from garden dataset. + tb = ds_garden["epoch_aggregates_organizations"] + # + # Process data. + # + # Rename for plotting organization as country in grapher + tb = tb.rename_index_names( + { + "organization": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive.py new file mode 100644 index 00000000000..f6df4df4e55 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive.py @@ -0,0 +1,33 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive"] + + # + # Process data. + # + # Rename for plotting model name as country in grapher + tb = tb.rename_index_names({"model": "country", "days_since_1949": "year"}) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py new file mode 100644 index 00000000000..ef0aea55b10 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_countries.py @@ -0,0 +1,30 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_countries") + + # Read table from garden dataset. + tb_garden = ds_garden["epoch_compute_intensive_countries"] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset.
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py new file mode 100644 index 00000000000..efb5fea33ce --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_domain.py @@ -0,0 +1,39 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_domain") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive_domain"] + # + # Process data. + # + # Rename for plotting model domain as country in grapher + tb = tb.rename_index_names( + { + "domain": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_organizations.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_organizations.py new file mode 100644 index 00000000000..9478c5e5e42 --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_compute_intensive_organizations.py @@ -0,0 +1,38 @@ +"""Load a garden dataset and create a grapher dataset.""" +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_compute_intensive_organizations") + + # Read table from garden dataset. + tb = ds_garden["epoch_compute_intensive_organizations"] + # + # Process data. + # + # Rename for plotting organization as country in grapher + tb = tb.rename_index_names( + { + "organization": "country", + } + ) + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_regressions.py b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_regressions.py new file mode 100644 index 00000000000..8c21dfbbc5a --- /dev/null +++ b/etl/steps/data/grapher/artificial_intelligence/2024-12-05/epoch_regressions.py @@ -0,0 +1,33 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset, grapher_checks + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("epoch_regressions") + + # Read table from garden dataset. + tb = ds_garden["epoch_regressions"] + tb = tb.rename_index_names({"model": "country", "days_since_1949": "year"}) + # + # Save outputs.
+ # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) + + # + # Checks. + # + grapher_checks(ds_grapher) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/un/2024-01-17/urban_agglomerations_largest_cities.py b/etl/steps/data/grapher/un/2024-01-17/urban_agglomerations_largest_cities.py index e922b1b7ebe..10e396c4de9 100644 --- a/etl/steps/data/grapher/un/2024-01-17/urban_agglomerations_largest_cities.py +++ b/etl/steps/data/grapher/un/2024-01-17/urban_agglomerations_largest_cities.py @@ -31,15 +31,15 @@ def run(dest_dir: str) -> None: ) tb = tb.drop(columns=["rank_order", "population_capital", "country_code"]) - # Create two new dataframes to separate data into estimates and projections (pre-2019 and post-2019) - past_estimates = tb[tb["year"] < 2019].copy() - future_projections = tb[tb["year"] >= 2019].copy() + # Create two new dataframes to separate data into estimates and projections + past_estimates = tb[tb["year"] <= 2015].copy() + future_projections = tb[tb["year"] >= 2015].copy() # Now, for each column in the original dataframe, split it into two for col in tb.columns: if col not in ["country", "year"]: - past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] < 2019, col] - future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= 2019, col] + past_estimates[f"{col}_estimates"] = tb.loc[tb["year"] <= 2015, col] + future_projections[f"{col}_projections"] = tb.loc[tb["year"] >= 2015, col] past_estimates = past_estimates.drop(columns=[col]) future_projections = future_projections.drop(columns=[col]) diff --git a/etl/steps/data/grapher/un/2024-03-14/un_wpp_most.py b/etl/steps/data/grapher/un/2024-03-14/un_wpp_most.py index d3919bc2c93..dd1bd352efd 100644 --- a/etl/steps/data/grapher/un/2024-03-14/un_wpp_most.py +++ b/etl/steps/data/grapher/un/2024-03-14/un_wpp_most.py @@ -15,12 +15,12 @@ def run(dest_dir: str) -> None: # Read five-year age-group table from garden dataset. tb_five = ds_garden["population_5_year_age_groups"].reset_index() - tb_five = tb_five.rename(columns={"location": "country"}) + # tb_five = tb_five.rename(columns={"location": "country"}) tb_five = tb_five.set_index(["country", "year"], verify_integrity=True) # Read ten-year age-group table from garden dataset. tb_ten = ds_garden["population_10_year_age_groups"].reset_index() - tb_ten = tb_ten.rename(columns={"location": "country"}) + # tb_ten = tb_ten.rename(columns={"location": "country"}) tb_ten = tb_ten.set_index(["country", "year"], verify_integrity=True) # Save outputs. # diff --git a/etl/steps/data/grapher/urbanization/2024-12-02/ghsl_urban_centers.py b/etl/steps/data/grapher/urbanization/2024-12-02/ghsl_urban_centers.py new file mode 100644 index 00000000000..7ed2180485c --- /dev/null +++ b/etl/steps/data/grapher/urbanization/2024-12-02/ghsl_urban_centers.py @@ -0,0 +1,28 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("ghsl_urban_centers") + + # Read table from garden dataset. + tb = ds_garden.read("ghsl_urban_centers", reset_index=False) + + # + # Save outputs. 
+ # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch.py b/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch.py new file mode 100644 index 00000000000..916eff1f4e7 --- /dev/null +++ b/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch.py @@ -0,0 +1,73 @@ +"""Load a snapshot and create a meadow dataset.""" + +import numpy as np + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("epoch.start") + + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("epoch.csv") + + # Read snapshot + tb = snap.read() + + # + # Process data. + # + # Define columns of interest. + cols = [ + "Model", + "Domain", + "Authors", + "Country (from Organization)", + "Organization", + "Organization categorization", + "Publication date", + "Parameters", + "Training compute (FLOP)", + "Training dataset size (datapoints)", + "Notability criteria", + ] + + # Check that the columns of interest are present + for col in cols: + assert col in tb.columns, f"Column '{col}' is missing from the dataframe." + + # Select the columns of interest + tb = tb[cols] + # Replace empty strings with NaN values + tb = tb.replace("", np.nan) + # Remove rows where all values are NaN + tb = tb.dropna(how="all") + + # Convert the training compute column to float + tb["Training compute (FLOP)"] = tb["Training compute (FLOP)"].astype(float) + + # Fill missing values in the 'Model' column with the 'Organization' column; if that is also NaN, fall back to the 'Authors' column + tb["Model"] = tb["Model"].fillna(tb["Organization"]).fillna(tb["Authors"]) + # Check that there are no NaN values in the 'Model' column + assert not tb["Model"].isna().any(), "NaN values found in 'Model' column after processing." + # + # Create a new table. + # + tb = tb.format(["model", "publication_date"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() + + paths.log.info("epoch.end") diff --git a/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive.py b/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive.py new file mode 100644 index 00000000000..a8509aef960 --- /dev/null +++ b/etl/steps/data/meadow/artificial_intelligence/2024-12-05/epoch_compute_intensive.py @@ -0,0 +1,66 @@ +"""Load a snapshot and create a meadow dataset.""" + +import numpy as np + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("epoch_compute_intensive.csv") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + # + # Define columns of interest.
+ cols = [ + "Model", + "Domain", + "Authors", + "Country (from Organization)", + "Organization", + "Publication date", + "Parameters", + "Training compute (FLOP)", + "Training dataset size (datapoints)", + ] + + # Check that the columns of interest are present + for col in cols: + assert col in tb.columns, f"Column '{col}' is missing from the dataframe." + + # Select the columns of interest + tb = tb[cols] + # Replace empty strings with NaN values + tb = tb.replace("", np.nan) + # Remove rows where all values are NaN + tb = tb.dropna(how="all") + + # Convert the training compute column to float + tb["Training compute (FLOP)"] = tb["Training compute (FLOP)"].astype(float) + + # Fill missing values in the 'Model' column with the 'Organization' column; if that is also NaN, fall back to the 'Authors' column + tb["Model"] = tb["Model"].fillna(tb["Organization"]).fillna(tb["Authors"]) + # Check that there are no NaN values in the 'Model' column + assert not tb["Model"].isna().any(), "NaN values found in 'Model' column after processing." + # + # Create a new table. + # + tb = tb.format(["model", "publication_date"]) + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/urbanization/2024-12-02/ghsl_urban_centers.py b/etl/steps/data/meadow/urbanization/2024-12-02/ghsl_urban_centers.py new file mode 100644 index 00000000000..ac3575f9683 --- /dev/null +++ b/etl/steps/data/meadow/urbanization/2024-12-02/ghsl_urban_centers.py @@ -0,0 +1,112 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("ghsl_urban_centers.xlsx") + + # Load data from snapshot. + tb_urban_center_names = snap.read(safe_types=False, sheet_name="General_info") + tb_urban_center_density = snap.read(safe_types=False, sheet_name="Area_km2_time_series") + tb_urban_center_population = snap.read(safe_types=False, sheet_name="POP_time_series") + + # Process data.
+ # + + # Remove duplicates in the ID sheet - based on the name of the urban center and country + tb_urban_center_names = tb_urban_center_names.drop_duplicates(subset=["Main Name", "GADM_name"]) + + tb_urban_center_names = tb_urban_center_names[ + [ + "ID_MTUC_G0", + "Main Name", + "GADM_name", + "UNSDGRegion", + "CountryCapital", + ] + ] + tb_urban_center_density = tb_urban_center_density.melt( + id_vars=["ID_MTUC_G0"], var_name="year", value_name="urban_area" + ) + tb_urban_center_population = tb_urban_center_population.melt( + id_vars=["ID_MTUC_G0"], var_name="year", value_name="urban_pop" + ) + + # Replace zeros with NaNs in the urban_pop column (when the urban center did not meet the criteria) + tb_urban_center_population["urban_pop"] = tb_urban_center_population["urban_pop"].replace(0, pd.NA) + + # Convert the urban_pop column to a numeric dtype + tb_urban_center_population["urban_pop"] = pd.to_numeric(tb_urban_center_population["urban_pop"], errors="coerce") + + tb = pr.merge( + tb_urban_center_population, + tb_urban_center_density, + on=["ID_MTUC_G0", "year"], + how="outer", + ) + tb["urban_density"] = tb["urban_pop"] / tb["urban_area"] + + tb = pr.merge( + tb, + tb_urban_center_names, + on="ID_MTUC_G0", + how="right", + ) + + tb = tb.rename( + columns={ + "GADM_name": "country", + "Main Name": "urban_center_name", + "UNSDGRegion": "region", + "WBIncome2022": "income_group", + "CountryCapital": "capital", + } + ) + + # Drop rows where urban_center_name is NaN + tb = tb.dropna(subset=["urban_center_name"]) + + # Population and density of the capital city + tb_capitals = tb[tb["capital"] == 1] + + tb_capitals = tb_capitals.drop(columns=["ID_MTUC_G0", "region", "capital"]) + + # Select the top 100 most populous cities in 2020 + tb_2020 = tb[tb["year"] == 2020] + top_100_pop_2020 = tb_2020.nlargest(100, "urban_pop").drop_duplicates(subset=["ID_MTUC_G0"]) + + # Filter the original Table to select the top urban centers + tb_top = tb[tb["ID_MTUC_G0"].isin(top_100_pop_2020["ID_MTUC_G0"])] + + tb_top = tb_top.drop(columns=["urban_area", "ID_MTUC_G0", "region", "capital"]) + tb_top = tb_top.rename(columns={"urban_density": "urban_density_top_100", "urban_pop": "urban_pop_top_100"}) + + # Format the country column + tb_top["country"] = tb_top["urban_center_name"] + " (" + tb_top["country"] + ")" + tb_top = tb_top.drop(columns=["urban_center_name"]) + + tb = pr.merge(tb_capitals, tb_top, on=["country", "year"], how="outer") + + for col in ["urban_pop", "urban_density_top_100", "urban_pop_top_100"]: + tb[col].metadata.origins = tb["country"].metadata.origins + + tb = tb.format(["country", "year"]) + + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/version_tracker.py b/etl/version_tracker.py index 57f73541748..100af0e0ac2 100644 --- a/etl/version_tracker.py +++ b/etl/version_tracker.py @@ -446,53 +446,41 @@ def get_path_to_script(self, step: str, omit_base_dir: bool = False) -> Optional """Get the path to the script of a given step.""" # Get step attributes. _, step_type, _, channel, namespace, version, name, _ = extract_step_attributes(step=step).values() - state = "active" if step in self.all_active_steps else "archive" - # Create a dictionary that contains the path to a script for a given step.
- # This dictionary has to keys, namely "active" and "archive". - # Active steps should have a script in the active directory. - # But steps that are in the archive dag can be either in the active or the archive directory. - path_to_script = {"active": None, "archive": None} + # Path to the script of a given step (archive directories are no longer used). + path_to_script = None if step_type == "export": - path_to_script["active"] = paths.STEP_DIR / "export" / channel / namespace / version / name # type: ignore + path_to_script = paths.STEP_DIR / "export" / channel / namespace / version / name # type: ignore elif channel == "snapshot": - path_to_script["active"] = paths.SNAPSHOTS_DIR / namespace / version / name # type: ignore - path_to_script["archive"] = paths.SNAPSHOTS_DIR_ARCHIVE / namespace / version / name # type: ignore + path_to_script = paths.SNAPSHOTS_DIR / namespace / version / name # type: ignore elif channel in ["meadow", "garden", "grapher", "explorers", "open_numbers", "examples", "external"]: - path_to_script["active"] = paths.STEP_DIR / "data" / channel / namespace / version / name # type: ignore - path_to_script["archive"] = paths.STEP_DIR_ARCHIVE / channel / namespace / version / name # type: ignore + path_to_script = paths.STEP_DIR / "data" / channel / namespace / version / name # type: ignore elif channel == "walden": - path_to_script["active"] = paths.BASE_DIR / "lib" / "walden" / "ingests" / namespace / version / name # type: ignore - path_to_script["archive"] = paths.BASE_DIR / "lib" / "walden" / "ingests" / namespace / version / name # type: ignore + path_to_script = paths.BASE_DIR / "lib" / "walden" / "ingests" / namespace / version / name # type: ignore elif channel in ["backport", "etag"]: # Ignore these channels, for which there is never a script. return None else: log.error(f"Unknown channel {channel} for step {step}.") - if state == "active": - # Steps in the active dag should only have a script in the active directory. - del path_to_script["archive"] - path_to_script_detected = None - for state in path_to_script: - # A step script can exist either as a .py file, as a .ipynb file, or a __init__.py file inside a folder. - # In the case of snapshots, there may or may not be a .py file, but there definitely needs to be a dvc file. - # In that case, the corresponding script is not trivial to find, but at least we can return the dvc file. - for path_to_script_candidate in [ - path_to_script[state].with_suffix(".py"), # type: ignore - path_to_script[state].with_suffix(".ipynb"), # type: ignore - path_to_script[state] / "__init__.py", # type: ignore - path_to_script[state].with_name(path_to_script[state].name + ".dvc"), # type: ignore - ]: - if path_to_script_candidate.exists(): - path_to_script_detected = path_to_script_candidate - break + # A step script can exist either as a .py file, as a .ipynb file, or a __init__.py file inside a folder. + # In the case of snapshots, there may or may not be a .py file, but there definitely needs to be a dvc file. + # In that case, the corresponding script is not trivial to find, but at least we can return the dvc file.
+ for path_to_script_candidate in [ + path_to_script.with_suffix(".py"), # type: ignore + path_to_script.with_suffix(".ipynb"), # type: ignore + path_to_script / "__init__.py", # type: ignore + path_to_script.with_name(path_to_script.name + ".dvc"), # type: ignore + ]: + if path_to_script_candidate.exists(): + path_to_script_detected = path_to_script_candidate + break if path_to_script_detected is None: - if state == "active": - log.error(f"Script for step {step} not found.") - else: - log.warning(f"Script for archive step {step} not found.") + log.error(f"Script for step {step} not found.") if omit_base_dir and path_to_script_detected is not None: # Return the path relative to the base directory (omitting the local path to the ETL repos). diff --git a/lib/catalog/owid/catalog/datasets.py b/lib/catalog/owid/catalog/datasets.py index 745563a2d81..d24f55d2c5f 100644 --- a/lib/catalog/owid/catalog/datasets.py +++ b/lib/catalog/owid/catalog/datasets.py @@ -119,7 +119,7 @@ def add( utils.validate_underscore(col, "Variable's name") if not table.primary_key: - if "OWID_STRICT" in environ: + if environ.get("OWID_STRICT"): raise PrimaryKeyMissing( f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving" ) @@ -128,7 +128,7 @@ def add( f"Table `{table.metadata.short_name}` does not have a primary_key -- please use t.set_index([col, ...], verify_integrity=True) to indicate dimensions before saving" ) - if not table.index.is_unique and "OWID_STRICT" in environ: + if not table.index.is_unique and environ.get("OWID_STRICT"): [(k, dups)] = table.index.value_counts().head(1).to_dict().items() raise NonUniqueIndex( f"Table `{table.metadata.short_name}` has duplicate values in the index -- could you have made a mistake?\n\n" diff --git a/pyproject.toml b/pyproject.toml index a5111ed6c6d..462eaa42eea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -165,8 +165,6 @@ build-backend = "hatchling.build" [tool.pyright] exclude = [ "lib/", - "etl/steps/archive", - "etl/snapshots/archive", "apps/wizard/etl_steps/cookiecutter/", "apps/wizard/etl_steps/cookiecutter/snapshot/**", "**/node_modules", diff --git a/snapshots/artificial_intelligence/2024-12-05/epoch.csv.dvc b/snapshots/artificial_intelligence/2024-12-05/epoch.csv.dvc new file mode 100644 index 00000000000..2ba13908db0 --- /dev/null +++ b/snapshots/artificial_intelligence/2024-12-05/epoch.csv.dvc @@ -0,0 +1,38 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: Parameter, Compute and Data Trends in Machine Learning + date_published: 2024-07-19 + description_snapshot: | + We update this chart with the latest available data from our source every month. + + The authors selected the AI systems for inclusion based on the following necessary criteria: + — Have an explicit learning component + — Showcase experimental results + — Advance the state of the art + + In addition, the systems had to meet at least one of the following notability criteria: + — Paper has more than 1000 citations + — Historical importance + — Important state-of-the-art advance + — Deployed in a notable context + + The authors note that: "For new models (from 2020 onward) it is harder to assess these criteria, so we fall back to a subjective selection. We refer to models meeting our selection criteria as 'milestone models." 
+ # Citation + producer: Epoch + citation_full: "Epoch AI, ‘Parameter, Compute and Data Trends in Machine Learning’. Published online at epochai.org. Retrieved from: ‘https://epoch.ai/data/epochdb/visualization’ [online resource]" + # Files + url_main: https://epoch.ai/mlinputs/visualization + url_download: https://epoch.ai/data/epochdb/notable_ai_models.csv + date_accessed: 2024-12-05 + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: 98750b0d23c2f5e11b766e0849432fb3 + size: 1600590 + path: epoch.csv diff --git a/snapshots/artificial_intelligence/2024-12-05/epoch.py b/snapshots/artificial_intelligence/2024-12-05/epoch.py new file mode 100644 index 00000000000..daa355e267f --- /dev/null +++ b/snapshots/artificial_intelligence/2024-12-05/epoch.py @@ -0,0 +1,33 @@ +"""Script to create a snapshot of dataset 'Parameter, Compute and Data Trends in Machine Learning (Epoch, 2023)'.""" + + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/epoch.csv") + + # Download data from source. + snap.download_from_source() + + # Add file to DVC and upload to S3. + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.csv.dvc b/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.csv.dvc new file mode 100644 index 00000000000..1850e7f75b3 --- /dev/null +++ b/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.csv.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Tracking Compute-Intensive AI Models + description: |- + A dataset that tracks compute-intensive AI models, with training compute over 10²³ floating point operations (FLOP). This corresponds to training costs of hundreds of thousands of dollars or more.  + + To identify compute-intensive AI models, the team at Epoch AI used various resources, estimating compute when not directly reported. They included benchmarks and repositories, such as Papers With Code and Hugging Face, to find models exceeding 10²³ FLOP. They also explored non-English media and specific leaderboards, particularly focusing on Chinese sources. + + Additionally, they examined blog posts, press releases from major labs, and scholarly literature to track new models. A separate table was created for models with unconfirmed but plausible compute levels. Despite thorough methods, proprietary and secretive models may have been missed. + date_published: "2024-06-19" + + # Citation + producer: Epoch + citation_full: |- + Robi Rahman, David Owen and Josh You (2024), "Tracking Compute-Intensive AI Models". Published online at epochai.org. 
Retrieved from: 'https://epoch.ai/blog/tracking-compute-intensive-ai-models' [online resource] + + # Files + url_main: https://epoch.ai/blog/tracking-compute-intensive-ai-models + url_download: https://epoch.ai/data/epochdb/large_scale_ai_models.csv + date_accessed: 2024-12-05 + + # License + license: + name: CC BY 4.0 + url: https://epoch.ai/blog/how-much-does-it-cost-to-train-frontier-ai-models + +outs: + - md5: c52df75e59048128dc8288a0467f3f4c + size: 484868 + path: epoch_compute_intensive.csv diff --git a/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.py b/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.py new file mode 100644 index 00000000000..fdbd7822e4a --- /dev/null +++ b/snapshots/artificial_intelligence/2024-12-05/epoch_compute_intensive.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"artificial_intelligence/{SNAPSHOT_VERSION}/epoch_compute_intensive.csv") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 5f8b155e7fd..d103b37e950 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -9,12 +9,12 @@ meta: citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - date_accessed: 2024-12-06 - date_published: 2024-12-06 + date_accessed: 2024-12-11 + date_published: 2024-12-11 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license outs: - - md5: 9b62fdbd7e71f568534c2d084dd4dec2 - size: 12765621 + - md5: fc6f8b908a2988b2d8048707526c460a + size: 12799310 path: weekly_wildfires.csv diff --git a/snapshots/covid/latest/cases_deaths.csv.dvc b/snapshots/covid/latest/cases_deaths.csv.dvc index 4ca66b5b7b9..247507293dc 100644 --- a/snapshots/covid/latest/cases_deaths.csv.dvc +++ b/snapshots/covid/latest/cases_deaths.csv.dvc @@ -22,12 +22,12 @@ meta: version_producer: WHO COVID-19 Dashboard - Daily cases and deaths url_main: https://covid19.who.int/ url_download: https://srhdpeuwpubsa.blob.core.windows.net/whdh/COVID/WHO-COVID-19-global-daily-data.csv - date_accessed: 2024-12-06 + date_accessed: 2024-12-11 date_published: '2024-07-07' license: name: CC BY 4.0 url: https://data.who.int/dashboards/covid19/ outs: - - md5: be6d52c03f3b02890e73a145ec3582c9 - size: 19463648 + - md5: 16914ffd0a8531ef26e28bc0578eb491 + size: 19539571 path: cases_deaths.csv diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index ec0c30e1fb0..29102377237 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,7 +13,7 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). 
url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-12-06 + date_accessed: 2024-12-11 publication_date: 2024-11-11 publication_year: 2024 published_by: |- diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index 0c1ec7ee8fe..d7b4d86e4fd 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-12-06 + date_accessed: 2024-12-11 publication_date: '2021-06-30' publication_year: 2021 published_by: |- @@ -33,6 +33,6 @@ meta: name: MIT License url: https://github.com/akarlinsky/world_mortality/blob/main/LICENSE outs: - - md5: 2df7e56f5f496e18a07780bf7dd0af07 - size: 1084531 + - md5: cab03dff0de45a45aae54fe9772c4666 + size: 1087717 path: wmd.csv diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 5795f2432ef..91a48ea6e6d 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-12-06 + date_accessed: 2024-12-11 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index 61915814402..4571161e0bd 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-12-06 + date_accessed: 2024-12-11 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/fasttrack/latest/antimicrobial_usage_livestock.csv.dvc b/snapshots/fasttrack/latest/antimicrobial_usage_livestock.csv.dvc index 059ede68482..e904f870cd9 100644 --- a/snapshots/fasttrack/latest/antimicrobial_usage_livestock.csv.dvc +++ b/snapshots/fasttrack/latest/antimicrobial_usage_livestock.csv.dvc @@ -1,6 +1,6 @@ meta: origin: - producer: Mulchandani et al. (2023) + producer: Mulchandani et al. title: 'Global trends in antimicrobial use in food-producing animals: 2020 to 2030' description: |- Data on usage of antimicrobials in food animals were collected from 42 countries. Multivariate regression models were used in combination with projections of animal counts for cattle, sheep, chicken, and pigs from the Food and Agriculture Organization to estimate global antimicrobial usage of veterinary antimicrobials in 2020 and 2030. Maps of animal densities were used to identify geographic hotspots of antimicrobial use. 
In each country, estimates of antimicrobial use (tonnes) were calibrated to match continental-level reports of antimicrobial use intensity (milligrams per kilogram of animal) from the World Organization for Animal Health, as well as country-level reports of antimicrobial use from countries that made this information publicly available. @@ -8,8 +8,8 @@ meta: Mulchandani, R., Wang, Y., Gilbert, M., & Van Boeckel, T. P. (2023). Global trends in antimicrobial use in food-producing animals: 2020 to 2030. PLOS Global Public Health, 3(2), e0001305. https://doi.org/10.1371/journal.pgph.0001305 url_main: https://journals.plos.org/globalpublichealth/article?id=10.1371/journal.pgph.0001305 url_download: |- - gAAAAABnRZJtJagGwio4A8B3H6-jsxNa4M3zZtR92Arl_amq659ebrzeaxRovvWxLH4v5lvmoTALQnfL2aC7g2J8SSkjrgOPMLZXqOtoECV4ISER2UrZ9lbQUCsOS9-Cbc52-I1joAi2QwDz0J4UV5ufumvj0rYVGmv6YdPAypyrHbpnd893i21f-l7IULYWBQSdPCrRAUV0RlAXURJuGd1dVadThd_gwBgMdxYmxuh4EUvxSSHVQG_I4pYURJQGNOuNI59qAJa5 - date_accessed: '2024-11-26' + https://docs.google.com/spreadsheets/d/e/2PACX-1vT1GgT43B-J5fD0kqup2QeajeMNLtjo10An4N3OkugtbOn-Q4OIaoI5pC2hsnYroRn8UmVhcczZADHw/pub?output=csv + date_accessed: '2024-12-06' date_published: '2023-02-01' license: name: Open access @@ -17,7 +17,6 @@ meta: description: |- This dataset estimates the usage of antimicrobials in livestock (cattle, sheep, chicken, and pigs) by country. Data on antimicrobials comes from government reports, surveillance systems and national surveys. In addition, the authors estimate the biomass of livestock in the country, to adjust for differences in antimicrobial usage by animal size. Biomass data comes from the Food and Agriculture Organization (FAO). 'The PCU represents the total number of animals in a country (alive or slaughtered), multiplied by the average weight of the animal at the time of treatment. Therefore, the PCU is a standardization metric that accounts for differences in animal weight, and number of production cycles per year between countries.' Therefore, mg/PCU refers to the usage of antimicrobials per animal population-corrected unit. license: {} - is_public: false outs: - md5: e0c44fec35851446ebb61784ce6528e3 size: 8682 diff --git a/snapshots/health/latest/global_health_mpox.csv.dvc b/snapshots/health/latest/global_health_mpox.csv.dvc index f0ac69ac22b..4d27ab747a3 100644 --- a/snapshots/health/latest/global_health_mpox.csv.dvc +++ b/snapshots/health/latest/global_health_mpox.csv.dvc @@ -22,6 +22,6 @@ meta: url: https://global.health/terms-of-use/ outs: - - md5: 7918839d62392e863d680e58aa5b0808 - size: 16733331 + - md5: 08388d2230adafbb7fe28ddcd1eb0dc8 + size: 16813136 path: global_health_mpox.csv diff --git a/snapshots/urbanization/2024-10-14/ghsl_degree_of_urbanisation.xlsx.dvc b/snapshots/urbanization/2024-10-14/ghsl_degree_of_urbanisation.xlsx.dvc index 1701120fca4..9d606d204d9 100644 --- a/snapshots/urbanization/2024-10-14/ghsl_degree_of_urbanisation.xlsx.dvc +++ b/snapshots/urbanization/2024-10-14/ghsl_degree_of_urbanisation.xlsx.dvc @@ -11,10 +11,10 @@ meta: # Citation producer: European Commission, Joint Research Centre (JRC) citation_full: |- - Carioli A., Schiavina M., Melchiorri M. (2024): GHS-COUNTRY-STATS R2024A - GHSL Country Statistics by Degree of Urbanization, multitemporal (1975-2030). 
European Commission, Joint Research Centre (JRC) [Dataset] doi:10.2905/341c0608-5ca5-4ddb-b068-a412e35a3326 PID: http://data.europa.eu/89h/341c0608-5ca5-4ddb-b068-a412e35a3326 + Carioli, Alessandra; Schiavina, Marcello; Melchiorri, Michele (2024): GHS-COUNTRY-STATS R2024A - GHSL Country Statistics by Degree of Urbanization, multitemporal (1975-2030). European Commission, Joint Research Centre (JRC) [Dataset] doi: 10.2905/341c0608-5ca5-4ddb-b068-a412e35a3326 PID: http://data.europa.eu/89h/341c0608-5ca5-4ddb-b068-a412e35a3326 # Files - url_main: https://ghsl.jrc.ec.europa.eu/CFS.php + url_main: https://data.jrc.ec.europa.eu/dataset/341c0608-5ca5-4ddb-b068-a412e35a3326 date_accessed: 2024-10-14 # License diff --git a/snapshots/urbanization/2024-12-02/ghsl_urban_centers.py b/snapshots/urbanization/2024-12-02/ghsl_urban_centers.py new file mode 100644 index 00000000000..4c0ede9b1f5 --- /dev/null +++ b/snapshots/urbanization/2024-12-02/ghsl_urban_centers.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset. This version of the dataset was provided directly by the source via email (DIJKSTRA Lewis ).""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"urbanization/{SNAPSHOT_VERSION}/ghsl_urban_centers.xlsx") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/urbanization/2024-12-02/ghsl_urban_centers.xlsx.dvc b/snapshots/urbanization/2024-12-02/ghsl_urban_centers.xlsx.dvc new file mode 100644 index 00000000000..f2c69bd3b2c --- /dev/null +++ b/snapshots/urbanization/2024-12-02/ghsl_urban_centers.xlsx.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: Global Human Settlement Layer Dataset - Stats in the City Database + description: |- + The "Stats in the City Database" offers harmonized data on population and population density for 11,422 urban centres. + + This data, based on the Global Human Settlement Layer Dataset, uses the Degree of Urbanisation framework to delineate spatial entities and integrates geospatial data from a variety of open-source datasets. It represents one of the most comprehensive resources for understanding urban population patterns and densities worldwide + date_published: "2024" + + # Citation + producer: European Commission, Joint Research Centre (JRC) + citation_full: |- + Center For International Earth Science Information Network-CIESIN-Columbia University. 2018. “Gridded Population of the World, Version 4 (GPWv4): Population Count, Revision 11.” Palisades, NY: NASA Socioeconomic Data and Applications Center (SEDAC). https://doi.org/10.7927/H4JW8BX5 + Pesaresi M., Politis P. 
(2023): GHS-BUILT-V R2023A - GHS built-up volume grids derived from joint assessment of Sentinel2, Landsat, and global DEM data, multitemporal (1975-2030).European Commission, Joint Research Centre (JRC) PID: http://data.europa.eu/89h/ab2f107a-03cd-47a3-85e5-139d8ec63283, doi:10.2905/AB2F107A-03CD-47A3-85E5-139D8EC63283 + Pesaresi M., Politis P. (2023): GHS-BUILT-S R2023A - GHS built-up surface grid, derived from Sentinel2 composite and Landsat, multitemporal (1975-2030)European Commission, Joint Research Centre (JRC) PID: http://data.europa.eu/89h/9f06f36f-4b11-47ec-abb0-4f8b7b1d72ea, doi:10.2905/9F06F36F-4B11-47EC-ABB0-4F8B7B1D72EA + Schiavina M., Freire S., Carioli A., MacManus K. (2023): GHS-POP R2023A - GHS population grid multitemporal (1975-2030).European Commission, Joint Research Centre (JRC) PID: http://data.europa.eu/89h/2ff68a52-5b5b-4a22-8f40-c41da8332cfe, doi:10.2905/2FF68A52-5B5B-4A22-8F40-C41DA8332CFE + Schiavina M., Melchiorri M., Pesaresi M. (2023): GHS-SMOD R2023A - GHS settlement layers, application of the Degree of Urbanisation methodology (stage I) to GHS-POP R2023A and GHS-BUILT-S R2023A, multitemporal (1975-2030)European Commission, Joint Research Centre (JRC) PID: http://data.europa.eu/89h/a0df7a6f-49de-46ea-9bde-563437a6e2ba, doi:10.2905/A0DF7A6F-49DE-46EA-9BDE-563437A6E2BA + + url_main: https://human-settlement.emergency.copernicus.eu/ghs_ucdb_2024.php + date_accessed: 2024-12-02 + + # License + license: + name: CC BY 4.0 + url: https://commission.europa.eu/legal-notice_en +outs: + - md5: 78dbc4fc3cbcbe24cd51fe4f884319e2 + size: 2963003 + path: ghsl_urban_centers.xlsx diff --git a/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc b/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc index 2e560863971..08689d46ba7 100644 --- a/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc +++ b/snapshots/wb/2024-12-03/reproducibility_package_poverty_prosperity_planet.zip.dvc @@ -12,6 +12,7 @@ meta: producer: Lakner et al. citation_full: |- Lakner, C., Genoni, M. E., Stemmler, H., Yonzan, N., & Tetteh Baah, S. K. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024. World Bank. https://doi.org/10.60572/KGE4-CX54 + attribution: Lakner et al. (2024). Reproducibility package for Poverty, Prosperity and Planet Report 2024 # Files url_main: https://reproducibility.worldbank.org/index.php/catalog/189/ diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index 1017ddac368..30b2f569464 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza.
wdir: ../../../data/snapshots/who/latest outs: - - md5: 064d33d3d9c1f4ac31f99faf7bab6541 - size: 167833007 + - md5: 811f5ca9e719e680bc1cde286e599f9d + size: 168107745 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index 1784829679a..6a11439d09e 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. wdir: ../../../data/snapshots/who/latest outs: - - md5: 746f6a9771c420d86bad223b5297c46d - size: 27203557 + - md5: b687f5f92351d148e71bb3b5d60c0c50 + size: 27222953 path: flunet.csv diff --git a/tests/test_datadiff.py b/tests/test_datadiff.py index 316ecfdeaf1..be4466781c1 100644 --- a/tests/test_datadiff.py +++ b/tests/test_datadiff.py @@ -1,3 +1,6 @@ +import os +from unittest.mock import patch + import pandas as pd from owid.catalog import Dataset, DatasetMeta, Table @@ -19,6 +22,7 @@ def _create_datasets(tmp_path): return ds_a, ds_b +@patch.dict(os.environ, {"OWID_STRICT": ""}) def test_DatasetDiff_summary(tmp_path): ds_a, ds_b = _create_datasets(tmp_path) @@ -43,6 +47,7 @@ def test_DatasetDiff_summary(tmp_path): ] +@patch.dict(os.environ, {"OWID_STRICT": ""}) def test_new_data(tmp_path): ds_a, ds_b = _create_datasets(tmp_path) diff --git a/tests/test_steps.py b/tests/test_steps.py index ff266f1917d..5693fcd05fd 100644 --- a/tests/test_steps.py +++ b/tests/test_steps.py @@ -15,6 +15,7 @@ from unittest.mock import patch import pandas as pd +import requests from owid.catalog import Dataset from etl import paths @@ -162,7 +163,11 @@ def test_select_dirty_steps(): def test_get_etag(): - etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md") + try: + etag = get_etag("https://raw.githubusercontent.com/owid/owid-grapher/master/README.md") + # ignore SSL errors + except requests.exceptions.SSLError: + return assert etag diff --git a/vscode_extensions/find-latest-etl-step/src/extension.ts b/vscode_extensions/find-latest-etl-step/src/extension.ts index bb1f0f2bdd6..8b3277d7fd7 100644 --- a/vscode_extensions/find-latest-etl-step/src/extension.ts +++ b/vscode_extensions/find-latest-etl-step/src/extension.ts @@ -115,9 +115,7 @@ function findFiles(dir: string, ig: any): { path: string, date: Date | 'latest', const excludeFolders = [ path.join('etl', 'data'), path.join('etl', 'export'), - path.join('snapshots', 'backport'), - 'snapshots_archive', - path.join('etl', 'steps', 'archive') + path.join('snapshots', 'backport') ]; if (excludeFolders.some(excludeFolder => filePath.includes(excludeFolder)) || filePath.includes('__pycache__')) {
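Note on the estimates/projections split used above: the urbanization garden step (ghsl_urban_centers) and the UN largest-cities grapher step both split each indicator into *_estimates and *_projections columns around a cutoff year, keeping a small overlap so that the estimate and projection lines connect in Grapher. A minimal sketch of that pattern, assuming a plain pandas DataFrame with country/year columns (the actual steps operate on owid.catalog Tables and use pr.merge; cutoff and overlap values here mirror the 2024-12-02 urbanization step):

import pandas as pd

START_OF_PROJECTIONS = 2025  # first projected year (assumption, as in the urbanization garden step)
OVERLAP_YEARS = 5  # keep a few shared years so both series meet on the chart


def split_estimates_projections(df: pd.DataFrame, value_columns: list) -> pd.DataFrame:
    # Past estimates: everything strictly before the cutoff year.
    past = df[df["year"] < START_OF_PROJECTIONS].copy()
    # Future projections: start a few years earlier so the two lines overlap.
    future = df[df["year"] >= START_OF_PROJECTIONS - OVERLAP_YEARS].copy()
    for col in value_columns:
        past[f"{col}_estimates"] = past[col]
        future[f"{col}_projections"] = future[col]
        past = past.drop(columns=[col])
        future = future.drop(columns=[col])
    # Merge back into a single table with separate estimate/projection columns.
    return past.merge(future, on=["country", "year"], how="outer")

The *_projections columns are then flagged with display: isProjection: true in the step's metadata, as in the ghsl_urban_centers YAML at the top of this section.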