From 98661be25ba35877a24717a556bcf37e23a022cf Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 6 Nov 2024 09:10:31 +0100 Subject: [PATCH 01/10] =?UTF-8?q?=F0=9F=93=8A=20population:=20exploration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From 5ce9fc3c1be9ffbacf87f5afd1096687facdcbd1 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 6 Nov 2024 09:33:04 +0100 Subject: [PATCH 02/10] wip --- dag/main.yml | 5 + .../latest/population_explore.countries.json | 232 ++++++++++++++++++ .../owid/latest/population_explore.meta.yml | 15 ++ .../garden/owid/latest/population_explore.py | 45 ++++ .../grapher/owid/latest/population_explore.py | 14 ++ .../meadow/owid/latest/population_explore.py | 46 ++++ 6 files changed, 357 insertions(+) create mode 100644 etl/steps/data/garden/owid/latest/population_explore.countries.json create mode 100644 etl/steps/data/garden/owid/latest/population_explore.meta.yml create mode 100644 etl/steps/data/garden/owid/latest/population_explore.py create mode 100644 etl/steps/data/grapher/owid/latest/population_explore.py create mode 100644 etl/steps/data/meadow/owid/latest/population_explore.py diff --git a/dag/main.yml b/dag/main.yml index f2134d3ec1a..6761c67ae3e 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -849,6 +849,11 @@ steps: data://grapher/wpf/2024-10-03/famines_by_place: - data://garden/wpf/2024-10-03/famines_by_place + data://garden/owid/latest/population_explore: + - data://meadow/owid/latest/population_explore + data://grapher/owid/latest/population_explore: + - data://garden/owid/latest/population_explore + include: - dag/open_numbers.yml - dag/faostat.yml diff --git a/etl/steps/data/garden/owid/latest/population_explore.countries.json b/etl/steps/data/garden/owid/latest/population_explore.countries.json new file mode 100644 index 00000000000..3acd3a047e2 --- /dev/null +++ b/etl/steps/data/garden/owid/latest/population_explore.countries.json @@ -0,0 +1,232 @@ +{ + "Afghanistan": 
"Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Cayman Islands": "Cayman Islands", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Channel Islands": "Channel Islands", + "Chile": "Chile", + "China": "China", + "China, Hong Kong SAR": "Hong Kong", + "China, Macao SAR": "Macao", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Faeroe Islands": "Faroe Islands", + 
"Falkland Islands (Malvinas)": "Falkland Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Gibraltar": "Gibraltar", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guam": "Guam", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Holy See": "Vatican", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Mexico": "Mexico", + "Micronesia (Fed. 
States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "Netherlands Antilles": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Northern Mariana Islands": "Northern Mariana Islands", + "Norway": "Norway", + "Occupied Palestinian Territory": "Palestine", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "R\u00e9union": "Reunion", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + 
"Syrian Arab Republic": "Syria", + "TFYR Macedonia": "North Macedonia", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tokelau": "Tokelau", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States Virgin Islands": "United States Virgin Islands", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Western Sahara": "Western Sahara", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Dem. 
People's Republic of Korea": "North Korea", + "Libyan Arab Jamahiriya": "Libya", + "Other non-specified areas": "Others" +} diff --git a/etl/steps/data/garden/owid/latest/population_explore.meta.yml b/etl/steps/data/garden/owid/latest/population_explore.meta.yml new file mode 100644 index 00000000000..1c1e32ce91b --- /dev/null +++ b/etl/steps/data/garden/owid/latest/population_explore.meta.yml @@ -0,0 +1,15 @@ +dataset: + update_period_days: 0 + title: Population (Maddison exploration) + +tables: + population_explore: + variables: + population: + title: Population + description_short: Population of countries + unit: "people" + origins: + - title: Maddison (2024) + date_published: "2024" + producer: Maddison diff --git a/etl/steps/data/garden/owid/latest/population_explore.py b/etl/steps/data/garden/owid/latest/population_explore.py new file mode 100644 index 00000000000..72f2a8cad9a --- /dev/null +++ b/etl/steps/data/garden/owid/latest/population_explore.py @@ -0,0 +1,45 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("population_explore") + + # Read table from meadow dataset. + tb = ds_meadow["population_explore"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb], + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, + formats=["csv", "feather"], + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/grapher/owid/latest/population_explore.py b/etl/steps/data/grapher/owid/latest/population_explore.py new file mode 100644 index 00000000000..c2a096bb625 --- /dev/null +++ b/etl/steps/data/grapher/owid/latest/population_explore.py @@ -0,0 +1,14 @@ +from etl.helpers import PathFinder, create_dataset + +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + ds_garden = paths.load_dataset("population_explore") + + tb = ds_garden["population_explore"] + + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + ds_grapher.save() diff --git a/etl/steps/data/meadow/owid/latest/population_explore.py b/etl/steps/data/meadow/owid/latest/population_explore.py new file mode 100644 index 00000000000..3dabf27c0da --- /dev/null +++ b/etl/steps/data/meadow/owid/latest/population_explore.py @@ -0,0 +1,46 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + path = paths.directory / "population_explore.xlsx" + tb = pr.read_excel(path) + tb = tb.drop(index=range(0, 5)) + + tb = tb.melt( + id_vars=["source"], + var_name="country", + value_name="population", + ).rename(columns={"source": "year"}) + + # Scale + tb["population"] = (tb["population"] * 1000).astype(float).round().astype("Int64") + + # + # Process data. + # + tb = tb.format(["country", "year"], short_name="population_explore") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=[tb], + check_variables_metadata=True, + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() From af2018374e67c8a78a2872d2c04b8bae6f6f9f13 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 6 Nov 2024 12:12:43 +0100 Subject: [PATCH 03/10] wip --- dag/main.yml | 12 ++++--- .../owid/latest/population_explore.meta.yml | 1 + .../meadow/owid/latest/population_explore.py | 5 +-- snapshots/owid/latest/population_explore.py | 35 +++++++++++++++++++ .../owid/latest/population_explore.xlsx.dvc | 29 +++++++++++++++ 5 files changed, 73 insertions(+), 9 deletions(-) create mode 100644 snapshots/owid/latest/population_explore.py create mode 100644 snapshots/owid/latest/population_explore.xlsx.dvc diff --git a/dag/main.yml b/dag/main.yml index 6761c67ae3e..8032b5ffb35 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -821,7 +821,7 @@ steps: data://meadow/wpf/2024-10-03/famines: - snapshot://wpf/2024-10-03/famines.xlsx data://garden/wpf/2024-10-03/famines: - - data://meadow/wpf/2024-10-03/famines + - data://meadow/wpf/2024-10-03/famines - data://garden/regions/2023-01-01/regions data://grapher/wpf/2024-10-03/famines: @@ -849,10 +849,12 @@ steps: data://grapher/wpf/2024-10-03/famines_by_place: - data://garden/wpf/2024-10-03/famines_by_place - data://garden/owid/latest/population_explore: - - data://meadow/owid/latest/population_explore - data://grapher/owid/latest/population_explore: - - data://garden/owid/latest/population_explore + data-private://meadow/owid/latest/population_explore: + - snapshot-private://owid/latest/population_explore.xlsx + data-private://garden/owid/latest/population_explore: + - data-private://meadow/owid/latest/population_explore + data-private://grapher/owid/latest/population_explore: + - data-private://garden/owid/latest/population_explore include: - dag/open_numbers.yml diff --git a/etl/steps/data/garden/owid/latest/population_explore.meta.yml b/etl/steps/data/garden/owid/latest/population_explore.meta.yml index 1c1e32ce91b..5de72f0d6e1 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.meta.yml +++ 
b/etl/steps/data/garden/owid/latest/population_explore.meta.yml @@ -1,6 +1,7 @@ dataset: update_period_days: 0 title: Population (Maddison exploration) + non_redistributable: true tables: population_explore: diff --git a/etl/steps/data/meadow/owid/latest/population_explore.py b/etl/steps/data/meadow/owid/latest/population_explore.py index 3dabf27c0da..1ee8aa3f181 100644 --- a/etl/steps/data/meadow/owid/latest/population_explore.py +++ b/etl/steps/data/meadow/owid/latest/population_explore.py @@ -1,8 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" -import owid.catalog.processing as pr -from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. @@ -14,8 +12,7 @@ def run(dest_dir: str) -> None: # Load inputs. # # Load meadow dataset. - path = paths.directory / "population_explore.xlsx" - tb = pr.read_excel(path) + tb = paths.read_snap_table("population_explore.xlsx") tb = tb.drop(index=range(0, 5)) tb = tb.melt( diff --git a/snapshots/owid/latest/population_explore.py b/snapshots/owid/latest/population_explore.py new file mode 100644 index 00000000000..6af7d3655d3 --- /dev/null +++ b/snapshots/owid/latest/population_explore.py @@ -0,0 +1,35 @@ +from pathlib import Path + +import click +from structlog import get_logger + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + +# Log +log = get_logger() + + +######################################################################################################################## +# TODO: Temporarily using a local file until 2024 revision is released +# The download url should still be the same: +# https://population.un.org/wpp +######################################################################################################################## +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", type=str, help="Path to population local file.") +def main( + upload: bool, + path_to_file: str | None = None, +) -> None: + # Create a new snapshot. + snap = Snapshot(f"owid/{SNAPSHOT_VERSION}/population_explore.xlsx") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/owid/latest/population_explore.xlsx.dvc b/snapshots/owid/latest/population_explore.xlsx.dvc new file mode 100644 index 00000000000..22616da0f81 --- /dev/null +++ b/snapshots/owid/latest/population_explore.xlsx.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + is_public: false + + origin: + # Data product / Snapshot + title: pop + description: |- + pop + date_published: "2024" + + # Citation + producer: "?" + citation_full: "?" 
+ + # Files + url_main: "" + date_accessed: 2024--11-06 + + # License + license: + name: Open Government Licence v3.0 + url: https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/ + +outs: + - md5: 50f6fdb368ab8068dbb19e5c029455a2 + size: 435873 + path: population_explore.xlsx From e0dc67e93f87b109e3accb6f061cd8cb6150d5b0 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 6 Nov 2024 12:16:28 +0100 Subject: [PATCH 04/10] fix typo --- dag/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dag/main.yml b/dag/main.yml index 8032b5ffb35..acd837c877f 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -821,7 +821,7 @@ steps: data://meadow/wpf/2024-10-03/famines: - snapshot://wpf/2024-10-03/famines.xlsx data://garden/wpf/2024-10-03/famines: - - data://meadow/wpf/2024-10-03/famines + - data://meadow/wpf/2024-10-03/famines - data://garden/regions/2023-01-01/regions data://grapher/wpf/2024-10-03/famines: From 1b6c18e12e182d4df0a0cf25ddca94b519788939 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 6 Nov 2024 12:50:12 +0100 Subject: [PATCH 05/10] wip --- dag/main.yml | 1 + .../owid/latest/population_explore.meta.yml | 14 +++++++++----- .../data/garden/owid/latest/population_explore.py | 15 +++++++++++++-- snapshots/owid/latest/population_explore.xlsx.dvc | 2 +- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/dag/main.yml b/dag/main.yml index acd837c877f..b66fe427293 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -853,6 +853,7 @@ steps: - snapshot-private://owid/latest/population_explore.xlsx data-private://garden/owid/latest/population_explore: - data-private://meadow/owid/latest/population_explore + - data://garden/demography/2024-07-15/population data-private://grapher/owid/latest/population_explore: - data-private://garden/owid/latest/population_explore diff --git a/etl/steps/data/garden/owid/latest/population_explore.meta.yml b/etl/steps/data/garden/owid/latest/population_explore.meta.yml index 
5de72f0d6e1..d9c257a1539 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.meta.yml +++ b/etl/steps/data/garden/owid/latest/population_explore.meta.yml @@ -6,11 +6,15 @@ dataset: tables: population_explore: variables: - population: + population_explore: title: Population description_short: Population of countries unit: "people" - origins: - - title: Maddison (2024) - date_published: "2024" - producer: Maddison + population_omm: + title: Population (omm) + description_short: Population of countries + unit: "people" + diff: + title: Population difference + description_short: Population difference + unit: people diff --git a/etl/steps/data/garden/owid/latest/population_explore.py b/etl/steps/data/garden/owid/latest/population_explore.py index 72f2a8cad9a..2b457eaed02 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.py +++ b/etl/steps/data/garden/owid/latest/population_explore.py @@ -1,7 +1,5 @@ """Load a meadow dataset and create a garden dataset.""" -import owid.catalog.processing as pr - from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset @@ -15,9 +13,11 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("population_explore") + ds_omm = paths.load_dataset("population") # Read table from meadow dataset. tb = ds_meadow["population_explore"].reset_index() + tb_omm = ds_omm["population"].reset_index() # # Process data. 
@@ -27,6 +27,17 @@ def run(dest_dir: str) -> None: countries_file=paths.country_mapping_path, ) + # Format OMM column + tb_omm = tb_omm[["country", "year", "population"]] + tb_omm = tb_omm[tb_omm["country"].isin(tb["country"].unique())] + tb_omm = tb_omm[(tb_omm["year"] >= 1800) & (tb_omm["year"] <= 1951)] + tb_omm["population"] = tb_omm["population"].astype("Int64") + + # Merge + tb = tb.merge(tb_omm, on=["country", "year"], suffixes=("_explore", "_omm"), how="outer") + tb["diff"] = tb["population_explore"] - tb["population_omm"] + + # Format tb = tb.format(["country", "year"]) # diff --git a/snapshots/owid/latest/population_explore.xlsx.dvc b/snapshots/owid/latest/population_explore.xlsx.dvc index 22616da0f81..803aa8c62b7 100644 --- a/snapshots/owid/latest/population_explore.xlsx.dvc +++ b/snapshots/owid/latest/population_explore.xlsx.dvc @@ -12,7 +12,7 @@ meta: # Citation producer: "?" - citation_full: "?" + citation_full: "Something here" # Files url_main: "" From 8fe2790e766d2a71258ed10a85d18e28d03a2e1d Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 6 Nov 2024 14:26:17 +0100 Subject: [PATCH 06/10] add separate sources --- dag/main.yml | 3 ++ .../owid/latest/population_explore.meta.yml | 18 ++++++- .../garden/owid/latest/population_explore.py | 52 ++++++++++++++++--- 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/dag/main.yml b/dag/main.yml index b66fe427293..f042959b467 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -854,6 +854,9 @@ steps: data-private://garden/owid/latest/population_explore: - data-private://meadow/owid/latest/population_explore - data://garden/demography/2024-07-15/population + - data://garden/hyde/2017/baseline + - data://garden/gapminder/2019-12-10/population + - data://garden/un/2022-07-11/un_wpp data-private://grapher/owid/latest/population_explore: - data-private://garden/owid/latest/population_explore diff --git a/etl/steps/data/garden/owid/latest/population_explore.meta.yml 
b/etl/steps/data/garden/owid/latest/population_explore.meta.yml index d9c257a1539..aa6c1ba1b93 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.meta.yml +++ b/etl/steps/data/garden/owid/latest/population_explore.meta.yml @@ -6,7 +6,7 @@ dataset: tables: population_explore: variables: - population_explore: + population: title: Population description_short: Population of countries unit: "people" @@ -14,6 +14,22 @@ tables: title: Population (omm) description_short: Population of countries unit: "people" + population_hyde: + title: Population (HYDE) + description_short: Population of countries + unit: "people" + population_gm: + title: Population (gapminder) + description_short: Population of countries + unit: "people" + origins: + - title: Gapminder + url: https://www.gapminder.org/data/ + producer: Gapminder + population_wpp: + title: Population (UN) + description_short: Population of countries + unit: "people" diff: title: Population difference description_short: Population difference diff --git a/etl/steps/data/garden/owid/latest/population_explore.py b/etl/steps/data/garden/owid/latest/population_explore.py index 2b457eaed02..1f674c5d0e6 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.py +++ b/etl/steps/data/garden/owid/latest/population_explore.py @@ -1,11 +1,30 @@ """Load a meadow dataset and create a garden dataset.""" +import owid.catalog.processing as pr + from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. 
paths = PathFinder(__file__) +YEAR_MIN = 1790 +YEAR_MAX = 1955 + + +def standardize_tb(tb, tb_main, col_population: str = "population"): + tb = tb.loc[:, ["country", "year", "population"]] + tb = tb.loc[tb["country"].isin(tb_main["country"].unique())] + tb = tb.loc[(tb["year"] >= YEAR_MIN) & (tb["year"] <= YEAR_MAX)] + tb["population"] = tb["population"].round().astype("Int64") + + tb = tb.rename( + columns={ + "population": col_population, + } + ) + return tb + def run(dest_dir: str) -> None: # @@ -13,11 +32,17 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("population_explore") - ds_omm = paths.load_dataset("population") + ds_omm = paths.load_dataset("population", namespace="demography") + ds_hyde = paths.load_dataset("baseline") + ds_gm = paths.load_dataset("population", namespace="gapminder") + ds_wpp = paths.load_dataset("un_wpp") # Read table from meadow dataset. tb = ds_meadow["population_explore"].reset_index() tb_omm = ds_omm["population"].reset_index() + tb_hyde = ds_hyde["population"].reset_index() + tb_gm = ds_gm["population"].reset_index() + tb_wpp = ds_wpp["population"].reset_index() # # Process data. 
@@ -28,14 +53,27 @@ def run(dest_dir: str) -> None: ) # Format OMM column - tb_omm = tb_omm[["country", "year", "population"]] - tb_omm = tb_omm[tb_omm["country"].isin(tb["country"].unique())] - tb_omm = tb_omm[(tb_omm["year"] >= 1800) & (tb_omm["year"] <= 1951)] - tb_omm["population"] = tb_omm["population"].astype("Int64") + tb_omm = standardize_tb(tb_omm, tb, "population_omm") + # Format HYDE + tb_hyde = standardize_tb(tb_hyde, tb, "population_hyde") + # Format Gapminder + tb_gm = standardize_tb(tb_gm, tb, "population_gm") + # Format WPP + tb_wpp = tb_wpp[ + (tb_wpp["age"] == "all") + & (tb_wpp["sex"] == "all") + & (tb_wpp["metric"] == "population") + & (tb_wpp["variant"] == "estimates") + ].rename(columns={"location": "country", "value": "population"}) + tb_wpp = standardize_tb(tb_wpp, tb, "population_wpp") # Merge - tb = tb.merge(tb_omm, on=["country", "year"], suffixes=("_explore", "_omm"), how="outer") - tb["diff"] = tb["population_explore"] - tb["population_omm"] + tb = pr.multi_merge( + tables=[tb, tb_omm, tb_hyde, tb_gm, tb_wpp], + on=["country", "year"], + how="outer", + ) + tb["diff"] = tb["population"] - tb["population_omm"] # Format tb = tb.format(["country", "year"]) From 653377b097f8aa11c577cf44e8b51fd999b8be96 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 6 Nov 2024 14:48:50 +0100 Subject: [PATCH 07/10] wip --- .../owid/latest/population_explore.meta.yml | 17 +++++++++++++++++ .../garden/owid/latest/population_explore.py | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/etl/steps/data/garden/owid/latest/population_explore.meta.yml b/etl/steps/data/garden/owid/latest/population_explore.meta.yml index aa6c1ba1b93..31a30b27521 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.meta.yml +++ b/etl/steps/data/garden/owid/latest/population_explore.meta.yml @@ -34,3 +34,20 @@ tables: title: Population difference description_short: Population difference unit: people + + population_hyde_cut: + title: Population (HYDE), cut + 
description_short: Population of countries + unit: "people" + population_gm_cut: + title: Population (gapminder), cut + description_short: Population of countries + unit: "people" + origins: + - title: Gapminder + url: https://www.gapminder.org/data/ + producer: Gapminder + population_wpp_cut: + title: Population (UN), cut + description_short: Population of countries + unit: "people" diff --git a/etl/steps/data/garden/owid/latest/population_explore.py b/etl/steps/data/garden/owid/latest/population_explore.py index 1f674c5d0e6..4a955f3299d 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.py +++ b/etl/steps/data/garden/owid/latest/population_explore.py @@ -75,6 +75,11 @@ def run(dest_dir: str) -> None: ) tb["diff"] = tb["population"] - tb["population_omm"] + # Add cut versions + tb["population_hyde_cut"] = tb.loc[tb["year"] <= 1800, "population_hyde"] + tb["population_gm_cut"] = tb.loc[(tb["year"] >= 1801) & (tb["year"] <= 1950), "population_gm"] + tb["population_wpp_cut"] = tb.loc[tb["year"] >= 1950, "population_wpp"] + # Format tb = tb.format(["country", "year"]) From 5ee2d0e389f24331a9732a75530e84cecec54ced Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 12 Nov 2024 19:04:16 +0100 Subject: [PATCH 08/10] relevant countries --- .../garden/owid/latest/population_explore.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/etl/steps/data/garden/owid/latest/population_explore.py b/etl/steps/data/garden/owid/latest/population_explore.py index 4a955f3299d..5d6bd133243 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.py +++ b/etl/steps/data/garden/owid/latest/population_explore.py @@ -11,6 +11,101 @@ YEAR_MIN = 1790 YEAR_MAX = 1955 +# Countries that are relevant for the analysis +COUNTRIES_RELEVANT = [ + "Afghanistan", + "Albania", + "Algeria", + "Andorra", + "Angola", + "Armenia", + "Azerbaijan", + "Bahrain", + "Bangladesh", + "Belarus", + "British Virgin Islands", + "Burundi", + "Cambodia", + "Cameroon", + 
"Central African Republic", + "Chad", + "Comoros", + "South Korea", + "North Korea", + "Democratic Republic of Congo", + "Djibouti", + "Eritrea", + "Falkland Islands", + "Gabon", + "Georgia", + "Guam", + "Iceland", + "India", + "Iran", + "Iraq", + "Iran", + "Ireland", + "Japan", + "Kazakhstan", + "Kenya", + "Kuwait", + "Kyrgyzstan", + "Laos", + "Latvia", + "Libya", + "Liechtenstein", + "Madagascar", + "Malawi", + "Mali", + "Marshall Islands", + "Mauritania", + "Micronesia (country)", # TODO + "Namibia", + "Nauru", + "New Caledonia", + "New Zealand", + "Niger", + "Nigeria", + "Niue", + # "Northern Mariana Islands", # TODO + # "Palestine", # TODO + "Oman", + "Palau", + "Paraquay", + "Qatar", + # "Russia", # TODO + "Rwanda", + "Reunion", + "Saint Helena", + "Saint Kitts and Nevis", + "Saint Lucia", + "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines", + "Samoa", + "San Marino", + "Sao Tome and Principe", + "Saudi Arabia", + "Senegal", + "Serbia", + "Sierra Leone", + "Slovakia", + "Solomon Islands", + "Somalia", + "South Africa", + "Sudan", + "Tajikistan", + "Thailand", + "Tonga", + "Turkmenistan", + "Uganda", + "Ukraine", + "United Arab Emirates", + "Uzbekistan", + "Vanuatu", + "Yemen", + "Zambia", +] + def standardize_tb(tb, tb_main, col_population: str = "population"): tb = tb.loc[:, ["country", "year", "population"]] @@ -80,6 +175,9 @@ def run(dest_dir: str) -> None: tb["population_gm_cut"] = tb.loc[(tb["year"] >= 1801) & (tb["year"] <= 1950), "population_gm"] tb["population_wpp_cut"] = tb.loc[tb["year"] >= 1950, "population_wpp"] + # Filter relevant countries + tb = tb.loc[tb["country"].isin(COUNTRIES_RELEVANT)] + # Format tb = tb.format(["country", "year"]) From cb480687d12cb7caaf83d7887d79e3bb815ae656 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 25 Nov 2024 11:19:27 +0100 Subject: [PATCH 09/10] show all countries --- etl/steps/data/garden/owid/latest/population_explore.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/etl/steps/data/garden/owid/latest/population_explore.py b/etl/steps/data/garden/owid/latest/population_explore.py index 5d6bd133243..64128301579 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.py +++ b/etl/steps/data/garden/owid/latest/population_explore.py @@ -38,6 +38,7 @@ "Falkland Islands", "Gabon", "Georgia", + "Germany", "Guam", "Iceland", "India", @@ -176,7 +177,7 @@ def run(dest_dir: str) -> None: tb["population_wpp_cut"] = tb.loc[tb["year"] >= 1950, "population_wpp"] # Filter relevant countries - tb = tb.loc[tb["country"].isin(COUNTRIES_RELEVANT)] + # tb = tb.loc[tb["country"].isin(COUNTRIES_RELEVANT)] # Format tb = tb.format(["country", "year"]) From 95b5fb9978467947f2aade4b2afb09c6953ea987 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 9 Jan 2025 14:57:48 +0100 Subject: [PATCH 10/10] =?UTF-8?q?=F0=9F=93=8A=20Update=20population=20data?= =?UTF-8?q?set=20with=202025=20projections=20and=20improve=20processing=20?= =?UTF-8?q?logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dag/main.yml | 1 + .../latest/population_explore.countries.json | 3 +- .../owid/latest/population_explore.meta.yml | 7 +++- .../garden/owid/latest/population_explore.py | 2 +- .../meadow/owid/latest/population_explore.py | 35 +++++++++++++------ snapshots/owid/latest/population_explore.py | 20 +++++++---- .../latest/population_explore_2025.xlsx.dvc | 29 +++++++++++++++ 7 files changed, 77 insertions(+), 20 deletions(-) create mode 100644 snapshots/owid/latest/population_explore_2025.xlsx.dvc diff --git a/dag/main.yml b/dag/main.yml index 282ed5d5bf2..ded0d039dee 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -700,6 +700,7 @@ steps: data-private://meadow/owid/latest/population_explore: - snapshot-private://owid/latest/population_explore.xlsx + - snapshot-private://owid/latest/population_explore_2025.xlsx data-private://garden/owid/latest/population_explore: - data-private://meadow/owid/latest/population_explore 
- data://garden/demography/2024-07-15/population diff --git a/etl/steps/data/garden/owid/latest/population_explore.countries.json b/etl/steps/data/garden/owid/latest/population_explore.countries.json index 3acd3a047e2..6521646124a 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.countries.json +++ b/etl/steps/data/garden/owid/latest/population_explore.countries.json @@ -228,5 +228,6 @@ "Zimbabwe": "Zimbabwe", "Dem. People's Republic of Korea": "North Korea", "Libyan Arab Jamahiriya": "Libya", - "Other non-specified areas": "Others" + "Other non-specified areas": "Others", + "TAIWAN": "Taiwan" } diff --git a/etl/steps/data/garden/owid/latest/population_explore.meta.yml b/etl/steps/data/garden/owid/latest/population_explore.meta.yml index 31a30b27521..1f739e67705 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.meta.yml +++ b/etl/steps/data/garden/owid/latest/population_explore.meta.yml @@ -7,7 +7,11 @@ tables: population_explore: variables: population: - title: Population + title: Population (f & t) + description_short: Population of countries + unit: "people" + population_v2: + title: Population (f & t), v2 description_short: Population of countries unit: "people" population_omm: @@ -30,6 +34,7 @@ tables: title: Population (UN) description_short: Population of countries unit: "people" + diff: title: Population difference description_short: Population difference diff --git a/etl/steps/data/garden/owid/latest/population_explore.py b/etl/steps/data/garden/owid/latest/population_explore.py index 64128301579..4586e69e073 100644 --- a/etl/steps/data/garden/owid/latest/population_explore.py +++ b/etl/steps/data/garden/owid/latest/population_explore.py @@ -169,7 +169,7 @@ def run(dest_dir: str) -> None: on=["country", "year"], how="outer", ) - tb["diff"] = tb["population"] - tb["population_omm"] + tb["diff"] = tb["population_v2"] - tb["population_omm"] # Add cut versions tb["population_hyde_cut"] = tb.loc[tb["year"] <= 1800, 
"population_hyde"] diff --git a/etl/steps/data/meadow/owid/latest/population_explore.py b/etl/steps/data/meadow/owid/latest/population_explore.py index 1ee8aa3f181..1391efb4b12 100644 --- a/etl/steps/data/meadow/owid/latest/population_explore.py +++ b/etl/steps/data/meadow/owid/latest/population_explore.py @@ -1,6 +1,5 @@ """Load a meadow dataset and create a garden dataset.""" - from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. @@ -13,20 +12,18 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. tb = paths.read_snap_table("population_explore.xlsx") - tb = tb.drop(index=range(0, 5)) - - tb = tb.melt( - id_vars=["source"], - var_name="country", - value_name="population", - ).rename(columns={"source": "year"}) - - # Scale - tb["population"] = (tb["population"] * 1000).astype(float).round().astype("Int64") + tb2 = paths.read_snap_table("population_explore_2025.xlsx") # # Process data. # + tb = process_table(tb) + tb2 = process_table(tb2) + + # Combine + tb = tb.merge(tb2, on=["year", "country"], how="outer", suffixes=("", "_v2")) + + # Format tb = tb.format(["country", "year"], short_name="population_explore") # @@ -41,3 +38,19 @@ def run(dest_dir: str) -> None: # Save changes in the new garden dataset. 
ds_garden.save() + + +def process_table(tb): + tb = tb.drop(index=range(0, 5)) + tb = tb.dropna(axis=1, how="all") + + tb = tb.melt( + id_vars=["source"], + var_name="country", + value_name="population", + ).rename(columns={"source": "year"}) + + # Scale + tb["population"] = (tb["population"].astype(float) * 1000).round().astype("Int64") + + return tb diff --git a/snapshots/owid/latest/population_explore.py b/snapshots/owid/latest/population_explore.py index 6af7d3655d3..a10f4623a70 100644 --- a/snapshots/owid/latest/population_explore.py +++ b/snapshots/owid/latest/population_explore.py @@ -19,16 +19,24 @@ ######################################################################################################################## @click.command() @click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") -@click.option("--path-to-file", type=str, help="Path to population local file.") +@click.option("-f1", type=str, help="Path to population local file.") +@click.option("-f2", type=str, help="Path to population local file.") def main( upload: bool, - path_to_file: str | None = None, + f1: str | None = None, + f2: str | None = None, ) -> None: - # Create a new snapshot. - snap = Snapshot(f"owid/{SNAPSHOT_VERSION}/population_explore.xlsx") + if f1 is not None: + # Create a new snapshot. + snap = Snapshot(f"owid/{SNAPSHOT_VERSION}/population_explore.xlsx") - # Copy local data file to snapshots data folder, add file to DVC and upload to S3. - snap.create_snapshot(filename=path_to_file, upload=upload) + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=f1, upload=upload) + if f2 is not None: + snap = Snapshot(f"owid/{SNAPSHOT_VERSION}/population_explore_2025.xlsx") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=f2, upload=upload) if __name__ == "__main__": diff --git a/snapshots/owid/latest/population_explore_2025.xlsx.dvc b/snapshots/owid/latest/population_explore_2025.xlsx.dvc new file mode 100644 index 00000000000..b202068399f --- /dev/null +++ b/snapshots/owid/latest/population_explore_2025.xlsx.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + is_public: false + + origin: + # Data product / Snapshot + title: pop + description: |- + pop + date_published: "2024" + + # Citation + producer: "?" + citation_full: "Something here" + + # Files + url_main: "" + date_accessed: 2024-11-06 + + # License + license: + name: Open Government Licence v3.0 + url: https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/ + +outs: + - md5: d0330a5337f75732b6950e3327f2afed + size: 449383 + path: population_explore_2025.xlsx