From 931ce739019754cdf2dc5989e40d968e63227f06 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 27 May 2024 11:18:18 +0200 Subject: [PATCH] Add data from Floud et al. (2011) (WIP) --- dag/agriculture.yml | 11 ++- .../2024-05-23/daily_calories_per_person.py | 4 + .../2024-05-23/floud_et_al_2011.py | 45 +++++++++++ .../2024-05-23/harris_et_al_2015.py | 14 ++-- .../2024-05-23/floud_et_al_2011.py | 75 +++++++++++++++++++ ...d_et_al_2011_daily_calories_europe.csv.dvc | 30 ++++++++ ...floud_et_al_2011_daily_calories_us.csv.dvc | 30 ++++++++ 7 files changed, 199 insertions(+), 10 deletions(-) create mode 100644 etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py create mode 100644 snapshots/agriculture/2024-05-23/floud_et_al_2011.py create mode 100644 snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc create mode 100644 snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc diff --git a/dag/agriculture.yml b/dag/agriculture.yml index 665b895630f..16e76b97374 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -78,11 +78,20 @@ steps: # data://meadow/agriculture/2024-05-23/harris_et_al_2015: - snapshot://agriculture/2024-05-23/harris_et_al_2015.csv + # + # Floud et al. (2011) - Daily calories in United States and Western Europe. + # + data://meadow/agriculture/2024-05-23/floud_et_al_2011: + - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv + - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv + # + # Agriculture - Long-run daily calorie supply per person. 
def run(dest_dir: str) -> None:
    """Create the meadow dataset for Floud et al. (2011) daily calories.

    Reads the Western Europe and United States snapshots, brings both into the same long
    layout (``country``, ``year``, ``daily_calories``), concatenates them into one table and
    saves it as a meadow dataset.

    Parameters
    ----------
    dest_dir : str
        Handed to ``create_dataset`` unchanged, as the destination of the new dataset.
    """
    #
    # Load inputs.
    #
    # Retrieve snapshots.
    snap_europe = paths.load_snapshot("floud_et_al_2011_daily_calories_europe.csv")
    snap_us = paths.load_snapshot("floud_et_al_2011_daily_calories_us.csv")

    # Load data from snapshots.
    tb_europe = snap_europe.read()
    tb_us = snap_us.read()

    #
    # Process data.
    #
    # Transform Europe data (one column per year) to have a year column.
    # NOTE: Countries without an estimate for a given year are kept as rows with a missing value.
    tb_europe = tb_europe.melt(id_vars=["country"], var_name="year", value_name="daily_calories")

    # Prepare US data (one row per year; the country is implicit in the source table).
    tb_us = tb_us.rename(columns={"Year": "year", "Calories": "daily_calories"}, errors="raise").assign(
        **{"country": "United States"}
    )

    # Combine both tables.
    tb = pr.concat([tb_europe, tb_us], ignore_index=True)

    # After melting, the Europe years are the former column labels of the csv file, i.e. text like "1800",
    # whereas the US years are cell values (presumably read as integers - confirm against snap.read()).
    # Make the column uniformly integer so that the index built below does not mix strings and numbers.
    tb["year"] = tb["year"].astype(int)

    # Format table conveniently.
    tb = tb.format(["country", "year"], short_name=paths.short_name)

    #
    # Save outputs.
    #
    # Create a new meadow dataset.
    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
    ds_meadow.save()
def _csv_text_to_dataframe(csv_text: str) -> pd.DataFrame:
    """Turn a small comma-separated text block (header line first) into a dataframe.

    Leading/trailing whitespace and blank lines are ignored, so the result does not depend on
    how the triple-quoted literal is laid out in the source code. Every cell is kept as text
    exactly as written; cells with no value become empty strings.

    Parameters
    ----------
    csv_text : str
        Comma-separated text whose first non-blank line holds the column names.

    Returns
    -------
    pd.DataFrame
        One row per data line, with string-valued cells.

    Raises
    ------
    ValueError
        If the text contains no non-blank line (and hence no header).
    """
    rows = [line.strip().split(",") for line in csv_text.strip().splitlines() if line.strip()]
    if not rows:
        raise ValueError("Expected a header line followed by data rows, but the text is empty.")
    header, *records = rows
    return pd.DataFrame(records, columns=header)


# Create the two snapshots (United States and Western Europe) of daily caloric intake from Floud et al. (2011).
# NOTE: This description is a comment rather than a docstring on purpose: click would otherwise display a
# docstring as the command's help text.
@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Initialize new snapshots for daily caloric intake in the US and in Western Europe.
    snap_us = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_us.csv")
    snap_europe = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_europe.csv")

    # Data from Table 6.6 on US daily caloric intake, extracted using chatGPT 4o (and manually inspected).
    data_us = """
Year,Calories
1800,2952
1810,2935
1820,2904
1830,2888
1840,3013
1850,2585
1860,2826
1870,3029
1880,3237
1890,3134
1900,3212
1910,3068
1920,3259
1930,3400
1940,3300
1952,3200
1960,3100
1970,3200
1980,3200
1990,3500
2000,3900
2004,3900
    """

    # Data from Table 5.5 on Western Europe daily caloric intake, extracted using chatGPT 4o (and manually inspected).
    data_europe = """
country,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,1900,1910,1920,1930,1940,1950,1960
Belgium,2840,,,,,2423,2426,2553,2663,2851,2987,3278,,2940,,,3040
England,2436,,,,,2512,,,2773,,,2977,,2810,3060,3120,3280
Finland,,,,,,,1900,,,,,3000,,2950,,,3110
France,1846,,1984,2118,2377,2840,2854,3085,3085,3220,3192,3323,3133,,,,3050
Germany,2210,,,,,,2120,,,,,,,,,,2960
Iceland,,,2887,,3080,3381,,2573,3002,3106,3316,3499,,,,,
Italy,,,,,,,,2647,2197,2119,,2617,,2627,,,2730
Netherlands,,,,,,,2227,,2493,,2721,,,,,,
Norway,,1800,,,2250,,3300,,,,,,,,,,2930
    """

    # Create dataframes with the extracted data (same parsing for both tables).
    df_us = _csv_text_to_dataframe(data_us)
    df_europe = _csv_text_to_dataframe(data_europe)

    # Create snapshots.
    snap_us.create_snapshot(upload=upload, data=df_us)
    snap_europe.create_snapshot(upload=upload, data=df_europe)
+ Data extracted from Tables 5.5 and 6.6. + attribution_short: Floud et al. (2011) + + # Files + url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E + date_accessed: 2024-05-27 + + # License + license: + name: © Cambridge University Press 2011 + url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E +outs: + - md5: 4f31506ded236dc72a590695f8868a1c + size: 554 + path: floud_et_al_2011_daily_calories_europe.csv diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc new file mode 100644 index 00000000000..3573e2923e6 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The Changing Body + title_snapshot: The Changing Body - Daily calories in United States + description: |- + This dataset contains the estimates on the daily caloric intake in the United States (Table 6.6) and Western Europe (Table 5.5) of "The Changing Body", by Floud et al. (2011). + date_published: "2011-03-31" + + # Citation + producer: Floud et al. + citation_full: |- + Floud, R., Fogel, R. W., Harris, B. and Hong, S. C. (2011), "The Changing Body," Cambridge Books, Cambridge University Press, number 9780521879750. + Data extracted from Tables 5.5 and 6.6. + attribution_short: Floud et al. (2011) + + # Files + url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E + date_accessed: 2024-05-27 + + # License + license: + name: © Cambridge University Press 2011 + url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E +outs: + - md5: 4316767b9de23caf9710fe44caff5ec9 + size: 234 + path: floud_et_al_2011_daily_calories_us.csv