From 931ce739019754cdf2dc5989e40d968e63227f06 Mon Sep 17 00:00:00 2001
From: Pablo Rosado <pabloarosado@gmail.com>
Date: Mon, 27 May 2024 11:18:18 +0200
Subject: [PATCH] Add data from Floud et al. (2011) (WIP)

---
 dag/agriculture.yml                           | 11 ++-
 .../2024-05-23/daily_calories_per_person.py   |  4 +
 .../2024-05-23/floud_et_al_2011.py            | 45 +++++++++++
 .../2024-05-23/harris_et_al_2015.py           | 14 ++--
 .../2024-05-23/floud_et_al_2011.py            | 75 +++++++++++++++++++
 ...d_et_al_2011_daily_calories_europe.csv.dvc | 30 ++++++++
 ...floud_et_al_2011_daily_calories_us.csv.dvc | 30 ++++++++
 7 files changed, 199 insertions(+), 10 deletions(-)
 create mode 100644 etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py
 create mode 100644 snapshots/agriculture/2024-05-23/floud_et_al_2011.py
 create mode 100644 snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc
 create mode 100644 snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc

diff --git a/dag/agriculture.yml b/dag/agriculture.yml
index 665b895630f..16e76b97374 100644
--- a/dag/agriculture.yml
+++ b/dag/agriculture.yml
@@ -78,11 +78,20 @@ steps:
   #
   data://meadow/agriculture/2024-05-23/harris_et_al_2015:
     - snapshot://agriculture/2024-05-23/harris_et_al_2015.csv
+  #
+  # Floud et al. (2011) - Daily calories in United States and Western Europe.
+  #
+  data://meadow/agriculture/2024-05-23/floud_et_al_2011:
+    - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv
+    - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv
+  #
+  # Agriculture - Long-run daily calorie supply per person.
+  #
   data://garden/agriculture/2024-05-23/daily_calories_per_person:
     - data://meadow/agriculture/2024-05-23/harris_et_al_2015
+    - data://meadow/agriculture/2024-05-23/floud_et_al_2011
   data://grapher/agriculture/2024-05-23/daily_calories_per_person:
     - data://garden/agriculture/2024-05-23/daily_calories_per_person
-
   ######################################################################################################################
   # Older versions to be archived once they are not used by any other steps.
   ######################################################################################################################
diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py
index 5566f7ec0f2..3bf3a4201b1 100644
--- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py
+++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py
@@ -16,6 +16,10 @@ def run(dest_dir: str) -> None:
     ds_harris = paths.load_dataset("harris_et_al_2015")
     tb_harris = ds_harris["harris_et_al_2015"].reset_index()
 
+    # Load Floud et al. (2011) dataset and read its main table.
+    ds_floud = paths.load_dataset("floud_et_al_2011")
+    tb_floud = ds_floud["floud_et_al_2011"].reset_index()
+
     #
     # Process data.
     #
diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py b/etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py
new file mode 100644
index 00000000000..b9542acb88c
--- /dev/null
+++ b/etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py
@@ -0,0 +1,49 @@
+"""Load a snapshot and create a meadow dataset."""
+
+import owid.catalog.processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshots.
+    snap_europe = paths.load_snapshot("floud_et_al_2011_daily_calories_europe.csv")
+    snap_us = paths.load_snapshot("floud_et_al_2011_daily_calories_us.csv")
+
+    # Load data from snapshots.
+    tb_europe = snap_europe.read()
+    tb_us = snap_us.read()
+
+    #
+    # Process data.
+    #
+    # Transform Europe data to have a year column.
+    tb_europe = tb_europe.melt(id_vars=["country"], var_name="year", value_name="daily_calories")
+
+    # After melting, years come from the CSV header and are therefore strings, whereas the US years are read as
+    # integers. Cast them to integers so that the combined "year" column has a single, sortable dtype.
+    tb_europe["year"] = tb_europe["year"].astype(int)
+
+    # Prepare US data.
+    tb_us = tb_us.rename(columns={"Year": "year", "Calories": "daily_calories"}, errors="raise").assign(
+        **{"country": "United States"}
+    )
+
+    # Combine both tables.
+    tb = pr.concat([tb_europe, tb_us], ignore_index=True)
+
+    # Format table conveniently.
+    tb = tb.format(["country", "year"], short_name=paths.short_name)
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+    ds_meadow.save()
diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py
index 2720bef8d18..da4abd44e6b 100644
--- a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py
+++ b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py
@@ -10,23 +10,19 @@ def run(dest_dir: str) -> None:
     #
     # Load inputs.
     #
-    # Retrieve snapshot.
+    # Retrieve snapshot and read its data.
     snap = paths.load_snapshot("harris_et_al_2015.csv")
-
-    # Load data from snapshot.
     tb = snap.read()
 
     #
     # Process data.
     #
-    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
-    tb = tb.format(["country", "year"])
+    # Format table conveniently.
+    tb = tb.format(["years", "source"])
 
     #
     # Save outputs.
     #
-    # Create a new meadow dataset with the same metadata as the snapshot.
-    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
-
-    # Save changes in the new meadow dataset.
+    # Create a new meadow dataset.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
     ds_meadow.save()
diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011.py b/snapshots/agriculture/2024-05-23/floud_et_al_2011.py
new file mode 100644
index 00000000000..0b201795ce9
--- /dev/null
+++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011.py
@@ -0,0 +1,75 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+import pandas as pd
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Initialize new snapshots for daily caloric intake in the US and in Western Europe.
+    snap_us = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_us.csv")
+    snap_europe = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_europe.csv")
+
+    # Data from Table 6.6 on US daily caloric intake, extracted using ChatGPT 4o (and manually inspected).
+    data_us = """
+Year,Calories
+1800,2952
+1810,2935
+1820,2904
+1830,2888
+1840,3013
+1850,2585
+1860,2826
+1870,3029
+1880,3237
+1890,3134
+1900,3212
+1910,3068
+1920,3259
+1930,3400
+1940,3300
+1952,3200
+1960,3100
+1970,3200
+1980,3200
+1990,3500
+2000,3900
+2004,3900
+    """
+
+    # Parse the extracted text into a dataframe, skipping the blank first and last lines (values are kept as strings).
+    data_us_parsed = [line.split(",") for line in data_us.split("\n")[1:-1]]
+    df_us = pd.DataFrame(data_us_parsed[1:], columns=data_us_parsed[0])
+
+    # Data from Table 5.5 on Western Europe daily caloric intake, extracted using ChatGPT 4o (and manually inspected).
+    data_europe = """
+country,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,1900,1910,1920,1930,1940,1950,1960
+Belgium,2840,,,,,2423,2426,2553,2663,2851,2987,3278,,2940,,,3040
+England,2436,,,,,2512,,,2773,,,2977,,2810,3060,3120,3280
+Finland,,,,,,,1900,,,,,3000,,2950,,,3110
+France,1846,,1984,2118,2377,2840,2854,3085,3085,3220,3192,3323,3133,,,,3050
+Germany,2210,,,,,,2120,,,,,,,,,,2960
+Iceland,,,2887,,3080,3381,,2573,3002,3106,3316,3499,,,,,
+Italy,,,,,,,,2647,2197,2119,,2617,,2627,,,2730
+Netherlands,,,,,,,2227,,2493,,2721,,,,,,
+Norway,,1800,,,2250,,3300,,,,,,,,,,2930
+    """
+    # Parse the extracted text into a dataframe, skipping the blank first and last lines (values are kept as strings).
+    data_europe_parsed = [line.split(",") for line in data_europe.split("\n")[1:-1]]
+    df_europe = pd.DataFrame(data_europe_parsed[1:], columns=data_europe_parsed[0])
+
+    # Create snapshots.
+    snap_us.create_snapshot(upload=upload, data=df_us)
+    snap_europe.create_snapshot(upload=upload, data=df_europe)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc
new file mode 100644
index 00000000000..044d7a4e0e7
--- /dev/null
+++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc
@@ -0,0 +1,30 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: The Changing Body
+    title_snapshot: The Changing Body - Daily calories in Western Europe
+    description: |-
+      This dataset contains the estimates of daily caloric intake in the United States (Table 6.6) and Western Europe (Table 5.5) of "The Changing Body", by Floud et al. (2011).
+    date_published: "2011-03-31"
+
+    # Citation
+    producer: Floud et al.
+    citation_full: |-
+      Floud, R., Fogel, R. W., Harris, B. and Hong, S. C. (2011), "The Changing Body," Cambridge Books, Cambridge University Press, number 9780521879750.
+      Data extracted from Tables 5.5 and 6.6.
+    attribution_short: Floud et al. (2011)
+
+    # Files
+    url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E
+    date_accessed: 2024-05-27
+
+    # License
+    license:
+      name: © Cambridge University Press 2011
+      url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E
+outs:
+  - md5: 4f31506ded236dc72a590695f8868a1c
+    size: 554
+    path: floud_et_al_2011_daily_calories_europe.csv
diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc
new file mode 100644
index 00000000000..3573e2923e6
--- /dev/null
+++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc
@@ -0,0 +1,30 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: The Changing Body
+    title_snapshot: The Changing Body - Daily calories in United States
+    description: |-
+      This dataset contains the estimates of daily caloric intake in the United States (Table 6.6) and Western Europe (Table 5.5) of "The Changing Body", by Floud et al. (2011).
+    date_published: "2011-03-31"
+
+    # Citation
+    producer: Floud et al.
+    citation_full: |-
+      Floud, R., Fogel, R. W., Harris, B. and Hong, S. C. (2011), "The Changing Body," Cambridge Books, Cambridge University Press, number 9780521879750.
+      Data extracted from Tables 5.5 and 6.6.
+    attribution_short: Floud et al. (2011)
+
+    # Files
+    url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E
+    date_accessed: 2024-05-27
+
+    # License
+    license:
+      name: © Cambridge University Press 2011
+      url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E
+outs:
+  - md5: 4316767b9de23caf9710fe44caff5ec9
+    size: 234
+    path: floud_et_al_2011_daily_calories_us.csv