Skip to content

Commit

Permalink
Add data from Floud et al. (2011) (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
pabloarosado committed May 27, 2024
1 parent e750a5a commit 931ce73
Show file tree
Hide file tree
Showing 7 changed files with 199 additions and 10 deletions.
11 changes: 10 additions & 1 deletion dag/agriculture.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,20 @@ steps:
#
data://meadow/agriculture/2024-05-23/harris_et_al_2015:
- snapshot://agriculture/2024-05-23/harris_et_al_2015.csv
#
# Floud et al. (2011) - Daily calories in the United States and Western Europe.
#
data://meadow/agriculture/2024-05-23/floud_et_al_2011:
- snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv
- snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv
#
# Agriculture - Long-run daily calorie supply per person.
#
data://garden/agriculture/2024-05-23/daily_calories_per_person:
- data://meadow/agriculture/2024-05-23/harris_et_al_2015
- data://meadow/agriculture/2024-05-23/floud_et_al_2011
data://grapher/agriculture/2024-05-23/daily_calories_per_person:
- data://garden/agriculture/2024-05-23/daily_calories_per_person

######################################################################################################################
# Older versions to be archived once they are not used by any other steps.
######################################################################################################################
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ def run(dest_dir: str) -> None:
ds_harris = paths.load_dataset("harris_et_al_2015")
tb_harris = ds_harris["harris_et_al_2015"].reset_index()

# Load Floud et al. (2011) dataset and read its main table.
ds_floud = paths.load_dataset("floud_et_al_2011")
tb_floud = ds_floud["floud_et_al_2011"].reset_index()

#
# Process data.
#
Expand Down
45 changes: 45 additions & 0 deletions etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Load a snapshot and create a meadow dataset."""

import owid.catalog.processing as pr

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the meadow dataset with daily calories from Floud et al. (2011).

    Reads the Western Europe and United States snapshots, brings both to a common
    long format (country, year, daily_calories), combines them into a single table
    and saves it as a meadow dataset.

    Args:
        dest_dir: Destination passed on to ``create_dataset`` for the new dataset.
    """
    #
    # Load inputs.
    #
    # Retrieve snapshots.
    snap_europe = paths.load_snapshot("floud_et_al_2011_daily_calories_europe.csv")
    snap_us = paths.load_snapshot("floud_et_al_2011_daily_calories_us.csv")

    # Load data from snapshots.
    tb_europe = snap_europe.read()
    tb_us = snap_us.read()

    #
    # Process data.
    #
    # Transform Europe data to have a year column (one row per country-year).
    tb_europe = tb_europe.melt(id_vars=["country"], var_name="year", value_name="daily_calories")

    # After melting, the years are the former column labels, which are expected to be strings
    # (they come from the CSV header). Convert them to integers so that they are consistent
    # with the years of the US data; otherwise the combined year column would mix types.
    tb_europe["year"] = tb_europe["year"].astype(int)

    # Prepare US data: harmonize column names and add a country column.
    tb_us = tb_us.rename(columns={"Year": "year", "Calories": "daily_calories"}, errors="raise").assign(
        **{"country": "United States"}
    )

    # Combine both tables.
    tb = pr.concat([tb_europe, tb_us], ignore_index=True)

    # Format table conveniently.
    tb = tb.format(["country", "year"], short_name=paths.short_name)

    #
    # Save outputs.
    #
    # Create a new meadow dataset.
    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
    ds_meadow.save()
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,19 @@ def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Retrieve snapshot.
# Retrieve snapshot and read its data.
snap = paths.load_snapshot("harris_et_al_2015.csv")

# Load data from snapshot.
tb = snap.read()

#
# Process data.
#
# Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
tb = tb.format(["country", "year"])
# Format table conveniently.
tb = tb.format(["years", "source"])

#
# Save outputs.
#
# Create a new meadow dataset with the same metadata as the snapshot.
ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)

# Save changes in the new meadow dataset.
# Create a new meadow dataset.
ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
ds_meadow.save()
75 changes: 75 additions & 0 deletions snapshots/agriculture/2024-05-23/floud_et_al_2011.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click
import pandas as pd

from etl.snapshot import Snapshot

# Version for current snapshot dataset, taken from the name of the folder that contains this script.
SNAPSHOT_VERSION = Path(__file__).parent.name


def _table_from_csv_text(csv_text: str) -> pd.DataFrame:
    """Build a dataframe from an inline CSV literal.

    The literal is expected to start and end with a line break, so the first and last
    pieces after splitting by line are dropped. The first remaining line is used as the
    header. Values are kept as raw strings (empty cells stay as empty strings).
    """
    header, *records = (row.split(",") for row in csv_text.split("\n")[1:-1])
    return pd.DataFrame(records, columns=header)


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    """Create the US and Western Europe daily calories snapshots from data typed inline."""
    # One snapshot per table of the book: US and Western Europe daily caloric intake.
    snap_us = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_us.csv")
    snap_europe = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_europe.csv")

    # Table 6.6 (US daily caloric intake), extracted using ChatGPT-4o and manually inspected.
    data_us = """
Year,Calories
1800,2952
1810,2935
1820,2904
1830,2888
1840,3013
1850,2585
1860,2826
1870,3029
1880,3237
1890,3134
1900,3212
1910,3068
1920,3259
1930,3400
1940,3300
1952,3200
1960,3100
1970,3200
1980,3200
1990,3500
2000,3900
2004,3900
"""
    df_us = _table_from_csv_text(data_us)

    # Table 5.5 (Western Europe daily caloric intake), extracted using ChatGPT-4o and manually inspected.
    data_europe = """
country,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,1900,1910,1920,1930,1940,1950,1960
Belgium,2840,,,,,2423,2426,2553,2663,2851,2987,3278,,2940,,,3040
England,2436,,,,,2512,,,2773,,,2977,,2810,3060,3120,3280
Finland,,,,,,,1900,,,,,3000,,2950,,,3110
France,1846,,1984,2118,2377,2840,2854,3085,3085,3220,3192,3323,3133,,,,3050
Germany,2210,,,,,,2120,,,,,,,,,,2960
Iceland,,,2887,,3080,3381,,2573,3002,3106,3316,3499,,,,,
Italy,,,,,,,,2647,2197,2119,,2617,,2627,,,2730
Netherlands,,,,,,,2227,,2493,,2721,,,,,,
Norway,,1800,,,2250,,3300,,,,,,,,,,2930
"""
    df_europe = _table_from_csv_text(data_europe)

    # Create both snapshots (US first, then Europe).
    for snap, df in ((snap_us, df_us), (snap_europe, df_europe)):
        snap.create_snapshot(upload=upload, data=df)


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
# Data product / Snapshot
title: The Changing Body
title_snapshot: The Changing Body - Daily calories in Western Europe
description: |-
This dataset contains estimates of daily caloric intake in the United States (Table 6.6) and Western Europe (Table 5.5) from "The Changing Body", by Floud et al. (2011).
date_published: "2011-03-31"

# Citation
producer: Floud et al.
citation_full: |-
Floud, R., Fogel, R. W., Harris, B. and Hong, S. C. (2011), "The Changing Body," Cambridge Books, Cambridge University Press, number 9780521879750.
Data extracted from Tables 5.5 and 6.6.
attribution_short: Floud et al. (2011)

# Files
url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E
date_accessed: 2024-05-27

# License
license:
name: © Cambridge University Press 2011
url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E
outs:
- md5: 4f31506ded236dc72a590695f8868a1c
size: 554
path: floud_et_al_2011_daily_calories_europe.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
# Data product / Snapshot
title: The Changing Body
title_snapshot: The Changing Body - Daily calories in the United States
description: |-
This dataset contains estimates of daily caloric intake in the United States (Table 6.6) and Western Europe (Table 5.5) from "The Changing Body", by Floud et al. (2011).
date_published: "2011-03-31"

# Citation
producer: Floud et al.
citation_full: |-
Floud, R., Fogel, R. W., Harris, B. and Hong, S. C. (2011), "The Changing Body," Cambridge Books, Cambridge University Press, number 9780521879750.
Data extracted from Tables 5.5 and 6.6.
attribution_short: Floud et al. (2011)

# Files
url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E
date_accessed: 2024-05-27

# License
license:
name: © Cambridge University Press 2011
url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E
outs:
- md5: 4316767b9de23caf9710fe44caff5ec9
size: 234
path: floud_et_al_2011_daily_calories_us.csv

0 comments on commit 931ce73

Please sign in to comment.