From 98d3f5361599c5ad1ba37a9fbd520a9e62306362 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 23 May 2024 15:57:02 +0200 Subject: [PATCH 01/17] Rename caloric intake to calorie supply in FAOSTAT additional_variables dataset --- .../2024-03-14/additional_variables.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py index 8617d95e0e7..da5b1056487 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py @@ -496,7 +496,7 @@ def generate_food_available_for_consumption(tb_fbsc: Table) -> Table: ) tb_food_available_for_consumption[ underscore(group) - ].metadata.title = f"Daily caloric intake per person from {group.lower().replace('other', 'other commodities')}" + ].metadata.title = f"Daily calorie supply per person from {group.lower().replace('other', 'other commodities')}" tb_food_available_for_consumption[underscore(group)].metadata.unit = CONSUMPTION_UNIT tb_food_available_for_consumption[underscore(group)].metadata.short_unit = "kcal" tb_food_available_for_consumption[underscore(group)].metadata.description_key = [description] @@ -582,37 +582,37 @@ def generate_macronutrient_compositions(tb_fbsc: Table) -> Table: # Combine all tables. combined = pr.multi_merge(tables=tables, on=["country", "year"], how="outer") - # Daily caloric intake from fat, per person. + # Daily calorie supply from fat, per person. combined["Total energy from fat"] = combined["Total fat"] * KCAL_PER_GRAM_OF_FAT - # Daily caloric intake from protein, per person. + # Daily calorie supply from protein, per person. combined["Total energy from protein"] = combined["Total protein"] * KCAL_PER_GRAM_OF_PROTEIN - # Daily caloric intake from carbohydrates (assumed to be the rest of the daily caloric intake), per person. 
- # This is the difference between the total caloric intake minus the caloric intake from protein and fat. + # Daily calorie supply from carbohydrates (assumed to be the rest of the daily calorie supply), per person. + # This is the difference between the total calorie supply minus the calorie supply from protein and fat. combined["Total energy from carbohydrates"] = ( combined["Total energy"] - combined["Total energy from fat"] - combined["Total energy from protein"] ) - # Daily intake of carbohydrates per person. + # Daily supply of carbohydrates per person. combined["Total carbohydrates"] = combined["Total energy from carbohydrates"] / KCAL_PER_GRAM_OF_CARBOHYDRATES - # Caloric intake from fat as a percentage of the total daily caloric intake. + # Calorie supply from fat as a percentage of the total daily calorie supply. combined["Share of energy from fat"] = 100 * combined["Total energy from fat"] / combined["Total energy"] - # Caloric intake from protein as a percentage of the total daily caloric intake. + # Calorie supply from protein as a percentage of the total daily calorie supply. combined["Share of energy from protein"] = 100 * combined["Total energy from protein"] / combined["Total energy"] - # Caloric intake from carbohydrates as a percentage of the total daily caloric intake. + # Calorie supply from carbohydrates as a percentage of the total daily calorie supply. combined["Share of energy from carbohydrates"] = ( 100 * combined["Total energy from carbohydrates"] / combined["Total energy"] ) - # Daily caloric intake from animal protein. + # Daily calorie supply from animal protein. combined["Energy from animal protein"] = combined["Protein from animal products"] * KCAL_PER_GRAM_OF_PROTEIN - # Caloric intake from animal protein as a percentage of the total daily caloric intake. + # Calorie supply from animal protein as a percentage of the total daily calorie supply. 
combined["Share of energy from animal protein"] = ( 100 * combined["Energy from animal protein"] / combined["Total energy"] ) - # Daily caloric intake from vegetal protein. + # Daily calorie supply from vegetal protein. combined["Energy from vegetal protein"] = combined["Protein from vegetal products"] * KCAL_PER_GRAM_OF_PROTEIN - # Caloric intake from vegetal protein as a percentage of the total daily caloric intake. + # Calorie supply from vegetal protein as a percentage of the total daily calorie supply. combined["Share of energy from vegetal protein"] = ( 100 * combined["Energy from vegetal protein"] / combined["Total energy"] ) From e0da7c07c4fd2b3d81f4ab9e9967a42527758ff6 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 23 May 2024 16:24:55 +0200 Subject: [PATCH 02/17] Replace more instances of caloric intake by calorie supply --- .../2024-03-14/additional_variables.meta.yml | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml index d24da7afa40..e8d7eb60289 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml @@ -107,83 +107,83 @@ tables: - Protein of animal origin includes protein supplied in the form of all meat commodities, eggs and dairy products, and fish & seafood. 
variables: energy_from_animal_products: - title: "Daily caloric intake per person from animal products" + title: "Daily calorie supply per person from animal products" unit: "kilocalories per day per capita" short_unit: "kcal" energy_from_animal_protein: - title: "Daily caloric intake per person that comes from animal protein" + title: "Daily calorie supply per person that comes from animal protein" unit: "kilocalories per day per capita" short_unit: "kcal" energy_from_vegetal_products: - title: "Daily caloric intake per person from vegetal products" + title: "Daily calorie supply per person from vegetal products" unit: "kilocalories per day per capita" short_unit: "kcal" energy_from_vegetal_protein: - title: "Daily caloric intake per person that comes from vegetal protein" + title: "Daily calorie supply per person that comes from vegetal protein" unit: "kilocalories per day per capita" short_unit: "kcal" fat_from_animal_products: - title: "Daily fat intake per person from animal products" + title: "Daily fat supply per person from animal products" unit: "grams per day per capita" short_unit: "g" fat_from_vegetal_products: - title: "Daily fat intake per person from vegetal products" + title: "Daily fat supply per person from vegetal products" unit: "grams per day per capita" short_unit: "g" protein_from_animal_products: - title: "Daily protein intake from animal products" + title: "Daily protein supply from animal products" unit: "grams per day per capita" short_unit: "g" protein_from_vegetal_products: - title: "Daily protein intake per person from vegetal products" + title: "Daily protein supply per person from vegetal products" unit: "grams per day per capita" short_unit: "g" share_of_energy_from_animal_protein: - title: "Share of the daily caloric intake that comes from animal protein" + title: "Share of the daily calorie supply that comes from animal protein" unit: "%" short_unit: "%" share_of_energy_from_carbohydrates: - title: "Share of the daily caloric 
intake that comes from carbohydrates" + title: "Share of the daily calorie supply that comes from carbohydrates" unit: "%" short_unit: "%" share_of_energy_from_fat: - title: "Share of the daily caloric intake that comes from fat" + title: "Share of the daily calorie supply that comes from fat" unit: "%" short_unit: "%" share_of_energy_from_protein: - title: "Share of the daily caloric intake that comes from protein" + title: "Share of the daily calorie supply that comes from protein" unit: "%" short_unit: "%" share_of_energy_from_vegetal_protein: - title: "Share of the daily caloric intake that comes from vegetal protein" + title: "Share of the daily calorie supply that comes from vegetal protein" unit: "%" short_unit: "%" total_carbohydrates: - title: "Daily carbohydrates intake per person" + title: "Daily carbohydrates supply per person" unit: "grams per day per capita" short_unit: "g" total_energy: - title: "Daily caloric intake per person" + title: "Daily calorie supply per person" unit: "kilocalories per day per capita" short_unit: "kcal" total_energy_from_carbohydrates: - title: "Daily caloric intake per person from carbohydrates" + title: "Daily calorie supply per person from carbohydrates" unit: "kilocalories per day per capita" short_unit: "kcal" total_energy_from_fat: - title: "Daily caloric intake per person from fat" + title: "Daily calorie supply per person from fat" unit: "kilocalories per day per capita" short_unit: "kcal" total_energy_from_protein: - title: "Daily caloric intake per person from protein" + title: "Daily calorie supply per person from protein" unit: "kilocalories per day per capita" short_unit: "kcal" total_fat: - title: "Daily fat intake per person" + title: "Daily fat supply per person" unit: "grams per day per capita" short_unit: "g" total_protein: - title: "Daily protein intake per person" + title: "Daily protein supply per person" unit: "grams per day per capita" short_unit: "g" fertilizers: From 
d5a13f8cd28c3ffed36a08f5a86736a12d52a376 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 23 May 2024 18:05:17 +0200 Subject: [PATCH 03/17] Add snapshot of Harris et al (2015) and start meadow and garden steps (WIP) --- dag/agriculture.yml | 8 ++ .../daily_calories_per_person.countries.json | 1 + .../2024-05-23/daily_calories_per_person.py | 33 +++++++ .../2024-05-23/daily_calories_per_person.yml | 15 ++++ .../2024-05-23/harris_et_al_2015.py | 32 +++++++ .../2024-05-23/harris_et_al_2015.csv.dvc | 30 +++++++ .../2024-05-23/harris_et_al_2015.py | 89 +++++++++++++++++++ 7 files changed, 208 insertions(+) create mode 100644 etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json create mode 100644 etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py create mode 100644 etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.yml create mode 100644 etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py create mode 100644 snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc create mode 100644 snapshots/agriculture/2024-05-23/harris_et_al_2015.py diff --git a/dag/agriculture.yml b/dag/agriculture.yml index fc24e0cb998..609fca68a6c 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -73,6 +73,14 @@ steps: data://grapher/wb/2024-03-26/food_prices_for_nutrition: - data://garden/wb/2024-03-26/food_prices_for_nutrition + # + # Harris et al. (2015) - Daily calories in England and Wales, according to various studies. + # + data://meadow/agriculture/2024-05-23/harris_et_al_2015: + - snapshot://agriculture/2024-05-23/harris_et_al_2015.csv + data://garden/agriculture/2024-05-23/daily_calories_per_person: + - data://meadow/agriculture/2024-05-23/harris_et_al_2015 + ###################################################################################################################### # Older versions to be archived once they are not used by any other steps. 
###################################################################################################################### diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json new file mode 100644 index 00000000000..0967ef424bc --- /dev/null +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json @@ -0,0 +1 @@ +{} diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py new file mode 100644 index 00000000000..5566f7ec0f2 --- /dev/null +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -0,0 +1,33 @@ +"""TODO: Explain this step. + +""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load Harris et al. (2015) dataset and read its main table. + ds_harris = paths.load_dataset("harris_et_al_2015") + tb_harris = ds_harris["harris_et_al_2015"].reset_index() + + # + # Process data. + # + # TODO: Continue processing. + tb = tb_harris.copy() + + # Set an appropriate index and sort conveniently. + tb = tb.format(short_name=paths.short_name) + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.yml b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.yml new file mode 100644 index 00000000000..b89a93f9b85 --- /dev/null +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.yml @@ -0,0 +1,15 @@ +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Food supply + +dataset: + update_period_days: 365 + +tables: + daily_calories_per_person: + title: Daily calory supply per person + variables: + {} diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py new file mode 100644 index 00000000000..2720bef8d18 --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("harris_et_al_2015.csv") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() diff --git a/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc b/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc new file mode 100644 index 00000000000..730901ccbcb --- /dev/null +++ b/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Daily calories in England and Wales according to various authors + description: |- + This dataset contains the table in the appendix of Harris et al. (2015) paper: "How Many Calories? Food Availability in England and Wales in the Eighteenth and Nineteenth Centuries". + That table contains a compilation of daily calorie (supply or consumption) in England and Wales, according to various different studies. + date_published: "2015-04-22" + + # Citation + producer: Harris et al. + citation_full: |- + Harris, B., Floud, R. and Hong, S.C. (2015), "How Many Calories? Food Availability in England and Wales in the Eighteenth and Nineteenth Centuries", Research in Economic History (Research in Economic History, Vol. 31), Emerald Group Publishing Limited, Leeds, pp. 111-191. https://doi.org/10.1108/S0363-326820150000031003 + Data extracted from the Appendix. + attribution_short: Harris et al. 
(2015) + + # Files + url_main: https://www.emerald.com/insight/content/doi/10.1108/S0363-326820150000031003/full/html + date_accessed: 2024-05-23 + + # License + license: + name: © Emerald Group Publishing Limited 2015 + url: https://www.emerald.com/insight/content/doi/10.1108/S0363-326820150000031003/full/html +outs: + - md5: 79d314aa6815574e11146337336ee10b + size: 2050 + path: harris_et_al_2015.csv diff --git a/snapshots/agriculture/2024-05-23/harris_et_al_2015.py b/snapshots/agriculture/2024-05-23/harris_et_al_2015.py new file mode 100644 index 00000000000..e1f1ebe0d1d --- /dev/null +++ b/snapshots/agriculture/2024-05-23/harris_et_al_2015.py @@ -0,0 +1,89 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/harris_et_al_2015.csv") + + # Data extracted from chatGPT 4o (and manually inspected and corrected). + data = """ + 1270/1279, Broadberry et al. (2015), 2203 + 1300, Allen (2005), 1791 + 1300, Overton and Campbell (1996), n/a + 1300/1309, Broadberry et al. (2015), 2056 + 1310/1319, Broadberry et al. (2015), 1998 + 1380, Overton and Campbell (1996), n/a + 1380/1389, Broadberry et al. (2015), 2467 + 1420/1429, Broadberry et al. (2015), 2146 + 1450/1459, Broadberry et al. (2015), 2176 + 1500, Allen (2005), 3397 + 1600, Muldrew (2011), 3062 + 1600, Overton and Campbell (1996), n/a + 1600/1609, Broadberry et al. (2015), 2104 + 1650/1659, Broadberry et al. (2015), 1945 + 1700, Allen (2005), 3255 + 1700, Floud et al. 
(2011) (Estimates A and B), 2230 + 1700, Fogel (2004), 2095 + 1700, Meredith and Oxley (2014), 2557 + 1700, Muldrew (2011), 3579 + 1700, Overton and Campbell (1996), n/a + 1700/1709, Broadberry et al. (2015), 2187 + 1750, Allen (2005), 3803 + 1750, Floud et al. (2011) (Estimate A; with correction), 2328 + 1750, Floud et al. (2011) (Estimate B; with correction), 2516 + 1750, Fogel (2004), 2168 + 1750, Kelly and Ó Gráda (2013b), 2914-2949 + 1750/1759, Broadberry et al. (2015), 2178 + 1770, Kelly and Ó Gráda (2013b), 3542-3547 + 1770, Meredith and Oxley (2014), 3271 + 1770, Muldrew (2011), 5047 + 1800, Allen (2005), 2938 + 1800, Floud et al. (Estimate A), 2472 + 1800, Floud et al. (Estimate B), 2439 + 1800, Fogel (2004), 2237 + 1800, Kelly and Ó Gráda (2013b) (Estimate A), 2941-2956 + 1800, Kelly and Ó Gráda (2013b) (Estimate B), 2749-2794 + 1800, Meredith and Oxley (2014), 2620 + 1800, Muldrew (2011), 3977 + 1800, Overton and Campbell (1996), n/a + 1800/1809, Broadberry et al. (2015), 2175 + 1830, Overton and Campbell (1996), n/a + 1830/1839, Broadberry et al. (2015), 1950 + 1840/1849, Broadberry et al. (2015), 2166 + 1850, Allen (2005), 2525 + 1850, Floud et al. (2011) (Estimate A), 2505 + 1850, Floud et al. (2011) (Estimate B)/Meredith and Oxley (2013), 2545 + 1850, Fogel (2004), 2362 + 1850/1859, Broadberry et al. (2015), 2111 + 1861/1870, Broadberry et al. (2015), 2463 + 1871, Overton and Campbell (1996), n/a + 1909/13, Floud et al. (2011) & Meredith and Oxley (2014), 2977 + 1909/13, Fogel (2004), 2857 + 1954/55, Fogel (2004), 3231 + 1961, Fogel (2004), 3170 + 1965, Fogel (2004), 3304 + 1989, Fogel (2004), 3149 + """ + + # Create a dataframe with the extracted data. + data_parsed = [[item.strip() for item in line.split(",")] for line in data.split("\n")[1:-1]] + df = pd.DataFrame(data_parsed, columns=["Years", "Source", "Total"]) + + # Create snapshot. 
+ snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() From 1373b1535dc4236508aa8648528fdec98e7205d0 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Fri, 24 May 2024 09:18:58 +0200 Subject: [PATCH 04/17] Add grapher step (WIP) --- dag/agriculture.yml | 2 ++ .../2024-05-23/daily_calories_per_person.py | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 etl/steps/data/grapher/agriculture/2024-05-23/daily_calories_per_person.py diff --git a/dag/agriculture.yml b/dag/agriculture.yml index 609fca68a6c..665b895630f 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -80,6 +80,8 @@ steps: - snapshot://agriculture/2024-05-23/harris_et_al_2015.csv data://garden/agriculture/2024-05-23/daily_calories_per_person: - data://meadow/agriculture/2024-05-23/harris_et_al_2015 + data://grapher/agriculture/2024-05-23/daily_calories_per_person: + - data://garden/agriculture/2024-05-23/daily_calories_per_person ###################################################################################################################### # Older versions to be archived once they are not used by any other steps. diff --git a/etl/steps/data/grapher/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/grapher/agriculture/2024-05-23/daily_calories_per_person.py new file mode 100644 index 00000000000..d56cf6fcb70 --- /dev/null +++ b/etl/steps/data/grapher/agriculture/2024-05-23/daily_calories_per_person.py @@ -0,0 +1,22 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = paths.load_dataset("daily_calories_per_person") + tb = ds_garden["daily_calories_per_person"] + + # + # Save outputs. + # + # Create a new grapher dataset. 
+ ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() From 931ce739019754cdf2dc5989e40d968e63227f06 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 27 May 2024 11:18:18 +0200 Subject: [PATCH 05/17] Add data from Floud et al. (2011) (WIP) --- dag/agriculture.yml | 11 ++- .../2024-05-23/daily_calories_per_person.py | 4 + .../2024-05-23/floud_et_al_2011.py | 45 +++++++++++ .../2024-05-23/harris_et_al_2015.py | 14 ++-- .../2024-05-23/floud_et_al_2011.py | 75 +++++++++++++++++++ ...d_et_al_2011_daily_calories_europe.csv.dvc | 30 ++++++++ ...floud_et_al_2011_daily_calories_us.csv.dvc | 30 ++++++++ 7 files changed, 199 insertions(+), 10 deletions(-) create mode 100644 etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py create mode 100644 snapshots/agriculture/2024-05-23/floud_et_al_2011.py create mode 100644 snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc create mode 100644 snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc diff --git a/dag/agriculture.yml b/dag/agriculture.yml index 665b895630f..16e76b97374 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -78,11 +78,20 @@ steps: # data://meadow/agriculture/2024-05-23/harris_et_al_2015: - snapshot://agriculture/2024-05-23/harris_et_al_2015.csv + # + # Floud et al. (2011) - Daily calories in United States and Western Europe. + # + data://meadow/agriculture/2024-05-23/floud_et_al_2011: + - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv + - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv + # + # Agriculture - Long-run daily calorie supply per person. 
+ # data://garden/agriculture/2024-05-23/daily_calories_per_person: - data://meadow/agriculture/2024-05-23/harris_et_al_2015 + - data://meadow/agriculture/2024-05-23/floud_et_al_2011 data://grapher/agriculture/2024-05-23/daily_calories_per_person: - data://garden/agriculture/2024-05-23/daily_calories_per_person - ###################################################################################################################### # Older versions to be archived once they are not used by any other steps. ###################################################################################################################### diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index 5566f7ec0f2..3bf3a4201b1 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -16,6 +16,10 @@ def run(dest_dir: str) -> None: ds_harris = paths.load_dataset("harris_et_al_2015") tb_harris = ds_harris["harris_et_al_2015"].reset_index() + # Load Floud et al. (2011) dataset and read its main table. + ds_floud = paths.load_dataset("floud_et_al_2011") + tb_floud = ds_floud["floud_et_al_2011"].reset_index() + # # Process data. # diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py b/etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py new file mode 100644 index 00000000000..b9542acb88c --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py @@ -0,0 +1,45 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots. 
+ snap_europe = paths.load_snapshot("floud_et_al_2011_daily_calories_europe.csv") + snap_us = paths.load_snapshot("floud_et_al_2011_daily_calories_us.csv") + + # Load data from snapshots. + tb_europe = snap_europe.read() + tb_us = snap_us.read() + + # + # Process data. + # + # Transform Europe data to have a year column. + tb_europe = tb_europe.melt(id_vars=["country"], var_name="year", value_name="daily_calories") + + # Prepare US data. + tb_us = tb_us.rename(columns={"Year": "year", "Calories": "daily_calories"}, errors="raise").assign( + **{"country": "United States"} + ) + + # Combine both tables. + tb = pr.concat([tb_europe, tb_us], ignore_index=True) + + # Format table conveniently. + tb = tb.format(["country", "year"], short_name=paths.short_name) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py index 2720bef8d18..da4abd44e6b 100644 --- a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py +++ b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py @@ -10,23 +10,19 @@ def run(dest_dir: str) -> None: # # Load inputs. # - # Retrieve snapshot. + # Retrieve snapshot and read its data. snap = paths.load_snapshot("harris_et_al_2015.csv") - - # Load data from snapshot. tb = snap.read() # # Process data. # - # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. - tb = tb.format(["country", "year"]) + # Format table conveniently. + tb = tb.format(["years", "source"]) # # Save outputs. # - # Create a new meadow dataset with the same metadata as the snapshot. - ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) - - # Save changes in the new meadow dataset. + # Create a new meadow dataset. 
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) ds_meadow.save() diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011.py b/snapshots/agriculture/2024-05-23/floud_et_al_2011.py new file mode 100644 index 00000000000..0b201795ce9 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011.py @@ -0,0 +1,75 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Initialize new snapshots for daily caloric intake in the US and in Western Europe. + snap_us = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_us.csv") + snap_europe = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_europe.csv") + + # Data from Table 6.6 on US daily caloric intake, extracted using chatGPT 4o (and manually inspected). + data_us = """ +Year,Calories +1800,2952 +1810,2935 +1820,2904 +1830,2888 +1840,3013 +1850,2585 +1860,2826 +1870,3029 +1880,3237 +1890,3134 +1900,3212 +1910,3068 +1920,3259 +1930,3400 +1940,3300 +1952,3200 +1960,3100 +1970,3200 +1980,3200 +1990,3500 +2000,3900 +2004,3900 + """ + + # Create a dataframe with the extracted data. + data_us_parsed = [line.split(",") for line in data_us.split("\n")[1:-1]] + df_us = pd.DataFrame(data_us_parsed[1:], columns=data_us_parsed[0]) + + # Data from Table 5.5 on Western Europe daily caloric intake, extracted using chatGPT 4o (and manually inspected). 
+ data_europe = """ +country,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,1900,1910,1920,1930,1940,1950,1960 +Belgium,2840,,,,,2423,2426,2553,2663,2851,2987,3278,,2940,,,3040 +England,2436,,,,,2512,,,2773,,,2977,,2810,3060,3120,3280 +Finland,,,,,,,1900,,,,,3000,,2950,,,3110 +France,1846,,1984,2118,2377,2840,2854,3085,3085,3220,3192,3323,3133,,,,3050 +Germany,2210,,,,,,2120,,,,,,,,,,2960 +Iceland,,,2887,,3080,3381,,2573,3002,3106,3316,3499,,,,, +Italy,,,,,,,,2647,2197,2119,,2617,,2627,,,2730 +Netherlands,,,,,,,2227,,2493,,2721,,,,,, +Norway,,1800,,,2250,,3300,,,,,,,,,,2930 + """ + # Create a dataframe with the extracted data. + data_europe_parsed = [line.split(",") for line in data_europe.split("\n")[1:-1]] + df_europe = pd.DataFrame(data_europe_parsed[1:], columns=data_europe_parsed[0]) + + # Create snapshots. + snap_us.create_snapshot(upload=upload, data=df_us) + snap_europe.create_snapshot(upload=upload, data=df_europe) + + +if __name__ == "__main__": + main() diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc new file mode 100644 index 00000000000..044d7a4e0e7 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The Changing Body + title_snapshot: The Changing Body - Daily calories in Western Europe + description: |- + This dataset contains the estimates on the daily caloric intake in the United States (Table 6.6) and Western Europe (Table 5.5) of "The Changing Body", by Floud et al. (2011). + date_published: "2011-03-31" + + # Citation + producer: Floud et al. + citation_full: |- + Floud, R., Fogel, R. W., Harris, B. and Hong, S. C. (2011), "The Changing Body," Cambridge Books, Cambridge University Press, number 9780521879750. 
+ Data extracted from Tables 5.5 and 6.6. + attribution_short: Floud et al. (2011) + + # Files + url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E + date_accessed: 2024-05-27 + + # License + license: + name: © Cambridge University Press 2011 + url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E +outs: + - md5: 4f31506ded236dc72a590695f8868a1c + size: 554 + path: floud_et_al_2011_daily_calories_europe.csv diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc new file mode 100644 index 00000000000..3573e2923e6 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The Changing Body + title_snapshot: The Changing Body - Daily calories in United States + description: |- + This dataset contains the estimates on the daily caloric intake in the United States (Table 6.6) and Western Europe (Table 5.5) of "The Changing Body", by Floud et al. (2011). + date_published: "2011-03-31" + + # Citation + producer: Floud et al. + citation_full: |- + Floud, R., Fogel, R. W., Harris, B. and Hong, S. C. (2011), "The Changing Body," Cambridge Books, Cambridge University Press, number 9780521879750. + Data extracted from Tables 5.5 and 6.6. + attribution_short: Floud et al. 
(2011) + + # Files + url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E + date_accessed: 2024-05-27 + + # License + license: + name: © Cambridge University Press 2011 + url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E +outs: + - md5: 4316767b9de23caf9710fe44caff5ec9 + size: 234 + path: floud_et_al_2011_daily_calories_us.csv From ea87b56aa763ec86cf0c6532ba60ac5e64858a04 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 27 May 2024 13:21:22 +0200 Subject: [PATCH 06/17] Add data from Jonsson 1998 --- dag/agriculture.yml | 6 +++ .../2024-05-23/daily_calories_per_person.py | 4 ++ .../agriculture/2024-05-23/jonsson_1998.py | 32 ++++++++++++ .../2024-05-23/harris_et_al_2015.csv.dvc | 3 +- .../2024-05-23/jonsson_1998.csv.dvc | 30 +++++++++++ .../agriculture/2024-05-23/jonsson_1998.py | 50 +++++++++++++++++++ 6 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py create mode 100644 snapshots/agriculture/2024-05-23/jonsson_1998.csv.dvc create mode 100644 snapshots/agriculture/2024-05-23/jonsson_1998.py diff --git a/dag/agriculture.yml b/dag/agriculture.yml index 16e76b97374..24584e57983 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -85,11 +85,17 @@ steps: - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv # + # Jonsson (1998) - Daily calories in Iceland. + # + data://meadow/agriculture/2024-05-23/jonsson_1998: + - snapshot://agriculture/2024-05-23/jonsson_1998.csv + # # Agriculture - Long-run daily calorie supply per person. 
# data://garden/agriculture/2024-05-23/daily_calories_per_person: - data://meadow/agriculture/2024-05-23/harris_et_al_2015 - data://meadow/agriculture/2024-05-23/floud_et_al_2011 + - data://meadow/agriculture/2024-05-23/jonsson_1998 data://grapher/agriculture/2024-05-23/daily_calories_per_person: - data://garden/agriculture/2024-05-23/daily_calories_per_person ###################################################################################################################### diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index 3bf3a4201b1..6dca083cfa5 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -20,6 +20,10 @@ def run(dest_dir: str) -> None: ds_floud = paths.load_dataset("floud_et_al_2011") tb_floud = ds_floud["floud_et_al_2011"].reset_index() + # Load Jonsson (1998) dataset and read its main table. + ds_jonsson = paths.load_dataset("jonsson_1998") + tb_jonsson = ds_jonsson["jonsson_1998"].reset_index() + # # Process data. # diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py b/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py new file mode 100644 index 00000000000..ae84f20378c --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("jonsson_1998.csv") + tb = snap.read() + + # + # Process data. + # + # Add a country column. + tb["country"] = "Iceland" + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. 
+ # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc b/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc index 730901ccbcb..d96fa7c97db 100644 --- a/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc +++ b/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc @@ -3,7 +3,8 @@ meta: origin: # Data product / Snapshot - title: Daily calories in England and Wales according to various authors + title: How Many Calories? Food Availability in England and Wales in the Eighteenth and Nineteenth Centuries + title_snapshot: How Many Calories? Food Availability in England and Wales in the Eighteenth and Nineteenth Centuries - Daily calories in England and Wales description: |- This dataset contains the table in the appendix of Harris et al. (2015) paper: "How Many Calories? Food Availability in England and Wales in the Eighteenth and Nineteenth Centuries". That table contains a compilation of daily calorie (supply or consumption) in England and Wales, according to various different studies. diff --git a/snapshots/agriculture/2024-05-23/jonsson_1998.csv.dvc b/snapshots/agriculture/2024-05-23/jonsson_1998.csv.dvc new file mode 100644 index 00000000000..a7cc48a8338 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/jonsson_1998.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Changes in food consumption in Iceland, 1770-1940 + title_snapshot: Changes in food consumption in Iceland, 1770-1940 - Daily calories in Iceland + description: |- + This dataset contains daily energy from Table 5 of Jonsson (1998) paper: "Changes in food consumption in Iceland, 1770-1940". + date_published: "1998-01-01" + + # Citation + producer: Jonsson + citation_full: |- + Jonsson, G.R. 
(1998), "Changes in food consumption in Iceland, 1770-1940". Scandinavian Economic History Review, 46, 24-41. + Data extracted from Table 5. + attribution_short: Jonsson (1998) + + # Files + url_main: https://www.tandfonline.com/doi/abs/10.1080/03585522.1998.10414677 + date_accessed: 2024-05-27 + + # License + license: + name: © Scandinavian Economic History Review 1998 + url: https://www.tandfonline.com/doi/abs/10.1080/03585522.1998.10414677 +outs: + - md5: 9637e39deb3ff3064e125c5141d273f1 + size: 180 + path: jonsson_1998.csv diff --git a/snapshots/agriculture/2024-05-23/jonsson_1998.py b/snapshots/agriculture/2024-05-23/jonsson_1998.py new file mode 100644 index 00000000000..c819cb394d4 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/jonsson_1998.py @@ -0,0 +1,50 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/jonsson_1998.csv") + + # Data extracted using chatGPT 4o (and manually inspected and corrected). + data = """ +year,daily_calories +1770,3048 +1784,2322 +1795,2724 +1819,2887 +1840,3080 +1849,3381 +1855,2917 +1863,2885 +1870,2573 +1880,3002 +1890,3106 +1900,3316 +1910,3499 +1920,3610 +1930,4207 +1938,4066 + """ + + # Create a dataframe with the extracted data. + df = pd.read_csv(StringIO(data)) + + # Create snapshot. 
+ snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() From e5fdf18b7bf9dc716456d93eb4d408b13259c4cb Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 27 May 2024 13:50:10 +0200 Subject: [PATCH 07/17] Add Grigg (1995) data --- dag/agriculture.yml | 6 +++ ...yml => daily_calories_per_person.meta.yml} | 5 ++- .../2024-05-23/daily_calories_per_person.py | 6 ++- .../agriculture/2024-05-23/grigg_1995.py | 32 +++++++++++++++ .../agriculture/2024-05-23/grigg_1995.csv.dvc | 30 ++++++++++++++ .../agriculture/2024-05-23/grigg_1995.py | 41 +++++++++++++++++++ 6 files changed, 118 insertions(+), 2 deletions(-) rename etl/steps/data/garden/agriculture/2024-05-23/{daily_calories_per_person.yml => daily_calories_per_person.meta.yml} (66%) create mode 100644 etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py create mode 100644 snapshots/agriculture/2024-05-23/grigg_1995.csv.dvc create mode 100644 snapshots/agriculture/2024-05-23/grigg_1995.py diff --git a/dag/agriculture.yml b/dag/agriculture.yml index 24584e57983..a037d3ae204 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -90,12 +90,18 @@ steps: data://meadow/agriculture/2024-05-23/jonsson_1998: - snapshot://agriculture/2024-05-23/jonsson_1998.csv # + # Grigg (1995) - Daily calories in Western Europe. + # + data://meadow/agriculture/2024-05-23/grigg_1995: + - snapshot://agriculture/2024-05-23/grigg_1995.csv + # # Agriculture - Long-run daily calorie supply per person. 
# data://garden/agriculture/2024-05-23/daily_calories_per_person: - data://meadow/agriculture/2024-05-23/harris_et_al_2015 - data://meadow/agriculture/2024-05-23/floud_et_al_2011 - data://meadow/agriculture/2024-05-23/jonsson_1998 + - data://meadow/agriculture/2024-05-23/grigg_1995 data://grapher/agriculture/2024-05-23/daily_calories_per_person: - data://garden/agriculture/2024-05-23/daily_calories_per_person ###################################################################################################################### diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.yml b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml similarity index 66% rename from etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.yml rename to etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml index b89a93f9b85..333b2d3b73c 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.yml +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml @@ -12,4 +12,7 @@ tables: daily_calories_per_person: title: Daily calory supply per person variables: - {} + daily_calories: + title: Daily calory supply per person + unit: kilocalories + short_unit: kcal diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index 6dca083cfa5..ac0f1ceb768 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -24,11 +24,15 @@ def run(dest_dir: str) -> None: ds_jonsson = paths.load_dataset("jonsson_1998") tb_jonsson = ds_jonsson["jonsson_1998"].reset_index() + # Load Grigg (1995) dataset and read its main table. + ds_grigg = paths.load_dataset("grigg_1995") + tb_grigg = ds_grigg["grigg_1995"].reset_index() + # # Process data. 
# # TODO: Continue processing. - tb = tb_harris.copy() + tb = tb_grigg.copy() # Set an appropriate index and sort conveniently. tb = tb.format(short_name=paths.short_name) diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py b/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py new file mode 100644 index 00000000000..b455d709750 --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("grigg_1995.csv") + tb = snap.read() + + # + # Process data. + # + # Transform data to have a year column. + tb = tb.melt(id_vars=["country"], var_name="year", value_name="daily_calories") + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/agriculture/2024-05-23/grigg_1995.csv.dvc b/snapshots/agriculture/2024-05-23/grigg_1995.csv.dvc new file mode 100644 index 00000000000..8f30143e7fd --- /dev/null +++ b/snapshots/agriculture/2024-05-23/grigg_1995.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The nutritional transition in Western Europe + title_snapshot: The nutritional transition in Western Europe - Daily calories in Western Europe + description: |- + This dataset contains daily calories available per capita from Table 1 of Grigg (1995) paper: "The nutritional transition in Western Europe". + date_published: "1995-07-01" + + # Citation + producer: Grigg + citation_full: |- + Grigg, D. 
(1995), "The nutritional transition in Western Europe". Journal of Historical Geography, Volume 21, Issue 3, 1995, Pages 247-261. https://doi.org/10.1006/jhge.1995.0018 + Data extracted from Table 1. + attribution_short: Grigg (1995) + + # Files + url_main: https://www.sciencedirect.com/science/article/abs/pii/S0305748885700187?via%3Dihub + date_accessed: 2024-05-27 + + # License + license: + name: © Elsevier 1995 + url: https://www.sciencedirect.com/science/article/abs/pii/S0305748885700187?via%3Dihub +outs: + - md5: 79e132cf8120eb4ad8a1dd9d349d892a + size: 520 + path: grigg_1995.csv diff --git a/snapshots/agriculture/2024-05-23/grigg_1995.py b/snapshots/agriculture/2024-05-23/grigg_1995.py new file mode 100644 index 00000000000..acc5d81981b --- /dev/null +++ b/snapshots/agriculture/2024-05-23/grigg_1995.py @@ -0,0 +1,41 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/grigg_1995.csv") + + # Data extracted using chatGPT 4o (and manually inspected and corrected). + data = """ +country,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,1900,1910,1920,1930,1940,1950,1960 +Belgium,2247,,,,,2238,2580,,,,,3300,,2940,,,3040 +England,2349,,,,,,3240,,2773,,,2760,,2810,3060,3120,3280 +Germany,2210,,,,,,2120,,,,,,,,,,2960 +Finland,,,,,,,1900,,,,,3000,,2950,,,3110 +Norway,,1800,,,2250,,3300,,,,,,,,,,2930 +Italy,,,,,,,,2647,2197,2119,,2617,,2627,,,2730 +France,1846,,1984,2118,2377,2480,2854,2875,3085,3220,3192,3323,3133,3127,,,3050 + """ + + # Create a dataframe with the extracted data. 
+ df = pd.read_csv(StringIO(data)) + + # Create snapshot. + snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() From 0245769d67a1547c641b9a6e61beb43392b0301a Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 27 May 2024 14:11:32 +0200 Subject: [PATCH 08/17] Add data from Fogel (2004) --- dag/agriculture.yml | 6 +++ .../2024-05-23/daily_calories_per_person.py | 4 ++ .../agriculture/2024-05-23/fogel_2004.py | 32 ++++++++++++ .../agriculture/2024-05-23/fogel_2004.csv.dvc | 30 ++++++++++++ .../agriculture/2024-05-23/fogel_2004.py | 49 +++++++++++++++++++ 5 files changed, 121 insertions(+) create mode 100644 etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py create mode 100644 snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc create mode 100644 snapshots/agriculture/2024-05-23/fogel_2004.py diff --git a/dag/agriculture.yml b/dag/agriculture.yml index a037d3ae204..c53f8cccdc6 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -95,6 +95,11 @@ steps: data://meadow/agriculture/2024-05-23/grigg_1995: - snapshot://agriculture/2024-05-23/grigg_1995.csv # + # Fogel (2004) - Daily calories in France and Great Britain. + # + data://meadow/agriculture/2024-05-23/fogel_2004: + - snapshot://agriculture/2024-05-23/fogel_2004.csv + # # Agriculture - Long-run daily calorie supply per person. 
# data://garden/agriculture/2024-05-23/daily_calories_per_person: @@ -102,6 +107,7 @@ steps: - data://meadow/agriculture/2024-05-23/floud_et_al_2011 - data://meadow/agriculture/2024-05-23/jonsson_1998 - data://meadow/agriculture/2024-05-23/grigg_1995 + - data://meadow/agriculture/2024-05-23/fogel_2004 data://grapher/agriculture/2024-05-23/daily_calories_per_person: - data://garden/agriculture/2024-05-23/daily_calories_per_person ###################################################################################################################### diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index ac0f1ceb768..ec1c780a252 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -28,6 +28,10 @@ def run(dest_dir: str) -> None: ds_grigg = paths.load_dataset("grigg_1995") tb_grigg = ds_grigg["grigg_1995"].reset_index() + # Load Fogel (2004) dataset and read its main table. + ds_fogel = paths.load_dataset("fogel_2004") + tb_fogel = ds_fogel["fogel_2004"].reset_index() + # # Process data. # diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py b/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py new file mode 100644 index 00000000000..45bcd82f1a2 --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("fogel_2004.csv") + tb = snap.read() + + # + # Process data. + # + # Transform data to have a year column. 
+ tb = tb.melt(id_vars=["Year"], var_name="country", value_name="daily_calories") + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc b/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc new file mode 100644 index 00000000000..f2534d326d1 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The Escape from hunger and Premature Death + title_snapshot: The Escape from hunger and Premature Death - Daily calories in France and Great Britain + description: |- + This dataset contains daily calorie supply from Table 1.2 of Fogel (2004) book: "The Escape from hunger and Premature Death". + date_published: "2004-05-24" + + # Citation + producer: Fogel + citation_full: |- + Fogel, R.W. (2004), "The Escape from hunger and Premature Death". Cambridge Studies in Population, Economy and Society in Past Time, Series Number 38. + Data extracted from Table 1.2. 
+ attribution_short: Fogel (2004) + + # Files + url_main: https://www.cambridge.org/core/books/escape-from-hunger-and-premature-death-17002100/384C6032DE4E73C90EF6C9D1E55009CA + date_accessed: 2024-05-27 + + # License + license: + name: © Cambridge University Press 2004 + url: https://www.cambridge.org/core/books/escape-from-hunger-and-premature-death-17002100/384C6032DE4E73C90EF6C9D1E55009CA +outs: + - md5: 11d98a77f8589394b38ba286dc56d27f + size: 241 + path: fogel_2004.csv diff --git a/snapshots/agriculture/2024-05-23/fogel_2004.py b/snapshots/agriculture/2024-05-23/fogel_2004.py new file mode 100644 index 00000000000..45d81263452 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fogel_2004.py @@ -0,0 +1,49 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/fogel_2004.csv") + + # Data manually extracted. + data = """ +Year,France,Great Britain +1700,,2095 +1705,1657, +1750,,2168 +1785,1848, +1800,,2237 +1803-12,1846, +1845-54,2480, +1850,,2362 +1909-13,,2857 +1935-39,2975, +1954-55,2783,3231 +1961,,3170 +1965,3355,3304 +1989,3465,3149 + + """ + + # Create a dataframe with the extracted data. + df = pd.read_csv(StringIO(data)) + + # Create snapshot. 
+ snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() From a98158d65c3324c4603c9aaeab0992ec54e4c15a Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 27 May 2024 15:56:44 +0200 Subject: [PATCH 09/17] Add FAO (2000) data --- dag/agriculture.yml | 6 +++ .../2024-05-23/daily_calories_per_person.py | 4 ++ .../meadow/agriculture/2024-05-23/fao_2000.py | 32 ++++++++++++ .../agriculture/2024-05-23/fao_2000.csv.dvc | 30 +++++++++++ snapshots/agriculture/2024-05-23/fao_2000.py | 51 +++++++++++++++++++ 5 files changed, 123 insertions(+) create mode 100644 etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py create mode 100644 snapshots/agriculture/2024-05-23/fao_2000.csv.dvc create mode 100644 snapshots/agriculture/2024-05-23/fao_2000.py diff --git a/dag/agriculture.yml b/dag/agriculture.yml index c53f8cccdc6..acea38f0749 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -100,6 +100,11 @@ steps: data://meadow/agriculture/2024-05-23/fogel_2004: - snapshot://agriculture/2024-05-23/fogel_2004.csv # + # FAO (2000) - The State of Food and Agriculture. + # + data://meadow/agriculture/2024-05-23/fao_2000: + - snapshot://agriculture/2024-05-23/fao_2000.csv + # # Agriculture - Long-run daily calorie supply per person. 
# data://garden/agriculture/2024-05-23/daily_calories_per_person: @@ -108,6 +113,7 @@ steps: - data://meadow/agriculture/2024-05-23/jonsson_1998 - data://meadow/agriculture/2024-05-23/grigg_1995 - data://meadow/agriculture/2024-05-23/fogel_2004 + - data://meadow/agriculture/2024-05-23/fao_2000 data://grapher/agriculture/2024-05-23/daily_calories_per_person: - data://garden/agriculture/2024-05-23/daily_calories_per_person ###################################################################################################################### diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index ec1c780a252..6856876d677 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -32,6 +32,10 @@ def run(dest_dir: str) -> None: ds_fogel = paths.load_dataset("fogel_2004") tb_fogel = ds_fogel["fogel_2004"].reset_index() + # Load FAO (2000) dataset and read its main table. + ds_fao = paths.load_dataset("fao_2000") + tb_fao = ds_fao["fao_2000"].reset_index() + # # Process data. # diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py b/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py new file mode 100644 index 00000000000..7c2b95c3e46 --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("fao_2000.csv") + tb = snap.read() + + # + # Process data. + # + # Transform data to have a year column. 
+ tb = tb.melt(id_vars=["country"], var_name="year", value_name="daily_calories") + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc b/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc new file mode 100644 index 00000000000..68060ef62fd --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The State of Food and Agriculture 2000 + title_snapshot: The State of Food and Agriculture 2000 - Daily calories in various countries + description: |- + This dataset contains daily calories in various countries, from Table 11 of FAO's "The State of Food and Agriculture 2000". + date_published: "2000-03-01" + + # Citation + producer: Food and Agriculture Organization of the United Nations + citation_full: |- + Food and Agriculture Organization of the United Nations (2000), "The State of Food and Agriculture 2000". + Data extracted from Table 11. 
+ attribution_short: FAO (2000) + + # Files + url_main: https://www.fao.org/agrifood-economics/publications/detail/en/c/122046/ + date_accessed: 2024-05-27 + + # License + license: + name: © FAO 2000 + url: https://www.fao.org/agrifood-economics/publications/detail/en/c/122046/ +outs: + - md5: 695a0bdbf3f50e6008d3be384cb8588c + size: 196 + path: fao_2000.csv diff --git a/snapshots/agriculture/2024-05-23/fao_2000.py b/snapshots/agriculture/2024-05-23/fao_2000.py new file mode 100644 index 00000000000..cc43a9eeecf --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fao_2000.py @@ -0,0 +1,51 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/fao_2000.csv") + + # Data manually extracted. + data = """ +country,1934-38,1946-49 +Uganda,,2100 +Cambodia,,1560 +Mexico,1800,2050 +Peru,1860,1920 + """ + # Note that I removed "Kenya,2230," because, as the footnote says, it includes Uganda. + # I also removed the first point of Cambodia because it was actually referring to French Indochina. + # I also removed the first point of India because it was actually referring to India and Pakistan. + # Note that the footnote of the table says that the year ranges for India, China and Brazil are different. + # Create an additional dataframe for them. + data_additional = """ +country,1934-38,1946-49,1931-37,1949-50,1935-39 +China,,,2230,2030 +India,,,,1700 +Brazil,,2340,,,2150 + """ + + # Create a dataframe with the extracted data. 
+ df = pd.read_csv(StringIO(data)) + df_additional = pd.read_csv(StringIO(data_additional)) + df = pd.concat([df, df_additional], ignore_index=True) + + # Create snapshot. + snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() From 779f9dbf137bcf3e9ad586bbfc710aff781d201f Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 27 May 2024 16:53:54 +0200 Subject: [PATCH 10/17] Add data from FAO (1949) --- dag/agriculture.yml | 8 +- .../2024-05-23/daily_calories_per_person.py | 8 +- .../meadow/agriculture/2024-05-23/fao_1949.py | 32 +++++++ .../agriculture/2024-05-23/fao_1949.csv.dvc | 30 +++++++ snapshots/agriculture/2024-05-23/fao_1949.py | 89 +++++++++++++++++++ .../agriculture/2024-05-23/fao_2000.csv.dvc | 2 +- 6 files changed, 165 insertions(+), 4 deletions(-) create mode 100644 etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py create mode 100644 snapshots/agriculture/2024-05-23/fao_1949.csv.dvc create mode 100644 snapshots/agriculture/2024-05-23/fao_1949.py diff --git a/dag/agriculture.yml b/dag/agriculture.yml index acea38f0749..6c36a9328d9 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -100,11 +100,16 @@ steps: data://meadow/agriculture/2024-05-23/fogel_2004: - snapshot://agriculture/2024-05-23/fogel_2004.csv # - # FAO (2000) - The State of Food and Agriculture. + # FAO (2000) - The State of Food and Agriculture 2000. # data://meadow/agriculture/2024-05-23/fao_2000: - snapshot://agriculture/2024-05-23/fao_2000.csv # + # FAO (1949) - The State of Food and Agriculture 1949. + # + data://meadow/agriculture/2024-05-23/fao_1949: + - snapshot://agriculture/2024-05-23/fao_1949.csv + # # Agriculture - Long-run daily calorie supply per person. 
# data://garden/agriculture/2024-05-23/daily_calories_per_person: @@ -114,6 +119,7 @@ steps: - data://meadow/agriculture/2024-05-23/grigg_1995 - data://meadow/agriculture/2024-05-23/fogel_2004 - data://meadow/agriculture/2024-05-23/fao_2000 + - data://meadow/agriculture/2024-05-23/fao_1949 data://grapher/agriculture/2024-05-23/daily_calories_per_person: - data://garden/agriculture/2024-05-23/daily_calories_per_person ###################################################################################################################### diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index 6856876d677..1cc4ff5a887 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -33,8 +33,12 @@ def run(dest_dir: str) -> None: tb_fogel = ds_fogel["fogel_2004"].reset_index() # Load FAO (2000) dataset and read its main table. - ds_fao = paths.load_dataset("fao_2000") - tb_fao = ds_fao["fao_2000"].reset_index() + ds_fao2000 = paths.load_dataset("fao_2000") + tb_fao2000 = ds_fao2000["fao_2000"].reset_index() + + # Load FAO (1949) dataset and read its main table. + ds_fao1949 = paths.load_dataset("fao_1949") + tb_fao1949 = ds_fao1949["fao_1949"].reset_index() # # Process data. diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py b/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py new file mode 100644 index 00000000000..0e965f59f58 --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. 
+ snap = paths.load_snapshot("fao_1949.csv") + tb = snap.read() + + # + # Process data. + # + # Transform data to have a year column. + tb = tb.melt(id_vars=["country"], var_name="year", value_name="daily_calories") + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc b/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc new file mode 100644 index 00000000000..e83f8681ad9 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The State of Food and Agriculture 1949 + title_snapshot: The State of Food and Agriculture 1949 - Daily calories in various countries + description: |- + This dataset contains daily calories in various countries, from Table 15 of FAO's "The State of Food and Agriculture 1949". + date_published: "1949-10-01" + + # Citation + producer: Food and Agriculture Organization of the United Nations + citation_full: |- + Food and Agriculture Organization of the United Nations (1949), "The State of Food and Agriculture 1949". + Data extracted from Table 15. 
+ attribution_short: FAO (1949) + + # Files + url_main: https://www.un-ilibrary.org/content/books/9789210472654 + date_accessed: 2024-05-27 + + # License + license: + name: © FAO 1949 + url: https://www.un-ilibrary.org/content/books/9789210472654 +outs: + - md5: 4fdce005d413a34348174c4371f41025 + size: 1103 + path: fao_1949.csv diff --git a/snapshots/agriculture/2024-05-23/fao_1949.py b/snapshots/agriculture/2024-05-23/fao_1949.py new file mode 100644 index 00000000000..1d3db306464 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fao_1949.py @@ -0,0 +1,89 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/fao_1949.csv") + + # Data manually extracted. 
+ data = """ +country,1947/48,1948/49 +Burma,1986,1877 +Ceylon,1977,1918 +China (22 provinces),2115, +India,,1570 +Japan,1670,1795 +Philippines,1770, +Thailand,2110,2020 +Austria,2397,2698 +Belgium,2667,2760 +Czechoslovakia,2402,2656 +Denmark,3125,3206 +Finland,2617,2851 +France,2357,2667 +Greece,2266,2358 +Hungary,2432, +Iceland,3268, +Ireland,3260,3276 +Italy,2249,2398 +Netherlands,2856, +Luxembourg,2693,2878 +Norway,2899,3051 +Poland,2363,2625 +Portugal,2279,2184 +Spain,2180,2377 +Sweden,2871,3108 +Switzerland,3050,2996 +United Kingdom,2968,3084 +Yugoslavia,2144, +Australia,3262,3265 +New Zealand,3286,3259 +Canada,3161,3141 +United States,3244,3186 +Cuba,2682,2814 +El Salvador,1557, +Mexico,2032,2101 +Argentina,3188,3191 +Brazil,2245, +Chile,2352,2356 +Colombia,1950, +Peru,1925,2219 +Uruguay,2490,2529 +Egypt,2364,2458 +Turkey,2173,2506 +Ethiopia,1770, +Algeria,1279,1421 +Madagascar,2074, +Morocco,1837,1825 +Tanganyika,2163, +Tunisia,1498,1545 +Union of South Africa,2422,2517 + """ + # NOTE: + # * The table includes India and Pakistan, but the footnote says that the value for 1948/49 is only India. + # * Footnote says about Japan: "1t is believed by the Supreme Command Allied Powers that for staple foods there is an appreciable understatement of production, particularly from home gardens, both in staple foods and vegetables. A nutrition survey conducted by the Ministry of Welfare estimated calorie supplies per person per day at 1,965.". + # * Footnote says about France: "Unreported production has most likely provided enough calories to raise the level to about 2,500-2,600 calories.". + # * For some countries, the footnote says "Calendar year basis: 1947 and 1948.", but that is already the years we will use for all countries. + + # Create a dataframe with the extracted data. + df = pd.read_csv(StringIO(data)) + + # Create snapshot. 
+ snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() diff --git a/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc b/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc index 68060ef62fd..93afa11984c 100644 --- a/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc +++ b/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc @@ -7,7 +7,7 @@ meta: title_snapshot: The State of Food and Agriculture 2000 - Daily calories in various countries description: |- This dataset contains daily calories in various countries, from Table 11 of FAO's "The State of Food and Agriculture 2000". - date_published: "2000-03-01" + date_published: "2000-10-01" # Citation producer: Food and Agriculture Organization of the United Nations From e732f5d8327d1e602f135a9556ffc4267d629415 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 27 May 2024 18:14:24 +0200 Subject: [PATCH 11/17] Add USDA/ERS data on food availability --- dag/agriculture.yml | 6 +++ .../2024-05-23/daily_calories_per_person.py | 4 ++ .../usda_ers/2024-05-23/food_availability.py | 43 +++++++++++++++++++ .../usda_ers/2024-05-23/food_availability.py | 24 +++++++++++ .../2024-05-23/food_availability.xls.dvc | 29 +++++++++++++ 5 files changed, 106 insertions(+) create mode 100644 etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py create mode 100644 snapshots/usda_ers/2024-05-23/food_availability.py create mode 100644 snapshots/usda_ers/2024-05-23/food_availability.xls.dvc diff --git a/dag/agriculture.yml b/dag/agriculture.yml index 6c36a9328d9..cd97be2cebe 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -110,6 +110,11 @@ steps: data://meadow/agriculture/2024-05-23/fao_1949: - snapshot://agriculture/2024-05-23/fao_1949.csv # + # USDA/ERS - Food availability. + # + data://meadow/usda_ers/2024-05-23/food_availability: + - snapshot://usda_ers/2024-05-23/food_availability.xls + # # Agriculture - Long-run daily calorie supply per person. 
# data://garden/agriculture/2024-05-23/daily_calories_per_person: @@ -120,6 +125,7 @@ steps: - data://meadow/agriculture/2024-05-23/fogel_2004 - data://meadow/agriculture/2024-05-23/fao_2000 - data://meadow/agriculture/2024-05-23/fao_1949 + - data://meadow/usda_ers/2024-05-23/food_availability data://grapher/agriculture/2024-05-23/daily_calories_per_person: - data://garden/agriculture/2024-05-23/daily_calories_per_person ###################################################################################################################### diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index 1cc4ff5a887..12b7f3385ca 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -40,6 +40,10 @@ def run(dest_dir: str) -> None: ds_fao1949 = paths.load_dataset("fao_1949") tb_fao1949 = ds_fao1949["fao_1949"].reset_index() + # Load USDA/ERS data on food availability. + ds_usda = paths.load_dataset("food_availability") + tb_usda = ds_usda["food_availability"].reset_index() + # # Process data. # diff --git a/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py b/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py new file mode 100644 index 00000000000..1eeba5fcac0 --- /dev/null +++ b/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py @@ -0,0 +1,43 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. +COLUMNS = { + "Year": "year", + "Food energy": "daily_calories", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots and read their data. 
+ snap = paths.load_snapshot("food_availability.xls") + data = snap.read(sheet_name="Totals", skiprows=1) + + # + # Process data. + # + # Select and rename columns. + tb = data[COLUMNS.keys()].rename(columns=COLUMNS) + + # Drop any row for which "year" is not an integer (to get rid of headers and footers). + tb = tb[tb["year"].apply(lambda x: isinstance(x, int))].reset_index(drop=True) + + # Add a country column. + tb["country"] = "United States" + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/usda_ers/2024-05-23/food_availability.py b/snapshots/usda_ers/2024-05-23/food_availability.py new file mode 100644 index 00000000000..d56f36767d0 --- /dev/null +++ b/snapshots/usda_ers/2024-05-23/food_availability.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"usda_ers/{SNAPSHOT_VERSION}/food_availability.xls") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/usda_ers/2024-05-23/food_availability.xls.dvc b/snapshots/usda_ers/2024-05-23/food_availability.xls.dvc new file mode 100644 index 00000000000..32e3e117abf --- /dev/null +++ b/snapshots/usda_ers/2024-05-23/food_availability.xls.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: "U.S. 
food supply: Nutrients and other food components, per capita per day" + date_published: "2015-02-01" + + # Citation + producer: USDA Economic Research Service (ERS) + citation_full: |- + Economic Research Service of the United States Department of Agriculture (USDA/ERS) - U.S. food supply: Nutrients and other food components, per capita per day. + The data can be found as one of the archived tables of the Food Availability (Per Capita) Data System. + attribution_short: USDA/ERS + + # Files + url_main: https://www.ers.usda.gov/data-products/food-availability-per-capita-data-system/food-availability-per-capita-data-system/ + url_download: https://www.ers.usda.gov/webdocs/DataFiles/50472/nutrients.xls?v=4603.6 + date_accessed: 2024-05-27 + + # License + license: + name: Public Domain + url: https://www.ers.usda.gov/data-products/food-availability-per-capita-data-system/food-availability-per-capita-data-system/ + +outs: + - md5: bfebce79879913cf997ecb2a2e32161b + size: 164864 + path: food_availability.xls From b45da123f4ba957a390dd7acfc2b3b0f54bf85e0 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 28 May 2024 20:40:22 +0200 Subject: [PATCH 12/17] Combine all tables in the garden step --- dag/agriculture.yml | 1 + .../daily_calories_per_person.countries.json | 248 ++++++++++++++++- .../2024-05-23/daily_calories_per_person.py | 258 +++++++++++++++++- .../2024-05-23/harris_et_al_2015.py | 8 +- .../agriculture/2024-05-23/fao_1949.csv.dvc | 4 +- snapshots/agriculture/2024-05-23/fao_1949.py | 2 +- 6 files changed, 512 insertions(+), 9 deletions(-) diff --git a/dag/agriculture.yml b/dag/agriculture.yml index cd97be2cebe..490a0635c14 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -118,6 +118,7 @@ steps: # Agriculture - Long-run daily calorie supply per person. 
# data://garden/agriculture/2024-05-23/daily_calories_per_person: + - data://garden/faostat/2024-03-14/faostat_fbsc - data://meadow/agriculture/2024-05-23/harris_et_al_2015 - data://meadow/agriculture/2024-05-23/floud_et_al_2011 - data://meadow/agriculture/2024-05-23/jonsson_1998 diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json index 0967ef424bc..28076d8bb8c 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json @@ -1 +1,247 @@ -{} +{ + "Belgium": "Belgium", + "Finland": "Finland", + "Germany": "Germany", + "Iceland": "Iceland", + "Italy": "Italy", + "Netherlands": "Netherlands", + "Norway": "Norway", + "Algeria": "Algeria", + "Argentina": "Argentina", + "Australia": "Australia", + "Austria": "Austria", + "Brazil": "Brazil", + "Burma": "Myanmar", + "Cambodia": "Cambodia", + "Canada": "Canada", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Cuba": "Cuba", + "Czechoslovakia": "Czechoslovakia", + "Denmark": "Denmark", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Ethiopia": "Ethiopia", + "France": "France", + "Greece": "Greece", + "Hungary": "Hungary", + "India": "India", + "Ireland": "Ireland", + "Japan": "Japan", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Mexico": "Mexico", + "Morocco": "Morocco", + "New Zealand": "New Zealand", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Spain": "Spain", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Thailand": "Thailand", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Uganda": "Uganda", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Yugoslavia": "Yugoslavia", + "Ceylon": "Sri Lanka", + 
"England": "United Kingdom", + "England and Wales": "United Kingdom", + "Great Britain": "United Kingdom", + "Tanganyika": "Tanzania", + "Union of South Africa": "South Africa", + "Net Food Importing Developing Countries (FAO)": "Net Food Importing Developing Countries (FAO)", + "Serbia": "Serbia", + "Bermuda": "Bermuda", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Georgia": "Georgia", + "Taiwan": "Taiwan", + "North Korea": "North Korea", + "Liberia": "Liberia", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Romania": "Romania", + "Africa (FAO)": "Africa (FAO)", + "Haiti": "Haiti", + "Senegal": "Senegal", + "Croatia": "Croatia", + "Brunei": "Brunei", + "Guinea": "Guinea", + "Jamaica": "Jamaica", + "Syria": "Syria", + "Albania": "Albania", + "Southern Africa (FAO)": "Southern Africa (FAO)", + "Papua New Guinea": "Papua New Guinea", + "Sudan": "Sudan", + "Niger": "Niger", + "China (FAO)": "China (FAO)", + "Southern Asia (FAO)": "Southern Asia (FAO)", + "Antigua and Barbuda": "Antigua and Barbuda", + "High-income countries": "High-income countries", + "Small Island Developing States (FAO)": "Small Island Developing States (FAO)", + "Togo": "Togo", + "Honduras": "Honduras", + "Nicaragua": "Nicaragua", + "Lower-middle-income countries": "Lower-middle-income countries", + "Middle Africa (FAO)": "Middle Africa (FAO)", + "Paraguay": "Paraguay", + "South Africa": "South Africa", + "Indonesia": "Indonesia", + "Western Europe (FAO)": "Western Europe (FAO)", + "Northern America (FAO)": "Northern America (FAO)", + "Burundi": "Burundi", + "Gambia": "Gambia", + "Russia": "Russia", + "Belarus": "Belarus", + "Kenya": "Kenya", + "Slovenia": "Slovenia", + "Cameroon": "Cameroon", + "Micronesia (country)": "Micronesia (country)", + "Namibia": "Namibia", + "Vanuatu": "Vanuatu", + "Bolivia": "Bolivia", + "Samoa": "Samoa", + "Sri Lanka": "Sri Lanka", + "South America": "South America", + "Bahamas": "Bahamas", + "Nepal": "Nepal", + "Zambia": "Zambia", + "Mauritius": 
"Mauritius", + "Turkmenistan": "Turkmenistan", + "Sudan (former)": "Sudan (former)", + "Least Developed Countries (FAO)": "Least Developed Countries (FAO)", + "Czechia": "Czechia", + "Afghanistan": "Afghanistan", + "Sierra Leone": "Sierra Leone", + "Asia (FAO)": "Asia (FAO)", + "Netherlands Antilles": "Netherlands Antilles", + "Mongolia": "Mongolia", + "Saudi Arabia": "Saudi Arabia", + "Yemen": "Yemen", + "Grenada": "Grenada", + "Lebanon": "Lebanon", + "Cyprus": "Cyprus", + "Kiribati": "Kiribati", + "North Macedonia": "North Macedonia", + "South America (FAO)": "South America (FAO)", + "Malaysia": "Malaysia", + "Eswatini": "Eswatini", + "Barbados": "Barbados", + "Uzbekistan": "Uzbekistan", + "Angola": "Angola", + "French Polynesia": "French Polynesia", + "Rwanda": "Rwanda", + "Oceania": "Oceania", + "Vietnam": "Vietnam", + "Slovakia": "Slovakia", + "Fiji": "Fiji", + "Botswana": "Botswana", + "Lithuania": "Lithuania", + "Caribbean (FAO)": "Caribbean (FAO)", + "Micronesia (FAO)": "Micronesia (FAO)", + "Southern Europe (FAO)": "Southern Europe (FAO)", + "Guinea-Bissau": "Guinea-Bissau", + "Bhutan": "Bhutan", + "Dominica": "Dominica", + "Land Locked Developing Countries (FAO)": "Land Locked Developing Countries (FAO)", + "Asia": "Asia", + "Central America (FAO)": "Central America (FAO)", + "South-eastern Asia (FAO)": "South-eastern Asia (FAO)", + "Azerbaijan": "Azerbaijan", + "Malta": "Malta", + "Low Income Food Deficit Countries (FAO)": "Low Income Food Deficit Countries (FAO)", + "Qatar": "Qatar", + "East Timor": "East Timor", + "Libya": "Libya", + "Comoros": "Comoros", + "Cote d'Ivoire": "Cote d'Ivoire", + "Nauru": "Nauru", + "Armenia": "Armenia", + "Upper-middle-income countries": "Upper-middle-income countries", + "Tanzania": "Tanzania", + "Burkina Faso": "Burkina Faso", + "Iran": "Iran", + "Eastern Asia (FAO)": "Eastern Asia (FAO)", + "Central African Republic": "Central African Republic", + "New Caledonia": "New Caledonia", + "Europe (FAO)": "Europe (FAO)", + 
"European Union (27)": "European Union (27)", + "Moldova": "Moldova", + "Belgium-Luxembourg (FAO)": "Belgium-Luxembourg (FAO)", + "Benin": "Benin", + "Israel": "Israel", + "Democratic Republic of Congo": "Democratic Republic of Congo", + "Nigeria": "Nigeria", + "Africa": "Africa", + "Belize": "Belize", + "Suriname": "Suriname", + "South Sudan": "South Sudan", + "Ecuador": "Ecuador", + "Cape Verde": "Cape Verde", + "Western Asia (FAO)": "Western Asia (FAO)", + "Sao Tome and Principe": "Sao Tome and Principe", + "Eastern Africa (FAO)": "Eastern Africa (FAO)", + "Tajikistan": "Tajikistan", + "Low-income countries": "Low-income countries", + "Oman": "Oman", + "Montenegro": "Montenegro", + "Latvia": "Latvia", + "Lesotho": "Lesotho", + "Bahrain": "Bahrain", + "North America": "North America", + "Myanmar": "Myanmar", + "Seychelles": "Seychelles", + "Trinidad and Tobago": "Trinidad and Tobago", + "Bulgaria": "Bulgaria", + "Western Africa (FAO)": "Western Africa (FAO)", + "Panama": "Panama", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Chad": "Chad", + "World": "World", + "Serbia and Montenegro": "Serbia and Montenegro", + "Maldives": "Maldives", + "Bangladesh": "Bangladesh", + "Kyrgyzstan": "Kyrgyzstan", + "Northern Europe (FAO)": "Northern Europe (FAO)", + "Central Asia (FAO)": "Central Asia (FAO)", + "Congo": "Congo", + "Europe": "Europe", + "Zimbabwe": "Zimbabwe", + "Mali": "Mali", + "Iraq": "Iraq", + "Jordan": "Jordan", + "Ghana": "Ghana", + "Solomon Islands": "Solomon Islands", + "Mozambique": "Mozambique", + "Costa Rica": "Costa Rica", + "Americas (FAO)": "Americas (FAO)", + "Hong Kong": "Hong Kong", + "Saint Lucia": "Saint Lucia", + "Malawi": "Malawi", + "Guyana": "Guyana", + "Eastern Europe (FAO)": "Eastern Europe (FAO)", + "Pakistan": "Pakistan", + "Estonia": "Estonia", + "Oceania (FAO)": "Oceania (FAO)", + "Djibouti": "Djibouti", + "Kuwait": "Kuwait", + "Ukraine": "Ukraine", + "Gabon": "Gabon", + "Mauritania": "Mauritania", + 
"Guatemala": "Guatemala", + "South Korea": "South Korea", + "Kazakhstan": "Kazakhstan", + "Laos": "Laos", + "Macao": "Macao", + "USSR": "USSR", + "United Arab Emirates": "United Arab Emirates", + "Dominican Republic": "Dominican Republic", + "Ethiopia (former)": "Ethiopia (former)", + "Northern Africa (FAO)": "Northern Africa (FAO)", + "Venezuela": "Venezuela", + "European Union (27) (FAO)": "European Union (27) (FAO)" +} diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index 12b7f3385ca..f10185c1e99 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -2,16 +2,55 @@ """ +from typing import Union + +import owid.catalog.processing as pr +import pandas as pd + +from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +# FAOSTAT element code for "Food available for consumption" measured in "kilocalories per day per capita" +# (corresponding to original FAOSTAT element "Food supply (kcal/capita/day)"). +ELEMENT_CODE_FOOD_SUPPLY_PER_CAPITA = "0664pc" +# FAOSTAT item code for "Total" (corresponding to original FAOSTAT item "Grand Total"). +ITEM_CODE_TOTAL = "00002901" + + +def correct_year(year: Union[str, int], verbose: bool = False) -> int: + year_str = str(year) + if len(year_str) == 4: + # Normal format, e.g. "1990". + year_corrected = int(year_str) + elif len(year_str) == 9: + # Range format, e.g. "1990-1999" or "1845/1854". + year_start, year_end = year_str[0:4], year_str[5:9] + year_corrected = int((int(year_start) + int(year_end)) / 2) + elif len(year_str) == 7: + # Range format, but second year is incomplete, e.g. "1845-54". 
+ year_start, year_end = year_str[0:4], year_str[0:2] + year_str[5:7] + year_corrected = int((int(year_start) + int(year_end)) / 2) + else: + raise ValueError(f"Unexpected year format: {year}") + + # As a sanity check, optionally print the correction. + if verbose and (str(year_corrected) != year_str): + print(f'Corrected "{year}" -> "{year_corrected}"') + + return year_corrected + def run(dest_dir: str) -> None: # # Load inputs. # + # Load FAOSTAT FBSC dataset and read its main table. + ds_fbsc = paths.load_dataset("faostat_fbsc") + tb_fbsc = ds_fbsc["faostat_fbsc"].reset_index() + # Load Harris et al. (2015) dataset and read its main table. ds_harris = paths.load_dataset("harris_et_al_2015") tb_harris = ds_harris["harris_et_al_2015"].reset_index() @@ -47,15 +86,226 @@ def run(dest_dir: str) -> None: # # Process data. # - # TODO: Continue processing. - tb = tb_grigg.copy() + # Prepare FAOSTAT data. + tb_fbsc = ( + tb_fbsc[ + (tb_fbsc["element_code"] == ELEMENT_CODE_FOOD_SUPPLY_PER_CAPITA) & (tb_fbsc["item_code"] == ITEM_CODE_TOTAL) + ][["country", "year", "value"]] + .rename(columns={"value": "daily_calories"}) + .reset_index(drop=True) + ) + + # Ensure the "source" column in Harris et al. (2015) is different from the sources in other tables. + tb_harris["source"] = tb_harris["source"].astype(str) + " via Harris et al. (2015)" + + # Concatenate all tables and add a source column. + tb = pr.concat( + [ + tb_fbsc.assign(**{"source": "FAOSTAT"}), + tb_harris, + tb_floud.assign(**{"source": "Floud et al. (2011)"}), + tb_jonsson.assign(**{"source": "Jonsson (1998)"}), + tb_grigg.assign(**{"source": "Grigg (1995)"}), + tb_fogel.assign(**{"source": "Fogel (2004)"}), + tb_fao2000.assign(**{"source": "FAO (2000)"}), + tb_fao1949.assign(**{"source": "FAO (1949)"}), + tb_usda.assign(**{"source": "USDA/ERS"}), + ], + ignore_index=True, + ) + + # Harmonize country names. 
+ tb = geo.harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, warn_on_missing_countries=True, warn_on_unused_countries=True + ) + + # Ensure years are integers. When given a range of years, take the middle year. + tb["year"] = tb["year"].apply(correct_year) + + # Sanity check. + assert tb[tb["country"].isnull()].empty, "Some countries are missing." + assert tb[tb["year"].isnull()].empty, "Some years are missing." + + # Drop rows with no data on daily calories. + tb = tb.dropna(subset=["daily_calories"]).reset_index(drop=True) + + # Some numbers are given as ranges, e.g. "2914-2949". Take the average value. + select_ranges = pd.to_numeric(tb["daily_calories"], errors="coerce").isnull() + tb.loc[select_ranges, "daily_calories"] = [ + (float(value.split("-")[0]) + float(value.split("-")[1])) / 2 + for value in tb[select_ranges]["daily_calories"].values + ] + tb = tb.astype({"daily_calories": float}) + + # Start a new table with the selection of all countries from different sources. + # * Most countries have data only from FAOSTAT, or from FAOSTAT + FAO (1949), or from FAOSTAT + FAO (2000). + # There is no conflicting overlap among them. + # Although there are some jumps between FAO (1949) and FAOSTAT (which we will accept). + # We take all these countries. + countries_selected = [ + country for country in tb["country"].unique() if len(set(tb[tb["country"] == country]["source"])) < 3 + ] + tb_selected = tb[tb["country"].isin(countries_selected)].reset_index(drop=True) + + # For the remaining countries, we will combine sources in different ways one by one. + + # * Belgium: + tb_belgium = tb[ + (tb["country"] == "Belgium") & (tb["source"].isin(["FAOSTAT", "FAO (1949)", "Floud et al. 
(2011)"])) + ] + + # * Brazil: + tb_brazil = tb[(tb["country"] == "Brazil") & (tb["source"].isin(["FAOSTAT", "FAO (2000)"]))] + + # * Finland: + tb_finland = tb[(tb["country"] == "Finland") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)", "FAO (1949)"]))] + + # * France: + tb_france = pr.concat( + [ + # Prior to 1800, the only data (two points) comes from Fogel (2004). + tb[(tb["country"] == "France") & (tb["year"] < 1800) & (tb["source"] == "Fogel (2004)")], + # After 1800, take data from Grigg (1995) and FAOSTAT. + # TODO: Shall we include "FAO (1949)"? + tb[(tb["country"] == "France") & (tb["year"] >= 1800) & (tb["source"].isin(["FAOSTAT", "Grigg (1995)"]))], + ], + ignore_index=True, + ) + + # * Germany: + tb_germany = tb[(tb["country"] == "Germany") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)"]))] + + # * Iceland: + # TODO: Check why Floud et al. (2011) disregards the later increase from Jonsson (which is quite abrupt). + tb_iceland = tb[(tb["country"] == "Iceland") & (tb["source"].isin(["FAOSTAT", "Jonsson (1998)", "FAO (1949)"]))] + + # * India: + tb_india = tb[(tb["country"] == "India") & (tb["source"].isin(["FAOSTAT", "FAO (2000)", "FAO (1949)"]))] + + # * Italy: + tb_italy = tb[(tb["country"] == "Italy") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)", "FAO (1949)"]))] + + # * Netherlands: + tb_netherlands = tb[ + (tb["country"] == "Netherlands") & (tb["source"].isin(["FAOSTAT", "Floud et al. (2011)", "FAO (1949)"])) + ] + + # * Norway: + tb_norway = tb[(tb["country"] == "Norway") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)", "FAO (1949)"]))] + + # * Mexico: + # There is an overlap on year 1947 between FAO (1949) and FAO (2000) (with similar values). + # Remove the overlapping point from FAO (1949). + tb_mexico = tb[ + (tb["country"] == "Mexico") + & (tb["source"].isin(["FAOSTAT", "FAO (2000)", "FAO (1949)"])) + & ~((tb["year"] == 1947) & (tb["source"] == "FAO (1949)")) + ] + + # * Peru: + # (Same issue as Mexico). 
+ tb_peru = tb[ + (tb["country"] == "Peru") + & (tb["source"].isin(["FAOSTAT", "FAO (2000)", "FAO (1949)"])) + & ~((tb["year"] == 1947) & (tb["source"] == "FAO (1949)")) + ] + + # * United States: + tb_us = pr.concat( + [ + # Prior to 1909, the only data is from Floud et al. (2011). + tb[(tb["country"] == "United States") & (tb["year"] < 1909) & (tb["source"] == "Floud et al. (2011)")], + # From 1909 to 1960, take data from USDA/ERS. + tb[ + (tb["country"] == "United States") + & (tb["year"] >= 1909) + & (tb["year"] < 1961) + & (tb["source"] == "USDA/ERS") + ], + # After 1960, use FAOSTAT. + tb[(tb["country"] == "United States") & (tb["year"] >= 1961) & (tb["source"] == "FAOSTAT")], + ], + ignore_index=True, + ) + + # * United Kingdom: + tb_uk = pr.concat( + [ + # Prior to 1700, take data from Broadberry et al. (2015). + tb[ + (tb["country"] == "United Kingdom") + & (tb["year"] < 1700) + & (tb["source"] == "Broadberry et al. (2015) via Harris et al. (2015)") + ], + # On 1700, take the Estimate (A) (which coincides with (B)) from Floud et al. (2011) via Harris et al. (2015). + tb[ + (tb["country"] == "United Kingdom") + & (tb["year"] == 1700) + & (tb["source"] == "Floud et al. (2011) (Estimates A and B) via Harris et al. (2015)") + ], + # Between 1700 and 1850, take the corrected data from Floud et al. (2011), averaging estimates (A) and (B) (taken from Harris et al. (2015)). + tb[ + (tb["country"] == "United Kingdom") + & (tb["year"] >= 1750) + & (tb["year"] <= 1850) + & (tb["source"].str.startswith("Floud et al. ")) + & (tb["source"].str.endswith("via Harris et al. (2015)")) + ] + .groupby(["country", "year"], observed=True, as_index=False) + .agg({"daily_calories": "mean"}) + .assign(**{"source": "Floud et al. (2011) via Harris et al. (2015) average between estimates (A) and (B)"}), + # Between 1850 and 1960, take data from Floud et al. (2011). 
+ tb[ + (tb["country"] == "United Kingdom") + & (tb["year"] > 1850) + & (tb["year"] <= 1960) + & (tb["source"] == "Floud et al. (2011)") + ], + # After 1960, use FAOSTAT. + tb[(tb["country"] == "United Kingdom") & (tb["year"] >= 1961) & (tb["source"] == "FAOSTAT")], + ], + ignore_index=True, + ) + + # Combine all selected tables. + tb_combined = ( + pr.concat( + [ + tb_selected, + tb_belgium, + tb_brazil, + tb_finland, + tb_france, + tb_germany, + tb_iceland, + tb_india, + tb_italy, + tb_netherlands, + tb_norway, + tb_mexico, + tb_peru, + tb_us, + tb_uk, + ], + ignore_index=True, + ) + .sort_values(["country", "year"]) + .reset_index(drop=True) + ) + + # Uncomment to visualize all original and combined series. + # import plotly.express as px + # tb_plot = pr.concat([tb, tb_combined.copy().assign(**{"source": "combined"})], ignore_index=True) + # for country in sorted(set(tb_plot["country"])): + # if len(set(tb_plot[tb_plot["country"] == country]["source"])) > 2: + # px.line(tb_plot[tb_plot["country"]==country], x="year", y="daily_calories", color="source", title=country, markers=True, color_discrete_map={"combined": "rgba(0,256,0,0.5)", "FAOSTAT": "rgba(0,0,256,0.5)", "USDA/ERS": "rgba(100,100,100,0.5)", "FAO (1949)": "rgba(256,0,0,0.5)", "FAO (2000)": "rgba(100,100,0,0.5)", "Fogel (2004)": "rgba(0,100,100,0.5)"}).show() # Set an appropriate index and sort conveniently. - tb = tb.format(short_name=paths.short_name) + tb_combined = tb_combined.format(short_name=paths.short_name) # # Save outputs. # # Create a new garden dataset. 
- ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden = create_dataset(dest_dir, tables=[tb_combined], check_variables_metadata=True) ds_garden.save() diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py index da4abd44e6b..6fb275bf11a 100644 --- a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py +++ b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py @@ -17,8 +17,14 @@ def run(dest_dir: str) -> None: # # Process data. # + # Rename columns. + tb = tb.rename(columns={"Years": "year", "Source": "source", "Total": "daily_calories"}, errors="raise") + + # Add a country column. + tb["country"] = "England and Wales" + # Format table conveniently. - tb = tb.format(["years", "source"]) + tb = tb.format(["country", "year", "source"]) # # Save outputs. diff --git a/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc b/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc index e83f8681ad9..5c39dead21d 100644 --- a/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc +++ b/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc @@ -25,6 +25,6 @@ meta: name: © FAO 1949 url: https://www.un-ilibrary.org/content/books/9789210472654 outs: - - md5: 4fdce005d413a34348174c4371f41025 - size: 1103 + - md5: de13ba1616d768c467cb228cc495ad87 + size: 1074 path: fao_1949.csv diff --git a/snapshots/agriculture/2024-05-23/fao_1949.py b/snapshots/agriculture/2024-05-23/fao_1949.py index 1d3db306464..9e5588e7089 100644 --- a/snapshots/agriculture/2024-05-23/fao_1949.py +++ b/snapshots/agriculture/2024-05-23/fao_1949.py @@ -23,7 +23,6 @@ def main(upload: bool) -> None: country,1947/48,1948/49 Burma,1986,1877 Ceylon,1977,1918 -China (22 provinces),2115, India,,1570 Japan,1670,1795 Philippines,1770, @@ -73,6 +72,7 @@ def main(upload: bool) -> None: Union of South Africa,2422,2517 """ # NOTE: + # * The table includes China, but only for 22 
provinces, so we ignore it. # * The table includes India and Pakistan, but the footnote says that the value for 1948/49 is only India. # * Footnote says about Japan: "1t is believed by the Supreme Command Allied Powers that for staple foods there is an appreciable understatement of production, particularly from home gardens, both in staple foods and vegetables. A nutrition survey conducted by the Ministry of Welfare estimated calorie supplies per person per day at 1,965.". # * Footnote says about France: "Unreported production has most likely provided enough calories to raise the level to about 2,500-2,600 calories.". From 9feaf0042430517d9314a23ea5e2644fa4efc1d6 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 29 May 2024 08:36:42 +0200 Subject: [PATCH 13/17] Improve metadata --- .../2024-05-23/daily_calories_per_person.meta.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml index 333b2d3b73c..f2ce63e38c6 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml @@ -3,7 +3,7 @@ definitions: processing_level: major presentation: topic_tags: - - Food supply + - Food Supply dataset: update_period_days: 365 @@ -14,5 +14,9 @@ tables: variables: daily_calories: title: Daily calory supply per person - unit: kilocalories + unit: kilocalories per day short_unit: kcal + source: + title: Source of the data point + unit: "" + short_unit: "" From 0a915f37526699516862658ecc1aa815a425e45e Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 29 May 2024 10:44:36 +0200 Subject: [PATCH 14/17] Improve metadata --- .../daily_calories_per_person.meta.yml | 19 ++++++++- .../2024-05-23/daily_calories_per_person.py | 41 ++++++++++++------- 2 files changed, 43 insertions(+), 
17 deletions(-) diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml index f2ce63e38c6..1d2420a1aad 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml @@ -10,12 +10,27 @@ dataset: tables: daily_calories_per_person: - title: Daily calory supply per person + title: Daily calorie supply per person variables: daily_calories: - title: Daily calory supply per person + title: Daily calorie supply per person unit: kilocalories per day short_unit: kcal + description_key: + - This data shows daily calorie supply per person, which is the amount of calories available to an average person, and does necessarily correspond to the calories actually consumed by that person. + - Calorie supply is always larger than actual calorie consumption, since there may be waste at the household level. + - For historical data, daily calorie supply and daily calorie consumption are sometimes used interchangeably, due to poor data availability. + description_processing: |- + - For all countries, the data after 1960 is taken from FAOSTAT Food Balances datasets ([old](https://www.fao.org/faostat/en/#data/FBSH) and [new](https://www.fao.org/faostat/en/#data/FBS) methodologies combined). + - For the UK: We load Appendix Table from [Harris et al. (2015)](https://www.emerald.com/insight/content/doi/10.1108/S0363-326820150000031003/full/html). From that table, we select values from [Broadberry et al. (2015)](https://www.cambridge.org/core/books/british-economic-growth-12701870/A270234C137117C8E0F1D1E7E6F0DA56) and the corrected values from [Floud et al (2011)](https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E) (taking the average value of Estimates (A) and (B)). 
+ - For the US: For years 1800-1900, we use Table 6.6 of [Floud et al. (2011)](https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E). For years 1900-1960, we use [the archived table of food supply from USDA](https://www.ers.usda.gov/webdocs/DataFiles/50472/nutrients.xls?v=6096.1). + - For Iceland: We use Table 5 of [Jonsson (1998)](https://www.tandfonline.com/doi/abs/10.1080/03585522.1998.10414677). + - For Finland, Germany, Italy, Norway: We use Table 1 from [Grigg (1995)](https://www.sciencedirect.com/science/article/abs/pii/S0305748885700187), which is a compilation of many sources. + - For France: We use Table 1 from Grigg (1995). + - We include the two additional data points (1705 and 1785) from [Fogel (2004)](https://www.cambridge.org/core/books/escape-from-hunger-and-premature-death-17002100/384C6032DE4E73C90EF6C9D1E55009CA). + - For Belgium and Netherlands: We use Table 5.5 of Floud et al. (2011). + - For Uganda, Cambodia, China, India, Brazil, Mexico, and Peru: We use Table 11 of [FAO (2000)](https://www.fao.org/4/x4400e/x4400e.pdf) (The State of Food and Agriculture), data for "1934-38" (use 1936) and/or "1946-49" (use 1947). + - For Myanmar, Sri Lanka, China, and many other countries for 1947 and 1948: We use values from Table 15 from [FAO (1949)](https://www.fao.org/4/ap637e/ap637e.pdf). source: title: Source of the data point unit: "" diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index f10185c1e99..cba2cf6531c 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -1,4 +1,6 @@ -"""TODO: Explain this step. +"""Historical daily calorie supply per person, based on a combination of sources. + +See description_processing (in the adjacent metadata file) for more details on the choices below. 
""" @@ -6,6 +8,7 @@ import owid.catalog.processing as pr import pandas as pd +from owid.catalog import Table from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset @@ -21,6 +24,7 @@ def correct_year(year: Union[str, int], verbose: bool = False) -> int: + # Correct the value of a year, to ensure it is an integer, and, when given a range, take the average value. year_str = str(year) if len(year_str) == 4: # Normal format, e.g. "1990". @@ -43,6 +47,19 @@ def correct_year(year: Union[str, int], verbose: bool = False) -> int: return year_corrected +def correct_data_values(tb: Table) -> Table: + # Some data values are given as ranges, e.g. "1234-1345". + # Ensure all values are real numbers, and take the average value when a range is given. + select_ranges = pd.to_numeric(tb["daily_calories"], errors="coerce").isnull() + tb.loc[select_ranges, "daily_calories"] = [ + (float(value.split("-")[0]) + float(value.split("-")[1])) / 2 + for value in tb[select_ranges]["daily_calories"].values + ] + tb = tb.astype({"daily_calories": float}) + + return tb + + def run(dest_dir: str) -> None: # # Load inputs. @@ -122,7 +139,7 @@ def run(dest_dir: str) -> None: # Ensure years are integers. When given a range of years, take the middle year. tb["year"] = tb["year"].apply(correct_year) - # Sanity check. + # Sanity checks. assert tb[tb["country"].isnull()].empty, "Some countries are missing." assert tb[tb["year"].isnull()].empty, "Some years are missing." @@ -130,25 +147,18 @@ def run(dest_dir: str) -> None: tb = tb.dropna(subset=["daily_calories"]).reset_index(drop=True) # Some numbers are given as ranges, e.g. "2914-2949". Take the average value. 
- select_ranges = pd.to_numeric(tb["daily_calories"], errors="coerce").isnull() - tb.loc[select_ranges, "daily_calories"] = [ - (float(value.split("-")[0]) + float(value.split("-")[1])) / 2 - for value in tb[select_ranges]["daily_calories"].values - ] - tb = tb.astype({"daily_calories": float}) + tb = correct_data_values(tb=tb) # Start a new table with the selection of all countries from different sources. # * Most countries have data only from FAOSTAT, or from FAOSTAT + FAO (1949), or from FAOSTAT + FAO (2000). - # There is no conflicting overlap among them. - # Although there are some jumps between FAO (1949) and FAOSTAT (which we will accept). + # There is no overlap among them (i.e. there is only one value for each year). + # Although there are some abrupt jumps between FAO (1949) and FAOSTAT (which we will accept). # We take all these countries. countries_selected = [ country for country in tb["country"].unique() if len(set(tb[tb["country"] == country]["source"])) < 3 ] tb_selected = tb[tb["country"].isin(countries_selected)].reset_index(drop=True) - # For the remaining countries, we will combine sources in different ways one by one. - # * Belgium: tb_belgium = tb[ (tb["country"] == "Belgium") & (tb["source"].isin(["FAOSTAT", "FAO (1949)", "Floud et al. (2011)"])) @@ -194,8 +204,8 @@ def run(dest_dir: str) -> None: tb_norway = tb[(tb["country"] == "Norway") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)", "FAO (1949)"]))] # * Mexico: - # There is an overlap on year 1947 between FAO (1949) and FAO (2000) (with similar values). - # Remove the overlapping point from FAO (1949). + # There is an overlap on year 1947 between FAO (1949) and FAO (2000) (with similar values). + # Remove the overlapping point from FAO (1949). tb_mexico = tb[ (tb["country"] == "Mexico") & (tb["source"].isin(["FAOSTAT", "FAO (2000)", "FAO (1949)"])) @@ -203,7 +213,8 @@ def run(dest_dir: str) -> None: ] # * Peru: - # (Same issue as Mexico). 
+ # There is an overlap on year 1947 between FAO (1949) and FAO (2000) (with similar values). + # Remove the overlapping point from FAO (1949). tb_peru = tb[ (tb["country"] == "Peru") & (tb["source"].isin(["FAOSTAT", "FAO (2000)", "FAO (1949)"])) From 4befd6b7c70b53b5481334e0e53b30d865aa4d09 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 29 May 2024 11:13:45 +0200 Subject: [PATCH 15/17] Fix remaining to-dos --- .../2024-05-23/daily_calories_per_person.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py index cba2cf6531c..1e64f71fa93 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -175,9 +175,12 @@ def run(dest_dir: str) -> None: [ # Prior to 1800, the only data (two points) comes from Fogel (2004). tb[(tb["country"] == "France") & (tb["year"] < 1800) & (tb["source"] == "Fogel (2004)")], - # After 1800, take data from Grigg (1995) and FAOSTAT. - # TODO: Shall we include "FAO (1949)"? - tb[(tb["country"] == "France") & (tb["year"] >= 1800) & (tb["source"].isin(["FAOSTAT", "Grigg (1995)"]))], + # After 1800, take data from Grigg (1995), FAOSTAT, and FAO (1949). + tb[ + (tb["country"] == "France") + & (tb["year"] >= 1800) + & (tb["source"].isin(["FAOSTAT", "FAO (1949)", "Grigg (1995)"])) + ], ], ignore_index=True, ) @@ -186,7 +189,6 @@ def run(dest_dir: str) -> None: tb_germany = tb[(tb["country"] == "Germany") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)"]))] # * Iceland: - # TODO: Check why Floud et al. (2011) disregards the later increase from Jonsson (which is quite abrupt). 
 tb_iceland = tb[(tb["country"] == "Iceland") & (tb["source"].isin(["FAOSTAT", "Jonsson (1998)", "FAO (1949)"]))] # * India: From d6463297bce38fa427e5b6806e453a8a0ee3b9a5 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 29 May 2024 12:59:15 +0200 Subject: [PATCH 16/17] Improve metadata --- .../2024-05-23/daily_calories_per_person.meta.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml index 1d2420a1aad..a40071775fa 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml @@ -17,9 +17,10 @@ tables: unit: kilocalories per day short_unit: kcal description_key: - This data shows daily calorie supply per person, which is the amount of calories available to an average person, and does necessarily correspond to the calories actually consumed by that person. + - This data shows per capita daily calorie supply, which is the amount of calories available to an average person, and does not necessarily correspond to the calories actually consumed by that person. - Calorie supply is always larger than actual calorie consumption, since there may be waste at the household level. - For historical data, daily calorie supply and daily calorie consumption are sometimes used interchangeably, due to poor data availability. + - For historical data, daily calorie supply and consumption are sometimes used interchangeably, due to poor data availability. + - "This data does not give a complete picture of nutrition - for a healthy diet [we need much more](https://ourworldindata.org/micronutrient-deficiency) than just energy. But as the most basic criterion of food security, getting enough calories is an important measure. 
It is used as input for the most important metrics used to assess global malnutrition: [undernourishment](https://ourworldindata.org/undernourishment-definition)." description_processing: |- - For all countries, the data after 1960 is taken from FAOSTAT Food Balances datasets ([old](https://www.fao.org/faostat/en/#data/FBSH) and [new](https://www.fao.org/faostat/en/#data/FBS) methodologies combined). - For the UK: We load Appendix Table from [Harris et al. (2015)](https://www.emerald.com/insight/content/doi/10.1108/S0363-326820150000031003/full/html). From that table, we select values from [Broadberry et al. (2015)](https://www.cambridge.org/core/books/british-economic-growth-12701870/A270234C137117C8E0F1D1E7E6F0DA56) and the corrected values from [Floud et al (2011)](https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E) (taking the average value of Estimates (A) and (B)). @@ -29,8 +30,9 @@ tables: - For France: We use Table 1 from Grigg (1995). - We include the two additional data points (1705 and 1785) from [Fogel (2004)](https://www.cambridge.org/core/books/escape-from-hunger-and-premature-death-17002100/384C6032DE4E73C90EF6C9D1E55009CA). - For Belgium and Netherlands: We use Table 5.5 of Floud et al. (2011). - - For Uganda, Cambodia, China, India, Brazil, Mexico, and Peru: We use Table 11 of [FAO (2000)](https://www.fao.org/4/x4400e/x4400e.pdf) (The State of Food and Agriculture), data for "1934-38" (use 1936) and/or "1946-49" (use 1947). - - For Myanmar, Sri Lanka, China, and many other countries for 1947 and 1948: We use values from Table 15 from [FAO (1949)](https://www.fao.org/4/ap637e/ap637e.pdf). + - For Uganda, Cambodia, China, India, Brazil, Mexico, and Peru for 1936 and 1947: We use Table 11 of [FAO (2000)](https://www.fao.org/4/x4400e/x4400e.pdf) (The State of Food and Agriculture). 
+ - For many countries (including some of the above) for 1947 and 1948: We use values from Table 15 from [FAO (1949)](https://www.fao.org/4/ap637e/ap637e.pdf). + - Note that prior to 1961, data for the UK may correspond to England, or England and Wales; and Tanzania refers to Tanganyika. source: title: Source of the data point unit: "" From f13367af63b2aa1a2892da185af0ec85bd01d371 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 29 May 2024 16:20:17 +0200 Subject: [PATCH 17/17] Improve metadata --- .../agriculture/2024-05-23/daily_calories_per_person.meta.yml | 2 ++ snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml index a40071775fa..6d26a849449 100644 --- a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml @@ -33,6 +33,8 @@ tables: - For Uganda, Cambodia, China, India, Brazil, Mexico, and Peru for 1936 and 1947: We use Table 11 of [FAO (2000)](https://www.fao.org/4/x4400e/x4400e.pdf) (The State of Food and Agriculture). - For many countries (including some of the above) for 1947 and 1948: We use values from Table 15 from [FAO (1949)](https://www.fao.org/4/ap637e/ap637e.pdf). - Note that prior to 1961, data for the UK may correspond to England, or England and Wales; and Tanzania refers to Tanganyika. 
+ display: + numDecimalPlaces: 0 source: title: Source of the data point unit: "" diff --git a/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc b/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc index f2534d326d1..f175ff6ab3d 100644 --- a/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc +++ b/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc @@ -3,7 +3,7 @@ meta: origin: # Data product / Snapshot - title: The Escape from hunger and Premature Death + title: The Escape from Hunger and Premature Death title_snapshot: The Escape from hunger and Premature Death - Daily calories in France and Great Britain description: |- This dataset contains daily calorie supply from Table 1.2 of Fogel (2004) book: "The Escape from hunger and Premature Death".