diff --git a/dag/agriculture.yml b/dag/agriculture.yml index fc24e0cb998..490a0635c14 100644 --- a/dag/agriculture.yml +++ b/dag/agriculture.yml @@ -73,6 +73,62 @@ steps: data://grapher/wb/2024-03-26/food_prices_for_nutrition: - data://garden/wb/2024-03-26/food_prices_for_nutrition + # + # Harris et al. (2015) - Daily calories in England and Wales, according to various studies. + # + data://meadow/agriculture/2024-05-23/harris_et_al_2015: + - snapshot://agriculture/2024-05-23/harris_et_al_2015.csv + # + # Floud et al. (2011) - Daily calories in United States and Western Europe. + # + data://meadow/agriculture/2024-05-23/floud_et_al_2011: + - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv + - snapshot://agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv + # + # Jonsson (1998) - Daily calories in Iceland. + # + data://meadow/agriculture/2024-05-23/jonsson_1998: + - snapshot://agriculture/2024-05-23/jonsson_1998.csv + # + # Grigg (1995) - Daily calories in Western Europe. + # + data://meadow/agriculture/2024-05-23/grigg_1995: + - snapshot://agriculture/2024-05-23/grigg_1995.csv + # + # Fogel (2004) - Daily calories in France and Great Britain. + # + data://meadow/agriculture/2024-05-23/fogel_2004: + - snapshot://agriculture/2024-05-23/fogel_2004.csv + # + # FAO (2000) - The State of Food and Agriculture 2000. + # + data://meadow/agriculture/2024-05-23/fao_2000: + - snapshot://agriculture/2024-05-23/fao_2000.csv + # + # FAO (1949) - The State of Food and Agriculture 1949. + # + data://meadow/agriculture/2024-05-23/fao_1949: + - snapshot://agriculture/2024-05-23/fao_1949.csv + # + # USDA/ERS - Food availability. + # + data://meadow/usda_ers/2024-05-23/food_availability: + - snapshot://usda_ers/2024-05-23/food_availability.xls + # + # Agriculture - Long-run daily calorie supply per person. 
+ # + data://garden/agriculture/2024-05-23/daily_calories_per_person: + - data://garden/faostat/2024-03-14/faostat_fbsc + - data://meadow/agriculture/2024-05-23/harris_et_al_2015 + - data://meadow/agriculture/2024-05-23/floud_et_al_2011 + - data://meadow/agriculture/2024-05-23/jonsson_1998 + - data://meadow/agriculture/2024-05-23/grigg_1995 + - data://meadow/agriculture/2024-05-23/fogel_2004 + - data://meadow/agriculture/2024-05-23/fao_2000 + - data://meadow/agriculture/2024-05-23/fao_1949 + - data://meadow/usda_ers/2024-05-23/food_availability + data://grapher/agriculture/2024-05-23/daily_calories_per_person: + - data://garden/agriculture/2024-05-23/daily_calories_per_person ###################################################################################################################### # Older versions to be archived once they are not used by any other steps. ###################################################################################################################### diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json new file mode 100644 index 00000000000..28076d8bb8c --- /dev/null +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.countries.json @@ -0,0 +1,247 @@ +{ + "Belgium": "Belgium", + "Finland": "Finland", + "Germany": "Germany", + "Iceland": "Iceland", + "Italy": "Italy", + "Netherlands": "Netherlands", + "Norway": "Norway", + "Algeria": "Algeria", + "Argentina": "Argentina", + "Australia": "Australia", + "Austria": "Austria", + "Brazil": "Brazil", + "Burma": "Myanmar", + "Cambodia": "Cambodia", + "Canada": "Canada", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Cuba": "Cuba", + "Czechoslovakia": "Czechoslovakia", + "Denmark": "Denmark", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Ethiopia": "Ethiopia", + "France": "France", + "Greece": "Greece", + 
"Hungary": "Hungary", + "India": "India", + "Ireland": "Ireland", + "Japan": "Japan", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Mexico": "Mexico", + "Morocco": "Morocco", + "New Zealand": "New Zealand", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Spain": "Spain", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Thailand": "Thailand", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Uganda": "Uganda", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Yugoslavia": "Yugoslavia", + "Ceylon": "Sri Lanka", + "England": "United Kingdom", + "England and Wales": "United Kingdom", + "Great Britain": "United Kingdom", + "Tanganyika": "Tanzania", + "Union of South Africa": "South Africa", + "Net Food Importing Developing Countries (FAO)": "Net Food Importing Developing Countries (FAO)", + "Serbia": "Serbia", + "Bermuda": "Bermuda", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Georgia": "Georgia", + "Taiwan": "Taiwan", + "North Korea": "North Korea", + "Liberia": "Liberia", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Romania": "Romania", + "Africa (FAO)": "Africa (FAO)", + "Haiti": "Haiti", + "Senegal": "Senegal", + "Croatia": "Croatia", + "Brunei": "Brunei", + "Guinea": "Guinea", + "Jamaica": "Jamaica", + "Syria": "Syria", + "Albania": "Albania", + "Southern Africa (FAO)": "Southern Africa (FAO)", + "Papua New Guinea": "Papua New Guinea", + "Sudan": "Sudan", + "Niger": "Niger", + "China (FAO)": "China (FAO)", + "Southern Asia (FAO)": "Southern Asia (FAO)", + "Antigua and Barbuda": "Antigua and Barbuda", + "High-income countries": "High-income countries", + "Small Island Developing States (FAO)": "Small Island Developing States (FAO)", + "Togo": "Togo", + "Honduras": "Honduras", + "Nicaragua": "Nicaragua", + "Lower-middle-income countries": "Lower-middle-income countries", + "Middle Africa (FAO)": "Middle Africa (FAO)", 
+ "Paraguay": "Paraguay", + "South Africa": "South Africa", + "Indonesia": "Indonesia", + "Western Europe (FAO)": "Western Europe (FAO)", + "Northern America (FAO)": "Northern America (FAO)", + "Burundi": "Burundi", + "Gambia": "Gambia", + "Russia": "Russia", + "Belarus": "Belarus", + "Kenya": "Kenya", + "Slovenia": "Slovenia", + "Cameroon": "Cameroon", + "Micronesia (country)": "Micronesia (country)", + "Namibia": "Namibia", + "Vanuatu": "Vanuatu", + "Bolivia": "Bolivia", + "Samoa": "Samoa", + "Sri Lanka": "Sri Lanka", + "South America": "South America", + "Bahamas": "Bahamas", + "Nepal": "Nepal", + "Zambia": "Zambia", + "Mauritius": "Mauritius", + "Turkmenistan": "Turkmenistan", + "Sudan (former)": "Sudan (former)", + "Least Developed Countries (FAO)": "Least Developed Countries (FAO)", + "Czechia": "Czechia", + "Afghanistan": "Afghanistan", + "Sierra Leone": "Sierra Leone", + "Asia (FAO)": "Asia (FAO)", + "Netherlands Antilles": "Netherlands Antilles", + "Mongolia": "Mongolia", + "Saudi Arabia": "Saudi Arabia", + "Yemen": "Yemen", + "Grenada": "Grenada", + "Lebanon": "Lebanon", + "Cyprus": "Cyprus", + "Kiribati": "Kiribati", + "North Macedonia": "North Macedonia", + "South America (FAO)": "South America (FAO)", + "Malaysia": "Malaysia", + "Eswatini": "Eswatini", + "Barbados": "Barbados", + "Uzbekistan": "Uzbekistan", + "Angola": "Angola", + "French Polynesia": "French Polynesia", + "Rwanda": "Rwanda", + "Oceania": "Oceania", + "Vietnam": "Vietnam", + "Slovakia": "Slovakia", + "Fiji": "Fiji", + "Botswana": "Botswana", + "Lithuania": "Lithuania", + "Caribbean (FAO)": "Caribbean (FAO)", + "Micronesia (FAO)": "Micronesia (FAO)", + "Southern Europe (FAO)": "Southern Europe (FAO)", + "Guinea-Bissau": "Guinea-Bissau", + "Bhutan": "Bhutan", + "Dominica": "Dominica", + "Land Locked Developing Countries (FAO)": "Land Locked Developing Countries (FAO)", + "Asia": "Asia", + "Central America (FAO)": "Central America (FAO)", + "South-eastern Asia (FAO)": "South-eastern Asia 
(FAO)", + "Azerbaijan": "Azerbaijan", + "Malta": "Malta", + "Low Income Food Deficit Countries (FAO)": "Low Income Food Deficit Countries (FAO)", + "Qatar": "Qatar", + "East Timor": "East Timor", + "Libya": "Libya", + "Comoros": "Comoros", + "Cote d'Ivoire": "Cote d'Ivoire", + "Nauru": "Nauru", + "Armenia": "Armenia", + "Upper-middle-income countries": "Upper-middle-income countries", + "Tanzania": "Tanzania", + "Burkina Faso": "Burkina Faso", + "Iran": "Iran", + "Eastern Asia (FAO)": "Eastern Asia (FAO)", + "Central African Republic": "Central African Republic", + "New Caledonia": "New Caledonia", + "Europe (FAO)": "Europe (FAO)", + "European Union (27)": "European Union (27)", + "Moldova": "Moldova", + "Belgium-Luxembourg (FAO)": "Belgium-Luxembourg (FAO)", + "Benin": "Benin", + "Israel": "Israel", + "Democratic Republic of Congo": "Democratic Republic of Congo", + "Nigeria": "Nigeria", + "Africa": "Africa", + "Belize": "Belize", + "Suriname": "Suriname", + "South Sudan": "South Sudan", + "Ecuador": "Ecuador", + "Cape Verde": "Cape Verde", + "Western Asia (FAO)": "Western Asia (FAO)", + "Sao Tome and Principe": "Sao Tome and Principe", + "Eastern Africa (FAO)": "Eastern Africa (FAO)", + "Tajikistan": "Tajikistan", + "Low-income countries": "Low-income countries", + "Oman": "Oman", + "Montenegro": "Montenegro", + "Latvia": "Latvia", + "Lesotho": "Lesotho", + "Bahrain": "Bahrain", + "North America": "North America", + "Myanmar": "Myanmar", + "Seychelles": "Seychelles", + "Trinidad and Tobago": "Trinidad and Tobago", + "Bulgaria": "Bulgaria", + "Western Africa (FAO)": "Western Africa (FAO)", + "Panama": "Panama", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Chad": "Chad", + "World": "World", + "Serbia and Montenegro": "Serbia and Montenegro", + "Maldives": "Maldives", + "Bangladesh": "Bangladesh", + "Kyrgyzstan": "Kyrgyzstan", + "Northern Europe (FAO)": "Northern Europe (FAO)", + "Central Asia (FAO)": "Central Asia (FAO)", + "Congo": 
"Congo", + "Europe": "Europe", + "Zimbabwe": "Zimbabwe", + "Mali": "Mali", + "Iraq": "Iraq", + "Jordan": "Jordan", + "Ghana": "Ghana", + "Solomon Islands": "Solomon Islands", + "Mozambique": "Mozambique", + "Costa Rica": "Costa Rica", + "Americas (FAO)": "Americas (FAO)", + "Hong Kong": "Hong Kong", + "Saint Lucia": "Saint Lucia", + "Malawi": "Malawi", + "Guyana": "Guyana", + "Eastern Europe (FAO)": "Eastern Europe (FAO)", + "Pakistan": "Pakistan", + "Estonia": "Estonia", + "Oceania (FAO)": "Oceania (FAO)", + "Djibouti": "Djibouti", + "Kuwait": "Kuwait", + "Ukraine": "Ukraine", + "Gabon": "Gabon", + "Mauritania": "Mauritania", + "Guatemala": "Guatemala", + "South Korea": "South Korea", + "Kazakhstan": "Kazakhstan", + "Laos": "Laos", + "Macao": "Macao", + "USSR": "USSR", + "United Arab Emirates": "United Arab Emirates", + "Dominican Republic": "Dominican Republic", + "Ethiopia (former)": "Ethiopia (former)", + "Northern Africa (FAO)": "Northern Africa (FAO)", + "Venezuela": "Venezuela", + "European Union (27) (FAO)": "European Union (27) (FAO)" +} diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml new file mode 100644 index 00000000000..6d26a849449 --- /dev/null +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.meta.yml @@ -0,0 +1,41 @@ +definitions: + common: + processing_level: major + presentation: + topic_tags: + - Food Supply + +dataset: + update_period_days: 365 + +tables: + daily_calories_per_person: + title: Daily calorie supply per person + variables: + daily_calories: + title: Daily calorie supply per person + unit: kilocalories per day + short_unit: kcal + description_key: + - This data shows per capita daily calorie supply, which is the amount of calories available to an average person, and does not necessarily correspond to the calories actually consumed by that person. 
+ - Calorie supply is always larger than actual calorie consumption, since there may be waste at the household level. + - For historical data, daily calorie supply and consumption are sometimes used interchangeably, due to poor data availability. + - "This data does not give a complete picture of nutrition - for a healthy diet [we need much more](https://ourworldindata.org/micronutrient-deficiency) than just energy. But as the most basic criterion of food security, getting enough calories is an important measure. It is used as input for the most important metrics used to assess global malnutrition: [undernourishment](https://ourworldindata.org/undernourishment-definition)." + description_processing: |- + - For all countries, the data after 1960 is taken from FAOSTAT Food Balances datasets ([old](https://www.fao.org/faostat/en/#data/FBSH) and [new](https://www.fao.org/faostat/en/#data/FBS) methodologies combined). + - For the UK: We load Appendix Table from [Harris et al. (2015)](https://www.emerald.com/insight/content/doi/10.1108/S0363-326820150000031003/full/html). From that table, we select values from [Broadberry et al. (2015)](https://www.cambridge.org/core/books/british-economic-growth-12701870/A270234C137117C8E0F1D1E7E6F0DA56) and the corrected values from [Floud et al. (2011)](https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E) (taking the average value of Estimates (A) and (B)). + - For the US: For years 1800-1900, we use Table 6.6 of [Floud et al. (2011)](https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E). For years 1900-1960, we use [the archived table of food supply from USDA](https://www.ers.usda.gov/webdocs/DataFiles/50472/nutrients.xls?v=6096.1). + - For Iceland: We use Table 5 of [Jonsson (1998)](https://www.tandfonline.com/doi/abs/10.1080/03585522.1998.10414677). 
+ - For Finland, Germany, Italy, Norway: We use Table 1 from [Grigg (1995)](https://www.sciencedirect.com/science/article/abs/pii/S0305748885700187), which is a compilation of many sources. + - For France: We use Table 1 from Grigg (1995). + - We include the two additional data points (1705 and 1785) from [Fogel (2004)](https://www.cambridge.org/core/books/escape-from-hunger-and-premature-death-17002100/384C6032DE4E73C90EF6C9D1E55009CA). + - For Belgium and Netherlands: We use Table 5.5 of Floud et al. (2011). + - For Uganda, Cambodia, China, India, Brazil, Mexico, and Peru for 1936 and 1947: We use Table 11 of [FAO (2000)](https://www.fao.org/4/x4400e/x4400e.pdf) (The State of Food and Agriculture). + - For many countries (including some of the above) for 1947 and 1948: We use values from Table 15 from [FAO (1949)](https://www.fao.org/4/ap637e/ap637e.pdf). + - Note that prior to 1961, data for the UK may correspond to England, or England and Wales; and Tanzania refers to Tanganyika. + display: + numDecimalPlaces: 0 + source: + title: Source of the data point + unit: "" + short_unit: "" diff --git a/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py new file mode 100644 index 00000000000..1e64f71fa93 --- /dev/null +++ b/etl/steps/data/garden/agriculture/2024-05-23/daily_calories_per_person.py @@ -0,0 +1,324 @@ +"""Historical daily calorie supply per person, based on a combination of sources. + +See description_processing (in the adjacent metadata file) for more details on the choices below. + +""" + +from typing import Union + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + +# FAOSTAT element code for "Food available for consumption" measured in "kilocalories per day per capita" +# (corresponding to original FAOSTAT element "Food supply (kcal/capita/day)"). +ELEMENT_CODE_FOOD_SUPPLY_PER_CAPITA = "0664pc" +# FAOSTAT item code for "Total" (corresponding to original FAOSTAT item "Grand Total"). +ITEM_CODE_TOTAL = "00002901" + + +def correct_year(year: Union[str, int], verbose: bool = False) -> int: + # Correct the value of a year, to ensure it is an integer, and, when given a range, take the average value. + year_str = str(year) + if len(year_str) == 4: + # Normal format, e.g. "1990". + year_corrected = int(year_str) + elif len(year_str) == 9: + # Range format, e.g. "1990-1999" or "1845/1854". + year_start, year_end = year_str[0:4], year_str[5:9] + year_corrected = int((int(year_start) + int(year_end)) / 2) + elif len(year_str) == 7: + # Range format, but second year is incomplete, e.g. "1845-54". + year_start, year_end = year_str[0:4], year_str[0:2] + year_str[5:7] + year_corrected = int((int(year_start) + int(year_end)) / 2) + else: + raise ValueError(f"Unexpected year format: {year}") + + # As a sanity check, optionally print the correction. + if verbose and (str(year_corrected) != year_str): + print(f'Corrected "{year}" -> "{year_corrected}"') + + return year_corrected + + +def correct_data_values(tb: Table) -> Table: + # Some data values are given as ranges, e.g. "1234-1345". + # Ensure all values are real numbers, and take the average value when a range is given. + select_ranges = pd.to_numeric(tb["daily_calories"], errors="coerce").isnull() + tb.loc[select_ranges, "daily_calories"] = [ + (float(value.split("-")[0]) + float(value.split("-")[1])) / 2 + for value in tb[select_ranges]["daily_calories"].values + ] + tb = tb.astype({"daily_calories": float}) + + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load FAOSTAT FBSC dataset and read its main table. 
+ ds_fbsc = paths.load_dataset("faostat_fbsc") + tb_fbsc = ds_fbsc["faostat_fbsc"].reset_index() + + # Load Harris et al. (2015) dataset and read its main table. + ds_harris = paths.load_dataset("harris_et_al_2015") + tb_harris = ds_harris["harris_et_al_2015"].reset_index() + + # Load Floud et al. (2011) dataset and read its main table. + ds_floud = paths.load_dataset("floud_et_al_2011") + tb_floud = ds_floud["floud_et_al_2011"].reset_index() + + # Load Jonsson (1998) dataset and read its main table. + ds_jonsson = paths.load_dataset("jonsson_1998") + tb_jonsson = ds_jonsson["jonsson_1998"].reset_index() + + # Load Grigg (1995) dataset and read its main table. + ds_grigg = paths.load_dataset("grigg_1995") + tb_grigg = ds_grigg["grigg_1995"].reset_index() + + # Load Fogel (2004) dataset and read its main table. + ds_fogel = paths.load_dataset("fogel_2004") + tb_fogel = ds_fogel["fogel_2004"].reset_index() + + # Load FAO (2000) dataset and read its main table. + ds_fao2000 = paths.load_dataset("fao_2000") + tb_fao2000 = ds_fao2000["fao_2000"].reset_index() + + # Load FAO (1949) dataset and read its main table. + ds_fao1949 = paths.load_dataset("fao_1949") + tb_fao1949 = ds_fao1949["fao_1949"].reset_index() + + # Load USDA/ERS data on food availability. + ds_usda = paths.load_dataset("food_availability") + tb_usda = ds_usda["food_availability"].reset_index() + + # + # Process data. + # + # Prepare FAOSTAT data. + tb_fbsc = ( + tb_fbsc[ + (tb_fbsc["element_code"] == ELEMENT_CODE_FOOD_SUPPLY_PER_CAPITA) & (tb_fbsc["item_code"] == ITEM_CODE_TOTAL) + ][["country", "year", "value"]] + .rename(columns={"value": "daily_calories"}) + .reset_index(drop=True) + ) + + # Ensure the "source" column in Harris et al. (2015) is different from the sources in other tables. + tb_harris["source"] = tb_harris["source"].astype(str) + " via Harris et al. (2015)" + + # Concatenate all tables and add a source column. 
+ tb = pr.concat( + [ + tb_fbsc.assign(**{"source": "FAOSTAT"}), + tb_harris, + tb_floud.assign(**{"source": "Floud et al. (2011)"}), + tb_jonsson.assign(**{"source": "Jonsson (1998)"}), + tb_grigg.assign(**{"source": "Grigg (1995)"}), + tb_fogel.assign(**{"source": "Fogel (2004)"}), + tb_fao2000.assign(**{"source": "FAO (2000)"}), + tb_fao1949.assign(**{"source": "FAO (1949)"}), + tb_usda.assign(**{"source": "USDA/ERS"}), + ], + ignore_index=True, + ) + + # Harmonize country names. + tb = geo.harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, warn_on_missing_countries=True, warn_on_unused_countries=True + ) + + # Ensure years are integers. When given a range of years, take the middle year. + tb["year"] = tb["year"].apply(correct_year) + + # Sanity checks. + assert tb[tb["country"].isnull()].empty, "Some countries are missing." + assert tb[tb["year"].isnull()].empty, "Some years are missing." + + # Drop rows with no data on daily calories. + tb = tb.dropna(subset=["daily_calories"]).reset_index(drop=True) + + # Some numbers are given as ranges, e.g. "2914-2949". Take the average value. + tb = correct_data_values(tb=tb) + + # Start a new table with the selection of all countries from different sources. + # * Most countries have data only from FAOSTAT, or from FAOSTAT + FAO (1949), or from FAOSTAT + FAO (2000). + # There is no overlap among them (i.e. there is only one value for each year). + # Although there are some abrupt jumps between FAO (1949) and FAOSTAT (which we will accept). + # We take all these countries. + countries_selected = [ + country for country in tb["country"].unique() if len(set(tb[tb["country"] == country]["source"])) < 3 + ] + tb_selected = tb[tb["country"].isin(countries_selected)].reset_index(drop=True) + + # * Belgium: + tb_belgium = tb[ + (tb["country"] == "Belgium") & (tb["source"].isin(["FAOSTAT", "FAO (1949)", "Floud et al. 
(2011)"])) + ] + + # * Brazil: + tb_brazil = tb[(tb["country"] == "Brazil") & (tb["source"].isin(["FAOSTAT", "FAO (2000)"]))] + + # * Finland: + tb_finland = tb[(tb["country"] == "Finland") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)", "FAO (1949)"]))] + + # * France: + tb_france = pr.concat( + [ + # Prior to 1800, the only data (two points) comes from Fogel (2004). + tb[(tb["country"] == "France") & (tb["year"] < 1800) & (tb["source"] == "Fogel (2004)")], + # After 1800, take data from Grigg (1995), FAOSTAT, and FAO (1949). + tb[ + (tb["country"] == "France") + & (tb["year"] >= 1800) + & (tb["source"].isin(["FAOSTAT", "FAO (1949)", "Grigg (1995)"])) + ], + ], + ignore_index=True, + ) + + # * Germany: + tb_germany = tb[(tb["country"] == "Germany") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)"]))] + + # * Iceland: + tb_iceland = tb[(tb["country"] == "Iceland") & (tb["source"].isin(["FAOSTAT", "Jonsson (1998)", "FAO (1949)"]))] + + # * India: + tb_india = tb[(tb["country"] == "India") & (tb["source"].isin(["FAOSTAT", "FAO (2000)", "FAO (1949)"]))] + + # * Italy: + tb_italy = tb[(tb["country"] == "Italy") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)", "FAO (1949)"]))] + + # * Netherlands: + tb_netherlands = tb[ + (tb["country"] == "Netherlands") & (tb["source"].isin(["FAOSTAT", "Floud et al. (2011)", "FAO (1949)"])) + ] + + # * Norway: + tb_norway = tb[(tb["country"] == "Norway") & (tb["source"].isin(["FAOSTAT", "Grigg (1995)", "FAO (1949)"]))] + + # * Mexico: + # There is an overlap on year 1947 between FAO (1949) and FAO (2000) (with similar values). + # Remove the overlapping point from FAO (1949). + tb_mexico = tb[ + (tb["country"] == "Mexico") + & (tb["source"].isin(["FAOSTAT", "FAO (2000)", "FAO (1949)"])) + & ~((tb["year"] == 1947) & (tb["source"] == "FAO (1949)")) + ] + + # * Peru: + # There is an overlap on year 1947 between FAO (1949) and FAO (2000) (with similar values). + # Remove the overlapping point from FAO (1949). 
+ tb_peru = tb[ + (tb["country"] == "Peru") + & (tb["source"].isin(["FAOSTAT", "FAO (2000)", "FAO (1949)"])) + & ~((tb["year"] == 1947) & (tb["source"] == "FAO (1949)")) + ] + + # * United States: + tb_us = pr.concat( + [ + # Prior to 1909, the only data is from Floud et al. (2011). + tb[(tb["country"] == "United States") & (tb["year"] < 1909) & (tb["source"] == "Floud et al. (2011)")], + # From 1909 to 1960, take data from USDA/ERS. + tb[ + (tb["country"] == "United States") + & (tb["year"] >= 1909) + & (tb["year"] < 1961) + & (tb["source"] == "USDA/ERS") + ], + # After 1960, use FAOSTAT. + tb[(tb["country"] == "United States") & (tb["year"] >= 1961) & (tb["source"] == "FAOSTAT")], + ], + ignore_index=True, + ) + + # * United Kingdom: + tb_uk = pr.concat( + [ + # Prior to 1700, take data from Broadberry et al. (2015). + tb[ + (tb["country"] == "United Kingdom") + & (tb["year"] < 1700) + & (tb["source"] == "Broadberry et al. (2015) via Harris et al. (2015)") + ], + # On 1700, take the Estimate (A) (which coincides with (B)) from Floud et al. (2011) via Harris et al. (2015). + tb[ + (tb["country"] == "United Kingdom") + & (tb["year"] == 1700) + & (tb["source"] == "Floud et al. (2011) (Estimates A and B) via Harris et al. (2015)") + ], + # Between 1700 and 1850, take the corrected data from Floud et al. (2011), averaging estimates (A) and (B) (taken from Harris et al. (2015)). + tb[ + (tb["country"] == "United Kingdom") + & (tb["year"] >= 1750) + & (tb["year"] <= 1850) + & (tb["source"].str.startswith("Floud et al. ")) + & (tb["source"].str.endswith("via Harris et al. (2015)")) + ] + .groupby(["country", "year"], observed=True, as_index=False) + .agg({"daily_calories": "mean"}) + .assign(**{"source": "Floud et al. (2011) via Harris et al. (2015) average between estimates (A) and (B)"}), + # Between 1850 and 1960, take data from Floud et al. (2011). 
+ tb[ + (tb["country"] == "United Kingdom") + & (tb["year"] > 1850) + & (tb["year"] <= 1960) + & (tb["source"] == "Floud et al. (2011)") + ], + # After 1960, use FAOSTAT. + tb[(tb["country"] == "United Kingdom") & (tb["year"] >= 1961) & (tb["source"] == "FAOSTAT")], + ], + ignore_index=True, + ) + + # Combine all selected tables. + tb_combined = ( + pr.concat( + [ + tb_selected, + tb_belgium, + tb_brazil, + tb_finland, + tb_france, + tb_germany, + tb_iceland, + tb_india, + tb_italy, + tb_netherlands, + tb_norway, + tb_mexico, + tb_peru, + tb_us, + tb_uk, + ], + ignore_index=True, + ) + .sort_values(["country", "year"]) + .reset_index(drop=True) + ) + + # Uncomment to visualize all original and combined series. + # import plotly.express as px + # tb_plot = pr.concat([tb, tb_combined.copy().assign(**{"source": "combined"})], ignore_index=True) + # for country in sorted(set(tb_plot["country"])): + # if len(set(tb_plot[tb_plot["country"] == country]["source"])) > 2: + # px.line(tb_plot[tb_plot["country"]==country], x="year", y="daily_calories", color="source", title=country, markers=True, color_discrete_map={"combined": "rgba(0,256,0,0.5)", "FAOSTAT": "rgba(0,0,256,0.5)", "USDA/ERS": "rgba(100,100,100,0.5)", "FAO (1949)": "rgba(256,0,0,0.5)", "FAO (2000)": "rgba(100,100,0,0.5)", "Fogel (2004)": "rgba(0,100,100,0.5)"}).show() + + # Set an appropriate index and sort conveniently. + tb_combined = tb_combined.format(short_name=paths.short_name) + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb_combined], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml index d24da7afa40..e8d7eb60289 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml @@ -107,83 +107,83 @@ tables: - Protein of animal origin includes protein supplied in the form of all meat commodities, eggs and dairy products, and fish & seafood. variables: energy_from_animal_products: - title: "Daily caloric intake per person from animal products" + title: "Daily calorie supply per person from animal products" unit: "kilocalories per day per capita" short_unit: "kcal" energy_from_animal_protein: - title: "Daily caloric intake per person that comes from animal protein" + title: "Daily calorie supply per person that comes from animal protein" unit: "kilocalories per day per capita" short_unit: "kcal" energy_from_vegetal_products: - title: "Daily caloric intake per person from vegetal products" + title: "Daily calorie supply per person from vegetal products" unit: "kilocalories per day per capita" short_unit: "kcal" energy_from_vegetal_protein: - title: "Daily caloric intake per person that comes from vegetal protein" + title: "Daily calorie supply per person that comes from vegetal protein" unit: "kilocalories per day per capita" short_unit: "kcal" fat_from_animal_products: - title: "Daily fat intake per person from animal products" + title: "Daily fat supply per person from animal products" unit: "grams per day per capita" short_unit: "g" fat_from_vegetal_products: - title: "Daily fat intake per person from vegetal products" + title: "Daily fat supply per person from vegetal products" unit: "grams per day per capita" short_unit: "g" protein_from_animal_products: - title: "Daily protein intake 
from animal products" + title: "Daily protein supply from animal products" unit: "grams per day per capita" short_unit: "g" protein_from_vegetal_products: - title: "Daily protein intake per person from vegetal products" + title: "Daily protein supply per person from vegetal products" unit: "grams per day per capita" short_unit: "g" share_of_energy_from_animal_protein: - title: "Share of the daily caloric intake that comes from animal protein" + title: "Share of the daily calorie supply that comes from animal protein" unit: "%" short_unit: "%" share_of_energy_from_carbohydrates: - title: "Share of the daily caloric intake that comes from carbohydrates" + title: "Share of the daily calorie supply that comes from carbohydrates" unit: "%" short_unit: "%" share_of_energy_from_fat: - title: "Share of the daily caloric intake that comes from fat" + title: "Share of the daily calorie supply that comes from fat" unit: "%" short_unit: "%" share_of_energy_from_protein: - title: "Share of the daily caloric intake that comes from protein" + title: "Share of the daily calorie supply that comes from protein" unit: "%" short_unit: "%" share_of_energy_from_vegetal_protein: - title: "Share of the daily caloric intake that comes from vegetal protein" + title: "Share of the daily calorie supply that comes from vegetal protein" unit: "%" short_unit: "%" total_carbohydrates: - title: "Daily carbohydrates intake per person" + title: "Daily carbohydrates supply per person" unit: "grams per day per capita" short_unit: "g" total_energy: - title: "Daily caloric intake per person" + title: "Daily calorie supply per person" unit: "kilocalories per day per capita" short_unit: "kcal" total_energy_from_carbohydrates: - title: "Daily caloric intake per person from carbohydrates" + title: "Daily calorie supply per person from carbohydrates" unit: "kilocalories per day per capita" short_unit: "kcal" total_energy_from_fat: - title: "Daily caloric intake per person from fat" + title: "Daily calorie 
supply per person from fat" unit: "kilocalories per day per capita" short_unit: "kcal" total_energy_from_protein: - title: "Daily caloric intake per person from protein" + title: "Daily calorie supply per person from protein" unit: "kilocalories per day per capita" short_unit: "kcal" total_fat: - title: "Daily fat intake per person" + title: "Daily fat supply per person" unit: "grams per day per capita" short_unit: "g" total_protein: - title: "Daily protein intake per person" + title: "Daily protein supply per person" unit: "grams per day per capita" short_unit: "g" fertilizers: diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py index 8617d95e0e7..da5b1056487 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py @@ -496,7 +496,7 @@ def generate_food_available_for_consumption(tb_fbsc: Table) -> Table: ) tb_food_available_for_consumption[ underscore(group) - ].metadata.title = f"Daily caloric intake per person from {group.lower().replace('other', 'other commodities')}" + ].metadata.title = f"Daily calorie supply per person from {group.lower().replace('other', 'other commodities')}" tb_food_available_for_consumption[underscore(group)].metadata.unit = CONSUMPTION_UNIT tb_food_available_for_consumption[underscore(group)].metadata.short_unit = "kcal" tb_food_available_for_consumption[underscore(group)].metadata.description_key = [description] @@ -582,37 +582,37 @@ def generate_macronutrient_compositions(tb_fbsc: Table) -> Table: # Combine all tables. combined = pr.multi_merge(tables=tables, on=["country", "year"], how="outer") - # Daily caloric intake from fat, per person. + # Daily calorie supply from fat, per person. combined["Total energy from fat"] = combined["Total fat"] * KCAL_PER_GRAM_OF_FAT - # Daily caloric intake from protein, per person. 
+ # Daily calorie supply from protein, per person. combined["Total energy from protein"] = combined["Total protein"] * KCAL_PER_GRAM_OF_PROTEIN - # Daily caloric intake from carbohydrates (assumed to be the rest of the daily caloric intake), per person. - # This is the difference between the total caloric intake minus the caloric intake from protein and fat. + # Daily calorie supply from carbohydrates (assumed to be the rest of the daily calorie supply), per person. + # This is the difference between the total calorie supply minus the calorie supply from protein and fat. combined["Total energy from carbohydrates"] = ( combined["Total energy"] - combined["Total energy from fat"] - combined["Total energy from protein"] ) - # Daily intake of carbohydrates per person. + # Daily supply of carbohydrates per person. combined["Total carbohydrates"] = combined["Total energy from carbohydrates"] / KCAL_PER_GRAM_OF_CARBOHYDRATES - # Caloric intake from fat as a percentage of the total daily caloric intake. + # Calorie supply from fat as a percentage of the total daily calorie supply. combined["Share of energy from fat"] = 100 * combined["Total energy from fat"] / combined["Total energy"] - # Caloric intake from protein as a percentage of the total daily caloric intake. + # Calorie supply from protein as a percentage of the total daily calorie supply. combined["Share of energy from protein"] = 100 * combined["Total energy from protein"] / combined["Total energy"] - # Caloric intake from carbohydrates as a percentage of the total daily caloric intake. + # Calorie supply from carbohydrates as a percentage of the total daily calorie supply. combined["Share of energy from carbohydrates"] = ( 100 * combined["Total energy from carbohydrates"] / combined["Total energy"] ) - # Daily caloric intake from animal protein. + # Daily calorie supply from animal protein. 
combined["Energy from animal protein"] = combined["Protein from animal products"] * KCAL_PER_GRAM_OF_PROTEIN - # Caloric intake from animal protein as a percentage of the total daily caloric intake. + # Calorie supply from animal protein as a percentage of the total daily calorie supply. combined["Share of energy from animal protein"] = ( 100 * combined["Energy from animal protein"] / combined["Total energy"] ) - # Daily caloric intake from vegetal protein. + # Daily calorie supply from vegetal protein. combined["Energy from vegetal protein"] = combined["Protein from vegetal products"] * KCAL_PER_GRAM_OF_PROTEIN - # Caloric intake from vegetal protein as a percentage of the total daily caloric intake. + # Calorie supply from vegetal protein as a percentage of the total daily calorie supply. combined["Share of energy from vegetal protein"] = ( 100 * combined["Energy from vegetal protein"] / combined["Total energy"] ) diff --git a/etl/steps/data/grapher/agriculture/2024-05-23/daily_calories_per_person.py b/etl/steps/data/grapher/agriculture/2024-05-23/daily_calories_per_person.py new file mode 100644 index 00000000000..d56cf6fcb70 --- /dev/null +++ b/etl/steps/data/grapher/agriculture/2024-05-23/daily_calories_per_person.py @@ -0,0 +1,22 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = paths.load_dataset("daily_calories_per_person") + tb = ds_garden["daily_calories_per_person"] + + # + # Save outputs. + # + # Create a new grapher dataset. 
+ ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py b/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py new file mode 100644 index 00000000000..0e965f59f58 --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/fao_1949.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("fao_1949.csv") + tb = snap.read() + + # + # Process data. + # + # Transform data to have a year column. + tb = tb.melt(id_vars=["country"], var_name="year", value_name="daily_calories") + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py b/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py new file mode 100644 index 00000000000..7c2b95c3e46 --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/fao_2000.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("fao_2000.csv") + tb = snap.read() + + # + # Process data. + # + # Transform data to have a year column. + tb = tb.melt(id_vars=["country"], var_name="year", value_name="daily_calories") + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. 
+ # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py b/etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py new file mode 100644 index 00000000000..b9542acb88c --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/floud_et_al_2011.py @@ -0,0 +1,45 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots. + snap_europe = paths.load_snapshot("floud_et_al_2011_daily_calories_europe.csv") + snap_us = paths.load_snapshot("floud_et_al_2011_daily_calories_us.csv") + + # Load data from snapshots. + tb_europe = snap_europe.read() + tb_us = snap_us.read() + + # + # Process data. + # + # Transform Europe data to have a year column. + tb_europe = tb_europe.melt(id_vars=["country"], var_name="year", value_name="daily_calories") + + # Prepare US data. + tb_us = tb_us.rename(columns={"Year": "year", "Calories": "daily_calories"}, errors="raise").assign( + **{"country": "United States"} + ) + + # Combine both tables. + tb = pr.concat([tb_europe, tb_us], ignore_index=True) + + # Format table conveniently. + tb = tb.format(["country", "year"], short_name=paths.short_name) + + # + # Save outputs. + # + # Create a new meadow dataset. 
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py b/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py new file mode 100644 index 00000000000..45bcd82f1a2 --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/fogel_2004.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("fogel_2004.csv") + tb = snap.read() + + # + # Process data. + # + # Transform data to have a year column. + tb = tb.melt(id_vars=["Year"], var_name="country", value_name="daily_calories") + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py b/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py new file mode 100644 index 00000000000..b455d709750 --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/grigg_1995.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("grigg_1995.csv") + tb = snap.read() + + # + # Process data. + # + # Transform data to have a year column. + tb = tb.melt(id_vars=["country"], var_name="year", value_name="daily_calories") + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. 
+ # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py new file mode 100644 index 00000000000..6fb275bf11a --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/harris_et_al_2015.py @@ -0,0 +1,34 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("harris_et_al_2015.csv") + tb = snap.read() + + # + # Process data. + # + # Rename columns. + tb = tb.rename(columns={"Years": "year", "Source": "source", "Total": "daily_calories"}, errors="raise") + + # Add a country column. + tb["country"] = "England and Wales" + + # Format table conveniently. + tb = tb.format(["country", "year", "source"]) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py b/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py new file mode 100644 index 00000000000..ae84f20378c --- /dev/null +++ b/etl/steps/data/meadow/agriculture/2024-05-23/jonsson_1998.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("jonsson_1998.csv") + tb = snap.read() + + # + # Process data. + # + # Add a country column. 
+ tb["country"] = "Iceland" + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py b/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py new file mode 100644 index 00000000000..1eeba5fcac0 --- /dev/null +++ b/etl/steps/data/meadow/usda_ers/2024-05-23/food_availability.py @@ -0,0 +1,43 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. +COLUMNS = { + "Year": "year", + "Food energy": "daily_calories", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots and read their data. + snap = paths.load_snapshot("food_availability.xls") + data = snap.read(sheet_name="Totals", skiprows=1) + + # + # Process data. + # + # Select and rename columns. + tb = data[COLUMNS.keys()].rename(columns=COLUMNS) + + # Drop any row for which "year" is not an integer (to get rid of headers and footers). + tb = tb[tb["year"].apply(lambda x: isinstance(x, int))].reset_index(drop=True) + + # Add a country column. + tb["country"] = "United States" + + # Format table conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset. 
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc b/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc new file mode 100644 index 00000000000..5c39dead21d --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fao_1949.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The State of Food and Agriculture 1949 + title_snapshot: The State of Food and Agriculture 1949 - Daily calories in various countries + description: |- + This dataset contains daily calories in various countries, from Table 15 of FAO's "The State of Food and Agriculture 1949". + date_published: "1949-10-01" + + # Citation + producer: Food and Agriculture Organization of the United Nations + citation_full: |- + Food and Agriculture Organization of the United Nations (1949), "The State of Food and Agriculture 1949". + Data extracted from Table 15. + attribution_short: FAO (1949) + + # Files + url_main: https://www.un-ilibrary.org/content/books/9789210472654 + date_accessed: 2024-05-27 + + # License + license: + name: © FAO 1949 + url: https://www.un-ilibrary.org/content/books/9789210472654 +outs: + - md5: de13ba1616d768c467cb228cc495ad87 + size: 1074 + path: fao_1949.csv diff --git a/snapshots/agriculture/2024-05-23/fao_1949.py b/snapshots/agriculture/2024-05-23/fao_1949.py new file mode 100644 index 00000000000..9e5588e7089 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fao_1949.py @@ -0,0 +1,89 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/fao_1949.csv") + + # Data manually extracted. + data = """ +country,1947/48,1948/49 +Burma,1986,1877 +Ceylon,1977,1918 +India,,1570 +Japan,1670,1795 +Philippines,1770, +Thailand,2110,2020 +Austria,2397,2698 +Belgium,2667,2760 +Czechoslovakia,2402,2656 +Denmark,3125,3206 +Finland,2617,2851 +France,2357,2667 +Greece,2266,2358 +Hungary,2432, +Iceland,3268, +Ireland,3260,3276 +Italy,2249,2398 +Netherlands,2856, +Luxembourg,2693,2878 +Norway,2899,3051 +Poland,2363,2625 +Portugal,2279,2184 +Spain,2180,2377 +Sweden,2871,3108 +Switzerland,3050,2996 +United Kingdom,2968,3084 +Yugoslavia,2144, +Australia,3262,3265 +New Zealand,3286,3259 +Canada,3161,3141 +United States,3244,3186 +Cuba,2682,2814 +El Salvador,1557, +Mexico,2032,2101 +Argentina,3188,3191 +Brazil,2245, +Chile,2352,2356 +Colombia,1950, +Peru,1925,2219 +Uruguay,2490,2529 +Egypt,2364,2458 +Turkey,2173,2506 +Ethiopia,1770, +Algeria,1279,1421 +Madagascar,2074, +Morocco,1837,1825 +Tanganyika,2163, +Tunisia,1498,1545 +Union of South Africa,2422,2517 + """ + # NOTE: + # * The table includes China, but only for 22 provinces, so we ignore it. + # * The table includes India and Pakistan, but the footnote says that the value for 1948/49 is only India. + # * Footnote says about Japan: "It is believed by the Supreme Command Allied Powers that for staple foods there is an appreciable understatement of production, particularly from home gardens, both in staple foods and vegetables. A nutrition survey conducted by the Ministry of Welfare estimated calorie supplies per person per day at 1,965.". + # * Footnote says about France: "Unreported production has most likely provided enough calories to raise the level to about 2,500-2,600 calories.".
+ # * For some countries, the footnote says "Calendar year basis: 1947 and 1948.", but that is already the years we will use for all countries. + + # Create a dataframe with the extracted data. + df = pd.read_csv(StringIO(data)) + + # Create snapshot. + snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() diff --git a/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc b/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc new file mode 100644 index 00000000000..93afa11984c --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fao_2000.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The State of Food and Agriculture 2000 + title_snapshot: The State of Food and Agriculture 2000 - Daily calories in various countries + description: |- + This dataset contains daily calories in various countries, from Table 11 of FAO's "The State of Food and Agriculture 2000". + date_published: "2000-10-01" + + # Citation + producer: Food and Agriculture Organization of the United Nations + citation_full: |- + Food and Agriculture Organization of the United Nations (2000), "The State of Food and Agriculture 2000". + Data extracted from Table 11. 
+ attribution_short: FAO (2000) + + # Files + url_main: https://www.fao.org/agrifood-economics/publications/detail/en/c/122046/ + date_accessed: 2024-05-27 + + # License + license: + name: © FAO 2000 + url: https://www.fao.org/agrifood-economics/publications/detail/en/c/122046/ +outs: + - md5: 695a0bdbf3f50e6008d3be384cb8588c + size: 196 + path: fao_2000.csv diff --git a/snapshots/agriculture/2024-05-23/fao_2000.py b/snapshots/agriculture/2024-05-23/fao_2000.py new file mode 100644 index 00000000000..cc43a9eeecf --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fao_2000.py @@ -0,0 +1,51 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/fao_2000.csv") + + # Data manually extracted. + data = """ +country,1934-38,1946-49 +Uganda,,2100 +Cambodia,,1560 +Mexico,1800,2050 +Peru,1860,1920 + """ + # Note that I removed "Kenya,2230," because, as the footnote says, it includes Uganda. + # I also removed the first point of Cambodia because it was actually referring to French Indochina. + # I also removed the first point of India because it was actually referring to India and Pakistan. + # Note that the footnote of the table says that the year ranges for India, China and Brazil are different. + # Create an additional dataframe for them. + data_additional = """ +country,1934-38,1946-49,1931-37,1949-50,1935-39 +China,,,2230,2030 +India,,,,1700 +Brazil,,2340,,,2150 + """ + + # Create a dataframe with the extracted data. 
+ df = pd.read_csv(StringIO(data)) + df_additional = pd.read_csv(StringIO(data_additional)) + df = pd.concat([df, df_additional], ignore_index=True) + + # Create snapshot. + snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011.py b/snapshots/agriculture/2024-05-23/floud_et_al_2011.py new file mode 100644 index 00000000000..0b201795ce9 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011.py @@ -0,0 +1,75 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Initialize new snapshots for daily caloric intake in the US and in Western Europe. + snap_us = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_us.csv") + snap_europe = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/floud_et_al_2011_daily_calories_europe.csv") + + # Data from Table 6.6 on US daily caloric intake, extracted using chatGPT 4o (and manually inspected). + data_us = """ +Year,Calories +1800,2952 +1810,2935 +1820,2904 +1830,2888 +1840,3013 +1850,2585 +1860,2826 +1870,3029 +1880,3237 +1890,3134 +1900,3212 +1910,3068 +1920,3259 +1930,3400 +1940,3300 +1952,3200 +1960,3100 +1970,3200 +1980,3200 +1990,3500 +2000,3900 +2004,3900 + """ + + # Create a dataframe with the extracted data. + data_us_parsed = [line.split(",") for line in data_us.split("\n")[1:-1]] + df_us = pd.DataFrame(data_us_parsed[1:], columns=data_us_parsed[0]) + + # Data from Table 5.5 on Western Europe daily caloric intake, extracted using chatGPT 4o (and manually inspected). 
+ data_europe = """ +country,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,1900,1910,1920,1930,1940,1950,1960 +Belgium,2840,,,,,2423,2426,2553,2663,2851,2987,3278,,2940,,,3040 +England,2436,,,,,2512,,,2773,,,2977,,2810,3060,3120,3280 +Finland,,,,,,,1900,,,,,3000,,2950,,,3110 +France,1846,,1984,2118,2377,2840,2854,3085,3085,3220,3192,3323,3133,,,,3050 +Germany,2210,,,,,,2120,,,,,,,,,,2960 +Iceland,,,2887,,3080,3381,,2573,3002,3106,3316,3499,,,,, +Italy,,,,,,,,2647,2197,2119,,2617,,2627,,,2730 +Netherlands,,,,,,,2227,,2493,,2721,,,,,, +Norway,,1800,,,2250,,3300,,,,,,,,,,2930 + """ + # Create a dataframe with the extracted data. + data_europe_parsed = [line.split(",") for line in data_europe.split("\n")[1:-1]] + df_europe = pd.DataFrame(data_europe_parsed[1:], columns=data_europe_parsed[0]) + + # Create snapshots. + snap_us.create_snapshot(upload=upload, data=df_us) + snap_europe.create_snapshot(upload=upload, data=df_europe) + + +if __name__ == "__main__": + main() diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc new file mode 100644 index 00000000000..044d7a4e0e7 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_europe.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The Changing Body + title_snapshot: The Changing Body - Daily calories in Western Europe + description: |- + This dataset contains the estimates on the daily caloric intake in the United States (Table 6.6) and Western Europe (Table 5.5) of "The Changing Body", by Floud et al. (2011). + date_published: "2011-03-31" + + # Citation + producer: Floud et al. + citation_full: |- + Floud, R., Fogel, R. W., Harris, B. and Hong, S. C. (2011), "The Changing Body," Cambridge Books, Cambridge University Press, number 9780521879750. 
+ Data extracted from Tables 5.5 and 6.6. + attribution_short: Floud et al. (2011) + + # Files + url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E + date_accessed: 2024-05-27 + + # License + license: + name: © Cambridge University Press 2011 + url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E +outs: + - md5: 4f31506ded236dc72a590695f8868a1c + size: 554 + path: floud_et_al_2011_daily_calories_europe.csv diff --git a/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc new file mode 100644 index 00000000000..3573e2923e6 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/floud_et_al_2011_daily_calories_us.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The Changing Body + title_snapshot: The Changing Body - Daily calories in United States + description: |- + This dataset contains the estimates on the daily caloric intake in the United States (Table 6.6) and Western Europe (Table 5.5) of "The Changing Body", by Floud et al. (2011). + date_published: "2011-03-31" + + # Citation + producer: Floud et al. + citation_full: |- + Floud, R., Fogel, R. W., Harris, B. and Hong, S. C. (2011), "The Changing Body," Cambridge Books, Cambridge University Press, number 9780521879750. + Data extracted from Tables 5.5 and 6.6. + attribution_short: Floud et al. 
(2011) + + # Files + url_main: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E + date_accessed: 2024-05-27 + + # License + license: + name: © Cambridge University Press 2011 + url: https://www.cambridge.org/core/books/changing-body/DE3BB0E3577205AC26823CF2120D8B7E +outs: + - md5: 4316767b9de23caf9710fe44caff5ec9 + size: 234 + path: floud_et_al_2011_daily_calories_us.csv diff --git a/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc b/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc new file mode 100644 index 00000000000..f175ff6ab3d --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fogel_2004.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The Escape from Hunger and Premature Death + title_snapshot: The Escape from Hunger and Premature Death - Daily calories in France and Great Britain + description: |- + This dataset contains daily calorie supply from Table 1.2 of Fogel (2004) book: "The Escape from Hunger and Premature Death". + date_published: "2004-05-24" + + # Citation + producer: Fogel + citation_full: |- + Fogel, R.W. (2004), "The Escape from Hunger and Premature Death". Cambridge Studies in Population, Economy and Society in Past Time, Series Number 38. + Data extracted from Table 1.2.
+ attribution_short: Fogel (2004) + + # Files + url_main: https://www.cambridge.org/core/books/escape-from-hunger-and-premature-death-17002100/384C6032DE4E73C90EF6C9D1E55009CA + date_accessed: 2024-05-27 + + # License + license: + name: © Cambridge University Press 2004 + url: https://www.cambridge.org/core/books/escape-from-hunger-and-premature-death-17002100/384C6032DE4E73C90EF6C9D1E55009CA +outs: + - md5: 11d98a77f8589394b38ba286dc56d27f + size: 241 + path: fogel_2004.csv diff --git a/snapshots/agriculture/2024-05-23/fogel_2004.py b/snapshots/agriculture/2024-05-23/fogel_2004.py new file mode 100644 index 00000000000..45d81263452 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/fogel_2004.py @@ -0,0 +1,49 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/fogel_2004.csv") + + # Data manually extracted. + data = """ +Year,France,Great Britain +1700,,2095 +1705,1657, +1750,,2168 +1785,1848, +1800,,2237 +1803-12,1846, +1845-54,2480, +1850,,2362 +1909-13,,2857 +1935-39,2975, +1954-55,2783,3231 +1961,,3170 +1965,3355,3304 +1989,3465,3149 + + """ + + # Create a dataframe with the extracted data. + df = pd.read_csv(StringIO(data)) + + # Create snapshot. 
+ snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() diff --git a/snapshots/agriculture/2024-05-23/grigg_1995.csv.dvc b/snapshots/agriculture/2024-05-23/grigg_1995.csv.dvc new file mode 100644 index 00000000000..8f30143e7fd --- /dev/null +++ b/snapshots/agriculture/2024-05-23/grigg_1995.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: The nutritional transition in Western Europe + title_snapshot: The nutritional transition in Western Europe - Daily calories in Western Europe + description: |- + This dataset contains daily calories available per capita from Table 1 of Grigg (1995) paper: "The nutritional transition in Western Europe". + date_published: "1995-07-01" + + # Citation + producer: Grigg + citation_full: |- + Grigg, D. (1995), "The nutritional transition in Western Europe". Journal of Historical Geography, Volume 21, Issue 3, 1995, Pages 247-261. https://doi.org/10.1006/jhge.1995.0018 + Data extracted from Table 1. + attribution_short: Grigg (1995) + + # Files + url_main: https://www.sciencedirect.com/science/article/abs/pii/S0305748885700187?via%3Dihub + date_accessed: 2024-05-27 + + # License + license: + name: © Elsevier 1995 + url: https://www.sciencedirect.com/science/article/abs/pii/S0305748885700187?via%3Dihub +outs: + - md5: 79e132cf8120eb4ad8a1dd9d349d892a + size: 520 + path: grigg_1995.csv diff --git a/snapshots/agriculture/2024-05-23/grigg_1995.py b/snapshots/agriculture/2024-05-23/grigg_1995.py new file mode 100644 index 00000000000..acc5d81981b --- /dev/null +++ b/snapshots/agriculture/2024-05-23/grigg_1995.py @@ -0,0 +1,41 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/grigg_1995.csv") + + # Data extracted using chatGPT 4o (and manually inspected and corrected). + data = """ +country,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,1900,1910,1920,1930,1940,1950,1960 +Belgium,2247,,,,,2238,2580,,,,,3300,,2940,,,3040 +England,2349,,,,,,3240,,2773,,,2760,,2810,3060,3120,3280 +Germany,2210,,,,,,2120,,,,,,,,,,2960 +Finland,,,,,,,1900,,,,,3000,,2950,,,3110 +Norway,,1800,,,2250,,3300,,,,,,,,,,2930 +Italy,,,,,,,,2647,2197,2119,,2617,,2627,,,2730 +France,1846,,1984,2118,2377,2480,2854,2875,3085,3220,3192,3323,3133,3127,,,3050 + """ + + # Create a dataframe with the extracted data. + df = pd.read_csv(StringIO(data)) + + # Create snapshot. + snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() diff --git a/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc b/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc new file mode 100644 index 00000000000..d96fa7c97db --- /dev/null +++ b/snapshots/agriculture/2024-05-23/harris_et_al_2015.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: How Many Calories? Food Availability in England and Wales in the Eighteenth and Nineteenth Centuries + title_snapshot: How Many Calories? Food Availability in England and Wales in the Eighteenth and Nineteenth Centuries - Daily calories in England and Wales + description: |- + This dataset contains the table in the appendix of Harris et al. (2015) paper: "How Many Calories? Food Availability in England and Wales in the Eighteenth and Nineteenth Centuries". 
+ That table contains a compilation of daily calorie estimates (supply or consumption) in England and Wales, according to various studies. + date_published: "2015-04-22" + + # Citation + producer: Harris et al. + citation_full: |- + Harris, B., Floud, R. and Hong, S.C. (2015), "How Many Calories? Food Availability in England and Wales in the Eighteenth and Nineteenth Centuries", Research in Economic History (Research in Economic History, Vol. 31), Emerald Group Publishing Limited, Leeds, pp. 111-191. https://doi.org/10.1108/S0363-326820150000031003 + Data extracted from the Appendix. + attribution_short: Harris et al. (2015) + + # Files + url_main: https://www.emerald.com/insight/content/doi/10.1108/S0363-326820150000031003/full/html + date_accessed: 2024-05-23 + + # License + license: + name: © Emerald Group Publishing Limited 2015 + url: https://www.emerald.com/insight/content/doi/10.1108/S0363-326820150000031003/full/html +outs: + - md5: 79d314aa6815574e11146337336ee10b + size: 2050 + path: harris_et_al_2015.csv diff --git a/snapshots/agriculture/2024-05-23/harris_et_al_2015.py b/snapshots/agriculture/2024-05-23/harris_et_al_2015.py new file mode 100644 index 00000000000..e1f1ebe0d1d --- /dev/null +++ b/snapshots/agriculture/2024-05-23/harris_et_al_2015.py @@ -0,0 +1,89 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/harris_et_al_2015.csv") + + # Data extracted using chatGPT 4o (and manually inspected and corrected). + data = """ + 1270/1279, Broadberry et al. 
(2015), 2203 + 1300, Allen (2005), 1791 + 1300, Overton and Campbell (1996), n/a + 1300/1309, Broadberry et al. (2015), 2056 + 1310/1319, Broadberry et al. (2015), 1998 + 1380, Overton and Campbell (1996), n/a + 1380/1389, Broadberry et al. (2015), 2467 + 1420/1429, Broadberry et al. (2015), 2146 + 1450/1459, Broadberry et al. (2015), 2176 + 1500, Allen (2005), 3397 + 1600, Muldrew (2011), 3062 + 1600, Overton and Campbell (1996), n/a + 1600/1609, Broadberry et al. (2015), 2104 + 1650/1659, Broadberry et al. (2015), 1945 + 1700, Allen (2005), 3255 + 1700, Floud et al. (2011) (Estimates A and B), 2230 + 1700, Fogel (2004), 2095 + 1700, Meredith and Oxley (2014), 2557 + 1700, Muldrew (2011), 3579 + 1700, Overton and Campbell (1996), n/a + 1700/1709, Broadberry et al. (2015), 2187 + 1750, Allen (2005), 3803 + 1750, Floud et al. (2011) (Estimate A; with correction), 2328 + 1750, Floud et al. (2011) (Estimate B; with correction), 2516 + 1750, Fogel (2004), 2168 + 1750, Kelly and Ó Gráda (2013b), 2914-2949 + 1750/1759, Broadberry et al. (2015), 2178 + 1770, Kelly and Ó Gráda (2013b), 3542-3547 + 1770, Meredith and Oxley (2014), 3271 + 1770, Muldrew (2011), 5047 + 1800, Allen (2005), 2938 + 1800, Floud et al. (Estimate A), 2472 + 1800, Floud et al. (Estimate B), 2439 + 1800, Fogel (2004), 2237 + 1800, Kelly and Ó Gráda (2013b) (Estimate A), 2941-2956 + 1800, Kelly and Ó Gráda (2013b) (Estimate B), 2749-2794 + 1800, Meredith and Oxley (2014), 2620 + 1800, Muldrew (2011), 3977 + 1800, Overton and Campbell (1996), n/a + 1800/1809, Broadberry et al. (2015), 2175 + 1830, Overton and Campbell (1996), n/a + 1830/1839, Broadberry et al. (2015), 1950 + 1840/1849, Broadberry et al. (2015), 2166 + 1850, Allen (2005), 2525 + 1850, Floud et al. (2011) (Estimate A), 2505 + 1850, Floud et al. (2011) (Estimate B)/Meredith and Oxley (2013), 2545 + 1850, Fogel (2004), 2362 + 1850/1859, Broadberry et al. (2015), 2111 + 1861/1870, Broadberry et al. 
(2015), 2463 + 1871, Overton and Campbell (1996), n/a + 1909/13, Floud et al. (2011) & Meredith and Oxley (2014), 2977 + 1909/13, Fogel (2004), 2857 + 1954/55, Fogel (2004), 3231 + 1961, Fogel (2004), 3170 + 1965, Fogel (2004), 3304 + 1989, Fogel (2004), 3149 + """ + + # Create a dataframe with the extracted data. + data_parsed = [[item.strip() for item in line.split(",")] for line in data.split("\n")[1:-1]] + df = pd.DataFrame(data_parsed, columns=["Years", "Source", "Total"]) + + # Create snapshot. + snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() diff --git a/snapshots/agriculture/2024-05-23/jonsson_1998.csv.dvc b/snapshots/agriculture/2024-05-23/jonsson_1998.csv.dvc new file mode 100644 index 00000000000..a7cc48a8338 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/jonsson_1998.csv.dvc @@ -0,0 +1,30 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Changes in food consumption in Iceland, 1770-1940 + title_snapshot: Changes in food consumption in Iceland, 1770-1940 - Daily calories in Iceland + description: |- + This dataset contains daily energy from Table 5 of Jonsson (1998) paper: "Changes in food consumption in Iceland, 1770-1940". + date_published: "1998-01-01" + + # Citation + producer: Jonsson + citation_full: |- + Jonsson, G.R. (1998), "Changes in food consumption in Iceland, 1770-1940". Scandinavian Economic History Review, 46, 24-41. + Data extracted from Table 5. 
+ attribution_short: Jonsson (1998) + + # Files + url_main: https://www.tandfonline.com/doi/abs/10.1080/03585522.1998.10414677 + date_accessed: 2024-05-27 + + # License + license: + name: © Scandinavian Economic History Review 1998 + url: https://www.tandfonline.com/doi/abs/10.1080/03585522.1998.10414677 +outs: + - md5: 9637e39deb3ff3064e125c5141d273f1 + size: 180 + path: jonsson_1998.csv diff --git a/snapshots/agriculture/2024-05-23/jonsson_1998.py b/snapshots/agriculture/2024-05-23/jonsson_1998.py new file mode 100644 index 00000000000..c819cb394d4 --- /dev/null +++ b/snapshots/agriculture/2024-05-23/jonsson_1998.py @@ -0,0 +1,50 @@ +"""Script to create a snapshot of dataset.""" + +from io import StringIO +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"agriculture/{SNAPSHOT_VERSION}/jonsson_1998.csv") + + # Data extracted using chatGPT 4o (and manually inspected and corrected). + data = """ +year,daily_calories +1770,3048 +1784,2322 +1795,2724 +1819,2887 +1840,3080 +1849,3381 +1855,2917 +1863,2885 +1870,2573 +1880,3002 +1890,3106 +1900,3316 +1910,3499 +1920,3610 +1930,4207 +1938,4066 + """ + + # Create a dataframe with the extracted data. + df = pd.read_csv(StringIO(data)) + + # Create snapshot. 
+ snap.create_snapshot(upload=upload, data=df) + + +if __name__ == "__main__": + main() diff --git a/snapshots/usda_ers/2024-05-23/food_availability.py b/snapshots/usda_ers/2024-05-23/food_availability.py new file mode 100644 index 00000000000..d56f36767d0 --- /dev/null +++ b/snapshots/usda_ers/2024-05-23/food_availability.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"usda_ers/{SNAPSHOT_VERSION}/food_availability.xls") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/usda_ers/2024-05-23/food_availability.xls.dvc b/snapshots/usda_ers/2024-05-23/food_availability.xls.dvc new file mode 100644 index 00000000000..32e3e117abf --- /dev/null +++ b/snapshots/usda_ers/2024-05-23/food_availability.xls.dvc @@ -0,0 +1,29 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: "U.S. food supply: Nutrients and other food components, per capita per day" + date_published: "2015-02-01" + + # Citation + producer: USDA Economic Research Service (ERS) + citation_full: |- + Economic Research Service of the United States Department of Agriculture (USDA/ERS) - U.S. food supply: Nutrients and other food components, per capita per day. + The data can be found as one of the archived tables of the Food Availability (Per Capita) Data System. 
+ attribution_short: USDA/ERS + + # Files + url_main: https://www.ers.usda.gov/data-products/food-availability-per-capita-data-system/food-availability-per-capita-data-system/ + url_download: https://www.ers.usda.gov/webdocs/DataFiles/50472/nutrients.xls?v=4603.6 + date_accessed: 2024-05-27 + + # License + license: + name: Public Domain + url: https://www.ers.usda.gov/data-products/food-availability-per-capita-data-system/food-availability-per-capita-data-system/ + +outs: + - md5: bfebce79879913cf997ecb2a2e32161b + size: 164864 + path: food_availability.xls