Skip to content

Commit

Permalink
include stock percentage distribution in livestock preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
elpamart authored and alexeh committed Nov 15, 2023
1 parent fdca0a3 commit 3a31a4e
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 29 deletions.
4 changes: 2 additions & 2 deletions data/h3_data_importer/data_checksums/livestock_processed_ha
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
36e9a56a00986ee9a578a07530b944970353b7bbe74c7546ffb150cadc1dea47 GLO_2021_HensEggs_ha.tif
ac38acef226254d7902397fd4b2913438f5924b8fa316b4737d77977f0f8ead3 GLO_2021_TotalRawMilk_ha.tif
9d42bb8467c0522c67776c3461bad80ef9ffa31ae1b7a1d04e6758a197661ff2 GLO_2021_HensEggs_ha.tif
ddb5ce3950a4287e55333cc451cdf902ffb6a5f9cb54f6fd203cac2298707a29 GLO_2021_TotalRawMilk_ha.tif
4 changes: 2 additions & 2 deletions data/h3_data_importer/data_checksums/livestock_processed_prod
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
919e2e088264d04e2e1d203200ace7e676b510d6b30e4191cbbea7b7559c2b09 GLO_2021_HensEggs_t.tif
608cb760c7860370406e5a06239334abe02a87b87bbf07c5bb8a9d30bfaeb03a GLO_2021_TotalRawMilk_t.tif
890c9474871b98d932be5c79d1f2a7209b93649460ddb6b09e34c3c37447c91c GLO_2021_HensEggs_t.tif
4f4a6c1d18a54372d1c211337fa908138eb8e58d552c3f3b2f67b34d3f73133e GLO_2021_TotalRawMilk_t.tif
74 changes: 49 additions & 25 deletions data/preprocessing/livestock_processed/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ AWS_S3_BUCKET_URL=s3://landgriffon-raw-data
export AWS_ACCESS_KEY_ID = $(DATA_S3_ACCESS_KEY)
export AWS_SECRET_ACCESS_KEY = $(DATA_S3_SECRET_KEY)

.PHONY: download_pasture_data download_faostats_data_production download_faostats_data_harvest preprocess_faostats_data_production preprocess_faostats_data_harvest calculate_aggregation rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums
.PHONY: download_pasture_data download_faostats_data_production download_faostats_data_harvest preprocess_faostats_data_production preprocess_faostats_data_harvest calculate_aggregation rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums download_faostats_stocks_data preprocess_faostats_data_stocks rasterize_percentage_stock

all: download_pasture_data download_faostats_data_production download_faostats_data_harvest preprocess_faostats_data_production preprocess_faostats_data_harvest calculate_aggregation rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums
# Prerequisites are listed in the order they must run in a serial build:
# downloads, then FAOSTAT preprocessing, then the stock-percentage rasters,
# and only then the commodity calculations that read those rasters (-C inputs),
# followed by uploads and checksums.
all: download_pasture_data download_faostats_data_production download_faostats_data_harvest download_faostats_stocks_data preprocess_faostats_data_production preprocess_faostats_data_harvest preprocess_faostats_data_stocks rasterize_percentage_stock rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums

download_pasture_data:
mkdir -p $(data_dir)/pasture
Expand All @@ -24,50 +24,72 @@ download_faostats_data_harvest:
mkdir -p $(data_dir)/faostats/harvest
aws s3 sync $(AWS_S3_BUCKET_URL)/raw/FAOSTAT_livestock/harvest data/faostats/harvest/

# preprocess hens eggs and total milk production equivalent
# download the raw FAOSTAT stock tables into the directory that
# preprocess_faostats_data_stocks reads from ($(data_dir)/faostats/stocks)
download_faostats_stocks_data:
	mkdir -p $(data_dir)/faostats/stocks
	aws s3 sync $(AWS_S3_BUCKET_URL)/raw/FAOSTAT_livestock/stocks $(data_dir)/faostats/stocks/

# Build one shapefile per commodity holding, for each country, the share of the
# "main" stock in the sum of the two input tables:
#   dairy cattle   / (dairy cattle + non-dairy cattle)
#   laying chickens / (laying chickens + broiler chickens)
# Script arguments: <main csv> <secondary csv> <output shapefile>
preprocess_faostats_data_stocks:
	mkdir -p $(data_dir)/faostats_processed/stocks
	python preprocess_faostats_stocks.py \
		$(data_dir)/faostats/stocks/FAOSTAT_cattle_dairy_2019.csv \
		$(data_dir)/faostats/stocks/FAOSTAT_cattle_non_dairy_2019.csv \
		$(data_dir)/faostats_processed/stocks/FAOSTAT_cattle_dairy_percentage.shp
	python preprocess_faostats_stocks.py \
		$(data_dir)/faostats/stocks/FAOSTAT_chickens_layers_2019.csv \
		$(data_dir)/faostats/stocks/FAOSTAT_chickens_broilers_2019.csv \
		$(data_dir)/faostats_processed/stocks/FAOSTAT_chickens_eggs_percentage.shp

# clean and vectorize faostats production data for hens and milk
preprocess_faostats_data_production:
mkdir -p $(data_dir)/faostats_processed/production
python preprocess_faostats.py $(data_dir)/faostats/production/FAOSTAT_data_hens_eggs_iso3_2021.csv \
python preprocess_faostats_ha_prod.py $(data_dir)/faostats/production/FAOSTAT_data_hens_eggs_iso3_2021.csv \
$(data_dir)/faostats_processed/production/FAOSTAT_data_hens_eggs_iso3_2021_t.shp production;
python preprocess_faostats.py $(data_dir)/faostats/production/FAOSTAT_data_total_milk_iso3_2021.csv \
python preprocess_faostats_ha_prod.py $(data_dir)/faostats/production/FAOSTAT_data_total_milk_iso3_2021.csv \
$(data_dir)/faostats_processed/production/FAOSTAT_data_total_milk_iso3_2021_t.shp production;

# preprocess hens eggs and total milk harvest equivalent
# clean and vectorize faostats harvest data for hens and milk
preprocess_faostats_data_harvest:
mkdir -p $(data_dir)/faostats_processed/harvest
python preprocess_faostats.py $(data_dir)/faostats/harvest/FAOSTAT_data_chickens_LSU_ha.csv \
python preprocess_faostats_ha_prod.py $(data_dir)/faostats/harvest/FAOSTAT_data_chickens_LSU_ha.csv \
$(data_dir)/faostats_processed/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.shp harvest;
python preprocess_faostats.py $(data_dir)/faostats/harvest/FAOSTAT_data_cattle_buffalo_LSU_ha.csv \
python preprocess_faostats_ha_prod.py $(data_dir)/faostats/harvest/FAOSTAT_data_cattle_buffalo_LSU_ha.csv \
$(data_dir)/faostats_processed/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.shp harvest;

# Sum the three pasture rasters (6_Ct, 6_Gt, 6_Sh) cell by cell into 6_Tm_2010_Aw.tif,
# the total needed for the total milk calculation.
# -1.7e+308 is the nodata value of the pasture data: each "(X!=-1.7e+308)*X" term
# contributes 0 for such cells, so nodata does not poison the sum.
# The output is written as Float64 GTiff with 0 as its nodata value.
calculate_aggregation:
gdal_calc.py --NoDataValue=0 --quiet --calc "(A!=-1.7e+308)*A+(B!=-1.7e+308)*B+(C!=-1.7e+308)*C" --format GTiff --type Float64 \
-A $(data_dir)/pasture/6_Ct_2010_Aw.tif \
-B $(data_dir)/pasture/6_Gt_2010_Aw.tif \
-C $(data_dir)/pasture/6_Sh_2010_Aw.tif \
--outfile $(data_dir)/pasture/6_Tm_2010_Aw.tif

#rasterise and calculate the tonnes of material
# Rasterize the "percentage" attribute of the two stock-share shapefiles written by
# preprocess_faostats_data_stocks, using the same resolution (-tr) and extent (-te)
# as the production rasters so they can be combined cell by cell in gdal_calc.py.
# Cells not covered by any polygon, and cells burned with 0, are nodata (-a_nodata 0).
rasterize_percentage_stock:
	mkdir -p $(data_dir)/rasterized/stocks
	gdal_rasterize -q -l FAOSTAT_cattle_dairy_percentage -a percentage -tr 0.083333 0.083333 -a_nodata 0 \
		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
		$(data_dir)/faostats_processed/stocks/FAOSTAT_cattle_dairy_percentage.shp \
		$(data_dir)/rasterized/stocks/FAOSTAT_cattle_dairy_percentage.tif;
	gdal_rasterize -q -l FAOSTAT_chickens_eggs_percentage -a percentage -tr 0.083333 0.083333 -a_nodata 0 \
		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
		$(data_dir)/faostats_processed/stocks/FAOSTAT_chickens_eggs_percentage.shp \
		$(data_dir)/rasterized/stocks/FAOSTAT_chickens_eggs_percentage.tif;

# first we need to rasterize stock data with percentage of dairy cattle and chicken eggs
# We rasterize the production chickens and cattle data
# then we need to multiply the rasterized stock percentage with the chicken and cattle data from glwv3
# and the rasterized production data
rasterize_and_calculate_commodities_production:
mkdir -p $(data_dir)/rasterized/production
mkdir -p $(data_dir)/processed_commodities/production
gdal_rasterize -q -l FAOSTAT_data_hens_eggs_iso3_2021_t -a Value -tr 0.083333 0.083333 -a_nodata 0 \
-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
$(data_dir)/faostats_processed/production/FAOSTAT_data_hens_eggs_iso3_2021_t.shp \
$(data_dir)/rasterized/production/FAOSTAT_data_hens_eggs_iso3_2021_t.tif;
gdal_calc.py --quiet --calc "A*B" --format GTiff --type Float32 --NoDataValue 0.0 \
-A $(data_dir)/rasterized/production/FAOSTAT_data_hens_eggs_iso3_2021_t.tif --A_band 1 \
-B $(data_dir)/pasture/6_Ch_2010_Aw.tif \
--outfile $(data_dir)/processed_commodities/production/GLO_2021_HensEggs_t.tif;
gdal_rasterize -q -l FAOSTAT_data_total_milk_iso3_2021_t -a Value -tr 0.083333 0.083333 -a_nodata 0 \
-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
$(data_dir)/faostats_processed/production/FAOSTAT_data_total_milk_iso3_2021_t.shp \
$(data_dir)/rasterized/production/FAOSTAT_data_total_milk_iso3_2021_t.tif;
gdal_calc.py --NoDataValue=0 --quiet --calc "A*(B!=3.40282e+38)*B" --format GTiff --type Float64 \
gdal_calc.py --quiet --calc "A*(B!=3.40282e+38)*B*C" --format GTiff --type Float32 --NoDataValue 0.0 \
-A $(data_dir)/rasterized/production/FAOSTAT_data_hens_eggs_iso3_2021_t.tif --A_band 1 \
-B $(data_dir)/pasture/6_Ch_2010_Aw.tif \
-C $(data_dir)/rasterized/stocks/FAOSTAT_chickens_eggs_percentage.tif \
--outfile $(data_dir)/processed_commodities/production/GLO_2021_HensEggs_t.tif;
gdal_calc.py --NoDataValue=0 --quiet --calc "A*(B!=3.40282e+38)*B*C" --format GTiff --type Float64 \
-A $(data_dir)/rasterized/production/FAOSTAT_data_total_milk_iso3_2021_t.tif --A_band 1 \
-B $(data_dir)/pasture/6_Tm_2010_Aw.tif \
-B $(data_dir)/pasture/6_Ct_2010_Aw.tif \
-C $(data_dir)/rasterized/stocks/FAOSTAT_cattle_dairy_percentage.tif \
--outfile $(data_dir)/processed_commodities/production/GLO_2021_TotalRawMilk_t.tif;

# 3.40282e+38 is the nodata value for the total milk data. We need to set it to 0 for the calculation.
Expand All @@ -78,17 +100,19 @@ rasterize_and_calculate_commodities_harvest:
-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
$(data_dir)/faostats_processed/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.shp \
$(data_dir)/rasterized/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.tif;
gdal_calc.py --quiet --calc "B/((A!=0)*A)" --format GTiff --type Float32 --NoDataValue 0.0 \
gdal_calc.py --quiet --calc "(B*C)/((A!=0)*A)" --format GTiff --type Float32 --NoDataValue 0.0 \
-A $(data_dir)/rasterized/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.tif --A_band 1 \
-B $(data_dir)/pasture/6_Ch_2010_Aw.tif \
-C $(data_dir)/rasterized/stocks/FAOSTAT_chickens_eggs_percentage.tif \
--outfile $(data_dir)/processed_commodities/harvest/GLO_2021_HensEggs_ha.tif;
gdal_rasterize -q -l FAOSTAT_data_total_milk_iso3_2021_ha -a Value -tr 0.083333 0.083333 -a_nodata 0 \
-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
$(data_dir)/faostats_processed/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.shp \
$(data_dir)/rasterized/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.tif;
gdal_calc.py --NoDataValue=0 --quiet --calc "((B!=3.40282e+38)*B)/((A!=0)*A)" --format GTiff --type Float64 \
gdal_calc.py --NoDataValue=0 --quiet --calc "(((B!=3.40282e+38)*B)*C)/((A!=0)*A)" --format GTiff --type Float64 \
-A $(data_dir)/rasterized/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.tif --A_band 1 \
-B $(data_dir)/pasture/6_Tm_2010_Aw.tif \
-C $(data_dir)/rasterized/stocks/FAOSTAT_cattle_dairy_percentage.tif \
--outfile $(data_dir)/processed_commodities/harvest/GLO_2021_TotalRawMilk_ha.tif;

upload_livestock_processed_production:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def get_country_geometry():
This geometry is used to spatialise the faostat data
"""
# Connect to the database

conn = psycopg2.connect(
host=os.getenv("API_POSTGRES_HOST"),
port=os.getenv("API_POSTGRES_PORT"),
Expand Down
126 changes: 126 additions & 0 deletions data/preprocessing/livestock_processed/preprocess_faostats_stocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import os
import logging
import argparse

import pandas as pd
import geopandas as gpd
import psycopg2


# Configure the root logger so INFO-level (and above) messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this preprocessing step.
log = logging.getLogger("preprocessing_processed_livestock_stock_faostats_file")


def clean_data(df, columns):
    """
    Select a subset of columns from a dataframe.

    Args:
        df: Input dataframe.
        columns: Column labels to keep, in the order they should appear.

    Returns:
        The result of ``df[columns]``; ``df`` itself is left untouched.
    """
    return df[columns]


def rename_columns(df, column_map):
    """
    Return a copy of ``df`` with its columns relabelled.

    Args:
        df: Input dataframe.
        column_map: Mapping of existing column name -> new column name.
            Columns not present in the mapping keep their name.

    Returns:
        A new dataframe produced by ``df.rename``; ``df`` is not modified.
    """
    return df.rename(columns=column_map)


def merge_data(df1, df2, on):
    """
    Join two dataframes on one or more shared key columns.

    The join type is whatever ``DataFrame.merge`` uses by default (an inner
    join in pandas), so rows whose key is missing from either side are dropped.

    Args:
        df1: Left dataframe.
        df2: Right dataframe.
        on: Column name, or list of column names, present in both dataframes.

    Returns:
        The merged dataframe.
    """
    return df1.merge(right=df2, on=on)


def calculate_percentage(df, numerator_col, denominator_col, output_col):
    """
    Add a column holding the numerator's share of (numerator + denominator).

    The value is stored as a plain ratio, i.e. it is NOT multiplied by 100.
    The column is written onto ``df`` itself, and that same object is returned.

    Args:
        df: Dataframe containing both input columns; modified in place.
        numerator_col: Name of the column whose share is computed.
        denominator_col: Name of the column added to the numerator to form the total.
        output_col: Name of the column that receives the ratio.

    Returns:
        ``df``, with ``output_col`` added (or overwritten).
    """
    total = df[numerator_col] + df[denominator_col]
    df[output_col] = df[numerator_col] / total
    return df


def get_country_geometry():
    """
    Read the country geometries from the API PostGIS database.

    Connection settings are taken from the environment variables
    ``API_POSTGRES_HOST``, ``API_POSTGRES_PORT``, ``API_POSTGRES_DATABASE``,
    ``API_POSTGRES_USERNAME`` and ``API_POSTGRES_PASSWORD``.

    Returns:
        A GeoDataFrame with the columns ``isoA3`` and ``theGeom`` (the geometry
        column), one row per ``admin_region`` with ``level = 0`` joined to its
        ``geo_region``.
    """
    # Connect to the database
    conn = psycopg2.connect(
        host=os.getenv("API_POSTGRES_HOST"),
        port=os.getenv("API_POSTGRES_PORT"),
        database=os.getenv("API_POSTGRES_DATABASE"),
        user=os.getenv("API_POSTGRES_USERNAME"),
        password=os.getenv("API_POSTGRES_PASSWORD"),
    )

    try:
        # Get the countries geometries
        countries_df = gpd.read_postgis(
            """SELECT ar."isoA3", gr."theGeom"
            FROM admin_region ar
            INNER JOIN geo_region gr ON gr.id = ar."geoRegionId"
            WHERE ar."level" = 0
            """,
            conn,
            geom_col="theGeom",
        )
    finally:
        # Always release the connection, even when the query fails;
        # the result has already been loaded into memory at this point.
        conn.close()
    return countries_df


def merge_with_geometry(df, geometry_df, on):
    """
    Attach country geometries to a dataframe by joining on shared key column(s).

    Uses the default join of ``DataFrame.merge`` (inner in pandas), so rows of
    ``df`` without a matching geometry are dropped.

    Args:
        df: Dataframe with the attribute data (left side of the join).
        geometry_df: Dataframe holding the geometries (right side of the join).
        on: Column name, or list of column names, present in both dataframes.

    Returns:
        The merged dataframe.
    """
    return df.merge(right=geometry_df, on=on)


def main():
    """
    Command-line entry point.

    Reads two FAOSTAT CSV files (main and secondary stock), computes per
    country the share ``main / (main + secondary)`` in a ``percentage`` column,
    joins the result with the country geometries from the database and writes
    it to ``output_file`` with ``to_file``.
    """
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Process livestock preprocessed faostats data.")
    parser.add_argument(
        "input_file_main",
        type=str,
        help="Path to the input file containing vector files of the main value to preprocess",
    )
    parser.add_argument(
        "input_file_secondary",
        type=str,
        help="Path to the input file containing vector files of the secondary value to preproces",
    )
    parser.add_argument("output_file", type=str, help="Path to the output file to save processed data")
    args = parser.parse_args()

    # Load both tables before any further processing
    raw_main = pd.read_csv(args.input_file_main)
    raw_secondary = pd.read_csv(args.input_file_secondary)

    # Keep only the country code, the value and its unit
    kept_columns = ["Area Code (ISO3)", "Value", "Unit"]
    main_table = clean_data(raw_main, kept_columns)
    secondary_table = clean_data(raw_secondary, kept_columns)

    # Give the key a common name and the two value columns distinct names
    main_table = rename_columns(main_table, {"Area Code (ISO3)": "isoA3", "Value": "main_value"})
    secondary_table = rename_columns(
        secondary_table, {"Area Code (ISO3)": "isoA3", "Value": "secondary_value"}
    )

    # One row per country present in both tables, with the share of the main value
    shares = merge_data(main_table, secondary_table, "isoA3")
    shares = calculate_percentage(shares, "main_value", "secondary_value", "percentage")

    # Attach the country geometries fetched from the database
    countries = get_country_geometry()
    shares = merge_with_geometry(shares, countries, "isoA3")

    # Declare the geometry column and its CRS, then save the result
    shares = shares.set_geometry("theGeom")
    shares = shares.set_crs("EPSG:4326")
    shares.to_file(args.output_file)


if __name__ == "__main__":
    main()

0 comments on commit 3a31a4e

Please sign in to comment.