From 3a31a4ed382be665974e07fe7e61090a0b38d647 Mon Sep 17 00:00:00 2001
From: elpamart
Date: Tue, 31 Oct 2023 12:51:29 +0100
Subject: [PATCH] include stock percentage distribution in livestock preprocessing

---
 .../data_checksums/livestock_processed_ha     |   4 +-
 .../data_checksums/livestock_processed_prod   |   4 +-
 .../livestock_processed/Makefile              |  75 ++++++----
 ...tats.py => preprocess_faostats_ha_prod.py} |   1 +
 .../preprocess_faostats_stocks.py             | 152 ++++++++++++++++++
 5 files changed, 207 insertions(+), 29 deletions(-)
 rename data/preprocessing/livestock_processed/{preprocess_faostats.py => preprocess_faostats_ha_prod.py} (99%)
 create mode 100644 data/preprocessing/livestock_processed/preprocess_faostats_stocks.py

diff --git a/data/h3_data_importer/data_checksums/livestock_processed_ha b/data/h3_data_importer/data_checksums/livestock_processed_ha
index 3058b4b10..c0cbf863a 100644
--- a/data/h3_data_importer/data_checksums/livestock_processed_ha
+++ b/data/h3_data_importer/data_checksums/livestock_processed_ha
@@ -1,2 +1,2 @@
-36e9a56a00986ee9a578a07530b944970353b7bbe74c7546ffb150cadc1dea47  GLO_2021_HensEggs_ha.tif
-ac38acef226254d7902397fd4b2913438f5924b8fa316b4737d77977f0f8ead3  GLO_2021_TotalRawMilk_ha.tif
+9d42bb8467c0522c67776c3461bad80ef9ffa31ae1b7a1d04e6758a197661ff2  GLO_2021_HensEggs_ha.tif
+ddb5ce3950a4287e55333cc451cdf902ffb6a5f9cb54f6fd203cac2298707a29  GLO_2021_TotalRawMilk_ha.tif
diff --git a/data/h3_data_importer/data_checksums/livestock_processed_prod b/data/h3_data_importer/data_checksums/livestock_processed_prod
index 7f6daf5dd..9af408b6e 100644
--- a/data/h3_data_importer/data_checksums/livestock_processed_prod
+++ b/data/h3_data_importer/data_checksums/livestock_processed_prod
@@ -1,2 +1,2 @@
-919e2e088264d04e2e1d203200ace7e676b510d6b30e4191cbbea7b7559c2b09  GLO_2021_HensEggs_t.tif
-608cb760c7860370406e5a06239334abe02a87b87bbf07c5bb8a9d30bfaeb03a  GLO_2021_TotalRawMilk_t.tif
+890c9474871b98d932be5c79d1f2a7209b93649460ddb6b09e34c3c37447c91c  GLO_2021_HensEggs_t.tif
+4f4a6c1d18a54372d1c211337fa908138eb8e58d552c3f3b2f67b34d3f73133e  GLO_2021_TotalRawMilk_t.tif
diff --git a/data/preprocessing/livestock_processed/Makefile b/data/preprocessing/livestock_processed/Makefile
index b9d2cfe15..a7a795bc8 100644
--- a/data/preprocessing/livestock_processed/Makefile
+++ b/data/preprocessing/livestock_processed/Makefile
@@ -5,9 +5,9 @@ AWS_S3_BUCKET_URL=s3://landgriffon-raw-data
 export AWS_ACCESS_KEY_ID = $(DATA_S3_ACCESS_KEY)
 export AWS_SECRET_ACCESS_KEY = $(DATA_S3_SECRET_KEY)
 
-.PHONY: download_pasture_data download_faostats_data_production download_faostats_data_harvest preprocess_faostats_data_production preprocess_faostats_data_harvest calculate_aggregation rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums
+.PHONY: download_pasture_data download_faostats_data_production download_faostats_data_harvest download_faostats_stocks_data preprocess_faostats_data_production preprocess_faostats_data_harvest preprocess_faostats_data_stocks rasterize_percentage_stock rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums
 
-all: download_pasture_data download_faostats_data_production download_faostats_data_harvest preprocess_faostats_data_production preprocess_faostats_data_harvest calculate_aggregation rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums
+all: download_pasture_data download_faostats_data_production download_faostats_data_harvest download_faostats_stocks_data preprocess_faostats_data_production preprocess_faostats_data_harvest preprocess_faostats_data_stocks rasterize_percentage_stock rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums
 
 download_pasture_data:
 	mkdir -p $(data_dir)/pasture
@@ -24,32 +24,52 @@ download_faostats_data_harvest:
 	mkdir -p $(data_dir)/faostats/harvest
 	aws s3 sync $(AWS_S3_BUCKET_URL)/raw/FAOSTAT_livestock/harvest data/faostats/harvest/
 
-# preprocess hens eggs and total milk production equivalent
+download_faostats_stocks_data:
+	mkdir -p $(data_dir)/faostats/stocks
+	aws s3 sync $(AWS_S3_BUCKET_URL)/raw/FAOSTAT_livestock/stocks data/faostats/stocks/
+
+# obtain percentage of cattle for dairy and chickens by country for layers and vectorize
+preprocess_faostats_data_stocks:
+	mkdir -p $(data_dir)/faostats_processed/stocks
+	python preprocess_faostats_stocks.py $(data_dir)/faostats/stocks/FAOSTAT_cattle_dairy_2019.csv \
+		$(data_dir)/faostats/stocks/FAOSTAT_cattle_non_dairy_2019.csv \
+		$(data_dir)/faostats_processed/stocks/FAOSTAT_cattle_dairy_percentage.shp;
+	python preprocess_faostats_stocks.py $(data_dir)/faostats/stocks/FAOSTAT_chickens_layers_2019.csv \
+		$(data_dir)/faostats/stocks/FAOSTAT_chickens_broilers_2019.csv \
+		$(data_dir)/faostats_processed/stocks/FAOSTAT_chickens_eggs_percentage.shp;
+
+# clean and vectorize faostats production data for hens and milk
 preprocess_faostats_data_production:
 	mkdir -p $(data_dir)/faostats_processed/production
-	python preprocess_faostats.py $(data_dir)/faostats/production/FAOSTAT_data_hens_eggs_iso3_2021.csv \
+	python preprocess_faostats_ha_prod.py $(data_dir)/faostats/production/FAOSTAT_data_hens_eggs_iso3_2021.csv \
 		$(data_dir)/faostats_processed/production/FAOSTAT_data_hens_eggs_iso3_2021_t.shp production;
-	python preprocess_faostats.py $(data_dir)/faostats/production/FAOSTAT_data_total_milk_iso3_2021.csv \
+	python preprocess_faostats_ha_prod.py $(data_dir)/faostats/production/FAOSTAT_data_total_milk_iso3_2021.csv \
 		$(data_dir)/faostats_processed/production/FAOSTAT_data_total_milk_iso3_2021_t.shp production;
 
-# preprocess hens eggs and total milk harvest equivalent
+# clean and vectorize faostats harvest data for hens and milk
 preprocess_faostats_data_harvest:
 	mkdir -p $(data_dir)/faostats_processed/harvest
-	python preprocess_faostats.py $(data_dir)/faostats/harvest/FAOSTAT_data_chickens_LSU_ha.csv \
+	python preprocess_faostats_ha_prod.py $(data_dir)/faostats/harvest/FAOSTAT_data_chickens_LSU_ha.csv \
 		$(data_dir)/faostats_processed/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.shp harvest;
-	python preprocess_faostats.py $(data_dir)/faostats/harvest/FAOSTAT_data_cattle_buffalo_LSU_ha.csv \
+	python preprocess_faostats_ha_prod.py $(data_dir)/faostats/harvest/FAOSTAT_data_cattle_buffalo_LSU_ha.csv \
 		$(data_dir)/faostats_processed/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.shp harvest;
 
-# aggregate the pasture data to get the totals that we will need to the total milk calculation
-# the -1.7e+308 is the nodata value for the pasture data. We need to set it to 0 for the calculation.
-calculate_aggregation:
-	gdal_calc.py --NoDataValue=0 --quiet --calc "(A!=-1.7e+308)*A+(B!=-1.7e+308)*B+(C!=-1.7e+308)*C" --format GTiff --type Float64 \
-		-A $(data_dir)/pasture/6_Ct_2010_Aw.tif \
-		-B $(data_dir)/pasture/6_Gt_2010_Aw.tif \
-		-C $(data_dir)/pasture/6_Sh_2010_Aw.tif \
-		--outfile $(data_dir)/pasture/6_Tm_2010_Aw.tif
-
 #rasterise and calculate the tonnes of material
+rasterize_percentage_stock:
+	mkdir -p $(data_dir)/rasterized/stocks
+	gdal_rasterize -q -l FAOSTAT_cattle_dairy_percentage -a percentage -tr 0.083333 0.083333 -a_nodata 0 \
+		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
+		$(data_dir)/faostats_processed/stocks/FAOSTAT_cattle_dairy_percentage.shp \
+		$(data_dir)/rasterized/stocks/FAOSTAT_cattle_dairy_percentage.tif;
+	gdal_rasterize -q -l FAOSTAT_chickens_eggs_percentage -a percentage -tr 0.083333 0.083333 -a_nodata 0 \
+		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
+		$(data_dir)/faostats_processed/stocks/FAOSTAT_chickens_eggs_percentage.shp \
+		$(data_dir)/rasterized/stocks/FAOSTAT_chickens_eggs_percentage.tif;
+
+# first we need to rasterize stock data with percentage of dairy cattle and chicken eggs
+# We rasterize the production chickens and cattle data
+# then we need to multiply the rasterized stock percentage with the chicken and cattle data from glwv3
+# and the rasterized production data
 rasterize_and_calculate_commodities_production:
 	mkdir -p $(data_dir)/rasterized/production
 	mkdir -p $(data_dir)/processed_commodities/production
@@ -57,17 +77,19 @@ rasterize_and_calculate_commodities_production:
 		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
 		$(data_dir)/faostats_processed/production/FAOSTAT_data_hens_eggs_iso3_2021_t.shp \
 		$(data_dir)/rasterized/production/FAOSTAT_data_hens_eggs_iso3_2021_t.tif;
-	gdal_calc.py --quiet --calc "A*B" --format GTiff --type Float32 --NoDataValue 0.0 \
-		-A $(data_dir)/rasterized/production/FAOSTAT_data_hens_eggs_iso3_2021_t.tif --A_band 1 \
-		-B $(data_dir)/pasture/6_Ch_2010_Aw.tif \
-		--outfile $(data_dir)/processed_commodities/production/GLO_2021_HensEggs_t.tif;
 	gdal_rasterize -q -l FAOSTAT_data_total_milk_iso3_2021_t -a Value -tr 0.083333 0.083333 -a_nodata 0 \
 		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
 		$(data_dir)/faostats_processed/production/FAOSTAT_data_total_milk_iso3_2021_t.shp \
 		$(data_dir)/rasterized/production/FAOSTAT_data_total_milk_iso3_2021_t.tif;
-	gdal_calc.py --NoDataValue=0 --quiet --calc "A*(B!=3.40282e+38)*B" --format GTiff --type Float64 \
+	gdal_calc.py --quiet --calc "A*(B!=3.40282e+38)*B*C" --format GTiff --type Float32 --NoDataValue 0.0 \
+		-A $(data_dir)/rasterized/production/FAOSTAT_data_hens_eggs_iso3_2021_t.tif --A_band 1 \
+		-B $(data_dir)/pasture/6_Ch_2010_Aw.tif \
+		-C $(data_dir)/rasterized/stocks/FAOSTAT_chickens_eggs_percentage.tif \
+		--outfile $(data_dir)/processed_commodities/production/GLO_2021_HensEggs_t.tif;
+	gdal_calc.py --NoDataValue=0 --quiet --calc "A*(B!=3.40282e+38)*B*C" --format GTiff --type Float64 \
 		-A $(data_dir)/rasterized/production/FAOSTAT_data_total_milk_iso3_2021_t.tif --A_band 1 \
-		-B $(data_dir)/pasture/6_Tm_2010_Aw.tif \
+		-B $(data_dir)/pasture/6_Ct_2010_Aw.tif \
+		-C $(data_dir)/rasterized/stocks/FAOSTAT_cattle_dairy_percentage.tif \
 		--outfile $(data_dir)/processed_commodities/production/GLO_2021_TotalRawMilk_t.tif;
 
 # 3.40282e+38 is the nodata value for the total milk data. We need to set it to 0 for the calculation.
@@ -78,17 +100,20 @@ rasterize_and_calculate_commodities_harvest:
 		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
 		$(data_dir)/faostats_processed/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.shp \
 		$(data_dir)/rasterized/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.tif;
-	gdal_calc.py --quiet --calc "B/((A!=0)*A)" --format GTiff --type Float32 --NoDataValue 0.0 \
+	gdal_calc.py --quiet --calc "(B*C)/((A!=0)*A)" --format GTiff --type Float32 --NoDataValue 0.0 \
 		-A $(data_dir)/rasterized/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.tif --A_band 1 \
 		-B $(data_dir)/pasture/6_Ch_2010_Aw.tif \
+		-C $(data_dir)/rasterized/stocks/FAOSTAT_chickens_eggs_percentage.tif \
 		--outfile $(data_dir)/processed_commodities/harvest/GLO_2021_HensEggs_ha.tif;
 	gdal_rasterize -q -l FAOSTAT_data_total_milk_iso3_2021_ha -a Value -tr 0.083333 0.083333 -a_nodata 0 \
 		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
 		$(data_dir)/faostats_processed/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.shp \
 		$(data_dir)/rasterized/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.tif;
-	gdal_calc.py --NoDataValue=0 --quiet --calc "((B!=3.40282e+38)*B)/((A!=0)*A)" --format GTiff --type Float64 \
+# NOTE(review): 6_Tm_2010_Aw.tif used to be generated by the removed calculate_aggregation target - confirm it is still available (production now uses 6_Ct_2010_Aw.tif)
+	gdal_calc.py --NoDataValue=0 --quiet --calc "(((B!=3.40282e+38)*B)*C)/((A!=0)*A)" --format GTiff --type Float64 \
 		-A $(data_dir)/rasterized/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.tif --A_band 1 \
 		-B $(data_dir)/pasture/6_Tm_2010_Aw.tif \
+		-C $(data_dir)/rasterized/stocks/FAOSTAT_cattle_dairy_percentage.tif \
 		--outfile $(data_dir)/processed_commodities/harvest/GLO_2021_TotalRawMilk_ha.tif;
 
 upload_livestock_processed_production:
diff --git a/data/preprocessing/livestock_processed/preprocess_faostats.py b/data/preprocessing/livestock_processed/preprocess_faostats_ha_prod.py
similarity index 99%
rename from data/preprocessing/livestock_processed/preprocess_faostats.py
rename to data/preprocessing/livestock_processed/preprocess_faostats_ha_prod.py
index 1d1162396..9272f4188 100644
--- a/data/preprocessing/livestock_processed/preprocess_faostats.py
+++ b/data/preprocessing/livestock_processed/preprocess_faostats_ha_prod.py
@@ -17,6 +17,7 @@ def get_country_geometry():
     This geometry is used to spatalised the faostat data
     """
     # Connect to the database
+
     conn = psycopg2.connect(
         host=os.getenv("API_POSTGRES_HOST"),
         port=os.getenv("API_POSTGRES_PORT"),
diff --git a/data/preprocessing/livestock_processed/preprocess_faostats_stocks.py b/data/preprocessing/livestock_processed/preprocess_faostats_stocks.py
new file mode 100644
index 000000000..9e76cb725
--- /dev/null
+++ b/data/preprocessing/livestock_processed/preprocess_faostats_stocks.py
@@ -0,0 +1,152 @@
+import os
+import logging
+import argparse
+
+import pandas as pd
+import geopandas as gpd
+import psycopg2
+
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("preprocessing_processed_livestock_stock_faostats_file")
+
+
+def clean_data(df, columns):
+    """
+    Clean the input dataframe by keeping only the specified columns.
+
+    Returns a new dataframe with the given columns, in the given order.
+    """
+    df_clean = df[columns]
+    return df_clean
+
+
+def rename_columns(df, column_map):
+    """
+    Rename the columns of the input dataframe using the specified column map.
+
+    The map goes from current name to new name; the input dataframe is not modified.
+    """
+    df_renamed = df.rename(columns=column_map)
+    return df_renamed
+
+
+def merge_data(df1, df2, on):
+    """
+    Merge two dataframes on the specified column(s).
+
+    This is an inner join (the pandas default), so only keys present in both
+    dataframes are kept.
+    """
+    df_merged = df1.merge(df2, on=on)
+    return df_merged
+
+
+def calculate_percentage(df, numerator_col, denominator_col, output_col):
+    """
+    Calculate the share of the numerator column in the total of the numerator and denominator columns.
+
+    The result is stored in output_col as a fraction (numerator / (numerator + denominator)),
+    not multiplied by 100. Rows where the total is zero or missing get 0.0 instead of NaN.
+    A new dataframe is returned; the input dataframe is not modified.
+    """
+    df = df.copy()
+    total = df[numerator_col] + df[denominator_col]
+    # 0 / 0 would give NaN; mask zero totals and fall back to 0.0, which is the
+    # nodata value used when the percentage is rasterized.
+    df[output_col] = (df[numerator_col] / total.where(total != 0)).fillna(0.0)
+    return df
+
+
+def get_country_geometry():
+    """
+    Get the country geometry from a database.
+
+    The connection settings are read from the API_POSTGRES_* environment variables.
+    Returns a GeoDataFrame with the "isoA3" code and "theGeom" geometry of the
+    admin regions of level 0.
+    """
+    # Connect to the database
+    conn = psycopg2.connect(
+        host=os.getenv("API_POSTGRES_HOST"),
+        port=os.getenv("API_POSTGRES_PORT"),
+        database=os.getenv("API_POSTGRES_DATABASE"),
+        user=os.getenv("API_POSTGRES_USERNAME"),
+        password=os.getenv("API_POSTGRES_PASSWORD"),
+    )
+
+    try:
+        # Get the countries geometries
+        countries_df = gpd.read_postgis(
+            """SELECT ar."isoA3", gr."theGeom"
+            FROM admin_region ar
+            INNER JOIN geo_region gr ON gr.id = ar."geoRegionId"
+            WHERE ar."level" = 0
+            """,
+            conn,
+            geom_col="theGeom",
+        )
+    finally:
+        # Close the connection even if the query fails
+        conn.close()
+    return countries_df
+
+
+def merge_with_geometry(df, geometry_df, on):
+    """
+    Merge the input dataframe with the country geometry dataframe on the specified column(s).
+
+    The merge is called on the geometry dataframe so the result stays a GeoDataFrame;
+    calling merge on the plain dataframe would return a pandas DataFrame, which has
+    no set_geometry/set_crs/to_file methods.
+    """
+    df_merged = geometry_df.merge(df, on=on)
+    return df_merged
+
+
+def main():
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description="Process livestock preprocessed faostats data.")
+    parser.add_argument(
+        "input_file_main",
+        type=str,
+        help="Path to the input CSV file containing the main value to preprocess",
+    )
+    parser.add_argument(
+        "input_file_secondary",
+        type=str,
+        help="Path to the input CSV file containing the secondary value to preprocess",
+    )
+    parser.add_argument("output_file", type=str, help="Path to the output file to save processed data")
+    args = parser.parse_args()
+
+    # Open the files and clean the data
+    df_main = pd.read_csv(args.input_file_main)
+    df_secondary = pd.read_csv(args.input_file_secondary)
+    df_main_clean = clean_data(df_main, ["Area Code (ISO3)", "Value", "Unit"])
+    df_secondary_clean = clean_data(df_secondary, ["Area Code (ISO3)", "Value", "Unit"])
+
+    # Rename the columns
+    df_main_renamed = rename_columns(df_main_clean, {"Area Code (ISO3)": "isoA3", "Value": "main_value"})
+    df_secondary_renamed = rename_columns(df_secondary_clean, {"Area Code (ISO3)": "isoA3", "Value": "secondary_value"})
+
+    # Merge the dataframes
+    df_merged = merge_data(df_main_renamed, df_secondary_renamed, "isoA3")
+
+    # Calculate the percentage
+    df_merged = calculate_percentage(df_merged, "main_value", "secondary_value", "percentage")
+
+    # Get the country geometry
+    countries_df = get_country_geometry()
+
+    # Merge the dataframes
+    df_merged = merge_with_geometry(df_merged, countries_df, "isoA3")
+    # Set geometry and crs
+    df_merged = df_merged.set_geometry("theGeom")
+    df_merged = df_merged.set_crs("EPSG:4326")
+    # Save the dataframe
+    df_merged.to_file(args.output_file)
+
+
+if __name__ == "__main__":
+    main()