include stock percentage distribution in livestock preprocessing

Vizzuality · Nov 15, 2023 · 3a31a4e · 3a31a4e
1 parent fdca0a3
commit 3a31a4e
Show file tree

Hide file tree

Showing 5 changed files with 180 additions and 29 deletions.
diff --git a/data/h3_data_importer/data_checksums/livestock_processed_ha b/data/h3_data_importer/data_checksums/livestock_processed_ha
@@ -1,2 +1,2 @@
-36e9a56a00986ee9a578a07530b944970353b7bbe74c7546ffb150cadc1dea47  GLO_2021_HensEggs_ha.tif
-ac38acef226254d7902397fd4b2913438f5924b8fa316b4737d77977f0f8ead3  GLO_2021_TotalRawMilk_ha.tif
+9d42bb8467c0522c67776c3461bad80ef9ffa31ae1b7a1d04e6758a197661ff2  GLO_2021_HensEggs_ha.tif
+ddb5ce3950a4287e55333cc451cdf902ffb6a5f9cb54f6fd203cac2298707a29  GLO_2021_TotalRawMilk_ha.tif
diff --git a/data/h3_data_importer/data_checksums/livestock_processed_prod b/data/h3_data_importer/data_checksums/livestock_processed_prod
@@ -1,2 +1,2 @@
-919e2e088264d04e2e1d203200ace7e676b510d6b30e4191cbbea7b7559c2b09  GLO_2021_HensEggs_t.tif
-608cb760c7860370406e5a06239334abe02a87b87bbf07c5bb8a9d30bfaeb03a  GLO_2021_TotalRawMilk_t.tif
+890c9474871b98d932be5c79d1f2a7209b93649460ddb6b09e34c3c37447c91c  GLO_2021_HensEggs_t.tif
+4f4a6c1d18a54372d1c211337fa908138eb8e58d552c3f3b2f67b34d3f73133e  GLO_2021_TotalRawMilk_t.tif
diff --git a/data/preprocessing/livestock_processed/Makefile b/data/preprocessing/livestock_processed/Makefile
@@ -5,9 +5,9 @@ AWS_S3_BUCKET_URL=s3://landgriffon-raw-data
 export AWS_ACCESS_KEY_ID = $(DATA_S3_ACCESS_KEY)
 export AWS_SECRET_ACCESS_KEY = $(DATA_S3_SECRET_KEY)
 
-.PHONY: download_pasture_data download_faostats_data_production download_faostats_data_harvest preprocess_faostats_data_production preprocess_faostats_data_harvest calculate_aggregation rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums
+.PHONY: download_pasture_data download_faostats_data_production download_faostats_data_harvest preprocess_faostats_data_production preprocess_faostats_data_harvest calculate_aggregation rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums download_faostats_stocks_data preprocess_faostats_data_stocks rasterize_percentage_stock
 
-all: download_pasture_data download_faostats_data_production download_faostats_data_harvest preprocess_faostats_data_production preprocess_faostats_data_harvest calculate_aggregation rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums
+# the stock targets are listed before the rasterize_and_calculate_* targets because those read the stock percentage rasters
+all: download_pasture_data download_faostats_data_production download_faostats_data_harvest download_faostats_stocks_data preprocess_faostats_data_production preprocess_faostats_data_harvest preprocess_faostats_data_stocks rasterize_percentage_stock rasterize_and_calculate_commodities_production rasterize_and_calculate_commodities_harvest upload_livestock_processed_production upload_livestock_processed_harvest write_checksums
 
 download_pasture_data:
 	mkdir -p $(data_dir)/pasture
@@ -24,50 +24,72 @@ download_faostats_data_harvest:
 	mkdir -p $(data_dir)/faostats/harvest
 	aws s3 sync $(AWS_S3_BUCKET_URL)/raw/FAOSTAT_livestock/harvest data/faostats/harvest/
 
-# preprocess hens eggs and total milk production equivalent
+# download the raw FAOSTAT stocks files from S3 into the directory read by preprocess_faostats_data_stocks
+download_faostats_stocks_data:
+	mkdir -p $(data_dir)/faostats/stocks
+	aws s3 sync $(AWS_S3_BUCKET_URL)/raw/FAOSTAT_livestock/stocks $(data_dir)/faostats/stocks/
+
+# compute, per country, the share of dairy cattle (dairy vs. non-dairy) and of laying hens
+# (layers vs. broilers), and write each result as a shapefile
+preprocess_faostats_data_stocks:
+	mkdir -p $(data_dir)/faostats_processed/stocks
+	python preprocess_faostats_stocks.py \
+		$(data_dir)/faostats/stocks/FAOSTAT_cattle_dairy_2019.csv \
+		$(data_dir)/faostats/stocks/FAOSTAT_cattle_non_dairy_2019.csv \
+		$(data_dir)/faostats_processed/stocks/FAOSTAT_cattle_dairy_percentage.shp;
+	python preprocess_faostats_stocks.py \
+		$(data_dir)/faostats/stocks/FAOSTAT_chickens_layers_2019.csv \
+		$(data_dir)/faostats/stocks/FAOSTAT_chickens_broilers_2019.csv \
+		$(data_dir)/faostats_processed/stocks/FAOSTAT_chickens_eggs_percentage.shp;
+
+# clean and vectorize faostats production data for hens and milk
 preprocess_faostats_data_production:
 	mkdir -p $(data_dir)/faostats_processed/production
-	python preprocess_faostats.py $(data_dir)/faostats/production/FAOSTAT_data_hens_eggs_iso3_2021.csv \
+	python preprocess_faostats_ha_prod.py $(data_dir)/faostats/production/FAOSTAT_data_hens_eggs_iso3_2021.csv \
 		$(data_dir)/faostats_processed/production/FAOSTAT_data_hens_eggs_iso3_2021_t.shp production;
-	python preprocess_faostats.py $(data_dir)/faostats/production/FAOSTAT_data_total_milk_iso3_2021.csv \
+	python preprocess_faostats_ha_prod.py $(data_dir)/faostats/production/FAOSTAT_data_total_milk_iso3_2021.csv \
 		$(data_dir)/faostats_processed/production/FAOSTAT_data_total_milk_iso3_2021_t.shp production;
 
-# preprocess hens eggs and total milk harvest equivalent
+# clean and vectorize faostats harvest data for hens and milk
 preprocess_faostats_data_harvest:
 	mkdir -p $(data_dir)/faostats_processed/harvest
-	python preprocess_faostats.py $(data_dir)/faostats/harvest/FAOSTAT_data_chickens_LSU_ha.csv \
+	python preprocess_faostats_ha_prod.py $(data_dir)/faostats/harvest/FAOSTAT_data_chickens_LSU_ha.csv \
 		$(data_dir)/faostats_processed/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.shp harvest;
-	python preprocess_faostats.py $(data_dir)/faostats/harvest/FAOSTAT_data_cattle_buffalo_LSU_ha.csv \
+	python preprocess_faostats_ha_prod.py $(data_dir)/faostats/harvest/FAOSTAT_data_cattle_buffalo_LSU_ha.csv \
 		$(data_dir)/faostats_processed/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.shp harvest;
 
-# aggregate the pasture data to get the totals that we will need to the total milk calculation
-# the -1.7e+308 is the nodata value for the pasture data. We need to set it to 0 for the calculation.
-calculate_aggregation:
-	gdal_calc.py --NoDataValue=0 --quiet --calc "(A!=-1.7e+308)*A+(B!=-1.7e+308)*B+(C!=-1.7e+308)*C" --format GTiff --type Float64 \
-		-A $(data_dir)/pasture/6_Ct_2010_Aw.tif \
-		-B $(data_dir)/pasture/6_Gt_2010_Aw.tif \
-		-C $(data_dir)/pasture/6_Sh_2010_Aw.tif \
-		--outfile $(data_dir)/pasture/6_Tm_2010_Aw.tif
-
 #rasterise and calculate the tonnes of material
+# rasterize the "percentage" attribute of the stock shapefiles, using the same resolution
+# and extent as the production and harvest rasters so they can be combined in gdal_calc
+rasterize_percentage_stock:
+	mkdir -p $(data_dir)/rasterized/stocks
+	gdal_rasterize -q -l FAOSTAT_cattle_dairy_percentage -a percentage -tr 0.083333 0.083333 -a_nodata 0 \
+		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
+		$(data_dir)/faostats_processed/stocks/FAOSTAT_cattle_dairy_percentage.shp \
+		$(data_dir)/rasterized/stocks/FAOSTAT_cattle_dairy_percentage.tif;
+	gdal_rasterize -q -l FAOSTAT_chickens_eggs_percentage -a percentage -tr 0.083333 0.083333 -a_nodata 0 \
+		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
+		$(data_dir)/faostats_processed/stocks/FAOSTAT_chickens_eggs_percentage.shp \
+		$(data_dir)/rasterized/stocks/FAOSTAT_chickens_eggs_percentage.tif;
+
+# This target needs the rasterized stock percentages (share of dairy cattle and of laying hens) to exist first.
+# Here we rasterize the FAOSTAT production data for chickens and cattle,
+# then multiply the rasterized stock percentage by the chicken and cattle distribution data from GLWv3
+# and by the rasterized production data.
 rasterize_and_calculate_commodities_production:
 	mkdir -p $(data_dir)/rasterized/production
 	mkdir -p $(data_dir)/processed_commodities/production
 	gdal_rasterize -q -l FAOSTAT_data_hens_eggs_iso3_2021_t -a Value -tr 0.083333 0.083333 -a_nodata 0 \
 		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
 		$(data_dir)/faostats_processed/production/FAOSTAT_data_hens_eggs_iso3_2021_t.shp \
 		$(data_dir)/rasterized/production/FAOSTAT_data_hens_eggs_iso3_2021_t.tif;
-	gdal_calc.py --quiet --calc "A*B" --format GTiff --type Float32 --NoDataValue 0.0 \
-		-A $(data_dir)/rasterized/production/FAOSTAT_data_hens_eggs_iso3_2021_t.tif --A_band 1 \
-		-B $(data_dir)/pasture/6_Ch_2010_Aw.tif \
-		--outfile $(data_dir)/processed_commodities/production/GLO_2021_HensEggs_t.tif;
 	gdal_rasterize -q -l FAOSTAT_data_total_milk_iso3_2021_t -a Value -tr 0.083333 0.083333 -a_nodata 0 \
 		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
 		$(data_dir)/faostats_processed/production/FAOSTAT_data_total_milk_iso3_2021_t.shp \
 		$(data_dir)/rasterized/production/FAOSTAT_data_total_milk_iso3_2021_t.tif;
-	gdal_calc.py --NoDataValue=0 --quiet --calc "A*(B!=3.40282e+38)*B" --format GTiff --type Float64 \
+	gdal_calc.py --quiet --calc "A*(B!=3.40282e+38)*B*C" --format GTiff --type Float32 --NoDataValue 0.0 \
+		-A $(data_dir)/rasterized/production/FAOSTAT_data_hens_eggs_iso3_2021_t.tif --A_band 1 \
+		-B $(data_dir)/pasture/6_Ch_2010_Aw.tif \
+		-C $(data_dir)/rasterized/stocks/FAOSTAT_chickens_eggs_percentage.tif \
+		--outfile $(data_dir)/processed_commodities/production/GLO_2021_HensEggs_t.tif;
+	gdal_calc.py --NoDataValue=0 --quiet --calc "A*(B!=3.40282e+38)*B*C" --format GTiff --type Float64 \
 		-A $(data_dir)/rasterized/production/FAOSTAT_data_total_milk_iso3_2021_t.tif --A_band 1 \
-		-B $(data_dir)/pasture/6_Tm_2010_Aw.tif \
+		-B $(data_dir)/pasture/6_Ct_2010_Aw.tif \
+		-C $(data_dir)/rasterized/stocks/FAOSTAT_cattle_dairy_percentage.tif \
 		--outfile $(data_dir)/processed_commodities/production/GLO_2021_TotalRawMilk_t.tif;
 
 # 3.40282e+38 is the nodata value for the total milk data. We need to set it to 0 for the calculation.
@@ -78,17 +100,19 @@ rasterize_and_calculate_commodities_harvest:
 		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
 		$(data_dir)/faostats_processed/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.shp \
 		$(data_dir)/rasterized/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.tif;
-	gdal_calc.py --quiet --calc "B/((A!=0)*A)" --format GTiff --type Float32 --NoDataValue 0.0 \
+	gdal_calc.py --quiet --calc "(B*C)/((A!=0)*A)" --format GTiff --type Float32 --NoDataValue 0.0 \
 		-A $(data_dir)/rasterized/harvest/FAOSTAT_data_hens_eggs_iso3_2021_ha.tif --A_band 1 \
 		-B $(data_dir)/pasture/6_Ch_2010_Aw.tif \
+		-C $(data_dir)/rasterized/stocks/FAOSTAT_chickens_eggs_percentage.tif \
 		--outfile $(data_dir)/processed_commodities/harvest/GLO_2021_HensEggs_ha.tif;
 	gdal_rasterize -q -l FAOSTAT_data_total_milk_iso3_2021_ha -a Value -tr 0.083333 0.083333 -a_nodata 0 \
 		-te -180.0 -89.99928 179.99856 90.0 -ot Float32 -of GTiff \
 		$(data_dir)/faostats_processed/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.shp \
 		$(data_dir)/rasterized/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.tif;
-	gdal_calc.py --NoDataValue=0 --quiet --calc "((B!=3.40282e+38)*B)/((A!=0)*A)" --format GTiff --type Float64 \
+	gdal_calc.py --NoDataValue=0 --quiet --calc "(((B!=3.40282e+38)*B)*C)/((A!=0)*A)" --format GTiff --type Float64 \
 		-A $(data_dir)/rasterized/harvest/FAOSTAT_data_total_milk_iso3_2021_ha.tif --A_band 1 \
 		-B $(data_dir)/pasture/6_Tm_2010_Aw.tif \
+		-C $(data_dir)/rasterized/stocks/FAOSTAT_cattle_dairy_percentage.tif \
 		--outfile $(data_dir)/processed_commodities/harvest/GLO_2021_TotalRawMilk_ha.tif;
 
 upload_livestock_processed_production:

diff --git a/...ivestock_processed/preprocess_faostats.py → ..._processed/preprocess_faostats_ha_prod.py b/...ivestock_processed/preprocess_faostats.py → ..._processed/preprocess_faostats_ha_prod.py
@@ -17,6 +17,7 @@ def get_country_geometry():
     This geometry is used to spatalised the faostat data
     """
     # Connect to the database
+
     conn = psycopg2.connect(
         host=os.getenv("API_POSTGRES_HOST"),
         port=os.getenv("API_POSTGRES_PORT"),

diff --git a/data/preprocessing/livestock_processed/preprocess_faostats_stocks.py b/data/preprocessing/livestock_processed/preprocess_faostats_stocks.py
@@ -0,0 +1,126 @@
+import os
+import logging
+import argparse
+
+import pandas as pd
+import geopandas as gpd
+import psycopg2
+
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("preprocessing_processed_livestock_stock_faostats_file")
+
+
+def clean_data(df, columns):
+    """
+    Return the subset of ``df`` restricted to the given ``columns``.
+    """
+    return df[columns]
+
+
+def rename_columns(df, column_map):
+    """
+    Return a copy of ``df`` whose columns are renamed according to ``column_map`` (old name -> new name).
+    """
+    return df.rename(columns=column_map)
+
+
+def merge_data(df1, df2, on):
+    """
+    Join ``df1`` with ``df2`` on the key column(s) ``on`` using the default merge settings.
+    """
+    return df1.merge(right=df2, on=on)
+
+
+def calculate_percentage(df, numerator_col, denominator_col, output_col):
+    """
+    Store the share of the numerator column in the sum of the numerator and denominator columns.
+
+    The share is a fraction (numerator / (numerator + denominator)); it is not multiplied by 100.
+    It is written to ``output_col`` of ``df`` in place and the same dataframe is returned.
+    Rows whose combined total is 0 get a share of 0.0 instead of the NaN that 0 / 0 would produce.
+    """
+    total = df[numerator_col] + df[denominator_col]
+    share = df[numerator_col] / total
+    # Keep the computed share where the total is non-zero; use 0.0 for the undefined 0 / 0 rows
+    df[output_col] = share.where(total != 0, 0.0)
+    return df
+
+
+def get_country_geometry():
+    """
+    Get the country geometry from a database.
+
+    Connection settings are read from the API_POSTGRES_* environment variables.
+    Returns the result of the query below (columns "isoA3" and "theGeom") with "theGeom"
+    used as the geometry column. The connection is closed before returning, even if the query fails.
+    """
+
+    # Connect to the database
+    conn = psycopg2.connect(
+        host=os.getenv("API_POSTGRES_HOST"),
+        port=os.getenv("API_POSTGRES_PORT"),
+        database=os.getenv("API_POSTGRES_DATABASE"),
+        user=os.getenv("API_POSTGRES_USERNAME"),
+        password=os.getenv("API_POSTGRES_PASSWORD"),
+    )
+
+    try:
+        # Get the countries geometries
+        countries_df = gpd.read_postgis(
+            """SELECT ar."isoA3", gr."theGeom"
+                                    FROM admin_region ar
+                                    INNER JOIN geo_region gr ON gr.id = ar."geoRegionId"
+                                    WHERE ar."level" = 0
+                                    """,
+            conn,
+            geom_col="theGeom",
+        )
+    finally:
+        # Release the database connection; it was previously left open
+        conn.close()
+    return countries_df
+
+
+def merge_with_geometry(df, geometry_df, on):
+    """
+    Join ``df`` with the country geometry dataframe ``geometry_df`` on the key column(s) ``on``.
+    """
+    return df.merge(right=geometry_df, on=on)
+
+
+def main():
+    """
+    Command-line entry point.
+
+    Reads the main and secondary FAOSTAT CSV files, computes main / (main + secondary) per ISO3 code,
+    joins the result with the country geometries and writes it to the output file.
+    """
+    # Parse command-line arguments
+    cli = argparse.ArgumentParser(description="Process livestock preprocessed faostats data.")
+    cli.add_argument(
+        "input_file_main",
+        type=str,
+        help="Path to the input file containing vector files of the main value to preprocess",
+    )
+    cli.add_argument(
+        "input_file_secondary",
+        type=str,
+        help="Path to the input file containing vector files of the secondary value to preproces",
+    )
+    cli.add_argument("output_file", type=str, help="Path to the output file to save processed data")
+    args = cli.parse_args()
+
+    # Load both tables before any processing
+    raw_main = pd.read_csv(args.input_file_main)
+    raw_secondary = pd.read_csv(args.input_file_secondary)
+
+    # Keep only the needed columns, then give the value columns distinct names
+    wanted_columns = ["Area Code (ISO3)", "Value", "Unit"]
+    main_table = rename_columns(
+        clean_data(raw_main, wanted_columns),
+        {"Area Code (ISO3)": "isoA3", "Value": "main_value"},
+    )
+    secondary_table = rename_columns(
+        clean_data(raw_secondary, wanted_columns),
+        {"Area Code (ISO3)": "isoA3", "Value": "secondary_value"},
+    )
+
+    # Join on the ISO3 code and compute the share of the main value
+    shares = calculate_percentage(
+        merge_data(main_table, secondary_table, "isoA3"),
+        "main_value",
+        "secondary_value",
+        "percentage",
+    )
+
+    # Attach the country geometries
+    with_geometry = merge_with_geometry(shares, get_country_geometry(), "isoA3")
+
+    # Set geometry and crs, then save the result
+    with_geometry = with_geometry.set_geometry("theGeom").set_crs("EPSG:4326")
+    with_geometry.to_file(args.output_file)
+
+
+if __name__ == "__main__":
+    main()