nutrient assimilation capacity indicator
alexeh committed Oct 10, 2023
1 parent bdd9d5d commit c0051c1
Showing 4 changed files with 235 additions and 6 deletions.
61 changes: 55 additions & 6 deletions data/h3_data_importer/Makefile
@@ -20,9 +20,11 @@ WORKDIR_HDI=data/contextual/hdi
WORKDIR_DEFORESTATION=data/hansen_loss
WORKDIR_GHG=data/forest_ghg
WORKDIR_WOODPULP=data/woodpulp
WORKING_NATURAL_CROP_CONVERSION=data/natural_crop_conversion
WORKDIR_NATURAL_CROP_CONVERSION=data/natural_crop_conversion
WORKDIR_NUTRIENT_LOAD_REDUCTION=data/nutrient_load_reduction
WORKDIR_BIODIVERSITY=data/biodiversity


CHECKSUMS_PATH=data_checksums

export AWS_ACCESS_KEY_ID = $(DATA_S3_ACCESS_KEY)
@@ -53,6 +55,7 @@ indicators:
make convert-forestGHG
make convert-satDeforestation
make convert-naturalCropConversion
make convert-nutrientLoadReduction
make convert-biodiversity
contextual-layers: convert-hdi-contextual convert-blue-water-contextual

@@ -226,14 +229,14 @@ convert-forestGHG: download-forestGHG
python raster_folder_to_h3_table.py $(WORKDIR_GHG) h3_grid_ghg_global indicator GHG_LUC_T 2021 --h3-res=6 --thread-count=$(PARALLELIZATION_FACTOR)

download-naturalCropConversion:
mkdir -p $(WORKING_NATURAL_CROP_CONVERSION)
aws s3 sync $(AWS_S3_BUCKET_URL)/processed/natural_crop_conversion $(WORKING_NATURAL_CROP_CONVERSION)
cd $(WORKING_NATURAL_CROP_CONVERSION) && sha256sum --check ../../$(CHECKSUMS_PATH)/natural_crop_conversion
mkdir -p $(WORKDIR_NATURAL_CROP_CONVERSION)
aws s3 sync $(AWS_S3_BUCKET_URL)/processed/natural_crop_conversion $(WORKDIR_NATURAL_CROP_CONVERSION)
cd $(WORKDIR_NATURAL_CROP_CONVERSION) && sha256sum --check ../../$(CHECKSUMS_PATH)/natural_crop_conversion


convert-naturalCropConversion: download-naturalCropConversion
@echo "Converting natural crop conversion data... "
python raster_folder_to_h3_table.py $(WORKING_NATURAL_CROP_CONVERSION) h3_grid_natural_crop_conversion_global indicator NECR 2022 --h3-res=6 --thread-count=$(PARALLELIZATION_FACTOR)
python raster_folder_to_h3_table.py $(WORKDIR_NATURAL_CROP_CONVERSION) h3_grid_natural_crop_conversion_global indicator NECR 2022 --h3-res=6 --thread-count=$(PARALLELIZATION_FACTOR)


download-biodiversity:
@@ -320,7 +323,7 @@ convert-woodpulp: download-woodpulp
python raster_folder_to_h3_table.py $(WORKDIR_WOODPULP)/ha h3_grid_woodpulp_ha harvest_area gfw_plantations 2021 --h3-res=6 --thread-count=$(PARALLELIZATION_FACTOR)
python raster_folder_to_h3_table.py $(WORKDIR_WOODPULP)/prod h3_grid_woodpulp_prod production gfw_plantations 2021 --h3-res=6 --thread-count=$(PARALLELIZATION_FACTOR)
###################
# Contextual data #
# Aqueduct data #
###################

# Aqueduct Global water risk contextual data
@@ -332,12 +335,58 @@ extract-aqueduct: download-aqueduct
unzip -q -u $(WORKDIR_AQUEDUCT)/excess_withdrawals.zip -d $(WORKDIR_AQUEDUCT)/
cd $(WORKDIR_AQUEDUCT) && sha256sum --check ../../../$(CHECKSUMS_PATH)/excess_withdrawals


convert-aqueduct: extract-aqueduct
@echo "Converting excess withdrawals data... "
python vector_folder_to_h3_table.py $(WORKDIR_AQUEDUCT) h3_grid_excess_withdrawals_global perc_reduc excess_withdrawals "Environmental datasets" 2023 --indicator=UWUSR_T --h3-res=6
@echo "Including contextual layer... "
python vector_folder_to_h3_table.py $(WORKDIR_AQUEDUCT) h3_grid_aqueduct_global bws_cat aqueduct "Environmental datasets" 2023


#########################################
# Nutrient load reduction data #
#########################################

download-nutrient-load-reduction:
mkdir -p $(WORKDIR_NUTRIENT_LOAD_REDUCTION)
aws s3 sync $(AWS_S3_BUCKET_URL)/processed/nutrients_load_reduction/ $(WORKDIR_NUTRIENT_LOAD_REDUCTION)

extract-nutrient-load-reduction:download-nutrient-load-reduction
unzip -q -u $(WORKDIR_NUTRIENT_LOAD_REDUCTION)/nutrient_load_reduction.zip -d $(WORKDIR_NUTRIENT_LOAD_REDUCTION)/
cd $(WORKDIR_NUTRIENT_LOAD_REDUCTION) && sha256sum --check ../../$(CHECKSUMS_PATH)/nutrient_load_reduction

convert-nutrientLoadReduction: extract-nutrient-load-reduction
@echo "Converting nutrient load reduction data... "
python vector_folder_to_h3_table.py $(WORKDIR_NUTRIENT_LOAD_REDUCTION) h3_grid_nutrient_load_global perc_reduc nutrient_load_reduction "Environmental datasets" 2023 --indicator=NLR --h3-res=6
@echo "Including contextual layer... "
python vector_folder_to_h3_table.py $(WORKDIR_NUTRIENT_LOAD_REDUCTION) h3_grid_limiting_nutrients_global Cases_v2_1 limiting_nutrient "Environmental datasets" 2023

###################
# Contextual data #
###################

#########################################
# Nutrient load reduction data #
#########################################

download-nutrient-load-reduction:

Review comment from @elpamart (Contributor), Oct 10, 2023:

You are already doing the nutrient load reduction download, extract, and convert above; we need to remove the three duplicated targets below.

mkdir -p $(WORKDIR_NUTRIENT_LOAD_REDUCTION)
aws s3 sync $(AWS_S3_BUCKET_URL)/processed/nutrients_load_reduction/ $(WORKDIR_NUTRIENT_LOAD_REDUCTION)

extract-nutrient-load-reduction:download-nutrient-load-reduction
unzip -q -u $(WORKDIR_NUTRIENT_LOAD_REDUCTION)/nutrient_load_reduction.zip -d $(WORKDIR_NUTRIENT_LOAD_REDUCTION)/
cd $(WORKDIR_NUTRIENT_LOAD_REDUCTION) && sha256sum --check ../../$(CHECKSUMS_PATH)/nutrient_load_reduction

convert-nutrientLoadReduction: extract-nutrient-load-reduction
@echo "Converting nutrient load reduction data... "
python vector_folder_to_h3_table.py $(WORKDIR_NUTRIENT_LOAD_REDUCTION) h3_grid_nutrient_load_global perc_reduc nutrient_load_reduction "Environmental datasets" 2023 --indicator=NLR --h3-res=6
@echo "Including contextual layer... "
python vector_folder_to_h3_table.py $(WORKDIR_NUTRIENT_LOAD_REDUCTION) h3_grid_limiting_nutrients_global Cases_v2_1 limiting_nutrient "Environmental datasets" 2023

###################
# Contextual data #
###################

download-hdi-contextual:
mkdir -p $(WORKDIR_HDI)
wget --show-progress -q -O $(WORKDIR_HDI)/IHDI_HDR2020_040722.csv https://hdr.undp.org/sites/default/files/data/2020/IHDI_HDR2020_040722.csv
25 changes: 25 additions & 0 deletions data/preprocessing/nutrient_assimilation_capacity/Makefile
@@ -0,0 +1,25 @@
# Makefile for downloading, processing, and uploading data
# Variables
DATA_DIR=data/
checksums_dir=../../../../h3_data_importer/data_checksums
AWS_S3_BUCKET_URL=s3://landgriffon-raw-data

# Targets
.PHONY: all unzip-limiting-nutrient process-limiting-nutrients upload_results write_checksum

all: unzip-limiting-nutrient

# First you need to download the data manually from https://figshare.com/articles/figure/DRP_NO3_TN_TP_rasters/14527638/1?file=31154728 and save it in nutrient_assimilation_capacity/data
unzip-limiting-nutrient:
unzip -q -u $(DATA_DIR)/hybas_l03_v1c_Cases.zip -d $(DATA_DIR)/

# Preprocess the data before ingestion instead of performing these calculations in the database
process-limiting-nutrients:
python process_data.py $(DATA_DIR)/hybas_l03_v1c_Cases

upload_results:
aws s3 cp $(DATA_DIR)/hybas_l03_v1c_Cases/nutrient_assimilation_capacity.shp ${AWS_S3_BUCKET_URL}/processed/nutrients_assimilation_capacity/

write_checksum:
cd $(DATA_DIR)/hybas_l03_v1c_Cases && sha256sum nutrient_assimilation_capacity.shp > $(checksums_dir)/nutrient_assimilation_capacity

65 changes: 65 additions & 0 deletions data/preprocessing/nutrient_assimilation_capacity/README
@@ -0,0 +1,65 @@
# Data Processing Pipeline README

This directory contains a data processing pipeline, implemented with a Makefile and a Python script, to unzip, preprocess, upload, and checksum geospatial data on nutrient assimilation capacity. The data download itself is a manual step (see Prerequisites).
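
The end-to-end flow uses the four targets defined in the accompanying Makefile; a typical run, once the data has been downloaded manually, is sketched below:

```bash
make unzip-limiting-nutrient     # unzip the manually downloaded archive into data/
make process-limiting-nutrients  # reproject and compute the perc_reduc column
make upload_results              # copy the resulting shapefile to S3
make write_checksum              # record its SHA-256 for the h3_data_importer
```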

## Prerequisites

Before running the pipeline, ensure you have the following prerequisites in place:

1. **Data Download**: You need to manually download the data from [here](https://figshare.com/articles/figure/DRP_NO3_TN_TP_rasters/14527638/1?file=31154728) and save it in the `data/` directory.

2. **Python Dependencies**: The preprocessing script requires Python and the following packages (see the install sketch after this list):
   - `geopandas`
   - Any other dependencies imported by `process_data.py`.

3. **AWS Credentials**: To upload results to an AWS S3 bucket, you should have AWS credentials configured on your machine.
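
A minimal environment setup, assuming a pip-based install (the AWS CLI can also be installed through your system package manager), might look like:

```bash
pip install geopandas awscli
```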

## Usage

### 1. Download and Unzip Data

Once the archive has been downloaded manually (see Prerequisites) and saved in the `data/` directory, unzip it with:

```bash
make unzip-limiting-nutrient
```
This command unzips `hybas_l03_v1c_Cases.zip` inside the `data/` directory; the download itself is a manual step.
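
For reference, a typical sequence looks like the following; the download location is an assumption, and the archive name comes from the Makefile:

```bash
mv ~/Downloads/hybas_l03_v1c_Cases.zip data/
make unzip-limiting-nutrient
```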

### 2. Preprocess Data

Before ingesting the data into your database, preprocess it using the Python script. Run the following command:

```bash
make process-limiting-nutrients
```
This command runs the `process_data.py` script, which reprojects the data to EPSG:4326 and calculates the percentage reduction in nutrient concentration (`perc_reduc`) needed to reach good water quality.
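
The Make target is a thin wrapper; the script can equally be invoked directly on the unzipped folder:

```bash
python process_data.py data/hybas_l03_v1c_Cases
```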

### 3. Upload Processed Data

To upload the processed data to an AWS S3 bucket, use the following command:

```bash
make upload_results
```
Make sure you have AWS credentials configured to access the specified S3 bucket.
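
How you provide credentials is up to you; one common option is to export them as environment variables (values below are placeholders) before running the target:

```bash
export AWS_ACCESS_KEY_ID=<your-access-key-id>
export AWS_SECRET_ACCESS_KEY=<your-secret-access-key>
make upload_results
```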

### 4. Generate Checksum

Generate a SHA-256 checksum for the processed data by running the following command:

```bash
make write_checksum
```
This command calculates a checksum for `nutrient_assimilation_capacity.shp` and saves it to the h3_data_importer `data_checksums/` directory (the `checksums_dir` variable in the Makefile).
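
The checksum is intended to be verified later with `sha256sum --check`, as the h3_data_importer Makefile does for other datasets; a manual check from this directory would look roughly like:

```bash
cd data/hybas_l03_v1c_Cases && \
  sha256sum --check ../../../../h3_data_importer/data_checksums/nutrient_assimilation_capacity
```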

## Configuration

You can configure the pipeline by modifying the variables at the top of the Makefile:

- `DATA_DIR`: Specify the directory where data files are stored.
- `checksums_dir`: Define the directory where checksum files will be saved.
- `AWS_S3_BUCKET_URL`: Set the AWS S3 bucket URL for uploading results.
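
Because these are plain Make variables, they can also be overridden per invocation without editing the Makefile (the bucket below is a placeholder):

```bash
make upload_results AWS_S3_BUCKET_URL=s3://my-other-bucket
```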

Feel free to adapt this pipeline to suit your specific data processing needs and directory structure.

**Note**: Make sure you have the necessary permissions and access to the data sources and AWS resources mentioned in this README before running the pipeline.
90 changes: 90 additions & 0 deletions data/preprocessing/nutrient_assimilation_capacity/process_data.py
@@ -0,0 +1,90 @@
""" Reads the limiting nutrients equal area vector file, reporjects the file to EPSG4326 and estimates the percentage of reduction needed to meet a good water quality conditions.
Usage:
process_data.py <folder>
Arguments:
<folder> Folder containing the limiting nutrients shapefile
"""
import os
import logging
from pathlib import Path
import argparse

import geopandas as gpd

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("preprocessing_limiting_nutrients_file")

def check_and_reproject_to_4326(gdf):
"""
Checks if a GeoDataFrame is in CRS 4326 (WGS84) and reprojects it if not.
Parameters:
- gdf: GeoDataFrame to check and reproject if needed.
Returns:
- Reprojected GeoDataFrame (if reprojected) or the original GeoDataFrame (if already in 4326).
"""
if gdf.crs is None or gdf.crs.to_epsg() != 4326:
log.info("Reprojecting GeoDataFrame to EPSG:4326 (WGS84)...")
try:
# Reproject to EPSG:4326
gdf = gdf.to_crs(epsg=4326)
log.info("Reprojection successful.")
except:
log.error("Reprojection failed with error")
else:
log.info("GeoDataFrame is already in EPSG:4326 (WGS84).")

return gdf

# Define the function to calculate perc_reduction
def calculate_perc_reduction(row):
if row['Cases_v2_1'] == 4 and row['TP_con_V2_']:
return ((row['TP_con_V2_'] - 0.046) / row['TP_con_V2_']) * 100
elif row['Cases_v2_1'] == 2 and row['TN_con_V2_']:
return ((row['TN_con_V2_'] - 0.7) / row['TN_con_V2_']) * 100
else:
return 0

def process_folder(folder):
vec_extensions = "gdb gpkg shp json geojson".split()
path = Path(folder)
vectors = []
for ext in vec_extensions:
vectors.extend(path.glob(f"*.{ext}"))
if not vectors:
log.error(f"No vectors with extension {vec_extensions} found in {folder}")
return
if len(vectors) == 1: #folder just contains one vector file
# Read the shapefile
gdf = gpd.read_file(vectors[0])
# Check and reproject to EPSG:4326
gdf = check_and_reproject_to_4326(gdf)
# Calculate perc_reduction and add it as a new column
gdf['perc_reduc'] = gdf.apply(calculate_perc_reduction, axis=1)
# Save the processed data to a new shapefile
gdf = gdf[['Cases_v2_1', 'perc_reduc', 'geometry']]
output_file = os.path.join(folder, 'nutrient_assimilation_capacity.shp')
log.info(f"Saving preprocessed file to {output_file}")
gdf.to_file(output_file)
else:
mssg = (
f"Found more than one vector file in {folder}."
f" For now we only support folders with just one vector file."
)
logging.error(mssg)
return

def main():
# Parse command-line arguments
parser = argparse.ArgumentParser(description="Process limiting nutrients vector files.")
parser.add_argument("folder", type=str, help="Path to the folder containing vector files")
args = parser.parse_args()

# Process the specified folder
process_folder(args.folder)

if __name__ == "__main__":
main()
