diff --git a/Dockerfile b/Dockerfile index 2032cce8..1c765e1c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,6 +30,7 @@ RUN apt-get update -y && apt-get install -y \ tmux \ git \ vim \ + zip \ && apt-get clean all diff --git a/analyses/CarbonFluxQA/Components/01_DownloadFiles.py b/analyses/CarbonFluxQA/Components/01_DownloadFiles.py new file mode 100644 index 00000000..dff8dcaa --- /dev/null +++ b/analyses/CarbonFluxQA/Components/01_DownloadFiles.py @@ -0,0 +1,4 @@ +from funcs import download_files + +# Create folder structure, download files, and clip TCL rasters to GADM extent +download_files() \ No newline at end of file diff --git a/analyses/CarbonFluxQA/Components/02_CreateMasks.py b/analyses/CarbonFluxQA/Components/02_CreateMasks.py new file mode 100644 index 00000000..68abf5bc --- /dev/null +++ b/analyses/CarbonFluxQA/Components/02_CreateMasks.py @@ -0,0 +1,9 @@ +import constants_and_names as cn +from funcs import create_masks + +# Set input folders = Mask, Inputs folders and select tcd_threshold/ gain/ save intermediate values +create_masks(cn.tcd_threshold, cn.gain, cn.save_intermediates) + +# Other options: +# create_masks([0, 75], cn.gain, False) +# create_masks([30], cn.gain, True) \ No newline at end of file diff --git a/analyses/CarbonFluxQA/Components/03_ZonalStats_Masked.py b/analyses/CarbonFluxQA/Components/03_ZonalStats_Masked.py new file mode 100644 index 00000000..8ef48537 --- /dev/null +++ b/analyses/CarbonFluxQA/Components/03_ZonalStats_Masked.py @@ -0,0 +1,5 @@ +import constants_and_names as cn +from funcs import zonal_stats_masked + +# Calculate zonal stats for all input rasters at each tcd threshold value +zonal_stats_masked(cn.aois_folder, cn.input_folder, cn.mask_output_folder, cn.outputs_folder) diff --git a/analyses/CarbonFluxQA/Components/04_ZonalStats_Annualized.py b/analyses/CarbonFluxQA/Components/04_ZonalStats_Annualized.py new file mode 100644 index 00000000..73fda3b8 --- /dev/null +++ b/analyses/CarbonFluxQA/Components/04_ZonalStats_Annualized.py @@ -0,0 +1,6 @@ +import constants_and_names as cn +from funcs import zonal_stats_annualized + +# Calculate emissions for each year of tree cover loss using TCL rasters +zonal_stats_annualized(cn.tcl_clip_folder, cn.input_folder, cn.mask_output_folder, cn.annual_folder) + diff --git a/analyses/CarbonFluxQA/Components/05_ZonalStats_Clean.py b/analyses/CarbonFluxQA/Components/05_ZonalStats_Clean.py new file mode 100644 index 00000000..6d3bf787 --- /dev/null +++ b/analyses/CarbonFluxQA/Components/05_ZonalStats_Clean.py @@ -0,0 +1,4 @@ +from funcs import zonal_stats_clean + +#Combine and clean masked and annual zonal stats output into final csv +zonal_stats_clean() \ No newline at end of file diff --git a/analyses/CarbonFluxQA/README.md b/analyses/CarbonFluxQA/README.md new file mode 100644 index 00000000..3182c16c --- /dev/null +++ b/analyses/CarbonFluxQA/README.md @@ -0,0 +1,157 @@ +### Purpose + + The purpose of this tool is to automate the zonal statistics portion of the QAQC process for the + Annual Carbon Flux Model (Harris, 2021) Update. Each year, the Carbon Flux Model is run with updated activity data + for tree cover loss (Hansen, 2013) and auxiliary inputs. Before the data is launched on Global Forest Watch, the + outputs are compared across platforms and methods including the GeoTrellis Tool, GFW Dashboard download spreadsheets, + the GFW API, and ArcGIS Zonal Statistics calculations. 
+ +### Overview + + This code automatically creates the working directory folder structure, downloads all of the required data + from s3, and produces a summary csv with the resulting zonal statistics. + +### User Inputs + +#### Area(s) of Interest: + + The only file(s) required by the user are the shapefile(s) for the area(s) of interest. The shapefile(s) need to be + located in a subfolder in the working directory named "AOIS". + + This tool is set up to run statistics for as many areas of interest as the user provides. We used country + boundaries for Indonesia and Gambia from the GADM 3.6 Dataset, available for download here: + [https://gadm.org/download_country_v3.html](https://gadm.org/download_country_v3.html). + + The Indonesia boundary is IND.14.13 and the Gambia boundary is GMB.2. + + These inputs will need to be updated if and when GFW switches to a newer version of GADM. + +| Dataset | Directory | Description | +|---------------------|---------------------------|-----------------------------------------| +| Area(s) of Interest | {working_directory}/AOIS/ | Shapefiles for the area(s) of interest. | + + +#### User-Specified Parameters: + You must update the constants_and_names.py file with the path to your working_directory. This is the folder that + contains your areas of interest and where all of the data and results will be saved. There are a number of other + arguments in the constants_and_names.py file that users have the option to update. A description of each argument is + detailed below: + +| Argument | Description | Type | +|-------------------------|------------------------------------------------------------------------------|------------| +| working_directory | Directory which contains the AOIS subfolder. | String | +| overwrite_arcgis_output | Whether or not you want to overwrite previous arcpy outputs. | Boolean | +| loss_years | Number of years of tree cover loss in the TCL dataset. | Integer | +| model_run_date | s3 folder where per-pixel outputs from most recent model run are located. | String | +| tile_list | List of 10 x 10 degree tiles that overlap with all aoi(s). | List | +| tile_dictionary | Dictionary that matches each country to their overlapping tiles. | Dictionary | +| extent | Which tile set(s) to download for zonal stats (options: forest, full, both). | String | +| tcd_threshold | List of tree cover density thresholds to mask by. | List | +| gain | Whether to include tree cover gain pixels in masks. | Boolean | +| save_intermediates | Whether to save intermediate masks (useful for troubleshooting). | Boolean | + + +### Datasets + +#### Carbon Flux Model Data: + + Three separate outputs from the Carbon Flux Model, each with two different extents, are used as inputs in + this tool. This is a total of six different possible inputs. Inputs include gross emissions (all gases), + gross removals (CO2), and net flux (CO2e). All inputs are in Mg / pixel. You have the option to calculate + zonal statistics according to tile extent: forest extent only, full extent only, or both extents.
| AOI | Extent | Type | Units | Tile | +|-----|--------|-----------------|---------------|----------| +| IDN | Forest | Gross Emissions | Mg CO2e/pixel | 00N_110E | +| IDN | Forest | Gross Removals | Mg CO2/pixel | 00N_110E | +| IDN | Forest | Net Flux | Mg CO2e/pixel | 00N_110E | +| IDN | Full | Gross Emissions | Mg CO2e/pixel | 00N_110E | +| IDN | Full | Gross Removals | Mg CO2/pixel | 00N_110E | +| IDN | Full | Net Flux | Mg CO2e/pixel | 00N_110E | +| GMB | Forest | Gross Emissions | Mg CO2e/pixel | 20N_020W | +| GMB | Forest | Gross Removals | Mg CO2/pixel | 20N_020W | +| GMB | Forest | Net Flux | Mg CO2e/pixel | 20N_020W | +| GMB | Full | Gross Emissions | Mg CO2e/pixel | 20N_020W | +| GMB | Full | Gross Removals | Mg CO2/pixel | 20N_020W | +| GMB | Full | Net Flux | Mg CO2e/pixel | 20N_020W | + + +#### Auxiliary Datasets: + + Other auxiliary inputs for this tool include: + +| Dataset | Use Description | +|----------------------|------------------------------------------------------------------------------------------| +| Tree Cover Gain | Used to create the tree cover gain mask. | +| Above Ground Biomass | Used to filter the tree cover gain mask to only pixels that contain biomass. | +| Tree Cover Density | Used to create the density threshold mask. | +| Mangrove Extent | Used to create the mangrove mask. Areas of mangrove are included in the mask. | +| Pre-2000 Plantations | Used to create the pre-2000 plantations mask. Pre-2000 plantations are masked from calculations. | +| Tree Cover Loss | Used to calculate annual emissions. | + + +### Outputs: + + The final outputs include one csv file summarizing results for each entry described in the "Carbon Flux Model + Data" table. Additionally, separate csv files for annual emissions are produced. + +### Code Summary + +#### calculate_zonal_statistics + This file runs the code in its entirety. The script executes all functions in the repository + consecutively and produces the output csvs. + +#### constants_and_names + This file stores all of the input arguments provided to the functions. Any changes to the arguments in this file + will be applied to all scripts in the repository. + +#### funcs + This file stores all of the functions used in the tool. Any edits to functions should be made in this file. + +#### components + + This folder houses individual scripts for running separate functions. These can be useful for running particular + functions separately and for testing edits or troubleshooting. Each function is described below. + +##### 01 Download Files + This script creates the folder structure (other than the AOI folder) and downloads all of the required datasets from + s3 using the paths provided in the constants_and_names file. You will need to set AWS_ACCESS_KEY_ID and + AWS_SECRET_ACCESS_KEY in your environment variables for this step to work (assuming you have s3 copy permissions). + +##### 02 Create Masks + This script uses data on tree cover density, tree cover gain, mangrove extent, WHRC biomass, and pre-2000 plantations + to replicate the masks that are used in GFW data processing. This facilitates direct comparison with results from the GFW + dashboard, Geotrellis client, and GFW API. The script creates masks based on the criteria for each input dataset and saves these + masks in a subdirectory. These masks are used later as extent inputs in the Zonal Statistics Masked script.
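As a rough illustration of how these masks are composed, the snippet below is a minimal sketch that mirrors the `and_mask_logic`/`or_mask_logic` calls in funcs.py. The tile id, threshold, and file names are illustrative assumptions rather than the tool's defaults, and running it requires arcpy with a Spatial Analyst license plus previously downloaded Mask/Inputs tiles.

```python
import os
import arcpy
import constants_and_names as cn
from funcs import and_mask_logic, or_mask_logic

# Hypothetical example: one tile at a single tree cover density threshold
tile_id = "00N_110E"
tcd_val = 30

# Local copies of the Mask/Inputs tiles (names assumed to follow the download step)
tcd = os.path.join(cn.tcd_folder, f"{cn.tcd_s3_pattern}_{tile_id}.tif")
whrc = os.path.join(cn.whrc_folder, f"{tile_id}_{cn.whrc_s3_pattern}.tif")
gain = os.path.join(cn.gain_folder, f"{tile_id}_{cn.gain_local_pattern}.tif")

# Pixels where tree cover density exceeds the threshold AND biomass is present...
mask = and_mask_logic(tcd, whrc, tcd_val, 0)
# ...OR where tree cover gain occurred
mask = or_mask_logic(gain, mask, 0)

# Keep only the masked pixels and save next to the other masks for this tile
mask = arcpy.sa.SetNull(mask == 0, mask)
mask.save(os.path.join(cn.mask_output_folder, tile_id, f"{tile_id}_tcd{tcd_val}_gain.tif"))
```

In the full tool, `create_masks()` loops this logic over every tree cover density tile and threshold and additionally applies the mangrove and pre-2000 plantation criteria before saving the final masks.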
##### 03 Zonal Stats Masked + This script calculates zonal statistics for each area of interest and carbon dataset combination and applies each + mask saved in the Mask/Mask directory. The number of masks depends on the number of tcd_threshold values you indicated + and whether or not you set the save_intermediates flag to True. At minimum, this will include the final masks for each + tcd_threshold value, and at maximum it will be the number of tcd_threshold values multiplied by the number of + intermediate masks (this varies depending on whether or not the area of interest includes mangroves and/or + pre-2000 plantations). + +##### 04 Zonal Stats Annualized + This script calculates annual emissions in each area of interest using the tree cover loss dataset. + +##### 05 Zonal Stats Cleaned + This script uses pandas to compile the results of all analyses and export them into a user-friendly csv file. + +### Running the Code + To run the code, you will need to set up a workspace with the AOI inputs organized into the correct directory, + update the user inputs section of the constants_and_names.py file, and provide your AWS_ACCESS_KEY_ID and + AWS_SECRET_ACCESS_KEY in your environment variables. + + This code is built on arcpy, which requires a valid ArcGIS license to run. + +### Other Notes + Updates in progress include... + + - Currently, the annual zonal stats do not sum to the total emissions when using the TCL dataset from s3, + but they do when using the previous TCL clipped rasters. For now, reach out to Erin Glen or Melissa Rose for + the previous TCL clipped rasters. + +#### Contact Info + Erin Glen - erin.glen@wri.org + Melissa Rose - melissa.rose@wri.org \ No newline at end of file diff --git a/analyses/CarbonFluxQA/calculcate_zonal_stats.py b/analyses/CarbonFluxQA/calculcate_zonal_stats.py new file mode 100644 index 00000000..a1ba2928 --- /dev/null +++ b/analyses/CarbonFluxQA/calculcate_zonal_stats.py @@ -0,0 +1,22 @@ +import constants_and_names as cn +from funcs import download_files, create_masks, zonal_stats_masked, zonal_stats_annualized, zonal_stats_clean + +# Execute Download Files... +print("Step 1: Downloading Files... \n") +download_files() + +# Execute Create Masks... +print("Step 2: Creating Masks... \n") +create_masks(cn.tcd_threshold, cn.gain, cn.save_intermediates) + +# Execute Calculate Zonal Stats Masked... +print("Step 3: Calculating Zonal Stats with Masks... \n") +zonal_stats_masked(cn.aois_folder, cn.input_folder, cn.mask_output_folder, cn.outputs_folder) + +# Execute Calculate Zonal Stats Annualized... +print("Step 4: Calculating Zonal Stats Annualized... \n") +zonal_stats_annualized(cn.tcl_clip_folder, cn.input_folder, cn.mask_output_folder, cn.annual_folder) + +# Execute Zonal Stats Clean... +print("Step 5: Cleaning Zonal Stats...
\n") +zonal_stats_clean() \ No newline at end of file diff --git a/analyses/CarbonFluxQA/constants_and_names.py b/analyses/CarbonFluxQA/constants_and_names.py new file mode 100644 index 00000000..19ab2448 --- /dev/null +++ b/analyses/CarbonFluxQA/constants_and_names.py @@ -0,0 +1,115 @@ +import arcpy +import os + +##################################################################################### +# USER INPUTS +##################################################################################### +# Set the working directory to the folder which contains the AOIS subfolder +working_directory = r"C:\GIS\carbon_model\CarbonFlux_QA_v1.3.1" + +# Whether you want to overwrite previous arcpy outputs +overwrite_arcgis_output = True + +# With each model update, change loss years and model_run_date + # loss_years = number of years of tree cover loss (if input loss raster is changed, this must be changed, too) + # model_run_date = s3 folder where per-pixel outputs from most recent model run are located +loss_years = 22 +model_run_date = '20231114' + +# List of tile_ids to process (change according to which tiles overlap with your AOIS shapefiles) +tile_list = ['00N_110E', '20N_020W'] + +# Dictionary to cross-reference countries to their tile_id and GADM boundaries +tile_dictionary = {"IDN": "00N_110E", + "GMB": "20N_020W"} + +# Choose which extent to use for emission, removal, and net flux zonal stats + # options = 'forest', 'full', or 'both' +extent = 'full' + +# List of tree cover density thresholds to mask by +tcd_threshold = [0, 30, 75] +gain = True + +# Flag to save intermediate masks during create_masks() +save_intermediates = False + +##################################################################################### +# DEFAULT INPUTS +##################################################################################### + +# Setting the arcpy environ workspace +arcpy.env.workspace = working_directory +arcpy.env.overwriteOutput = overwrite_arcgis_output + +# Directories to be created/ checked +aois_folder = os.path.join(arcpy.env.workspace,"AOIS") +input_folder = os.path.join(arcpy.env.workspace,"Input") +mask_folder = os.path.join(arcpy.env.workspace,"Mask") +mask_input_folder = os.path.join(arcpy.env.workspace,"Mask", "Inputs") +mask_output_folder = os.path.join(arcpy.env.workspace,"Mask", "Mask") +gain_folder = os.path.join(mask_input_folder, "Gain") +mangrove_folder = os.path.join(mask_input_folder, "Mangrove") +plantations_folder = os.path.join(mask_input_folder, "Pre_2000_Plantations") +tcd_folder = os.path.join(mask_input_folder, "TCD") +whrc_folder = os.path.join(mask_input_folder, "WHRC") +outputs_folder = os.path.join(arcpy.env.workspace, "Outputs") +csv_folder = os.path.join(outputs_folder, "CSV") +annual_folder = os.path.join(outputs_folder, "Annual") +tcl_folder = os.path.join(arcpy.env.workspace, "TCL") +tcl_input_folder = os.path.join(tcl_folder, "Inputs") +tcl_clip_folder = os.path.join(tcl_folder, "Clip") + +# Filepath prefix for tile download step +s3_base_dir = 's3://gfw2-data/climate/carbon_model/' + +## Input folder s3 filepath informaiton +# Gross emissions per pixel in forest extent +gross_emis_forest_extent_s3_path = os.path.join(s3_base_dir, f'gross_emissions/all_drivers/all_gases/biomass_soil/standard/forest_extent/per_pixel/{model_run_date}/') +gross_emis_forest_extent_s3_pattern = f'gross_emis_all_gases_all_drivers_Mg_CO2e_pixel_biomass_soil_forest_extent_2001_{loss_years}' + +# Gross emissions per pixel in all pixels +gross_emis_full_extent_s3_path = 
os.path.join(s3_base_dir, f'gross_emissions/all_drivers/all_gases/biomass_soil/standard/full_extent/per_pixel/{model_run_date}/') +gross_emis_full_extent_s3_pattern = f'gross_emis_all_gases_all_drivers_Mg_CO2e_pixel_biomass_soil_full_extent_2001_{loss_years}' + +# Gross removals per pixel in forest extent +gross_removals_forest_extent_s3_path = os.path.join(s3_base_dir, f'gross_removals_AGCO2_BGCO2_all_forest_types/standard/forest_extent/per_pixel/{model_run_date}/') +gross_removals_forest_extent_s3_pattern = f'gross_removals_AGCO2_BGCO2_Mg_pixel_all_forest_types_forest_extent_2001_{loss_years}' + +# Gross removals per pixel in all pixels +gross_removals_full_extent_s3_path = os.path.join(s3_base_dir, f'gross_removals_AGCO2_BGCO2_all_forest_types/standard/full_extent/per_pixel/{model_run_date}/') +gross_removals_full_extent_s3_pattern = f'gross_removals_AGCO2_BGCO2_Mg_pixel_all_forest_types_full_extent_2001_{loss_years}' + +# Net flux per pixel in forest extent +netflux_forest_extent_s3_path = os.path.join(s3_base_dir, f'net_flux_all_forest_types_all_drivers/biomass_soil/standard/forest_extent/per_pixel/{model_run_date}/') +netflux_forest_extent_s3_pattern = f'net_flux_Mg_CO2e_pixel_biomass_soil_forest_extent_2001_{loss_years}' + +# Net flux per pixel in all pixels +netflux_full_extent_s3_path = os.path.join(s3_base_dir, f'net_flux_all_forest_types_all_drivers/biomass_soil/standard/full_extent/per_pixel/{model_run_date}/') +netflux_full_extent_s3_pattern = f'net_flux_Mg_CO2e_pixel_biomass_soil_full_extent_2001_{loss_years}' + +## Mask, Inputs folder s3 filepath informaiton +# Hansen removals tiles based on canopy height (2000-2020) +gain_s3_path = 's3://gfw-data-lake/umd_tree_cover_gain_from_height/v202206/raster/epsg-4326/10/40000/gain/geotiff/' +gain_s3_pattern = '' +gain_local_pattern = 'tree_cover_gain_2000_2020' + +# Woods Hole aboveground biomass 2000 version 4 tiles +whrc_s3_path = 's3://gfw2-data/climate/WHRC_biomass/WHRC_V4/Processed/' +whrc_s3_pattern = "t_aboveground_biomass_ha_2000" + +# Tree cover density 2000 tiles +tcd_s3_path = 's3://gfw2-data/forest_cover/2000_treecover/' +tcd_s3_pattern = 'Hansen_GFC2014_treecover2000' + +# Processed mangrove aboveground biomass in the year 2000 +mangrove_s3_path = os.path.join(s3_base_dir, 'mangrove_biomass/processed/standard/20190220/') +mangrove_s3_pattern = 'mangrove_agb_t_ha_2000' + +# Pre-2000 plantations +plantation_s3_path = os.path.join(s3_base_dir, 'other_emissions_inputs/IDN_MYS_plantation_pre_2000/processed/20200724/') +plantation_s3_pattern = 'plantation_2000_or_earlier_processed' + +# Annual Hansen loss tiles (2001-2022) +loss_s3_path = 's3://gfw2-data/forest_change/hansen_2022/' +loss_s3_pattern = 'GFW2022' \ No newline at end of file diff --git a/analyses/CarbonFluxQA/funcs.py b/analyses/CarbonFluxQA/funcs.py new file mode 100644 index 00000000..d40f5544 --- /dev/null +++ b/analyses/CarbonFluxQA/funcs.py @@ -0,0 +1,566 @@ +import os +import arcpy +import pandas as pd +import logging +import re +from subprocess import Popen, PIPE, STDOUT +import constants_and_names as cn + +######################################################################################################################## +# Functions to run modules +######################################################################################################################## +def download_files(): + # Step 1: Checking to see if the AOIS folder exists and if it contains a shapefile + print("Step 1.1: Checking to see if AOIS folder exists and contains a 
shapefile") + check_aois(cn.aois_folder) + + # Step 2: Create Input folder and subfolder for each tile in tile_list + print("Step 1.2: Creating Input folder structure") + create_tile_folders(cn.tile_list, cn.input_folder) + + # Step 3: Create Mask folder with Inputs subfolder (gain, mangrove, pre-200 plantations, and tree cover density) and + # Mask subfolder (folder for each tile in tile_list) + print("Step 1.3: Creating Mask folder structure") + create_subfolders([cn.mask_input_folder, cn.gain_folder, cn.mangrove_folder, cn.plantations_folder, cn.tcd_folder, cn.whrc_folder]) + create_tile_folders(cn.tile_list, cn.mask_output_folder) + + # Step 4: Create Output folder with Annual folder, CSV folder, and subfolder for each tile in tile_list + print("Step 1.4: Creating Output folder structure") + create_subfolders([cn.csv_folder, cn.annual_folder]) + create_tile_folders(cn.tile_list, cn.outputs_folder) + + # Step 5: Create TCL folder structure + print("Step 1.5: Creating TCL folder structure") + create_subfolders([cn.tcl_input_folder, cn.tcl_clip_folder]) + + # Step 6: Download emission/removal/netflux tiles (3 - 6 per tile) to Input folder + print("Step 1.6: Downloading files for Input folder") + if cn.extent == 'full' or cn.extent == 'both': + s3_flexible_download(cn.tile_list, cn.gross_emis_full_extent_s3_path, cn.gross_emis_full_extent_s3_pattern, cn.input_folder) + s3_flexible_download(cn.tile_list, cn.gross_removals_full_extent_s3_path, cn.gross_removals_full_extent_s3_pattern, cn.input_folder) + s3_flexible_download(cn.tile_list, cn.netflux_full_extent_s3_path, cn.netflux_full_extent_s3_pattern, cn.input_folder) + if cn.extent == 'forest' or cn.extent == 'both': + s3_flexible_download(cn.tile_list, cn.gross_emis_forest_extent_s3_path, cn.gross_emis_forest_extent_s3_pattern, cn.input_folder) + s3_flexible_download(cn.tile_list, cn.gross_removals_forest_extent_s3_path, cn.gross_removals_forest_extent_s3_pattern, cn.input_folder) + s3_flexible_download(cn.tile_list, cn.netflux_forest_extent_s3_path, cn.netflux_forest_extent_s3_pattern, cn.input_folder) + + # Step 7: Download Gain, Mangrove, Pre_2000_Plantations, TCD, and WHRC subfolders for each tile to Mask, Inputs subfolders + print("Step 1.7: Downloading files for Mask/Inputs folder") + s3_flexible_download(cn.tile_list, cn.gain_s3_path, cn.gain_s3_pattern, cn.gain_folder, cn.gain_local_pattern) + s3_flexible_download(cn.tile_list, cn.mangrove_s3_path, cn.mangrove_s3_pattern, cn.mangrove_folder) + s3_flexible_download(cn.tile_list, cn.plantation_s3_path, cn.plantation_s3_pattern, cn.plantations_folder) + s3_flexible_download(cn.tile_list, cn.tcd_s3_path, cn.tcd_s3_pattern, cn.tcd_folder) + s3_flexible_download(cn.tile_list, cn.whrc_s3_path, cn.whrc_s3_pattern, cn.whrc_folder) + + # Step 8: Download TCL tiles to TCL, Inputs folder + print("Step 1.8: Downloading files for TCL/Inputs folder") + s3_flexible_download(cn.tile_list, cn.loss_s3_path, cn.loss_s3_pattern, cn.tcl_input_folder) + +def create_masks(tcd_threshold, gain, save_intermediates): + # Get a list of tcd tiles in the tcd folder + tcd_list = pathjoin_files_in_directory(cn.tcd_folder, '.tif') + for tcd in tcd_list: + tile_id = get_tile_id(get_raster_name(tcd)) + mask_tiles = os.path.join(cn.mask_output_folder, tile_id) + process_raster(tile_id, tcd, mask_tiles, tcd_threshold, gain, save_intermediates) + +def zonal_stats_masked(aois_folder, input_folder, mask_outputs_folder, outputs_folder): + aoi_list = pathjoin_files_in_directory(aois_folder, '.shp') + for aoi in aoi_list: 
+ aoi_name = get_raster_name(aoi) + print(f"Now processing {aoi_name}:") + tile_id = get_tile_id_from_country(get_country_id(aoi_name)) + raster_folder = os.path.join(input_folder, tile_id) + raster_list = pathjoin_files_in_directory(raster_folder, '.tif') + mask_tiles = os.path.join(mask_outputs_folder, tile_id) + mask_list = pathjoin_files_in_directory(mask_tiles, '.tif') + output_folder = os.path.join(outputs_folder, tile_id) + process_zonal_statistics(aoi, aoi_name, raster_list, mask_list, output_folder, "GID_0") + +def zonal_stats_annualized(tcl_clip_folder, input_folder, mask_outputs_folder, annual_folder): + tcl_list = pathjoin_files_in_directory(tcl_clip_folder, '.tif') + if len(tcl_list) < 1: + print("Clipping TCL tiles to GADM boundaries") + clip_tcl_to_gadm(cn.tcl_input_folder, cn.tcl_clip_folder) + tcl_list = pathjoin_files_in_directory(tcl_clip_folder, '.tif') + else: + print(f"Found {len(tcl_list)} clipped TCL rasters.") + + for tcl in tcl_list: + tcl_name = get_raster_name(tcl) + tcl_raster = arcpy.Raster(tcl) + tcl_raster = arcpy.sa.SetNull(tcl_raster == 0, tcl_raster) + print(f"Now processing {tcl_name}.tif:") + tile_id = get_tile_id(tcl) + raster_folder = os.path.join(input_folder, tile_id) + raster_list = [os.path.join(raster_folder, f) for f in os.listdir(raster_folder) if "emis" in f and f.endswith('tif')] + mask_tiles = os.path.join(mask_outputs_folder, tile_id) + mask_list = pathjoin_files_in_directory(mask_tiles, '.tif') + process_zonal_statistics(tcl_raster, tcl_name, raster_list, mask_list, annual_folder, "Value") + +def zonal_stats_clean(): + # Initialize an empty data frame to store the data + df = pd.DataFrame() + + # Create a list of all masked zonal stats output folders (one for each tile_id) + masked_input_folders = [] + for tile in cn.tile_list: + masked_input_folders.append(os.path.join(cn.outputs_folder, tile)) + + # Combine masked zonal stats output csvs + masked_output = clean_zonal_stats_csv(masked_input_folders, df) + + # Clean masked output dataframe + masked_output.drop(['ZONE_CODE'], axis=1, inplace=True) + masked_output.rename(columns={"GID_0": "Country", "SUM": "Sum"}, inplace = True) + + # Create a list of all annual zonal stats output folders + annual_input_folders = [cn.annual_folder] + + # Combine annual zonal stats outputs csvs + annual_output = clean_zonal_stats_csv(annual_input_folders, df) + annual_output.reset_index(inplace=True) + + # Clean and pivot annual output stats + annual_output["VALUE"] = annual_output["VALUE"] + 2000 + annual_output = annual_output.pivot(columns="VALUE", values="SUM", index="File") + annual_output.reset_index(inplace=True) + + # Join annual output zonal stats to masked zonal stats + annual_output["File"] = annual_output["File"].apply(lambda x: x.removeprefix("TCL_annualized_")) + final_output = masked_output.set_index('File').join(annual_output.set_index('File'), on = "File") + final_output.reset_index(inplace=True) + + # Make sure emission sums match + final_output["Annual_Sum"] =final_output.loc[:,2001:].sum(axis=1, min_count=1) + final_output["Match"] = final_output["Sum"].round() == final_output["Annual_Sum"].round() + + # Define the output location + output_path = os.path.join(cn.csv_folder, "final_output.csv") + + # Export the data frame as a CSV file + final_output.to_csv(output_path, index=False) + +####################################################################################################################### +# Utility functions 
+####################################################################################################################### +def check_aois(aois_folder): + # Checking to see if the AOIS folder exists + if os.path.isdir(aois_folder): + print(f" Success: {aois_folder} exists.") + # Checking to see if the AOIS folder has any shapefiles + aoi_list = pathjoin_files_in_directory(aois_folder, ".shp") + if len(aoi_list) >= 1: + print(f" Success: {aois_folder} contains {len(aoi_list)} shapefiles.") + else: + raise Exception(f" Failure: {aois_folder} does not contain a shapefile.") + else: + raise Exception(f" Failure: {aois_folder} does not exist.") + +def folder_check(folder): + if os.path.isdir(folder): + print(f" Option 1 success: {folder} exists.") + else: + os.makedirs(folder) + if os.path.isdir(folder): + print(f" Option 2 success: {folder} successfully created.") + else: + raise Exception(f" Option 2 failure: {folder} could not be created.") + +def create_tile_folders(tile_list, input_folder): + for tile in tile_list: + tile_id_folder = os.path.join(input_folder, tile) + folder_check(tile_id_folder) + +def create_subfolders(folder_list): + for subfolder in folder_list: + folder_check(subfolder) + +def list_files_in_directory(directory, file_extension): + return [f for f in os.listdir(directory) if f.endswith(file_extension)] + +def pathjoin_files_in_directory(directory, file_extension): + return [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(file_extension)] + +def get_tile_id(tile_name): + tile_id = re.search("[0-9]{2}[A-Z][_][0-9]{3}[A-Z]", tile_name).group() + return tile_id + +def get_country_id(tile_name): + country = re.search("[_][A-Z]{3}[_]", tile_name).group() + country_id = country.split("_")[1] + return country_id + +def get_country_id_from_tile_id(tile_id): + for key, value in cn.tile_dictionary.items(): + if value == tile_id: + return key + +def get_tile_id_from_country(country): + for key, value in cn.tile_dictionary.items(): + if key == country: + return value + +def get_raster_name(raster): + return os.path.splitext(os.path.basename(raster))[0] + +def get_gadm_boundary(country): + for f in pathjoin_files_in_directory(cn.aois_folder, ".shp"): + if country in f: + return f + +def clip_to_gadm(country, input_raster, output_raster): + clip_feature = get_gadm_boundary(country) + no_data_value = arcpy.Raster(input_raster).noDataValue + print(f' Saving {output_raster}') + clipped_raster = arcpy.management.Clip(input_raster, "#", output_raster, clip_feature, no_data_value, "ClippingGeometry", "MAINTAIN_EXTENT") + print(f' Successfully finished') + +def clip_tcl_to_gadm(input_folder, output_folder): + print(f' Option 1: Checking if clipped TCL tiles already exist...') + tcl_list = list_files_in_directory(input_folder, ".tif") + if len(tcl_list) >= 1: + for raster in tcl_list: + raster_name = get_raster_name(raster) + tile_id = get_tile_id(raster) + country = get_country_id_from_tile_id(tile_id) + + input_raster = os.path.join(input_folder, raster) + output_raster = os.path.join(output_folder, f'{raster_name}_{country}_clip.tif') + + if os.path.exists(output_raster): + print(f" Option 1 success: Tile {output_raster} already exists.") + else: + print(f' Option 1 failure: Tile {output_raster} does not already exists."') + print(f' Option 2: Clipping TCL tile to GADM boundary') + clip_to_gadm(country, input_raster, output_raster) + + if os.path.exists(output_raster): + print(f' Option 2 success: Tile {output_raster} successfully created') + else: + print(f' Option 2 
failure: Tile {output_raster} was not successfully created') + else: + print(f' Option 1 failure: {input_folder} does not contain any TCL tiles. Make sure TCL tiles have been downloaded.') + + +def or_mask_logic(raster1, raster2, raster1_value=None, raster2_value=None): + if raster1_value: + raster1_mask = arcpy.sa.Con(arcpy.Raster(raster1) > raster1_value, 1, 0) + else: + raster1_mask = raster1 + if raster2_value: + raster2_mask = arcpy.sa.Con(arcpy.Raster(raster2) > raster2_value, 1, 0) + else: + raster2_mask = raster2 + r1_and_r2_mask = arcpy.ia.Merge([raster2_mask, raster1_mask], "SUM") + output_mask = arcpy.sa.Con(arcpy.Raster(r1_and_r2_mask) > 0, 1, 0) + return output_mask + + +def and_mask_logic(raster1, raster2, raster1_value=None, raster2_value=None): + if raster1_value: + raster1_mask = arcpy.sa.Con(arcpy.Raster(raster1) > raster1_value, 1, 0) + else: + raster1_mask = raster1 + if raster2_value: + raster2_mask = arcpy.sa.Con(arcpy.Raster(raster2) > raster2_value, 1, 0) + else: + raster2_mask = raster2 + r1_and_r2_mask = arcpy.sa.Times(raster1_mask, raster2_mask) + output_mask = arcpy.sa.Con(arcpy.Raster(r1_and_r2_mask) > 0, 1, 0) + return output_mask + +def process_raster(tile_id, tcd, mask_tiles, tcd_threshold, gain, save_intermediates): + #Paths to Mask, Input files + gain_raster_path = os.path.join(cn.gain_folder, f'{tile_id}_{cn.gain_local_pattern}.tif') + whrc_raster_path = os.path.join(cn.whrc_folder, f'{tile_id}_{cn.whrc_s3_pattern}.tif') + mangrove_raster_path = os.path.join(cn.mangrove_folder, f'{tile_id}_{cn.mangrove_s3_pattern}.tif') + plantation_raster_path = os.path.join(cn.plantations_folder, f'{tile_id}_{cn.plantation_s3_pattern}.tif') + print(f'Creating masks for {tile_id}: ') + + for tcd_val in tcd_threshold: + #Read in the plantation raster and mask before saving each intermediate + if os.path.exists(plantation_raster_path): + plantation_raster = arcpy.sa.IsNull(arcpy.Raster(plantation_raster_path)) + + # Conditional logic for where TCD AND biomass + tcd_whrc_mask = and_mask_logic(tcd, whrc_raster_path, tcd_val, 0) + mask_path_tcd = os.path.join(mask_tiles, f'{tile_id}_tcd{tcd_val}') + + if save_intermediates == True: + # Conditional logic for TCD AND biomass NOT Pre-2000 Plantation + if os.path.exists(plantation_raster_path): + tcd_noplantation_mask = and_mask_logic(tcd_whrc_mask, plantation_raster) + mask_path_tcd_noplantation = f'{mask_path_tcd}_notPlantation' + + # Saving the tcd_noplantation mask + print(f' Saving {mask_path_tcd_noplantation}.tif') + tcd_noplantation_mask = arcpy.sa.SetNull(tcd_noplantation_mask == 0, tcd_noplantation_mask) + tcd_noplantation_mask.save(f'{mask_path_tcd_noplantation}.tif') + print(f' Successfully finished') + + else: + # Saving the tcd mask + print(f' Saving {mask_path_tcd}.tif') + tcd_whrc_mask = arcpy.sa.SetNull(tcd_whrc_mask == 0, tcd_whrc_mask) + tcd_whrc_mask.save(f'{mask_path_tcd}.tif') + print(f' Successfully finished') + + if gain == True: + # Conditional logic for TCD AND biomass OR gain + tcd_gain_mask = or_mask_logic(gain_raster_path, tcd_whrc_mask, 0) + mask_path_tcd_gain = f'{mask_path_tcd}_gain' + + if save_intermediates == True: + # Conditional logic for TCD AND biomass OR gain NOT Pre-2000 Plantation + if os.path.exists(plantation_raster_path): + tcd_gain_noplantation_mask = and_mask_logic(tcd_gain_mask, plantation_raster) + mask_path_tcd_gain_noplantation = f'{mask_path_tcd_gain}_notPlantation' + + # Saving the tcd_gain_noplantation mask + print(f' Saving {mask_path_tcd_gain_noplantation}.tif') + 
tcd_gain_noplantation_mask = arcpy.sa.SetNull(tcd_gain_noplantation_mask == 0, tcd_gain_noplantation_mask) + tcd_gain_noplantation_mask.save(f'{mask_path_tcd_gain_noplantation}.tif') + print(f' Successfully finished') + + else: + # Saving the tcd_gain mask + print(f' Saving {mask_path_tcd_gain}.tif') + tcd_gain_mask = arcpy.sa.SetNull(tcd_gain_mask == 0, tcd_gain_mask) + tcd_gain_mask.save(f'{mask_path_tcd_gain}.tif') + print(f' Successfully finished') + + else: + mask_path_tcd_gain = mask_path_tcd + tcd_gain_mask = tcd_whrc_mask + + if os.path.exists(mangrove_raster_path): + # Conditional logic for TCD AND biomass OR gain OR mangrove + mangrove_raster = arcpy.sa.Con(arcpy.Raster(mangrove_raster_path) > 0, 1, 0) + tcd_gain_mangrove_raster = arcpy.ia.Merge([tcd_gain_mask, mangrove_raster], "SUM") + tcd_gain_mangrove_mask = arcpy.sa.Con(arcpy.Raster(tcd_gain_mangrove_raster) > 0, 1, 0) + mask_path_tcd_gain_mangrove = f'{mask_path_tcd_gain}_mangrove' + + # Conditional logic for TCD AND biomass OR gain OR mangrove NOT Pre-2000 Plantation + if os.path.exists(plantation_raster_path): + tcd_gain_mangrove_noplantation_raster = arcpy.sa.Times(tcd_gain_mangrove_mask, plantation_raster) + tcd_gain_mangrove_noplantation_mask = arcpy.sa.Con(arcpy.Raster(tcd_gain_mangrove_noplantation_raster) > 0, 1, 0) + mask_path_tcd_gain_mangrove_noplantation = f'{mask_path_tcd_gain_mangrove}_notPlantation' + + # Saving the tcd_gain_mangrove_noplantation mask + print(f' Saving {mask_path_tcd_gain_mangrove_noplantation}.tif') + tcd_gain_mangrove_noplantation_mask = arcpy.sa.SetNull(tcd_gain_mangrove_noplantation_mask == 0, tcd_gain_mangrove_noplantation_mask) + tcd_gain_mangrove_noplantation_mask.save(f'{mask_path_tcd_gain_mangrove_noplantation}.tif') + print(f' Successfully finished') + + else: + # Saving tcd_gain_mangrove mask + print(f'Saving {mask_path_tcd_gain_mangrove}.tif') + tcd_gain_mangrove_mask = arcpy.sa.SetNull(tcd_gain_mangrove_mask == 0, tcd_gain_mangrove_mask) + tcd_gain_mangrove_mask.save(f'{mask_path_tcd_gain_mangrove}.tif') + print(f' Successfully finished') + +def process_zonal_statistics(aoi, aoi_name, raster_list, mask_list, output_folder, field): + for raster in raster_list: + raster_name = get_raster_name(raster) + raster_obj = arcpy.Raster(raster) + print(f'Calculating zonal statistics for {raster_name}.tif') + + for mask in mask_list: + mask_path = get_raster_name(mask) + mask_name = mask_path.split("_", 2)[2] + mask_obj = arcpy.Raster(mask) + + # Check if spatial references are the same + if (raster_obj.spatialReference.name == mask_obj.spatialReference.name): + print(f' Masking {raster_name}.tif with {mask_name}.tif') + if field == "GID_0": + output_name = "{}_{}.dbf".format(get_country_id(aoi_name), str(raster_name) + "_" + str(mask_name)) + csv_file = "{}_{}.csv".format(get_country_id(aoi_name), str(raster_name) + "_" + str(mask_name)) + elif field == "Value": + output_name = "{}_{}.dbf".format("TCL_annualized" + "_" + str(get_country_id(aoi_name)), str(raster_name) + "_" + str(mask_name)) + csv_file = "{}_{}.csv".format("TCL_annualized" + "_" + str(get_country_id(aoi_name)), str(raster_name) + "_" + str(mask_name)) + output_path = os.path.join(output_folder, output_name) + + masked_raster = arcpy.sa.Times(raster_obj, mask_obj) + arcpy.gp.ZonalStatisticsAsTable_sa(aoi, field, masked_raster, output_path, "DATA", "SUM") + arcpy.TableToTable_conversion(output_path, output_folder, csv_file) + print(f' Successfully finished') + + else: + print(f"Spatial references or extents do not match for 
{raster} and {mask_name}") + +def clean_zonal_stats_csv(input_folders, df): + for folder in input_folders: + # Loop through the files in each folder + for file in os.listdir(folder): + if file.endswith(".csv"): + # Load the csv file into a pandas data frame + csv_df = pd.read_csv(os.path.join(folder, file)) + + # Add column with name of the file + csv_df["File"] = file + + # Define type of calc + if "emis" in file: + type = "gross emissions" + elif "removals" in file: + type = "gross removals" + else: + type = "net flux" + csv_df["Type"] = type + + # Define extent of calc + if "forest_extent" in file: + extent = "forest extent" + else: + extent = "full extent" + csv_df["Extent"] = extent + + # Define tcd threshold + tcd = re.match(r'.*tcd([0-9]+).*', file) + if tcd is not None: + csv_df["Density"] = tcd.group(1) + else: + csv_df["Density"] = 'NA' + + # Define mask of calc + if "mangrove" in file: + mask = "tcd, gain, mangrove" + elif "gain" in file: + mask = "tcd, gain" + elif "tcd" in file: + mask = "tcd" + else: + mask = "no mask" + if "notPlantation" in file: + mask = f'{mask}, NOT plantation' + csv_df["Mask"] = mask + + # Drop all other fields + assert isinstance(csv_df, object) + csv_df.drop(['OID_', 'COUNT', 'AREA'], axis=1, inplace=True) + + # Append the data to the main data frame + df = pd.concat([df, csv_df], axis=0) + + return(df) + +####################################################################################################################### +# AWS S3 file download utilities +####################################################################################################################### +def s3_flexible_download(tile_id_list, s3_dir, s3_pattern, local_dir, local_pattern = ''): + + # Creates a full download name (path and file) + for tile_id in tile_id_list: + if s3_pattern in [cn.tcd_s3_pattern, cn.loss_s3_pattern]: + source = f'{s3_dir}{s3_pattern}_{tile_id}.tif' + elif s3_pattern in [cn.gain_s3_pattern]: + source = f'{s3_dir}{tile_id}.tif' + else: # For every other type of tile + source = f'{s3_dir}{tile_id}_{s3_pattern}.tif' + + if s3_pattern in [cn.gross_emis_forest_extent_s3_pattern, cn.gross_emis_full_extent_s3_pattern, cn.gross_removals_forest_extent_s3_pattern, cn.gross_removals_full_extent_s3_pattern, cn.netflux_forest_extent_s3_pattern, cn.netflux_full_extent_s3_pattern]: + dir = os.path.join(local_dir, tile_id) + else: + dir = local_dir + + s3_file_download(source, dir, local_pattern) + +def s3_file_download(source, dest, pattern=''): + # Retrieves the s3 directory and name of the tile from the full path name + dir = get_tile_dir(source) + file_name = get_tile_name(source) + + try: + tile_id = get_tile_id(file_name) + except: + pass + + # Special download procedures for tree cover gain because the tiles have no pattern, just an ID. + # Tree cover gain tiles are renamed as their downloaded to get a pattern added to them. 
+ if dir == cn.gain_s3_path[:-1]: # Delete last character of gain_dir because it has the terminal / while dir does not have terminal / + local_file_name = f'{tile_id}_{pattern}.tif' + print(f' Option 1: Checking if {local_file_name} is already downloaded...') + if os.path.exists(os.path.join(dest, local_file_name)): + print(f' Option 1 success: {os.path.join(dest, local_file_name)} already downloaded', "\n") + return + else: + print(f' Option 1 failure: {local_file_name} is not already downloaded.') + print(f' Option 2: Checking for tile {source} on s3...') + + # If the tile isn't already downloaded, download is attempted + # source = os.path.join(dir, file_name) + source = f'{dir}/{file_name}' + local_folder = os.path.join(dest, local_file_name) + + # cmd = ['aws', 's3', 'cp', source, dest, '--no-sign-request', '--only-show-errors'] + cmd = ['aws', 's3', 'cp', source, local_folder, + '--request-payer', 'requester', '--only-show-errors'] + log_subprocess_output_full(cmd) + + if os.path.exists(os.path.join(dest, local_file_name)): + print_log(f' Option 2 success: Tile {source} found on s3 and downloaded', "\n") + return + else: + print_log( + f' Option 2 failure: Tile {source} not found on s3. Tile not found but it seems it should be. Check file paths and names.', "\n") + + # All other tiles besides tree cover gain + else: + print_log(f' Option 1: Checking if {file_name} is already downloaded...') + if os.path.exists(os.path.join(dest, file_name)): + print_log(f' Option 1 success: {os.path.join(dest, file_name)} already downloaded', "\n") + return + else: + print_log(f' Option 1 failure: {file_name} is not already downloaded.') + print_log(f' Option 2: Checking for tile {source} on s3...') + + # If the tile isn't already downloaded, download is attempted + #source = os.path.join(dir, file_name) + source = f'{dir}/{file_name}' + + # cmd = ['aws', 's3', 'cp', source, dest, '--no-sign-request', '--only-show-errors'] + cmd = ['aws', 's3', 'cp', source, dest, '--only-show-errors'] + log_subprocess_output_full(cmd) + if os.path.exists(os.path.join(dest, file_name)): + print_log(f' Option 2 success: Tile {source} found on s3 and downloaded', "\n") + return + else: + print_log(f' Option 2 failure: Tile {source} not found on s3. Tile not found but it seems it should be. Check file paths and names.', "\n") + +# Gets the directory of the tile +def get_tile_dir(tile): + tile_dir = os.path.split(tile)[0] + return tile_dir + +def get_tile_name(tile): + tile_name = os.path.split(tile)[1] + return tile_name + +def log_subprocess_output_full(cmd): + # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging + process = Popen(cmd, stdout=PIPE, stderr=STDOUT) + pipe = process.stdout + with pipe: + # Reads all the output into a string + for full_out in iter(pipe.readline, b''): # b"\n"-separated lines + # Separates the string into an array, where each entry is one line of output + line_array = full_out.splitlines() + # For reasons I don't know, the array is backwards, so this prints it out in reverse (i.e. 
correct) order + for line in reversed(line_array): + logging.info(line.decode( + "utf-8")) # https://stackoverflow.com/questions/37016946/remove-b-character-do-in-front-of-a-string-literal-in-python-3, answer by krock + print(line.decode( + "utf-8")) # https://stackoverflow.com/questions/37016946/remove-b-character-do-in-front-of-a-string-literal-in-python-3, answer by krock + +def print_log(*args): + # Empty string + full_statement = str(object='') + # Concatenates all individuals strings to the complete line to print + for arg in args: + full_statement = full_statement + str(arg) + " " + logging.info(full_statement) + # Prints to console + print("LOG: " + full_statement) + diff --git a/analyses/download_tile_set.py b/analyses/download_tile_set.py index b4d5930d..a6ac9851 100644 --- a/analyses/download_tile_set.py +++ b/analyses/download_tile_set.py @@ -29,7 +29,7 @@ def download_tile_set(tile_id_list): os.chdir(wd) download_dict = { - cn.gain_dir: [cn.pattern_gain_data_lake], + cn.gain_dir: [cn.pattern_data_lake], cn.loss_dir: [cn.pattern_loss], cn.tcd_dir: [cn.pattern_tcd], cn.WHRC_biomass_2000_unmasked_dir: [cn.pattern_WHRC_biomass_2000_unmasked], diff --git a/analyses/mp_derivative_outputs.py b/analyses/mp_derivative_outputs.py index 64fcd654..f681171f 100644 --- a/analyses/mp_derivative_outputs.py +++ b/analyses/mp_derivative_outputs.py @@ -105,7 +105,7 @@ def mp_derivative_outputs(tile_id_list): uu.s3_flexible_download(cn.pixel_area_dir, cn.pattern_pixel_area, cn.docker_tile_dir, cn.SENSIT_TYPE, tile_id_list_outer) # Tree cover density, Hansen gain, and mangrove biomass tiles-- necessary for masking to forest extent uu.s3_flexible_download(cn.tcd_dir, cn.pattern_tcd, cn.docker_tile_dir, cn.SENSIT_TYPE, tile_id_list_outer) - uu.s3_flexible_download(cn.gain_dir, cn.pattern_gain_data_lake, cn.docker_tile_dir, cn.SENSIT_TYPE, tile_id_list_outer) + uu.s3_flexible_download(cn.gain_dir, cn.pattern_data_lake, cn.docker_tile_dir, cn.SENSIT_TYPE, tile_id_list_outer) uu.s3_flexible_download(cn.mangrove_biomass_2000_dir, cn.pattern_mangrove_biomass_2000, cn.docker_tile_dir, cn.SENSIT_TYPE, tile_id_list_outer) uu.s3_flexible_download(cn.plant_pre_2000_processed_dir, cn.pattern_plant_pre_2000, cn.docker_tile_dir, cn.SENSIT_TYPE, tile_id_list_outer) diff --git a/analyses/mp_tile_statistics.py b/analyses/mp_tile_statistics.py index 3221c836..579b56b5 100644 --- a/analyses/mp_tile_statistics.py +++ b/analyses/mp_tile_statistics.py @@ -57,7 +57,7 @@ def mp_tile_statistics(sensit_type, tile_id_list): # # Planted forest removals # cn.annual_gain_AGC_BGC_planted_forest_unmasked_dir: [cn.pattern_annual_gain_AGC_BGC_planted_forest_unmasked], # 15 = 600 GB peak # cn.planted_forest_type_unmasked_dir: [cn.pattern_planted_forest_type_unmasked], # 15 = 360 GB peak - # cn.stdev_annual_gain_AGC_BGC_planted_forest_unmasked_dir: [cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest_unmasked], # 15 = 600 GB peak + # cn.stdev_annual_gain_AGC_planted_forest_dir: [cn.pattern_stdev_annual_gain_AGC_planted_forest], # 15 = 600 GB peak # # # US forest removals # cn.FIA_regions_processed_dir: [cn.pattern_FIA_regions_processed], # 15 = 350 GB peak diff --git a/carbon_pools/create_carbon_pools.py b/carbon_pools/create_carbon_pools.py index a4d27ce6..1a0ddafc 100644 --- a/carbon_pools/create_carbon_pools.py +++ b/carbon_pools/create_carbon_pools.py @@ -874,7 +874,7 @@ def create_total_C(tile_id, carbon_pool_extent): uu.print_log(f' Soil C 2000 tile not found for {tile_id}') kwargs = AGC_2000_src.meta - 
kwargs.update(driver='GTiff', count=1, compress='DEFLATE', nodata=0) + kwargs.update(driver='GTiff', count=1, compress='DEFLATE', nodata=0, bigtiff='YES') windows = AGC_2000_src.block_windows(1) output_pattern_list = [cn.pattern_total_C_2000] if cn.SENSIT_TYPE != 'std': diff --git a/carbon_pools/mp_create_carbon_pools.py b/carbon_pools/mp_create_carbon_pools.py index a9652b5b..53c69563 100644 --- a/carbon_pools/mp_create_carbon_pools.py +++ b/carbon_pools/mp_create_carbon_pools.py @@ -39,7 +39,7 @@ def mp_create_carbon_pools(tile_id_list, carbon_pool_extent): """ :param tile_id_list: list of tile ids to process :param carbon_pool_extent: the pixels and years for which carbon pools are caculated: loss or 2000 - :return: set of tiles with each carbon pool density (Mg/ha): aboveground, belowground, dead wood, litter, soil, total + :return: sets of tiles with each carbon pool density (Mg/ha): aboveground, belowground, dead wood, litter, soil, total """ os.chdir(cn.docker_tile_dir) @@ -92,7 +92,7 @@ def mp_create_carbon_pools(tile_id_list, carbon_pool_extent): cn.precip_processed_dir: [cn.pattern_precip], cn.elevation_processed_dir: [cn.pattern_elevation], cn.soil_C_full_extent_2000_dir: [cn.pattern_soil_C_full_extent_2000], - cn.gain_dir: [cn.pattern_gain_data_lake], + cn.gain_dir: [cn.pattern_data_lake], cn.BGB_AGB_ratio_dir: [cn.pattern_BGB_AGB_ratio] } @@ -130,7 +130,7 @@ def mp_create_carbon_pools(tile_id_list, carbon_pool_extent): cn.precip_processed_dir: [cn.pattern_precip], cn.elevation_processed_dir: [cn.pattern_elevation], cn.soil_C_full_extent_2000_dir: [cn.pattern_soil_C_full_extent_2000], - cn.gain_dir: [cn.pattern_gain_data_lake], + cn.gain_dir: [cn.pattern_data_lake], cn.BGB_AGB_ratio_dir: [cn.pattern_BGB_AGB_ratio], cn.annual_gain_AGC_all_types_dir: [cn.pattern_annual_gain_AGC_all_types], cn.cumul_gain_AGCO2_all_types_dir: [cn.pattern_cumul_gain_AGCO2_all_types] diff --git a/carbon_pools/mp_create_soil_C.py b/carbon_pools/mp_create_soil_C.py deleted file mode 100644 index e26f24e1..00000000 --- a/carbon_pools/mp_create_soil_C.py +++ /dev/null @@ -1,302 +0,0 @@ -''' -This script creates tiles of soil carbon density, one of the carbon emitted_pools. -At this time, mineral soil carbon is for the top 30 cm of soil. -Mangrove soil carbon gets precedence over mineral soil carbon where there is mangrove biomass. -Mangrove soil C is limited to where mangrove AGB is. -Where there is no mangrove biomass, mineral soil C is used. -Peatland carbon is not recognized or involved in any way. -This is a convoluted way of doing this processing. Originally, I tried making mangrove soil tiles masked to -mangrove AGB tiles, then making a vrt of all those mangrove soil C tiles and the mineral soil raster, and then -using gdal_warp on that to get combined tiles. -However, for reasons I couldn't figure out, the gdalbuildvrt step in which I combined the mangrove 10x10 tiles -and the mineral soil raster never actually combined the mangrove tiles with the mineral soil raster; I just kept -getting mineral soil C values out. -So, I switched to this somewhat more convoluted method that uses both gdal and rasterio/numpy. -''' - -from subprocess import Popen, PIPE, STDOUT, check_call -from functools import partial -import multiprocessing -import datetime -import glob -import argparse -import os -import sys -import constants_and_names as cn -import universal_util as uu -from . 
import create_soil_C - -def mp_create_soil_C(tile_id_list, no_upload=None): - - os.chdir(cn.docker_tile_dir) - sensit_type = 'std' - - # If a full model run is specified, the correct set of tiles for the particular script is listed - if tile_id_list == 'all': - # List of tiles to run in the model - tile_id_list = uu.create_combined_tile_list( - [cn.WHRC_biomass_2000_unmasked_dir, cn.mangrove_biomass_2000_dir, cn.gain_dir], - sensit_type=cn.SENSIT_TYPE) - - uu.print_log(tile_id_list) - uu.print_log(f'There are {str(len(tile_id_list))} tiles to process', "\n") - - - # List of output directories and output file name patterns - output_dir_list = [cn.soil_C_full_extent_2000_non_mang_dir, cn.soil_C_full_extent_2000_dir, - cn.stdev_soil_C_full_extent_2000_dir] - output_pattern_list = [cn.pattern_soil_C_full_extent_2000_non_mang, cn.pattern_soil_C_full_extent_2000, - cn.pattern_stdev_soil_C_full_extent] - - - ### Soil carbon density - - uu.print_log("Downloading mangrove soil C rasters") - uu.s3_file_download(os.path.join(cn.mangrove_soil_C_dir, cn.name_mangrove_soil_C), cn.docker_tile_dir, sensit_type) - - # For downloading all tiles in the input folders. - input_files = [cn.mangrove_biomass_2000_dir] - - for input in input_files: - uu.s3_folder_download(input, cn.docker_tile_dir, sensit_type) - - # Download raw mineral soil C density tiles. - # First tries to download index.html.tmp from every folder, then goes back and downloads all the tifs in each folder - # Based on https://stackoverflow.com/questions/273743/using-wget-to-recursively-fetch-a-directory-with-arbitrary-files-in-it - # There are 12951 tiles and it takes about 3 hours to download them! - cmd = ['wget', '--recursive', '-nH', '--cut-dirs=6', '--no-parent', '--reject', 'index.html*', - '--accept', '*.tif', '{}'.format(cn.mineral_soil_C_url)] - uu.log_subprocess_output_full(cmd) - - uu.print_log("Unzipping mangrove soil C rasters...") - cmd = ['unzip', '-j', cn.name_mangrove_soil_C, '-d', cn.docker_tile_dir] - uu.log_subprocess_output_full(cmd) - - # Mangrove soil receives precedence over mineral soil - uu.print_log("Making mangrove soil C vrt...") - check_call('gdalbuildvrt mangrove_soil_C.vrt *{}*.tif'.format(cn.pattern_mangrove_soil_C_raw), shell=True) - uu.print_log("Done making mangrove soil C vrt") - - uu.print_log("Making mangrove soil C tiles...") - - if cn.count == 96: - processes = 32 # 32 processors = 570 GB peak - else: - processes = int(cn.count/3) - uu.print_log('Mangrove soil C max processors=', processes) - pool = multiprocessing.Pool(processes) - pool.map(partial(create_soil_C.create_mangrove_soil_C, no_upload=no_upload), tile_id_list) - pool.close() - pool.join() - - # # For single processor use - # for tile_id in tile_id_list: - # - # create_soil_C.create_mangrove_soil_C(tile_id, no_Upload) - - uu.print_log('Done making mangrove soil C tiles', "\n") - - uu.print_log("Making mineral soil C vrt...") - check_call('gdalbuildvrt mineral_soil_C.vrt *{}*'.format(cn.pattern_mineral_soil_C_raw), shell=True) - uu.print_log("Done making mineral soil C vrt") - - # Creates mineral soil C density tiles - source_raster = 'mineral_soil_C.vrt' - out_pattern = cn.pattern_soil_C_full_extent_2000_non_mang - dt = 'Int16' - if cn.count == 96: - processes = 80 # 32 processors = 100 GB peak; 50 = 160 GB peak; 80 = XXX GB peak - else: - processes = int(cn.count/2) - uu.print_log("Creating mineral soil C density tiles with {} processors...".format(processes)) - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.mp_warp_to_Hansen, 
source_raster=source_raster, out_pattern=out_pattern, dt=dt), - tile_id_list) - pool.close() - pool.join() - - # # For single processor use - # for tile_id in tile_id_list: - # - # create_soil_C.create_mineral_soil_C(tile_id) - - uu.print_log("Done making non-mangrove soil C tiles", "\n") - - output_pattern = cn.pattern_soil_C_full_extent_2000_non_mang - processes = 60 # 50 processors = ~450 GB peak; 60 = XXX GB peak - uu.print_log("Checking for empty tiles of {0} pattern with {1} processors...".format(output_pattern, processes)) - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) - pool.close() - pool.join() - - # If no_upload flag is not activated (by choice or by lack of AWS credentials), output is uploaded to s3 - if not no_upload: - - uu.print_log("Uploading non-mangrove soil C density tiles") - uu.upload_final_set(output_dir_list[0], output_pattern_list[0]) - - - uu.print_log("Making combined (mangrove & non-mangrove) soil C tiles...") - - if cn.count == 96: - processes = 45 # 45 processors = XXX GB peak - else: - processes = int(cn.count/2) - uu.print_log('Combined soil C max processors=', processes) - pool = multiprocessing.Pool(processes) - pool.map(partial(create_soil_C.create_combined_soil_C, no_upload=no_upload), tile_id_list) - pool.close() - pool.join() - - # # For single processor use - # for tile in tile_list: - # - # create_soil_C.create_combined_soil_C(tile_id, no_upload) - - uu.print_log("Done making combined soil C tiles") - - # If no_upload flag is not activated (by choice or by lack of AWS credentials), output is uploaded - if not no_upload: - - uu.print_log("Uploading combined soil C density tiles") - uu.upload_final_set(output_dir_list[1], output_pattern_list[1]) - - - # Need to delete soil c density rasters because they have the same pattern as the standard deviation rasters - uu.print_log("Deleting raw soil C density rasters") - c_stocks = glob.glob('*{}*'.format(cn.pattern_soil_C_full_extent_2000)) - for c_stock in c_stocks: - os.remove(c_stock) - - - ### Soil carbon density uncertainty - - # Separate directories for the 5% CI and 95% CI - dir_CI05 = '{0}{1}'.format(cn.docker_tile_dir, 'CI05/') - dir_CI95 = '{0}{1}'.format(cn.docker_tile_dir, 'CI95/') - vrt_CI05 = 'mineral_soil_C_CI05.vrt' - vrt_CI95 = 'mineral_soil_C_CI95.vrt' - soil_C_stdev_global = 'soil_C_stdev.tif' - - # Download raw mineral soil C density 5% CI tiles - # First tries to download index.html.tmp from every folder, then goes back and downloads all the tifs in each folder - # Based on https://stackoverflow.com/questions/273743/using-wget-to-recursively-fetch-a-directory-with-arbitrary-files-in-it - # Like soil C density rasters, there are 12951 tifs and they take about 3 hours to download. - os.mkdir(dir_CI05) - - cmd = ['wget', '--recursive', '-nH', '--cut-dirs=6', '--no-parent', '--reject', 'index.html*', - '--directory-prefix={}'.format(dir_CI05), - '--accept', '*.tif', '{}'.format(cn.CI5_mineral_soil_C_url)] - uu.log_subprocess_output_full(cmd) - - uu.print_log("Making mineral soil C 5% CI vrt...") - - check_call('gdalbuildvrt {0} {1}*{2}*'.format(vrt_CI05, dir_CI05, cn.pattern_uncert_mineral_soil_C_raw), shell=True) - uu.print_log("Done making mineral soil C CI05 vrt") - - # Download raw mineral soil C density 5% CI tiles - # Like soil C density rasters, there are 12951 tifs and they take about 3 hours to download. 
- os.mkdir(dir_CI95) - - cmd = ['wget', '--recursive', '-nH', '--cut-dirs=6', '--no-parent', '--reject', 'index.html*', - '--directory-prefix={}'.format(dir_CI95), - '--accept', '*.tif', '{}'.format(cn.CI95_mineral_soil_C_url)] - uu.log_subprocess_output_full(cmd) - - uu.print_log("Making mineral soil C 95% CI vrt...") - - check_call('gdalbuildvrt {0} {1}*{2}*'.format(vrt_CI95, dir_CI95, cn.pattern_uncert_mineral_soil_C_raw), shell=True) - uu.print_log("Done making mineral soil C CI95 vrt") - - - uu.print_log("Creating raster of standard deviations in soil C at native SoilGrids250 resolution. This may take a while...") - # global tif with approximation of the soil C stanard deviation (based on the 5% and 95% CIs) - - # This takes about 20 minutes. It doesn't show any progress until the last moment, when it quickly counts - # up to 100. - calc = '--calc=(A-B)/3' - out_filearg = '--outfile={}'.format(soil_C_stdev_global) - cmd = ['gdal_calc.py', '-A', vrt_CI95, '-B', vrt_CI05, calc, out_filearg, - '--NoDataValue=0', '--overwrite', '--co', 'COMPRESS=DEFLATE', '--type=Float32'] - uu.log_subprocess_output_full(cmd) - - uu.print_log("{} created.".format(soil_C_stdev_global)) - - - # Creates soil carbon 2000 density standard deviation tiles - out_pattern = cn.pattern_stdev_soil_C_full_extent - dt = 'Float32' - source_raster = soil_C_stdev_global - if cn.count == 96: - processes = 56 # 32 processors = 290 GB peak; 56 = XXX GB peal - else: - processes = 2 - uu.print_log("Creating mineral soil C stock stdev tiles with {} processors...".format(processes)) - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), - tile_id_list) - pool.close() - pool.join() - - - output_pattern = cn.pattern_stdev_soil_C_full_extent - processes = 50 # 50 processors = 550 GB peak - uu.print_log("Checking for empty tiles of {0} pattern with {1} processors...".format(output_pattern, processes)) - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) - pool.close() - pool.join() - - - # Checks the gross removals outputs for tiles with no data - for output_pattern in output_pattern_list: - if cn.count <= 2: # For local tests - processes = 1 - uu.print_log("Checking for empty tiles of {0} pattern with {1} processors using light function...".format( - output_pattern, processes)) - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.check_and_delete_if_empty_light, output_pattern=output_pattern), tile_id_list) - pool.close() - pool.join() - else: - processes = 55 # 50 processors = XXX GB peak - uu.print_log( - "Checking for empty tiles of {0} pattern with {1} processors...".format(output_pattern, processes)) - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) - pool.close() - pool.join() - - - # If no_upload flag is not activated (by choice or by lack of AWS credentials), output is uploaded - if not no_upload: - - uu.print_log("Uploading soil C density standard deviation tiles") - uu.upload_final_set(output_dir_list[2], output_pattern_list[2]) - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser( - description='Creates tiles of soil carbon density in 2000') - parser.add_argument('--tile_id_list', '-l', required=True, - help='List of tile ids to use in the model. 
Should be of form 00N_110E or 00N_110E,00N_120E or all.') - parser.add_argument('--run-date', '-d', required=False, - help='Date of run. Must be format YYYYMMDD.') - parser.add_argument('--no-upload', '-nu', action='store_true', - help='Disables uploading of outputs to s3') - args = parser.parse_args() - tile_id_list = args.tile_id_list - run_date = args.run_date - no_upload = args.NO_UPLOAD - - # Disables upload to s3 if no AWS credentials are found in environment - if not uu.check_aws_creds(): - no_upload = True - - # Create the output log - uu.initiate_log(tile_id_list) - tile_id_list = uu.tile_id_list_check(tile_id_list) - - mp_create_soil_C(tile_id_list=tile_id_list, no_upload=no_upload) \ No newline at end of file diff --git a/constants_and_names.py b/constants_and_names.py index c544d486..4eebfd23 100644 --- a/constants_and_names.py +++ b/constants_and_names.py @@ -8,7 +8,7 @@ ######## ######## # Model version -version = '1.2.4' +version = '1.3.1' version_filename = version.replace('.', '_') @@ -41,6 +41,8 @@ LOG_NOTE = '' +### Constants + # Number of years of tree cover loss. If input loss raster is changed, this must be changed, too. loss_years = 22 @@ -108,6 +110,10 @@ # Number of processors on the machine being used count = multiprocessing.cpu_count() +planted_forest_postgis_db = 'all_plant' +planted_forest_output_date = '20230911' +planted_forest_version = 'SDPTv2' + ########## ########## ##### File names and directories ##### @@ -147,7 +153,7 @@ ### Model extent ###### pattern_model_extent = 'model_extent' -model_extent_dir = os.path.join(s3_base_dir, 'model_extent/standard/20230815/') +model_extent_dir = os.path.join(s3_base_dir, 'model_extent/standard/20231114/') ###### ### Biomass tiles @@ -205,7 +211,7 @@ # Hansen removals tiles based on canopy height (2000-2020) # From https://www.frontiersin.org/articles/10.3389/frsen.2022.856903/full -pattern_gain_data_lake = '' +pattern_data_lake = '' pattern_gain_ec2 = 'tree_cover_gain_2000_2020' gain_dir = 's3://gfw-data-lake/umd_tree_cover_gain_from_height/v202206/raster/epsg-4326/10/40000/gain/geotiff/' @@ -235,9 +241,44 @@ cont_eco_raw_dir = os.path.join(s3_base_dir, 'fao_ecozones/ecozone_continent/20190116/raw/') cont_eco_dir = os.path.join(s3_base_dir, 'fao_ecozones/ecozone_continent/20190116/processed/') -# Plantation type: palm oil (code=1), wood fiber (code=2), and other (code=3) -pattern_planted_forest_type_unmasked = 'plantation_type_oilpalm_woodfiber_other_unmasked' -planted_forest_type_unmasked_dir = os.path.join(s3_base_dir, 'other_emissions_inputs/plantation_type/standard/20200730/') + +### Planted forests + +# Note that planted forest data was rasterized using the gfw-data-api and the original copies live in +# s3://gfw-data-lake/gfw_planted_forests/v20230911/raster/epsg-4326/10/40000/. +# I then copied them into gfw2-data and renamed them to use my preferred patterns. 
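+# For example, the planted forest directories below are composed from the version and date constants defined above:
+# with planted_forest_version = 'SDPTv2' and planted_forest_output_date = '20230911', a constant like
+# os.path.join(s3_base_dir, f'annual_removal_factor_planted_forest/{planted_forest_version}_AGC/{planted_forest_output_date}/')
+# resolves to f'{s3_base_dir}annual_removal_factor_planted_forest/SDPTv2_AGC/20230911/'.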
+ +# Planted forest aboveground only removal factors +datalake_pf_agc_rf_dir = 's3://gfw-data-lake/gfw_planted_forests/v20230911/raster/epsg-4326/10/40000/removalFactorAGCMgHaYr/geotiff/' +pattern_pf_rf_agc_ec2 = 'annual_gain_rate_AGC_Mg_ha_planted_forest' +planted_forest_aboveground_removalfactor_dir = os.path.join(s3_base_dir, f'annual_removal_factor_planted_forest/{planted_forest_version}_AGC/{planted_forest_output_date}/') + +# Planted forest aboveground + belowground removal factors +datalake_pf_agcbgc_rf_dir = 's3://gfw-data-lake/gfw_planted_forests/v20230911/raster/epsg-4326/10/40000/removalFactorAGCBGCMgHaYr/geotiff/' +pattern_pf_rf_agcbgc_ec2 = 'annual_gain_rate_AGC_BGC_Mg_ha_planted_forest' +planted_forest_aboveground_belowground_removalfactor_dir = os.path.join(s3_base_dir, f'annual_removal_factor_planted_forest/{planted_forest_version}_AGC_BGC/{planted_forest_output_date}/') + +# Planted forest aboveground only standard deviations +datalake_pf_agc_sd_dir = 's3://gfw-data-lake/gfw_planted_forests/v20230911/raster/epsg-4326/10/40000/removalFactorAGCstdevMgHaYr/geotiff/' +pattern_pf_sd_agc_ec2 = 'annual_gain_rate_stdev_AGC_Mg_ha_planted_forest_unmasked' +planted_forest_aboveground_standard_deviation_dir = os.path.join(s3_base_dir, f'stdev_annual_removal_factor_planted_forest/{planted_forest_version}_AGC/{planted_forest_output_date}/') + +# Planted forest aboveground + belowground standard deviations +datalake_pf_agcbgc_sd_dir = 's3://gfw-data-lake/gfw_planted_forests/v20230911/raster/epsg-4326/10/40000/removalFactorAGCBGCstdevMgHaYr/geotiff/' +pattern_pf_sd_agcbgc_ec2 = 'annual_gain_rate_stdev_AGC_BGC_Mg_ha_planted_forest_unmasked' +planted_forest_aboveground_belowground_standard_deviation_dir = os.path.join(s3_base_dir, f'stdev_annual_removal_factor_planted_forest/{planted_forest_version}_AGC_BGC/{planted_forest_output_date}/') + +# Planted forest type (simpleName): palm oil (code=1), wood fiber (code=2), and other (code=3) +datalake_pf_simplename_dir = 's3://gfw-data-lake/gfw_planted_forests/v20230911/raster/epsg-4326/10/40000/simpleName/geotiff/' +pattern_planted_forest_type = 'plantation_type_oilpalm_woodfiber_other' +planted_forest_type_dir = os.path.join(s3_base_dir, f'other_emissions_inputs/plantation_type/{planted_forest_version}/{planted_forest_output_date}/') + +# Planted forest establishment year +#datalake_pf_estab_year_dir = +pattern_planted_forest_estab_year = 'planted_forest_establishment_year' +planted_forest_estab_year_dir = os.path.join(s3_base_dir, f'planted_forest_estab_year/{planted_forest_version}/{planted_forest_output_date}/') + +### Peatland delineation # Peat mask inputs peat_unprocessed_dir = os.path.join(s3_base_dir, 'other_emissions_inputs/peatlands/raw/') @@ -275,6 +316,9 @@ pattern_peat_mask = 'peat_mask_processed' peat_mask_dir = os.path.join(s3_base_dir, 'other_emissions_inputs/peatlands/processed/20230315/') + +### Other emissions inputs + # Climate zone climate_zone_raw_dir = os.path.join(s3_base_dir, 'other_emissions_inputs/climate_zone/raw/') climate_zone_raw = 'climate_zone.tif' @@ -291,7 +335,7 @@ drivers_raw_dir = os.path.join(s3_base_dir, 'other_emissions_inputs/tree_cover_loss_drivers/raw/') pattern_drivers_raw = 'TCL_DD_2022_20230407_wgs84_setnodata.tif' pattern_drivers = 'tree_cover_loss_driver_processed' -drivers_processed_dir = os.path.join(s3_base_dir, 'other_emissions_inputs/tree_cover_loss_drivers/processed/drivers_2022/20230407') +drivers_processed_dir = os.path.join(s3_base_dir, 
'other_emissions_inputs/tree_cover_loss_drivers/processed/drivers_2022/20230407/') # Tree cover loss from fires TCLF_raw_dir = 's3://gfw-data-lake/umd_tree_cover_loss_from_fires/v20230315/raw/' @@ -309,20 +353,39 @@ gadm_iso = 'gadm_3_6_with_planted_forest_iso.shp' gadm_path = os.path.join(gadm_dir, gadm_zip) gadm_plant_1x1_index_dir = os.path.join(s3_base_dir, 'gadm_plantation_1x1_tile_index/') -pattern_gadm_1x1_index = 'gadm_index_1x1' +# pattern_gadm_1x1_index = 'gadm_index_1x1' pattern_plant_1x1_index = 'plantation_index_1x1' -# Countries with planted forests in them according to the planted forest geodatabase -plantation_countries = [ - 'ARG', 'VNM', 'VEN', 'THA', 'RWA', 'PNG', 'PHL', 'PAN', 'NIC', 'IND', 'HND', 'CRI', 'COD', 'COL', - 'GAB', 'GHA', 'GTM', 'IDN', 'KEN', 'KHM', 'PRK', 'KOR', 'LBR', 'LKA', 'MEX', 'MMR', 'MWI', 'MGA', - 'NPL', 'NZL', 'PAK', 'PER', 'SLB', 'URY', 'USA', 'ZAF', 'AUS', 'BRA', 'CHL', 'CHN', 'CIV', 'CMR', - 'JPN', 'MYS', 'ECU', - 'AUT', 'BEL', 'BGR', 'HRV', 'CYP', 'CZE', 'DNK', 'EST', 'FIN', 'FRA', 'DEU', 'GRC', 'HUN', 'IRL', - 'ITA', 'LVA', 'LTU', 'LUX', 'MLT', 'NLD', 'POL', 'PRT', 'ROU', 'SVK', 'SVN', 'ESP', 'SWE', 'GBR', - 'ALA', 'ALB', 'ARM', 'AZE', 'BIH', 'BLR', 'CHE', 'GEO', 'IRQ', 'ISL', 'MDA', 'MKD', 'MNE', - 'NGA', 'NOR', 'SRB', 'SYR', 'TUR', 'UKR', 'XKO' - ] +plantations_dir = os.path.join(s3_base_dir, 'plantations/') +pattern_gadm_1x1_index = 'fishnet_1x1_deg_SDPTv2_extent__20230821' + +# Countries with planted forests in them according to the Spatial Database of Planted Trees v2 +SDPT_v2_feature_classes = [ +'AGO', 'ARG', 'ARM', 'AUS', 'AZE', 'BDI', 'BEN', 'BFA', 'BGD', 'BLZ', 'BOL', 'BRA', 'BRN', 'BTN', 'CAF', 'CAN', +'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'CPV', 'CRI', 'CUB', 'CYP', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', +'ETH', 'FJI', 'GAB', 'GHA', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GTM', 'GUF', 'HND', 'HTI', 'IDN', 'IND', +'IRN', 'IRQ', 'ISR', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KOR', 'LAO', 'LBN', 'LBR', 'LBY', 'LKA', +'LSO', 'MAR', 'MDG', 'MEX', 'MLI', 'MMR', 'MNG', 'MOZ', 'MRT', 'MWI', 'MYS', 'NCL', 'NGA', 'NIC', 'NPL', 'NZL', +'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PNG', 'PRK', 'PRY', 'RUS', 'RWA', 'SEN', 'SLB', 'SLE', 'SLV', 'SOM', 'SSD', +'STP', 'SUR', 'SWZ', 'SYR', 'TGO', 'THA', 'TJK', 'TTO', 'TUN', 'TUR', 'TZA', 'UGA', 'URY', 'USA', 'UZB', 'VEN', +'VNM', 'VUT', 'ZAF', 'ZMB', 'ZWE', 'EU' +] + +SDPT_v2_iso_codes = [ +'AGO', 'ARG', 'ARM', 'AUS', 'AZE', 'BDI', 'BEN', 'BFA', 'BGD', 'BLZ', 'BOL', 'BRA', 'BRN', 'BTN', 'CAF', 'CAN', +'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'CPV', 'CRI', 'CUB', 'CYP', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', +'ETH', 'FJI', 'GAB', 'GHA', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GTM', 'GUF', 'HND', 'HTI', 'IDN', 'IND', +'IRN', 'IRQ', 'ISR', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KOR', 'LAO', 'LBN', 'LBR', 'LBY', 'LKA', +'LSO', 'MAR', 'MDG', 'MEX', 'MLI', 'MMR', 'MNG', 'MOZ', 'MRT', 'MWI', 'MYS', 'NCL', 'NGA', 'NIC', 'NPL', 'NZL', +'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PNG', 'PRK', 'PRY', 'RUS', 'RWA', 'SEN', 'SLB', 'SLE', 'SLV', 'SOM', 'SSD', +'STP', 'SUR', 'SWZ', 'SYR', 'TGO', 'THA', 'TJK', 'TTO', 'TUN', 'TUR', 'TZA', 'UGA', 'URY', 'USA', 'UZB', 'VEN', +'VNM', 'VUT', 'ZAF', 'ZMB', 'ZWE', +# EU countries +'AUT', 'BEL', 'BGR', 'HRV', 'CYP', 'CZE', 'DNK', 'EST', 'FIN', 'FRA', 'DEU', 'GRC', 'HUN', 'IRL', 'ITA', 'LVA', +'LTU', 'LUX', 'MLT', 'NLD', 'POL', 'PRT', 'ROU', 'SVK', 'SVN', 'ESP', 'SWE', +# Countries that had SDPT in v1 but aren't in v2 ISO list +'NOR', 'GBR', 'MNE', 'XKO', 'SRB', 
'ALB', 'BIH', 'MKD', 'MDA', 'UKR', 'BLR', 'ISL', 'GEO', 'ALA', 'CHE' +] ###### ### Removals @@ -339,7 +402,7 @@ # Age categories over entire model extent, as a precursor to assigning IPCC default removal rates pattern_age_cat_IPCC = 'forest_age_category_IPCC__1_young_2_mid_3_old' -age_cat_IPCC_dir = os.path.join(s3_base_dir, 'forest_age_category_IPCC/standard/20230815/') +age_cat_IPCC_dir = os.path.join(s3_base_dir, 'forest_age_category_IPCC/standard/20231114/') ### US-specific removal precursors @@ -364,8 +427,11 @@ ### Annual carbon and biomass removals rates for specific forest types that are precursors for composite annual removal factor # Annual aboveground and belowground carbon removals rate for planted forests, with removals rates everywhere inside the plantation boundaries (includes mangrove pixels) -pattern_annual_gain_AGC_BGC_planted_forest_unmasked = 'annual_gain_rate_AGC_BGC_t_ha_planted_forest_unmasked' -annual_gain_AGC_BGC_planted_forest_unmasked_dir = os.path.join(s3_base_dir, 'annual_gain_rate_AGC_BGC_planted_forest_unmasked/standard/20200730/') +# Note that planted forest data was rasterized using the gfw-data-api and the original copies live in +# s3://gfw-data-lake/gfw_planted_forests/v20230911/raster/epsg-4326/10/40000/. +# I then copied them into gfw2-data and renamed them to use my preferred patterns. +pattern_annual_gain_AGC_BGC_planted_forest = 'annual_gain_rate_AGC_BGC_Mg_ha_planted_forest' +annual_gain_AGC_BGC_planted_forest_dir = os.path.join(s3_base_dir, f'annual_removal_factor_planted_forest/{planted_forest_version}_AGC_BGC/{planted_forest_output_date}/') # Annual aboveground carbon removals rate for <20 year secondary, non-mangrove, non-planted natural forests (raw) name_annual_gain_AGC_natrl_forest_young_raw = 'sequestration_rate__mean__aboveground__full_extent__Mg_C_ha_yr.tif' @@ -398,31 +464,31 @@ # Annual aboveground biomass removals rate using IPCC default removal rates pattern_annual_gain_AGB_IPCC_defaults = 'annual_removal_factor_AGB_Mg_ha_IPCC_defaults_all_ages' -annual_gain_AGB_IPCC_defaults_dir = os.path.join(s3_base_dir, 'annual_removal_factor_AGB_IPCC_defaults_all_ages/standard/20230815/') +annual_gain_AGB_IPCC_defaults_dir = os.path.join(s3_base_dir, 'annual_removal_factor_AGB_IPCC_defaults_all_ages/standard/20231114/') # Annual aboveground biomass removals rate using IPCC default removal rates pattern_annual_gain_BGB_IPCC_defaults = 'annual_removal_factor_BGB_Mg_ha_IPCC_defaults_all_ages' -annual_gain_BGB_IPCC_defaults_dir = os.path.join(s3_base_dir, 'annual_removal_factor_BGB_IPCC_defaults_all_ages/standard/20230815/') +annual_gain_BGB_IPCC_defaults_dir = os.path.join(s3_base_dir, 'annual_removal_factor_BGB_IPCC_defaults_all_ages/standard/20231114/') ### Annual composite removal factor # Annual aboveground removals rate for all forest types pattern_annual_gain_AGC_all_types = 'annual_removal_factor_AGC_Mg_ha_all_forest_types' -annual_gain_AGC_all_types_dir = os.path.join(s3_base_dir, 'annual_removal_factor_AGC_all_forest_types/standard/20230815/') +annual_gain_AGC_all_types_dir = os.path.join(s3_base_dir, 'annual_removal_factor_AGC_all_forest_types/standard/20231114/') # Annual belowground removals rate for all forest types pattern_annual_gain_BGC_all_types = 'annual_removal_factor_BGC_Mg_ha_all_forest_types' -annual_gain_BGC_all_types_dir = os.path.join(s3_base_dir, 'annual_removal_factor_BGC_all_forest_types/standard/20230815/') +annual_gain_BGC_all_types_dir = os.path.join(s3_base_dir, 
'annual_removal_factor_BGC_all_forest_types/standard/20231114/') # Annual aboveground+belowground removals rate for all forest types pattern_annual_gain_AGC_BGC_all_types = 'annual_removal_factor_AGC_BGC_Mg_ha_all_forest_types' -annual_gain_AGC_BGC_all_types_dir = os.path.join(s3_base_dir, 'annual_removal_factor_AGC_BGC_all_forest_types/standard/20230815/') +annual_gain_AGC_BGC_all_types_dir = os.path.join(s3_base_dir, 'annual_removal_factor_AGC_BGC_all_forest_types/standard/20231114/') ### Removal forest types (sources) # Forest type used in removals model pattern_removal_forest_type = 'removal_forest_type' -removal_forest_type_dir = os.path.join(s3_base_dir, 'removal_forest_type/standard/20230815/') +removal_forest_type_dir = os.path.join(s3_base_dir, 'removal_forest_type/standard/20231114/') # Removal model forest type codes mangrove_rank = 6 @@ -437,26 +503,26 @@ # Number of removals years for all forest types pattern_gain_year_count = 'gain_year_count_all_forest_types' -gain_year_count_dir = os.path.join(s3_base_dir, 'gain_year_count_all_forest_types/standard/20230815/') +gain_year_count_dir = os.path.join(s3_base_dir, 'gain_year_count_all_forest_types/standard/20231114/') ### Cumulative gross carbon dioxide removals # Gross aboveground removals for all forest types pattern_cumul_gain_AGCO2_all_types = f'gross_removals_AGCO2_Mg_ha_all_forest_types_2001_{loss_years}' -cumul_gain_AGCO2_all_types_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_all_forest_types/standard/per_hectare/20230815/') +cumul_gain_AGCO2_all_types_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_all_forest_types/standard/per_hectare/20231114/') # Gross belowground removals for all forest types pattern_cumul_gain_BGCO2_all_types = f'gross_removals_BGCO2_Mg_ha_all_forest_types_2001_{loss_years}' -cumul_gain_BGCO2_all_types_dir = os.path.join(s3_base_dir, 'gross_removals_BGCO2_all_forest_types/standard/per_hectare/20230815/') +cumul_gain_BGCO2_all_types_dir = os.path.join(s3_base_dir, 'gross_removals_BGCO2_all_forest_types/standard/per_hectare/20231114/') # Gross aboveground and belowground removals for all forest types in all pixels pattern_cumul_gain_AGCO2_BGCO2_all_types = f'gross_removals_AGCO2_BGCO2_Mg_ha_all_forest_types_2001_{loss_years}' -cumul_gain_AGCO2_BGCO2_all_types_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_BGCO2_all_forest_types/standard/full_extent/per_hectare/20230815/') +cumul_gain_AGCO2_BGCO2_all_types_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_BGCO2_all_forest_types/standard/full_extent/per_hectare/20231114/') # Gross aboveground and belowground removals for all forest types in pixels within forest extent pattern_cumul_gain_AGCO2_BGCO2_all_types_forest_extent = f'gross_removals_AGCO2_BGCO2_Mg_ha_all_forest_types_forest_extent_2001_{loss_years}' -cumul_gain_AGCO2_BGCO2_all_types_forest_extent_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_BGCO2_all_forest_types/standard/forest_extent/per_hectare/20230815/') +cumul_gain_AGCO2_BGCO2_all_types_forest_extent_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_BGCO2_all_forest_types/standard/forest_extent/per_hectare/20231114/') ###### @@ -492,7 +558,7 @@ ## Carbon emitted_pools in loss year # Date to include in the output directory for all emissions year carbon emitted_pools -emis_pool_run_date = '20230815' +emis_pool_run_date = '20231114' # Aboveground carbon in the year of emission for all forest types in loss pixels pattern_AGC_emis_year = "Mg_AGC_ha_emis_year" @@ -542,23 +608,29 @@ mangrove_soil_C_dir = 
os.path.join(s3_base_dir, 'carbon_pools/soil_carbon/raw/') name_mangrove_soil_C = 'Mangroves_SOCS_0_100cm_30m.zip' pattern_mangrove_soil_C_raw = 'dSOCS_0_100cm' -# Raw mineral soil C file site, SoilGrids250, updated summer 2020 +# Raw mineral soil C file site, SoilGrids250, updated November 2023. +# SoilGrids250 last modified date for these files is 2023-02-01. pattern_mineral_soil_C_raw = 'tileSG' mineral_soil_C_url = 'https://files.isric.org/soilgrids/latest/data/ocs/ocs_0-30cm_mean/' +# Soil C for mangroves only, masked to mangrove AGB extent +pattern_soil_C_mangrove = 'mangrove_soil_C_masked_to_mangrove_AGB_Mg_C_ha' +soil_C_mangrove_processed_dir = os.path.join(base_carbon_pool_dir, 'soil_carbon/intermediate_full_extent/mangrove_only/20231108/') + # Soil C full extent but just from SoilGrids250 (mangrove soil C layer not added in) # Not used in model. -pattern_soil_C_full_extent_2000_non_mang = 'soil_C_ha_full_extent_2000_non_mangrove_Mg_ha' -soil_C_full_extent_2000_non_mang_dir = os.path.join(base_carbon_pool_dir, 'soil_carbon/intermediate_full_extent/no_mangrove/20210414/') +pattern_soil_C_full_extent_2000_non_mang = 'soil_C_full_extent_2000_non_mangrove_Mg_C_ha' +soil_C_full_extent_2000_non_mang_dir = os.path.join(base_carbon_pool_dir, 'soil_carbon/intermediate_full_extent/no_mangrove/20231108/') # Soil C full extent (all soil pixels, with mangrove soil C in Giri mangrove extent getting priority over mineral soil C) # Non-mangrove C is 0-30 cm, mangrove C is 0-100 cm -pattern_soil_C_full_extent_2000 = 't_soil_C_ha_full_extent_2000' -soil_C_full_extent_2000_dir = os.path.join(base_carbon_pool_dir, 'soil_carbon/intermediate_full_extent/standard/20200724/') +pattern_soil_C_full_extent_2000 = 'soil_C_full_extent_2000_Mg_C_ha' +soil_C_full_extent_2000_dir = os.path.join(base_carbon_pool_dir, 'soil_carbon/intermediate_full_extent/standard/20231108/') # Total carbon (all carbon emitted_pools combined) for the full biomass 2000 (mangrove and non-mangrove) extent based on 2000 stocks pattern_total_C_2000 = "Mg_total_C_ha_2000" -total_C_2000_dir = os.path.join(base_carbon_pool_dir, f'total_carbon/extent_2000/standard/{pool_2000_run_date}/') +# total_C_2000_dir = os.path.join(base_carbon_pool_dir, f'total_carbon/extent_2000/standard/{pool_2000_run_date}/') +total_C_2000_dir = os.path.join(base_carbon_pool_dir, f'total_carbon/extent_2000/standard/20231108/') ###### @@ -568,7 +640,7 @@ ### Emissions from biomass and soil (all carbon emitted_pools) # Date to include in the output directory -emis_run_date_biomass_soil = '20230815' +emis_run_date_biomass_soil = '20231114' # pattern_gross_emis_commod_biomass_soil = f'gross_emis_commodity_Mg_CO2e_ha_biomass_soil_2001_{loss_years}' pattern_gross_emis_commod_biomass_soil = f'gross_emis_commodity_Mg_CO2e_ha_biomass_soil_2001_{loss_years}' @@ -607,7 +679,7 @@ ### Emissions from soil only # Date to include in the output directory -emis_run_date_soil_only = '20230815' +emis_run_date_soil_only = '20231114' pattern_gross_emis_commod_soil_only = f'gross_emis_commodity_Mg_CO2e_ha_soil_only_2001_{loss_years}' gross_emis_commod_soil_only_dir = f'{s3_base_dir}gross_emissions/commodities/soil_only/standard/{emis_run_date_soil_only}/' @@ -645,11 +717,11 @@ # Net emissions for all forest types and all carbon emitted_pools in all pixels pattern_net_flux = f'net_flux_Mg_CO2e_ha_biomass_soil_2001_{loss_years}' -net_flux_dir =
os.path.join(s3_base_dir, 'net_flux_all_forest_types_all_drivers/biomass_soil/standard/full_extent/per_hectare/20231114/') # Net emissions for all forest types and all carbon emitted_pools in forest extent pattern_net_flux_forest_extent = f'net_flux_Mg_CO2e_ha_biomass_soil_forest_extent_2001_{loss_years}' -net_flux_forest_extent_dir = os.path.join(s3_base_dir, 'net_flux_all_forest_types_all_drivers/biomass_soil/standard/forest_extent/per_hectare/20230815/') +net_flux_forest_extent_dir = os.path.join(s3_base_dir, 'net_flux_all_forest_types_all_drivers/biomass_soil/standard/forest_extent/per_hectare/20231114/') ### Per pixel model outputs @@ -657,27 +729,27 @@ # Gross removals per pixel in all pixels pattern_cumul_gain_AGCO2_BGCO2_all_types_per_pixel_full_extent = f'gross_removals_AGCO2_BGCO2_Mg_pixel_all_forest_types_full_extent_2001_{loss_years}' -cumul_gain_AGCO2_BGCO2_all_types_per_pixel_full_extent_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_BGCO2_all_forest_types/standard/full_extent/per_pixel/20230815/') +cumul_gain_AGCO2_BGCO2_all_types_per_pixel_full_extent_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_BGCO2_all_forest_types/standard/full_extent/per_pixel/20231114/') # Gross removals per pixel in forest extent pattern_cumul_gain_AGCO2_BGCO2_all_types_per_pixel_forest_extent = f'gross_removals_AGCO2_BGCO2_Mg_pixel_all_forest_types_forest_extent_2001_{loss_years}' -cumul_gain_AGCO2_BGCO2_all_types_per_pixel_forest_extent_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_BGCO2_all_forest_types/standard/forest_extent/per_pixel/20230815/') +cumul_gain_AGCO2_BGCO2_all_types_per_pixel_forest_extent_dir = os.path.join(s3_base_dir, 'gross_removals_AGCO2_BGCO2_all_forest_types/standard/forest_extent/per_pixel/20231114/') # Gross emissions per pixel in all pixels pattern_gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_full_extent = f'gross_emis_all_gases_all_drivers_Mg_CO2e_pixel_biomass_soil_full_extent_2001_{loss_years}' -gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_full_extent_dir = os.path.join(s3_base_dir, 'gross_emissions/all_drivers/all_gases/biomass_soil/standard/full_extent/per_pixel/20230815/') +gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_full_extent_dir = os.path.join(s3_base_dir, 'gross_emissions/all_drivers/all_gases/biomass_soil/standard/full_extent/per_pixel/20231114/') # Gross emissions per pixel in forest extent pattern_gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_forest_extent = f'gross_emis_all_gases_all_drivers_Mg_CO2e_pixel_biomass_soil_forest_extent_2001_{loss_years}' -gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_forest_extent_dir = os.path.join(s3_base_dir, 'gross_emissions/all_drivers/all_gases/biomass_soil/standard/forest_extent/per_pixel/20230815/') +gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_forest_extent_dir = os.path.join(s3_base_dir, 'gross_emissions/all_drivers/all_gases/biomass_soil/standard/forest_extent/per_pixel/20231114/') # Net flux per pixel in all pixels pattern_net_flux_per_pixel_full_extent = f'net_flux_Mg_CO2e_pixel_biomass_soil_full_extent_2001_{loss_years}' -net_flux_per_pixel_full_extent_dir = os.path.join(s3_base_dir, 'net_flux_all_forest_types_all_drivers/biomass_soil/standard/full_extent/per_pixel/20230815/') +net_flux_per_pixel_full_extent_dir = os.path.join(s3_base_dir, 'net_flux_all_forest_types_all_drivers/biomass_soil/standard/full_extent/per_pixel/20231114/') # Net flux per pixel in forest extent pattern_net_flux_per_pixel_forest_extent = 
f'net_flux_Mg_CO2e_pixel_biomass_soil_forest_extent_2001_{loss_years}' -net_flux_per_pixel_forest_extent_dir = os.path.join(s3_base_dir, 'net_flux_all_forest_types_all_drivers/biomass_soil/standard/forest_extent/per_pixel/20230815/') +net_flux_per_pixel_forest_extent_dir = os.path.join(s3_base_dir, 'net_flux_all_forest_types_all_drivers/biomass_soil/standard/forest_extent/per_pixel/20231114/') ### 4x4 km aggregation tiles for mapping @@ -687,7 +759,7 @@ pattern_aggreg_sensit_perc_diff = f'net_flux_0_04deg_modelv{version_filename}_perc_diff_std' pattern_aggreg_sensit_sign_change = f'net_flux_0_04deg_modelv{version_filename}_sign_change_std' -output_aggreg_dir = os.path.join(s3_base_dir, '0_04deg_output_aggregation/biomass_soil/standard/20230815/') +output_aggreg_dir = os.path.join(s3_base_dir, '0_04deg_output_aggregation/biomass_soil/standard/20231114/') @@ -708,8 +780,11 @@ stdev_annual_gain_AGC_BGC_natrl_forest_Europe_dir = os.path.join(s3_base_dir, 'stdev_annual_removal_factor_AGC_BGC_natural_forest_Europe/processed/standard/20200724/') # Standard deviation for annual aboveground+belowground carbon removal factors for planted forests -pattern_stdev_annual_gain_AGC_BGC_planted_forest_unmasked = 'annual_gain_rate_stdev_AGC_BGC_t_ha_planted_forest_unmasked' -stdev_annual_gain_AGC_BGC_planted_forest_unmasked_dir = 's3://gfw2-data/climate/carbon_model/stdev_annual_removal_factor_AGC_BGC_planted_forest_unmasked/standard/20200801/' +# Note that planted forest data was rasterized using the gfw-data-api and the original copies live in +# s3://gfw-data-lake/gfw_planted_forests/v20230911/raster/epsg-4326/10/40000/. +# I then copied them into gfw2-data and renamed them to use my preferred patterns. +pattern_stdev_annual_gain_AGC_BGC_planted_forest = 'annual_gain_rate_stdev_AGC_BGC_Mg_ha_planted_forest_unmasked' +stdev_annual_gain_AGC_BGC_planted_forest_dir = os.path.join(s3_base_dir, f'stdev_annual_removal_factor_planted_forest/{planted_forest_version}_AGC_BGC/{planted_forest_output_date}/') # Standard deviation for annual aboveground+belowground carbon removals rate for natural US forests pattern_stdev_annual_gain_AGC_BGC_natrl_forest_US = 'annual_removal_factor_stdev_AGC_BGC_Mg_ha_natural_forest_US' @@ -725,11 +800,11 @@ # Standard deviation for annual aboveground biomass removal factors using IPCC default removal rates pattern_stdev_annual_gain_AGB_IPCC_defaults = 'annual_removal_factor_stdev_AGB_Mg_ha_IPCC_defaults_all_ages' -stdev_annual_gain_AGB_IPCC_defaults_dir = os.path.join(s3_base_dir, 'stdev_annual_removal_factor_AGB_IPCC_defaults_all_ages/standard/20230815/') +stdev_annual_gain_AGB_IPCC_defaults_dir = os.path.join(s3_base_dir, 'stdev_annual_removal_factor_AGB_IPCC_defaults_all_ages/standard/20231114/') # Standard deviation for aboveground and belowground removal factors for all forest types pattern_stdev_annual_gain_AGC_all_types = 'annual_removal_factor_stdev_AGC_Mg_ha_all_forest_types' -stdev_annual_gain_AGC_all_types_dir = os.path.join(s3_base_dir, 'stdev_annual_removal_factor_AGC_all_forest_types/standard/20230815/') +stdev_annual_gain_AGC_all_types_dir = os.path.join(s3_base_dir, 'stdev_annual_removal_factor_AGC_all_forest_types/standard/20231114/') # Raw mineral soil C file site @@ -740,7 +815,7 @@ # Standard deviation in soil C stocks (0-30 cm) pattern_stdev_soil_C_full_extent = 'Mg_soil_C_ha_stdev_full_extent_2000' -stdev_soil_C_full_extent_2000_dir = os.path.join(s3_base_dir, 'stdev_soil_carbon_full_extent/standard/20200828/') +stdev_soil_C_full_extent_2000_dir = 
os.path.join(s3_base_dir, 'stdev_soil_carbon_full_extent/standard/20231108/') ### Testing materials diff --git a/carbon_pools/create_soil_C.py b/data_prep/create_soil_C.py similarity index 79% rename from carbon_pools/create_soil_C.py rename to data_prep/create_soil_C.py index b2649408..53aca2ee 100644 --- a/carbon_pools/create_soil_C.py +++ b/data_prep/create_soil_C.py @@ -22,13 +22,13 @@ import constants_and_names as cn # Creates 10x10 mangrove soil C tiles -def create_mangrove_soil_C(tile_id, no_upload): +def create_mangrove_soil_C(tile_id): # Start time start = datetime.datetime.now() # Checks if mangrove biomass exists. If not, it won't create a mangrove soil C tile. - if os.path.exists('{0}_{1}.tif'.format(tile_id, cn.pattern_mangrove_biomass_2000)): + if os.path.exists(f'{tile_id}_{cn.pattern_mangrove_biomass_2000}.tif'): uu.print_log("Mangrove aboveground biomass tile found for", tile_id) @@ -36,11 +36,11 @@ def create_mangrove_soil_C(tile_id, no_upload): xmin, ymin, xmax, ymax = uu.coords(tile_id) uu.print_log("Clipping mangrove soil C from mangrove soil vrt for", tile_id) - uu.warp_to_Hansen('mangrove_soil_C.vrt', '{0}_mangrove_full_extent.tif'.format(tile_id), xmin, ymin, xmax, ymax, 'Int16') + uu.warp_to_Hansen('mangrove_soil_C.vrt', f'{tile_id}_mangrove_full_extent.tif', xmin, ymin, xmax, ymax, 'Int16') - mangrove_soil = '{0}_mangrove_full_extent.tif'.format(tile_id) - mangrove_biomass = '{0}_{1}.tif'.format(tile_id, cn.pattern_mangrove_biomass_2000) - outname = '{0}_mangrove_masked_to_mangrove.tif'.format(tile_id) + mangrove_soil = f'{tile_id}_mangrove_full_extent.tif' + mangrove_biomass = f'{tile_id}_{cn.pattern_mangrove_biomass_2000}.tif' + outname = f'{tile_id}_{cn.pattern_soil_C_mangrove}.tif' out = '--outfile={}'.format(outname) calc = '--calc=A*(B>0)' datatype = '--type={}'.format('Int16') @@ -55,24 +55,24 @@ def create_mangrove_soil_C(tile_id, no_upload): uu.print_log("Mangrove aboveground biomass tile not found for", tile_id) # Prints information about the tile that was just processed - uu.end_of_fx_summary(start, tile_id, 'mangrove_masked_to_mangrove', no_upload) + uu.end_of_fx_summary(start, tile_id, cn.pattern_soil_C_mangrove) # Overlays the mangrove soil C tiles with the mineral soil C tiles, giving precedence to the mangrove soil C -def create_combined_soil_C(tile_id, no_upload): +def create_combined_soil_C(tile_id): # Start time start = datetime.datetime.now() # Input files - mangrove_soil = '{0}_mangrove_masked_to_mangrove.tif'.format(tile_id) - mineral_soil = '{0}_{1}.tif'.format(tile_id, cn.pattern_soil_C_full_extent_2000_non_mang) + mangrove_soil = f'{tile_id}_{cn.pattern_soil_C_mangrove}.tif' + mineral_soil = f'{tile_id}_{cn.pattern_soil_C_full_extent_2000_non_mang}.tif' # Output file - combined_soil = '{0}_{1}.tif'.format(tile_id, cn.pattern_soil_C_full_extent_2000) + combined_soil = f'{tile_id}_{cn.pattern_soil_C_full_extent_2000}.tif' # Checks if mangrove AGB tile exists. If not, mangrove soil C is not combined with mineral soil C. - if os.path.exists('{0}_{1}.tif'.format(tile_id, cn.pattern_mangrove_biomass_2000)): + if os.path.exists(f'{tile_id}_{cn.pattern_mangrove_biomass_2000}.tif'): uu.print_log("Mangrove aboveground biomass tile found for", tile_id) @@ -114,7 +114,7 @@ def create_combined_soil_C(tile_id, no_upload): # If there is no mangrove soil C tile, the final output of the mineral soil function needs to receive the # correct final name. 
- os.rename('{0}_{1}.tif'.format(tile_id, cn.pattern_soil_C_full_extent_2000_non_mang), combined_soil) + os.rename(f'{tile_id}_{cn.pattern_soil_C_full_extent_2000_non_mang}.tif', combined_soil) # Prints information about the tile that was just processed - uu.end_of_fx_summary(start, tile_id, cn.pattern_soil_C_full_extent_2000, no_upload) + uu.end_of_fx_summary(start, tile_id, cn.pattern_soil_C_full_extent_2000) diff --git a/data_prep/model_extent.py b/data_prep/model_extent.py index 135500ca..ba3c49cb 100644 --- a/data_prep/model_extent.py +++ b/data_prep/model_extent.py @@ -130,7 +130,7 @@ def model_extent(tile_id, pattern): # Array of pixels that have both biomass and tree cover density tcd_with_biomass_window = np.where((biomass_window > 0) & (tcd_window > 0), 1, 0) - # For all moel types except legal_Amazon_loss sensitivity analysis + # For all model types except legal_Amazon_loss sensitivity analysis if cn.SENSIT_TYPE != 'legal_Amazon_loss': # Array of pixels with (biomass AND tcd) OR mangrove biomass OR Hansen gain diff --git a/data_prep/mp_create_soil_C.py b/data_prep/mp_create_soil_C.py new file mode 100644 index 00000000..5ee02718 --- /dev/null +++ b/data_prep/mp_create_soil_C.py @@ -0,0 +1,329 @@ +''' +This script creates tiles of soil carbon density, one of the carbon pools. +At this time, mineral soil carbon is for the top 30 cm of soil. +Mangrove soil carbon gets precedence over mineral soil carbon where there is mangrove biomass. +Mangrove soil C is limited to where mangrove AGB is. +Where there is no mangrove biomass, mineral soil C is used. +Peatland carbon is not recognized or involved in any way. +This is a convoluted way of doing this processing. Originally, I tried making mangrove soil tiles masked to +mangrove AGB tiles, then making a vrt of all those mangrove soil C tiles and the mineral soil raster, and then +using gdal_warp on that to get combined tiles. +However, for reasons I couldn't figure out, the gdalbuildvrt step in which I combined the mangrove 10x10 tiles +and the mineral soil raster never actually combined the mangrove tiles with the mineral soil raster; I just kept +getting mineral soil C values out. +So, I switched to this somewhat more convoluted method that uses both gdal and rasterio/numpy. +SoilGrids250 last modified date for these files is 2023-02-01. 
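+
+Conceptually, the per-pixel combination is (a simplified numpy sketch with illustrative variable names, not the
+exact implementation):
+    mangrove_soil_masked = np.where(mangrove_agb > 0, mangrove_soil_c, 0)
+    combined_soil_c = np.where(mangrove_soil_masked > 0, mangrove_soil_masked, mineral_soil_c)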
+ +python -m data_prep.mp_create_soil_C -l 00N_000E -nu +python -m data_prep.mp_create_soil_C -l all +''' + +from subprocess import check_call +from functools import partial +import multiprocessing +import datetime +import glob +import argparse +import os +import sys +import constants_and_names as cn +import universal_util as uu +from data_prep import create_soil_C + + +def mp_create_soil_C(tile_id_list): + """ + :param tile_id_list: list of tile ids to process + :return: sets of tiles with soil organic C density for mangroves only, SoilGrids250, mangroves overlaid on SoilGrids250, + and standard deviation in soil organic C + """ + + os.chdir(cn.docker_tile_dir) + sensit_type = 'std' + + # If a full model run is specified, the correct set of tiles for the particular script is listed + if tile_id_list == 'all': + # List of tiles to run in the model + tile_id_list = uu.create_combined_tile_list( + [cn.WHRC_biomass_2000_unmasked_dir, cn.mangrove_biomass_2000_dir, cn.gain_dir], + sensit_type=cn.SENSIT_TYPE) + + uu.print_log(tile_id_list) + uu.print_log(f'There are {str(len(tile_id_list))} tiles to process', "\n") + + + # List of output directories and output file name patterns + output_dir_list = [cn.soil_C_mangrove_processed_dir, cn.soil_C_full_extent_2000_non_mang_dir, + cn.soil_C_full_extent_2000_dir, cn.stdev_soil_C_full_extent_2000_dir] + output_pattern_list = [cn.pattern_soil_C_mangrove, cn.pattern_soil_C_full_extent_2000_non_mang, + cn.pattern_soil_C_full_extent_2000, cn.pattern_stdev_soil_C_full_extent] + + + # ### Soil carbon density + # + # uu.print_log("Downloading mangrove soil C rasters") + # uu.s3_file_download(os.path.join(cn.mangrove_soil_C_dir, cn.name_mangrove_soil_C), cn.docker_tile_dir, sensit_type) + # + # # For downloading all tiles in the input folders. + # input_files = [cn.mangrove_biomass_2000_dir] + # + # for input in input_files: + # uu.s3_folder_download(input, cn.docker_tile_dir, sensit_type) + # + # # Download raw mineral soil C density tiles. + # # First tries to download index.html.tmp from every folder, then goes back and downloads all the tifs in each folder + # # Based on https://stackoverflow.com/questions/273743/using-wget-to-recursively-fetch-a-directory-with-arbitrary-files-in-it + # # There are 12951 tiles and it takes about 2 hours to download them!
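+    # # wget flags: --recursive walks the directory listing; -nH and --cut-dirs=6 flatten the local paths by
+    # # dropping the host name and the first six directory levels; --no-parent prevents ascending above the
+    # # starting URL; --reject 'index.html*' and --accept '*.tif' limit what is kept to the GeoTIFFs themselves.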
+ # cmd = ['wget', '--recursive', '-nH', '--cut-dirs=6', '--no-parent', '--reject', 'index.html*', + # '--accept', '*.tif', f'{cn.mineral_soil_C_url}'] + # uu.log_subprocess_output_full(cmd) + # + # uu.print_log("Unzipping mangrove soil C rasters...") + # cmd = ['unzip', '-j', cn.name_mangrove_soil_C, '-d', cn.docker_tile_dir] + # uu.log_subprocess_output_full(cmd) + # + # # Mangrove soil receives precedence over mineral soil + # uu.print_log("Making mangrove soil C vrt...") + # check_call('gdalbuildvrt mangrove_soil_C.vrt *{}*.tif'.format(cn.pattern_mangrove_soil_C_raw), shell=True) + # uu.print_log("Done making mangrove soil C vrt") + # + # uu.print_log("Making mangrove soil C tiles...") + # + # if cn.SINGLE_PROCESSOR: + # for tile_id in tile_id_list: + # create_soil_C.create_mangrove_soil_C(tile_id) + # else: + # if cn.count == 96: + # processes = 36 # 32 processors = 570 GB peak; 36 = 590 GB peak + # else: + # processes = int(cn.count/3) + # uu.print_log('Mangrove soil C max processors=', processes) + # pool = multiprocessing.Pool(processes) + # pool.map(partial(create_soil_C.create_mangrove_soil_C), tile_id_list) + # pool.close() + # pool.join() + # + # uu.print_log('Done making mangrove soil C tiles', "\n") + # + # # If no_upload flag is not activated (by choice or by lack of AWS credentials), output is uploaded to s3 + # if not cn.NO_UPLOAD: + # + # uu.print_log("Uploading non-mangrove soil C density tiles") + # uu.upload_final_set(output_dir_list[0], output_pattern_list[0]) + # + # uu.print_log("Making mineral soil C vrt...") + # check_call('gdalbuildvrt mineral_soil_C.vrt *{}*'.format(cn.pattern_mineral_soil_C_raw), shell=True) + # uu.print_log("Done making mineral soil C vrt") + # + # # Creates mineral soil C density tiles + # if cn.SINGLE_PROCESSOR: + # for tile_id in tile_id_list: + # create_soil_C.create_mineral_soil_C(tile_id) + # else: + # source_raster = 'mineral_soil_C.vrt' + # out_pattern = cn.pattern_soil_C_full_extent_2000_non_mang + # dt = 'Int16' + # if cn.count == 96: + # processes = 90 # 32 processors = 100 GB peak; 50 = 160 GB peak; 80 = >190 GB peak; 90 = XXX GB peak + # else: + # processes = int(cn.count/2) + # uu.print_log(f"Creating mineral soil C density tiles with {processes} processors...") + # pool = multiprocessing.Pool(processes) + # pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), + # tile_id_list) + # pool.close() + # pool.join() + # + # uu.print_log("Done making non-mangrove soil C tiles", "\n") + # + # output_pattern = cn.pattern_soil_C_full_extent_2000_non_mang + # processes = 60 # 50 processors = ~450 GB peak; 60 = XXX GB peak + # uu.print_log(f"Checking for empty tiles of {output_pattern} pattern with {processes} processors...") + # pool = multiprocessing.Pool(processes) + # pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) + # pool.close() + # pool.join() + # + # # If no_upload flag is not activated (by choice or by lack of AWS credentials), output is uploaded to s3 + # if not cn.NO_UPLOAD: + # + # uu.print_log("Uploading non-mangrove soil C density tiles") + # uu.upload_final_set(output_dir_list[1], output_pattern_list[1]) + # + # + # uu.print_log("Making combined (mangrove & non-mangrove) soil C tiles...") + # + # # New tile list specifically for soil carbon tile sets + # if len(tile_id_list) > 10: + # # List of tiles to run in the model + # tile_id_list = uu.create_combined_tile_list( + # [cn.soil_C_mangrove_processed_dir, 
cn.soil_C_full_extent_2000_non_mang_dir], + # sensit_type=cn.SENSIT_TYPE) + # + # uu.print_log(tile_id_list) + # uu.print_log(f'There are {str(len(tile_id_list))} tiles to process', "\n") + # + # if cn.SINGLE_PROCESSOR: + # for tile_id in tile_id_list: + # create_soil_C.create_combined_soil_C(tile_id) + # else: + # if cn.count == 96: + # processes = 62 # 45 processors = 420 GB peak; 55 = 500 GB peak; 62 = XXX GB peak + # else: + # processes = int(cn.count/2) + # uu.print_log('Combined soil C max processors=', processes) + # pool = multiprocessing.Pool(processes) + # pool.map(partial(create_soil_C.create_combined_soil_C), tile_id_list) + # pool.close() + # pool.join() + # + # uu.print_log("Done making combined soil C tiles") + # + # # If no_upload flag is not activated (by choice or by lack of AWS credentials), output is uploaded + # if not cn.NO_UPLOAD: + # + # uu.print_log("Uploading combined soil C density tiles") + # uu.upload_final_set(output_dir_list[2], output_pattern_list[2]) + + + # # Need to delete raw soil c density rasters because they have the same pattern as the standard deviation rasters + # uu.print_log("Deleting raw soil C density rasters") + # c_stocks = glob.glob(f'*{cn.pattern_soil_C_full_extent_2000}*') + # for c_stock in c_stocks: + # os.remove(c_stock) + + + ### Soil carbon density uncertainty + + # Separate directories for the 5% CI and 95% CI + dir_CI05 = '{0}{1}'.format(cn.docker_tile_dir, 'CI05/') + dir_CI95 = '{0}{1}'.format(cn.docker_tile_dir, 'CI95/') + vrt_CI05 = 'mineral_soil_C_CI05.vrt' + vrt_CI95 = 'mineral_soil_C_CI95.vrt' + soil_C_stdev_global = 'soil_C_stdev.tif' + + # Download raw mineral soil C density 5% CI tiles + # First tries to download index.html.tmp from every folder, then goes back and downloads all the tifs in each folder + # Based on https://stackoverflow.com/questions/273743/using-wget-to-recursively-fetch-a-directory-with-arbitrary-files-in-it + # Like soil C density rasters, there are 12951 tifs and they take about 3 hours to download. + os.mkdir(dir_CI05) + + cmd = ['wget', '--recursive', '-nH', '--cut-dirs=6', '--no-parent', '--reject', 'index.html*', + '--directory-prefix={}'.format(dir_CI05), + '--accept', '*.tif', f'{cn.CI5_mineral_soil_C_url}'] + uu.log_subprocess_output_full(cmd) + + uu.print_log("Making mineral soil C 5% CI vrt...") + + check_call('gdalbuildvrt {0} {1}*{2}*'.format(vrt_CI05, dir_CI05, cn.pattern_uncert_mineral_soil_C_raw), shell=True) + uu.print_log("Done making mineral soil C CI05 vrt") + + # Download raw mineral soil C density 95% CI tiles + # Like soil C density rasters, there are 12951 tifs and they take about 3 hours to download. + os.mkdir(dir_CI95) + + cmd = ['wget', '--recursive', '-nH', '--cut-dirs=6', '--no-parent', '--reject', 'index.html*', + '--directory-prefix={}'.format(dir_CI95), + '--accept', '*.tif', f'{cn.CI95_mineral_soil_C_url}'] + uu.log_subprocess_output_full(cmd) + + uu.print_log("Making mineral soil C 95% CI vrt...") + + check_call('gdalbuildvrt {0} {1}*{2}*'.format(vrt_CI95, dir_CI95, cn.pattern_uncert_mineral_soil_C_raw), shell=True) + uu.print_log("Done making mineral soil C CI95 vrt") + + + uu.print_log("Creating raster of standard deviations in soil C at native SoilGrids250 resolution. This may take a while...") + # global tif with approximation of the soil C standard deviation (based on the 5% and 95% CIs) + + # This takes about 20 minutes. It doesn't show any progress until the last moment, when it quickly counts + # up to 100.
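+    # The (CI95 - CI05)/3 calculation below is a rough approximation: if the per-pixel soil C distribution is
+    # roughly normal, the 5th-95th percentile range spans about 2 x 1.645 = 3.29 standard deviations, so dividing
+    # the interval width by 3 gives a slightly high estimate of the standard deviation. For example, CI05 = 30 and
+    # CI95 = 60 Mg C/ha give an approximate standard deviation of (60 - 30) / 3 = 10 Mg C/ha.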
+ calc = '--calc=(A-B)/3' + out_filearg = '--outfile={}'.format(soil_C_stdev_global) + cmd = ['gdal_calc.py', '-A', vrt_CI95, '-B', vrt_CI05, calc, out_filearg, + '--NoDataValue=0', '--overwrite', '--co', 'COMPRESS=DEFLATE', '--type=Float32'] + uu.log_subprocess_output_full(cmd) + + uu.print_log(f"{soil_C_stdev_global} created.") + + + # Creates soil carbon 2000 density standard deviation tiles + out_pattern = cn.pattern_stdev_soil_C_full_extent + dt = 'Float32' + source_raster = soil_C_stdev_global + if cn.count == 96: + processes = 56 # 32 processors = 290 GB peak; 56 = XXX GB peak + else: + processes = 2 + uu.print_log(f"Creating mineral soil C stock stdev tiles with {processes} processors...") + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), + tile_id_list) + pool.close() + pool.join() + + + output_pattern = cn.pattern_stdev_soil_C_full_extent + processes = 50 # 50 processors = 550 GB peak + uu.print_log(f"Checking for empty tiles of {output_pattern} pattern with {processes} processors...") + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) + pool.close() + pool.join() + + + # Checks the soil C outputs for tiles with no data + for output_pattern in output_pattern_list: + if cn.count <= 2: # For local tests + processes = 1 + uu.print_log(f"Checking for empty tiles of {output_pattern} pattern with {processes} processors using light function...") + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.check_and_delete_if_empty_light, output_pattern=output_pattern), tile_id_list) + pool.close() + pool.join() + else: + processes = 55 # 50 processors = XXX GB peak + uu.print_log( + f"Checking for empty tiles of {output_pattern} pattern with {processes} processors...") + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) + pool.close() + pool.join() + + + # If no_upload flag is not activated (by choice or by lack of AWS credentials), output is uploaded + if not cn.NO_UPLOAD: + + uu.print_log("Uploading soil C density standard deviation tiles") + uu.upload_final_set(output_dir_list[3], output_pattern_list[3]) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description='Creates tiles of soil carbon density in 2000') + parser.add_argument('--tile_id_list', '-l', required=True, + help='List of tile ids to use in the model. Should be of form 00N_110E or 00N_110E,00N_120E or all.') + parser.add_argument('--run-date', '-d', required=False, + help='Date of run.
Must be format YYYYMMDD.') + parser.add_argument('--single-processor', '-sp', action='store_true', + help='Uses single processing rather than multiprocessing') + parser.add_argument('--no-upload', '-nu', action='store_true', + help='Disables uploading of outputs to s3') + + args = parser.parse_args() + + tile_id_list = args.tile_id_list + cn.RUN_DATE = args.run_date + cn.SINGLE_PROCESSOR = args.single_processor + cn.NO_UPLOAD = args.no_upload + + # Disables upload to s3 if no AWS credentials are found in environment + if not uu.check_aws_creds(): + cn.NO_UPLOAD = True + + # Create the output log + uu.initiate_log(tile_id_list) + tile_id_list = uu.tile_id_list_check(tile_id_list) + + mp_create_soil_C(tile_id_list=tile_id_list) \ No newline at end of file diff --git a/data_prep/mp_model_extent.py b/data_prep/mp_model_extent.py index 0956bf65..ca38e2bf 100644 --- a/data_prep/mp_model_extent.py +++ b/data_prep/mp_model_extent.py @@ -39,7 +39,7 @@ def mp_model_extent(tile_id_list): else: tile_id_list = uu.create_combined_tile_list( [cn.WHRC_biomass_2000_unmasked_dir, cn.mangrove_biomass_2000_dir, cn.gain_dir, cn.tcd_dir, - cn.annual_gain_AGC_BGC_planted_forest_unmasked_dir], + cn.annual_gain_AGC_BGC_planted_forest_dir], sensit_type=cn.SENSIT_TYPE) uu.print_log(tile_id_list) @@ -49,7 +49,7 @@ def mp_model_extent(tile_id_list): # Files to download for this script. download_dict = { cn.mangrove_biomass_2000_dir: [cn.pattern_mangrove_biomass_2000], - cn.gain_dir: [cn.pattern_gain_data_lake] + cn.gain_dir: [cn.pattern_data_lake] } if cn.SENSIT_TYPE == 'legal_Amazon_loss': diff --git a/data_prep/mp_plantation_preparation.py b/data_prep/mp_plantation_preparation.py deleted file mode 100644 index 06f215dc..00000000 --- a/data_prep/mp_plantation_preparation.py +++ /dev/null @@ -1,504 +0,0 @@ -''' -Code for creating 10x10 degree tiles of aboveground+belowground carbon accumulation rate -and plantation category out of the plantation geodatabase. -Its outputs are two sets of tiles at the full extent of planted forest features (not masked by mangrove or -non-mangrove natural forest): above+belowground carbon accumulation rate, and plantation type (1: oil palm, 2: wood fiber, -3: other). -Unlike other carbon model scripts, this one requires prep work done outside of the script (the large, commented chunk -below this). -Once that prep work of converting the gdb to a PostGIS table has been done, there are a few entry points to this script, -unlike other carbon model scripts. -Which entry point is chosen depends on what has changed in the plantation data since it was last processed. -## NOTE: The entry point for the script is indicated by supplying two arguments following the python call. -The entry points rely on two shapefiles of the indexes (outlines) of 1x1 tiles created during plantation processing: -1. Shapefile of 1x1 tiles of the countries with planted forests (basically, countries with planted forests rasterized to 1x1 deg), -2. Shapefile of 1x1 tiles of planted forest extent (basically, planted forest extent rasterized to 1x1 deg). -These shapefiles should have been created during the previous run of the processing script and be on s3. - -First entry point: Script runs from the beginning. Do this if the planted forest database now includes countries -that were not in it during the previous planted forest growth rate processing. It will take several days to run. -## NOTE: This also requires updating the list of countries with planted forests in constants_and_names.plantation_countries. 
-This entry point is accessed by supplying None to both arguments, i.e. mp_plantation_preparation.py None None - -Second entry point: Script uses existing index shapefile of 1x1 tiles of countries with planted forests. Use this entry point -if no countries have been added to the planted forest database but some other spatial aspect of the data has changed -since the last processing, e.g., newly added planted forests in countries already in the database or the boundaries -of existing features have been altered. This entry point will use the supplied index shapefile of the 1x1 tiles of -countries with planted forests to create new 1x1 planted forest growth rate tiles. This entry point is accessed by -providing the s3 location of the index shapefile of the 1x1 country tiles, -e.g., python mp_plantation_preparation.py -gi s3://gfw2-data/climate/carbon_model/gadm_plantation_1x1_tile_index/gadm_index_1x1_20190108.shp -pi None - -Third entry point: Script uses existing index shapefile of 1x1 tiles of planted forest extent to create new 1x1 tiles -of planted forest growth rates. Use this entry point if the spatial properties of the database haven't changed but -the growth rates or forest type have. This route will iterate through only the 1x1 tiles that had planted forests previously and -create new planted forest growth rate tiles for them. -This entry point is accessed by providing the s3 location of the index shapefile of the 1x1 -planted forest extent tiles, -e.g., python mp_plantation_preparation.py -gi None -pi s3://gfw2-data/climate/carbon_model/gadm_plantation_1x1_tile_index/plantation_index_1x1_20190813.shp -e.g., python mp_plantation_preparation.py -gi s3://gfw2-data/climate/carbon_model/gadm_plantation_1x1_tile_index/gadm_index_1x1_20190108.shp -pi s3://gfw2-data/climate/carbon_model/gadm_plantation_1x1_tile_index/plantation_index_1x1_20190813.shp - -All entry points conclude with creating 10x10 degree tiles of planted forest carbon accumulation rates and -planted forest type from 1x1 tiles of planted forest extent. - -To run this for just a part of the world, create a new shapefile of 1x1 GADM or plantation tile boundaries (making sure that they -extend to 10x10 degree tiles (that is, the area being processed must match 10x10 degree tiles) and use that as a -command line argument. Supply the rest of the global tiles (the unchanged tiles) from the previous model run. -''' - -""" -Before running this script, the plantation gdb must be converted into a PostGIS table. That's more easily done as a series -of commands than as a script. Below are the instructions for creating a single PostGIS table of all plantations. -This assumes that the plantation gdb has one feature class for each country with plantations and that -each country's feature class's attribute table has a growth rate column named "growth". - -# Start a r5d.24xlarge spot machine -spotutil new r5d.24xlarge dgibbs_wri - -# Change directory to where the data are kept in the docker -cd /usr/local/tiles/ - -# Copy zipped plantation gdb with growth rate field in tables -aws s3 cp s3://gfw-files/plantations/final/global/plantations_v3_1.gdb.zip . - -# Unzip the zipped plantation gdb. This can take several minutes. -unzip plantations_v3_1.gdb.zip - -# Start the postgres service and check that it has actually started -service postgresql restart -pg_lsclusters - -# Create a postgres database called ubuntu. 
I tried adding RUN createdb ubuntu to the Dockerfile after RUN service postgresql restart -# but got the error: -# createdb: could not connect to database template1: could not connect to server: No such file or directory. -# Is the server running locally and accepting -# So I'm adding that step here. -createdb ubuntu - -# Enter the postgres database called ubuntu and add the postgis exension to it -psql -CREATE EXTENSION postgis; -\q - - -##### NOTE: I HAVEN'T ACTUALLY CHECKED THAT THE BELOW PROCEDURES REALLY RESULT IN A TABLE THAT CAN BE RASTERIZED CORRECTLY. -##### I JUST CHECKED THAT ROWS WERE BEING ADDED TO THE TABLE - - -# Add the feature class of one country's plantations to PostGIS. This creates the "all_plant" table for other countries to be appended to. -# Using ogr2ogr requires the PG connection info but entering the PostGIS shell (psql) doesn't. -ogr2ogr -f Postgresql PG:"dbname=ubuntu" plantations_v3_1.gdb -progress -nln all_plant -sql "SELECT growth, species_simp, SD_error FROM cmr_plant" - -# Enter PostGIS and check that the table is there and that it has only the growth field. -psql -\d+ all_plant; -SELECT * FROM all_plant LIMIT 2; # To see what the first two rows look like -SELECT COUNT (*) FROM all_plant; # Should be 697 for CMR in plantations v3.1 - -# Delete all rows from the table so that it is now empty -DELETE FROM all_plant; - -# Exit the PostGIS shell -\q - -# Get a list of all feature classes (countries) in the geodatabase and save it as a txt -ogrinfo plantations_v3_1.gdb | cut -d: -f2 | cut -d'(' -f1 | grep plant | grep -v Open | sed -e 's/ //g' > out.txt - -# Make sure all the country tables are listed in the txt, then exit it -more out.txt -q - -# Run a loop in bash that iterates through all the gdb feature classes and imports them to the all_plant PostGIS table. -# I think it's okay that there's a message "Warning 1: organizePolygons() received a polygon with more than 100 parts. The processing may be really slow. You can skip the processing by setting METHOD=SKIP, or only make it analyze counter-clock wise parts by setting METHOD=ONLY_CCW if you can assume that the outline of holes is counter-clock wise defined" -# It just seems to mean that the processing is slow, but the processing methods haven't changed. 
-while read p; do echo $p; ogr2ogr -f Postgresql PG:"dbname=ubuntu" plantations_v3_1.gdb -nln all_plant -progress -append -sql "SELECT growth, species_simp, SD_error FROM $p"; done < out.txt - -# Create a spatial index of the plantation table to speed up the intersections with 1x1 degree tiles -psql -CREATE INDEX IF NOT EXISTS all_plant_index ON all_plant using gist(shape); - -# Adds a new column to the table and stores the plantation type reclassified as 1 (palm), 2 (wood fiber), or 3 (other) -ALTER TABLE all_plant ADD COLUMN type_reclass SMALLINT; -# Based on https://stackoverflow.com/questions/15766102/i-want-to-use-case-statement-to-update-some-records-in-sql-server-2005 -UPDATE all_plant SET type_reclass = ( CASE WHEN species_simp = 'Oil Palm ' then '1' when species_simp = 'Oil Palm Mix ' then '1' when species_simp = 'Oil Palm ' then '1' when species_simp = 'Oil Palm Mix' then 1 when species_simp = 'Wood fiber / timber' then 2 when species_simp = 'Wood fiber / timber ' then 2 ELSE '3' END ); - -# Exit Postgres shell -\q -""" - -import plantation_preparation -from multiprocessing.pool import Pool -from functools import partial -import glob -import datetime -from subprocess import Popen, PIPE, STDOUT, check_call -import argparse -import os -from simpledbf import Dbf5 -import sys -sys.path.append('../') -import constants_and_names as cn -import universal_util as uu - - -def mp_plantation_preparation(gadm_index_shp, planted_index_shp, tile_id_list, run_date = None, no_upload = None): - - os.chdir(cn.docker_tile_dir) - - # ## Not actually using this but leaving it here in case I want to add this functionality eventually. This - # # was to allow users to run plantations for a select (contiguous) area rather than for the whole planet. - # # List of bounding box coordinates - # bound_list = args.bounding_box - # # Checks if bounding box coordinates are in multiples of 10 (10 degree tiles). If they're not, the script stops. - # for bound in bound_list: - # if bound%10: - # uu.exception_log(bound, 'not a multiple of 10. Please make bounding box coordinates are multiples of 10.') - - # Checks the validity of the two arguments. If either one is invalid, the script ends. - if (gadm_index_path not in cn.gadm_plant_1x1_index_dir or planted_index_path not in cn.gadm_plant_1x1_index_dir): - uu.exception_log('Invalid inputs. Please provide None or s3 shapefile locations for both arguments.') - - - # If a full model run is specified, the correct set of tiles for the particular script is listed - if tile_id_list == 'all': - # List of all possible 10x10 Hansen tiles except for those at very extreme latitudes (not just WHRC biomass tiles) - total_tile_list = uu.tile_list_s3(cn.pixel_area_dir) - - # Removes the latitude bands that don't have any planted forests in them according to Liz Goldman. - # i.e., Liz Goldman said by Slack on 1/2/19 that the nothernmost planted forest is 69.5146 and the southernmost is -46.938968. - # This creates a more focused list of 10x10 tiles to iterate through (removes ones that definitely don't have planted forest). - # NOTE: If the planted forest gdb is updated, the list of latitudes to exclude below may need to be changed to not exclude certain latitude bands. 
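# Aside (hypothetical helper, not in the script): the three entry points described in the
# module docstring are selected purely from the two index-shapefile arguments, and the
# branching further below boils down to this:
def select_entry_point(gadm_tile_index, planted_tile_index):
    if 'None' in planted_tile_index:
        if 'None' in gadm_tile_index:
            return 1  # build 1x1 country-extent tiles from scratch, then 1x1 plantation tiles
        return 2      # reuse the 1x1 country-extent index, rebuild 1x1 plantation tiles
    return 3          # reuse the 1x1 planted-forest index, rebuild growth/type/stdev tiles

# e.g. select_entry_point('None', 'None') == 1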
- planted_lat_tile_list = [tile for tile in total_tile_list if '90N' not in tile] - planted_lat_tile_list = [tile for tile in planted_lat_tile_list if '80N' not in tile] - planted_lat_tile_list = [tile for tile in planted_lat_tile_list if '50S' not in tile] - planted_lat_tile_list = [tile for tile in planted_lat_tile_list if '60S' not in tile] - planted_lat_tile_list = [tile for tile in planted_lat_tile_list if '70S' not in tile] - planted_lat_tile_list = [tile for tile in planted_lat_tile_list if '80S' not in tile] - uu.print_log(planted_lat_tile_list) - uu.print_log("There are {} tiles to process after extreme latitudes have been removed".format(str(len(planted_lat_tile_list))) + "\n") - else: - planted_lat_tile_list = tile_id_list - uu.print_log("There are {} tiles to process".format(str(len(planted_lat_tile_list))) + "\n") - - - ################# Stopped updating plantation processing script to use tile_id_list argument here. - ################# But I didn't check the earlier work, either. - - - - - # If a planted forest extent 1x1 tile index shapefile isn't supplied - if 'None' in args.planted_tile_index: - - ### Entry point 1: - # If no shapefile of 1x1 tiles for countries with planted forests is supplied, 1x1 tiles of country extents will be created. - # This runs the process from the very beginning and will take a few days. - if 'None' in args.gadm_tile_index: - - uu.print_log("No GADM 1x1 tile index shapefile provided. Creating 1x1 planted forest country tiles from scratch...") - - # Downloads and unzips the GADM shapefile, which will be used to create 1x1 tiles of land areas - uu.s3_file_download(cn.gadm_path, cn.docker_tile_dir) - cmd = ['unzip', cn.gadm_zip] - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - # Creates a new GADM shapefile with just the countries that have planted forests in them. - # This limits creation of 1x1 rasters of land area on the countries that have planted forests rather than on all countries. - # NOTE: If the planted forest gdb is updated and has new countries added to it, the planted forest country list - # in constants_and_names.py must be updated, too. - uu.print_log("Creating shapefile of countries with planted forests...") - os.system('''ogr2ogr -sql "SELECT * FROM gadm_3_6_adm2_final WHERE iso IN ({0})" {1} gadm_3_6_adm2_final.shp'''.format(str(cn.plantation_countries)[1:-1], cn.gadm_iso)) - - # Creates 1x1 degree tiles of countries that have planted forests in them. - # I think this can handle using 50 processors because it's not trying to upload files to s3 and the tiles are small. - # This takes several days to run because it iterates through at least 250 10x10 tiles. - # For multiprocessor use. - processes = 50 - uu.print_log('Rasterize GADM 1x1 max processors=', processes) - pool = Pool(processes) - pool.map(plantation_preparation.rasterize_gadm_1x1, planted_lat_tile_list) - pool.close() - pool.join() - - # # Creates 1x1 degree tiles of countries that have planted forests in them. - # # For single processor use. 
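# Illustrative only: plantation_preparation.rasterize_gadm_1x1 is not shown in this file.
# Producing one 1x1 degree country-extent tile could look roughly like the call below; the
# 0.00025 degree resolution, burn value, and GADM_<ymax>_<xmin>.tif naming are assumptions,
# not the confirmed implementation. In this script the input would be the cn.gadm_iso
# shapefile created above.
import os

def rasterize_gadm_1x1_sketch(xmin, ymax, gadm_shp='gadm_plantation_countries.shp'):
    xmax, ymin = xmin + 1, ymax - 1
    out_tif = f'GADM_{ymax}_{xmin}.tif'
    os.system(f'gdal_rasterize -burn 1 -ot Byte -tr 0.00025 0.00025 '
              f'-te {xmin} {ymin} {xmax} {ymax} -co COMPRESS=DEFLATE {gadm_shp} {out_tif}')
    return out_tif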
- # for tile in planted_lat_tile_list: - # - # plantation_preparation.rasterize_gadm_1x1(tile) - - # Creates a shapefile of the boundaries of the 1x1 GADM tiles in countries with planted forests - os.system('''gdaltindex {0}_{1}.shp GADM_*.tif'''.format(cn.pattern_gadm_1x1_index, uu.date_time_today)) - cmd = ['aws', 's3', 'cp', cn.docker_tile_dir, cn.gadm_plant_1x1_index_dir, '--exclude', '*', '--include', '{}*'.format(cn.pattern_gadm_1x1_index), '--recursive'] - - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - - # # Saves the 1x1 country extent tiles to s3 - # # Only use if the entire process can't run in one go on the spot machine - # cmd = ['aws', 's3', 'cp', cn.docker_base_dir, 's3://gfw2-data/climate/carbon_model/temp_spotmachine_output/', '--exclude', '*', '--include', 'GADM_*.tif', '--recursive'] - - # # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - # process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - # with process.stdout: - # uu.log_subprocess_output(process.stdout) - - - # Delete the aux.xml files - os.system('''rm GADM*.tif.*''') - - # List of all 1x1 degree countey extent tiles created - gadm_list_1x1 = uu.tile_list_spot_machine(".", "GADM_") - uu.print_log("List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:", gadm_list_1x1) - uu.print_log(len(gadm_list_1x1)) - - ### Entry point 2: - # If a shapefile of the boundaries of 1x1 degree tiles of countries with planted forests is supplied, - # a list of the 1x1 tiles is created from the shapefile. - # This avoids creating the 1x1 country extent tiles all over again because the relevant tile extent are supplied - # in the shapefile. - elif cn.gadm_plant_1x1_index_dir in args.gadm_tile_index: - - uu.print_log("Country extent 1x1 tile index shapefile supplied. Using that to create 1x1 planted forest tiles...") - - uu.print_log('{}/'.format(gadm_index_path)) - - # Copies the shapefile of 1x1 tiles of extent of countries with planted forests - cmd = ['aws', 's3', 'cp', '{}/'.format(gadm_index_path), cn.docker_tile_dir, '--recursive', '--exclude', '*', '--include', '{}*'.format(gadm_index_shp)] - - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - # Gets the attribute table of the country extent 1x1 tile shapefile - gadm = glob.glob('{}*.dbf'.format(cn.pattern_gadm_1x1_index))[0] - - # Converts the attribute table to a dataframe - dbf = Dbf5(gadm) - df = dbf.to_dataframe() - - # Converts the column of the dataframe with the names of the tiles (which contain their coordinates) to a list - gadm_list_1x1 = df['location'].tolist() - gadm_list_1x1 = [str(y) for y in gadm_list_1x1] - uu.print_log("List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:", gadm_list_1x1) - uu.print_log("There are", len(gadm_list_1x1), "1x1 country extent tiles to iterate through.") - - # In case some other arguments are provided - else: - uu.exception_log('Invalid GADM tile index shapefile provided. 
Please provide a valid shapefile.') - - # Creates 1x1 degree tiles of plantation growth wherever there are plantations. - # Because this is iterating through all 1x1 tiles in countries with planted forests, it first checks - # whether each 1x1 tile intersects planted forests before creating a 1x1 planted forest tile for that - # 1x1 country extent tile. - # 55 processors seems to use about 350 GB of memory, which seems fine. But there was some error about "PQconnectdb failed-- sorry, too many clients already". - # So, moved the number of processors down to 48. - # For multiprocessor use - processes = 48 - uu.print_log('Create 1x1 plantation from 1x1 gadm max processors=', processes) - pool = Pool(processes) - pool.map(plantation_preparation.create_1x1_plantation_from_1x1_gadm, gadm_list_1x1) - pool.close() - pool.join() - - # # Creates 1x1 degree tiles of plantation growth wherever there are plantations - # # For single processor use - # for tile in gadm_list_1x1: - # - # plantation_preparation.create_1x1_plantation(tile) - - # Creates a shapefile in which each feature is the extent of a plantation extent tile. - # This index shapefile can be used the next time this process is run if starting with Entry Point 3. - os.system('''gdaltindex {0}_{1}.shp plant_gain_*.tif'''.format(cn.pattern_plant_1x1_index, uu.date_time_today)) - cmd = ['aws', 's3', 'cp', cn.docker_tile_dir, cn.gadm_plant_1x1_index_dir, '--exclude', '*', '--include', '{}*'.format(cn.pattern_plant_1x1_index), '--recursive'] - - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - ### Entry point 3 - # If a shapefile of the extents of 1x1 planted forest tiles is provided. - # This is the part that actually creates the sequestration rate and forest type tiles. - - if cn.pattern_plant_1x1_index in args.planted_tile_index: - - uu.print_log("Planted forest 1x1 tile index shapefile supplied. Using that to create 1x1 planted forest growth rate and forest type tiles...") - - # Copies the shapefile of 1x1 tiles of extent of planted forests - cmd = ['aws', 's3', 'cp', '{}/'.format(planted_index_path), cn.docker_tile_dir, '--recursive', '--exclude', '*', '--include', - '{}*'.format(planted_index_shp), '--recursive'] - - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - - # Gets the attribute table of the planted forest extent 1x1 tile shapefile - gadm = glob.glob('{}*.dbf'.format(cn.pattern_plant_1x1_index))[0] - - # Converts the attribute table to a dataframe - dbf = Dbf5(gadm) - df = dbf.to_dataframe() - - # Converts the column of the dataframe with the names of the tiles (which contain their coordinates) to a list - planted_list_1x1 = df['location'].tolist() - planted_list_1x1 = [str(y) for y in planted_list_1x1] - uu.print_log("List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:", planted_list_1x1) - uu.print_log("There are", len(planted_list_1x1), "1x1 planted forest extent tiles to iterate through.") - - # Creates 1x1 degree tiles of plantation growth and type wherever there are plantations. 
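# Illustrative sketch of the PostGIS-to-raster step that create_1x1_plantation_from_1x1_gadm
# performs for each 1x1 country-extent tile: select the all_plant features that intersect the
# tile and burn the growth column into a raster. The table, growth, and shape names come from
# the docstring above; the resolution and plant_gain_* naming are assumptions. Each call opens
# its own PostGIS connection, which is why too many parallel workers can hit the
# "sorry, too many clients already" error noted above.
import os

def create_1x1_plantation_sketch(xmin, ymin, dbname='ubuntu'):
    xmax, ymax = xmin + 1, ymin + 1
    sql = (f'SELECT growth, shape FROM all_plant WHERE ST_Intersects(shape, '
           f'ST_MakeEnvelope({xmin}, {ymin}, {xmax}, {ymax}, 4326))')
    out_tif = f'plant_gain_{ymax}_{xmin}.tif'
    os.system(f'gdal_rasterize -a growth -sql "{sql}" -te {xmin} {ymin} {xmax} {ymax} '
              f'-tr 0.00025 0.00025 -ot Float32 -co COMPRESS=DEFLATE '
              f'PG:"dbname={dbname}" {out_tif}')
    return out_tif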
- # Because this is iterating through only 1x1 tiles that are known to have planted forests (from a previous run - # of this script), it does not need to check whether there are planted forests in this tile. It goes directly - # to intersecting the planted forest table with the 1x1 tile. - - # For single processor use - #for tile in planted_list_1x1: - # plantation_preparation.create_1x1_plantation_growth_from_1x1_planted(tile) - - # For multiprocessor use - # processes=40 uses about 360 GB of memory. Works on r4.16xlarge with space to spare - # processes=52 uses about 465 GB of memory (quite stably), so this is basically the max. - num_of_processes = 52 - pool = Pool(num_of_processes) - pool.map(plantation_preparation.create_1x1_plantation_growth_from_1x1_planted, planted_list_1x1) - pool.close() - pool.join() - - # This works with 50 processors on an r4.16xlarge marchine. Uses about 430 GB out of 480 GB. - num_of_processes = 52 - pool = Pool(num_of_processes) - processes = 50 - uu.print_log('Create 1x1 plantation type max processors=', processes) - pool = Pool(processes) - pool.map(plantation_preparation.create_1x1_plantation_type_from_1x1_planted, planted_list_1x1) - pool.close() - pool.join() - - # This rasterizes the plantation removal factor standard deviations - # processes=50 peaks at about 450 GB - num_of_processes = 50 - pool = Pool(num_of_processes) - pool.map(plantation_preparation.create_1x1_plantation_stdev_from_1x1_planted, planted_list_1x1) - pool.close() - pool.join() - - - ### All script entry points meet here: creation of 10x10 degree planted forest removals rate and rtpe tiles - ### from 1x1 degree planted forest removals rate and type tiles - - # Name of the vrt of 1x1 planted forest removals rate tiles - plant_gain_1x1_vrt = 'plant_gain_1x1.vrt' - - # Creates a mosaic of all the 1x1 plantation removals rate tiles - uu.print_log("Creating vrt of 1x1 plantation removals rate tiles") - os.system('gdalbuildvrt {} plant_gain_*.tif'.format(plant_gain_1x1_vrt)) - - # Creates 10x10 degree tiles of plantation removals rate by iterating over the set of pixel area tiles supplied - # at the start of the script that are in latitudes with planted forests. - # For multiprocessor use - processes = 20 - uu.print_log('Create 10x10 plantation removals rate max processors=', processes) - pool = Pool(processes) - pool.map(partial(plantation_preparation.create_10x10_plantation_gain, plant_gain_1x1_vrt=plant_gain_1x1_vrt), planted_lat_tile_list) - pool.close() - pool.join() - - # Creates 10x10 degree tiles of plantation removals rate by iterating over the set of pixel area tiles supplied - #at the start of the script that are in latitudes with planted forests. - # For single processor use - #for tile in planted_lat_tile_list: - # plantation_preparation.create_10x10_plantation_gain(tile, plant_gain_1x1_vrt) - - - # Name of the vrt of 1x1 planted forest type tiles - plant_type_1x1_vrt = 'plant_type_1x1.vrt' - - # Creates a mosaic of all the 1x1 plantation type tiles - uu.print_log("Creating vrt of 1x1 plantation type tiles") - os.system('gdalbuildvrt {} plant_type_*.tif'.format(plant_type_1x1_vrt)) - - # Creates 10x10 degree tiles of plantation type by iterating over the set of pixel area tiles supplied - # at the start of the script that are in latitudes with planted forests. 
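# Aside (sketch, not part of the script): the 1x1 steps above repeat the same Pool
# boilerplate, and the plantation-type step builds a Pool twice in a row
# (Pool(num_of_processes) immediately followed by Pool(processes)), leaving the first pool
# unused. A hypothetical helper keeps each step to a single pool:
from multiprocessing.pool import Pool

def run_pooled(func, items, processes, label):
    uu.print_log(f'{label}: max processors={processes}')
    pool = Pool(processes)
    pool.map(func, items)
    pool.close()
    pool.join()

# e.g. run_pooled(plantation_preparation.create_1x1_plantation_type_from_1x1_planted,
#                 planted_list_1x1, 50, 'Create 1x1 plantation type')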
- # For multiprocessor use - num_of_processes = 26 - pool = Pool(num_of_processes) - uu.print_log('Create 10x10 plantation type max processors=', processes) - pool.map(partial(plantation_preparation.create_10x10_plantation_type, plant_type_1x1_vrt=plant_type_1x1_vrt), planted_lat_tile_list) - pool.close() - pool.join() - - # # Creates 10x10 degree tiles of plantation type by iterating over the set of pixel area tiles supplied - # at the start of the script that are in latitudes with planted forests. - # # For single processor use - # for tile in planted_lat_tile_list: - # - # plantation_preparation.create_10x10_plantation_type(tile, plant_type_1x1_vrt) - - - - # Name of the vrt of 1x1 planted forest removals rate standard deviation tiles - plant_stdev_1x1_vrt = 'plant_stdev_1x1.vrt' - - # Creates a mosaic of all the 1x1 plantation removals rate standard deviation tiles - uu.print_log("Creating vrt of 1x1 plantation removals rate standard deviation tiles") - os.system('gdalbuildvrt {} plant_stdev_*.tif'.format(plant_stdev_1x1_vrt)) - - # Creates 10x10 degree tiles of plantation removals rate standard deviation by iterating over the set of pixel area tiles supplied - # at the start of the script that are in latitudes with planted forests. - # For multiprocessor use - num_of_processes = 26 - pool = Pool(num_of_processes) - pool.map(partial(plantation_preparation.create_10x10_plantation_gain_stdev, plant_stdev_1x1_vrt=plant_stdev_1x1_vrt), planted_lat_tile_list) - pool.close() - pool.join() - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Create planted forest carbon removals rate tiles') - parser.add_argument('--gadm-tile-index', '-gi', required=True, - help='Shapefile of 1x1 degree tiles of countries that contain planted forests (i.e. countries with planted forests rasterized to 1x1 deg). If no shapefile, write None.') - parser.add_argument('--planted-tile-index', '-pi', required=True, - help='Shapefile of 1x1 degree tiles of that contain planted forests (i.e. planted forest extent rasterized to 1x1 deg). If no shapefile, write None.') - parser.add_argument('--tile_id_list', '-l', required=True, - help='List of tile ids to use in the model. Should be of form 00N_110E or 00N_110E,00N_120E or all.') - parser.add_argument('--run-date', '-d', required=False, - help='Date of run. 
Must be format YYYYMMDD.')
-    parser.add_argument('--no-upload', '-nu', action='store_true',
-                        help='Disables uploading of outputs to s3')
-
-    args = parser.parse_args()
-    tile_id_list = args.tile_id_list
-    run_date = args.run_date
-    no_upload = args.no_upload
-
-    # Creates the directory and shapefile names for the two possible arguments (index shapefiles)
-    gadm_index = os.path.split(args.gadm_tile_index)
-    gadm_index_path = gadm_index[0]
-    gadm_index_shp = gadm_index[1]
-    gadm_index_shp = gadm_index_shp[:-4]
-    planted_index = os.path.split(args.planted_tile_index)
-    planted_index_path = planted_index[0]
-    planted_index_shp = planted_index[1]
-    planted_index_shp = planted_index_shp[:-4]
-
-    # Disables upload to s3 if no AWS credentials are found in environment
-    if not uu.check_aws_creds():
-        no_upload = True
-
-    # Create the output log
-    uu.initiate_log(tile_id_list)
-
-    # Checks whether the tile_id_list argument is valid
-    tile_id_list = uu.tile_id_list_check(tile_id_list)
-
-    mp_plantation_preparation(gadm_index_shp=gadm_index_shp, planted_index_shp=planted_index_shp,
-                              tile_id_list=tile_id_list, run_date=run_date, no_upload=no_upload)
diff --git a/data_prep/mp_prep_other_inputs_annual.py b/data_prep/mp_prep_other_inputs_annual.py
index 99249042..711da4ee 100644
--- a/data_prep/mp_prep_other_inputs_annual.py
+++ b/data_prep/mp_prep_other_inputs_annual.py
@@ -30,7 +30,7 @@ def mp_prep_other_inputs(tile_id_list):
     # List of tiles to run in the model
     tile_id_list = uu.create_combined_tile_list(
         [cn.WHRC_biomass_2000_unmasked_dir, cn.mangrove_biomass_2000_dir, cn.gain_dir, cn.tcd_dir,
-         cn.annual_gain_AGC_BGC_planted_forest_unmasked_dir]
+         cn.annual_gain_AGC_BGC_planted_forest_dir]
     )
     uu.print_log(tile_id_list)
diff --git a/data_prep/mp_prep_other_inputs_one_off.py b/data_prep/mp_prep_other_inputs_one_off.py
index d5506f1f..181cddb3 100644
--- a/data_prep/mp_prep_other_inputs_one_off.py
+++ b/data_prep/mp_prep_other_inputs_one_off.py
@@ -7,8 +7,10 @@ python -m data_prep.mp_prep_other_inputs_one_off -l 00N_000E -nu
 python -m data_prep.mp_prep_other_inputs_one_off -l all
-'''
+#TODO make each preprocessing step have its own if statement
+#TODO make the process command line argument text rather than numeric and ability to launch all of the pre-processing steps individually
+'''
 import argparse
 import multiprocessing
 import datetime
@@ -23,7 +25,7 @@ from . 
import prep_other_inputs_one_off -def mp_prep_other_inputs(tile_id_list): +def mp_prep_other_inputs(tile_id_list, process): os.chdir(cn.docker_tile_dir) sensit_type='std' @@ -33,367 +35,248 @@ def mp_prep_other_inputs(tile_id_list): # List of tiles to run in the model tile_id_list = uu.create_combined_tile_list( [cn.WHRC_biomass_2000_unmasked_dir, cn.mangrove_biomass_2000_dir, cn.gain_dir, cn.tcd_dir, - cn.annual_gain_AGC_BGC_planted_forest_unmasked_dir] + cn.annual_gain_AGC_BGC_planted_forest_dir] ) uu.print_log(tile_id_list) uu.print_log(f'There are {str(len(tile_id_list))} tiles to process', "\n") + if process == 1: + + #List of output directories and output file name patterns + output_dir_list = [ + cn.climate_zone_processed_dir, cn.plant_pre_2000_processed_dir, + cn.ifl_primary_processed_dir, + cn.annual_gain_AGC_natrl_forest_young_dir, + cn.stdev_annual_gain_AGC_natrl_forest_young_dir, + cn.annual_gain_AGC_BGC_natrl_forest_Europe_dir, + cn.stdev_annual_gain_AGC_BGC_natrl_forest_Europe_dir, + cn.FIA_forest_group_processed_dir, + cn.age_cat_natrl_forest_US_dir, + cn.FIA_regions_processed_dir, + ] + + output_pattern_list = [ + cn.pattern_climate_zone, cn.pattern_plant_pre_2000, + cn.pattern_ifl_primary, + cn.pattern_annual_gain_AGC_natrl_forest_young, + cn.pattern_stdev_annual_gain_AGC_natrl_forest_young, + cn.pattern_annual_gain_AGC_BGC_natrl_forest_Europe, + cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_Europe, + cn.pattern_FIA_forest_group_processed, + cn.pattern_age_cat_natrl_forest_US, + cn.pattern_FIA_regions_processed, + ] + + + # If the model run isn't the standard one, the output directory and file names are changed + if sensit_type != 'std': + + uu.print_log('Changing output directory and file name pattern based on sensitivity analysis') + output_dir_list = uu.alter_dirs(sensit_type, output_dir_list) + output_pattern_list = uu.alter_patterns(sensit_type, output_pattern_list) + + + # A date can optionally be provided by the full model script or a run of this script. + # This replaces the date in constants_and_names. + # Only done if output upload is enabled. + if cn.RUN_DATE is not None and cn.NO_UPLOAD is not None: + output_dir_list = uu.replace_output_dir_date(output_dir_list, cn.RUN_DATE) + + + # Files to process: climate zone, IDN/MYS plantations before 2000, tree cover loss drivers, combine IFL and primary forest + uu.s3_file_download(os.path.join(cn.climate_zone_raw_dir, cn.climate_zone_raw), cn.docker_base_dir, sensit_type) + uu.s3_file_download(os.path.join(cn.plant_pre_2000_raw_dir, '{}.zip'.format(cn.pattern_plant_pre_2000_raw)), cn.docker_base_dir, sensit_type) + uu.s3_file_download(os.path.join(cn.annual_gain_AGC_BGC_natrl_forest_Europe_raw_dir, cn.name_annual_gain_AGC_BGC_natrl_forest_Europe_raw), cn.docker_base_dir, sensit_type) + uu.s3_file_download(os.path.join(cn.stdev_annual_gain_AGC_BGC_natrl_forest_Europe_raw_dir, cn.name_stdev_annual_gain_AGC_BGC_natrl_forest_Europe_raw), cn.docker_base_dir, sensit_type) + uu.s3_file_download(os.path.join(cn.FIA_regions_raw_dir, cn.name_FIA_regions_raw), cn.docker_base_dir, sensit_type) + uu.s3_file_download(os.path.join(cn.age_cat_natrl_forest_US_raw_dir, cn.name_age_cat_natrl_forest_US_raw), cn.docker_base_dir, sensit_type) + uu.s3_file_download(os.path.join(cn.FIA_forest_group_raw_dir, cn.name_FIA_forest_group_raw), cn.docker_base_dir, sensit_type) + # For some reason, using uu.s3_file_download or otherwise using AWSCLI as a subprocess doesn't work for this raster. + + # Thus, using wget instead. 
+ cmd = ['wget', '{}'.format(cn.annual_gain_AGC_natrl_forest_young_raw_URL), '-P', '{}'.format(cn.docker_base_dir)] + uu.log_subprocess_output_full(cmd) + uu.s3_file_download(cn.stdev_annual_gain_AGC_natrl_forest_young_raw_URL, cn.docker_base_dir, sensit_type) + cmd = ['aws', 's3', 'cp', cn.primary_raw_dir, cn.docker_base_dir, '--recursive'] + uu.log_subprocess_output_full(cmd) + + uu.s3_flexible_download(cn.ifl_dir, cn.pattern_ifl, cn.docker_base_dir, sensit_type, tile_id_list) + + uu.print_log("Unzipping pre-2000 plantations...") + cmd = ['unzip', '-j', '{}.zip'.format(cn.pattern_plant_pre_2000_raw)] + uu.log_subprocess_output_full(cmd) + + + ### Creates young natural forest removal rate tiles + source_raster = cn.name_annual_gain_AGC_natrl_forest_young_raw + out_pattern = cn.pattern_annual_gain_AGC_natrl_forest_young + dt = 'float32' + if cn.count == 96: + processes = 80 # 32 processors = 210 GB peak; 60 = 370 GB peak; 80 = XXX GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating young natural forest removals rate tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + + ### Creates young natural forest removal rate standard deviation tiles + source_raster = cn.name_stdev_annual_gain_AGC_natrl_forest_young_raw + out_pattern = cn.pattern_stdev_annual_gain_AGC_natrl_forest_young + dt = 'float32' + if cn.count == 96: + processes = 80 # 32 processors = 210 GB peak; 60 = 370 GB peak; 80 = XXX GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating standard deviation for young natural forest removal rate tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + + ### Creates pre-2000 oil palm plantation tiles + if cn.count == 96: + processes = 80 # 45 processors = 100 GB peak; 80 = XXX GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating pre-2000 oil palm plantation tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(prep_other_inputs.rasterize_pre_2000_plantations, tile_id_list) + pool.close() + pool.join() - # List of output directories and output file name patterns - output_dir_list = [ - cn.climate_zone_processed_dir, cn.plant_pre_2000_processed_dir, - cn.ifl_primary_processed_dir, - cn.annual_gain_AGC_natrl_forest_young_dir, - cn.stdev_annual_gain_AGC_natrl_forest_young_dir, - cn.annual_gain_AGC_BGC_natrl_forest_Europe_dir, - cn.stdev_annual_gain_AGC_BGC_natrl_forest_Europe_dir, - cn.FIA_forest_group_processed_dir, - cn.age_cat_natrl_forest_US_dir, - cn.FIA_regions_processed_dir, - cn.BGB_AGB_ratio_dir - ] - output_pattern_list = [ - cn.pattern_climate_zone, cn.pattern_plant_pre_2000, - cn.pattern_ifl_primary, - cn.pattern_annual_gain_AGC_natrl_forest_young, - cn.pattern_stdev_annual_gain_AGC_natrl_forest_young, - cn.pattern_annual_gain_AGC_BGC_natrl_forest_Europe, - cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_Europe, - cn.pattern_FIA_forest_group_processed, - cn.pattern_age_cat_natrl_forest_US, - cn.pattern_FIA_regions_processed, - cn.pattern_BGB_AGB_ratio - ] - - - # If the model run isn't the standard one, the output directory and file names are changed - if sensit_type != 'std': - - uu.print_log('Changing output directory 
and file name pattern based on sensitivity analysis') - output_dir_list = uu.alter_dirs(sensit_type, output_dir_list) - output_pattern_list = uu.alter_patterns(sensit_type, output_pattern_list) - - - # A date can optionally be provided by the full model script or a run of this script. - # This replaces the date in constants_and_names. - # Only done if output upload is enabled. - if cn.RUN_DATE is not None and no_upload is not None: - output_dir_list = uu.replace_output_dir_date(output_dir_list, cn.RUN_DATE) - - - # # Files to process: climate zone, IDN/MYS plantations before 2000, tree cover loss drivers, combine IFL and primary forest - # uu.s3_file_download(os.path.join(cn.climate_zone_raw_dir, cn.climate_zone_raw), cn.docker_base_dir, sensit_type) - # uu.s3_file_download(os.path.join(cn.plant_pre_2000_raw_dir, '{}.zip'.format(cn.pattern_plant_pre_2000_raw)), cn.docker_base_dir, sensit_type) - # uu.s3_file_download(os.path.join(cn.annual_gain_AGC_BGC_natrl_forest_Europe_raw_dir, cn.name_annual_gain_AGC_BGC_natrl_forest_Europe_raw), cn.docker_base_dir, sensit_type) - # uu.s3_file_download(os.path.join(cn.stdev_annual_gain_AGC_BGC_natrl_forest_Europe_raw_dir, cn.name_stdev_annual_gain_AGC_BGC_natrl_forest_Europe_raw), cn.docker_base_dir, sensit_type) - # uu.s3_file_download(os.path.join(cn.FIA_regions_raw_dir, cn.name_FIA_regions_raw), cn.docker_base_dir, sensit_type) - # uu.s3_file_download(os.path.join(cn.age_cat_natrl_forest_US_raw_dir, cn.name_age_cat_natrl_forest_US_raw), cn.docker_base_dir, sensit_type) - # uu.s3_file_download(os.path.join(cn.FIA_forest_group_raw_dir, cn.name_FIA_forest_group_raw), cn.docker_base_dir, sensit_type) - # # For some reason, using uu.s3_file_download or otherwise using AWSCLI as a subprocess doesn't work for this raster. - # # Thus, using wget instead. 
- # cmd = ['wget', '{}'.format(cn.annual_gain_AGC_natrl_forest_young_raw_URL), '-P', '{}'.format(cn.docker_base_dir)] - # uu.log_subprocess_output_full(cmd) - # uu.s3_file_download(cn.stdev_annual_gain_AGC_natrl_forest_young_raw_URL, cn.docker_base_dir, sensit_type) - # cmd = ['aws', 's3', 'cp', cn.primary_raw_dir, cn.docker_base_dir, '--recursive'] - # uu.log_subprocess_output_full(cmd) - # - # uu.s3_flexible_download(cn.ifl_dir, cn.pattern_ifl, cn.docker_base_dir, sensit_type, tile_id_list) - # - # uu.print_log("Unzipping pre-2000 plantations...") - # cmd = ['unzip', '-j', '{}.zip'.format(cn.pattern_plant_pre_2000_raw)] - # uu.log_subprocess_output_full(cmd) - - - # ### Creates young natural forest removal rate tiles - # source_raster = cn.name_annual_gain_AGC_natrl_forest_young_raw - # out_pattern = cn.pattern_annual_gain_AGC_natrl_forest_young - # dt = 'float32' - # if cn.count == 96: - # processes = 80 # 32 processors = 210 GB peak; 60 = 370 GB peak; 80 = XXX GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating young natural forest removals rate tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) - # pool.close() - # pool.join() - # - # - # ### Creates young natural forest removal rate standard deviation tiles - # source_raster = cn.name_stdev_annual_gain_AGC_natrl_forest_young_raw - # out_pattern = cn.pattern_stdev_annual_gain_AGC_natrl_forest_young - # dt = 'float32' - # if cn.count == 96: - # processes = 80 # 32 processors = 210 GB peak; 60 = 370 GB peak; 80 = XXX GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating standard deviation for young natural forest removal rate tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) - # pool.close() - # pool.join() - # - # - # ### Creates pre-2000 oil palm plantation tiles - # if cn.count == 96: - # processes = 80 # 45 processors = 100 GB peak; 80 = XXX GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating pre-2000 oil palm plantation tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(prep_other_inputs.rasterize_pre_2000_plantations, tile_id_list) - # pool.close() - # pool.join() - # - # - # ### Creates climate zone tiles - # if cn.count == 96: - # processes = 80 # 45 processors = 230 GB peak (on second step); 80 = XXX GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating climate zone tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(prep_other_inputs.create_climate_zone_tiles, tile_id_list) - # pool.close() - # pool.join() - # - # - # ### Creates European natural forest removal rate tiles - # source_raster = cn.name_annual_gain_AGC_BGC_natrl_forest_Europe_raw - # out_pattern = cn.pattern_annual_gain_AGC_BGC_natrl_forest_Europe - # dt = 'float32' - # if cn.count == 96: - # processes = 60 # 32 processors = 60 GB peak; 60 = XXX GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating European natural forest removals rate tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), 
tile_id_list) - # pool.close() - # pool.join() - # - # - # ### Creates European natural forest standard deviation of removal rate tiles - # source_raster = cn.name_stdev_annual_gain_AGC_BGC_natrl_forest_Europe_raw - # out_pattern = cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_Europe - # dt = 'float32' - # if cn.count == 96: - # processes = 32 # 32 processors = 60 GB peak; 60 = XXX GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating standard deviation for European natural forest removals rate tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) - # pool.close() - # pool.join() - # - # - # ### Creates humid tropical primary forest tiles - # # Creates a vrt of the primary forests with nodata=0 from the continental primary forest rasters - # uu.print_log("Creating vrt of humid tropial primary forest...") - # primary_vrt = 'primary_2001.vrt' - # os.system('gdalbuildvrt -srcnodata 0 {} *2001_primary.tif'.format(primary_vrt)) - # uu.print_log(" Humid tropical primary forest vrt created") - # - # # Creates primary forest tiles - # source_raster = primary_vrt - # out_pattern = 'primary_2001' - # dt = 'Byte' - # if cn.count == 96: - # processes = 45 # 45 processors = 650 GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating primary forest tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) - # pool.close() - # pool.join() - # - # - # ### Creates a combined IFL/primary forest raster - # # Uses very little memory since it's just file renaming - # if cn.count == 96: - # processes = 60 # 60 processors = 10 GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Assigning each tile to ifl2000 or primary forest with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(prep_other_inputs.create_combined_ifl_primary, tile_id_list) - # pool.close() - # pool.join() - # - # - # ### Creates forest age category tiles for US forests - # source_raster = cn.name_age_cat_natrl_forest_US_raw - # out_pattern = cn.pattern_age_cat_natrl_forest_US - # dt = 'Byte' - # if cn.count == 96: - # processes = 70 # 32 processors = 35 GB peak; 70 = XXX GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating US forest age category tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) - # pool.close() - # pool.join() - # - # ### Creates forest groups for US forests - # source_raster = cn.name_FIA_forest_group_raw - # out_pattern = cn.pattern_FIA_forest_group_processed - # dt = 'Byte' - # if cn.count == 96: - # processes = 80 # 32 processors = 25 GB peak; 80 = XXX GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating US forest group tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) - # pool.close() - # pool.join() - # - # ### Creates FIA regions for US forests - # source_raster = cn.name_FIA_regions_raw - # out_pattern = cn.pattern_FIA_regions_processed - # dt = 'Byte' 
- # if cn.count == 96: - # processes = 70 # 32 processors = 35 GB peak; 70 = XXX GB peak - # else: - # processes = int(cn.count/2) - # uu.print_log("Creating US forest region tiles with {} processors...".format(processes)) - # pool = multiprocessing.Pool(processes) - # pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) - # pool.close() - # pool.join() - - - ### Creates Hansen tiles of AGB:BGB based on Huang et al. 2021: https://essd.copernicus.org/articles/13/4263/2021/ - - # uu.print_log("Downloading raw NetCDF files...") - # cmd = ['aws', 's3', 'cp', cn.AGB_BGB_Huang_raw_dir, '.'] - # uu.log_subprocess_output_full(cmd) - - # # Converts the AGB and BGB NetCDF files to global geotifs. - # # Note that, for some reason, this isn't working in Docker locally; when it gets to the to_raster step, it keeps - # # saying "Killed", perhaps because it's running out of memory (1.87/1.95 GB used). - # # So I did this in Python shell locally outside Docker and it worked fine. - # # Methods for converting NetCDF4 to geotif are from approach 1 at - # # https://help.marine.copernicus.eu/en/articles/5029956-how-to-convert-netcdf-to-geotiff - # # Compression argument from: https://github.com/corteva/rioxarray/issues/112 - # agb = xr.open_dataset(cn.name_raw_AGB_Huang_global) - # # uu.print_log(agb) - # agb_den = agb['ASHOOT'] - # # uu.print_log(agb_den) - # agb_den = agb_den.rio.set_spatial_dims(x_dim='LON', y_dim='LAT') - # uu.print_log(agb_den) - # agb_den.rio.write_crs("epsg:4326", inplace=True) - # # Produces: - # # ERROR 1: PROJ: proj_create_from_database: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. - # # followed by NetCDF properties. But I think this error isn't a problem; the resulting geotif seems fine. - # agb_den.rio.to_raster(cn.name_rasterized_AGB_Huang_global, compress='DEFLATE') - # # Produces: - # # ERROR 1: PROJ: proj_create_from_name: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. - # # ERROR 1: PROJ: proj_create_from_database: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. - # # But I think this error isn't a problem; the resulting geotif seems fine. - # - # bgb = xr.open_dataset(cn.name_raw_BGB_Huang_global) - # # uu.print_log(bgb) - # bgb_den = bgb['AROOT'] - # # uu.print_log(bgb_den) - # bgb_den = bgb_den.rio.set_spatial_dims(x_dim='LON', y_dim='LAT') - # uu.print_log(bgb_den) - # bgb_den.rio.write_crs("epsg:4326", inplace=True) - # # Produces: - # # ERROR 1: PROJ: proj_create_from_database: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. - # # followed by NetCDF properties. But I think this error isn't a problem; the resulting geotif seems fine. - # bgb_den.rio.to_raster(cn.name_rasterized_BGB_Huang_global, compress='DEFLATE') - # # Produces: - # # ERROR 1: PROJ: proj_create_from_name: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. 
- # # ERROR 1: PROJ: proj_create_from_database: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. - # # But I think this error isn't a problem; the resulting geotif seems fine. - - # uu.print_log("Generating global BGB:AGB map...") - # - # out = f'--outfile={cn.name_rasterized_BGB_AGB_Huang_global}' - # calc = '--calc=A/B' - # datatype = f'--type=Float32' - # - # # Divides BGB by AGB to get BGB:AGB (root:shoot ratio) - # cmd = ['gdal_calc.py', '-A', cn.name_rasterized_BGB_Huang_global, '-B', cn.name_rasterized_AGB_Huang_global, - # calc, out, '--NoDataValue=0', '--co', 'COMPRESS=DEFLATE', '--overwrite', datatype, '--quiet'] - # uu.log_subprocess_output_full(cmd) - - # The resulting global BGB:AGB map has many gaps, as Huang et al. didn't map AGB and BGB on all land. - # Presumably, most of the places without BGB:AGB don't have much forest, but for completeness it seems good to - # fill the BGB:AGB map gaps, both internally and make sure that continental margins aren't left without BGB:AGB. - # I used gdal_fillnodata.py to do this (https://gdal.org/programs/gdal_fillnodata.html). I tried different - # --max_distance parameters, extending it until the interior of the Sahara was covered. Obviously, there's not much - # carbon flux in the interior of the Sahara but I wanted to have full land coverage, which meant using - # --max_distance=1400 (pixels). Times for different --max_distance values are below. - # I didn't experiment with the --smooth_iterations parameter. - # I confirmed that gdal_fillnodata wasn't changing the original BGB:AGB raster and was just filling the gaps. - # The pixels it assigned to the gaps looked plausible. - - # # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_10.tif -co COMPRESS=DEFLATE -md 10 - # # real 5m7.600s; 6m17.684s - # # user 5m7.600s; 5m38.180s - # # sys 0m5.560s; 0m6.710s - # # - # # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_100.tif -co COMPRESS=DEFLATE -md 100 - # # real 7m44.302s - # # user 7m24.310s - # # sys 0m4.160s - # # - # # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_1000.tif -co COMPRESS=DEFLATE -md 1000 - # # real 51m55.893s - # # user 51m25.800s - # # sys 0m6.510s - # # - # # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_1200.tif -co COMPRESS=DEFLATE -md 1200 - # # real 74m41.544s - # # user 74m5.130s - # # sys 0m7.070s - # # - # # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_1400.tif -co COMPRESS=DEFLATE -md 1400 - # # real - # # user - # # sys - - # cmd = ['gdal_fillnodata.py', - # cn.name_rasterized_BGB_AGB_Huang_global, 'BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_10.tif', - # '-co', 'COMPRESS=DEFLATE', '-md', '10'] - # uu.log_subprocess_output_full(cmd) - - # # upload_final_set isn't uploading the global BGB:AGB map for some reason. - # # It just doesn't show anything in the console and nothing gets uploaded. - # # But I'm not going to try to debug it since it's not an important part of the workflow. 
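# Aside (sketch): the notes above settle on --max_distance=1400 to reach full land coverage,
# while the command shown uses -md 10 and an *_extended_10.tif output name. A parameterized
# wrapper (hypothetical) makes the chosen distance explicit:
import subprocess

def fill_bgb_agb_gaps(src_tif, dst_tif, max_distance=1400):
    subprocess.run(['gdal_fillnodata.py', src_tif, dst_tif,
                    '-co', 'COMPRESS=DEFLATE', '-md', str(max_distance)], check=True)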
- # uu.upload_final_set(cn.AGB_BGB_Huang_rasterized_dir, '_global_from_Huang_2021') - - # Creates BGB:AGB tiles - source_raster = cn.name_rasterized_BGB_AGB_Huang_global_extended - out_pattern = cn.pattern_BGB_AGB_ratio - dt = 'Float32' - if cn.count == 96: - processes = 75 # 15=95 GB peak; 45=280 GB peak; 75=460 GB peak; 85=XXX GB peak - else: - processes = int(cn.count/2) - uu.print_log(f'Creating BGB:AGB {processes} processors...') - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) - pool.close() - pool.join() - - - for output_pattern in [ - # cn.pattern_annual_gain_AGC_natrl_forest_young, cn.pattern_stdev_annual_gain_AGC_natrl_forest_young, - cn.pattern_BGB_AGB_ratio - ]: + ### Creates climate zone tiles + if cn.count == 96: + processes = 80 # 45 processors = 230 GB peak (on second step); 80 = XXX GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating climate zone tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(prep_other_inputs.create_climate_zone_tiles, tile_id_list) + pool.close() + pool.join() + + + ### Creates European natural forest removal rate tiles + source_raster = cn.name_annual_gain_AGC_BGC_natrl_forest_Europe_raw + out_pattern = cn.pattern_annual_gain_AGC_BGC_natrl_forest_Europe + dt = 'float32' + if cn.count == 96: + processes = 60 # 32 processors = 60 GB peak; 60 = XXX GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating European natural forest removals rate tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + + ### Creates European natural forest standard deviation of removal rate tiles + source_raster = cn.name_stdev_annual_gain_AGC_BGC_natrl_forest_Europe_raw + out_pattern = cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_Europe + dt = 'float32' + if cn.count == 96: + processes = 32 # 32 processors = 60 GB peak; 60 = XXX GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating standard deviation for European natural forest removals rate tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + + ### Creates humid tropical primary forest tiles + # Creates a vrt of the primary forests with nodata=0 from the continental primary forest rasters + uu.print_log("Creating vrt of humid tropial primary forest...") + primary_vrt = 'primary_2001.vrt' + os.system('gdalbuildvrt -srcnodata 0 {} *2001_primary.tif'.format(primary_vrt)) + uu.print_log(" Humid tropical primary forest vrt created") + + # Creates primary forest tiles + source_raster = primary_vrt + out_pattern = 'primary_2001' + dt = 'Byte' + if cn.count == 96: + processes = 45 # 45 processors = 650 GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating primary forest tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + + ### Creates a combined IFL/primary forest raster + # Uses very little memory since it's just file renaming + if cn.count == 96: + processes 
= 60 # 60 processors = 10 GB peak + else: + processes = int(cn.count/2) + uu.print_log("Assigning each tile to ifl2000 or primary forest with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(prep_other_inputs.create_combined_ifl_primary, tile_id_list) + pool.close() + pool.join() + + + ### Creates forest age category tiles for US forests + source_raster = cn.name_age_cat_natrl_forest_US_raw + out_pattern = cn.pattern_age_cat_natrl_forest_US + dt = 'Byte' + if cn.count == 96: + processes = 70 # 32 processors = 35 GB peak; 70 = XXX GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating US forest age category tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + ### Creates forest groups for US forests + source_raster = cn.name_FIA_forest_group_raw + out_pattern = cn.pattern_FIA_forest_group_processed + dt = 'Byte' + if cn.count == 96: + processes = 80 # 32 processors = 25 GB peak; 80 = XXX GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating US forest group tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + ### Creates FIA regions for US forests + source_raster = cn.name_FIA_regions_raw + out_pattern = cn.pattern_FIA_regions_processed + dt = 'Byte' + if cn.count == 96: + processes = 70 # 32 processors = 35 GB peak; 70 = XXX GB peak + else: + processes = int(cn.count/2) + uu.print_log("Creating US forest region tiles with {} processors...".format(processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + + for output_pattern in [ + cn.pattern_annual_gain_AGC_natrl_forest_young, cn.pattern_stdev_annual_gain_AGC_natrl_forest_young + ]: # For some reason I can't figure out, the young forest rasters (rate and stdev) have NaN values in some places where 0 (NoData) # should be. These NaN values show up as values when the check_and_delete_if_empty function runs, making the tiles not # deleted even if they have no data. However, the light version (which uses gdalinfo rather than rasterio masks) doesn't # have this problem. So I'm forcing the young forest rates to and stdev to have their emptiness checked by the gdalinfo version. - if output_pattern in [cn.pattern_annual_gain_AGC_natrl_forest_young, cn.pattern_stdev_annual_gain_AGC_natrl_forest_young]: processes = int(cn.count / 2) uu.print_log("Checking for empty tiles of {0} pattern with {1} processors using light function...".format(output_pattern, processes)) pool = multiprocessing.Pool(processes) @@ -401,33 +284,303 @@ def mp_prep_other_inputs(tile_id_list): pool.close() pool.join() + + # Uploads output tiles to s3 + for i in range(0, len(output_dir_list)): + uu.upload_final_set(output_dir_list[i], output_pattern_list[i]) + + + if process == 2: + ### Creates Hansen tiles of AGB:BGB based on Huang et al. 
2021: https://essd.copernicus.org/articles/13/4263/2021/ + output_dir_list = [cn.BGB_AGB_ratio_dir] + output_pattern_list = [cn.pattern_BGB_AGB_ratio] + + # If the model run isn't the standard one, the output directory and file names are changed + if sensit_type != 'std': + uu.print_log('Changing output directory and file name pattern based on sensitivity analysis') + output_dir_list = uu.alter_dirs(sensit_type, output_dir_list) + output_pattern_list = uu.alter_patterns(sensit_type, output_pattern_list) + + # A date can optionally be provided by the full model script or a run of this script. + # This replaces the date in constants_and_names. + # Only done if output upload is enabled. + if cn.RUN_DATE is not None and cn.NO_UPLOAD is not None: + output_dir_list = uu.replace_output_dir_date(output_dir_list, cn.RUN_DATE) + + uu.print_log("Downloading raw NetCDF files...") + cmd = ['aws', 's3', 'cp', cn.AGB_BGB_Huang_raw_dir, '.'] + uu.log_subprocess_output_full(cmd) + + # Converts the AGB and BGB NetCDF files to global geotifs. + # Note that, for some reason, this isn't working in Docker locally; when it gets to the to_raster step, it keeps + # saying "Killed", perhaps because it's running out of memory (1.87/1.95 GB used). + # So I did this in Python shell locally outside Docker and it worked fine. + # Methods for converting NetCDF4 to geotif are from approach 1 at + # https://help.marine.copernicus.eu/en/articles/5029956-how-to-convert-netcdf-to-geotiff + # Compression argument from: https://github.com/corteva/rioxarray/issues/112 + agb = xr.open_dataset(cn.name_raw_AGB_Huang_global) + # uu.print_log(agb) + agb_den = agb['ASHOOT'] + # uu.print_log(agb_den) + agb_den = agb_den.rio.set_spatial_dims(x_dim='LON', y_dim='LAT') + uu.print_log(agb_den) + agb_den.rio.write_crs("epsg:4326", inplace=True) + # Produces: + # ERROR 1: PROJ: proj_create_from_database: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. + # followed by NetCDF properties. But I think this error isn't a problem; the resulting geotif seems fine. + agb_den.rio.to_raster(cn.name_rasterized_AGB_Huang_global, compress='DEFLATE') + # Produces: + # ERROR 1: PROJ: proj_create_from_name: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. + # ERROR 1: PROJ: proj_create_from_database: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. + # But I think this error isn't a problem; the resulting geotif seems fine. + + bgb = xr.open_dataset(cn.name_raw_BGB_Huang_global) + # uu.print_log(bgb) + bgb_den = bgb['AROOT'] + # uu.print_log(bgb_den) + bgb_den = bgb_den.rio.set_spatial_dims(x_dim='LON', y_dim='LAT') + uu.print_log(bgb_den) + bgb_den.rio.write_crs("epsg:4326", inplace=True) + # Produces: + # ERROR 1: PROJ: proj_create_from_database: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. + # followed by NetCDF properties. But I think this error isn't a problem; the resulting geotif seems fine. 
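# Relating to the "Killed" note above: if local Docker memory is the culprit, a chunked,
# windowed conversion is one way to keep the footprint small. This is a hedged sketch, not
# the method used here; it assumes dask is installed and that LAT/LON chunk sizes of 2000
# are reasonable for this grid.
import xarray as xr
import rioxarray  # noqa: F401  (registers the .rio accessor)

def netcdf_var_to_geotiff_lowmem(nc_path, var_name, out_tif):
    ds = xr.open_dataset(nc_path, chunks={'LAT': 2000, 'LON': 2000})  # lazy, dask-backed
    da = ds[var_name].rio.set_spatial_dims(x_dim='LON', y_dim='LAT')
    da.rio.write_crs('epsg:4326', inplace=True)
    # windowed=True writes block by block instead of materializing the full array
    da.rio.to_raster(out_tif, compress='DEFLATE', windowed=True)

# e.g. netcdf_var_to_geotiff_lowmem(cn.name_raw_AGB_Huang_global, 'ASHOOT',
#                                   cn.name_rasterized_AGB_Huang_global)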
+ bgb_den.rio.to_raster(cn.name_rasterized_BGB_Huang_global, compress='DEFLATE') + # Produces: + # ERROR 1: PROJ: proj_create_from_name: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. + # ERROR 1: PROJ: proj_create_from_database: C:\Program Files\GDAL\projlib\proj.db lacks DATABASE.LAYOUT.VERSION.MAJOR / DATABASE.LAYOUT.VERSION.MINOR metadata. It comes from another PROJ installation. + # But I think this error isn't a problem; the resulting geotif seems fine. + + uu.print_log("Generating global BGB:AGB map...") + + out = f'--outfile={cn.name_rasterized_BGB_AGB_Huang_global}' + calc = '--calc=A/B' + datatype = f'--type=Float32' + + # Divides BGB by AGB to get BGB:AGB (root:shoot ratio) + cmd = ['gdal_calc.py', '-A', cn.name_rasterized_BGB_Huang_global, '-B', cn.name_rasterized_AGB_Huang_global, + calc, out, '--NoDataValue=0', '--co', 'COMPRESS=DEFLATE', '--overwrite', datatype, '--quiet'] + uu.log_subprocess_output_full(cmd) + + # The resulting global BGB:AGB map has many gaps, as Huang et al. didn't map AGB and BGB on all land. + # Presumably, most of the places without BGB:AGB don't have much forest, but for completeness it seems good to + # fill the BGB:AGB map gaps, both internally and make sure that continental margins aren't left without BGB:AGB. + # I used gdal_fillnodata.py to do this (https://gdal.org/programs/gdal_fillnodata.html). I tried different + # --max_distance parameters, extending it until the interior of the Sahara was covered. Obviously, there's not much + # carbon flux in the interior of the Sahara but I wanted to have full land coverage, which meant using + # --max_distance=1400 (pixels). Times for different --max_distance values are below. + # I didn't experiment with the --smooth_iterations parameter. + # I confirmed that gdal_fillnodata wasn't changing the original BGB:AGB raster and was just filling the gaps. + # The pixels it assigned to the gaps looked plausible. + + # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_10.tif -co COMPRESS=DEFLATE -md 10 + # real 5m7.600s; 6m17.684s + # user 5m7.600s; 5m38.180s + # sys 0m5.560s; 0m6.710s + # + # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_100.tif -co COMPRESS=DEFLATE -md 100 + # real 7m44.302s + # user 7m24.310s + # sys 0m4.160s + # + # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_1000.tif -co COMPRESS=DEFLATE -md 1000 + # real 51m55.893s + # user 51m25.800s + # sys 0m6.510s + # + # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_1200.tif -co COMPRESS=DEFLATE -md 1200 + # real 74m41.544s + # user 74m5.130s + # sys 0m7.070s + # + # time gdal_fillnodata.py BGB_AGB_ratio_global_from_Huang_2021__20230201.tif BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_1400.tif -co COMPRESS=DEFLATE -md 1400 + # real + # user + # sys + + cmd = ['gdal_fillnodata.py', + cn.name_rasterized_BGB_AGB_Huang_global, 'BGB_AGB_ratio_global_from_Huang_2021__20230201_extended_10.tif', + '-co', 'COMPRESS=DEFLATE', '-md', '10'] + uu.log_subprocess_output_full(cmd) + + # upload_final_set isn't uploading the global BGB:AGB map for some reason. 
+ # It just doesn't show anything in the console and nothing gets uploaded. + # But I'm not going to try to debug it since it's not an important part of the workflow. + uu.upload_final_set(cn.AGB_BGB_Huang_rasterized_dir, '_global_from_Huang_2021') + + # Creates BGB:AGB tiles + source_raster = cn.name_rasterized_BGB_AGB_Huang_global_extended + out_pattern = cn.pattern_BGB_AGB_ratio + dt = 'Float32' if cn.count == 96: - processes = 50 # 60 processors = >730 GB peak (for European natural forest forest removal rates); 50 = 600 GB peak - uu.print_log("Checking for empty tiles of {0} pattern with {1} processors...".format(output_pattern, processes)) - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) - pool.close() - pool.join() - elif cn.count <= 2: # For local tests - processes = 1 - uu.print_log("Checking for empty tiles of {0} pattern with {1} processors using light function...".format(output_pattern, processes)) - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.check_and_delete_if_empty_light, output_pattern=output_pattern), tile_id_list) - pool.close() - pool.join() + processes = 75 # 15=95 GB peak; 45=280 GB peak; 75=460 GB peak; 85=XXX GB peak else: - processes = int(cn.count / 2) - uu.print_log("Checking for empty tiles of {0} pattern with {1} processors...".format(output_pattern, processes)) - pool = multiprocessing.Pool(processes) - pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) - pool.close() - pool.join() - uu.print_log("\n") - - - # Uploads output tiles to s3 - for i in range(0, len(output_dir_list)): - uu.upload_final_set(output_dir_list[i], output_pattern_list[i]) + processes = int(cn.count/2) + uu.print_log(f'Creating BGB:AGB {processes} processors...') + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=source_raster, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + + for output_pattern in [ + cn.pattern_BGB_AGB_ratio + ]: + + if cn.count == 96: + processes = 50 # 60 processors = >730 GB peak (for European natural forest forest removal rates); 50 = 600 GB peak + uu.print_log("Checking for empty tiles of {0} pattern with {1} processors...".format(output_pattern, processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) + pool.close() + pool.join() + elif cn.count <= 2: # For local tests + processes = 1 + uu.print_log("Checking for empty tiles of {0} pattern with {1} processors using light function...".format(output_pattern, processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.check_and_delete_if_empty_light, output_pattern=output_pattern), tile_id_list) + pool.close() + pool.join() + else: + processes = int(cn.count / 2) + uu.print_log("Checking for empty tiles of {0} pattern with {1} processors...".format(output_pattern, processes)) + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list) + pool.close() + pool.join() + uu.print_log("\n") + + + # Uploads output tiles to s3 + for i in range(0, len(output_dir_list)): + uu.upload_final_set(output_dir_list[i], output_pattern_list[i]) + + + if process == 3: + # Preprocess 6 SDPTv2 tile sets from gfw-data-lake bucket to gfw2-data: + # AGC removal factors, AGC+BGC removal factors, AGC standard deviations, AGC+BGC standard deviations, 
planted forest type, and planted forest established year + + # Files to download for the SDPTv2 update + download_dict_sdptv2 = { + cn.datalake_pf_agc_rf_dir: [cn.pattern_data_lake], + cn.datalake_pf_agcbgc_rf_dir: [cn.pattern_data_lake], + cn.datalake_pf_agc_sd_dir: [cn.pattern_data_lake], + cn.datalake_pf_agcbgc_sd_dir: [cn.pattern_data_lake], + cn.datalake_pf_simplename_dir: [cn.pattern_data_lake], + cn.datalake_pf_estab_year_dir: [cn.pattern_data_lake], + } + + # List of output directories for SDPTv2 update upload + output_dir_list_sdptv2 = [ + cn.planted_forest_aboveground_removalfactor_dir, + cn.planted_forest_aboveground_belowground_removalfactor_dir, + cn.planted_forest_aboveground_standard_deviation_dir, + cn.planted_forest_aboveground_belowground_standard_deviation_dir, + cn.planted_forest_type_dir, + cn.planted_forest_estab_year_dir, + ] + + # List of output file name patterns for SDPTv2 to upload to EC2 + output_pattern_list_sdptv2 = [ + cn.pattern_pf_rf_agc_ec2, + cn.pattern_pf_rf_agcbgc_ec2, + cn.pattern_pf_sd_agc_ec2, + cn.pattern_pf_sd_agcbgc_ec2, + cn.pattern_planted_forest_type, + cn.pattern_planted_forest_estab_year, + ] + + # List of data types for each dataset to use in the gdal_warp_to_hansen + output_dt_list_sdptv2 = [ + 'Float32', + 'Float32', + 'Float32', + 'Float32', + 'Byte', + 'Byte', + ] + + + # If the model run isn't the standard one, the output directory and file names are changed + if sensit_type != 'std': + uu.print_log('Changing output directory and file name pattern based on sensitivity analysis') + output_dir_list_sdptv2 = uu.alter_dirs(sensit_type, output_dir_list_sdptv2) + output_pattern_list_sdptv2 = uu.alter_patterns(sensit_type, output_pattern_list_sdptv2) + + + # A date can optionally be provided by the full model script or a run of this script. + # This replaces the date in constants_and_names. + # Only done if output upload is enabled. 
+ if cn.RUN_DATE is not None and cn.NO_UPLOAD is not None: + output_dir_list_sdptv2 = uu.replace_output_dir_date(output_dir_list_sdptv2, cn.RUN_DATE) + + # Create a data upload dictionary from the list of output directories and list of output file name patterns + upload_dict_sdptv2 = { + output_pattern_list_sdptv2[0]: [output_dir_list_sdptv2[0], output_dt_list_sdptv2[0]], + output_pattern_list_sdptv2[1]: [output_dir_list_sdptv2[1], output_dt_list_sdptv2[1]], + output_pattern_list_sdptv2[2]: [output_dir_list_sdptv2[2], output_dt_list_sdptv2[2]], + output_pattern_list_sdptv2[3]: [output_dir_list_sdptv2[3], output_dt_list_sdptv2[3]], + output_pattern_list_sdptv2[4]: [output_dir_list_sdptv2[4], output_dt_list_sdptv2[4]], + output_pattern_list_sdptv2[5]: [output_dir_list_sdptv2[5], output_dt_list_sdptv2[5]], + } + + + # Downloads input files or entire directories, depending on how many tiles are in the tile_id_list + uu.print_log("STEP 1: Downloading datasets from gfw-data-lake") + for key, values in download_dict_sdptv2.items(): + directory = key + pattern = values[0] + uu.s3_flexible_download(directory, pattern, cn.docker_tile_dir, cn.SENSIT_TYPE, tile_id_list) + + + uu.print_log("STEP 2: Rewindowing tiles to 40,000 x 1 pixel windows") + + for key in upload_dict_sdptv2.keys(): + download_pattern_name = key + + if cn.SINGLE_PROCESSOR: + for tile_id in tile_id_list: + uu.rewindow(tile_id, download_pattern_name=download_pattern_name , striped=True) + else: + if cn.count == 96: + processes = 80 + else: + processes = int(cn.count/2) + uu.print_log(f'Rewindow {download_pattern_name} tiles with {processes} processors...') + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.rewindow, download_pattern_name=download_pattern_name, striped=True), + tile_id_list) + pool.close() + pool.join() + + + uu.print_log("STEP 3: Warping SDPT datasets to Hansen dataset") + + for key, values in upload_dict_sdptv2.items(): + out_pattern = key + dt = values[1] + + if cn.SINGLE_PROCESSOR: + for tile_id in tile_id_list: + uu.mp_warp_to_Hansen(tile_id, source_raster=None, out_pattern=out_pattern, dt=dt) + else: + if cn.count == 96: + processes = 80 + else: + processes = int(cn.count/2) + uu.print_log(f'Warping tiles with {out_pattern} to Hansen dataset with {processes} processors...') + pool = multiprocessing.Pool(processes) + pool.map(partial(uu.mp_warp_to_Hansen, source_raster=None, out_pattern=out_pattern, dt=dt), tile_id_list) + pool.close() + pool.join() + + + uu.print_log("STEP 4: Uploading datasets to their output directories") + + if cn.NO_UPLOAD == False: + for key, values in upload_dict_sdptv2.items(): + pattern = key + directory = values[0] + uu.upload_final_set(directory, pattern) if __name__ == '__main__': @@ -442,6 +595,8 @@ def mp_prep_other_inputs(tile_id_list): help='Disables uploading of outputs to s3') parser.add_argument('--single-processor', '-sp', action='store_true', help='Uses single processing rather than multiprocessing') + parser.add_argument('--process', '-p', required=True, + help='Specifies which one-off process to run: 1 = climate zone, IDN/MYS plantations before 2000, tree cover loss drivers, combine IFL and primary forest, 2 = AGB:BGB based on Huang et al, and 3 = Spatial Database of Planted Trees Version 2 Update') args = parser.parse_args() # Sets global variables to the command line arguments @@ -450,6 +605,7 @@ def mp_prep_other_inputs(tile_id_list): cn.SINGLE_PROCESSOR = args.single_processor tile_id_list = args.tile_id_list + process = args.process # Disables upload to s3 
if no AWS credentials are found in environment if not uu.check_aws_creds(): @@ -461,4 +617,4 @@ def mp_prep_other_inputs(tile_id_list): # Checks whether the tile_id_list argument is valid tile_id_list = uu.tile_id_list_check(tile_id_list) - mp_prep_other_inputs(tile_id_list=tile_id_list) \ No newline at end of file + mp_prep_other_inputs(tile_id_list=tile_id_list, process=int(process)) diff --git a/data_prep/plantation_preparation.py b/data_prep/plantation_preparation.py deleted file mode 100644 index f5548f1e..00000000 --- a/data_prep/plantation_preparation.py +++ /dev/null @@ -1,292 +0,0 @@ - -from subprocess import Popen, PIPE, STDOUT, check_call -import os -import psycopg2 -import sys -sys.path.append('../') -import constants_and_names as cn -import universal_util as uu - -# Creates 1x1 tiles of the extent of select countries are in select latitude bands, with the defining coordinates of each tile -# in the northwest corner -def rasterize_gadm_1x1(tile_id): - - uu.print_log("Getting bounding coordinates for tile", tile_id) - xmin, ymin, xmax, ymax = uu.coords(tile_id) - uu.print_log(" xmin:", xmin, "; xmax:", xmax, "; ymin", ymin, "; ymax:", ymax) - - # Degrees of tile in x and y dimensions - x_size = abs(int(xmin) - int(xmax)) - y_size = abs(int(ymin) - int(ymax)) - - # Iterates through input 10x10 tile by 1x1 degree - for x in range(x_size): - - xmin_1x1 = int(xmin) + x - xmax_1x1 = int(xmin) + x + 1 - - for y in range(y_size): - - ymin_1x1 = int(ymin) + y - ymax_1x1 = int(ymin) + y + 1 - - uu.print_log(" xmin_1x1:", xmin_1x1, "; xmax_1x1:", xmax_1x1, "; ymin_1x1", ymin_1x1, "; ymax_1x1:", ymax_1x1) - - tile_1x1 = 'GADM_{0}_{1}.tif'.format(ymax_1x1, xmin_1x1) - uu.print_log("Rasterizing", tile_1x1) - cmd = ['gdal_rasterize', '-tr', '{}'.format(str(cn.Hansen_res)), '{}'.format(str(cn.Hansen_res)), - '-co', 'COMPRESS=DEFLATE', '-te', str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), - '-burn', '1', '-a_nodata', '0', cn.gadm_iso, tile_1x1] - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - # Only keeps 1x1 GADM tiles if they actually include a country; many 1x1 tiles created out of 10x10 tiles - # don't actually include a country. - uu.print_log("Checking if {} contains any data...".format(tile_1x1)) - stats = uu.check_for_data(tile_1x1) - - if stats[1] > 0: - uu.print_log(" Data found in {}. Keeping tile".format(tile_1x1)) - - else: - uu.print_log(" No data found in {}. Deleting.".format(tile_1x1)) - os.remove(tile_1x1) - - -# Creates 1x1 degree tiles for the entire extent of planted forest using the supplied growth rates -# (defining coordinate in the northwest corner of the tile). -# Because this iterates through all 1x1 tiles in countries with planted forests, it first checks -# whether each 1x1 tile intersects planted forests before creating a 1x1 planted forest tile for that -# 1x1 country extent tile. -def create_1x1_plantation_from_1x1_gadm(tile_1x1): - - # Gets the bounding coordinates for the 1x1 degree tile - coords = tile_1x1.split("_") - uu.print_log(coords) - xmin_1x1 = str(coords[2])[:-4] - xmax_1x1 = int(xmin_1x1) + 1 - ymax_1x1 = int(coords[1]) - ymin_1x1 = ymax_1x1 - 1 - - uu.print_log("For", tile_1x1, "-- xmin_1x1:", xmin_1x1, "; xmax_1x1:", xmax_1x1, "; ymin_1x1", ymin_1x1, "; ymax_1x1:", ymax_1x1) - - # Connects Python to PostGIS using psycopg2. 
The credentials work on spot machines as they are currently configured - # and are based on this: https://github.com/wri/gfw-annual-loss-processing/blob/master/1b_Summary-AOIs-to-TSV/utilities/postgis_util.py - creds = {'host': 'localhost', 'user': 'ubuntu', 'dbname': 'ubuntu'} - conn = psycopg2.connect(**creds) - cursor = conn.cursor() - - # Intersects the plantations PostGIS table with the 1x1 tile, then saves any growth rates in that tile as a 1x1 tile - # https://gis.stackexchange.com/questions/30267/how-to-create-a-valid-global-polygon-grid-in-postgis - # https://stackoverflow.com/questions/48978616/best-way-to-run-st-intersects-on-features-inside-one-table - # https://postgis.net/docs/ST_Intersects.html - uu.print_log("Checking if {} has plantations in it".format(tile_1x1)) - - # Does the intersect of the PostGIS table and the 1x1 GADM tile - cursor.execute("SELECT growth FROM all_plant WHERE ST_Intersects(all_plant.wkb_geometry, ST_GeogFromText('POLYGON(({0} {1},{2} {1},{2} {3},{0} {3},{0} {1}))'))".format( - xmin_1x1, ymax_1x1, xmax_1x1, ymin_1x1)) - - # A Python list of the output of the intersection, which in this case is a list of features that were successfully intersected. - # This is what I use to determine if any PostGIS features were intersected. - features = cursor.fetchall() - cursor.close() - - # If any features in the PostGIS table were intersected with the 1x1 GADM tile, then the features in this 1x1 tile - # are converted to a planted forest gain rate tile and a plantation type tile - if len(features) > 0: - - uu.print_log("There are plantations in {}. Converting to gain rate and plantation type rasters...".format(tile_1x1)) - - # https://gis.stackexchange.com/questions/187224/how-to-use-gdal-rasterize-with-postgis-vector - # For plantation gain rate - cmd = ['gdal_rasterize', '-tr', '{}'.format(cn.Hansen_res), '{}'.format(cn.Hansen_res), '-co', 'COMPRESS=DEFLATE', 'PG:dbname=ubuntu', '-l', 'all_plant', 'plant_gain_{0}_{1}.tif'.format(ymax_1x1, xmin_1x1), '-te', str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), '-a', 'growth', '-a_nodata', '0'] - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - # https://gis.stackexchange.com/questions/187224/how-to-use-gdal-rasterize-with-postgis-vector - # For plantation type - cmd = ['gdal_rasterize', '-tr', '{}'.format(cn.Hansen_res), '{}'.format(cn.Hansen_res), '-co', 'COMPRESS=DEFLATE', 'PG:dbname=ubuntu', '-l', 'all_plant', 'plant_type_{0}_{1}.tif'.format(ymax_1x1, xmin_1x1), '-te', str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), '-a', 'type_reclass', '-a_nodata', '0'] - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - # If no features in the PostGIS table were intersected with the 1x1 GADM tile, nothing happens. - else: - uu.print_log("There are no plantations in {}. Not converting to raster.".format(tile_1x1)) - - -# Creates 1x1 degree tiles for the entire extent of planted forest using the supplied growth rates -# (defining coordinate in the northwest corner of the tile). 
-# Because this iterates through only 1x1 tiles that are known to have planted forests (from a previous run -# of this script), it does not need to check whether there are planted forests in this tile. It goes directly -# to intersecting the planted forest table with the 1x1 tile. -def create_1x1_plantation_growth_from_1x1_planted(tile_1x1): - - # Gets the bounding coordinates for the 1x1 degree tile - coords = tile_1x1.split("_") - xmin_1x1 = str(coords[3])[:-4] - xmax_1x1 = int(xmin_1x1) + 1 - ymax_1x1 = int(coords[2]) - ymin_1x1 = ymax_1x1 - 1 - - uu.print_log("For", tile_1x1, "-- xmin_1x1:", xmin_1x1, "; xmax_1x1:", xmax_1x1, "; ymin_1x1", ymin_1x1, "; ymax_1x1:", ymax_1x1) - - uu.print_log("There are plantations in {}. Converting to raster...".format(tile_1x1)) - - # https://gis.stackexchange.com/questions/187224/how-to-use-gdal-rasterize-with-postgis-vector - cmd = ['gdal_rasterize', '-tr', '{}'.format(cn.Hansen_res), '{}'.format(cn.Hansen_res), '-co', 'COMPRESS=DEFLATE', - 'PG:dbname=ubuntu', '-l', 'all_plant', 'plant_gain_{0}_{1}.tif'.format(ymax_1x1, xmin_1x1), '-te', - str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), '-a', 'growth', '-a_nodata', '0'] - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - -# Creates 1x1 degree tiles for the entire extent of planted forest using the supplied forest types -# (defining coordinate in the northwest corner of the tile). -# Because this iterates through only 1x1 tiles that are known to have planted forests (from a previous run -# of this script), it does not need to check whether there are planted forests in this tile. It goes directly -# to intersecting the planted forest table with the 1x1 tile. -def create_1x1_plantation_type_from_1x1_planted(tile_1x1): - - # Gets the bounding coordinates for the 1x1 degree tile - coords = tile_1x1.split("_") - xmin_1x1 = str(coords[3])[:-4] - xmax_1x1 = int(xmin_1x1) + 1 - ymax_1x1 = int(coords[2]) - ymin_1x1 = ymax_1x1 - 1 - - uu.print_log("For", tile_1x1, "-- xmin_1x1:", xmin_1x1, "; xmax_1x1:", xmax_1x1, "; ymin_1x1", ymin_1x1, "; ymax_1x1:", ymax_1x1) - - uu.print_log("There are plantations in {}. 
Converting to raster...".format(tile_1x1)) - - # https://gis.stackexchange.com/questions/187224/how-to-use-gdal-rasterize-with-postgis-vector - cmd = ['gdal_rasterize', '-tr', '{}'.format(cn.Hansen_res), '{}'.format(cn.Hansen_res), '-co', 'COMPRESS=DEFLATE', 'PG:dbname=ubuntu', - '-l', 'all_plant', 'plant_type_{0}_{1}.tif'.format(ymax_1x1, xmin_1x1), - '-te', str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), - '-a', 'type_reclass', '-a_nodata', '0', '-ot', 'Byte'] - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - - -# Makes 1x1 rasters of the removal rate standard deviation -def create_1x1_plantation_stdev_from_1x1_planted(tile_1x1): - - # Gets the bounding coordinates for the 1x1 degree tile - coords = tile_1x1.split("_") - xmin_1x1 = str(coords[3])[:-4] - xmax_1x1 = int(xmin_1x1) + 1 - ymax_1x1 = int(coords[2]) - ymin_1x1 = ymax_1x1 - 1 - - print("For", tile_1x1, "-- xmin_1x1:", xmin_1x1, "; xmax_1x1:", xmax_1x1, "; ymin_1x1", ymin_1x1, "; ymax_1x1:", ymax_1x1) - - print("There are plantations in {}. Converting to stdev raster...".format(tile_1x1)) - - # https://gis.stackexchange.com/questions/187224/how-to-use-gdal-rasterize-with-postgis-vector - cmd = ['gdal_rasterize', '-tr', '{}'.format(cn.Hansen_res), '{}'.format(cn.Hansen_res), '-co', 'COMPRESS=DEFLATE', 'PG:dbname=ubuntu', - '-l', 'all_plant', 'plant_stdev_{0}_{1}.tif'.format(ymax_1x1, xmin_1x1), - '-te', str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), - '-a', 'SD_error', '-a_nodata', '0'] - subprocess.check_call(cmd) - - - -# Combines the 1x1 plantation tiles into 10x10 plantation carbon gain rate tiles, the final output of this process -def create_10x10_plantation_gain(tile_id, plant_gain_1x1_vrt): - - uu.print_log("Getting bounding coordinates for tile", tile_id) - xmin, ymin, xmax, ymax = uu.coords(tile_id) - uu.print_log(" xmin:", xmin, "; xmax:", xmax, "; ymin", ymin, "; ymax:", ymax) - - tile_10x10 = '{0}_{1}.tif'.format(tile_id, cn.pattern_annual_gain_AGC_BGC_planted_forest_unmasked) - uu.print_log("Rasterizing", tile_10x10) - cmd = ['gdalwarp', '-tr', '{}'.format(str(cn.Hansen_res)), '{}'.format(str(cn.Hansen_res)), - '-co', 'COMPRESS=DEFLATE', '-tap', '-te', str(xmin), str(ymin), str(xmax), str(ymax), - '-dstnodata', '0', '-t_srs', 'EPSG:4326', '-overwrite', '-ot', 'Float32', plant_gain_1x1_vrt, tile_10x10] - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - uu.print_log("Checking if {} contains any data...".format(tile_id)) - stats = uu.check_for_data(tile_10x10) - - if stats[0] > 0: - - uu.print_log(" Data found in {}. Copying tile to s3...".format(tile_id)) - uu.upload_final(cn.annual_gain_AGC_BGC_planted_forest_unmasked_dir, tile_id, cn.pattern_annual_gain_AGC_BGC_planted_forest_unmasked) - uu.print_log(" Tile converted and copied to s3") - - else: - - uu.print_log(" No data found. 
Not copying {}.".format(tile_id)) - - -# Combines the 1x1 plantation tiles into 10x10 plantation carbon gain rate tiles, the final output of this process -def create_10x10_plantation_type(tile_id, plant_type_1x1_vrt): - - uu.print_log("Getting bounding coordinates for tile", tile_id) - xmin, ymin, xmax, ymax = uu.coords(tile_id) - uu.print_log(" xmin:", xmin, "; xmax:", xmax, "; ymin", ymin, "; ymax:", ymax) - - tile_10x10 = '{0}_{1}.tif'.format(tile_id, cn.pattern_planted_forest_type_unmasked) - uu.print_log("Rasterizing", tile_10x10) - cmd = ['gdalwarp', '-tr', '{}'.format(str(cn.Hansen_res)), '{}'.format(str(cn.Hansen_res)), - '-co', 'COMPRESS=DEFLATE', '-tap', '-te', str(xmin), str(ymin), str(xmax), str(ymax), - '-dstnodata', '0', '-t_srs', 'EPSG:4326', '-overwrite', '-ot', 'Byte', plant_type_1x1_vrt, tile_10x10] - # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging - process = Popen(cmd, stdout=PIPE, stderr=STDOUT) - with process.stdout: - uu.log_subprocess_output(process.stdout) - - uu.print_log("Checking if {} contains any data...".format(tile_id)) - stats = uu.check_for_data(tile_10x10) - - if stats[0] > 0: - - uu.print_log(" Data found in {}. Copying tile to s3...".format(tile_id)) - uu.upload_final(cn.planted_forest_type_unmasked_dir, tile_id, cn.pattern_planted_forest_type_unmasked) - uu.print_log(" Tile converted and copied to s3") - - else: - - print(" No data found. Not copying {}.".format(tile_id)) - - -# Combines the 1x1 plantation tiles into 10x10 plantation carbon gain rate tiles, the final output of this process -def create_10x10_plantation_gain_stdev(tile_id, plant_stdev_1x1_vrt): - - print("Getting bounding coordinates for tile", tile_id) - xmin, ymin, xmax, ymax = uu.coords(tile_id) - print(" xmin:", xmin, "; xmax:", xmax, "; ymin", ymin, "; ymax:", ymax) - - tile_10x10 = '{0}_{1}.tif'.format(tile_id, cn.pattern_planted_forest_stdev_unmasked) - print("Rasterizing", tile_10x10) - cmd = ['gdalwarp', '-tr', '{}'.format(str(cn.Hansen_res)), '{}'.format(str(cn.Hansen_res)), - '-co', 'COMPRESS=DEFLATE', '-tap', '-te', str(xmin), str(ymin), str(xmax), str(ymax), - '-dstnodata', '0', '-t_srs', 'EPSG:4326', '-overwrite', '-ot', 'Float32', plant_stdev_1x1_vrt, tile_10x10] - subprocess.check_call(cmd) - - print("Checking if {} contains any data...".format(tile_id)) - stats = uu.check_for_data_old(tile_10x10) - - if stats[0] > 0: - - print(" Data found in {}. Copying tile to s3...".format(tile_id)) - uu.upload_final(cn.planted_forest_stdev_unmasked_dir, tile_id, cn.pattern_planted_forest_stdev_unmasked) - print(" Tile converted and copied to s3") - - else: - - print(" No data found. 
Not copying {}.".format(tile_id)) - diff --git a/data_prep/planted_forests_prep/SDPTv2_RFupdates/SDPTv2_RFupdates.R b/data_prep/planted_forests_prep/SDPTv2_RFupdates/SDPTv2_RFupdates.R new file mode 100644 index 00000000..6d4a4c00 --- /dev/null +++ b/data_prep/planted_forests_prep/SDPTv2_RFupdates/SDPTv2_RFupdates.R @@ -0,0 +1,164 @@ +#Author: Melissa Rose +#Date Modified: +#Title: SDPT Version 2 RF Updates +############################################################################### +#Step 1: Load packages, create path variable to carbon-budget repo, and open SDPT spreadsheets +library(readxl) +library(openxlsx) +library(stringr) +library(data.table) +library(tidyverse) +library(janitor) + +#update with user-specified path to carbon-budget repo +carbon_budget_repo <- paste0(getwd(), '/git', '/carbon-budget/') + +sdpt_v1 <- data.table(read_excel(paste0(carbon_budget_repo, 'data_prep/planted_forests_prep/SDPTv2_RFupdates/SDPTv1_RF_20200716.xlsx'), sheet = 'plant_attributes_simp_jul2020')) +sdpt_v2 <- data.table(read_excel(paste0(carbon_budget_repo, 'data_prep/planted_forests_prep/SDPTv2_RFupdates/plantation_attributes_v2.0_draft_v6142023.xlsx'), sheet = 'in')) +sdpt_calc <- data.table(read_excel(paste0(carbon_budget_repo, 'data_prep/planted_forests_prep/SDPTv2_RFupdates/SDPTv2_singleClassCalculations_20230831.xlsx'), sheet = 'sc_calc', n_max=844)) +source(paste0(carbon_budget_repo, 'data_prep/planted_forests_prep/SDPTv2_RFupdates/SDPTv2_utils.R')) + +#Step 2: Update sdpt_v2 with RFs from sdpt_v1 spreadsheet +#check and remove duplicated final_id codes from v1 +anyDuplicated(sdpt_v1$final_id) +sdptv1_dup <- sdpt_v1[duplicated(sdpt_v1$final_id) == TRUE, ] #create new DT with only duplicated final_id codes from v1 +sdptv1_trunc <- sdpt_v1[duplicated(sdpt_v1$final_id) == FALSE, ] #remove 47 duplicated final_id codes from v1 +sdptv1_dup <- left_join(sdptv1_dup, sdptv1_trunc, by = 'final_id') #check to see if duplicated values have the same RFs +sdptv1_dup <- sdptv1_dup[growth.x != growth.y] #all duplicated values match +rm(sdptv1_dup) + +#copy V1 RFs to V2 +sdptv1_trunc <- sdptv1_trunc[, .(final_id, growth, SD_error, grow_source)] #keep only columns in v1 that need to be copied over to v2 +sdptv2_update <- left_join(sdpt_v2, sdptv1_trunc, by = 'final_id') #join v1 and v2 by 'final_id' +sdptv2_update[, ":=" (removal_factor_Mg_AGC_BGC_ha_yr = growth, #move v1 data into v2 columns + removal_factor_SD_Mg_AGC_BGC_ha_yr = SD_error, + removal_factor_source = grow_source)] + +sdptv2_update[, ":=" (growth = NULL, #remove v1 columns + SD_error = NULL, + grow_source = NULL)] + +sdptv2_update <- sdptv2_update[ , removal_factor_notes := as.character(removal_factor_notes)] +sdptv2_update$sd_source <- as.character('') #add sd_source column +rm(sdptv1_trunc) + + +#Step 3: Update sdpt_v2 with updated RFs from sdpt_calc spreadsheet +sdpt_calc_trunc <- sdpt_calc[, .(final_id, `agc (t C/ha/yr)`, agc_uncertainty, `bgc (t C/ha/yr)`, bgc_uncertainty, removal_factor_Mg_AGC_BGC_ha_yr, removal_factor_SD_Mg_AGC_BGC_ha_yr, removal_factor_Mg_AGC_ha_yr, removal_factor_SD_Mg_AGC_ha_yr, removal_factor_source, sd_source, removal_factor_notes, removal_factor_region)] +sdptv2_update <- left_join(sdptv2_update, sdpt_calc_trunc, by = 'final_id') + +#check which final_ids have updated RF values #32 final_ids have been changed or updated from v1 +sdptv2_check <- sdptv2_update[is.na(removal_factor_Mg_AGC_BGC_ha_yr.x) == FALSE & is.na(removal_factor_Mg_AGC_BGC_ha_yr.y) == FALSE & + between(removal_factor_Mg_AGC_BGC_ha_yr.x, 
(removal_factor_Mg_AGC_BGC_ha_yr.y - 1), (removal_factor_Mg_AGC_BGC_ha_yr.y + 1)) == FALSE] +#rm(sdptv2_check) + +#replace empty v2 RFs with v1 values +sdptv2_update[is.na(removal_factor_Mg_AGC_BGC_ha_yr.y), ":=" + (removal_factor_Mg_AGC_BGC_ha_yr.y = removal_factor_Mg_AGC_BGC_ha_yr.x, + removal_factor_SD_Mg_AGC_BGC_ha_yr.y = removal_factor_SD_Mg_AGC_BGC_ha_yr.x, + removal_factor_notes.y = removal_factor_source.x)] + +#remove V1 columns +sdptv2_update[, ":=" (removal_factor_Mg_AGC_BGC_ha_yr.x = NULL, + removal_factor_SD_Mg_AGC_BGC_ha_yr.x = NULL, + removal_factor_source.x = NULL, + removal_factor_notes.x = NULL, + sd_source.x = NULL)] + + +#Step 4: Calculate removal factors for mixed classes +#Cambodia +#KHM_4 - Acacia and Teak +rf_average('KHM_4', 'KHM_9', 'IDN_7', sdptv2_update) + +sdptv2_update[final_id == 'KHM_4' , ":=" + (removal_factor_source.y = 'IPCC 2019 Refinement MAI Table 4.11', + removal_factor_region = 'Asia', + removal_factor_notes.y = 'Average of Acacia sp. for S and SE Asia and Tectona grandis for Asia.')] + +#Uruguay +#URY_13 & URY_14: Eucalyptus mix +sdptv2_update[final_id == 'URY_13' | final_id == 'URY_14', ":=" + (`agc (t C/ha/yr)` = sdptv2_update[final_id == 'URY_11', `agc (t C/ha/yr)`], + agc_uncertainty = sdptv2_update[final_id == 'URY_11', agc_uncertainty], + `bgc (t C/ha/yr)` = sdptv2_update[final_id == 'URY_11', `bgc (t C/ha/yr)`], + bgc_uncertainty = sdptv2_update[final_id == 'URY_11', bgc_uncertainty], + removal_factor_Mg_AGC_ha_yr = sdptv2_update[final_id == 'URY_11', removal_factor_Mg_AGC_ha_yr], + removal_factor_SD_Mg_AGC_ha_yr = sdptv2_update[final_id == 'URY_11', removal_factor_SD_Mg_AGC_ha_yr], + removal_factor_Mg_AGC_BGC_ha_yr.y = sdptv2_update[final_id == 'URY_11', removal_factor_Mg_AGC_BGC_ha_yr.y], + removal_factor_SD_Mg_AGC_BGC_ha_yr.y = sdptv2_update[final_id == 'URY_11', removal_factor_SD_Mg_AGC_BGC_ha_yr.y], + removal_factor_source.y = sdptv2_update[final_id == 'URY_11', removal_factor_source.y], + sd_source.y = sdptv2_update[final_id == 'URY_11', sd_source.y], + removal_factor_region = sdptv2_update[final_id == 'URY_11', removal_factor_region], + removal_factor_notes.y = 'Assume similar to other Eucalyptus sp. for Uruguay.')] + +#URY_16: Pinus elliotti and Pinus taeda +sdptv2_update[final_id == 'URY_16', ":=" + (`agc (t C/ha/yr)` = sdptv2_update[final_id == 'URY_18', `agc (t C/ha/yr)`], + agc_uncertainty = sdptv2_update[final_id == 'URY_18', agc_uncertainty], + `bgc (t C/ha/yr)` = sdptv2_update[final_id == 'URY_18', `bgc (t C/ha/yr)`], + bgc_uncertainty = sdptv2_update[final_id == 'URY_18', bgc_uncertainty], + removal_factor_Mg_AGC_ha_yr = sdptv2_update[final_id == 'URY_18', removal_factor_Mg_AGC_ha_yr], + removal_factor_SD_Mg_AGC_ha_yr = sdptv2_update[final_id == 'URY_18', removal_factor_SD_Mg_AGC_ha_yr], + removal_factor_Mg_AGC_BGC_ha_yr.y = sdptv2_update[final_id == 'URY_18', removal_factor_Mg_AGC_BGC_ha_yr.y], + removal_factor_SD_Mg_AGC_BGC_ha_yr.y = sdptv2_update[final_id == 'URY_18', removal_factor_SD_Mg_AGC_BGC_ha_yr.y], + removal_factor_source.y = sdptv2_update[final_id == 'URY_18', removal_factor_source.y], + sd_source.y = sdptv2_update[final_id == 'URY_18', sd_source.y], + removal_factor_region = sdptv2_update[final_id == 'URY_18', removal_factor_region], + removal_factor_notes.y = 'Assume similar to Pinus pinaster for Uruguay.')] + +#URY_17: Salix sp. and Populus sp. 
+rf_average('URY_17', 'ARG_22', 'ARG_25', sdptv2_update) + +sdptv2_update[final_id == 'URY_17' , ":=" + (removal_factor_source.y = 'FAO 2006 Planted Forest Assessment Table 6a | IPCC 2019 Refinement MAI Table 4.11', + removal_factor_region = 'South America', + removal_factor_notes.y = 'Average of Salix sp. for Argentina and Populus sp. for South America.')] + +#Mexico +mex_mix_list <- sdptv2_update[iso3 == 'MEX' & is.na(sciName2) == FALSE & is.na(removal_factor_Mg_AGC_ha_yr) == TRUE, final_id] + +for(i in 1:(length(mex_mix_list)+1)){ + id <- mex_mix_list[i] + rf_average_country(id, sdptv2_update, 'Mexico') +} + +#Guatemala +gtm_mix_list <- sdptv2_update[iso3 == 'GTM' & is.na(sciName2) == FALSE & is.na(removal_factor_Mg_AGC_ha_yr) == TRUE, final_id] + +for(i in 1:(length(gtm_mix_list)+1)){ + id <- gtm_mix_list[i] + rf_average_country(id, sdptv2_update, 'Guatemala') +} + + +##Step 5: QA/QC +#Check that all classes have AGB and BGB split +check <- sdptv2_update[is.na(`agc (t C/ha/yr)`) & is.na(`bgc (t C/ha/yr)`),] +sdptv2_update[is.na(`agc (t C/ha/yr)`) & is.na(`bgc (t C/ha/yr)`), ":=" + (`agc (t C/ha/yr)` = , + `bgc (t C/ha/yr)` = , + agc_uncertainty = , + bgc_uncertainty = , + removal_factor_Mg_AGC_ha_yr = , + removal_factor_SD_Mg_AGC_ha_yr = , + removal_factor_source.y = , + sd_source.y = , )] + + +#Step 6: Write out csv/ xlsx + +write.xlsx(sdptv2_update, 'C://Users/Melissa.Rose/OneDrive - World Resources Institute/Documents/Projects/sdpt/SPDT_v2.0_updates_MR_083123.xlsx') + + +##################################################################################################################### + +#To keep version 1 rf +#sdptv2_calc_update <- sdptv2_comb[is.na(removal_factor_Mg_AGC_BGC_ha_yr.x), ":=" +# (removal_factor_Mg_AGC_BGC_ha_yr.x = removal_factor_Mg_AGC_BGC_ha_yr.y, +# removal_factor_SD_Mg_AGC_BGC_ha_yr.x = removal_factor_SD_Mg_AGC_BGC_ha_yr.y, +# removal_factor_notes = removal_factor_source.y)] + +#sdptv2_calc_update[, ":=" (removal_factor_Mg_AGC_BGC_ha_yr.y = NULL, +# removal_factor_SD_Mg_AGC_BGC_ha_yr.y = NULL, +# removal_factor_source.y = NULL)] \ No newline at end of file diff --git a/data_prep/planted_forests_prep/SDPTv2_RFupdates/SDPTv2_utils.R b/data_prep/planted_forests_prep/SDPTv2_RFupdates/SDPTv2_utils.R new file mode 100644 index 00000000..4ebe0fb4 --- /dev/null +++ b/data_prep/planted_forests_prep/SDPTv2_RFupdates/SDPTv2_utils.R @@ -0,0 +1,94 @@ +#Author: Melissa Rose +#Date Modified: 08-14-2023 +#Title: SDPTv2_utils +############################################################################### + +rf_average <- function(ID, ID1, ID2, datatable){ + #Aboveground + datatable[final_id == ID , ":=" + ( `agc (t C/ha/yr)` = mean(c(datatable[final_id == ID1, `agc (t C/ha/yr)`], + datatable[final_id == ID2, `agc (t C/ha/yr)`])), + + agc_uncertainty = sqrt(((datatable[final_id == ID1, `agc (t C/ha/yr)`] * datatable[final_id == ID1, agc_uncertainty])^2) + +((datatable[final_id == ID2, `agc (t C/ha/yr)`] * datatable[final_id == ID2, agc_uncertainty])^2)) + /(datatable[final_id == ID1, `agc (t C/ha/yr)`] + datatable[final_id == ID2, `agc (t C/ha/yr)`]))] + #Belowground + datatable[final_id == ID , ":=" + (`bgc (t C/ha/yr)` = 0.26 * `agc (t C/ha/yr)` , + bgc_uncertainty = agc_uncertainty)] + #combined + datatable[final_id == ID , ":=" + (removal_factor_Mg_AGC_ha_yr = `agc (t C/ha/yr)`, + removal_factor_SD_Mg_AGC_ha_yr = `agc (t C/ha/yr)` * agc_uncertainty, + removal_factor_Mg_AGC_BGC_ha_yr.y = `agc (t C/ha/yr)` + `bgc (t C/ha/yr)`, + removal_factor_SD_Mg_AGC_BGC_ha_yr.y = (`agc (t 
C/ha/yr)` + `bgc (t C/ha/yr)`) * agc_uncertainty, + sd_source.y = 'Standard deviation calculated from combined standard deviations of input classes')] +} + +rf_average_country <- function(id_value, datatable, country_name) { + sciName_list <- as.data.table(t(remove_empty(as.data.frame(datatable[final_id == id_value, .(sciName1, sciName2, sciName3, sciName4, sciName5, sciName6, sciName7, sciName8, sciName9, sciName10, sciName11, sciName12, sciName13)]), which = 'cols'))) + if(ncol(sciName_list)>0){colnames(sciName_list)[1] = 'sciName'} + + sciName_list[, ":=" + (final_id = as.character(NA), + `agc (t C/ha/yr)` = as.numeric(NA), + removal_factor_SD_Mg_AGC_ha_yr = as.numeric(NA))] + + for(i in 1:(nrow(sciName_list)+1)){ + species <- sciName_list[i, 1] + id <- datatable[country == country_name & sciName1 == paste0(species) & is.na(sciName2), final_id] + if(length(id)>1){id <- id[1]} + if(length(id)>0){sciName_list[i, 2] <- id} + } + + for(i in 1:(nrow(sciName_list)+1)){ + id <- sciName_list[i, 2] + agc <- datatable[final_id == paste0(id), `agc (t C/ha/yr)`] + agc_sd <- datatable[final_id == paste0(id), removal_factor_SD_Mg_AGC_ha_yr] + sciName_list[final_id == id, `agc (t C/ha/yr)` := agc] + sciName_list[final_id == id, removal_factor_SD_Mg_AGC_ha_yr := agc_sd] + } + + if(country_name == 'Mexico'){ + sciName_list[is.na(sciName_list$final_id), ":=" + ( sciName = 'average RF', + final_id = 'average RF', + `agc (t C/ha/yr)` = 3.796803393, + removal_factor_SD_Mg_AGC_ha_yr = 0.846613629)]} + + if(country_name == 'Guatemala'){ + sciName_list[is.na(sciName_list$final_id), ":=" + ( sciName = 'average RF', + final_id = 'average RF', + `agc (t C/ha/yr)` = 4.863677551, + removal_factor_SD_Mg_AGC_ha_yr = 1.519178923)]} + + + + sciName_list[, agc_sd_2 := (removal_factor_SD_Mg_AGC_ha_yr)^2] + + agc_avg <- mean(sciName_list[, `agc (t C/ha/yr)`]) + bgc_avg <- 0.26*agc_avg + agc_avg_sd <- sqrt(sum(sciName_list[, agc_sd_2])/nrow(sciName_list)) + agc_avg_unc <- agc_avg_sd / agc_avg + bgc_avg_unc <- agc_avg_unc + agc_bgc_avg <- agc_avg + bgc_avg + combined_unc <- agc_avg_unc + agc_bgc_avg_sd <- agc_bgc_avg*combined_unc + + + datatable[final_id == id_value, ':=' ( + `agc (t C/ha/yr)` = agc_avg, + agc_uncertainty = agc_avg_unc, + `bgc (t C/ha/yr)` = bgc_avg, + bgc_uncertainty = bgc_avg_unc, + removal_factor_Mg_AGC_ha_yr = agc_avg, + removal_factor_SD_Mg_AGC_ha_yr = agc_avg_sd, + removal_factor_Mg_AGC_BGC_ha_yr.y = agc_bgc_avg, + removal_factor_SD_Mg_AGC_BGC_ha_yr.y = agc_bgc_avg_sd, + removal_factor_source.y = 'mix', + sd_source.y = 'Standard deviation calculated from combined standard deviations of input classes', + removal_factor_region = country_name, + removal_factor_notes.y = paste0('Assume average of ', word(toString(sciName_list[,sciName]), start = 0, end = -3), ' and ', word(toString(sciName_list[,sciName]), start = -2, end = -1), ' for ', country_name, '.') + )] +} \ No newline at end of file diff --git a/data_prep/planted_forests_prep/__init__.py b/data_prep/planted_forests_prep/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data_prep/planted_forests_prep/mp_plantation_preparation.py b/data_prep/planted_forests_prep/mp_plantation_preparation.py new file mode 100644 index 00000000..034620cf --- /dev/null +++ b/data_prep/planted_forests_prep/mp_plantation_preparation.py @@ -0,0 +1,376 @@ +#TODO Add establishment year from Du et al. update as another output column. Change readme accordingly. + +""" +I updated the plantation rasterization scripts to use SDPT v2.0 in August 2023. 
+I tested them locally and they worked fine. +I was even able to ingest the entire gdb into postgres on an r5d spot machine eventually. +However, when I tried running rasterizing on the full postgres database on the spot machine, I ran into issues. +First, rasterizing 1x1 tiles was really slow, like 20 minutes per tile. +Second, a lot of tiles were throwing errors about not existing or something (I really don't remember what it was). +I couldn't figure out why rasterization was so slow and why some tiles were failing. +So I gave up on using this script and switched to rasterizing SDPT v2 using the gfw-data-api, +which also had issues but ended up seeming to work fine. +I am wondering if rasterizing with this script was very slow because I could never get PostGIS to make a spatial +index for the database after ingestion, unlike when I rasterized SDPT v1. +Also, I realized that the 1x1 tile fishnet shapefile I was using to rasterize didn't have projection information in it for +some reason, so maybe it was very hard to match with the database features. +Anyhow, those are both things to investigate in case I do try to get this script working in the future once again. +""" + + +""" +This script rasterizes four attributes of the Spatial Database of Planted Trees v2 (SDPT v2) geodatabase +into 10x10 degree tiles: +1) aboveground carbon removal factors (AGC RF), +2) AGC RF standard deviations, +3) general plantation type (1: oil palm, 2: wood fiber, 3: other), +4) planted forest establishment year. + +Unlike other carbon model scripts, this one requires prep work be done outside the script +(the two large, commented chunks below this). + +Chunk 1: Steps for getting the SDPT gdb into PostGIS for rasterization. +This must be done any time the script is tested or run. +The reason for importing the SDPT to PostGIS is that it simply merges all the SDPT country feature classes into a +single table, which makes rasterization easier. +Chunk 2: Steps for creating a 1x1 degree fishnet shapefile covering all countries that have SDPT in them. +This must be done any time the SDPT is revised in a way that changes the countries included in the SDPT. +So, it's not part of the regular workflow for SDPT rasterization. + +After Chunk 1 is completed (ingesting SDPT into PostGIS), this script goes through two steps to create 10x10 +SDPT property tiles: +1. Create 1x1 deg SDPT property tiles by iterating across the 1x1 fishnet that covers SDPT countries +2. Create 10x10 deg SDPT property tiles by iterating across the supplied tile_id_list + +NOTE: 10N_010E is a good tile to test this script on because it includes CMR, which has a few planted forests in it, +so it processes fairly quickly. Note that setting -l in the command line doesn't limit the area used in the first step; +testing the first step (1x1 rasterization) is best done by commenting the code chunk that sets df = df.iloc[[XYZ]]. + +NOTE: If the column names in the SDPT gdb related to the properties attributes rasterized here change, +they must be modified in Chunk 1 below and in plantation_preparation.create_1x1_plantation_from_1x1_gadm. + +python -m data_prep.mp_plantation_preparation -l 10N_010E -nu +python -m data_prep.mp_plantation_preparation -l all +""" + +""" +Chunk 1: Ingesting SDPT into PostGIS +Before running this script, the SDPT gdb must be converted into a PostGIS table. That's more easily done as a series +of commands than as a script. Below are the instructions for creating a single PostGIS table of all plantations. 
+This assumes that the plantation gdb has one feature class for each country with plantations (and the EU can be in a combined feature class). +PostGIS column names below need to match the corresponding SDPT gdb attributes. +This can be done in a flux model Docker container locally (for testing) or in ec2 (for a full run). + +# Start a r5d.24xlarge ec2 instance (or some other size r5d machine for ec2 testing) +# Because this is a long process where there aren't really good checkpoints, it's best to use an on-demand ec2 +# instance instead of a spot instance. This would be set up in the AWS console using template +# https://us-east-1.console.aws.amazon.com/ec2/home?region=us-east-1#LaunchTemplateDetails:launchTemplateId=lt-00205de607ab6d4d9, +# version 30 (on-demand instance). + +# Change directory to where data are kept in the docker +cd /usr/local/tiles/ + +# Copy zipped plantation gdb with growth rate field in tables +aws s3 cp s3://gfw-files/plantations/SDPT_2.0/sdpt_v2.0_v08182023.gdb.zip . + +# I tried unzipping the gdb with sdpt_v2.0_v07102023.gdb.zip but got an error about extra bytes at the beginning. +# I followed the solution at https://unix.stackexchange.com/a/115831 to fix the zipped gdb before unzipping. +zip -FFv sdpt_v2.0_v08182023.gdb.zip --out sdpt_v2.0_v08182023.gdb.fixed.zip + +# Unzip the gdb (1.5 minutes) +time unzip sdpt_v2.0_v08182023.gdb.fixed.zip + +# Start the postgres service and check that it has actually started +service postgresql restart +pg_lsclusters + +# Create a postgres database called ubuntu. +createdb ubuntu + +# Enter the postgres database called ubuntu and add the postgis exension to it +psql +CREATE EXTENSION postgis; +\q + +# Add the feature class of one country's plantations to PostGIS. This creates the "all_plant" table for other countries to be appended to. +# Using ogr2ogr requires the PG connection info but entering the PostGIS shell (psql) doesn't. +# The fields in the SELECT statement need to change if the column names in the gdb attribute tables are changed. +ogr2ogr -f Postgresql PG:"dbname=ubuntu" sdpt_v2.0_v08182023.gdb -progress -nln all_plant --config PG_USE_COPY YES -sql "SELECT growth, simpleName, growSDerror FROM cmr_plant_v2" + +# Enter PostGIS and check that the table is there and that it has the correct fields with reasonable values. +psql +\d+ all_plant; +SELECT * FROM all_plant LIMIT 2; # To see what the first two rows look like +SELECT COUNT (*) FROM all_plant; # Should be 7777 for CMR in plantations v2.0 + +# Delete all rows from the table so that it is now empty +DELETE FROM all_plant; +\q + +# Get a list of all feature classes (countries) in the geodatabase and save it as a txt +ogrinfo sdpt_v2.0_v08182023.gdb | cut -d: -f2 | cut -d'(' -f1 | grep plant | grep -v Open | sed -e 's/ //g' > out.txt + +# Make sure all the country tables are listed in the txt, then exit it +more out.txt +q + +# Run a loop in bash that iterates through all the gdb feature classes and imports them to the all_plant PostGIS table. +# I think it's okay that there's a message "Warning 1: organizePolygons() received a polygon with more than 100 parts. The processing may be really slow. You can skip the processing by setting METHOD=SKIP, or only make it analyze counter-clock wise parts by setting METHOD=ONLY_CCW if you can assume that the outline of holes is counter-clock wise defined" +# It just seems to mean that the processing is slow, but the processing methods haven't changed. 
+# KOR is very slowly; it paused at the last increment for about 30 minutes but did actually finish. +# Overall, this took about 75 minutes on SDPT v2. IDN is the last feature class to be imported and it hung at +# the very last progress marker for a long time. +# However, I knew importing was continuing because htop showed an ogr2ogr process running. +time while read p; do echo $p; ogr2ogr -f Postgresql PG:"dbname=ubuntu" sdpt_v2.0_v08182023.gdb -nln all_plant --config PG_USE_COPY YES -progress -append -sql "SELECT growth, simpleName, growSDerror FROM $p"; done < out.txt + +psql +SELECT COUNT (*) FROM all_plant; +\q +# 25,917,174 features in SDPT v2 gdb + +# Create a spatial index of the plantation table to speed up the intersections with 1x1 degree tiles +# This doesn't work for v2.0 in the postgres/postgis in Docker but it does work for v2.0 in r4 instances outside Docker. +# When I try to do this in Docker, I get "column 'shape' does not exist", even though it clearly shows up in the table. +# When I do this on v2.0 in an r4 instance, swapping wkb_geometry for shape, it works fine and adds another index. +# However, I'm not sure that adding this index is necessary; +# tables in both r4 and Docker already have an index "all_plant_Shape_geom_idx" gist ("Shape") according to \d+. +# And when I do what is described at https://gis.stackexchange.com/questions/241599/finding-postgis-tables-that-are-missing-indexes, +# no tables are shown as missing indexes. +psql +CREATE INDEX IF NOT EXISTS all_plant_index ON all_plant using gist(shape); + +# Adds a new column to the table and stores the plantation type reclassified as 1 (palm), 2 (wood fiber), or 3 (other) +# The SDPT planted forest category field name must be changed below to match the gdb. +ALTER TABLE all_plant ADD COLUMN type_reclass SMALLINT; +# Based on https://stackoverflow.com/questions/15766102/i-want-to-use-case-statement-to-update-some-records-in-sql-server-2005 +UPDATE all_plant SET type_reclass = (CASE WHEN simpleName = 'Oil palm' THEN 1 WHEN simpleName = 'Oil palm mix' THEN 1 WHEN simpleName = 'Wood fiber or timber' THEN 2 WHEN simpleName = 'Wood fiber or timber mix' THEN 2 ELSE 3 END); +\q +# Takes about 10 minutes to reclassify type_reclass + +""" + +""" +Chunk 2: Preparing 1x1 degree fishnet shapefile for countries that have SDPT +Did these steps locally 8/18/23. +They are a combination of ArcMap and command line in the Docker container. +It's a mishmash but for a task that rarely needs to occur (changing the countries in the SDPT), it's fine to not script it. +The 1x1 degree shapefile is what's actually used for the script below; the PostGIS parts in this chunk are just a simple +way to select the relevant countries from the GADM iso shapefile compared to using ArcMap but all of this could be +done in ArcMap. 
+ +# Made 1x1 deg global fishnet in ArcMap +arcpy.CreateFishnet_management(out_feature_class="C:/GIS/Carbon_model/test_tiles/docker_output/1x1_deg_fishnet_global__20230818.shp", origin_coord="-180 -90", y_axis_coord="-180 -80", cell_width="1", cell_height="1", number_rows="", number_columns="", corner_coord="180 90", labels="NO_LABELS", template="-180 -90 180 90", geometry_type="POLYGON") + +# Set up Postgres in Docker container (as in Chunk 1) +service postgresql restart +pg_lsclusters +createdb ubuntu +psql +CREATE EXTENSION postgis; +\q + +# Imported GADM iso to postgres in Docker container command line +ogr2ogr -f Postgresql PG:"dbname=ubuntu" gadm_3_6_by_iso.shp -progress -nln gadm_3_6_iso_final -nlt PROMOTE_TO_MULTI; + +# Imported 1x1 fishnet to postgres in Docker container command line +ogr2ogr -f Postgresql PG:"dbname=ubuntu" 1x1_deg_fishnet_global__20230818.shp -progress -nln fishnet_1x1_deg -s_srs EPSG:4326 -t_srs EPSG:4326; + +psql + +# Select all the GADM features that have plantations in them according to cn.SDPT_v2_iso_codes and make a new table in Docker container command line +# Replace cn.SDPT_v2_iso_codes with the actual list of codes in constants_and_names. +# This list differs from cn.SDPT_v2_feature_classes because this one lists all the countries contained in EU, +# plus some other countries that may have SDPT but aren't actually in the EU (mostly in Europe, though). +CREATE TABLE gadm_sdpt_v2 AS (SELECT * FROM gadm_3_6_iso_final WHERE iso IN (cn.SDPT_v2_iso_codes)); +# e.g., CREATE TABLE gadm_sdpt_v2 AS (SELECT * FROM gadm_3_6_iso_final WHERE iso IN ('AGO', 'ARG', 'ARM', 'AUS', 'AZE'...)); +\q +# 157 countries selected for SDPT v2 + +# Export the countries that have SDPT v2 to a shapefile (just for QC/reference) in Docker container command line. Takes a few minutes, pauses at a few places. +ogr2ogr -f "ESRI Shapefile" gadm_3_6_by_iso__with_SDPT_v2__20230821.shp PG:"dbname=ubuntu" gadm_sdpt_v2 -progress + +# Select all the 1x1 degree cells that intersect GADM with SDPT v2 and make a new table in ArcMap +arcpy.SelectLayerByLocation_management(in_layer="1x1_deg_fishnet_global__20230818", overlap_type="INTERSECT", select_features="gadm_3_6_by_iso__with_SDPT_v2__20230821", search_distance="", selection_type="NEW_SELECTION", invert_spatial_relationship="NOT_INVERT") +# Then saved the selection to fishnet_1x1_deg_SDPTv2_extent__20230821.shp in ArcMap (17,127 features) + +# Add bounding coordinate attributes to shapefile of 1x1 boxes that are within SDPT v2 countries in ArcMap +arcpy.AddGeometryAttributes_management(Input_Features="1x1_deg_fishnet_SDPTv2_extent__20230821", Geometry_Properties="EXTENT", Length_Unit="", Area_Unit="", Coordinate_System="") + +# Add field that will store NW corner of each feature in ArcMap +arcpy.AddField_management(in_table="fishnet_1x1_deg_SDPTv2_extent__20230821", field_name="NW_corner", field_type="TEXT", field_precision="", field_scale="", field_length="", field_alias="", field_is_nullable="NULLABLE", field_is_required="NON_REQUIRED", field_domain="") + +# Populate field with coordinate of NW corner of 1x1 cells that overlap with SDPT v2 countries in ArcMap +arcpy.CalculateField_management(in_table="fishnet_1x1_deg_SDPTv2_extent__20230821", field="NW_corner", expression="str(!EXT_MAX_Y!) + '_' + str(!EXT_MIN_X!)", expression_type="PYTHON_9.3", code_block="") + +# Ultimately, the list of northwest corners of the 1x1 cells is what's used in the first stage of the script below. 
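
# If these fishnet steps ever need to be redone without ArcMap, a rough Python equivalent of the fishnet and
# NW-corner steps could look like the sketch below. This is only an illustration: it assumes geopandas and shapely
# are installed (they are not necessarily in this Docker image) and reuses the file names from the steps above.

import geopandas as gpd
from shapely.geometry import box

# 1x1 degree global fishnet (equivalent of the ArcMap CreateFishnet step)
cells = [box(x, y, x + 1, y + 1) for x in range(-180, 180) for y in range(-90, 90)]
fishnet = gpd.GeoDataFrame(geometry=cells, crs='EPSG:4326')

# Keep only the cells that intersect countries with SDPT v2 (equivalent of SelectLayerByLocation)
gadm_sdpt = gpd.read_file('gadm_3_6_by_iso__with_SDPT_v2__20230821.shp')
joined = gpd.sjoin(fishnet, gadm_sdpt, how='inner', predicate='intersects')
fishnet_sdpt = joined.loc[~joined.index.duplicated(keep='first'), ['geometry']]

# Label each cell with its northwest corner, e.g. 10.0_-80.0
# (equivalent of the AddGeometryAttributes/AddField/CalculateField steps)
bounds = fishnet_sdpt.bounds
fishnet_sdpt['NW_corner'] = bounds['maxy'].astype(str) + '_' + bounds['minx'].astype(str)
fishnet_sdpt.to_file('fishnet_1x1_deg_SDPTv2_extent__20230821.shp')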
+""" + +from multiprocessing.pool import Pool +from functools import partial +from simpledbf import Dbf5 +import glob +import datetime +import argparse +import os +import sys + +import constants_and_names as cn +import universal_util as uu +from data_prep.planted_forests_prep import plantation_preparation + + +def mp_plantation_preparation(tile_id_list): + + os.chdir(cn.docker_tile_dir) + + # If a full model run is specified, the correct set of tiles for the particular script is listed + if tile_id_list == 'all': + # List of all possible 10x10 Hansen tiles except for those at very extreme latitudes (not just WHRC biomass tiles) + tile_id_list = uu.tile_list_s3(cn.pixel_area_dir) + + uu.print_log(tile_id_list) + uu.print_log(f'There are {str(len(tile_id_list))} tiles to process', "\n") + + + # List of output directories and output file name patterns + output_dir_list = [cn.annual_gain_AGC_BGC_planted_forest_dir, + cn.stdev_annual_gain_AGC_BGC_planted_forest_dir, + cn.planted_forest_type_dir, + cn.planted_forest_estab_year_dir] + + output_pattern_list = [cn.pattern_annual_gain_AGC_BGC_planted_forest, + cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest, + cn.pattern_planted_forest_type, + cn.pattern_planted_forest_estab_year] + + # If the model run isn't the standard one, the output directory and file names are changed + if cn.SENSIT_TYPE != 'std': + uu.print_log('Changing output directory and file name pattern based on sensitivity analysis') + output_dir_list = uu.alter_dirs(cn.SENSIT_TYPE, output_dir_list) + output_pattern_list = uu.alter_patterns(cn.SENSIT_TYPE, output_pattern_list) + + # A date can optionally be provided by the full model script or a run of this script. + # This replaces the date in constants_and_names. + # Only done if output upload is enabled. 
+ if cn.RUN_DATE is not None and cn.NO_UPLOAD is False: + output_dir_list = uu.replace_output_dir_date(output_dir_list, cn.RUN_DATE) + + + ### Step 1: Creation of 1x1 degree planted forest property tiles + + # Downloads and unzips the GADM shapefile, which will be used to create 1x1 tiles of land areas + uu.s3_file_download(os.path.join(cn.plantations_dir, f'{cn.pattern_gadm_1x1_index}.zip'), cn.docker_tile_dir, 'std') + if not os.path.exists(f'{cn.pattern_gadm_1x1_index}.zip'): + cmd = ['unzip', f'{cn.pattern_gadm_1x1_index}.zip'] + uu.log_subprocess_output_full(cmd) + + # Gets the attribute table of the country extent 1x1 tile shapefile + gadm = glob.glob(f'{cn.pattern_gadm_1x1_index}*.dbf')[0] + + # Converts the attribute table to a dataframe + dbf = Dbf5(gadm) + df = dbf.to_dataframe() + + # To select one cell to test methods on + df = df.iloc[[1513]] + uu.print_log('Testing on ', df) + + # Converts the column of the dataframe with the names of the tiles (which contain their coordinates) to a list + gadm_list_1x1 = df['NW_corner'].tolist() + gadm_list_1x1 = [str(y) for y in gadm_list_1x1] + # uu.print_log("List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:", gadm_list_1x1) + uu.print_log("There are", len(gadm_list_1x1), "1x1 country extent tiles to iterate through.") + + # Creates 1x1 degree tiles of plantation attributes wherever there are plantations + # by iterating through all 1x1 tiles that intersect with countries that have planted forests + if cn.SINGLE_PROCESSOR: + for tile in gadm_list_1x1: + plantation_preparation.create_1x1_plantation_from_1x1_gadm(tile) + else: + processes = 35 #40 processors=730 GB peak + uu.print_log('Create 1x1 plantation attributes from 1x1 gadm max processors=', processes) + pool = Pool(processes) + pool.map(plantation_preparation.create_1x1_plantation_from_1x1_gadm, gadm_list_1x1) + pool.close() + pool.join() + + + ### Step 2: Creation of 10x10 degree planted forest property tiles + ### from 1x1 degree planted forest property tiles + + # Creates a mosaic of all the 1x1 plantation removal factor tiles + plant_RF_1x1_vrt = 'plant_RF_1x1.vrt' + uu.print_log('Creating vrt of 1x1 planted forest removal factor tiles') + os.system(f'gdalbuildvrt {plant_RF_1x1_vrt} *{cn.pattern_annual_gain_AGC_BGC_planted_forest}.tif') + + # Creates a mosaic of all the 1x1 plantation removals factor standard deviation tiles + plant_stdev_1x1_vrt = 'plant_stdev_1x1.vrt' + uu.print_log('Creating vrt of 1x1 planted forest removal factor standard deviation tiles') + os.system(f'gdalbuildvrt {plant_stdev_1x1_vrt} *{cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest}.tif') + + # Creates a mosaic of all the 1x1 planted forest type tiles + plant_type_1x1_vrt = 'plant_type_1x1.vrt' + uu.print_log('Creating vrt of 1x1 planted forest type tiles') + os.system(f'gdalbuildvrt {plant_type_1x1_vrt} *{cn.pattern_planted_forest_type}.tif') + + # Creates a mosaic of all the 1x1 planted forest establishment year tiles + plant_estab_year_1x1_vrt = 'plant_estab_year_1x1.vrt' + uu.print_log('Creating vrt of 1x1 planted forest establishment year tiles') + os.system(f'gdalbuildvrt {plant_estab_year_1x1_vrt} *{cn.pattern_planted_forest_estab_year}.tif') + + # Creates 10x10 degree tiles of plantation attributes iterating over the set of tiles supplied + # at the start of the script that are in latitudes with planted forests. 
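    # For reference, a sketch of the warp each tile presumably goes through inside create_10x10_plantation_tiles,
    # based on the SDPT v1 functions (create_10x10_plantation_gain etc.) in the deleted plantation_preparation.py
    # that this script replaces: each 1x1 vrt is warped to the 10x10 tile's bounds at Hansen resolution,
    # e.g. for the removal factor vrt:
    # xmin, ymin, xmax, ymax = uu.coords(tile_id)
    # cmd = ['gdalwarp', '-tr', str(cn.Hansen_res), str(cn.Hansen_res),
    #        '-co', 'COMPRESS=DEFLATE', '-tap', '-te', str(xmin), str(ymin), str(xmax), str(ymax),
    #        '-dstnodata', '0', '-t_srs', 'EPSG:4326', '-overwrite', '-ot', 'Float32',
    #        plant_RF_1x1_vrt, f'{tile_id}_{cn.pattern_annual_gain_AGC_BGC_planted_forest}.tif']
    # uu.log_subprocess_output_full(cmd)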
+    if cn.SINGLE_PROCESSOR:
+        for tile_id in tile_id_list:
+            plantation_preparation.create_10x10_plantation_tiles(tile_id, plant_RF_1x1_vrt, plant_stdev_1x1_vrt,
+                                                                  plant_type_1x1_vrt, plant_estab_year_1x1_vrt)
+    else:
+        processes = 40
+        uu.print_log('Create 10x10 plantation attributes max processors=', processes)
+        pool = Pool(processes)
+        pool.map(partial(plantation_preparation.create_10x10_plantation_tiles,
+                         plant_RF_1x1_vrt=plant_RF_1x1_vrt,
+                         plant_stdev_1x1_vrt=plant_stdev_1x1_vrt,
+                         plant_type_1x1_vrt=plant_type_1x1_vrt,
+                         plant_estab_year_1x1_vrt=plant_estab_year_1x1_vrt),
+                 tile_id_list)
+        pool.close()
+        pool.join()
+
+    # If cn.NO_UPLOAD flag is not activated (by choice or by lack of AWS credentials), output is uploaded
+    if not cn.NO_UPLOAD:
+        for output_dir, output_pattern in zip(output_dir_list, output_pattern_list):
+            uu.upload_final_set(output_dir, output_pattern)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='Create planted forest carbon removals rate tiles')
+    parser.add_argument('--tile_id_list', '-l', required=True,
+                        help='List of tile ids to use in the model. Should be of form 00N_110E or 00N_110E,00N_120E or all.')
+    parser.add_argument('--run-date', '-d', required=False,
+                        help='Date of run. Must be format YYYYMMDD.')
+    parser.add_argument('--no-upload', '-nu', action='store_true',
+                        help='Disables uploading of outputs to s3')
+    parser.add_argument('--single-processor', '-sp', action='store_true',
+                        help='Uses single processing rather than multiprocessing')
+    args = parser.parse_args()
+
+    # Sets global variables to the command line arguments
+    cn.RUN_DATE = args.run_date
+    cn.NO_UPLOAD = args.no_upload
+    cn.SINGLE_PROCESSOR = args.single_processor
+
+    tile_id_list = args.tile_id_list
+
+    # Disables upload to s3 if no AWS credentials are found in environment
+    if not uu.check_aws_creds():
+        cn.NO_UPLOAD = True
+
+    # Create the output log
+    uu.initiate_log(tile_id_list)
+
+    # Checks whether the tile_id_list argument is valid
+    tile_id_list = uu.tile_id_list_check(tile_id_list)
+
+    mp_plantation_preparation(tile_id_list)
diff --git a/data_prep/planted_forests_prep/plantation_preparation.py b/data_prep/planted_forests_prep/plantation_preparation.py
new file mode 100644
index 00000000..928dbe1f
--- /dev/null
+++ b/data_prep/planted_forests_prep/plantation_preparation.py
@@ -0,0 +1,157 @@
+"""
+This script rasterizes four attributes of the Spatial Database of Planted Trees v2 (SDPT v2) geodatabase
+into 10x10 degree tiles:
+1) aboveground carbon removal factors (AGC RF),
+2) AGC RF standard deviations,
+3) general plantation type (1: oil palm, 2: wood fiber, 3: other),
+4) planted forest establishment year.
+""" + +import datetime +import os +import sys +import rasterio + +import constants_and_names as cn +import universal_util as uu + +# Creates 1x1 degree tiles of planted forest attributes +def create_1x1_plantation_from_1x1_gadm(tile_1x1): + """ + :param tile_1x1: Top left (northwest) coordinates of 1x1 degree grid cell + :return: SDPT attributes as 1x1 degree geotifs (if SDPT occurs inside the 1x1 degree cell) + """ + + # Start time + start = datetime.datetime.now() + + # Gets the bounding coordinates for the 1x1 degree tile + coords = tile_1x1.split("_") + uu.print_log(coords) + ymax_1x1 = coords[0] + ymin_1x1 = float(ymax_1x1) - 1 + xmin_1x1 = coords[1] + xmax_1x1 = float(xmin_1x1) + 1 + + uu.print_log("For", tile_1x1, "-- xmin_1x1:", xmin_1x1, "; xmax_1x1:", xmax_1x1, + "; ymin_1x1", ymin_1x1, "; ymax_1x1:", ymax_1x1) + + RF_1x1 = f'{ymax_1x1}_{xmin_1x1}_{cn.pattern_annual_gain_AGC_BGC_planted_forest}.tif' + + uu.print_log('Rasterizing planted forest removal factor...') + cmd = ['gdal_rasterize', '-tr', str(cn.Hansen_res), str(cn.Hansen_res), '-co', 'COMPRESS=DEFLATE', + 'PG:dbname=ubuntu', + '-te', str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), + '-a', 'growth', '-a_nodata', '0', '-ot', 'Float32','-l', cn.planted_forest_postgis_db, RF_1x1] + uu.log_subprocess_output_full(cmd) + + uu.print_log(f'Checking if {RF_1x1} contains any data...') + no_data = uu.check_for_data(RF_1x1) + + uu.upload_log() + + if not no_data: + + uu.print_log(f' Data found in {RF_1x1}. Rasterizing other SDPT outputs...') + + uu.print_log('Rasterizing planted forest removal factor standard deviation...') + cmd = ['gdal_rasterize', '-tr', str(cn.Hansen_res), str(cn.Hansen_res), '-co', 'COMPRESS=DEFLATE', + 'PG:dbname=ubuntu', + '-te', str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), + '-a', 'growSDError', '-a_nodata', '0', '-ot', 'Float32', '-l', cn.planted_forest_postgis_db, + f'{ymax_1x1}_{xmin_1x1}_{cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest}.tif'] + uu.log_subprocess_output_full(cmd) + + uu.print_log('Rasterizing planted forest type...') + cmd = ['gdal_rasterize', '-tr', str(cn.Hansen_res), str(cn.Hansen_res), '-co', 'COMPRESS=DEFLATE', + 'PG:dbname=ubuntu', + '-te', str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), + '-a', 'type_reclass', '-a_nodata', '0', '-ot', 'Byte', '-l', cn.planted_forest_postgis_db, + f'{ymax_1x1}_{xmin_1x1}_{cn.pattern_planted_forest_type}.tif'] + uu.log_subprocess_output_full(cmd) + + # TODO: add the plantation year rasterization + + # uu.print_log('Rasterizing planted forest establishment year...') + # cmd = ['gdal_rasterize', '-tr', str(cn.Hansen_res), str(cn.Hansen_res), '-co', 'COMPRESS=DEFLATE', + # 'PG:dbname=ubuntu', + # '-te', str(xmin_1x1), str(ymin_1x1), str(xmax_1x1), str(ymax_1x1), + # '-a', 'plant_year', '-a_nodata', '0', '-ot', 'UInt8', '-l', cn.planted_forest_postgis_db, + # f'{ymax_1x1}_{xmin_1x1}_{cn.pattern_planted_forest_estab_year}.tif'] + # uu.log_subprocess_output_full(cmd) + + else: + + uu.print_log('No SDPT in 1x1 cell. 
Deleting raster.')
+        os.remove(RF_1x1)
+
+    # Prints information about the tile that was just processed
+    uu.end_of_fx_summary(start, tile_1x1, cn.pattern_annual_gain_AGC_BGC_planted_forest)
+
+
+# Combines the 1x1 planted forest attribute tiles into 10x10 planted forest attribute tiles
+def create_10x10_plantation_tiles(tile_id, plant_RF_1x1_vrt, plant_stdev_1x1_vrt,
+                                  plant_type_1x1_vrt, plant_estab_year_1x1_vrt):
+    """
+    :param tile_id: tile to be processed, identified by its tile id
+    :param plant_RF_1x1_vrt: VRT of all planted forest removal factor 1x1 geotifs
+    :param plant_stdev_1x1_vrt: VRT of all planted forest removal factor standard deviation 1x1 geotifs
+    :param plant_type_1x1_vrt: VRT of all planted forest type 1x1 geotifs
+    :param plant_estab_year_1x1_vrt: VRT of all planted forest establishment year 1x1 geotifs
+    :return: SDPT attributes as 10x10 degree geotifs (if SDPT occurs inside the tile)
+    """
+
+    # Start time
+    start = datetime.datetime.now()
+
+    uu.print_log("Getting bounding coordinates for tile", tile_id)
+    xmin, ymin, xmax, ymax = uu.coords(tile_id)
+    uu.print_log("  xmin:", xmin, "; xmax:", xmax, "; ymin:", ymin, "; ymax:", ymax)
+
+    RF_10x10 = f'{tile_id}_{cn.pattern_annual_gain_AGC_BGC_planted_forest}.tif'
+    uu.print_log("Rasterizing", RF_10x10)
+    cmd = ['gdalwarp', '-tr', str(cn.Hansen_res), str(cn.Hansen_res),
+           '-co', 'COMPRESS=DEFLATE', '-tap', '-te', str(xmin), str(ymin), str(xmax), str(ymax),
+           '-dstnodata', '0', '-t_srs', 'EPSG:4326', '-overwrite', '-ot', 'Float32', plant_RF_1x1_vrt, RF_10x10]
+    uu.log_subprocess_output_full(cmd)
+
+    uu.print_log(f'Checking if {RF_10x10} contains any data...')
+    no_data = uu.check_for_data(RF_10x10)
+
+    if not no_data:
+
+        uu.print_log(f'  Data found in {RF_10x10}. Rasterizing other SDPT outputs...')
+
+        RF_stdev_10x10 = f'{tile_id}_{cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest}.tif'
+        uu.print_log("Rasterizing", RF_stdev_10x10)
+        cmd = ['gdalwarp', '-tr', str(cn.Hansen_res), str(cn.Hansen_res),
+               '-co', 'COMPRESS=DEFLATE', '-tap', '-te', str(xmin), str(ymin), str(xmax), str(ymax),
+               '-dstnodata', '0', '-t_srs', 'EPSG:4326', '-overwrite', '-ot', 'Float32', plant_stdev_1x1_vrt,
+               RF_stdev_10x10]
+        uu.log_subprocess_output_full(cmd)
+
+        type_10x10 = f'{tile_id}_{cn.pattern_planted_forest_type}.tif'
+        uu.print_log("Rasterizing", type_10x10)
+        cmd = ['gdalwarp', '-tr', str(cn.Hansen_res), str(cn.Hansen_res),
+               '-co', 'COMPRESS=DEFLATE', '-tap', '-te', str(xmin), str(ymin), str(xmax), str(ymax),
+               '-dstnodata', '0', '-t_srs', 'EPSG:4326', '-overwrite', '-ot', 'Byte', plant_type_1x1_vrt,
+               type_10x10]
+        uu.log_subprocess_output_full(cmd)
+
+        estab_year_10x10 = f'{tile_id}_{cn.pattern_planted_forest_estab_year}.tif'
+        uu.print_log("Rasterizing", estab_year_10x10)
+        # UInt16 rather than UInt8: GDAL has no UInt8 data type, and UInt16 keeps full establishment years in range
+        cmd = ['gdalwarp', '-tr', str(cn.Hansen_res), str(cn.Hansen_res),
+               '-co', 'COMPRESS=DEFLATE', '-tap', '-te', str(xmin), str(ymin), str(xmax), str(ymax),
+               '-dstnodata', '0', '-t_srs', 'EPSG:4326', '-overwrite', '-ot', 'UInt16', plant_estab_year_1x1_vrt,
+               estab_year_10x10]
+        uu.log_subprocess_output_full(cmd)
+
+    else:
+
+        uu.print_log(f'  No data found in {RF_10x10}. Deleting and not rasterizing other SDPT outputs...')
+        os.remove(RF_10x10)
+
+    # Prints information about the tile that was just processed
+    uu.end_of_fx_summary(start, tile_id, cn.pattern_annual_gain_AGC_BGC_planted_forest)
\ No newline at end of file
diff --git a/emissions/cpp_util/constants.h b/emissions/cpp_util/constants.h
index 465666d0..57e11f9e 100644
--- a/emissions/cpp_util/constants.h
+++ b/emissions/cpp_util/constants.h
@@ -45,7 +45,7 @@ namespace constants
     constexpr char tcl_drivers[] = "_tree_cover_loss_driver_processed.tif";
     constexpr char peat_mask[] = "_peat_mask_processed.tif";
     constexpr char ifl_primary[] = "_ifl_2000_primary_2001_merged.tif";
-    constexpr char plantation_type[] = "_plantation_type_oilpalm_woodfiber_other_unmasked.tif";
+    constexpr char plantation_type[] = "_plantation_type_oilpalm_woodfiber_other.tif";
 
     // Outputs
     constexpr char commod_emis[] = "_gross_emis_commodity_Mg_CO2e_ha_biomass_soil_2001_";
diff --git a/emissions/mp_calculate_gross_emissions.py b/emissions/mp_calculate_gross_emissions.py
index c61b99f3..db3caed4 100644
--- a/emissions/mp_calculate_gross_emissions.py
+++ b/emissions/mp_calculate_gross_emissions.py
@@ -73,7 +73,7 @@ def mp_calculate_gross_emissions(tile_id_list, emitted_pools):
                      cn.soil_C_emis_year_2000_dir: [cn.pattern_soil_C_emis_year_2000],
                      cn.peat_mask_dir: [cn.pattern_peat_mask],
                      cn.ifl_primary_processed_dir: [cn.pattern_ifl_primary],
-                     cn.planted_forest_type_unmasked_dir: [cn.pattern_planted_forest_type_unmasked],
+                     cn.planted_forest_type_dir: [cn.pattern_planted_forest_type],
                      cn.drivers_processed_dir: [cn.pattern_drivers],
                      cn.climate_zone_processed_dir: [cn.pattern_climate_zone],
                      cn.bor_tem_trop_processed_dir: [cn.pattern_bor_tem_trop_processed],
@@ -196,7 +196,7 @@ def mp_calculate_gross_emissions(tile_id_list, emitted_pools):
     # If it doesn't get the necessary inputs, it skips that tile.
     uu.print_log('Making blank tiles for inputs that do not currently exist')
     # All of the inputs that need to have dummy tiles made in order to match the tile list of the carbon emitted_pools
-    pattern_list = [cn.pattern_planted_forest_type_unmasked, cn.pattern_peat_mask, cn.pattern_ifl_primary,
+    pattern_list = [cn.pattern_planted_forest_type, cn.pattern_peat_mask, cn.pattern_ifl_primary,
                     cn.pattern_drivers, cn.pattern_bor_tem_trop_processed, cn.pattern_TCLF_processed,
                     cn.pattern_climate_zone, cn.pattern_soil_C_emis_year_2000]
diff --git a/readme.md b/readme.md
index 6e3d09e0..318e635b 100644
--- a/readme.md
+++ b/readme.md
@@ -23,7 +23,8 @@ All spatial data are converted to 10x10 degree raster tiles at 0.00025x0.00025 d
 Spatial data include annual tree cover loss, biomass densities in 2000, drivers of tree cover loss, ecozones, tree cover extent in 2000, elevation, etc.
 Many inputs can be processed the same way (e.g., many rasters can be processed using the same `gdal` function) but some need special treatment.
-The input processing scripts are mostly in the `data_prep` folder but a few are unfortunately in other folders.
+The input processing scripts are in the `data_prep` folder and are mostly run in `mp_prep_other_inputs_annual.py` or
+`mp_prep_other_inputs_one_off.py`.
 
 The tabular data are generally annual biomass removal (i.e. sequestration) factors (e.g., mangroves, planted forests, natural forests), which are then applied to spatial data.
 Different inputs are needed for different steps in the framework.
@@ -34,7 +35,7 @@ The framework looks for files locally before downloading them in order to reduce The framework can still be run without AWS credentials; inputs will be downloaded from s3 but outputs will not be uploaded to s3. In that case, outputs will only be stored locally. -A complete list of inputs, including changes made to the framework, can be found +A complete list of inputs, including changes made to the framework since the original publication, can be found [here](http://gfw2-data.s3.amazonaws.com/climate/carbon_model/Table_S3_data_sources__updated_20230406.pdf). ### Outputs @@ -212,8 +213,8 @@ The gross emissions script is the only part of the framework that uses C++. Thus emissions file must be compiled for emissions to run. There are a few different versions of the emissions C++ script: one for the standard version and a few other for sensitivity analyses. -`mp_calculate_gross_emissions.py` will compile the correct C++ file each time it is run, so the C++ file does not -need to be compiled manually. +`mp_calculate_gross_emissions.py` will compile the correct C++ files (for all carbon pools and for soil only) +each time it is run, so the C++ files do not need to be compiled manually. However, for completeness, the command for compiling the C++ script is (subbing in the actual file name): `c++ /usr/local/app/emissions/cpp_util/calc_gross_emissions_[VERSION].cpp -o /usr/local/app/emissions/cpp_util/calc_gross_emissions_[VERSION].exe -lgdal` @@ -393,6 +394,8 @@ Otherwise, I do not know the limitations and constraints on running this framewo ### Contact information David Gibbs: david.gibbs@wri.org +Melissa Rose: melissa.rose@wri.org + Nancy Harris: nancy.harris@wri.org Global Forest Watch, World Resources Institute, Washington, D.C. diff --git a/removals/annual_gain_rate_AGC_BGC_all_forest_types.py b/removals/annual_gain_rate_AGC_BGC_all_forest_types.py index a8210840..39614707 100644 --- a/removals/annual_gain_rate_AGC_BGC_all_forest_types.py +++ b/removals/annual_gain_rate_AGC_BGC_all_forest_types.py @@ -29,7 +29,7 @@ def annual_gain_rate_AGC_BGC_all_forest_types(tile_id, output_pattern_list): mangrove_AGB = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_annual_gain_AGB_mangrove) mangrove_BGB = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_annual_gain_BGB_mangrove) europe_AGC_BGC = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_annual_gain_AGC_BGC_natrl_forest_Europe) - plantations_AGC_BGC = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_annual_gain_AGC_BGC_planted_forest_unmasked) + plantations_AGC_BGC = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_annual_gain_AGC_BGC_planted_forest) us_AGC_BGC = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_annual_gain_AGC_BGC_natrl_forest_US) young_AGC = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_annual_gain_AGC_natrl_forest_young) age_category = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_age_cat_IPCC) @@ -39,7 +39,7 @@ def annual_gain_rate_AGC_BGC_all_forest_types(tile_id, output_pattern_list): # Removal factor standard deviations mangrove_AGB_stdev = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_stdev_annual_gain_AGB_mangrove) europe_AGC_BGC_stdev = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_Europe) - plantations_AGC_BGC_stdev = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest_unmasked) + plantations_AGC_BGC_stdev = 
uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest) us_AGC_BGC_stdev = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_US) young_AGC_stdev = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_stdev_annual_gain_AGC_natrl_forest_young) ipcc_AGB_default_stdev = uu.sensit_tile_rename(cn.SENSIT_TYPE, tile_id, cn.pattern_stdev_annual_gain_AGB_IPCC_defaults) diff --git a/removals/mp_US_removal_rates.py b/removals/mp_US_removal_rates.py index e069f9a3..2c30f918 100644 --- a/removals/mp_US_removal_rates.py +++ b/removals/mp_US_removal_rates.py @@ -62,7 +62,7 @@ def mp_US_removal_rates(tile_id_list): uu.print_log(f'There are {str(len(tile_id_list))} tiles to process', "\n") # Files to download for this script - download_dict = {cn.gain_dir: [cn.pattern_gain_data_lake], + download_dict = {cn.gain_dir: [cn.pattern_data_lake], cn.FIA_regions_processed_dir: [cn.pattern_FIA_regions_processed], cn.FIA_forest_group_processed_dir: [cn.pattern_FIA_forest_group_processed], cn.age_cat_natrl_forest_US_dir: [cn.pattern_age_cat_natrl_forest_US] diff --git a/removals/mp_annual_gain_rate_AGC_BGC_all_forest_types.py b/removals/mp_annual_gain_rate_AGC_BGC_all_forest_types.py index f4ea3b21..038aa493 100644 --- a/removals/mp_annual_gain_rate_AGC_BGC_all_forest_types.py +++ b/removals/mp_annual_gain_rate_AGC_BGC_all_forest_types.py @@ -49,7 +49,7 @@ def mp_annual_gain_rate_AGC_BGC_all_forest_types(tile_id_list): cn.annual_gain_AGB_mangrove_dir: [cn.pattern_annual_gain_AGB_mangrove], cn.annual_gain_BGB_mangrove_dir: [cn.pattern_annual_gain_BGB_mangrove], cn.annual_gain_AGC_BGC_natrl_forest_Europe_dir: [cn.pattern_annual_gain_AGC_BGC_natrl_forest_Europe], - cn.annual_gain_AGC_BGC_planted_forest_unmasked_dir: [cn.pattern_annual_gain_AGC_BGC_planted_forest_unmasked], + cn.annual_gain_AGC_BGC_planted_forest_dir: [cn.pattern_annual_gain_AGC_BGC_planted_forest], cn.annual_gain_AGC_BGC_natrl_forest_US_dir: [cn.pattern_annual_gain_AGC_BGC_natrl_forest_US], cn.annual_gain_AGC_natrl_forest_young_dir: [cn.pattern_annual_gain_AGC_natrl_forest_young], cn.age_cat_IPCC_dir: [cn.pattern_age_cat_IPCC], @@ -58,7 +58,7 @@ def mp_annual_gain_rate_AGC_BGC_all_forest_types(tile_id_list): cn.stdev_annual_gain_AGB_mangrove_dir: [cn.pattern_stdev_annual_gain_AGB_mangrove], cn.stdev_annual_gain_AGC_BGC_natrl_forest_Europe_dir: [cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_Europe], - cn.stdev_annual_gain_AGC_BGC_planted_forest_unmasked_dir: [cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest_unmasked], + cn.stdev_annual_gain_AGC_BGC_planted_forest_dir: [cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest], cn.stdev_annual_gain_AGC_BGC_natrl_forest_US_dir: [cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_US], cn.stdev_annual_gain_AGC_natrl_forest_young_dir: [cn.pattern_stdev_annual_gain_AGC_natrl_forest_young], cn.stdev_annual_gain_AGB_IPCC_defaults_dir: [cn.pattern_stdev_annual_gain_AGB_IPCC_defaults] @@ -120,7 +120,7 @@ def mp_annual_gain_rate_AGC_BGC_all_forest_types(tile_id_list): # No single-processor versions of these check-if-empty functions # Checks the gross removals outputs for tiles with no data for output_pattern in output_pattern_list: - if cn.count <= 2: # For local tests + if cn.count <= 12: # For local tests processes = 1 uu.print_log(f'Checking for empty tiles of {output_pattern} pattern with {processes} processors using light function...') with multiprocessing.Pool(processes) as pool: diff --git 
a/removals/mp_forest_age_category_IPCC.py b/removals/mp_forest_age_category_IPCC.py index 6cb2571a..57ac0ed1 100644 --- a/removals/mp_forest_age_category_IPCC.py +++ b/removals/mp_forest_age_category_IPCC.py @@ -46,7 +46,7 @@ def mp_forest_age_category_IPCC(tile_id_list): # Files to download for this script. download_dict = { cn.model_extent_dir: [cn.pattern_model_extent], - cn.gain_dir: [cn.pattern_gain_data_lake], + cn.gain_dir: [cn.pattern_data_lake], cn.ifl_primary_processed_dir: [cn.pattern_ifl_primary], cn.cont_eco_dir: [cn.pattern_cont_eco_processed] } diff --git a/removals/mp_gain_year_count_all_forest_types.py b/removals/mp_gain_year_count_all_forest_types.py index 63f7e392..145cd3d2 100644 --- a/removals/mp_gain_year_count_all_forest_types.py +++ b/removals/mp_gain_year_count_all_forest_types.py @@ -45,7 +45,7 @@ def mp_gain_year_count_all_forest_types(tile_id_list): # changed for a sensitivity analysis. This does not need to change based on what run is being done; # this assignment should be true for all sensitivity analyses and the standard model. download_dict = { - cn.gain_dir: [cn.pattern_gain_data_lake], + cn.gain_dir: [cn.pattern_data_lake], cn.model_extent_dir: [cn.pattern_model_extent] } diff --git a/removals/mp_gross_removals_all_forest_types.py b/removals/mp_gross_removals_all_forest_types.py index c281e70e..384c83a0 100644 --- a/removals/mp_gross_removals_all_forest_types.py +++ b/removals/mp_gross_removals_all_forest_types.py @@ -98,7 +98,7 @@ def mp_gross_removals_all_forest_types(tile_id_list): # Checks the gross removals outputs for tiles with no data for output_pattern in output_pattern_list: - if cn.count <= 2: # For local tests + if cn.count <= 12: # For local tests processes = 1 uu.print_log(f'Checking for empty tiles of {output_pattern} pattern with {processes} processors using light function...') with multiprocessing.Pool(processes) as pool: diff --git a/run_full_model.py b/run_full_model.py index 852142b1..c28ec74e 100644 --- a/run_full_model.py +++ b/run_full_model.py @@ -349,7 +349,7 @@ def main (): tiles_to_delete.extend(glob.glob(f'*{cn.pattern_annual_gain_AGB_mangrove}*tif')) tiles_to_delete.extend(glob.glob(f'*{cn.pattern_annual_gain_BGB_mangrove}*tif')) tiles_to_delete.extend(glob.glob(f'*{cn.pattern_annual_gain_AGC_BGC_natrl_forest_Europe}*tif')) - tiles_to_delete.extend(glob.glob(f'*{cn.pattern_annual_gain_AGC_BGC_planted_forest_unmasked}*tif')) + tiles_to_delete.extend(glob.glob(f'*{cn.pattern_annual_gain_AGC_BGC_planted_forest}*tif')) tiles_to_delete.extend(glob.glob(f'*{cn.pattern_annual_gain_AGC_BGC_natrl_forest_US}*tif')) tiles_to_delete.extend(glob.glob(f'*{cn.pattern_annual_gain_AGC_natrl_forest_young}*tif')) # tiles_to_delete.extend(glob.glob(f'*{cn.pattern_age_cat_IPCC}*tif')) @@ -361,7 +361,7 @@ def main (): # tiles_to_delete.extend(glob.glob(f'*{cn.pattern_plant_pre_2000}*tif')) tiles_to_delete.extend(glob.glob(f'*{cn.pattern_stdev_annual_gain_AGB_mangrove}*tif')) tiles_to_delete.extend(glob.glob(f'*{cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_Europe}*tif')) - tiles_to_delete.extend(glob.glob(f'*{cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest_unmasked}*tif')) + tiles_to_delete.extend(glob.glob(f'*{cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest}*tif')) tiles_to_delete.extend(glob.glob(f'*{cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_US}*tif')) tiles_to_delete.extend(glob.glob(f'*{cn.pattern_stdev_annual_gain_AGC_natrl_forest_young}*tif')) 
tiles_to_delete.extend(glob.glob(f'*{cn.pattern_stdev_annual_gain_AGB_IPCC_defaults}*tif')) diff --git a/sensitivity_analysis/legal_AMZ_loss.py b/sensitivity_analysis/legal_AMZ_loss.py index da20158c..c636daf7 100644 --- a/sensitivity_analysis/legal_AMZ_loss.py +++ b/sensitivity_analysis/legal_AMZ_loss.py @@ -17,7 +17,7 @@ def legal_Amazon_forest_age_category(tile_id, sensit_type, output_pattern): gain = f'{tile_id}_{cn.pattern_gain_ec2}.tif' extent = '{0}_{1}.tif'.format(tile_id, cn.pattern_Brazil_forest_extent_2000_processed) biomass = uu.sensit_tile_rename(sensit_type, tile_id, cn.pattern_WHRC_biomass_2000_non_mang_non_planted) - plantations = uu.sensit_tile_rename(sensit_type, tile_id, cn.pattern_planted_forest_type_unmasked) + plantations = uu.sensit_tile_rename(sensit_type, tile_id, cn.pattern_planted_forest_type) mangroves = uu.sensit_tile_rename(sensit_type, tile_id, cn.pattern_mangrove_biomass_2000) # Opens biomass tile diff --git a/sensitivity_analysis/mp_US_removal_rates.py b/sensitivity_analysis/mp_US_removal_rates.py index b2e03553..3c79d1f2 100644 --- a/sensitivity_analysis/mp_US_removal_rates.py +++ b/sensitivity_analysis/mp_US_removal_rates.py @@ -48,7 +48,7 @@ def main (): os.chdir(cn.docker_tile_dir) # Files to download for this script. - download_dict = {cn.gain_dir: [cn.pattern_gain_data_lake], + download_dict = {cn.gain_dir: [cn.pattern_data_lake], cn.annual_gain_AGB_IPCC_defaults_dir: [cn.pattern_annual_gain_AGB_IPCC_defaults], cn.BGB_AGB_ratio_dir: [cn.pattern_BGB_AGB_ratio] } diff --git a/sensitivity_analysis/mp_legal_AMZ_loss.py b/sensitivity_analysis/mp_legal_AMZ_loss.py index 0c773bc3..b07a58a1 100644 --- a/sensitivity_analysis/mp_legal_AMZ_loss.py +++ b/sensitivity_analysis/mp_legal_AMZ_loss.py @@ -182,9 +182,9 @@ def main (): # Files to download for this script. download_dict = {cn.Brazil_annual_loss_processed_dir: [cn.pattern_Brazil_annual_loss_processed], - cn.gain_dir: [cn.pattern_gain_data_lake], + cn.gain_dir: [cn.pattern_data_lake], cn.WHRC_biomass_2000_non_mang_non_planted_dir: [cn.pattern_WHRC_biomass_2000_non_mang_non_planted], - cn.planted_forest_type_unmasked_dir: [cn.pattern_planted_forest_type_unmasked], + cn.planted_forest_type_dir: [cn.pattern_planted_forest_type], cn.mangrove_biomass_2000_dir: [cn.pattern_mangrove_biomass_2000], cn.Brazil_forest_extent_2000_processed_dir: [cn.pattern_Brazil_forest_extent_2000_processed] } @@ -239,9 +239,9 @@ def main (): # Files to download for this script. 
download_dict = { cn.Brazil_annual_loss_processed_dir: [cn.pattern_Brazil_annual_loss_processed], - cn.gain_dir: [cn.pattern_gain_data_lake], + cn.gain_dir: [cn.pattern_data_lake], cn.WHRC_biomass_2000_non_mang_non_planted_dir: [cn.pattern_WHRC_biomass_2000_non_mang_non_planted], - cn.planted_forest_type_unmasked_dir: [cn.pattern_planted_forest_type_unmasked], + cn.planted_forest_type_dir: [cn.pattern_planted_forest_type], cn.mangrove_biomass_2000_dir: [cn.pattern_mangrove_biomass_2000], cn.Brazil_forest_extent_2000_processed_dir: [cn.pattern_Brazil_forest_extent_2000_processed] } @@ -563,7 +563,7 @@ def main (): cn.precip_processed_dir: [cn.pattern_precip], cn.elevation_processed_dir: [cn.pattern_elevation], cn.soil_C_full_extent_2000_dir: [cn.pattern_soil_C_full_extent_2000], - cn.gain_dir: [cn.pattern_gain_data_lake], + cn.gain_dir: [cn.pattern_data_lake], cn.cumul_gain_AGCO2_mangrove_dir: [cn.pattern_cumul_gain_AGCO2_mangrove], cn.cumul_gain_AGCO2_planted_forest_non_mangrove_dir: [cn.pattern_cumul_gain_AGCO2_planted_forest_non_mangrove], cn.cumul_gain_AGCO2_natrl_forest_dir: [cn.pattern_cumul_gain_AGCO2_natrl_forest], diff --git a/test/removals/test_annual_removals_all_forest_types_rasterio.py b/test/removals/test_annual_removals_all_forest_types_rasterio.py index 1c61721a..6203a6b4 100644 --- a/test/removals/test_annual_removals_all_forest_types_rasterio.py +++ b/test/removals/test_annual_removals_all_forest_types_rasterio.py @@ -64,7 +64,7 @@ def test_rasterio_runs(upload_log_dummy, make_tile_name_fake, sensit_tile_rename cn.annual_gain_AGB_mangrove_dir: cn.pattern_annual_gain_AGB_mangrove, cn.annual_gain_BGB_mangrove_dir: cn.pattern_annual_gain_BGB_mangrove, cn.annual_gain_AGC_BGC_natrl_forest_Europe_dir: cn.pattern_annual_gain_AGC_BGC_natrl_forest_Europe, - cn.annual_gain_AGC_BGC_planted_forest_unmasked_dir: cn.pattern_annual_gain_AGC_BGC_planted_forest_unmasked, + cn.annual_gain_AGC_BGC_planted_forest_dir: cn.pattern_annual_gain_AGC_BGC_planted_forest, cn.annual_gain_AGC_BGC_natrl_forest_US_dir: cn.pattern_annual_gain_AGC_BGC_natrl_forest_US, cn.annual_gain_AGC_natrl_forest_young_dir: cn.pattern_annual_gain_AGC_natrl_forest_young, cn.age_cat_IPCC_dir: cn.pattern_age_cat_IPCC, @@ -73,7 +73,7 @@ def test_rasterio_runs(upload_log_dummy, make_tile_name_fake, sensit_tile_rename cn.stdev_annual_gain_AGB_mangrove_dir: cn.pattern_stdev_annual_gain_AGB_mangrove, cn.stdev_annual_gain_AGC_BGC_natrl_forest_Europe_dir: cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_Europe, - cn.stdev_annual_gain_AGC_BGC_planted_forest_unmasked_dir: cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest_unmasked, + cn.stdev_annual_gain_AGC_BGC_planted_forest_dir: cn.pattern_stdev_annual_gain_AGC_BGC_planted_forest, cn.stdev_annual_gain_AGC_BGC_natrl_forest_US_dir: cn.pattern_stdev_annual_gain_AGC_BGC_natrl_forest_US, cn.stdev_annual_gain_AGC_natrl_forest_young_dir: cn.pattern_stdev_annual_gain_AGC_natrl_forest_young, cn.stdev_annual_gain_AGB_IPCC_defaults_dir: cn.pattern_stdev_annual_gain_AGB_IPCC_defaults diff --git a/universal_util.py b/universal_util.py index d58c545e..eb2f4dd2 100644 --- a/universal_util.py +++ b/universal_util.py @@ -509,8 +509,9 @@ def count_tiles_s3(source, pattern=None): file_list = [] - if source == cn.gain_dir: - print_log("Not counting gain tiles... No good mechanism for it, sadly.") + if 'gfw-data-lake' in source: + #TODO: Change this function to count tiles in gfw-data-lake + print_log("Not counting gfw-data-lake tiles... 
No good mechanism for it, sadly.") return # Iterates through the text file to get the names of the tiles and appends them to list @@ -578,7 +579,7 @@ def s3_flexible_download(source_dir, pattern, dest, sensit_type, tile_id_list): for tile_id in tile_id_list: if pattern in [cn.pattern_tcd, cn.pattern_pixel_area, cn.pattern_loss]: # For tiles that do not have the tile_id first source = f'{source_dir}{pattern}_{tile_id}.tif' - elif pattern in [cn.pattern_gain_data_lake]: + elif pattern in [cn.pattern_data_lake]: source = f'{source_dir}{tile_id}.tif' else: # For every other type of tile source = f'{source_dir}{tile_id}_{pattern}.tif' @@ -600,15 +601,30 @@ def s3_folder_download(source, dest, sensit_type, pattern = None): # Special cases are below. local_tile_count = len(glob.glob(f'*{pattern}*.tif')) - # For gain tiles, which have a different pattern on the ec2 instance from s3 - if source == cn.gain_dir: - local_tile_count = len(glob.glob(f'*{cn.pattern_gain_ec2}*.tif')) + # For data-lake tiles, which have a different pattern on the ec2 instance from s3 + if pattern == cn.pattern_data_lake: + if source == cn.gain_dir: + ec2_pattern = cn.pattern_gain_ec2 + elif source == cn.datalake_pf_agc_rf_dir: + ec2_pattern = cn.pattern_pf_rf_agc_ec2 + elif source == cn.datalake_pf_agcbgc_rf_dir: + ec2_pattern = cn.pattern_pf_rf_agcbgc_ec2 + elif source == cn.datalake_pf_agc_sd_dir: + ec2_pattern = cn.pattern_pf_sd_agc_ec2 + elif source == cn.datalake_pf_agcbgc_sd_dir: + ec2_pattern = cn.pattern_pf_sd_agcbgc_ec2 + elif source == cn.datalake_pf_simplename_dir: + ec2_pattern = cn.pattern_planted_forest_type + elif source == cn.datalake_pf_estab_year_dir: + ec2_pattern = cn.pattern_planted_forest_estab_year + + local_tile_count = len(glob.glob(f'*{ec2_pattern}*.tif')) + print_log(f'There are {local_tile_count} tiles on the spot machine with the pattern {ec2_pattern}') # For tile types that have the tile_id after the pattern if pattern in [cn.pattern_tcd, cn.pattern_pixel_area, cn.pattern_loss]: local_tile_count = len(glob.glob(f'{pattern}*.tif')) - - print_log(f'There are {local_tile_count} tiles on the spot machine with the pattern {pattern}') + print_log(f'There are {local_tile_count} tiles on the spot machine with the pattern {pattern}') # Changes the path to download from based on the sensitivity analysis being run and whether that particular input # has a sensitivity analysis path on s3 @@ -676,21 +692,25 @@ def s3_folder_download(source, dest, sensit_type, pattern = None): else: # Counts how many tiles are in the source s3 folder - s3_count = count_tiles_s3(source, pattern=pattern) - print_log(f'There are {s3_count} tiles at {source} with the pattern {pattern}') + if pattern == cn.pattern_data_lake: + s3_count = count_tiles_s3(source, pattern=ec2_pattern) + #print_log(f'There are {s3_count} tiles at {source} with the pattern {ec2_pattern}') + else: + s3_count = count_tiles_s3(source, pattern=pattern) + print_log(f'There are {s3_count} tiles at {source} with the pattern {pattern}') # If there are as many tiles on the spot machine with the relevant pattern as there are on s3, no tiles are downloaded if local_tile_count == s3_count: print_log(f'Tiles with pattern {pattern} are already on spot machine. Not downloading.', "\n") return - print_log(f'Tiles with pattern {pattern} are not on spot machine. Downloading...') - # Downloads tile sets from the gfw-data-lake. 
# They need a special process because they don't have a tile pattern on the data-lake, # so I have to download them into their own folder and then give them a pattern while moving them to the main folder if 'gfw-data-lake' in source: + print_log(f'Downloading tiles with pattern {ec2_pattern}...') + # Deletes special folder for downloads from data-lake (if it already exists) if os.path.exists(os.path.join(dest, 'data-lake-downloads')): os.rmdir(os.path.join(dest, 'data-lake-downloads')) @@ -700,7 +720,7 @@ def s3_folder_download(source, dest, sensit_type, pattern = None): cmd = ['aws', 's3', 'cp', source, os.path.join(dest, 'data-lake-downloads'), '--request-payer', 'requester', '--exclude', '*xml', - '--exclude', '*geojason', '--exclude', '*vrt', '--exclude', '*csv', '--no-progress', '--recursive'] + '--exclude', '*geojson', '--exclude', '*vrt', '--exclude', '*csv', '--no-progress', '--recursive'] log_subprocess_output_full(cmd) # Copies pattern-less tiles from their special folder to main tile folder and renames them with @@ -708,19 +728,20 @@ def s3_folder_download(source, dest, sensit_type, pattern = None): print_log("Copying tiles to main tile folder...") for filename in os.listdir(os.path.join(dest, 'data-lake-downloads')): move(os.path.join(dest, f'data-lake-downloads/{filename}'), - os.path.join(cn.docker_tile_dir, f'{filename[:-4]}_{cn.pattern_gain_ec2}.tif')) + os.path.join(cn.docker_tile_dir, f'{filename[:-4]}_{ec2_pattern}.tif')) # Deletes special folder for downloads from data-lake os.rmdir(os.path.join(dest, 'data-lake-downloads')) - print_log("Tree cover gain tiles copied to main tile folder...") + print_log(f'data-lake tiles with pattern {ec2_pattern} copied to main tile folder...') # Downloads non-data-lake inputs else: + print_log(f'Tiles with pattern {pattern} are not on spot machine. Downloading...') cmd = ['aws', 's3', 'cp', source, dest, '--no-sign-request', '--exclude', '*tiled/*', - '--exclude', '*geojason', '--exclude', '*vrt', '--exclude', '*csv', '--no-progress', '--recursive'] + '--exclude', '*geojson', '--exclude', '*vrt', '--exclude', '*csv', '--no-progress', '--recursive'] # cmd = ['aws', 's3', 'cp', source, dest, '--no-sign-request', '--exclude', '*tiled/*', - # '--exclude', '*geojason', '--exclude', '*vrt', '--exclude', '*csv', '--recursive'] + # '--exclude', '*geojson', '--exclude', '*vrt', '--exclude', '*csv', '--recursive'] log_subprocess_output_full(cmd) @@ -802,34 +823,32 @@ def s3_file_download(source, dest, sensit_type): # If not a sensitivity run or a tile type without sensitivity analysis variants, the standard file is downloaded - # Special download procedures for tree cover gain because the tiles have no pattern, just an ID. - # Tree cover gain tiles are renamed as their downloaded to get a pattern added to them. + # Special download procedures for gfw-data-lake datasets (tree cover gain, planted forests) because the tiles have no pattern, just an ID. + # These tiles are renamed as they are downloaded to get a pattern added to them. 
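+    # The gfw-data-lake directory-to-ec2-pattern mapping below mirrors the ec2_pattern mapping added to s3_folder_download().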
else: - if dir == cn.gain_dir[:-1]: # Delete last character of gain_dir because it has the terminal / while dir does not have terminal / - ec2_file_name = f'{tile_id}_{cn.pattern_gain_ec2}.tif' - print_log(f'Option 1: Checking if {ec2_file_name} is already on spot machine...') - if os.path.exists(os.path.join(dest, ec2_file_name)): - print_log(f' Option 1 success: {os.path.join(dest, ec2_file_name)} already downloaded', "\n") - return + if 'gfw-data-lake' in source: + if dir == cn.gain_dir[:-1]: # Delete last character of gain_dir because it has the terminal / while dir does not have terminal / + ec2_file_name = f'{tile_id}_{cn.pattern_gain_ec2}.tif' + elif dir == cn.datalake_pf_agc_rf_dir[:-1]: + ec2_file_name = f'{tile_id}_{cn.pattern_pf_rf_agc_ec2}.tif' + elif dir == cn.datalake_pf_agcbgc_rf_dir[:-1]: + ec2_file_name = f'{tile_id}_{cn.pattern_pf_rf_agcbgc_ec2}.tif' + elif dir == cn.datalake_pf_agc_sd_dir[:-1]: + ec2_file_name = f'{tile_id}_{cn.pattern_pf_sd_agc_ec2}.tif' + elif dir == cn.datalake_pf_agcbgc_sd_dir[:-1]: + ec2_file_name = f'{tile_id}_{cn.pattern_pf_sd_agcbgc_ec2}.tif' + elif dir == cn.datalake_pf_simplename_dir[:-1]: + ec2_file_name = f'{tile_id}_{cn.pattern_planted_forest_type}.tif' + elif dir == cn.datalake_pf_estab_year_dir[:-1]: + ec2_file_name = f'{tile_id}_{cn.pattern_planted_forest_estab_year}.tif' else: - print_log(f' Option 1 failure: {ec2_file_name} is not already on spot machine.') - print_log(f'Option 2: Checking for tile {source} on s3...') - - # If the tile isn't already downloaded, download is attempted - source = os.path.join(dir, file_name) + print_log(f' Warning: {source} is located in the gfw-data-lake bucket but has not been assigned a file name pattern for download. Please update the constants_and_names.py file and the s3_file_download function in the universal_util.py file to include this dataset for download.') + return + gfw_data_lake_download(source, dest, dir, file_name, ec2_file_name) + return - # cmd = ['aws', 's3', 'cp', source, dest, '--no-sign-request', '--only-show-errors'] - cmd = ['aws', 's3', 'cp', source, f'{dest}{ec2_file_name}', - '--request-payer', 'requester', '--only-show-errors'] - log_subprocess_output_full(cmd) - if os.path.exists(os.path.join(dest, ec2_file_name)): - print_log(f' Option 2 success: Tile {source} found on s3 and downloaded', "\n") - return - else: - print_log( - f' Option 2 failure: Tile {source} not found on s3. Tile not found but it seems it should be. Check file paths and names.', "\n") - # All other tiles besides tree cover gain + # All other tiles besides gfw-data-lake datasets else: print_log(f'Option 1: Checking if {file_name} is already on spot machine...') if os.path.exists(os.path.join(dest, file_name)): @@ -852,6 +871,34 @@ def s3_file_download(source, dest, sensit_type): else: print_log(f' Option 2 failure: Tile {source} not found on s3. Tile not found but it seems it should be. 
Check file paths and names.', "\n") +def gfw_data_lake_download(source, dest, dir, file_name, ec2_file_name): + + print_log(f'Option 1: Checking if {ec2_file_name} is already on spot machine...') + + if os.path.exists(os.path.join(dest, ec2_file_name)): + print_log(f' Option 1 success: {os.path.join(dest, ec2_file_name)} already downloaded', "\n") + return + else: + print_log(f' Option 1 failure: {ec2_file_name} is not already on spot machine.') + print_log(f'Option 2: Checking for tile {source} on s3...') + + # If the tile isn't already downloaded, download is attempted + source = os.path.join(dir, file_name) + + # cmd = ['aws', 's3', 'cp', source, dest, '--no-sign-request', '--only-show-errors'] + cmd = ['aws', 's3', 'cp', source, f'{dest}{ec2_file_name}', + '--request-payer', 'requester', '--only-show-errors'] + + log_subprocess_output_full(cmd) + + if os.path.exists(os.path.join(dest, ec2_file_name)): + print_log(f' Option 2 success: Tile {source} found on s3 and downloaded', "\n") + return + else: + print_log( + f' Option 2 failure: Tile {source} not found on s3. Tile not found but it seems it should be. Check file paths and names.', + "\n") + # Uploads all tiles of a pattern to specified location def upload_final_set(upload_dir, pattern): @@ -1008,10 +1055,15 @@ def mp_warp_to_Hansen(tile_id, source_raster, out_pattern, dt): print_log("Getting extent of", tile_id) xmin, ymin, xmax, ymax = coords(tile_id) + #Uses rewindowed tiles from docker container with same tile_id and same out_pattern if no source raster is given + if source_raster == None: + source_raster = f'{tile_id}_{out_pattern}_rewindow.tif' + out_tile = f'{tile_id}_{out_pattern}.tif' cmd = ['gdalwarp', '-t_srs', 'EPSG:4326', '-co', 'COMPRESS=DEFLATE', '-tr', str(cn.Hansen_res), str(cn.Hansen_res), '-tap', '-te', str(xmin), str(ymin), str(xmax), str(ymax), '-dstnodata', '0', '-ot', dt, '-overwrite', source_raster, out_tile] + process = Popen(cmd, stdout=PIPE, stderr=STDOUT) with process.stdout: log_subprocess_output(process.stdout) @@ -1133,7 +1185,7 @@ def name_aggregated_output(pattern): # print(out_pattern) out_pattern = re.sub(f'2001_{cn.loss_years}', '', out_pattern) # print(out_pattern) - out_pattern = re.sub('_Mg_', '_Mt_per_year', out_pattern) + out_pattern = re.sub('_Mg_', '_Mt_per_year_', out_pattern) # print(out_pattern) out_pattern = re.sub('all_drivers_Mt_CO2e', 'all_drivers_Mt_CO2e_per_year', out_pattern) # print(out_pattern) @@ -1332,7 +1384,7 @@ def add_emissions_metadata(tile_id, output_pattern): # Converts 10x10 degree Hansen tiles that are in windows of 40000x1 pixels to windows of 160x160 pixels, # which is the resolution of the output tiles. This allows the 30x30 m pixels in each window to be summed # into 0.04x0.04 degree rasters. -def rewindow(tile_id, download_pattern_name): +def rewindow(tile_id, download_pattern_name, striped = False): # start time start = datetime.datetime.now() @@ -1348,17 +1400,29 @@ def rewindow(tile_id, download_pattern_name): check_memory() + # Only rewindows if the rewindowed tile does not already exist in the docker container + #if os.path.exists(out_tile): + # print_log(f'{out_tile} exists. No need to rewindow') + # return + # Only rewindows if the tile exists if os.path.exists(in_tile): - print_log(f'{in_tile} exists. Rewindowing to {out_tile} with {cn.agg_pixel_window}x{cn.agg_pixel_window} pixel windows...') # Just using gdalwarp inflated the output rasters about 10x, even with COMPRESS=LZW. 
         # Solution was to use gdal_translate instead, although, for unclear reasons, this still inflates the size
         # of the pixel area tiles but not other tiles using LZW. DEFLATE makes all outputs smaller.
-        cmd = ['gdal_translate', '-co', 'COMPRESS=DEFLATE',
-               '-co', 'TILED=YES', '-co', 'BLOCKXSIZE={}'.format(cn.agg_pixel_window), '-co', 'BLOCKYSIZE={}'.format(cn.agg_pixel_window),
-               in_tile, out_tile]
-        log_subprocess_output_full(cmd)
+        if not striped:
+            print_log(f'{in_tile} exists. Rewindowing to {out_tile} with {cn.agg_pixel_window}x{cn.agg_pixel_window} pixel windows...')
+            cmd = ['gdal_translate', '-co', 'COMPRESS=DEFLATE', '-co', 'TILED=YES',
+                   '-co', 'BLOCKXSIZE={}'.format(cn.agg_pixel_window), '-co', 'BLOCKYSIZE={}'.format(cn.agg_pixel_window),
+                   in_tile, out_tile]
+            log_subprocess_output_full(cmd)
+
+        else:
+            print_log(f'{in_tile} exists. Rewindowing to {out_tile} with 40000x1 (striped) pixel windows...')
+            cmd = ['gdal_translate', '-co', 'COMPRESS=DEFLATE', '-co', 'TILED=NO',
+                   in_tile, out_tile]
+            log_subprocess_output_full(cmd)
 
     else:
 
@@ -1368,4 +1432,3 @@ def rewindow(tile_id, download_pattern_name):
     end_of_fx_summary(start, tile_id, "{}_rewindow".format(download_pattern_name))
 
-
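For reference, a minimal usage sketch (not part of the diff) of the updated rewindow() signature introduced above; the tile IDs below are placeholders, and the cn/uu import aliases follow the conventions used elsewhere in the framework.

# Hypothetical sketch: exercising rewindow()'s new striped option.
from functools import partial
from multiprocessing.pool import Pool

import constants_and_names as cn
import universal_util as uu

tile_id_list = ['00N_110E', '00N_120E']  # placeholder tile IDs

# Default behavior: rewindow to tiled blocks of cn.agg_pixel_window x cn.agg_pixel_window pixels
with Pool(2) as pool:
    pool.map(partial(uu.rewindow, download_pattern_name=cn.pattern_pixel_area), tile_id_list)

# New option: rewindow to striped 40000x1 windows instead
uu.rewindow('00N_110E', cn.pattern_pixel_area, striped=True)

As in the other multiprocessing steps, partial() binds the pattern argument so pool.map only passes tile IDs.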