Merge branch 'mdc/julia_brt' into conference_staging
Showing 12 changed files with 1,851 additions and 2 deletions.
fitBRT.jl (new file, 171 additions)
using JSON
using CSV
using DataFrames
using EvoTrees
using StatsBase
using SpeciesDistributionToolkit

include("./shared.jl")

function get_features_and_labels(presences, absences, climate_layers)
    presences = mask(presences, climate_layers[begin])
    absences = mask(absences, climate_layers[begin])
    coord_presence = keys(replace(presences, false => nothing))
    coord_absence = keys(replace(absences, false => nothing))
    coord = vcat(coord_presence, coord_absence)

    X = hcat([layer[coord] for layer in climate_layers]...)
    y = vcat(fill(1.0, length(coord_presence)), fill(0.0, length(coord_absence)))
    return X, y, coord
end

function layers_to_matrix!(climate_layers, mat, land_idx)
    for (i, idx) in enumerate(land_idx)
        for l in eachindex(climate_layers)
            mat[l, i] = climate_layers[l].grid[idx]
        end
    end
end

function compute_fit_stats_and_cutoff(distribution, coords, y)
    cutoff = LinRange(extrema(distribution)..., 500)
    coords = convert(Vector{typeof(coords[begin])}, coords)
    idx = findall(!isnothing, coords)
    I = [SimpleSDMLayers._point_to_cartesian(distribution, c) for c in coords][idx]

    obs = y .> 0

    tp = zeros(Float64, length(cutoff))
    fp = zeros(Float64, length(cutoff))
    tn = zeros(Float64, length(cutoff))
    fn = zeros(Float64, length(cutoff))

    for (i, c) in enumerate(cutoff)
        prd = [distribution.grid[k] >= c for k in I]
        tp[i] = sum(prd .& obs)
        tn[i] = sum(.!(prd) .& (.!obs))
        fp[i] = sum(prd .& (.!obs))
        fn[i] = sum(.!(prd) .& obs)
    end

    tpr = tp ./ (tp .+ fn)
    fpr = fp ./ (fp .+ tn)
    J = (tp ./ (tp .+ fn)) + (tn ./ (tn .+ fp)) .- 1.0

    # ROC AUC via trapezoidal integration of the (FPR, TPR) curve
    roc_dx = [reverse(fpr)[i] - reverse(fpr)[i - 1] for i in 2:length(fpr)]
    roc_dy = [reverse(tpr)[i] + reverse(tpr)[i - 1] for i in 2:length(tpr)]
    ROCAUC = sum(roc_dx .* (roc_dy ./ 2.0))

    # The threshold τ is the cutoff that maximizes Youden's J statistic
    thr_index = last(findmax(J))
    τ = cutoff[thr_index]

    return Dict(:rocauc => ROCAUC, :threshold => τ, :J => J[thr_index])
end

function test_train_split(X, y, proportion=0.7)
    train_size = floor(Int, proportion * length(y))
    Itrain = StatsBase.sample(1:length(y), train_size; replace=false)
    Itest = setdiff(1:length(y), Itrain)
    Xtrain, Xtest = X[Itrain, :], X[Itest, :]
    Ytrain, Ytest = y[Itrain], y[Itest]
    return Xtrain, Ytrain, Xtest, Ytest
end

function predict_single_sdm(model, layers)
    land_idx = findall(!isnothing, layers[begin].grid)

    mat = zeros(Float32, length(layers), length(land_idx))

    # Handle nothings here
    layers_to_matrix!(layers, mat, land_idx)

    pred = EvoTrees.predict(model, mat')

    distribution = SimpleSDMPredictor(
        zeros(Float32, size(layers[begin]));
        SpeciesDistributionToolkit.boundingbox(layers[begin])...
    )
    distribution.grid[land_idx] = pred[:, 1]

    uncertainty = SimpleSDMPredictor(
        zeros(Float32, size(layers[begin]));
        SpeciesDistributionToolkit.boundingbox(layers[begin])...
    )
    uncertainty.grid[land_idx] = pred[:, 2]

    return rescale(distribution, (0, 1)), rescale(uncertainty, (0, 1))
end

function main()
    runtime_dir = ARGS[1]
    inputs = read_inputs_dict(runtime_dir)
    predictor_paths = inputs["predictors"]
    occurrence_path = inputs["occurrence"]
    pseudoabs_path = inputs["background"]

    predictors = SimpleSDMPredictor.(predictor_paths)

    occurrence = CSV.read(occurrence_path, DataFrame, delim="\t")
    occurrence_layer = create_occurrence_layer(similar(predictors[1]), occurrence)

    pseudoabsences = CSV.read(pseudoabs_path, DataFrame, delim="\t")
    pseudoabs_layer = create_occurrence_layer(similar(predictors[1]), pseudoabsences)

    X, y, p_and_a_coords = get_features_and_labels(occurrence_layer, pseudoabs_layer, predictors)

    Xtrain, Ytrain, Xtest, Ytest = test_train_split(X, y)

    brt = EvoTreeGaussian(;
        loss = :gaussian,
        metric = :gaussian,
        nrounds = 100,
        nbins = 100,
        λ = 0.0,
        γ = 0.0,
        η = 0.1,
        max_depth = 7,
        min_weight = 1.0,
        rowsample = 0.5,
        colsample = 1.0,
    )

    model = fit_evotree(
        brt;
        x_train=Xtrain,
        y_train=Ytrain,
        x_eval=Xtest,
        y_eval=Ytest
    )

    prediction, uncertainty = predict_single_sdm(model, predictors)

    fit_dict = compute_fit_stats_and_cutoff(prediction, p_and_a_coords, y)
    τ = fit_dict[:threshold]

    # Set below threshold to 0
    prediction.grid[findall(x -> x < τ, prediction.grid)] .= 0

    sdm_path = joinpath(runtime_dir, "sdm.tif")
    SpeciesDistributionToolkit.save(sdm_path, prediction)
    uncertainty_path = joinpath(runtime_dir, "uncertainty.tif")
    SpeciesDistributionToolkit.save(uncertainty_path, uncertainty)

    fit_stats_path = joinpath(runtime_dir, "fit_stats.json")
    open(fit_stats_path, "w") do f
        write(f, JSON.json(fit_dict))
    end

    output_json_path = joinpath(runtime_dir, "output.json")
    open(output_json_path, "w") do f
        write(f, JSON.json(Dict(
            :sdm => sdm_path,
            :uncertainty => uncertainty_path,
            :fit_stats => fit_stats_path
        )))
    end
end

main()
BRT pipeline description (YAML, new file, 35 additions)
script: fitBRT.jl
name: BRT
description: "This script creates an SDM and an uncertainty map using Boosted Regression Trees (BRTs), built with the SpeciesDistributionToolkit.jl and EvoTrees.jl packages."
author:
  - name: Michael D. Catchen
    identifier: https://orcid.org/0000-0002-6506-6487
inputs:
  occurrence:
    label: occurrence coordinate dataframe
    description: Dataframe, presence data.
    type: text/tab-separated-values
    example: "/output/data/getObservations/9f7d1cc148464cd0517e01c67af0ab5b/obs_data.tsv"
  background:
    label: background
    description: Dataframe, background (pseudoabsence) data.
    type: text/tab-separated-values
    example: "/output/SDM/julia_sdms/generateBackground/f69fe7abd1711e193bb4f8aef51c74cc/background.csv"
  predictors:
    label: predictors
    description: list of predictor raster layers (GeoTIFF paths)
    type: image/tiff;application=geotiff[]
    example: ["/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio1_242b2b01561981-01-01.tif","/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio2_243a4e7f541981-01-01.tif"]
outputs:
  sdm:
    label: sdm
    description: map of predicted occurrence probability
    type: image/tiff;application=geotiff
  sdm_uncertainty:
    label: sdm uncertainty
    description: map of relative uncertainty
    type: image/tiff;application=geotiff
  fit_stats:
    label: fit_stats
    description: JSON of model fit stats and threshold
    type: text/json
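For orientation, here is a minimal sketch of how the inputs for this step could be assembled, assuming read_inputs_dict (defined in shared.jl, which is not shown in this diff) reads an input.json file from the runtime directory; the input.json filename and the predictor paths are illustrative assumptions, while the key names match the inputs block above.

using JSON

# Hypothetical driver; the input.json filename read by read_inputs_dict is an assumption.
runtime_dir = mktempdir()
inputs = Dict(
    "occurrence" => "obs_data.tsv",                        # TSV of presence coordinates
    "background" => "background.tsv",                      # TSV of pseudoabsence coordinates
    "predictors" => ["predictor1.tif", "predictor2.tif"],  # GeoTIFF predictor layers
)
open(joinpath(runtime_dir, "input.json"), "w") do f
    write(f, JSON.json(inputs))
end
# The step is then run with the runtime directory as its only argument:
# julia fitBRT.jl <runtime_dir>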
generateBackground.jl (new file, 35 additions)
using JSON
using CSV
using DataFrames
using SpeciesDistributionToolkit

include("./shared.jl")

function main()
    runtime_dir = ARGS[1]
    inputs = read_inputs_dict(runtime_dir)

    predictor_paths = inputs["predictors"]
    occurrence_path = inputs["presence"]
    buffer_distance = inputs["buffer_distance"] / 1000 # div by 1000 to convert to km

    predictors = SimpleSDMPredictor.(predictor_paths)

    occurrence = CSV.read(occurrence_path, DataFrame, delim="\t")
    occurrence_layer = create_occurrence_layer(similar(predictors[1]), occurrence)

    buffer = pseudoabsencemask(WithinRadius, occurrence_layer; distance = buffer_distance)
    absences = SpeciesDistributionToolkit.sample(.!buffer, floor(Int, 0.5sum(occurrence_layer)))

    abs_coords = findall(absences)
    pseudoabs_df = DataFrame(lon=[c[1] for c in abs_coords], lat=[c[2] for c in abs_coords])
    CSV.write("$runtime_dir/background.tsv", pseudoabs_df, delim="\t")

    output_json_path = joinpath(runtime_dir, "output.json")
    open(output_json_path, "w") do f
        write(f, JSON.json(Dict(:background=>"$runtime_dir/background.tsv")))
    end
end

main()
Generate Background pipeline description (YAML, new file, 28 additions)
script: generateBackground.jl
name: Generate Background
description: "This script creates a set of pseudoabsences/background points."
author:
  - name: Michael D. Catchen
    identifier: https://orcid.org/0000-0002-6506-6487
inputs:
  presence:
    label: presence
    description: Dataframe, presence data.
    type: text/tab-separated-values
    example: "/output/data/getObservations/9f7d1cc148464cd0517e01c67af0ab5b/obs_data.tsv"
  predictors:
    label: predictors
    description: list of predictor raster layers (GeoTIFF paths)
    type: image/tiff;application=geotiff[]
    example: ["/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio1_242b2b01561981-01-01.tif","/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio2_243a4e7f541981-01-01.tif"]
  buffer_distance:
    label: buffer_distance
    description: the minimum distance, in meters, between any presence and any pseudoabsence
    type: int
    example: 50000
outputs:
  background:
    label: background
    description: TSV file containing the coordinates of the background points.
    type: text/tab-separated-values
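As a quick check that the output matches what fitBRT.jl expects for its background input, one could read the generated file back and inspect its columns; a minimal sketch, assuming the background.tsv written by the script above.

using CSV
using DataFrames

runtime_dir = "path/to/runtime_dir"  # the directory passed to generateBackground.jl
background = CSV.read(joinpath(runtime_dir, "background.tsv"), DataFrame; delim="\t")
@assert names(background) == ["lon", "lat"]  # columns written by generateBackground.jl
@info "Generated $(nrow(background)) pseudoabsence points"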
loadCHELSA.jl (new file, 92 additions)
using SpeciesDistributionToolkit
using JSON
using CSV
using MultivariateStats
using StatsBase
using DataFrames

include("./shared.jl")

const PROVIDER = RasterData(CHELSA2, BioClim)

function convert_layers_to_features_matrix(layers)
    I = findall(!isnothing, layers[1].grid)
    data_matrix = zeros(Float32, length(layers), length(I))
    for (i, l) in enumerate(layers)
        x = Float32.(vec(l.grid[I]))
        z = StatsBase.fit(ZScoreTransform, x)
        data_matrix[i, :] .= StatsBase.transform(z, x)
    end
    data_matrix
end

function fill_layer!(empty_layer, v)
    m = reshape(v, size(empty_layer))
    for j in eachindex(empty_layer.grid)
        empty_layer.grid[j] = m[j]
    end
end

function pca_data_matrix(data_matrix)
    pca = MultivariateStats.fit(PCA, data_matrix)
    MultivariateStats.transform(pca, data_matrix)
end

function make_pca_layers(layers)
    pca_mat = pca_data_matrix(convert_layers_to_features_matrix(layers))
    pca_layers = [convert(Float32, similar(layers[begin])) for _ in 1:size(pca_mat, 1)]
    for (i, pca_layer) in enumerate(pca_layers)
        fill_layer!(pca_layer, pca_mat[i, :])
    end
    pca_layers
end

function write_outputs(runtime_dir, layers)
    predictor_paths = []

    for (i, l) in enumerate(layers)
        outpath = joinpath(runtime_dir, "predictor$i.tif")
        push!(predictor_paths, outpath)
        SpeciesDistributionToolkit.save(outpath, l)
    end

    output_json_path = joinpath(runtime_dir, "output.json")
    open(output_json_path, "w") do f
        write(f, JSON.json(Dict(:predictors=>predictor_paths)))
    end
end

function main()
    runtime_dir = ARGS[1]
    inputs = read_inputs_dict(runtime_dir)

    bbox = inputs["bbox"]
    pca_input = inputs["pca"]
    layer_nums = inputs["layer_numbers"]
    layer_names = ["BIO$i" for i in layer_nums]

    bbox = (left=bbox[1], bottom=bbox[2], right=bbox[3], top=bbox[4])
    @info bbox

    layers = []
    for l in layer_names
        success = false
        ct = 1
        while !success
            try
                a = convert(Float32, SimpleSDMPredictor(PROVIDER; layer=l, bbox...))
                success = true
                push!(layers, a)
            catch
                @info "Errored on $l on attempt $ct. Almost certainly a network error on CHELSA's side. Trying again..."
                ct += 1
            end
        end
    end

    layers = pca_input ? make_pca_layers(layers) : layers

    write_outputs(runtime_dir, layers)
end

main()
Load CHELSA pipeline description (YAML, new file, 27 additions)
script: loadCHELSA.jl
name: Load CHELSA
description: "This script downloads CHELSA2 BioClim layers for a given bounding box and optionally collapses them into principal components (PCA) to use as SDM predictors."
author:
  - name: Michael D. Catchen
    identifier: https://orcid.org/0000-0002-6506-6487
inputs:
  bbox:
    label: bbox
    description: Vector of floats giving the bounding-box coordinates of the extent, in the order xmin, ymin, xmax, ymax
    type: float[]
    example: [-86.75, 33.72,-52.21,63.27]
  pca:
    label: pca layers
    description: Boolean, whether to apply a PCA to the predictor layers
    type: boolean
    example: true
  layer_numbers:
    label: CHELSA layer numbers
    description: the CHELSA BioClim layer numbers to use
    type: int[]
    example: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]
outputs:
  predictors:
    label: predictors
    description: raster predictor layers
    type: image/tiff;application=geotiff[]
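Combining the three inputs above, a hypothetical input dictionary for this step could be written as follows (again assuming read_inputs_dict reads an input.json file from the runtime directory, which is an assumption since shared.jl is not part of this diff); the values are taken from the examples in the spec.

using JSON

runtime_dir = mktempdir()
chelsa_inputs = Dict(
    "bbox" => [-86.75, 33.72, -52.21, 63.27],  # xmin, ymin, xmax, ymax
    "pca" => true,                             # collapse the layers with a PCA
    "layer_numbers" => collect(1:19),          # all 19 BioClim layers
)
open(joinpath(runtime_dir, "input.json"), "w") do f
    write(f, JSON.json(chelsa_inputs))
end
# julia loadCHELSA.jl <runtime_dir>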