Skip to content

Commit

Permalink
Merge branch 'mdc/julia_brt' into conference_staging
Browse files Browse the repository at this point in the history
  • Loading branch information
glaroc committed Oct 5, 2023
2 parents ba027be + 5353057 commit e31bac4
Show file tree
Hide file tree
Showing 12 changed files with 1,851 additions and 2 deletions.
624 changes: 624 additions & 0 deletions pipelines/SDM/BRT_julia_sdm.json

Large diffs are not rendered by default.

720 changes: 720 additions & 0 deletions pipelines/SDM/BRT_julia_viewer.json

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions scripts/SDM/julia_sdms/fitBRT.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
using JSON
using CSV
using DataFrames
using EvoTrees
using StatsBase
using SpeciesDistributionToolkit

include("./shared.jl")

function get_features_and_labels(presences, absences, climate_layers)
presences = mask(presences, climate_layers[begin])
absences = mask(absences, climate_layers[begin])
coord_presence = keys(replace(presences, false => nothing))
coord_absence = keys(replace(absences, false => nothing))
coord = vcat(coord_presence, coord_absence)

X = hcat([layer[coord] for layer in climate_layers]...)
y = vcat(fill(1.0, length(coord_presence)), fill(0.0, length(coord_absence)))
return X, y, coord
end

function layers_to_matrix!(climate_layers, mat, land_idx)
for (i, idx) in enumerate(land_idx)
for l in eachindex(climate_layers)
mat[l, i] = climate_layers[l].grid[idx]
end
end
end

function compute_fit_stats_and_cutoff(distribution, coords, y)
cutoff = LinRange(extrema(distribution)..., 500)
coords = convert(Vector{typeof(coords[begin])}, coords)
idx = findall(!isnothing, coords)
I = [SimpleSDMLayers._point_to_cartesian(distribution, c) for c in coords][idx]

obs = y .> 0

tp = zeros(Float64, length(cutoff))
fp = zeros(Float64, length(cutoff))
tn = zeros(Float64, length(cutoff))
fn = zeros(Float64, length(cutoff))

for (i, c) in enumerate(cutoff)
prd = [distribution.grid[i] >= c for i in I]
tp[i] = sum(prd .& obs)
tn[i] = sum(.!(prd) .& (.!obs))
fp[i] = sum(prd .& (.!obs))
fn[i] = sum(.!(prd) .& obs)
end

tpr = tp ./ (tp .+ fn)
fpr = fp ./ (fp .+ tn)
J = (tp ./ (tp .+ fn)) + (tn ./ (tn .+ fp)) .- 1.0

roc_dx = [reverse(fpr)[i] - reverse(fpr)[i - 1] for i in 2:length(fpr)]
roc_dy = [reverse(tpr)[i] + reverse(tpr)[i - 1] for i in 2:length(tpr)]
ROCAUC = sum(roc_dx .* (roc_dy ./ 2.0))

thr_index = last(findmax(J))
τ = cutoff[thr_index]

return Dict(:rocauc => ROCAUC, :threshold => τ, :J => J[last(findmax(J))])
end


function test_train_split(X, y, proportion=0.7)
train_size = floor(Int, proportion * length(y))
Itrain = StatsBase.sample(1:length(y), train_size; replace=false)
Itest = setdiff(1:length(y), Itrain)
Xtrain, Xtest = X[Itrain, :], X[Itest, :]
Ytrain, Ytest = y[Itrain], y[Itest]
return Xtrain, Ytrain, Xtest, Ytest
end

function predict_single_sdm(model, layers)

land_idx = findall(!isnothing, layers[begin].grid)

mat = zeros(Float32, length(layers), length(land_idx))

# Handle nothings here
layers_to_matrix!(layers, mat, land_idx)

pred = EvoTrees.predict(model, mat')

distribution = SimpleSDMPredictor(
zeros(Float32, size(layers[begin]));
SpeciesDistributionToolkit.boundingbox(layers[begin])...
)
distribution.grid[land_idx] = pred[:, 1]

uncertainty = SimpleSDMPredictor(zeros(Float32, size(layers[begin])); SpeciesDistributionToolkit.boundingbox(layers[begin])...)
uncertainty.grid[land_idx] = pred[:, 2]

rescale(distribution, (0, 1)), rescale(uncertainty, (0, 1))
end

function main()
runtime_dir = ARGS[1]
inputs = read_inputs_dict(runtime_dir)
predictor_paths = inputs["predictors"]
occurrence_path = inputs["occurrence"]
pseudoabs_path = inputs["background"]

predictors = SimpleSDMPredictor.(predictor_paths)

occurrence = CSV.read(occurrence_path, DataFrame, delim="\t")
occurrence_layer = create_occurrence_layer(similar(predictors[1]), occurrence)

pseudoabsences = CSV.read(pseudoabs_path, DataFrame, delim="\t")
pseudoabs_layer = create_occurrence_layer(similar(predictors[1]), pseudoabsences)
#pseudoabs_layer = create_occurrence_layer(similar(predictors[1]), pseudoabs_df)

X, y, p_and_a_coords = get_features_and_labels(occurrence_layer, pseudoabs_layer, predictors)

Xtrain, Ytrain, Xtest, Ytest = test_train_split(X, y)

brt = EvoTreeGaussian(;
loss = :gaussian,
metric = :gaussian,
nrounds = 100,
nbins = 100,
λ = 0.0,
γ = 0.0,
η = 0.1,
max_depth = 7,
min_weight = 1.0,
rowsample = 0.5,
colsample = 1.0,
)


model = fit_evotree(
brt;
x_train=Xtrain,
y_train=Ytrain,
x_eval=Xtest,
y_eval=Ytest
)

prediction, uncertainty = predict_single_sdm(model, predictors)

fit_dict = compute_fit_stats_and_cutoff(prediction, p_and_a_coords, y)
τ = fit_dict[:threshold]

# Set below threshold to 0
prediction.grid[findall(x -> x < τ, prediction.grid)] .= 0

sdm_path = joinpath(runtime_dir, "sdm.tif")
SpeciesDistributionToolkit.save(sdm_path, prediction)
uncertainty_path = joinpath(runtime_dir, "uncertainty.tif")
SpeciesDistributionToolkit.save(uncertainty_path, uncertainty)


fit_stats_path = joinpath(runtime_dir, "fit_stats.json")
open(fit_stats_path, "w") do f
write(f, JSON.json(fit_dict))
end

output_json_path = joinpath(runtime_dir, "output.json")
open(output_json_path, "w") do f
write(f, JSON.json(Dict(
:sdm => sdm_path,
:uncertainty => uncertainty_path,
:fit_stats => fit_stats_path
)))
end

end

main()
35 changes: 35 additions & 0 deletions scripts/SDM/julia_sdms/fitBRT.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
script: fitBRT.jl
name: BRT
description: "This script creates an SDM and uncertainty map based on using Boosted Regression Trees (BRTs) using the package SpeciesDistributionToolkit.jl and EvoTrees.jl"
author:
- name: Michael D. Catchen
identifier: https://orcid.org/0000-0002-6506-6487
inputs:
occurrence:
label: occurrence coordinate dataframe
description: Dataframe, presence data.
type: text/tab-separated-values
example: "/output/data/getObservations/9f7d1cc148464cd0517e01c67af0ab5b/obs_data.tsv"
background:
label: background
description: Dataframe, background data.
type: text/tab-separated-values
example: "/output/SDM/julia_sdms/generateBackground/f69fe7abd1711e193bb4f8aef51c74cc/background.csv"
predictors:
label: predictors
description: layer names (predictors) as a list
type: image/tiff;application=geotiff[]
example: ["/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio1_242b2b01561981-01-01.tif","/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio2_243a4e7f541981-01-01.tif"]
outputs:
sdm:
label: sdm
description: map of predicted occurrence probability
type: image/tiff;application=geotiff
sdm_uncertainty:
label: sdm uncertainty
description: map of relative uncertainty
type: image/tiff;application=geotiff
fit_stats:
label: fit_stats
description: JSON of model fit stats and threshold
type: text/json
35 changes: 35 additions & 0 deletions scripts/SDM/julia_sdms/generateBackground.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
using JSON
using CSV
using DataFrames
using SpeciesDistributionToolkit

include("./shared.jl")

function main()
runtime_dir = ARGS[1]
inputs = read_inputs_dict(runtime_dir)

predictor_paths = inputs["predictors"]
occurrence_path = inputs["presence"]
buffer_distance = inputs["buffer_distance"] / 1000 # div by 1000 to convert to km

predictors = SimpleSDMPredictor.(predictor_paths)

occurrence = CSV.read(occurrence_path, DataFrame, delim="\t")
occurrence_layer = create_occurrence_layer(similar(predictors[1]), occurrence)

buffer = pseudoabsencemask(WithinRadius, occurrence_layer; distance = buffer_distance)
absences = SpeciesDistributionToolkit.sample(.!buffer, floor(Int, 0.5sum(occurrence_layer)))

abs_coords = findall(absences)
pseudoabs_df = DataFrame(lon=[c[1] for c in abs_coords], lat=[c[2] for c in abs_coords])
CSV.write("$runtime_dir/background.tsv", pseudoabs_df, delim="\t")

output_json_path = joinpath(runtime_dir, "output.json")
open(output_json_path, "w") do f
write(f, JSON.json(Dict(:background=>"$runtime_dir/background.tsv")))
end
end

main()

28 changes: 28 additions & 0 deletions scripts/SDM/julia_sdms/generateBackground.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
script: generateBackground.jl
name: Generate Background
description: "This script creates a set of pseudoabsences/background points."
author:
- name: Michael D. Catchen
identifier: https://orcid.org/0000-0002-6506-6487
inputs:
presence:
label: presence
description: Dataframe, presence data.
type: text/tab-separated-values
example: "/output/data/getObservations/9f7d1cc148464cd0517e01c67af0ab5b/obs_data.tsv"
predictors:
label: predictors
description: layer names (predictors) as a list
type: image/tiff;application=geotiff[]
example: ["/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio1_242b2b01561981-01-01.tif","/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio2_243a4e7f541981-01-01.tif"]
buffer_distance:
label: buffer_distance
description: the minimum distance between any presence and any pseudoabsence in meters
type: int
example: 50000
outputs:
background:
label: background
description: TSV file containing with the coordinates background points.
type: text/tab-separated-values

92 changes: 92 additions & 0 deletions scripts/SDM/julia_sdms/loadCHELSA.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
using SpeciesDistributionToolkit
using JSON
using CSV
using MultivariateStats
using StatsBase
using DataFrames

include("./shared.jl")

const PROVIDER = RasterData(CHELSA2, BioClim)

function convert_layers_to_features_matrix(layers)
I = findall(!isnothing, layers[1].grid)
data_matrix = zeros(Float32, length(layers), length(I))
for (i,l) in enumerate(layers)
x = Float32.(vec(l.grid[I]))
z = StatsBase.fit(ZScoreTransform, x)
data_matrix[i,:] .= StatsBase.transform(z, x)
end
data_matrix
end

function fill_layer!(empty_layer, vec)
m = reshape(vec, size(empty_layer))
for j in eachindex(empty_layer.grid)
empty_layer.grid[j] = m[j]
end
end

function pca_data_matrix(data_matrix)
pca = MultivariateStats.fit(PCA, data_matrix)
MultivariateStats.transform(pca, data_matrix)
end

function make_pca_layers(layers)
pca_mat = pca_data_matrix(convert_layers_to_features_matrix(layers))
pca_layers = [convert(Float32, similar(layers[begin])) for l in 1:size(pca_mat, 1)]
for (i,pca_layer) in enumerate(pca_layers)
fill_layer!(pca_layer, pca_mat[i,:])
end
pca_layers
end

function write_outputs(runtime_dir, layers)
predictor_paths = []

for (i,l) in enumerate(layers)
outpath = joinpath(runtime_dir, "predictor$i.tif")
push!(predictor_paths, outpath)
SpeciesDistributionToolkit.save(outpath, l)
end

output_json_path = joinpath(runtime_dir, "output.json")
open(output_json_path, "w") do f
write(f, JSON.json(Dict(:predictors=>predictor_paths)))
end
end

function main()
runtime_dir = ARGS[1]
inputs = read_inputs_dict(runtime_dir)

bbox = inputs["bbox"]
pca_input = inputs["pca"]
layer_nums = inputs["layer_numbers"]
layer_names = ["BIO$i" for i in layer_nums]

bbox = (left=bbox[1], bottom=bbox[2], right=bbox[3], top=bbox[4])
@info bbox

layers = []
for l in layer_names
success = false
ct = 1
while !success
try
a = convert(Float32, SimpleSDMPredictor(PROVIDER; layer=l, bbox...))
success = true
push!(layers, a)
catch
@info "Errored on $l on attempt $ct. Almost certainly a network error on CHELSA's side. Trying again..."
ct += 1
end
end
end

layers = pca_input ? make_pca_layers(layers) : layers

write_outputs(runtime_dir, layers)
end

main()
27 changes: 27 additions & 0 deletions scripts/SDM/julia_sdms/loadCHELSA.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
script: loadCHELSA.jl
name: Load CHELSA
description: "This script creates an SDM and uncertainty map based on using Boosted Regression Trees (BRTs) using the package SpeciesDistributionToolkit.jl and EvoTrees.jl"
author:
- name: Michael D. Catchen
identifier: https://orcid.org/0000-0002-6506-6487
inputs:
bbox:
label: bbox
description: Vector of float, bbox coordinates of the extent in the order xmin, ymin, xmax, ymax in coordinates
type: float[]
example: [-86.75, 33.72,-52.21,63.27]
pca:
label: pca layers
description: Boolean, whether to PCA predictor layers or not
type: boolean
example: True
layer_numbers:
label: CHELSA layer numbers
description: the CHELSA layers to use
type: int[]
example: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]
outputs:
predictors:
label: predictors
description: raster, predictors
type: image/tiff;application=geotiff[]
Loading

0 comments on commit e31bac4

Please sign in to comment.