Merge branch 'mdc/julia_brt' into conference_staging
Showing 12 changed files with 1,851 additions and 2 deletions.
fitBRT.jl (new file, 171 additions)
using JSON
using CSV
using DataFrames
using EvoTrees
using StatsBase
using SpeciesDistributionToolkit

include("./shared.jl")

function get_features_and_labels(presences, absences, climate_layers)
    presences = mask(presences, climate_layers[begin])
    absences = mask(absences, climate_layers[begin])
    coord_presence = keys(replace(presences, false => nothing))
    coord_absence = keys(replace(absences, false => nothing))
    coord = vcat(coord_presence, coord_absence)

    X = hcat([layer[coord] for layer in climate_layers]...)
    y = vcat(fill(1.0, length(coord_presence)), fill(0.0, length(coord_absence)))
    return X, y, coord
end

function layers_to_matrix!(climate_layers, mat, land_idx)
    for (i, idx) in enumerate(land_idx)
        for l in eachindex(climate_layers)
            mat[l, i] = climate_layers[l].grid[idx]
        end
    end
end

function compute_fit_stats_and_cutoff(distribution, coords, y)
    cutoff = LinRange(extrema(distribution)..., 500)
    coords = convert(Vector{typeof(coords[begin])}, coords)
    idx = findall(!isnothing, coords)
    I = [SimpleSDMLayers._point_to_cartesian(distribution, c) for c in coords][idx]

    obs = y .> 0

    tp = zeros(Float64, length(cutoff))
    fp = zeros(Float64, length(cutoff))
    tn = zeros(Float64, length(cutoff))
    fn = zeros(Float64, length(cutoff))

    for (i, c) in enumerate(cutoff)
        prd = [distribution.grid[k] >= c for k in I]
        tp[i] = sum(prd .& obs)
        tn[i] = sum(.!(prd) .& (.!obs))
        fp[i] = sum(prd .& (.!obs))
        fn[i] = sum(.!(prd) .& obs)
    end

    tpr = tp ./ (tp .+ fn)
    fpr = fp ./ (fp .+ tn)
    J = (tp ./ (tp .+ fn)) + (tn ./ (tn .+ fp)) .- 1.0

    # ROC AUC via trapezoidal integration of the (FPR, TPR) curve
    roc_dx = [reverse(fpr)[i] - reverse(fpr)[i - 1] for i in 2:length(fpr)]
    roc_dy = [reverse(tpr)[i] + reverse(tpr)[i - 1] for i in 2:length(tpr)]
    ROCAUC = sum(roc_dx .* (roc_dy ./ 2.0))

    # The threshold τ is the cutoff that maximizes Youden's J statistic
    thr_index = last(findmax(J))
    τ = cutoff[thr_index]

    return Dict(:rocauc => ROCAUC, :threshold => τ, :J => J[thr_index])
end

function test_train_split(X, y, proportion=0.7)
    train_size = floor(Int, proportion * length(y))
    Itrain = StatsBase.sample(1:length(y), train_size; replace=false)
    Itest = setdiff(1:length(y), Itrain)
    Xtrain, Xtest = X[Itrain, :], X[Itest, :]
    Ytrain, Ytest = y[Itrain], y[Itest]
    return Xtrain, Ytrain, Xtest, Ytest
end

function predict_single_sdm(model, layers)
    land_idx = findall(!isnothing, layers[begin].grid)

    mat = zeros(Float32, length(layers), length(land_idx))

    # Handle nothings here
    layers_to_matrix!(layers, mat, land_idx)

    pred = EvoTrees.predict(model, mat')

    distribution = SimpleSDMPredictor(
        zeros(Float32, size(layers[begin]));
        SpeciesDistributionToolkit.boundingbox(layers[begin])...
    )
    distribution.grid[land_idx] = pred[:, 1]

    uncertainty = SimpleSDMPredictor(
        zeros(Float32, size(layers[begin]));
        SpeciesDistributionToolkit.boundingbox(layers[begin])...
    )
    uncertainty.grid[land_idx] = pred[:, 2]

    return rescale(distribution, (0, 1)), rescale(uncertainty, (0, 1))
end

function main()
    runtime_dir = ARGS[1]
    inputs = read_inputs_dict(runtime_dir)
    predictor_paths = inputs["predictors"]
    occurrence_path = inputs["occurrence"]
    pseudoabs_path = inputs["background"]

    predictors = SimpleSDMPredictor.(predictor_paths)

    occurrence = CSV.read(occurrence_path, DataFrame, delim="\t")
    occurrence_layer = create_occurrence_layer(similar(predictors[1]), occurrence)

    pseudoabsences = CSV.read(pseudoabs_path, DataFrame, delim="\t")
    pseudoabs_layer = create_occurrence_layer(similar(predictors[1]), pseudoabsences)

    X, y, p_and_a_coords = get_features_and_labels(occurrence_layer, pseudoabs_layer, predictors)

    Xtrain, Ytrain, Xtest, Ytest = test_train_split(X, y)

    brt = EvoTreeGaussian(;
        loss = :gaussian,
        metric = :gaussian,
        nrounds = 100,
        nbins = 100,
        λ = 0.0,
        γ = 0.0,
        η = 0.1,
        max_depth = 7,
        min_weight = 1.0,
        rowsample = 0.5,
        colsample = 1.0,
    )

    model = fit_evotree(
        brt;
        x_train=Xtrain,
        y_train=Ytrain,
        x_eval=Xtest,
        y_eval=Ytest
    )

    prediction, uncertainty = predict_single_sdm(model, predictors)

    fit_dict = compute_fit_stats_and_cutoff(prediction, p_and_a_coords, y)
    τ = fit_dict[:threshold]

    # Set below threshold to 0
    prediction.grid[findall(x -> x < τ, prediction.grid)] .= 0

    sdm_path = joinpath(runtime_dir, "sdm.tif")
    SpeciesDistributionToolkit.save(sdm_path, prediction)
    uncertainty_path = joinpath(runtime_dir, "uncertainty.tif")
    SpeciesDistributionToolkit.save(uncertainty_path, uncertainty)

    fit_stats_path = joinpath(runtime_dir, "fit_stats.json")
    open(fit_stats_path, "w") do f
        write(f, JSON.json(fit_dict))
    end

    output_json_path = joinpath(runtime_dir, "output.json")
    open(output_json_path, "w") do f
        write(f, JSON.json(Dict(
            :sdm => sdm_path,
            :uncertainty => uncertainty_path,
            :fit_stats => fit_stats_path
        )))
    end
end

main()
BRT pipeline description (YAML, new file, 35 additions)
script: fitBRT.jl
name: BRT
description: "This script creates an SDM and an uncertainty map using Boosted Regression Trees (BRTs), built with the SpeciesDistributionToolkit.jl and EvoTrees.jl packages."
author:
  - name: Michael D. Catchen
    identifier: https://orcid.org/0000-0002-6506-6487
inputs:
  occurrence:
    label: occurrence coordinate dataframe
    description: Dataframe, presence data.
    type: text/tab-separated-values
    example: "/output/data/getObservations/9f7d1cc148464cd0517e01c67af0ab5b/obs_data.tsv"
  background:
    label: background
    description: Dataframe, background (pseudoabsence) data.
    type: text/tab-separated-values
    example: "/output/SDM/julia_sdms/generateBackground/f69fe7abd1711e193bb4f8aef51c74cc/background.csv"
  predictors:
    label: predictors
    description: list of predictor raster layers (GeoTIFF paths)
    type: image/tiff;application=geotiff[]
    example: ["/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio1_242b2b01561981-01-01.tif","/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio2_243a4e7f541981-01-01.tif"]
outputs:
  sdm:
    label: sdm
    description: map of predicted occurrence probability
    type: image/tiff;application=geotiff
  sdm_uncertainty:
    label: sdm uncertainty
    description: map of relative uncertainty
    type: image/tiff;application=geotiff
  fit_stats:
    label: fit_stats
    description: JSON of model fit stats and threshold
    type: text/json
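For orientation, here is a minimal sketch of how the inputs for this step could be assembled, assuming read_inputs_dict (defined in shared.jl, which is not shown in this diff) reads an input.json file from the runtime directory; the input.json filename and the predictor paths are illustrative assumptions, while the key names match the inputs block above.

using JSON

# Hypothetical driver; the input.json filename read by read_inputs_dict is an assumption.
runtime_dir = mktempdir()
inputs = Dict(
    "occurrence" => "obs_data.tsv",                        # TSV of presence coordinates
    "background" => "background.tsv",                      # TSV of pseudoabsence coordinates
    "predictors" => ["predictor1.tif", "predictor2.tif"],  # GeoTIFF predictor layers
)
open(joinpath(runtime_dir, "input.json"), "w") do f
    write(f, JSON.json(inputs))
end
# The step is then run with the runtime directory as its only argument:
# julia fitBRT.jl <runtime_dir>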
generateBackground.jl (new file, 35 additions)
using JSON
using CSV
using DataFrames
using SpeciesDistributionToolkit

include("./shared.jl")

function main()
    runtime_dir = ARGS[1]
    inputs = read_inputs_dict(runtime_dir)

    predictor_paths = inputs["predictors"]
    occurrence_path = inputs["presence"]
    buffer_distance = inputs["buffer_distance"] / 1000 # div by 1000 to convert to km

    predictors = SimpleSDMPredictor.(predictor_paths)

    occurrence = CSV.read(occurrence_path, DataFrame, delim="\t")
    occurrence_layer = create_occurrence_layer(similar(predictors[1]), occurrence)

    buffer = pseudoabsencemask(WithinRadius, occurrence_layer; distance = buffer_distance)
    absences = SpeciesDistributionToolkit.sample(.!buffer, floor(Int, 0.5sum(occurrence_layer)))

    abs_coords = findall(absences)
    pseudoabs_df = DataFrame(lon=[c[1] for c in abs_coords], lat=[c[2] for c in abs_coords])
    CSV.write("$runtime_dir/background.tsv", pseudoabs_df, delim="\t")

    output_json_path = joinpath(runtime_dir, "output.json")
    open(output_json_path, "w") do f
        write(f, JSON.json(Dict(:background=>"$runtime_dir/background.tsv")))
    end
end

main()
Generate Background pipeline description (YAML, new file, 28 additions)
script: generateBackground.jl
name: Generate Background
description: "This script creates a set of pseudoabsences/background points."
author:
  - name: Michael D. Catchen
    identifier: https://orcid.org/0000-0002-6506-6487
inputs:
  presence:
    label: presence
    description: Dataframe, presence data.
    type: text/tab-separated-values
    example: "/output/data/getObservations/9f7d1cc148464cd0517e01c67af0ab5b/obs_data.tsv"
  predictors:
    label: predictors
    description: list of predictor raster layers (GeoTIFF paths)
    type: image/tiff;application=geotiff[]
    example: ["/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio1_242b2b01561981-01-01.tif","/output/data/loadFromStac/ea82148a2926d97acf85cea61a110194/bio2_243a4e7f541981-01-01.tif"]
  buffer_distance:
    label: buffer_distance
    description: the minimum distance, in meters, between any presence and any pseudoabsence
    type: int
    example: 50000
outputs:
  background:
    label: background
    description: TSV file containing the coordinates of the background points.
    type: text/tab-separated-values
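As a quick check that the output matches what fitBRT.jl expects for its background input, one could read the generated file back and inspect its columns; a minimal sketch, assuming the background.tsv written by the script above.

using CSV
using DataFrames

runtime_dir = "path/to/runtime_dir"  # the directory passed to generateBackground.jl
background = CSV.read(joinpath(runtime_dir, "background.tsv"), DataFrame; delim="\t")
@assert names(background) == ["lon", "lat"]  # columns written by generateBackground.jl
@info "Generated $(nrow(background)) pseudoabsence points"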
loadCHELSA.jl (new file, 92 additions)
using SpeciesDistributionToolkit
using JSON
using CSV
using MultivariateStats
using StatsBase
using DataFrames

include("./shared.jl")

const PROVIDER = RasterData(CHELSA2, BioClim)

function convert_layers_to_features_matrix(layers)
    I = findall(!isnothing, layers[1].grid)
    data_matrix = zeros(Float32, length(layers), length(I))
    for (i, l) in enumerate(layers)
        x = Float32.(vec(l.grid[I]))
        z = StatsBase.fit(ZScoreTransform, x)
        data_matrix[i, :] .= StatsBase.transform(z, x)
    end
    data_matrix
end

function fill_layer!(empty_layer, v)
    m = reshape(v, size(empty_layer))
    for j in eachindex(empty_layer.grid)
        empty_layer.grid[j] = m[j]
    end
end

function pca_data_matrix(data_matrix)
    pca = MultivariateStats.fit(PCA, data_matrix)
    MultivariateStats.transform(pca, data_matrix)
end

function make_pca_layers(layers)
    pca_mat = pca_data_matrix(convert_layers_to_features_matrix(layers))
    pca_layers = [convert(Float32, similar(layers[begin])) for _ in 1:size(pca_mat, 1)]
    for (i, pca_layer) in enumerate(pca_layers)
        fill_layer!(pca_layer, pca_mat[i, :])
    end
    pca_layers
end

function write_outputs(runtime_dir, layers)
    predictor_paths = []

    for (i, l) in enumerate(layers)
        outpath = joinpath(runtime_dir, "predictor$i.tif")
        push!(predictor_paths, outpath)
        SpeciesDistributionToolkit.save(outpath, l)
    end

    output_json_path = joinpath(runtime_dir, "output.json")
    open(output_json_path, "w") do f
        write(f, JSON.json(Dict(:predictors=>predictor_paths)))
    end
end

function main()
    runtime_dir = ARGS[1]
    inputs = read_inputs_dict(runtime_dir)

    bbox = inputs["bbox"]
    pca_input = inputs["pca"]
    layer_nums = inputs["layer_numbers"]
    layer_names = ["BIO$i" for i in layer_nums]

    bbox = (left=bbox[1], bottom=bbox[2], right=bbox[3], top=bbox[4])
    @info bbox

    layers = []
    for l in layer_names
        success = false
        ct = 1
        while !success
            try
                a = convert(Float32, SimpleSDMPredictor(PROVIDER; layer=l, bbox...))
                success = true
                push!(layers, a)
            catch
                @info "Errored on $l on attempt $ct. Almost certainly a network error on CHELSA's side. Trying again..."
                ct += 1
            end
        end
    end

    layers = pca_input ? make_pca_layers(layers) : layers

    write_outputs(runtime_dir, layers)
end

main()
Load CHELSA pipeline description (YAML, new file, 27 additions)
script: loadCHELSA.jl
name: Load CHELSA
description: "This script downloads CHELSA2 BioClim layers for a given bounding box and optionally collapses them into principal components (PCA) to use as SDM predictors."
author:
  - name: Michael D. Catchen
    identifier: https://orcid.org/0000-0002-6506-6487
inputs:
  bbox:
    label: bbox
    description: Vector of floats giving the bounding-box coordinates of the extent, in the order xmin, ymin, xmax, ymax
    type: float[]
    example: [-86.75, 33.72,-52.21,63.27]
  pca:
    label: pca layers
    description: Boolean, whether to apply a PCA to the predictor layers
    type: boolean
    example: true
  layer_numbers:
    label: CHELSA layer numbers
    description: the CHELSA BioClim layer numbers to use
    type: int[]
    example: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]
outputs:
  predictors:
    label: predictors
    description: raster predictor layers
    type: image/tiff;application=geotiff[]
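Combining the three inputs above, a hypothetical input dictionary for this step could be written as follows (again assuming read_inputs_dict reads an input.json file from the runtime directory, which is an assumption since shared.jl is not part of this diff); the values are taken from the examples in the spec.

using JSON

runtime_dir = mktempdir()
chelsa_inputs = Dict(
    "bbox" => [-86.75, 33.72, -52.21, 63.27],  # xmin, ymin, xmax, ymax
    "pca" => true,                             # collapse the layers with a PCA
    "layer_numbers" => collect(1:19),          # all 19 BioClim layers
)
open(joinpath(runtime_dir, "input.json"), "w") do f
    write(f, JSON.json(chelsa_inputs))
end
# julia loadCHELSA.jl <runtime_dir>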