From 27d72a52ae64aa5a09337916e994b4fcd2ca4a9a Mon Sep 17 00:00:00 2001
From: Martijn Visser <mgvisser@gmail.com>
Date: Fri, 8 Sep 2023 15:58:45 +0200
Subject: [PATCH 1/4] use FixedSizeDiffCache for flows

When profiling runs with the default `autodiff=true`, this line was responsible for 35% of the time and almost all allocations:

https://github.com/Deltares/Ribasim/blob/b3eb044a722d1655c5465bafe50951b75fe960d6/core/src/solve.jl#L1002

`connectivity.flow` is a sparse matrix, but the DiffCache does not seem to like sparse matrixes. The `dual_du` field was a dense vector of length n x n x cache_size, and the `get_tmp` call led to further allocations trying to restructure the sparse matrix from the vector. Luckily there is the FixedSizeDiffCache that helps here: https://docs.sciml.ai/PreallocationTools/stable/#FixedSizeDiffCache

This retains the sparsity in the dual, and returns a `ReinterpretArray` from `get_tmp` during autodiff. To avoid materializing this reinterpretarray I needed to additionally fill the parent array with zeros rather than the array itself.

There is another unrelated performance fix here, and that is to concretely type the Parameter struct, by adding type parameters from its fields. Otherwise you have situations like

```
struct A
    a::Vector
end
```

where the compiler doesn't know the element type of the Vector, so it can perform less optimizations. The solution:

```
struct A{T}
    a::Vector{T}
end
```

Finally I consistently added AbstractVector/Matrix argument type annotations to ensure the ReinterpretArray could enter everywhere. And I renamed the functions to formulate flows to `formulate_flow`, to make it easier to separate them from the other `formulate!` methods.
---
 core/src/Ribasim.jl         |  2 +-
 core/src/create.jl          |  5 ++-
 core/src/solve.jl           | 80 +++++++++++++++++++++----------------
 core/test/run_models.jl     |  5 +++
 docs/contribute/addnode.qmd |  4 +-
 5 files changed, 56 insertions(+), 40 deletions(-)

diff --git a/core/src/Ribasim.jl b/core/src/Ribasim.jl
index c1bc06678..78b615aaa 100644
--- a/core/src/Ribasim.jl
+++ b/core/src/Ribasim.jl
@@ -20,7 +20,7 @@ using Legolas: Legolas, @schema, @version, validate, SchemaVersion, declared
 using Logging: current_logger, min_enabled_level, with_logger
 using LoggingExtras: EarlyFilteredLogger, LevelOverrideLogger
 using OrdinaryDiffEq
-using PreallocationTools: DiffCache, get_tmp
+using PreallocationTools: DiffCache, FixedSizeDiffCache, get_tmp
 using SciMLBase
 using SparseArrays
 using SQLite: SQLite, DB, Query, esc_id
diff --git a/core/src/create.jl b/core/src/create.jl
index 8221f4504..c75a6c0e8 100644
--- a/core/src/create.jl
+++ b/core/src/create.jl
@@ -202,10 +202,11 @@ function Connectivity(db::DB, config::Config, chunk_size::Int)::Connectivity
     edge_ids_flow_inv = Dictionary(values(edge_ids_flow), keys(edge_ids_flow))
 
     flow = adjacency_matrix(graph_flow, Float64)
-    nonzeros(flow) .= 0.0
+    flow .= 0.0
 
     if config.solver.autodiff
-        flow = DiffCache(flow, chunk_size)
+        # FixedSizeDiffCache performs better for sparse matrix
+        flow = FixedSizeDiffCache(flow, chunk_size)
     end
 
     return Connectivity(
diff --git a/core/src/solve.jl b/core/src/solve.jl
index 9c684d35f..35ad28646 100644
--- a/core/src/solve.jl
+++ b/core/src/solve.jl
@@ -401,21 +401,21 @@ struct PidControl{T} <: AbstractParameterNode
 end
 
 # TODO Automatically add all nodetypes here
-struct Parameters
+struct Parameters{T, TSparse, C1, C2}
     starttime::DateTime
-    connectivity::Connectivity
-    basin::Basin
+    connectivity::Connectivity{TSparse}
+    basin::Basin{T, C1}
     linear_resistance::LinearResistance
     manning_resistance::ManningResistance
-    tabulated_rating_curve::TabulatedRatingCurve
+    tabulated_rating_curve::TabulatedRatingCurve{C2}
     fractional_flow::FractionalFlow
     level_boundary::LevelBoundary
     flow_boundary::FlowBoundary
-    pump::Pump
-    outlet::Outlet
+    pump::Pump{T}
+    outlet::Outlet{T}
     terminal::Terminal
     discrete_control::DiscreteControl
-    pid_control::PidControl
+    pid_control::PidControl{T}
     lookup::Dict{Int, Symbol}
 end
 
@@ -508,8 +508,8 @@ function formulate!(
     du::AbstractVector,
     basin::Basin,
     storage::AbstractVector,
-    current_area,
-    current_level,
+    current_area::AbstractVector,
+    current_level::AbstractVector,
     t::Float64,
 )::Nothing
     for i in eachindex(storage)
@@ -553,13 +553,13 @@ end
 function continuous_control!(
     u::ComponentVector,
     du::ComponentVector,
-    current_area,
+    current_area::AbstractVector,
     pid_control::PidControl,
     p::Parameters,
     integral_value::SubArray,
     current_level::AbstractVector,
-    flow::SparseMatrixCSC,
-    pid_error,
+    flow::AbstractMatrix,
+    pid_error::AbstractVector,
     t::Float64,
 )::Nothing
     (; connectivity, pump, outlet, basin, fractional_flow) = p
@@ -700,11 +700,11 @@ end
 """
 Directed graph: outflow is positive!
 """
-function formulate!(
+function formulate_flow!(
     linear_resistance::LinearResistance,
     p::Parameters,
-    current_level,
-    flow,
+    current_level::AbstractVector,
+    flow::AbstractMatrix,
     t::Float64,
 )::Nothing
     (; connectivity) = p
@@ -733,11 +733,11 @@ end
 """
 Directed graph: outflow is positive!
 """
-function formulate!(
+function formulate_flow!(
     tabulated_rating_curve::TabulatedRatingCurve,
     p::Parameters,
-    current_level,
-    flow::SparseMatrixCSC,
+    current_level::AbstractVector,
+    flow::AbstractMatrix,
     t::Float64,
 )::Nothing
     (; connectivity) = p
@@ -800,11 +800,11 @@ The average of the upstream and downstream water depth is used to compute cross-
 hydraulic radius. This ensures that a basin can receive water after it has gone
 dry.
 """
-function formulate!(
+function formulate_flow!(
     manning_resistance::ManningResistance,
     p::Parameters,
-    current_level,
-    flow,
+    current_level::AbstractVector,
+    flow::AbstractMatrix,
     t::Float64,
 )::Nothing
     (; basin, connectivity) = p
@@ -859,7 +859,11 @@ function formulate!(
     return nothing
 end
 
-function formulate!(fractional_flow::FractionalFlow, flow, p::Parameters)::Nothing
+function formulate_flow!(
+    fractional_flow::FractionalFlow,
+    flow::AbstractMatrix,
+    p::Parameters,
+)::Nothing
     (; connectivity) = p
     (; graph_flow) = connectivity
     (; node_id, fraction) = fractional_flow
@@ -871,7 +875,12 @@ function formulate!(fractional_flow::FractionalFlow, flow, p::Parameters)::Nothi
     return nothing
 end
 
-function formulate!(flow_boundary::FlowBoundary, p::Parameters, flow, t::Float64)::Nothing
+function formulate_flow!(
+    flow_boundary::FlowBoundary,
+    p::Parameters,
+    flow::AbstractMatrix,
+    t::Float64,
+)::Nothing
     (; connectivity) = p
     (; graph_flow) = connectivity
     (; node_id, active, flow_rate) = flow_boundary
@@ -892,10 +901,10 @@ function formulate!(flow_boundary::FlowBoundary, p::Parameters, flow, t::Float64
     end
 end
 
-function formulate!(
+function formulate_flow!(
     node::Union{Pump, Outlet},
     p::Parameters,
-    flow,
+    flow::AbstractMatrix,
     storage::AbstractVector,
 )::Nothing
     (; connectivity, basin) = p
@@ -938,7 +947,7 @@ end
 function formulate!(
     du::ComponentVector,
     connectivity::Connectivity,
-    flow::SparseMatrixCSC,
+    flow::AbstractMatrix,
     basin::Basin,
 )::Nothing
     # loop over basins
@@ -960,7 +969,7 @@ function formulate_flows!(
     p::Parameters,
     storage::AbstractVector,
     current_level,
-    flow::SparseMatrixCSC,
+    flow::AbstractMatrix,
     t::Float64,
 )::Nothing
     (;
@@ -973,13 +982,13 @@ function formulate_flows!(
         outlet,
     ) = p
 
-    formulate!(linear_resistance, p, current_level, flow, t)
-    formulate!(manning_resistance, p, current_level, flow, t)
-    formulate!(tabulated_rating_curve, p, current_level, flow, t)
-    formulate!(flow_boundary, p, flow, t)
-    formulate!(fractional_flow, flow, p)
-    formulate!(pump, p, flow, storage)
-    formulate!(outlet, p, flow, storage)
+    formulate_flow!(linear_resistance, p, current_level, flow, t)
+    formulate_flow!(manning_resistance, p, current_level, flow, t)
+    formulate_flow!(tabulated_rating_curve, p, current_level, flow, t)
+    formulate_flow!(flow_boundary, p, flow, t)
+    formulate_flow!(fractional_flow, flow, p)
+    formulate_flow!(pump, p, flow, storage)
+    formulate_flow!(outlet, p, flow, storage)
 
     return nothing
 end
@@ -1000,7 +1009,8 @@ function water_balance!(
 
     du .= 0.0
     flow = get_tmp(connectivity.flow, u)
-    nonzeros(flow) .= 0.0
+    # use parent to avoid materializing the ReinterpretArray from FixedSizeDiffCache
+    parent(flow) .= 0.0
 
     current_area = get_tmp(basin.current_area, u)
     current_level = get_tmp(basin.current_level, u)
diff --git a/core/test/run_models.jl b/core/test/run_models.jl
index e95197086..db2b97f22 100644
--- a/core/test/run_models.jl
+++ b/core/test/run_models.jl
@@ -32,6 +32,11 @@ end
     end
 
     @test model isa Ribasim.Model
+    p = model.integrator.p
+    @test p isa Ribasim.Parameter
+    @test isconcretetype(typeof(p))
+    @test all(isconcretetype, fieldtypes(typeof(p)))
+
     @test successful_retcode(model)
     @test model.integrator.sol.u[end] ≈ Float32[519.8817, 519.8798, 339.3959, 1418.4331] skip =
         Sys.isapple() atol = 1.5
diff --git a/docs/contribute/addnode.qmd b/docs/contribute/addnode.qmd
index b925824ce..3c15e841f 100644
--- a/docs/contribute/addnode.qmd
+++ b/docs/contribute/addnode.qmd
@@ -61,10 +61,10 @@ end
 
 ## Node behavior
 
-In general if the new node type dictates flow, the behaviour of the new node in the Ribasim core is defined in a method of the `formulate!` function, which is called within the `water_balance!` (both in `solve.jl`) function being the right hand side of the system of differential equations solved by Ribasim. Here the details depend highly on the specifics of the node type. An example structure of a `formulate!` method is given below.
+In general if the new node type dictates flow, the behaviour of the new node in the Ribasim core is defined in a method of the `formulate_flow!` function, which is called within the `water_balance!` (both in `solve.jl`) function being the right hand side of the system of differential equations solved by Ribasim. Here the details depend highly on the specifics of the node type. An example structure of a `formulate_flow!` method is given below.
 
 ```julia
-function formulate!(new_node_type::NewNodeType, p::Parameters)::Nothing
+function formulate_flow!(new_node_type::NewNodeType, p::Parameters)::Nothing
     # Retrieve relevant parameters
     (; connectivity) = p
     (; flow) = connectivity

From ce69e3d9c75bd67d19df0892e9751e5b662a8858 Mon Sep 17 00:00:00 2001
From: Martijn Visser <mgvisser@gmail.com>
Date: Fri, 8 Sep 2023 17:05:52 +0200
Subject: [PATCH 2/4] s

---
 core/test/run_models.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/test/run_models.jl b/core/test/run_models.jl
index db2b97f22..466bbe2bc 100644
--- a/core/test/run_models.jl
+++ b/core/test/run_models.jl
@@ -33,7 +33,7 @@ end
 
     @test model isa Ribasim.Model
     p = model.integrator.p
-    @test p isa Ribasim.Parameter
+    @test p isa Ribasim.Parameters
     @test isconcretetype(typeof(p))
     @test all(isconcretetype, fieldtypes(typeof(p)))
 

From 8a2f3ad1dcd8a188d4979da60440a6349803c8d1 Mon Sep 17 00:00:00 2001
From: Martijn Visser <mgvisser@gmail.com>
Date: Fri, 8 Sep 2023 17:49:34 +0200
Subject: [PATCH 3/4] avoid allocating value vectors in get_level

`Dictionary` uses `Indices{I}` as keys, and `Vector{T}` as values. The Parameters contain both, and therefore it was free to construct a `Dictionary` in a frequently called function like `get_level`. However with autodiff, the values could be a ReinterpretArray with Duals instead of just a Vector. This meant that on Dictionary creation it would convert the ReinterpretArray to a Vector, leading to many allocations.

This is on top of #581. After that, this was responsible for 94% of the time spent. With this PR that goes down to about 2%, leading to a nice little speedup.
---
 core/src/Ribasim.jl |  2 +-
 core/src/solve.jl   |  2 +-
 core/src/utils.jl   | 32 +++++++++++++++++---------------
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/core/src/Ribasim.jl b/core/src/Ribasim.jl
index 78b615aaa..8821ef2c8 100644
--- a/core/src/Ribasim.jl
+++ b/core/src/Ribasim.jl
@@ -12,7 +12,7 @@ using ComponentArrays: ComponentVector
 using DataInterpolations: LinearInterpolation, derivative
 using Dates
 using DBInterface: execute, prepare
-using Dictionaries: Indices, Dictionary, gettoken, gettokenvalue, dictionary
+using Dictionaries: Indices, Dictionary, gettoken, dictionary
 using ForwardDiff: pickchunksize
 using DiffEqCallbacks
 using Graphs: DiGraph, add_edge!, adjacency_matrix, inneighbors, outneighbors
diff --git a/core/src/solve.jl b/core/src/solve.jl
index 35ad28646..227ced273 100644
--- a/core/src/solve.jl
+++ b/core/src/solve.jl
@@ -968,7 +968,7 @@ end
 function formulate_flows!(
     p::Parameters,
     storage::AbstractVector,
-    current_level,
+    current_level::AbstractVector,
     flow::AbstractMatrix,
     t::Float64,
 )::Nothing
diff --git a/core/src/utils.jl b/core/src/utils.jl
index 13da0e4b6..d3ebcda82 100644
--- a/core/src/utils.jl
+++ b/core/src/utils.jl
@@ -406,34 +406,36 @@ end
 Get the current water level of a node ID.
 The ID can belong to either a Basin or a LevelBoundary.
 """
-function get_level(p::Parameters, node_id::Int, current_level, t::Float64)::Real
+function get_level(
+    p::Parameters,
+    node_id::Int,
+    current_level::AbstractVector,
+    t::Float64,
+)::Real
     (; basin, level_boundary) = p
-    # since the node_id fields are already Indices, Dictionary creation is instant
-    basin = Dictionary(basin.node_id, current_level)
-    hasindex, token = gettoken(basin, node_id)
+    hasindex, i = id_index(basin.node_id, node_id)
     return if hasindex
-        gettokenvalue(basin, token)
+        current_level[i]
     else
-        boundary = Dictionary(level_boundary.node_id, level_boundary.level)
-        boundary[node_id](t)
+        hasindex, i = id_index(level_boundary.node_id, node_id)
+        level_boundary.level[i](t)
     end
 end
 
 "Get the index of an ID in a set of indices."
-function id_index(ids::Indices{Int}, id::Int)
-    # There might be a better approach for this, this feels too internal
-    # the second return is the token, a Tuple{Int, Int}
-    hasindex, (_, idx) = gettoken(ids, id)
-    return hasindex, idx
+function id_index(ids::Indices{Int}, id::Int)::Tuple{Bool, Int}
+    # We avoid creating Dictionary here since it converts the values to a Vector,
+    # leading to allocations when used with PreallocationTools's ReinterpretArrays.
+    hasindex, (_, i) = gettoken(ids, id)
+    return hasindex, i
 end
 
 "Return the bottom elevation of the basin with index i, or nothing if it doesn't exist"
 function basin_bottom(basin::Basin, node_id::Int)::Union{Float64, Nothing}
-    basin = Dictionary(basin.node_id, basin.level)
-    hasindex, token = gettoken(basin, node_id)
+    hasindex, i = id_index(basin.node_id, node_id)
     return if hasindex
         # get level(storage) interpolation function
-        level_discrete = gettokenvalue(basin, token)
+        level_discrete = basin.level[i]
         # and return the first level in this vector, representing the bottom
         first(level_discrete)
     else

From 5205db61b80203f09d36e20fdd0630f658f2ad75 Mon Sep 17 00:00:00 2001
From: Martijn Visser <mgvisser@gmail.com>
Date: Fri, 8 Sep 2023 18:19:28 +0200
Subject: [PATCH 4/4] level_boundary.node_id are not Indices

Perhaps we should make them Indices at some point, if the O(log n) time spent doing searchsorted is much more than the hash table lookup.
---
 core/src/utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/utils.jl b/core/src/utils.jl
index d3ebcda82..d4b80baaa 100644
--- a/core/src/utils.jl
+++ b/core/src/utils.jl
@@ -417,7 +417,7 @@ function get_level(
     return if hasindex
         current_level[i]
     else
-        hasindex, i = id_index(level_boundary.node_id, node_id)
+        i = findsorted(level_boundary.node_id, node_id)::Int
         level_boundary.level[i](t)
     end
 end