Performance Optimizations throughouth the codebase (#68)

* Minimally faster projection matrix * Minimally faster prior computation * Remove an old unused line of code * Fix a wrong check; diffusions are now faster for the EK0 * Specify matrices instead of letting it compute zero matmuls * Pass the diffusion as type, not as symbol * Separate the `copy`s and `similar`s from the cache creation * Hard-code `X_A_Xt` for diagonal X'es * Make `A` a full matrix since Octavian is that much faster * Use floats in `_matmul!(C, A, B, a, b)` * Reduce allocations in `estimate_diffusion(DynamicDiffusion)` * Optimize the `update!` step some more * More usage of `_matmul!` * Use Octavian.jl's matmul! for a wider range of matrices and vectors * Move the diffusion back to being specified with a symbol * Remove the fixedMAP diffusion from the tests * Fix `update!` for the EK0 * Remove the `OctavianCompatibleMatrices` type * Fix the usage of `update!` in the RungeKuttaInit() * Fix the allocating convenience `update!` function * Fix some filtering tests that passed LowerTriangular matrices * Update the benchmark.ipynb * Fix an issue by using mul! instead of matmul! I'm really not sure what goes wrong there, but this seems to fix it... * Make the prior more stable again, using the elType that is supplied * Update the benchmark once more
nathanaelbosch · Sep 9, 2021 · 535e52c · 535e52c
1 parent 803ec17
commit 535e52c
Show file tree

Hide file tree

Showing 14 changed files with 1,765 additions and 1,743 deletions.
diff --git a/benchmarks/multi-language-benchmark.ipynb b/benchmarks/multi-language-benchmark.ipynb
diff --git a/src/ProbNumDiffEq.jl b/src/ProbNumDiffEq.jl
@@ -35,18 +35,26 @@ using ForwardDiff
 using Tullio
 import Octavian: matmul!
 # Define some fallbacks
-_matmul!(C::VecOrMat{T}, A::VecOrMat{T}, B::VecOrMat{T}) where {
-    T <: Union{Bool, Float16, Float32, Float64,
-               Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8}} =
-                   matmul!(C, A, B)
-_matmul!(C::VecOrMat{T}, A::VecOrMat{T}, B::VecOrMat{T}, a::T, b::T) where {T <: Union{
-             Bool, Float16, Float32, Float64,
-             Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8}} =
-                 matmul!(C, A, B, a, b)
+const OctavianCompatibleEltypes = Union{
+    Bool, Float16, Float32, Float64,
+    Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8}
+_matmul!(C::AbstractVecOrMat{T},
+         A::AbstractVecOrMat{T},
+         B::AbstractVecOrMat{T}
+         ) where T <: OctavianCompatibleEltypes = matmul!(C, A, B)
+_matmul!(C::AbstractVecOrMat{T},
+         A::AbstractVecOrMat{T},
+         B::AbstractVecOrMat{T},
+         a::T, b::T) where T <: OctavianCompatibleEltypes = matmul!(C, A, B, a, b)
 _matmul!(C, A, B) = mul!(C, A, B)
 _matmul!(C, A, B, a, b) = mul!(C, A, B, a, b)
-_matmul!(C::AbstractMatrix, A::AbstractMatrix, B::Diagonal) = (C .= A .* B.diag')
-_matmul!(C::AbstractMatrix, A::Diagonal, B::AbstractMatrix) = (C .= A.diag .* B)
+_matmul!(C::AbstractMatrix{T}, A::AbstractMatrix{T}, B::Diagonal{T}
+         ) where T <: OctavianCompatibleEltypes = (C .= A .* B.diag')
+_matmul!(C::AbstractMatrix{T}, A::Diagonal{T}, B::AbstractMatrix{T}
+         ) where T <: OctavianCompatibleEltypes = (C .= A.diag .* B)
+_matmul!(C::Diagonal{T}, A::AbstractMatrix{T}, B::AbstractMatrix{T}
+         ) where T <: OctavianCompatibleEltypes =
+             mul!(C, A, B)
 
 
 # @reexport using PSDMatrices

diff --git a/src/caches.jl b/src/caches.jl
@@ -92,7 +92,8 @@ function OrdinaryDiffEq.alg_cache(
 
     A, Q = ibm(d, q, uElType)
 
-    x0 = Gaussian(zeros(uElType, D), SRMatrix(Matrix(uElType(1.0)*I, D, D)))
+    x0 = Gaussian(zeros(uElType, D),
+                  SRMatrix(Matrix(uElType(1.0)*I, D, D), Matrix(uElType(1.0)*I, D, D)))
 
     # Measurement model
     R = zeros(uElType, d, d)
@@ -106,28 +107,38 @@ function OrdinaryDiffEq.alg_cache(
     v = similar(h)
     S = alg isa EK0 ?
         SRMatrix(zeros(uElType, d, D), Diagonal(zeros(uElType, d, d))) :
-        SRMatrix(zeros(uElType, d, D))
+        SRMatrix(zeros(uElType, d, D), zeros(uElType, d, d))
     measurement = Gaussian(v, S)
     pu_tmp =
         f isa DynamicalODEFunction ?
         Gaussian(zeros(uElType, 2d), SRMatrix(zeros(uElType, 2d, D))) :
         similar(measurement)
     K = zeros(uElType, D, d)
     G = zeros(uElType, D, D)
-    C1 = SRMatrix(zeros(uElType, D, 2D))
-    C2 = SRMatrix(zeros(uElType, D, 3D))
+    C1 = SRMatrix(zeros(uElType, D, 2D), zeros(uElType, D, D))
+    C2 = SRMatrix(zeros(uElType, D, 3D), zeros(uElType, D, D))
     covmatcache = similar(G)
 
-    diffusion_models = Dict(
-        :dynamic => DynamicDiffusion(),
-        :dynamicMV => MVDynamicDiffusion(),
-        :fixed => FixedDiffusion(),
-        :fixedMV => MVFixedDiffusion(),
-        :fixedMAP => MAPFixedDiffusion(),
-    )
-    diffmodel = diffusion_models[alg.diffusionmodel]
+    diffmodel = alg.diffusionmodel == :dynamic ? DynamicDiffusion() :
+        alg.diffusionmodel == :fixed ? FixedDiffusion() :
+        alg.diffusionmodel == :dynamicMV ? MVDynamicDiffusion() :
+        alg.diffusionmodel == :fixedMV ? MVFixedDiffusion() :
+        error("The specified diffusion could not be recognized! Use e.g. `:dynamic`.")
+
     initdiff = initial_diffusion(diffmodel, d, q, uEltypeNoUnits)
 
+    Ah, Qh = copy(A), copy(Q)
+    u_pred = similar(u)
+    u_filt = similar(u)
+    tmp = similar(u)
+    x_pred = copy(x0)
+    x_filt = similar(x0)
+    x_tmp = similar(x0)
+    x_tmp2 = similar(x0)
+    m_tmp = similar(measurement)
+    K2 = similar(K)
+    G2 = similar(G)
+    err_tmp = similar(du)
     return GaussianODEFilterCache{
         typeof(R), typeof(Proj), typeof(SolProj),
         typeof(P), typeof(PI),
@@ -137,16 +148,16 @@ function OrdinaryDiffEq.alg_cache(
         typeof(C1),
     }(
         # Constants
-        d, q, A, Q, copy(A), copy(Q), diffmodel, R, Proj, SolProj,
+        d, q, A, Q, Ah, Qh, diffmodel, R, Proj, SolProj,
         P, PI,
         E0, E1, E2,
         # Mutable stuff
-        u, similar(u), similar(u), copy(u),
-        x0, copy(x0), similar(x0), similar(x0), similar(x0),
-        measurement, similar(measurement), pu_tmp,
-        H, du, ddu, K, similar(K), G, similar(G),
+        u, u_pred, u_filt, tmp,
+        x0, x_pred, x_filt, x_tmp, x_tmp2,
+        measurement, m_tmp, pu_tmp,
+        H, du, ddu, K, K2, G, G2,
         covmatcache, initdiff, initdiff,
-        similar(du),
+        err_tmp,
         zero(uEltypeNoUnits),
         C1, C2,
     )

diff --git a/src/diffusions.jl b/src/diffusions.jl
@@ -74,11 +74,13 @@ function estimate_diffusion(kind::DynamicDiffusion, integ)
     # @assert all(R .== 0) "The dynamic-diffusion assumes R==0!"
     z = measurement.μ
     HQH = X_A_Xt!(m_tmp.Σ, Qh, H)
-    if HQH isa Diagonal
+    if HQH.mat isa Diagonal
         σ² = dot(z ./ HQH.mat.diag, z) / d
         return σ², σ²
     else
-        σ² = z' * (Symmetric(HQH.mat) \ z) / d
+        C = cholesky!(HQH.mat)
+        e = ldiv!(m_tmp.μ, C, z)
+        σ² = dot(z, e) / d
         return σ², σ²
     end
 end

diff --git a/src/filtering.jl b/src/filtering.jl
@@ -34,8 +34,8 @@ function predict_cov!(x_out::SRGaussian, x_curr::SRGaussian, Ah::AbstractMatrix,
     M, L = cachemat.mat, cachemat.squareroot
     D, D = size(Qh.mat)
 
-    mul!(view(L, 1:D, 1:D), Ah, x_curr.Σ.squareroot)
-    mul!(view(L, 1:D, D+1:2D), sqrt.(diffusion), Qh.squareroot)
+    _matmul!(view(L, 1:D, 1:D), Ah, x_curr.Σ.squareroot)
+    _matmul!(view(L, 1:D, D+1:2D), sqrt.(diffusion), Qh.squareroot)
     _matmul!(M, L, L')
     chol = cholesky!(Symmetric(M), check=false)
 
@@ -91,22 +91,22 @@ See also: [`predict`](@ref)
 function update!(x_out::Gaussian, x_pred::Gaussian, measurement::Gaussian,
                  H::AbstractMatrix, R,
                  K1::AbstractMatrix, K2::AbstractMatrix,
-                 M_cache::AbstractMatrix)
+                 M_cache::AbstractMatrix, m_tmp)
     @assert iszero(R)
-    z, S = measurement.μ, measurement.Σ
+    z, S = measurement.μ, copy!(m_tmp.Σ, measurement.Σ)
     m_p, P_p = x_pred.μ, x_pred.Σ
     D = length(m_p)
 
-    S_inv = inv(S)
-    # K = P_p * H' * S_inv
-    K1 = _matmul!(K1, Matrix(P_p), H')
-    K = _matmul!(K2, K1, S_inv)
+    # K = P_p * H' / S
+    S_chol = cholesky!(S)
+    K = _matmul!(K1, Matrix(P_p), H')
+    rdiv!(K, S_chol)
 
     # x_out.μ .= m_p .+ K * (0 .- z)
     x_out.μ .= m_p .- _matmul!(x_out.μ, K, z)
 
     # M_cache .= I(D) .- mul!(M_cache, K, H)
-    _matmul!(M_cache, K, H, -1, 0)
+    _matmul!(M_cache, K, H, -1.0, 0.0)
     @inbounds @simd ivdep for i in 1:D
         M_cache[i, i] += 1
     end
@@ -122,7 +122,8 @@ function update!(x_out::Gaussian, x_pred::Gaussian, measurement::Gaussian,
     K1 = zeros(D, d)
     K2 = zeros(D, d)
     M_cache = zeros(D, D)
-    return update!(x_out, x_pred, measurement, H, R, K1, K2, M_cache)
+    m_tmp = copy(measurement)
+    return update!(x_out, x_pred, measurement, H, R, K1, K2, M_cache, m_tmp)
 end
 """
     update(x_pred, measurement, H, R=0)

diff --git a/src/initialization/common.jl b/src/initialization/common.jl
@@ -36,11 +36,11 @@ function condition_on!(x::SRGaussian, H::AbstractMatrix, data::AbstractVector,
     Kcache2 .= Kcache ./ S.diag'
     K = Kcache2
 
-    _matmul!(x.μ, K, data - z, 1, 1)
+    _matmul!(x.μ, K, data - z, 1.0, 1.0)
     # x.μ .+= K*(data - z)
 
     D = length(x.μ)
-    _matmul!(Mcache, K, H, -1, 0)
+    mul!(Mcache, K, H, -1.0, 0.0)
     @inbounds @simd ivdep for i in 1:D
         Mcache[i, i] += 1
     end

diff --git a/src/initialization/rungekutta.jl b/src/initialization/rungekutta.jl
@@ -89,7 +89,7 @@ function rk_init_improve(integ, cache::GaussianODEFilterCache, ts, us, dt)
         X_A_Xt!(measurement.Σ, x_pred.Σ, H)
 
         update!(x_filt, x_pred, measurement, H, 0,
-                cache.K1, cache.K2, cache.x_tmp2.Σ.mat)
+                cache.K1, cache.K2, cache.x_tmp2.Σ.mat, cache.m_tmp)
         push!(filts, copy(x_filt))
 
         x = x_filt

diff --git a/src/perform_step.jl b/src/perform_step.jl
@@ -148,9 +148,8 @@ function evaluate_ode!(integ, x_pred, t, second_order::Val{false})
         end
 
         integ.destats.njacs += 1
-        _matmul!(H, ddu, -E0)
         H .= E1
-        _matmul!(H, ddu, E0, -1, 1)
+        _matmul!(H, ddu, E0, -1.0, 1.0)
     else
         # H .= E1 # This is already the case!
     end
@@ -222,8 +221,8 @@ compute_measurement_covariance!(cache) =
 
 function update!(integ, prediction)
     @unpack measurement, H, R, x_filt = integ.cache
-    @unpack K1, K2, x_tmp2 = integ.cache
-    update!(x_filt, prediction, measurement, H, R, K1, K2, x_tmp2.Σ.mat)
+    @unpack K1, K2, x_tmp2, m_tmp = integ.cache
+    update!(x_filt, prediction, measurement, H, R, K1, K2, x_tmp2.Σ.mat, m_tmp)
     # assert_nonnegative_diagonal(x_filt.Σ)
     return x_filt
 end

diff --git a/src/priors.jl b/src/priors.jl
@@ -16,20 +16,19 @@ function ibm(d::Integer, q::Integer, elType=typeof(1.0))
     end
     A = kron(I(d), A_breve)
     @assert istriu(A)
-    A = UpperTriangular(A)
+    # A = UpperTriangular(A)
 
     # Make Q
     Q_breve = zeros(elType, q+1, q+1)
     @fastmath _transdiff_ibm_element(row::Int, col::Int) =
         one(elType) / (2 * q + 1 - row - col)
     @simd ivdep for col in 0:q
-        @simd ivdep for row in col:q
+        @simd ivdep for row in 0:q
             val = _transdiff_ibm_element(row, col)
-            @inbounds Q_breve[1 + col, 1 + row] = val
             @inbounds Q_breve[1 + row, 1 + col] = val
         end
     end
-    QL_breve = cholesky(Q_breve).L
+    QL_breve = cholesky!(Q_breve).L
     QL = kron(I(d), QL_breve)
     Q = SRMatrix(QL)
 

diff --git a/src/projection.jl b/src/projection.jl
@@ -1,4 +1,14 @@
-function projection(d, q, elType=typeof(1.0))
-    Proj(deriv) = kron(diagm(0 => ones(elType, d)), [i==(deriv+1) ? 1 : 0 for i in 1:q+1]')
+function projection(d::Integer, q::Integer, elType=typeof(1.0))
+    # Proj(deriv) = kron(diagm(0 => ones(elType, d)), [i==(deriv+1) ? 1 : 0 for i in 1:q+1]')
+
+    # Slightly faster version of the above:
+    D = d*(q+1)
+    Proj(deriv) = begin
+        P = zeros(elType, d, D)
+        @simd ivdep for i in deriv*d + 1 : D+1 : d*D
+            @inbounds P[i] = 1
+        end
+        return P
+    end
     return Proj
 end
diff --git a/src/squarerootmatrix.jl b/src/squarerootmatrix.jl
@@ -44,9 +44,16 @@ X_A_Xt!(out::SquarerootMatrix, M::SquarerootMatrix, X::AbstractMatrix) = begin
     _matmul!(out.mat, out.squareroot, out.squareroot')
     return out
 end
+X_A_Xt!(out::SquarerootMatrix, M::SquarerootMatrix, D::Diagonal) = begin
+    # Basically just to optimize PI*Q*PI
+    out.squareroot .= D.diag .* M.squareroot
+    out.mat .= D.diag .* M.mat .* D.diag'
+    return out
+end
 
 
 Base.inv(M::SquarerootMatrix)= Base.inv(M.mat)
 LinearAlgebra.diag(M::SquarerootMatrix) = LinearAlgebra.diag(M.mat)
 LinearAlgebra.cholesky(M::SquarerootMatrix) = LinearAlgebra.cholesky(M.mat)
+LinearAlgebra.cholesky!(M::SquarerootMatrix) = LinearAlgebra.cholesky!(M.mat)
 LinearAlgebra.logdet(M::SquarerootMatrix) = LinearAlgebra.logdet(M.mat)
diff --git a/test/correctness.jl b/test/correctness.jl
@@ -21,7 +21,7 @@ for (prob, probname) in [
         true_sol = solve(remake(prob, u0=big.(prob.u0)), Tsit5(), abstol=1e-20, reltol=1e-20)
 
         for Alg in (EK0, EK1),
-            diffusion in [:fixed, :dynamic, :fixedMAP, :fixedMV, :dynamicMV],
+            diffusion in [:fixed, :dynamic, :fixedMV, :dynamicMV],
             init in [TaylorModeInit(), RungeKuttaInit()],
             q in [2, 3, 5]
 
@@ -51,7 +51,7 @@ for (prob, probname) in [
         true_dense_vals = true_sol.(t_eval)
 
         for Alg in (EK0, EK1),
-            diffusion in [:fixed, :dynamic, :fixedMAP, :fixedMV, :dynamicMV],
+            diffusion in [:fixed, :dynamic, :fixedMV, :dynamicMV],
             init in [TaylorModeInit(), RungeKuttaInit()],
             q in [2, 3, 5]
 

diff --git a/test/diffusions.jl b/test/diffusions.jl
@@ -33,10 +33,10 @@ import DiffEqProblemLibrary.ODEProblemLibrary: prob_ode_linear, prob_ode_2Dlinea
         @test sol.u[end] ≈ true_sol.(sol.t)[end]
     end
 
-    @testset "Time-Fixed Diffusion MAP" begin
-        sol = solve(prob, EK0(diffusionmodel=:fixedMAP, smooth=false),
-                    dense=false, adaptive=false, dt=1e-4)
-        @test sol.u[end] ≈ true_sol.(sol.t)[end]
-    end
+    # @testset "Time-Fixed Diffusion MAP" begin
+    #     sol = solve(prob, EK0(diffusionmodel=:fixedMAP, smooth=false),
+    #                 dense=false, adaptive=false, dt=1e-4)
+    #     @test sol.u[end] ≈ true_sol.(sol.t)[end]
+    # end
 
 end
diff --git a/test/filtering.jl b/test/filtering.jl
@@ -11,11 +11,11 @@ using LinearAlgebra
     # Setup
     d = 5
     m = rand(d)
-    L_p = LowerTriangular(rand(d,d))
+    L_p = Matrix(LowerTriangular(rand(d,d)))
     P = L_p*L_p'
 
     A = rand(d,d)
-    L_Q = LowerTriangular(rand(d,d))
+    L_Q = Matrix(LowerTriangular(rand(d,d)))
     Q = L_Q*L_Q'
 
     # PREDICT
@@ -53,7 +53,7 @@ end
     # Setup
     d = 5
     m_p = rand(d)
-    L_P_p = LowerTriangular(rand(d,d))
+    L_P_p = Matrix(LowerTriangular(rand(d,d)))
     P_p = L_P_p * L_P_p'
 
     # Measure
@@ -95,11 +95,11 @@ end
     # Setup
     d = 5
     m, m_s = rand(d), rand(d)
-    L_P, L_P_s = LowerTriangular(rand(d,d)), LowerTriangular(rand(d,d))
+    L_P, L_P_s = Matrix(LowerTriangular(rand(d,d))), Matrix(LowerTriangular(rand(d,d)))
     P, P_s = L_P*L_P', L_P_s*L_P_s'
 
     A = rand(d,d)
-    L_Q = LowerTriangular(rand(d,d))
+    L_Q = Matrix(LowerTriangular(rand(d,d)))
     Q = L_Q*L_Q'
     Q_SR = SRMatrix(L_Q)