From 8869f802790937208d553375f55fe0a4a400c73d Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Thu, 7 Sep 2023 01:17:41 -0400
Subject: [PATCH 01/19] Towards a cleaner and more maintainable internals of
 NonlinearSolve.jl

---
 .JuliaFormatter.toml  |    3 +-
 Project.toml          |    4 +-
 src/NonlinearSolve.jl |   85 ++--
 src/ad.jl             |   19 +-
 src/jacobian.jl       |  191 ++------
 src/levenberg.jl      |  361 +++++---------
 src/raphson.jl        |  185 +++-----
 src/trustRegion.jl    | 1056 +++++++++++++++++++----------------------
 src/utils.jl          |  139 +++---
 9 files changed, 836 insertions(+), 1207 deletions(-)

diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 9c7935911..320e0c073 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1,2 +1,3 @@
 style = "sciml"
-format_markdown = true
\ No newline at end of file
+format_markdown = true
+annotate_untyped_fields_with_any = false
diff --git a/Project.toml b/Project.toml
index ed0b27f95..db9ad0d35 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,10 +1,12 @@
 name = "NonlinearSolve"
 uuid = "8913a72c-1f9b-4ce2-8d82-65094dcecaec"
 authors = ["SciML"]
-version = "1.10.0"
+version = "1.11.0"
 
 [deps]
+ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
+ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471"
 DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
 EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl
index cae730bc3..38a4b6142 100644
--- a/src/NonlinearSolve.jl
+++ b/src/NonlinearSolve.jl
@@ -1,38 +1,41 @@
 module NonlinearSolve
-if isdefined(Base, :Experimental) &&
-   isdefined(Base.Experimental, Symbol("@max_methods"))
+
+if isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@max_methods"))
     @eval Base.Experimental.@max_methods 1
 end
-using Reexport
-using UnPack: @unpack
-using FiniteDiff, ForwardDiff
-using ForwardDiff: Dual
-using LinearAlgebra
-using StaticArraysCore
-using RecursiveArrayTools
-import EnumX
-import ArrayInterface
-import LinearSolve
-using DiffEqBase
-using SparseDiffTools
-
-@reexport using SciMLBase
-using SciMLBase: NLStats
-@reexport using SimpleNonlinearSolve
-
-import SciMLBase: _unwrap_val
-
-abstract type AbstractNonlinearSolveAlgorithm <: SciMLBase.AbstractNonlinearAlgorithm end
-abstract type AbstractNewtonAlgorithm{CS, AD, FDT, ST, CJ} <:
-              AbstractNonlinearSolveAlgorithm end
-
-function SciMLBase.__solve(prob::NonlinearProblem,
-    alg::AbstractNonlinearSolveAlgorithm, args...;
-    kwargs...)
+
+using DiffEqBase, LinearAlgebra, LinearSolve, SparseDiffTools
+import ForwardDiff
+
+import ADTypes: AbstractFiniteDifferencesMode
+import ArrayInterface: undefmatrix
+import ConcreteStructs: @concrete
+import EnumX: @enumx
+import ForwardDiff: Dual
+import LinearSolve: ComposePreconditioner, InvPreconditioner, needs_concrete_A
+import RecursiveArrayTools: AbstractVectorOfArray, recursivecopy!, recursivefill!
+import Reexport: @reexport
+import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace
+import SparseDiffTools: __init_𝒥
+import StaticArraysCore: StaticArray, SVector
+import UnPack: @unpack
+
+@reexport using ADTypes, SciMLBase, SimpleNonlinearSolve
+
+const AbstractSparseADType = Union{ADTypes.AbstractSparseFiniteDifferences,
+    ADTypes.AbstractSparseForwardMode, ADTypes.AbstractSparseReverseMode}
+
+abstract type AbstractNonlinearSolveAlgorithm <: AbstractNonlinearAlgorithm end
+abstract type AbstractNewtonAlgorithm{CJ, AD} <: AbstractNonlinearSolveAlgorithm end
+
+function SciMLBase.__solve(prob::NonlinearProblem, alg::AbstractNonlinearSolveAlgorithm,
+    args...; kwargs...)
     cache = init(prob, alg, args...; kwargs...)
-    sol = solve!(cache)
+    return solve!(cache)
 end
 
+# FIXME: Scalar Case is Completely Broken
+
 include("utils.jl")
 include("raphson.jl")
 include("trustRegion.jl")
@@ -44,23 +47,23 @@ import PrecompileTools
 
 PrecompileTools.@compile_workload begin
     for T in (Float32, Float64)
-        prob = NonlinearProblem{false}((u, p) -> u .* u .- p, T(0.1), T(2))
+        # prob = NonlinearProblem{false}((u, p) -> u .* u .- p, T(0.1), T(2))
 
-        precompile_algs = if VERSION >= v"1.7"
-            (NewtonRaphson(), TrustRegion(), LevenbergMarquardt())
-        else
-            (NewtonRaphson(),)
-        end
+        #         precompile_algs = if VERSION ≥ v"1.7"
+        #             (NewtonRaphson(), TrustRegion(), LevenbergMarquardt())
+        #         else
+        #             (NewtonRaphson(),)
+        #         end
 
-        for alg in precompile_algs
-            solve(prob, alg, abstol = T(1e-2))
-        end
+        #         for alg in precompile_algs
+        #             solve(prob, alg, abstol = T(1e-2))
+        #         end
 
         prob = NonlinearProblem{true}((du, u, p) -> du[1] = u[1] * u[1] - p[1], T[0.1],
             T[2])
-        for alg in precompile_algs
-            solve(prob, alg, abstol = T(1e-2))
-        end
+        #         for alg in precompile_algs
+        #             solve(prob, alg, abstol = T(1e-2))
+        #         end
     end
 end
 
diff --git a/src/ad.jl b/src/ad.jl
index 0dad74c56..faa8c9f04 100644
--- a/src/ad.jl
+++ b/src/ad.jl
@@ -23,22 +23,17 @@ function scalar_nlsolve_ad(prob, alg, args...; kwargs...)
     return sol, partials
 end
 
-function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, StaticArraysCore.SVector},
-        iip,
-        <:Dual{T, V, P}},
-    alg::AbstractNewtonAlgorithm,
-    args...; kwargs...) where {iip, T, V, P}
+function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector}, iip,
+        <:Dual{T, V, P}}, alg::AbstractNewtonAlgorithm, args...;
+    kwargs...) where {iip, T, V, P}
     sol, partials = scalar_nlsolve_ad(prob, alg, args...; kwargs...)
     return SciMLBase.build_solution(prob, alg, Dual{T, V, P}(sol.u, partials), sol.resid;
-        retcode = sol.retcode)
+        sol.retcode)
 end
-function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, StaticArraysCore.SVector},
-        iip,
-        <:AbstractArray{<:Dual{T, V, P}}},
-    alg::AbstractNewtonAlgorithm,
-    args...;
+function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector}, iip,
+        <:AbstractArray{<:Dual{T, V, P}}}, alg::AbstractNewtonAlgorithm, args...;
     kwargs...) where {iip, T, V, P}
     sol, partials = scalar_nlsolve_ad(prob, alg, args...; kwargs...)
     return SciMLBase.build_solution(prob, alg, Dual{T, V, P}(sol.u, partials), sol.resid;
-        retcode = sol.retcode)
+        sol.retcode)
 end
diff --git a/src/jacobian.jl b/src/jacobian.jl
index 8296069e0..dfa8b1212 100644
--- a/src/jacobian.jl
+++ b/src/jacobian.jl
@@ -1,120 +1,72 @@
-struct JacobianWrapper{fType, pType}
-    f::fType
-    p::pType
+@concrete struct JacobianWrapper
+    f
+    p
 end
 
 (uf::JacobianWrapper)(u) = uf.f(u, uf.p)
 (uf::JacobianWrapper)(res, u) = uf.f(res, u, uf.p)
 
-struct NonlinearSolveTag end
-
-function sparsity_colorvec(f, x)
-    sparsity = f.sparsity
-    colorvec = DiffEqBase.has_colorvec(f) ? f.colorvec :
-               (isnothing(sparsity) ? (1:length(x)) : matrix_colors(sparsity))
-    sparsity, colorvec
-end
-
-function jacobian_finitediff_forward!(J, f, x, jac_config, forwardcache, cache)
-    (FiniteDiff.finite_difference_jacobian!(J, f, x, jac_config, forwardcache);
-    maximum(jac_config.colorvec))
-end
-function jacobian_finitediff!(J, f, x, jac_config, cache)
-    (FiniteDiff.finite_difference_jacobian!(J, f, x, jac_config);
-    2 * maximum(jac_config.colorvec))
-end
+# function sparsity_colorvec(f, x)
+#     sparsity = f.sparsity
+#     colorvec = DiffEqBase.has_colorvec(f) ? f.colorvec :
+#                (isnothing(sparsity) ? (1:length(x)) : matrix_colors(sparsity))
+#     sparsity, colorvec
+# end
 
 # NoOp for Jacobian if it is not a Abstract Array -- For eg, JacVec Operator
-jacobian!(J, cache) = J
-function jacobian!(J::AbstractMatrix{<:Number}, cache)
-    f = cache.f
-    uf = cache.uf
-    x = cache.u
-    fx = cache.fu
-    jac_config = cache.jac_config
-    alg = cache.alg
-
-    if SciMLBase.has_jac(f)
-        f.jac(J, x, cache.p)
-    elseif alg_autodiff(alg)
-        forwarddiff_color_jacobian!(J, uf, x, jac_config)
-        #cache.destats.nf += 1
+jacobian!!(J, _) = J
+# `!!` follows the BangBang.jl convention: with an out-of-place `f.jac` the Jacobian is a
+# freshly returned array rather than a mutated `J`, which avoids a wasteful `copyto!`
+function jacobian!!(J::Union{AbstractMatrix{<:Number}, Nothing}, cache)
+    @unpack f, uf, u, p, jac_cache, alg, fu2 = cache
+    iip = isinplace(cache)
+    if iip
+        has_jac(f) ? f.jac(J, u, p) : sparse_jacobian!(J, alg.ad, jac_cache, uf, fu2, u)
     else
-        isforward = alg_difftype(alg) === Val{:forward}
-        if isforward
-            uf(fx, x)
-            #cache.destats.nf += 1
-            tmp = jacobian_finitediff_forward!(J, uf, x, jac_config, fx,
-                cache)
-        else # not forward difference
-            tmp = jacobian_finitediff!(J, uf, x, jac_config, cache)
-        end
-        #cache.destats.nf += tmp
+        return has_jac(f) ? f.jac(u, p) : sparse_jacobian!(J, alg.ad, jac_cache, uf, u)
     end
-    nothing
+    return J
 end
 
-function build_jac_and_jac_config(alg, f::F1, uf::F2, du1, u, tmp, du2) where {F1, F2}
+# Build Jacobian Caches
+function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
+    ::Val{iip}) where {iip}
+    uf = JacobianWrapper(f, p)
+
     haslinsolve = hasfield(typeof(alg), :linsolve)
 
-    has_analytic_jac = SciMLBase.has_jac(f)
+    has_analytic_jac = has_jac(f)
     linsolve_needs_jac = (concrete_jac(alg) === nothing &&
                           (!haslinsolve || (haslinsolve && (alg.linsolve === nothing ||
-                             LinearSolve.needs_concrete_A(alg.linsolve)))))
-    alg_wants_jac = (concrete_jac(alg) !== nothing && concrete_jac(alg))
+                             needs_concrete_A(alg.linsolve)))))
+    alg_wants_jac = (concrete_jac(alg) !== nothing && concrete_jac(alg))
 
+    fu = zero(u)  # TODO: Use Prototype
     if !has_analytic_jac && (linsolve_needs_jac || alg_wants_jac)
-        sparsity, colorvec = sparsity_colorvec(f, u)
-
-        if alg_autodiff(alg)
-            _chunksize = get_chunksize(alg) === Val(0) ? nothing : get_chunksize(alg) # SparseDiffEq uses different convection...
-
-            T = if standardtag(alg)
-                typeof(ForwardDiff.Tag(NonlinearSolveTag(), eltype(u)))
-            else
-                typeof(ForwardDiff.Tag(uf, eltype(u)))
-            end
-            jac_config = ForwardColorJacCache(uf, u, _chunksize; colorvec, sparsity,
-                tag = T)
-        else
-            if alg_difftype(alg) !== Val{:complex}
-                jac_config = FiniteDiff.JacobianCache(tmp, du1, du2, alg_difftype(alg);
-                    colorvec, sparsity)
-            else
-                jac_config = FiniteDiff.JacobianCache(Complex{eltype(tmp)}.(tmp),
-                    Complex{eltype(du1)}.(du1), nothing, alg_difftype(alg), eltype(u);
-                    colorvec, sparsity)
-            end
-        end
+        # TODO: We need an Upstream Mode to allow using known sparsity and colorvec
+        # TODO: We can use the jacobian prototype here
+        sd = typeof(alg.ad) <: AbstractSparseADType ? SymbolicsSparsityDetection() :
+             NoSparsityDetection()
+        jac_cache = iip ? sparse_jacobian_cache(alg.ad, sd, uf, fu, u) :
+                    sparse_jacobian_cache(alg.ad, sd, uf, u; fx=fu)
     else
-        jac_config = nothing
+        jac_cache = nothing
     end
 
     J = if !linsolve_needs_jac
         # We don't need to construct the Jacobian
-        JacVec(uf, u; autodiff = alg_autodiff(alg) ? AutoForwardDiff() : AutoFiniteDiff())
+        JacVec(uf, u; autodiff = alg.ad)
     else
-        if f.jac_prototype === nothing
-            ArrayInterface.undefmatrix(u)
+        if has_analytic_jac
+            iip ? undefmatrix(u) : nothing
         else
-            f.jac_prototype
+            f.jac_prototype === nothing ? __init_𝒥(jac_cache) : f.jac_prototype
         end
     end
 
-    return J, jac_config
-end
-
-# Build Jacobian Caches
-function jacobian_caches(alg::Union{NewtonRaphson, LevenbergMarquardt, TrustRegion}, f, u,
-    p, ::Val{true})
-    uf = JacobianWrapper(f, p)
-
-    du1 = zero(u)
-    du2 = zero(u)
-    tmp = zero(u)
-    J, jac_config = build_jac_and_jac_config(alg, f, uf, du1, u, tmp, du2)
-
+    # FIXME: Assumes same sized `u` and `fu` -- Incorrect Assumption for Levenberg
     linprob = LinearProblem(J, _vec(zero(u)); u0 = _vec(zero(u)))
+
     weight = similar(u)
     recursivefill!(weight, true)
 
@@ -122,64 +74,5 @@ function jacobian_caches(alg::Union{NewtonRaphson, LevenbergMarquardt, TrustRegi
             nothing)..., weight)
     linsolve = init(linprob, alg.linsolve; alias_A = true, alias_b = true, Pl, Pr)
 
-    uf, linsolve, J, du1, jac_config
-end
-
-function get_chunksize(jac_config::ForwardDiff.JacobianConfig{
-    T,
-    V,
-    N,
-    D,
-}) where {T, V, N, D
-}
-    Val(N)
-end # don't degrade compile time information to runtime information
-
-function jacobian_finitediff(f, x, ::Type{diff_type}, dir, colorvec, sparsity,
-    jac_prototype) where {diff_type}
-    (FiniteDiff.finite_difference_derivative(f, x, diff_type, eltype(x), dir = dir), 2)
-end
-function jacobian_finitediff(f, x::AbstractArray, ::Type{diff_type}, dir, colorvec,
-    sparsity, jac_prototype) where {diff_type}
-    f_in = diff_type === Val{:forward} ? f(x) : similar(x)
-    ret_eltype = eltype(f_in)
-    J = FiniteDiff.finite_difference_jacobian(f, x, diff_type, ret_eltype, f_in,
-        dir = dir, colorvec = colorvec,
-        sparsity = sparsity,
-        jac_prototype = jac_prototype)
-    return J, _nfcount(maximum(colorvec), diff_type)
-end
-function jacobian(cache, f::F) where {F}
-    x = cache.u
-    alg = cache.alg
-    uf = cache.uf
-    local tmp
-
-    if DiffEqBase.has_jac(cache.f)
-        J = f.jac(cache.u, cache.p)
-    elseif alg_autodiff(alg)
-        J, tmp = jacobian_autodiff(uf, x, cache.f, alg)
-    else
-        jac_prototype = cache.f.jac_prototype
-        sparsity, colorvec = sparsity_colorvec(cache.f, x)
-        dir = true
-        J, tmp = jacobian_finitediff(uf, x, alg_difftype(alg), dir, colorvec, sparsity,
-            jac_prototype)
-    end
-    J
-end
-
-jacobian_autodiff(f, x, nonlinfun, alg) = (ForwardDiff.derivative(f, x), 1, alg)
-function jacobian_autodiff(f, x::AbstractArray, nonlinfun, alg)
-    jac_prototype = nonlinfun.jac_prototype
-    sparsity, colorvec = sparsity_colorvec(nonlinfun, x)
-    maxcolor = maximum(colorvec)
-    chunk_size = get_chunksize(alg) === Val(0) ? nothing : get_chunksize(alg)
-    num_of_chunks = chunk_size === nothing ?
-                    Int(ceil(maxcolor /
-                             SparseDiffTools.getsize(ForwardDiff.pickchunksize(maxcolor)))) :
-                    Int(ceil(maxcolor / _unwrap_val(chunk_size)))
-    (forwarddiff_color_jacobian(f, x, colorvec = colorvec, sparsity = sparsity,
-            jac_prototype = jac_prototype, chunksize = chunk_size),
-        num_of_chunks)
+    return uf, linsolve, J, fu, jac_cache
 end
diff --git a/src/levenberg.jl b/src/levenberg.jl
index db8955f4a..721e08cd3 100644
--- a/src/levenberg.jl
+++ b/src/levenberg.jl
@@ -1,113 +1,82 @@
 """
-```julia
-LevenbergMarquardt(; chunk_size = Val{0}(),
-                    autodiff = Val{true}(),
-                    standardtag = Val{true}(),
-                    concrete_jac = nothing,
-                    diff_type = Val{:forward},
-                    linsolve = nothing, precs = DEFAULT_PRECS,
-                    damping_initial::Real = 1.0,
-                    damping_increase_factor::Real = 2.0,
-                    damping_decrease_factor::Real = 3.0,
-                    finite_diff_step_geodesic::Real = 0.1,
-                    α_geodesic::Real = 0.75,
-                    b_uphill::Real = 1.0,
-                    min_damping_D::AbstractFloat = 1e-8)
-```
+    LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+        precs = DEFAULT_PRECS, damping_initial::Real = 1.0,
+        damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0,
+        finite_diff_step_geodesic::Real = 0.1, α_geodesic::Real = 0.75,
+        b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, adkwargs...)
 
 An advanced Levenberg-Marquardt implementation with the improvements suggested in the
 [paper](https://arxiv.org/abs/1201.5885) "Improvements to the Levenberg-Marquardt
 algorithm for nonlinear least-squares minimization". Designed for large-scale and
 numerically-difficult nonlinear systems.
 
-
 ### Keyword Arguments
 
-- `chunk_size`: the chunk size used by the internal ForwardDiff.jl automatic differentiation
-  system. This allows for multiple derivative columns to be computed simultaneously,
-  improving performance. Defaults to `0`, which is equivalent to using ForwardDiff.jl's
-  default chunk size mechanism. For more details, see the documentation for
-  [ForwardDiff.jl](https://juliadiff.org/ForwardDiff.jl/stable/).
-- `autodiff`: whether to use forward-mode automatic differentiation for the Jacobian.
-  Note that this argument is ignored if an analytical Jacobian is passed, as that will be
-  used instead. Defaults to `Val{true}`, which means ForwardDiff.jl via
-  SparseDiffTools.jl is used by default. If `Val{false}`, then FiniteDiff.jl is used for
-  finite differencing.
-- `standardtag`: whether to use a standardized tag definition for the purposes of automatic
-  differentiation. Defaults to true, which thus uses the `NonlinearSolveTag`. If `Val{false}`,
-  then ForwardDiff's default function naming tag is used, which results in larger stack
-  traces.
-- `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
-  then the Jacobian will not be constructed and instead direct Jacobian-vector products
-  `J*v` are computed using forward-mode automatic differentiation or finite differencing
-  tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
-  for example for a preconditioner, `concrete_jac = true` can be passed in order to force
-  the construction of the Jacobian.
-- `diff_type`: the type of finite differencing used if `autodiff = false`. Defaults to
-  `Val{:forward}` for forward finite differences. For more details on the choices, see the
-  [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl) documentation.
-- `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
-  linear solves within the Newton method. Defaults to `nothing`, which means it uses the
-  LinearSolve.jl default algorithm choice. For more information on available algorithm
-  choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-- `precs`: the choice of preconditioners for the linear solver. Defaults to using no
-  preconditioners. For more information on specifying preconditioners for LinearSolve
-  algorithms, consult the
-  [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-- `damping_initial`: the starting value for the damping factor. The damping factor is
-  inversely proportional to the step size. The damping factor is adjusted during each
-  iteration. Defaults to `1.0`. For more details, see section 2.1 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `damping_increase_factor`: the factor by which the damping is increased if a step is
-  rejected. Defaults to `2.0`. For more details, see section 2.1 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `damping_decrease_factor`: the factor by which the damping is decreased if a step is
-  accepted. Defaults to `3.0`. For more details, see section 2.1 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate
-  the geodesic acceleration. Defaults to `0.1` which means that the step size is
-  approximately 10% of the first-order step. For more details, see section 3 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
-  geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
-  that acceptable steps meet the condition
-  ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
-  acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
-  path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
-  is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
-  effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
-  choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
-  and reject all steps that increase the cost. Although this is a natural and safe choice,
-  it is often not the most efficient. Therefore downhill moves are always accepted, but
-  uphill moves are only conditionally accepted. To decide whether an uphill move will be
-  accepted at each iteration ``i``, we compute
-  ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle
-  between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted
-  step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To
-  specify, uphill moves are accepted if
-  ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
-  iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
-  allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
-  will be accepted. Defaults to `1.0`. For more details, see section 4 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
-  `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
-  where `J` is the Jacobian. It is suggested by
-  [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
-  `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
-
-
-!!! note
-
-    Currently, the linear solver and chunk size choice only applies to in-place defined
-    `NonlinearProblem`s. That is expected to change in the future.
+  - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
+    ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
+    `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
+  - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
+    then the Jacobian will not be constructed and instead direct Jacobian-vector products
+    `J*v` are computed using forward-mode automatic differentiation or finite differencing
+    tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
+    for example for a preconditioner, `concrete_jac = true` can be passed in order to force
+    the construction of the Jacobian.
+  - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) solver used
+    for the linear solves within the Newton method. Defaults to `nothing`, which means it
+    uses the LinearSolve.jl default algorithm choice. For more information on available
+    algorithm choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+  - `precs`: the choice of preconditioners for the linear solver. Defaults to using no
+    preconditioners. For more information on specifying preconditioners for LinearSolve
+    algorithms, consult the
+    [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+  - `damping_initial`: the starting value for the damping factor. The damping factor is
+    inversely proportional to the step size. The damping factor is adjusted during each
+    iteration. Defaults to `1.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `damping_increase_factor`: the factor by which the damping is increased if a step is
+    rejected. Defaults to `2.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `damping_decrease_factor`: the factor by which the damping is decreased if a step is
+    accepted. Defaults to `3.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate
+    the geodesic acceleration. Defaults to `0.1` which means that the step size is
+    approximately 10% of the first-order step. For more details, see section 3 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
+    geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
+    that acceptable steps meet the condition
+    ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
+    acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
+    path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
+    is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
+    effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
+    choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
+    and reject all steps that increase the cost. Although this is a natural and safe choice,
+    it is often not the most efficient. Therefore downhill moves are always accepted, but
+    uphill moves are only conditionally accepted. To decide whether an uphill move will be
+    accepted at each iteration ``i``, we compute
+    ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle
+    between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted
+    step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To
+    specify, uphill moves are accepted if
+    ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
+    iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
+    allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
+    will be accepted. Defaults to `1.0`. For more details, see section 4 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
+    `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
+    where `J` is the Jacobian. It is suggested by
+    [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
+    `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
 """
-struct LevenbergMarquardt{CS, AD, FDT, L, P, ST, CJ, T} <:
-       AbstractNewtonAlgorithm{CS, AD, FDT, ST, CJ}
-    linsolve::L
-    precs::P
+@concrete struct LevenbergMarquardt{CJ, AD, T} <: AbstractNewtonAlgorithm{CJ, AD}
+    ad::AD
+    linsolve
+    precs
     damping_initial::T
     damping_increase_factor::T
     damping_decrease_factor::T
@@ -117,54 +86,36 @@ struct LevenbergMarquardt{CS, AD, FDT, L, P, ST, CJ, T} <:
     min_damping_D::T
 end
 
-function LevenbergMarquardt(; chunk_size = Val{0}(),
-    autodiff = Val{true}(),
-    standardtag = Val{true}(),
-    concrete_jac = nothing,
-    diff_type = Val{:forward},
-    linsolve = nothing,
-    precs = DEFAULT_PRECS,
-    damping_initial::Real = 1.0,
-    damping_increase_factor::Real = 2.0,
-    damping_decrease_factor::Real = 3.0,
-    finite_diff_step_geodesic::Real = 0.1,
-    α_geodesic::Real = 0.75,
-    b_uphill::Real = 1.0,
-    min_damping_D::AbstractFloat = 1e-8)
-    LevenbergMarquardt{_unwrap_val(chunk_size), _unwrap_val(autodiff), diff_type,
-        typeof(linsolve), typeof(precs), _unwrap_val(standardtag),
-        _unwrap_val(concrete_jac),
-        typeof(min_damping_D)}(linsolve, precs,
-        damping_initial,
-        damping_increase_factor,
-        damping_decrease_factor,
-        finite_diff_step_geodesic,
-        α_geodesic,
-        b_uphill,
-        min_damping_D)
+function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+    precs = DEFAULT_PRECS, damping_initial::Real = 1.0, damping_increase_factor::Real = 2.0,
+    damping_decrease_factor::Real = 3.0, finite_diff_step_geodesic::Real = 0.1,
+    α_geodesic::Real = 0.75, b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8,
+    adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)
+    return LevenbergMarquardt{_unwrap_val(concrete_jac)}(ad, linsolve, precs,
+        damping_initial, damping_increase_factor, damping_decrease_factor,
+        finite_diff_step_geodesic, α_geodesic, b_uphill, min_damping_D)
 end
 
-mutable struct LevenbergMarquardtCache{iip, fType, algType, uType, duType, resType, pType,
-    INType, tolType, probType, ufType, L, jType, JC,
-    DᵀDType, λType, lossType,
-}
-    f::fType
-    alg::algType
+@concrete mutable struct LevenbergMarquardtCache{iip, uType, jType, λType, lossType}
+    f
+    alg
     u::uType
-    fu::resType
-    p::pType
-    uf::ufType
-    linsolve::L
+    fu1
+    fu2
+    du
+    p
+    uf
+    linsolve
     J::jType
-    du_tmp::duType
-    jac_config::JC
+    jac_cache
     force_stop::Bool
     maxiters::Int
-    internalnorm::INType
-    retcode::SciMLBase.ReturnCode.T
-    abstol::tolType
-    prob::probType
-    DᵀD::DᵀDType
+    internalnorm
+    retcode::ReturnCode.T
+    abstol
+    prob
+    DᵀD
     JᵀJ::jType
     λ::λType
     λ_factor::λType
@@ -182,75 +133,25 @@ mutable struct LevenbergMarquardtCache{iip, fType, algType, uType, duType, resTy
     δ::uType
     loss_old::lossType
     make_new_J::Bool
-    fu_tmp::resType
+    fu_tmp
     mat_tmp::jType
     stats::NLStats
-
-    function LevenbergMarquardtCache{iip}(f::fType, alg::algType, u::uType, fu::resType,
-        p::pType, uf::ufType, linsolve::L, J::jType,
-        du_tmp::duType, jac_config::JC,
-        force_stop::Bool, maxiters::Int,
-        internalnorm::INType,
-        retcode::SciMLBase.ReturnCode.T, abstol::tolType,
-        prob::probType, DᵀD::DᵀDType, JᵀJ::jType,
-        λ::λType, λ_factor::λType,
-        damping_increase_factor::λType,
-        damping_decrease_factor::λType, h::λType,
-        α_geodesic::λType, b_uphill::λType,
-        min_damping_D::λType, v::uType,
-        a::uType, tmp_vec::uType, v_old::uType,
-        norm_v_old::lossType, δ::uType,
-        loss_old::lossType, make_new_J::Bool,
-        fu_tmp::resType,
-        mat_tmp::jType,
-        stats::NLStats) where {
-        iip, fType, algType,
-        uType, duType, resType,
-        pType, INType, tolType,
-        probType, ufType, L,
-        jType, JC, DᵀDType,
-        λType, lossType,
-    }
-        new{iip, fType, algType, uType, duType, resType,
-            pType, INType, tolType, probType, ufType, L,
-            jType, JC, DᵀDType, λType, lossType}(f, alg, u, fu, p, uf, linsolve, J, du_tmp,
-            jac_config, force_stop, maxiters,
-            internalnorm, retcode, abstol, prob, DᵀD,
-            JᵀJ, λ, λ_factor,
-            damping_increase_factor,
-            damping_decrease_factor, h,
-            α_geodesic, b_uphill, min_damping_D,
-            v, a, tmp_vec, v_old,
-            norm_v_old, δ, loss_old, make_new_J,
-            fu_tmp, mat_tmp, stats)
-    end
 end
 
-function jacobian_caches(alg::LevenbergMarquardt, f, u, p, ::Val{false})
-    JacobianWrapper(f, p), nothing, ArrayInterface.undefmatrix(u), nothing, nothing
-end
+isinplace(::LevenbergMarquardtCache{iip}) where {iip} = iip
 
 function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarquardt,
-    args...;
-    alias_u0 = false,
-    maxiters = 1000,
-    abstol = 1e-6,
-    internalnorm = DEFAULT_NORM,
+    args...; alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
     kwargs...) where {uType, iip}
-    if alias_u0
-        u = prob.u0
-    else
-        u = deepcopy(prob.u0)
-    end
-    f = prob.f
-    p = prob.p
+    @unpack f, u0, p = prob
+    u = alias_u0 ? u0 : deepcopy(u0)
     if iip
-        fu = zero(u)
-        f(fu, u, p)
+        fu1 = zero(u)  # TODO: Use Prototype
+        f(fu1, u, p)
     else
-        fu = f(u, p)
+        fu1 = f(u, p)
     end
-    uf, linsolve, J, du_tmp, jac_config = jacobian_caches(alg, f, u, p, Val(iip))
+    uf, linsolve, J, fu2, jac_cache = jacobian_caches(alg, f, u, p, Val(iip))
 
     λ = convert(eltype(u), alg.damping_initial)
     λ_factor = convert(eltype(u), alg.damping_increase_factor)
@@ -269,7 +170,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarq
         DᵀD = Diagonal(d)
     end
 
-    loss = internalnorm(fu)
+    loss = internalnorm(fu1)
     JᵀJ = zero(J)
     v = zero(u)
     a = zero(u)
@@ -277,26 +178,25 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarq
     v_old = zero(u)
     δ = zero(u)
     make_new_J = true
-    fu_tmp = zero(fu)
+    fu_tmp = zero(fu1)
     mat_tmp = zero(J)
 
-    return LevenbergMarquardtCache{iip}(f, alg, u, fu, p, uf, linsolve, J, du_tmp,
-        jac_config, false, maxiters, internalnorm,
-        ReturnCode.Default, abstol, prob, DᵀD, JᵀJ,
-        λ, λ_factor, damping_increase_factor,
-        damping_decrease_factor, h,
-        α_geodesic, b_uphill, min_damping_D,
-        v, a, tmp_vec, v_old, loss, δ, loss, make_new_J,
-        fu_tmp, mat_tmp, NLStats(1, 0, 0, 0, 0))
+    return LevenbergMarquardtCache{iip}(f, alg, u, fu1, fu2, zero(u), p, uf, linsolve, J,
+        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob, DᵀD,
+        JᵀJ, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, α_geodesic,
+        b_uphill, min_damping_D, v, a, tmp_vec, v_old, loss, δ, loss, make_new_J, fu_tmp,
+        mat_tmp, NLStats(1, 0, 0, 0, 0))
 end
+
 function perform_step!(cache::LevenbergMarquardtCache{true})
-    @unpack fu, f, make_new_J = cache
-    if iszero(fu)
+    @unpack fu1, f, make_new_J = cache
+    if iszero(fu1)
         cache.force_stop = true
         return nothing
     end
+
     if make_new_J
-        jacobian!(cache.J, cache)
+        jacobian!!(cache.J, cache)
         mul!(cache.JᵀJ, cache.J', cache.J)
         cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
         cache.make_new_J = false
@@ -306,24 +206,24 @@ function perform_step!(cache::LevenbergMarquardtCache{true})
 
     # Usual Levenberg-Marquardt step ("velocity").
     # The following lines do: cache.v = -cache.mat_tmp \ cache.fu_tmp
-    mul!(cache.fu_tmp, J', fu)
+    mul!(cache.fu_tmp, J', fu1)
     @. cache.mat_tmp = JᵀJ + λ * DᵀD
     linres = dolinsolve(alg.precs, linsolve, A = cache.mat_tmp, b = _vec(cache.fu_tmp),
-        linu = _vec(cache.du_tmp), p = p, reltol = cache.abstol)
+        linu = _vec(cache.du), p = p, reltol = cache.abstol)
     cache.linsolve = linres.cache
-    @. cache.v = -cache.du_tmp
+    @. cache.v = -cache.du
 
     # Geodesic acceleration (step_size = v + a / 2).
     @unpack v, α_geodesic, h = cache
     f(cache.fu_tmp, u .+ h .* v, p)
 
     # The following lines do: cache.a = -J \ cache.fu_tmp
-    mul!(cache.du_tmp, J, v)
-    @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu) / h - cache.du_tmp)
+    mul!(cache.du, J, v)
+    @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.du)
     linres = dolinsolve(alg.precs, linsolve, A = J, b = _vec(cache.fu_tmp),
-        linu = _vec(cache.du_tmp), p = p, reltol = cache.abstol)
+        linu = _vec(cache.du), p = p, reltol = cache.abstol)
     cache.linsolve = linres.cache
-    @. cache.a = -cache.du_tmp
+    @. cache.a = -cache.du
     cache.stats.nsolve += 2
     cache.stats.nfactors += 2
 
@@ -345,7 +245,7 @@ function perform_step!(cache::LevenbergMarquardtCache{true})
                 cache.force_stop = true
                 return nothing
             end
-            cache.fu .= cache.fu_tmp
+            cache.fu1 .= cache.fu_tmp
             cache.v_old .= v
             cache.norm_v_old = norm_v
             cache.loss_old = loss
@@ -359,13 +259,14 @@ function perform_step!(cache::LevenbergMarquardtCache{true})
 end
 
 function perform_step!(cache::LevenbergMarquardtCache{false})
-    @unpack fu, f, make_new_J = cache
-    if iszero(fu)
+    @unpack fu1, f, make_new_J = cache
+    if iszero(fu1)
         cache.force_stop = true
         return nothing
     end
+
     if make_new_J
-        cache.J = jacobian(cache, f)
+        cache.J = jacobian!!(cache.J, cache)
         cache.JᵀJ = cache.J' * cache.J
         if cache.JᵀJ isa Number
             cache.DᵀD = max(cache.DᵀD, cache.JᵀJ)
@@ -378,11 +279,11 @@ function perform_step!(cache::LevenbergMarquardtCache{false})
     @unpack u, p, λ, JᵀJ, DᵀD, J = cache
 
     # Usual Levenberg-Marquardt step ("velocity").
-    cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu)
+    cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu1)
 
     @unpack v, h, α_geodesic = cache
     # Geodesic acceleration (step_size = v + a / 2).
-    cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu) ./ h .- J * v))
+    cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu1) ./ h .- J * v))
     cache.stats.nsolve += 1
     cache.stats.nfactors += 1
 
@@ -404,7 +305,7 @@ function perform_step!(cache::LevenbergMarquardtCache{false})
                 cache.force_stop = true
                 return nothing
             end
-            cache.fu = fu_new
+            cache.fu1 = fu_new
             cache.v_old = v
             cache.norm_v_old = norm_v
             cache.loss_old = loss
@@ -429,6 +330,6 @@ function SciMLBase.solve!(cache::LevenbergMarquardtCache)
         cache.retcode = ReturnCode.Success
     end
 
-    SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu;
-        retcode = cache.retcode, stats = cache.stats)
+    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
+        cache.retcode, cache.stats)
 end
diff --git a/src/raphson.jl b/src/raphson.jl
index 24e5799fd..d780d5077 100644
--- a/src/raphson.jl
+++ b/src/raphson.jl
@@ -1,9 +1,6 @@
 """
-```julia
-NewtonRaphson(; chunk_size = Val{0}(), autodiff = Val{true}(),
-              standardtag = Val{true}(), concrete_jac = nothing,
-              diff_type = Val{:forward}, linsolve = nothing, precs = DEFAULT_PRECS)
-```
+    NewtonRaphson(; concrete_jac = nothing, linsolve = nothing,
+        precs = DEFAULT_PRECS, adkwargs...)
 
 An advanced NewtonRaphson implementation with support for efficient handling of sparse
 matrices via colored automatic differentiation and preconditioned linear solvers. Designed
@@ -11,29 +8,15 @@ for large-scale and numerically-difficult nonlinear systems.
 
 ### Keyword Arguments
 
-  - `chunk_size`: the chunk size used by the internal ForwardDiff.jl automatic differentiation
-    system. This allows for multiple derivative columns to be computed simultaneously,
-    improving performance. Defaults to `0`, which is equivalent to using ForwardDiff.jl's
-    default chunk size mechanism. For more details, see the documentation for
-    [ForwardDiff.jl](https://juliadiff.org/ForwardDiff.jl/stable/).
-  - `autodiff`: whether to use forward-mode automatic differentiation for the Jacobian.
-    Note that this argument is ignored if an analytical Jacobian is passed, as that will be
-    used instead. Defaults to `Val{true}`, which means ForwardDiff.jl via
-    SparseDiffTools.jl is used by default. If `Val{false}`, then FiniteDiff.jl is used for
-    finite differencing.
-  - `standardtag`: whether to use a standardized tag definition for the purposes of automatic
-    differentiation. Defaults to true, which thus uses the `NonlinearSolveTag`. If `Val{false}`,
-    then ForwardDiff's default function naming tag is used, which results in larger stack
-    traces.
+  - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
+    ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
+    `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
   - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
     then the Jacobian will not be constructed and instead direct Jacobian-vector products
     `J*v` are computed using forward-mode automatic differentiation or finite differencing
     tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
     for example for a preconditioner, `concrete_jac = true` can be passed in order to force
     the construction of the Jacobian.
-  - `diff_type`: the type of finite differencing used if `autodiff = false`. Defaults to
-    `Val{:forward}` for forward finite differences. For more details on the choices, see the
-    [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl) documentation.
   - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
     linear solves within the Newton method. Defaults to `nothing`, which means it uses the
     LinearSolve.jl default algorithm choice. For more information on available algorithm
@@ -42,114 +25,74 @@ for large-scale and numerically-difficult nonlinear systems.
     preconditioners. For more information on specifying preconditioners for LinearSolve
     algorithms, consult the
     [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-
-!!! note
-
-    Currently, the linear solver and chunk size choice only applies to in-place defined
-    `NonlinearProblem`s. That is expected to change in the future.
 """
-struct NewtonRaphson{CS, AD, FDT, L, P, ST, CJ} <:
-       AbstractNewtonAlgorithm{CS, AD, FDT, ST, CJ}
-    linsolve::L
-    precs::P
+@concrete struct NewtonRaphson{CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD}
+    ad::AD
+    linsolve
+    precs
 end
 
-function NewtonRaphson(; chunk_size = Val{0}(), autodiff = Val{true}(),
-    standardtag = Val{true}(), concrete_jac = nothing,
-    diff_type = Val{:forward}, linsolve = nothing, precs = DEFAULT_PRECS)
-    NewtonRaphson{_unwrap_val(chunk_size), _unwrap_val(autodiff), diff_type,
-        typeof(linsolve), typeof(precs), _unwrap_val(standardtag),
-        _unwrap_val(concrete_jac)}(linsolve,
-        precs)
+concrete_jac(::NewtonRaphson{CJ}) where {CJ} = CJ
+
+function NewtonRaphson(; concrete_jac = nothing, linsolve = nothing,
+    precs = DEFAULT_PRECS, adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)  # forward as keywords, not as Pairs
+    return NewtonRaphson{_unwrap_val(concrete_jac)}(ad, linsolve, precs)
 end
 
-mutable struct NewtonRaphsonCache{iip, fType, algType, uType, duType, resType, pType,
-    INType, tolType,
-    probType, ufType, L, jType, JC}
-    f::fType
-    alg::algType
-    u::uType
-    fu::resType
-    p::pType
-    uf::ufType
-    linsolve::L
-    J::jType
-    du1::duType
-    jac_config::JC
-    force_stop::Bool
+@concrete mutable struct NewtonRaphsonCache{iip}
+    f
+    alg
+    u
+    fu1
+    fu2
+    du
+    p
+    uf
+    linsolve
+    J
+    jac_cache
+    force_stop
     maxiters::Int
-    internalnorm::INType
-    retcode::SciMLBase.ReturnCode.T
-    abstol::tolType
-    prob::probType
+    internalnorm
+    retcode::ReturnCode.T
+    abstol
+    prob
     stats::NLStats
-
-    function NewtonRaphsonCache{iip}(f::fType, alg::algType, u::uType, fu::resType,
-        p::pType, uf::ufType, linsolve::L, J::jType,
-        du1::duType,
-        jac_config::JC, force_stop::Bool, maxiters::Int,
-        internalnorm::INType,
-        retcode::SciMLBase.ReturnCode.T, abstol::tolType,
-        prob::probType,
-        stats::NLStats) where {
-        iip, fType, algType, uType,
-        duType, resType, pType, INType,
-        tolType,
-        probType, ufType, L, jType, JC}
-        new{iip, fType, algType, uType, duType, resType, pType, INType, tolType,
-            probType, ufType, L, jType, JC}(f, alg, u, fu, p,
-            uf, linsolve, J, du1, jac_config,
-            force_stop, maxiters, internalnorm,
-            retcode, abstol, prob, stats)
-    end
 end
 
-function jacobian_caches(alg::NewtonRaphson, f, u, p, ::Val{false})
-    JacobianWrapper(f, p), nothing, nothing, nothing, nothing
-end
+isinplace(::NewtonRaphsonCache{iip}) where {iip} = iip
 
-function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::NewtonRaphson,
-    args...;
-    alias_u0 = false,
-    maxiters = 1000,
-    abstol = 1e-6,
-    internalnorm = DEFAULT_NORM,
+function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::NewtonRaphson, args...;
+    alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
     kwargs...) where {uType, iip}
-    if alias_u0
-        u = prob.u0
-    else
-        u = deepcopy(prob.u0)
-    end
-    f = prob.f
-    p = prob.p
+    @unpack f, u0, p = prob
+    u = alias_u0 ? u0 : deepcopy(u0)
     if iip
-        fu = zero(u)
-        f(fu, u, p)
+        fu1 = zero(u)  # TODO: Use Prototype
+        f(fu1, u, p)
     else
-        fu = f(u, p)
+        fu1 = f(u, p)
     end
-    uf, linsolve, J, du1, jac_config = jacobian_caches(alg, f, u, p, Val(iip))
+    uf, linsolve, J, fu2, jac_cache = jacobian_caches(alg, f, u, p, Val(iip))
 
-    return NewtonRaphsonCache{iip}(f, alg, u, fu, p, uf, linsolve, J, du1, jac_config,
-        false, maxiters, internalnorm,
-        ReturnCode.Default, abstol, prob, NLStats(1, 0, 0, 0, 0))
+    return NewtonRaphsonCache{iip}(f, alg, u, fu1, fu2, zero(u), p, uf, linsolve, J,
+        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob,
+        NLStats(1, 0, 0, 0, 0))
 end
 
 function perform_step!(cache::NewtonRaphsonCache{true})
-    @unpack u, fu, f, p, alg = cache
-    @unpack J, linsolve, du1 = cache
-    jacobian!(J, cache)
+    @unpack u, fu1, f, p, alg, J, linsolve, du = cache
+    jacobian!!(J, cache)
 
     # u = u - J \ fu
-    linres = dolinsolve(alg.precs, linsolve, A = J, b = _vec(fu), linu = _vec(du1),
-        p = p, reltol = cache.abstol)
+    linres = dolinsolve(alg.precs, linsolve; A = J, b = _vec(fu1), linu = _vec(du),
+        p, reltol = cache.abstol)
     cache.linsolve = linres.cache
-    @. u = u - du1
-    f(fu, u, p)
+    @. u = u - du
+    f(fu1, u, p)
 
-    if cache.internalnorm(cache.fu) < cache.abstol
-        cache.force_stop = true
-    end
+    cache.internalnorm(fu1) < cache.abstol && (cache.force_stop = true)
     cache.stats.nf += 1
     cache.stats.njacs += 1
     cache.stats.nsolve += 1
@@ -158,13 +101,17 @@ function perform_step!(cache::NewtonRaphsonCache{true})
 end
 
 function perform_step!(cache::NewtonRaphsonCache{false})
-    @unpack u, fu, f, p = cache
-    J = jacobian(cache, f)
-    cache.u = u - J \ fu
-    cache.fu = f(cache.u, p)
-    if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
-        cache.force_stop = true
-    end
+    @unpack u, fu1, f, p, alg, linsolve, du = cache
+
+    cache.J = jacobian!!(cache.J, cache)
+    # u = u - J \ fu
+    linres = dolinsolve(alg.precs, linsolve; A = cache.J, b = _vec(fu1), linu = _vec(du),
+        p, reltol = cache.abstol)
+    cache.linsolve = linres.cache
+    cache.u = @. u - du  # rebind: out-of-place `u` must not be mutated (see `reinit!`)
+    cache.fu1 = f(cache.u, p)
+    # Test the residual just stored; the unpacked local `fu1` is the pre-step value.
+    cache.internalnorm(cache.fu1) < cache.abstol && (cache.force_stop = true)
     cache.stats.nf += 1
     cache.stats.njacs += 1
     cache.stats.nsolve += 1
@@ -184,8 +131,8 @@ function SciMLBase.solve!(cache::NewtonRaphsonCache)
         cache.retcode = ReturnCode.Success
     end
 
-    SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu;
-        retcode = cache.retcode, stats = cache.stats)
+    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
+        cache.retcode, cache.stats)
 end
 
 function SciMLBase.reinit!(cache::NewtonRaphsonCache{iip}, u0 = cache.u; p = cache.p,
@@ -193,11 +140,11 @@ function SciMLBase.reinit!(cache::NewtonRaphsonCache{iip}, u0 = cache.u; p = cac
     cache.p = p
     if iip
         recursivecopy!(cache.u, u0)
-        cache.f(cache.fu, cache.u, p)
+        cache.f(cache.fu1, cache.u, p)
     else
         # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter
         cache.u = u0
-        cache.fu = cache.f(cache.u, p)
+        cache.fu1 = cache.f(cache.u, p)
     end
     cache.abstol = abstol
     cache.maxiters = maxiters
diff --git a/src/trustRegion.jl b/src/trustRegion.jl
index 6e867699c..c43b86699 100644
--- a/src/trustRegion.jl
+++ b/src/trustRegion.jl
@@ -14,7 +14,7 @@ states as `RadiusUpdateSchemes.T`. Simply put the desired scheme as follows:
 `TrustRegion(radius_update_scheme = your desired update scheme)`. For example,
 `sol = solve(prob, alg=TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))`.
 """
-EnumX.@enumx RadiusUpdateSchemes begin
+@enumx RadiusUpdateSchemes begin
     """
     `RadiusUpdateSchemes.Simple`
 
@@ -68,19 +68,12 @@ end
 
 """
 ```julia
-TrustRegion(; chunk_size = Val{0}(), autodiff = Val{true}(),
-            standardtag = Val{true}(), concrete_jac = nothing,
-            diff_type = Val{:forward}, linsolve = nothing, precs = DEFAULT_PRECS,
-            radius_update_scheme = RadiusUpdateSchemes.Simple,
-            max_trust_radius::Real = 0 // 1,
-            initial_trust_radius::Real = 0 // 1,
-            step_threshold::Real = 1 // 10,
-            shrink_threshold::Real = 1 // 4,
-            expand_threshold::Real = 3 // 4,
-            shrink_factor::Real = 1 // 4,
-            expand_factor::Real = 2 // 1,
-            max_shrink_times::Int = 32)
-```
+    TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS,
+        radius_update_scheme::RadiusUpdateSchemes.T = RadiusUpdateSchemes.Simple,
+        max_trust_radius::Real = 0 // 1, initial_trust_radius::Real = 0 // 1,
+        step_threshold::Real = 1 // 10, shrink_threshold::Real = 1 // 4,
+        expand_threshold::Real = 3 // 4, shrink_factor::Real = 1 // 4,
+        expand_factor::Real = 2 // 1, max_shrink_times::Int = 32, adkwargs...)
 
 An advanced TrustRegion implementation with support for efficient handling of sparse
 matrices via colored automatic differentiation and preconditioned linear solvers. Designed
@@ -88,29 +81,15 @@ for large-scale and numerically-difficult nonlinear systems.
 
 ### Keyword Arguments
 
-  - `chunk_size`: the chunk size used by the internal ForwardDiff.jl automatic differentiation
-    system. This allows for multiple derivative columns to be computed simultaneously,
-    improving performance. Defaults to `0`, which is equivalent to using ForwardDiff.jl's
-    default chunk size mechanism. For more details, see the documentation for
-    [ForwardDiff.jl](https://juliadiff.org/ForwardDiff.jl/stable/).
-  - `autodiff`: whether to use forward-mode automatic differentiation for the Jacobian.
-    Note that this argument is ignored if an analytical Jacobian is passed, as that will be
-    used instead. Defaults to `Val{true}`, which means ForwardDiff.jl via
-    SparseDiffTools.jl is used by default. If `Val{false}`, then FiniteDiff.jl is used for
-    finite differencing.
-  - `standardtag`: whether to use a standardized tag definition for the purposes of automatic
-    differentiation. Defaults to true, which thus uses the `NonlinearSolveTag`. If `Val{false}`,
-    then ForwardDiff's default function naming tag is used, which results in larger stack
-    traces.
+  - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
+    ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
+    `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
   - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
     then the Jacobian will not be constructed and instead direct Jacobian-vector products
     `J*v` are computed using forward-mode automatic differentiation or finite differencing
     tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
     for example for a preconditioner, `concrete_jac = true` can be passed in order to force
     the construction of the Jacobian.
-  - `diff_type`: the type of finite differencing used if `autodiff = false`. Defaults to
-    `Val{:forward}` for forward finite differences. For more details on the choices, see the
-    [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl) documentation.
   - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
     linear solves within the Newton method. Defaults to `nothing`, which means it uses the
     LinearSolve.jl default algorithm choice. For more information on available algorithm
@@ -148,18 +127,13 @@ for large-scale and numerically-difficult nonlinear systems.
     `expand_threshold < r` (with `r` defined in `shrink_threshold`). Defaults to `2.0`.
   - `max_shrink_times`: the maximum number of times to shrink the trust region radius in a
     row, `max_shrink_times` is exceeded, the algorithm returns. Defaults to `32`.
-
-!!! note
-
-    Currently, the linear solver and chunk size choice only applies to in-place defined
-    `NonlinearProblem`s. That is expected to change in the future.
 """
-struct TrustRegion{CS, AD, FDT, L, P, ST, CJ, MTR} <:
-       AbstractNewtonAlgorithm{CS, AD, FDT, ST, CJ}
-    linsolve::L
-    precs::P
+@concrete struct TrustRegion{CJ, AD, MTR} <: AbstractNewtonAlgorithm{CJ, AD}
+    ad::AD
+    linsolve
+    precs
     radius_update_scheme::RadiusUpdateSchemes.T
-    max_trust_radius::MTR
+    max_trust_radius
     initial_trust_radius::MTR
     step_threshold::MTR
     shrink_threshold::MTR
@@ -169,535 +143,477 @@ struct TrustRegion{CS, AD, FDT, L, P, ST, CJ, MTR} <:
     max_shrink_times::Int
 end
 
-function TrustRegion(; chunk_size = Val{0}(),
-    autodiff = Val{true}(),
-    standardtag = Val{true}(), concrete_jac = nothing,
-    diff_type = Val{:forward}, linsolve = nothing, precs = DEFAULT_PRECS,
+function TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS,
     radius_update_scheme::RadiusUpdateSchemes.T = RadiusUpdateSchemes.Simple, #defaults to conventional radius update
-    max_trust_radius::Real = 0 // 1,
-    initial_trust_radius::Real = 0 // 1,
-    step_threshold::Real = 1 // 10,
-    shrink_threshold::Real = 1 // 4,
-    expand_threshold::Real = 3 // 4,
-    shrink_factor::Real = 1 // 4,
-    expand_factor::Real = 2 // 1,
-    max_shrink_times::Int = 32)
-    TrustRegion{_unwrap_val(chunk_size), _unwrap_val(autodiff), diff_type,
-        typeof(linsolve), typeof(precs), _unwrap_val(standardtag),
-        _unwrap_val(concrete_jac), typeof(max_trust_radius),
-    }(linsolve, precs, radius_update_scheme, max_trust_radius,
-        initial_trust_radius,
-        step_threshold,
-        shrink_threshold,
-        expand_threshold,
-        shrink_factor,
-        expand_factor,
-        max_shrink_times)
-end
-
-mutable struct TrustRegionCache{iip, fType, algType, uType, resType, pType,
-    INType, tolType, probType, ufType, L, jType, JC, floatType,
-    trustType, suType, su2Type, tmpType}
-    f::fType
-    alg::algType
-    u_prev::uType
-    u::uType
-    fu_prev::resType
-    fu::resType
-    p::pType
-    uf::ufType
-    linsolve::L
-    J::jType
-    jac_config::JC
-    force_stop::Bool
-    maxiters::Int
-    internalnorm::INType
-    retcode::SciMLBase.ReturnCode.T
-    abstol::tolType
-    prob::probType
-    radius_update_scheme::RadiusUpdateSchemes.T
-    trust_r::trustType
-    max_trust_r::trustType
-    step_threshold::suType
-    shrink_threshold::trustType
-    expand_threshold::trustType
-    shrink_factor::trustType
-    expand_factor::trustType
-    loss::floatType
-    loss_new::floatType
-    H::jType
-    g::resType
-    shrink_counter::Int
-    step_size::su2Type
-    u_tmp::tmpType
-    fu_new::resType
-    make_new_J::Bool
-    r::floatType
-    p1::floatType
-    p2::floatType
-    p3::floatType
-    p4::floatType
-    ϵ::floatType
-    stats::NLStats
-
-    function TrustRegionCache{iip}(f::fType, alg::algType, u_prev::uType, u::uType,
-        fu_prev::resType, fu::resType, p::pType,
-        uf::ufType, linsolve::L, J::jType, jac_config::JC,
-        force_stop::Bool, maxiters::Int, internalnorm::INType,
-        retcode::SciMLBase.ReturnCode.T, abstol::tolType,
-        prob::probType,
-        radius_update_scheme::RadiusUpdateSchemes.T,
-        trust_r::trustType,
-        max_trust_r::trustType, step_threshold::suType,
-        shrink_threshold::trustType, expand_threshold::trustType,
-        shrink_factor::trustType, expand_factor::trustType,
-        loss::floatType, loss_new::floatType, H::jType,
-        g::resType, shrink_counter::Int, step_size::su2Type,
-        u_tmp::tmpType, fu_new::resType, make_new_J::Bool,
-        r::floatType, p1::floatType, p2::floatType,
-        p3::floatType, p4::floatType, ϵ::floatType,
-        stats::NLStats) where {iip, fType, algType, uType,
-        resType, pType, INType,
-        tolType, probType, ufType, L,
-        jType, JC, floatType, trustType,
-        suType, su2Type, tmpType}
-        new{iip, fType, algType, uType, resType, pType,
-            INType, tolType, probType, ufType, L, jType, JC, floatType,
-            trustType, suType, su2Type, tmpType}(f, alg, u_prev, u, fu_prev, fu, p, uf,
-            linsolve, J,
-            jac_config, force_stop,
-            maxiters, internalnorm, retcode,
-            abstol, prob, radius_update_scheme,
-            trust_r, max_trust_r,
-            step_threshold, shrink_threshold,
-            expand_threshold, shrink_factor,
-            expand_factor, loss,
-            loss_new, H, g, shrink_counter,
-            step_size, u_tmp, fu_new,
-            make_new_J, r, p1, p2, p3, p4, ϵ, stats)
-    end
-end
-
-function jacobian_caches(alg::TrustRegion, f, u, p, ::Val{false})
-    J = ArrayInterface.undefmatrix(u)
-    JacobianWrapper(f, p), nothing, J, zero(u), nothing
-end
-
-function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion,
-    args...;
-    alias_u0 = false,
-    maxiters = 1000,
-    abstol = 1e-8,
-    internalnorm = DEFAULT_NORM,
-    kwargs...) where {uType, iip}
-    if alias_u0
-        u = prob.u0
-    else
-        u = deepcopy(prob.u0)
-    end
-    u_prev = zero(u)
-    f = prob.f
-    p = prob.p
-    if iip
-        fu = zero(u)
-        f(fu, u, p)
-    else
-        fu = f(u, p)
-    end
-    fu_prev = zero(fu)
-
-    loss = get_loss(fu)
-    uf, linsolve, J, u_tmp, jac_config = jacobian_caches(alg, f, u, p, Val(iip))
-
-    radius_update_scheme = alg.radius_update_scheme
-    max_trust_radius = convert(eltype(u), alg.max_trust_radius)
-    initial_trust_radius = convert(eltype(u), alg.initial_trust_radius)
-    step_threshold = convert(eltype(u), alg.step_threshold)
-    shrink_threshold = convert(eltype(u), alg.shrink_threshold)
-    expand_threshold = convert(eltype(u), alg.expand_threshold)
-    shrink_factor = convert(eltype(u), alg.shrink_factor)
-    expand_factor = convert(eltype(u), alg.expand_factor)
-    # Set default trust region radius if not specified
-    if iszero(max_trust_radius)
-        max_trust_radius = convert(eltype(u), max(norm(fu), maximum(u) - minimum(u)))
-    end
-    if iszero(initial_trust_radius)
-        initial_trust_radius = convert(eltype(u), max_trust_radius / 11)
-    end
-
-    loss_new = loss
-    H = ArrayInterface.undefmatrix(u)
-    g = zero(fu)
-    shrink_counter = 0
-    step_size = zero(u)
-    fu_new = zero(fu)
-    make_new_J = true
-    r = loss
-
-    # Parameters for the Schemes
-    p1 = convert(eltype(u), 0.0)
-    p2 = convert(eltype(u), 0.0)
-    p3 = convert(eltype(u), 0.0)
-    p4 = convert(eltype(u), 0.0)
-    ϵ = convert(eltype(u), 1.0e-8)
-    if radius_update_scheme === RadiusUpdateSchemes.Hei
-        step_threshold = convert(eltype(u), 0.0)
-        shrink_threshold = convert(eltype(u), 0.25)
-        expand_threshold = convert(eltype(u), 0.25)
-        p1 = convert(eltype(u), 5.0) # M
-        p2 = convert(eltype(u), 0.1) # β
-        p3 = convert(eltype(u), 0.15) # γ1
-        p4 = convert(eltype(u), 0.15) # γ2
-        initial_trust_radius = convert(eltype(u), 1.0)
-    elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
-        step_threshold = convert(eltype(u), 0.0001)
-        shrink_threshold = convert(eltype(u), 0.25)
-        expand_threshold = convert(eltype(u), 0.25)
-        p1 = convert(eltype(u), 2.0) # μ
-        p2 = convert(eltype(u), 1 / 6) # c5
-        p3 = convert(eltype(u), 6.0) # c6
-        p4 = convert(eltype(u), 0.0)
-        if iip
-            auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
-        else
-            if isa(u, Number)
-                g = ForwardDiff.derivative(x -> f(x, p), u)
-            else
-                g = auto_jacvec(x -> f(x, p), u, fu)
-            end
-        end
-        initial_trust_radius = convert(eltype(u), p1 * norm(g))
-    elseif radius_update_scheme === RadiusUpdateSchemes.Fan
-        step_threshold = convert(eltype(u), 0.0001)
-        shrink_threshold = convert(eltype(u), 0.25)
-        expand_threshold = convert(eltype(u), 0.75)
-        p1 = convert(eltype(u), 0.1) # μ
-        p2 = convert(eltype(u), 1 / 4) # c5
-        p3 = convert(eltype(u), 12) # c6
-        p4 = convert(eltype(u), 1.0e18) # M
-        initial_trust_radius = convert(eltype(u), p1 * (norm(fu)^0.99))
-    elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
-        step_threshold = convert(eltype(u), 0.05)
-        shrink_threshold = convert(eltype(u), 0.05)
-        expand_threshold = convert(eltype(u), 0.9)
-        p1 = convert(eltype(u), 2.5)  #alpha_1
-        p2 = convert(eltype(u), 0.25) # alpha_2
-        p3 = convert(eltype(u), 0) # not required
-        p4 = convert(eltype(u), 0) # not required
-        initial_trust_radius = convert(eltype(u), 1.0)
-    end
-
-    return TrustRegionCache{iip}(f, alg, u_prev, u, fu_prev, fu, p, uf, linsolve, J,
-        jac_config,
-        false, maxiters, internalnorm,
-        ReturnCode.Default, abstol, prob, radius_update_scheme,
-        initial_trust_radius,
-        max_trust_radius, step_threshold, shrink_threshold,
-        expand_threshold, shrink_factor, expand_factor, loss,
-        loss_new, H, g, shrink_counter, step_size, u_tmp, fu_new,
-        make_new_J, r, p1, p2, p3, p4, ϵ, NLStats(1, 0, 0, 0, 0))
-end
-
-function perform_step!(cache::TrustRegionCache{true})
-    @unpack make_new_J, J, fu, f, u, p, u_tmp, alg, linsolve = cache
-    if cache.make_new_J
-        jacobian!(J, cache)
-        mul!(cache.H, J, J)
-        mul!(cache.g, J, fu)
-        cache.stats.njacs += 1
-    end
-
-    linres = dolinsolve(alg.precs, linsolve, A = cache.H, b = _vec(cache.g),
-        linu = _vec(u_tmp),
-        p = p, reltol = cache.abstol)
-    cache.linsolve = linres.cache
-    cache.u_tmp .= -1 .* u_tmp
-    dogleg!(cache)
-
-    # Compute the potentially new u
-    cache.u_tmp .= u .+ cache.step_size
-    f(cache.fu_new, cache.u_tmp, p)
-    trust_region_step!(cache)
-    cache.stats.nf += 1
-    cache.stats.nsolve += 1
-    cache.stats.nfactors += 1
-    return nothing
-end
-
-function perform_step!(cache::TrustRegionCache{false})
-    @unpack make_new_J, fu, f, u, p = cache
-
-    if make_new_J
-        J = jacobian(cache, f)
-        cache.H = J * J
-        cache.g = J * fu
-        cache.stats.njacs += 1
-    end
-
-    @unpack g, H = cache
-    # Compute the Newton step.
-    cache.u_tmp = -H \ g
-    dogleg!(cache)
-
-    # Compute the potentially new u
-    cache.u_tmp = u .+ cache.step_size
-    cache.fu_new = f(cache.u_tmp, p)
-    trust_region_step!(cache)
-    cache.stats.nf += 1
-    cache.stats.nsolve += 1
-    cache.stats.nfactors += 1
-    return nothing
-end
-
-function retrospective_step!(cache::TrustRegionCache{true})
-    @unpack J, fu_prev, fu, u_prev, u = cache
-    jacobian!(J, cache)
-    mul!(cache.H, J, J)
-    mul!(cache.g, J, fu)
-    cache.stats.njacs += 1
-    @unpack H, g, step_size = cache
-
-    return -(get_loss(fu_prev) - get_loss(fu)) /
-           (step_size' * g + step_size' * H * step_size / 2)
-end
-
-function retrospective_step!(cache::TrustRegionCache{false})
-    @unpack J, fu_prev, fu, u_prev, u, f = cache
-    J = jacobian(cache, f)
-    cache.H = J * J
-    cache.g = J * fu
-    cache.stats.njacs += 1
-    @unpack H, g, step_size = cache
-
-    return -(get_loss(fu_prev) - get_loss(fu)) /
-           (step_size' * g + step_size' * H * step_size / 2)
+    max_trust_radius::Real = 0 // 1, initial_trust_radius::Real = 0 // 1,
+    step_threshold::Real = 1 // 10, shrink_threshold::Real = 1 // 4,
+    expand_threshold::Real = 3 // 4, shrink_factor::Real = 1 // 4,
+    expand_factor::Real = 2 // 1, max_shrink_times::Int = 32, adkwargs...)
+    ad = default_adargs_to_adtype(adkwargs...)
+    return TrustRegion{_unwrap_val(concrete_jac)}(ad, linsolve, precs, radius_update_scheme,
+        max_trust_radius, initial_trust_radius, step_threshold, shrink_threshold,
+        expand_threshold, shrink_factor, expand_factor, max_shrink_times)
 end
 
-function trust_region_step!(cache::TrustRegionCache)
-    @unpack fu_new, step_size, g, H, loss, max_trust_r, radius_update_scheme = cache
-    cache.loss_new = get_loss(fu_new)
-
-    # Compute the ratio of the actual reduction to the predicted reduction.
-    cache.r = -(loss - cache.loss_new) / (step_size' * g + step_size' * H * step_size / 2)
-    @unpack r = cache
-
-    if radius_update_scheme === RadiusUpdateSchemes.Simple
-        # Update the trust region radius.
-        if r < cache.shrink_threshold
-            cache.trust_r *= cache.shrink_factor
-            cache.shrink_counter += 1
-        else
-            cache.shrink_counter = 0
-        end
-        if r > cache.step_threshold
-            take_step!(cache)
-            cache.loss = cache.loss_new
-
-            # Update the trust region radius.
-            if r > cache.expand_threshold
-                cache.trust_r = min(cache.expand_factor * cache.trust_r, max_trust_r)
-            end
-
-            cache.make_new_J = true
-        else
-            # No need to make a new J, no step was taken, so we try again with a smaller trust_r
-            cache.make_new_J = false
-        end
-
-        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
-            cache.force_stop = true
-        end
-
-    elseif radius_update_scheme === RadiusUpdateSchemes.Hei
-        if r > cache.step_threshold
-            take_step!(cache)
-            cache.loss = cache.loss_new
-            cache.make_new_J = true
-        else
-            cache.make_new_J = false
-        end
-        # Hei's radius update scheme
-        @unpack shrink_threshold, p1, p2, p3, p4 = cache
-        if rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(step_size) <
-           cache.trust_r
-            cache.shrink_counter += 1
-        else
-            cache.shrink_counter = 0
-        end
-        cache.trust_r = rfunc(r, shrink_threshold, p1, p3, p4, p2) *
-                        cache.internalnorm(step_size)
-
-        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
-           cache.internalnorm(g) < cache.ϵ
-            cache.force_stop = true
-        end
-
-    elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
-        if r < cache.shrink_threshold
-            cache.p1 = cache.p2 * cache.p1
-            cache.shrink_counter += 1
-        elseif r >= cache.expand_threshold &&
-               cache.internalnorm(step_size) > cache.trust_r / 2
-            cache.p1 = cache.p3 * cache.p1
-            cache.shrink_counter = 0
-        end
-
-        if r > cache.step_threshold
-            take_step!(cache)
-            cache.loss = cache.loss_new
-            cache.make_new_J = true
-        else
-            cache.make_new_J = false
-        end
-
-        @unpack p1 = cache
-        cache.trust_r = p1 * cache.internalnorm(jvp!(cache))
-        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
-           cache.internalnorm(g) < cache.ϵ
-            cache.force_stop = true
-        end
-        #Fan's update scheme
-    elseif radius_update_scheme === RadiusUpdateSchemes.Fan
-        if r < cache.shrink_threshold
-            cache.p1 *= cache.p2
-            cache.shrink_counter += 1
-        elseif r > cache.expand_threshold
-            cache.p1 = min(cache.p1 * cache.p3, cache.p4)
-            cache.shrink_counter = 0
-        end
-
-        if r > cache.step_threshold
-            take_step!(cache)
-            cache.loss = cache.loss_new
-            cache.make_new_J = true
-        else
-            cache.make_new_J = false
-        end
-
-        @unpack p1 = cache
-        cache.trust_r = p1 * (cache.internalnorm(cache.fu)^0.99)
-        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
-           cache.internalnorm(g) < cache.ϵ
-            cache.force_stop = true
-        end
-    elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
-        if r > cache.step_threshold
-            take_step!(cache)
-            cache.loss = cache.loss_new
-            cache.make_new_J = true
-            if retrospective_step!(cache) >= cache.expand_threshold
-                cache.trust_r = max(cache.p1 * cache.internalnorm(step_size), cache.trust_r)
-            end
-
-        else
-            cache.make_new_J = false
-            cache.trust_r *= cache.p2
-            cache.shrink_counter += 1
-        end
-        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
-            cache.force_stop = true
-        end
-    end
-end
-
-function dogleg!(cache::TrustRegionCache)
-    @unpack u_tmp, trust_r = cache
-
-    # Test if the full step is within the trust region.
-    if norm(u_tmp) ≤ trust_r
-        cache.step_size = deepcopy(u_tmp)
-        return
-    end
-
-    # Calcualte Cauchy point, optimum along the steepest descent direction.
-    δsd = -cache.g
-    norm_δsd = norm(δsd)
-    if norm_δsd ≥ trust_r
-        cache.step_size = δsd .* trust_r / norm_δsd
-        return
-    end
-
-    # Find the intersection point on the boundary.
-    N_sd = u_tmp - δsd
-    dot_N_sd = dot(N_sd, N_sd)
-    dot_sd_N_sd = dot(δsd, N_sd)
-    dot_sd = dot(δsd, δsd)
-    fact = dot_sd_N_sd^2 - dot_N_sd * (dot_sd - trust_r^2)
-    τ = (-dot_sd_N_sd + sqrt(fact)) / dot_N_sd
-    cache.step_size = δsd + τ * N_sd
-end
-
-function take_step!(cache::TrustRegionCache{true})
-    cache.u_prev .= cache.u
-    cache.u .= cache.u_tmp
-    cache.fu_prev .= cache.fu
-    cache.fu .= cache.fu_new
-end
-
-function take_step!(cache::TrustRegionCache{false})
-    cache.u_prev = cache.u
-    cache.u = cache.u_tmp
-    cache.fu_prev = cache.fu
-    cache.fu = cache.fu_new
-end
-
-function jvp!(cache::TrustRegionCache{false})
-    @unpack f, u, fu, p = cache
-    if isa(u, Number)
-        return value_derivative(x -> f(x, p), u)
-    end
-    return auto_jacvec(x -> f(x, p), u, fu)
-end
-
-function jvp!(cache::TrustRegionCache{true})
-    @unpack g, f, u, fu, p = cache
-    if isa(u, Number)
-        return value_derivative(x -> f(x, p), u)
-    end
-    auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
-    g
-end
-
-function SciMLBase.solve!(cache::TrustRegionCache)
-    while !cache.force_stop && cache.stats.nsteps < cache.maxiters &&
-              cache.shrink_counter < cache.alg.max_shrink_times
-        perform_step!(cache)
-        cache.stats.nsteps += 1
-    end
-
-    if cache.stats.nsteps == cache.maxiters
-        cache.retcode = ReturnCode.MaxIters
-    else
-        cache.retcode = ReturnCode.Success
-    end
-
-    SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu;
-        retcode = cache.retcode, stats = cache.stats)
-end
-
-function SciMLBase.reinit!(cache::TrustRegionCache{iip}, u0 = cache.u; p = cache.p,
-    abstol = cache.abstol, maxiters = cache.maxiters) where {iip}
-    cache.p = p
-    if iip
-        recursivecopy!(cache.u, u0)
-        cache.f(cache.fu, cache.u, p)
-    else
-        # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter
-        cache.u = u0
-        cache.fu = cache.f(cache.u, p)
-    end
-    cache.abstol = abstol
-    cache.maxiters = maxiters
-    cache.stats.nf = 1
-    cache.stats.nsteps = 1
-    cache.force_stop = false
-    cache.retcode = ReturnCode.Default
-    cache.make_new_J = true
-    cache.loss = get_loss(cache.fu)
-    cache.shrink_counter = 0
-    cache.trust_r = convert(eltype(cache.u), cache.alg.initial_trust_radius)
-    if iszero(cache.trust_r)
-        cache.trust_r = convert(eltype(cache.u), cache.max_trust_r / 11)
-    end
-    return cache
-end
+# @concrete mutable struct TrustRegionCache{iip}
+#     f
+#     alg
+#     u_prev::uType
+#     u::uType
+#     fu_prev::resType
+#     fu::resType
+#     p
+#     uf
+#     linsolve
+#     J::jType
+#     jac_cache
+#     force_stop::Bool
+#     maxiters::Int
+#     internalnorm
+#     retcode::ReturnCode.T
+#     abstol
+#     prob
+#     radius_update_scheme::RadiusUpdateSchemes.T
+#     trust_r::trustType
+#     max_trust_r::trustType
+#     step_threshold
+#     shrink_threshold::trustType
+#     expand_threshold::trustType
+#     shrink_factor::trustType
+#     expand_factor::trustType
+#     loss::floatType
+#     loss_new::floatType
+#     H::jType
+#     g::resType
+#     shrink_counter::Int
+#     step_size
+#     u_tmp
+#     fu_new::resType
+#     make_new_J::Bool
+#     r::floatType
+#     p1::floatType
+#     p2::floatType
+#     p3::floatType
+#     p4::floatType
+#     ϵ::floatType
+#     stats::NLStats
+# end
+
+# function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion,
+#     args...;
+#     alias_u0 = false,
+#     maxiters = 1000,
+#     abstol = 1e-8,
+#     internalnorm = DEFAULT_NORM,
+#     kwargs...) where {uType, iip}
+#     if alias_u0
+#         u = prob.u0
+#     else
+#         u = deepcopy(prob.u0)
+#     end
+#     u_prev = zero(u)
+#     f = prob.f
+#     p = prob.p
+#     if iip
+#         fu = zero(u)
+#         f(fu, u, p)
+#     else
+#         fu = f(u, p)
+#     end
+#     fu_prev = zero(fu)
+
+#     loss = get_loss(fu)
+#     uf, linsolve, J, u_tmp, jac_config = jacobian_caches(alg, f, u, p, Val(iip))
+
+#     radius_update_scheme = alg.radius_update_scheme
+#     max_trust_radius = convert(eltype(u), alg.max_trust_radius)
+#     initial_trust_radius = convert(eltype(u), alg.initial_trust_radius)
+#     step_threshold = convert(eltype(u), alg.step_threshold)
+#     shrink_threshold = convert(eltype(u), alg.shrink_threshold)
+#     expand_threshold = convert(eltype(u), alg.expand_threshold)
+#     shrink_factor = convert(eltype(u), alg.shrink_factor)
+#     expand_factor = convert(eltype(u), alg.expand_factor)
+#     # Set default trust region radius if not specified
+#     if iszero(max_trust_radius)
+#         max_trust_radius = convert(eltype(u), max(norm(fu), maximum(u) - minimum(u)))
+#     end
+#     if iszero(initial_trust_radius)
+#         initial_trust_radius = convert(eltype(u), max_trust_radius / 11)
+#     end
+
+#     loss_new = loss
+#     H = ArrayInterface.undefmatrix(u)
+#     g = zero(fu)
+#     shrink_counter = 0
+#     step_size = zero(u)
+#     fu_new = zero(fu)
+#     make_new_J = true
+#     r = loss
+
+#     # Parameters for the Schemes
+#     p1 = convert(eltype(u), 0.0)
+#     p2 = convert(eltype(u), 0.0)
+#     p3 = convert(eltype(u), 0.0)
+#     p4 = convert(eltype(u), 0.0)
+#     ϵ = convert(eltype(u), 1.0e-8)
+#     if radius_update_scheme === RadiusUpdateSchemes.Hei
+#         step_threshold = convert(eltype(u), 0.0)
+#         shrink_threshold = convert(eltype(u), 0.25)
+#         expand_threshold = convert(eltype(u), 0.25)
+#         p1 = convert(eltype(u), 5.0) # M
+#         p2 = convert(eltype(u), 0.1) # β
+#         p3 = convert(eltype(u), 0.15) # γ1
+#         p4 = convert(eltype(u), 0.15) # γ2
+#         initial_trust_radius = convert(eltype(u), 1.0)
+#     elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
+#         step_threshold = convert(eltype(u), 0.0001)
+#         shrink_threshold = convert(eltype(u), 0.25)
+#         expand_threshold = convert(eltype(u), 0.25)
+#         p1 = convert(eltype(u), 2.0) # μ
+#         p2 = convert(eltype(u), 1 / 6) # c5
+#         p3 = convert(eltype(u), 6.0) # c6
+#         p4 = convert(eltype(u), 0.0)
+#         if iip
+#             auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
+#         else
+#             if isa(u, Number)
+#                 g = ForwardDiff.derivative(x -> f(x, p), u)
+#             else
+#                 g = auto_jacvec(x -> f(x, p), u, fu)
+#             end
+#         end
+#         initial_trust_radius = convert(eltype(u), p1 * norm(g))
+#     elseif radius_update_scheme === RadiusUpdateSchemes.Fan
+#         step_threshold = convert(eltype(u), 0.0001)
+#         shrink_threshold = convert(eltype(u), 0.25)
+#         expand_threshold = convert(eltype(u), 0.75)
+#         p1 = convert(eltype(u), 0.1) # μ
+#         p2 = convert(eltype(u), 1 / 4) # c5
+#         p3 = convert(eltype(u), 12) # c6
+#         p4 = convert(eltype(u), 1.0e18) # M
+#         initial_trust_radius = convert(eltype(u), p1 * (norm(fu)^0.99))
+#     elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
+#         step_threshold = convert(eltype(u), 0.05)
+#         shrink_threshold = convert(eltype(u), 0.05)
+#         expand_threshold = convert(eltype(u), 0.9)
+#         p1 = convert(eltype(u), 2.5)  #alpha_1
+#         p2 = convert(eltype(u), 0.25) # alpha_2
+#         p3 = convert(eltype(u), 0) # not required
+#         p4 = convert(eltype(u), 0) # not required
+#         initial_trust_radius = convert(eltype(u), 1.0)
+#     end
+
+#     return TrustRegionCache{iip}(f, alg, u_prev, u, fu_prev, fu, p, uf, linsolve, J,
+#         jac_config,
+#         false, maxiters, internalnorm,
+#         ReturnCode.Default, abstol, prob, radius_update_scheme,
+#         initial_trust_radius,
+#         max_trust_radius, step_threshold, shrink_threshold,
+#         expand_threshold, shrink_factor, expand_factor, loss,
+#         loss_new, H, g, shrink_counter, step_size, u_tmp, fu_new,
+#         make_new_J, r, p1, p2, p3, p4, ϵ, NLStats(1, 0, 0, 0, 0))
+# end
+
+# function perform_step!(cache::TrustRegionCache{true})
+#     @unpack make_new_J, J, fu, f, u, p, u_tmp, alg, linsolve = cache
+#     if cache.make_new_J
+#         jacobian!(J, cache)
+#         mul!(cache.H, J, J)
+#         mul!(cache.g, J, fu)
+#         cache.stats.njacs += 1
+#     end
+
+#     linres = dolinsolve(alg.precs, linsolve, A = cache.H, b = _vec(cache.g),
+#         linu = _vec(u_tmp),
+#         p = p, reltol = cache.abstol)
+#     cache.linsolve = linres.cache
+#     cache.u_tmp .= -1 .* u_tmp
+#     dogleg!(cache)
+
+#     # Compute the potentially new u
+#     cache.u_tmp .= u .+ cache.step_size
+#     f(cache.fu_new, cache.u_tmp, p)
+#     trust_region_step!(cache)
+#     cache.stats.nf += 1
+#     cache.stats.nsolve += 1
+#     cache.stats.nfactors += 1
+#     return nothing
+# end
+
+# function perform_step!(cache::TrustRegionCache{false})
+#     @unpack make_new_J, fu, f, u, p = cache
+
+#     if make_new_J
+#         J = jacobian(cache, f)
+#         cache.H = J * J
+#         cache.g = J * fu
+#         cache.stats.njacs += 1
+#     end
+
+#     @unpack g, H = cache
+#     # Compute the Newton step.
+#     cache.u_tmp = -H \ g
+#     dogleg!(cache)
+
+#     # Compute the potentially new u
+#     cache.u_tmp = u .+ cache.step_size
+#     cache.fu_new = f(cache.u_tmp, p)
+#     trust_region_step!(cache)
+#     cache.stats.nf += 1
+#     cache.stats.nsolve += 1
+#     cache.stats.nfactors += 1
+#     return nothing
+# end
+
+# function retrospective_step!(cache::TrustRegionCache{true})
+#     @unpack J, fu_prev, fu, u_prev, u = cache
+#     jacobian!(J, cache)
+#     mul!(cache.H, J, J)
+#     mul!(cache.g, J, fu)
+#     cache.stats.njacs += 1
+#     @unpack H, g, step_size = cache
+
+#     return -(get_loss(fu_prev) - get_loss(fu)) /
+#            (step_size' * g + step_size' * H * step_size / 2)
+# end
+
+# function retrospective_step!(cache::TrustRegionCache{false})
+#     @unpack J, fu_prev, fu, u_prev, u, f = cache
+#     J = jacobian(cache, f)
+#     cache.H = J * J
+#     cache.g = J * fu
+#     cache.stats.njacs += 1
+#     @unpack H, g, step_size = cache
+
+#     return -(get_loss(fu_prev) - get_loss(fu)) /
+#            (step_size' * g + step_size' * H * step_size / 2)
+# end
+
+# function trust_region_step!(cache::TrustRegionCache)
+#     @unpack fu_new, step_size, g, H, loss, max_trust_r, radius_update_scheme = cache
+#     cache.loss_new = get_loss(fu_new)
+
+#     # Compute the ratio of the actual reduction to the predicted reduction.
+#     cache.r = -(loss - cache.loss_new) / (step_size' * g + step_size' * H * step_size / 2)
+#     @unpack r = cache
+
+#     if radius_update_scheme === RadiusUpdateSchemes.Simple
+#         # Update the trust region radius.
+#         if r < cache.shrink_threshold
+#             cache.trust_r *= cache.shrink_factor
+#             cache.shrink_counter += 1
+#         else
+#             cache.shrink_counter = 0
+#         end
+#         if r > cache.step_threshold
+#             take_step!(cache)
+#             cache.loss = cache.loss_new
+
+#             # Update the trust region radius.
+#             if r > cache.expand_threshold
+#                 cache.trust_r = min(cache.expand_factor * cache.trust_r, max_trust_r)
+#             end
+
+#             cache.make_new_J = true
+#         else
+#             # No need to make a new J, no step was taken, so we try again with a smaller trust_r
+#             cache.make_new_J = false
+#         end
+
+#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
+#             cache.force_stop = true
+#         end
+
+#     elseif radius_update_scheme === RadiusUpdateSchemes.Hei
+#         if r > cache.step_threshold
+#             take_step!(cache)
+#             cache.loss = cache.loss_new
+#             cache.make_new_J = true
+#         else
+#             cache.make_new_J = false
+#         end
+#         # Hei's radius update scheme
+#         @unpack shrink_threshold, p1, p2, p3, p4 = cache
+#         if rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(step_size) <
+#            cache.trust_r
+#             cache.shrink_counter += 1
+#         else
+#             cache.shrink_counter = 0
+#         end
+#         cache.trust_r = rfunc(r, shrink_threshold, p1, p3, p4, p2) *
+#                         cache.internalnorm(step_size)
+
+#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
+#            cache.internalnorm(g) < cache.ϵ
+#             cache.force_stop = true
+#         end
+
+#     elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
+#         if r < cache.shrink_threshold
+#             cache.p1 = cache.p2 * cache.p1
+#             cache.shrink_counter += 1
+#         elseif r >= cache.expand_threshold &&
+#                cache.internalnorm(step_size) > cache.trust_r / 2
+#             cache.p1 = cache.p3 * cache.p1
+#             cache.shrink_counter = 0
+#         end
+
+#         if r > cache.step_threshold
+#             take_step!(cache)
+#             cache.loss = cache.loss_new
+#             cache.make_new_J = true
+#         else
+#             cache.make_new_J = false
+#         end
+
+#         @unpack p1 = cache
+#         cache.trust_r = p1 * cache.internalnorm(jvp!(cache))
+#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
+#            cache.internalnorm(g) < cache.ϵ
+#             cache.force_stop = true
+#         end
+#         #Fan's update scheme
+#     elseif radius_update_scheme === RadiusUpdateSchemes.Fan
+#         if r < cache.shrink_threshold
+#             cache.p1 *= cache.p2
+#             cache.shrink_counter += 1
+#         elseif r > cache.expand_threshold
+#             cache.p1 = min(cache.p1 * cache.p3, cache.p4)
+#             cache.shrink_counter = 0
+#         end
+
+#         if r > cache.step_threshold
+#             take_step!(cache)
+#             cache.loss = cache.loss_new
+#             cache.make_new_J = true
+#         else
+#             cache.make_new_J = false
+#         end
+
+#         @unpack p1 = cache
+#         cache.trust_r = p1 * (cache.internalnorm(cache.fu)^0.99)
+#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
+#            cache.internalnorm(g) < cache.ϵ
+#             cache.force_stop = true
+#         end
+#     elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
+#         if r > cache.step_threshold
+#             take_step!(cache)
+#             cache.loss = cache.loss_new
+#             cache.make_new_J = true
+#             if retrospective_step!(cache) >= cache.expand_threshold
+#                 cache.trust_r = max(cache.p1 * cache.internalnorm(step_size), cache.trust_r)
+#             end
+
+#         else
+#             cache.make_new_J = false
+#             cache.trust_r *= cache.p2
+#             cache.shrink_counter += 1
+#         end
+#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
+#             cache.force_stop = true
+#         end
+#     end
+# end
+
+# function dogleg!(cache::TrustRegionCache)
+#     @unpack u_tmp, trust_r = cache
+
+#     # Test if the full step is within the trust region.
+#     if norm(u_tmp) ≤ trust_r
+#         cache.step_size = deepcopy(u_tmp)
+#         return
+#     end
+
+#     # Calcualte Cauchy point, optimum along the steepest descent direction.
+#     δsd = -cache.g
+#     norm_δsd = norm(δsd)
+#     if norm_δsd ≥ trust_r
+#         cache.step_size = δsd .* trust_r / norm_δsd
+#         return
+#     end
+
+#     # Find the intersection point on the boundary.
+#     N_sd = u_tmp - δsd
+#     dot_N_sd = dot(N_sd, N_sd)
+#     dot_sd_N_sd = dot(δsd, N_sd)
+#     dot_sd = dot(δsd, δsd)
+#     fact = dot_sd_N_sd^2 - dot_N_sd * (dot_sd - trust_r^2)
+#     τ = (-dot_sd_N_sd + sqrt(fact)) / dot_N_sd
+#     cache.step_size = δsd + τ * N_sd
+# end
+
+# function take_step!(cache::TrustRegionCache{true})
+#     cache.u_prev .= cache.u
+#     cache.u .= cache.u_tmp
+#     cache.fu_prev .= cache.fu
+#     cache.fu .= cache.fu_new
+# end
+
+# function take_step!(cache::TrustRegionCache{false})
+#     cache.u_prev = cache.u
+#     cache.u = cache.u_tmp
+#     cache.fu_prev = cache.fu
+#     cache.fu = cache.fu_new
+# end
+
+# function jvp!(cache::TrustRegionCache{false})
+#     @unpack f, u, fu, p = cache
+#     if isa(u, Number)
+#         return value_derivative(x -> f(x, p), u)
+#     end
+#     return auto_jacvec(x -> f(x, p), u, fu)
+# end
+
+# function jvp!(cache::TrustRegionCache{true})
+#     @unpack g, f, u, fu, p = cache
+#     if isa(u, Number)
+#         return value_derivative(x -> f(x, p), u)
+#     end
+#     auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
+#     g
+# end
+
+# function SciMLBase.solve!(cache::TrustRegionCache)
+#     while !cache.force_stop && cache.stats.nsteps < cache.maxiters &&
+#               cache.shrink_counter < cache.alg.max_shrink_times
+#         perform_step!(cache)
+#         cache.stats.nsteps += 1
+#     end
+
+#     if cache.stats.nsteps == cache.maxiters
+#         cache.retcode = ReturnCode.MaxIters
+#     else
+#         cache.retcode = ReturnCode.Success
+#     end
+
+#     SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu;
+#         retcode = cache.retcode, stats = cache.stats)
+# end
+
+# function SciMLBase.reinit!(cache::TrustRegionCache{iip}, u0 = cache.u; p = cache.p,
+#     abstol = cache.abstol, maxiters = cache.maxiters) where {iip}
+#     cache.p = p
+#     if iip
+#         recursivecopy!(cache.u, u0)
+#         cache.f(cache.fu, cache.u, p)
+#     else
+#         # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter
+#         cache.u = u0
+#         cache.fu = cache.f(cache.u, p)
+#     end
+#     cache.abstol = abstol
+#     cache.maxiters = maxiters
+#     cache.stats.nf = 1
+#     cache.stats.nsteps = 1
+#     cache.force_stop = false
+#     cache.retcode = ReturnCode.Default
+#     cache.make_new_J = true
+#     cache.loss = get_loss(cache.fu)
+#     cache.shrink_counter = 0
+#     cache.trust_r = convert(eltype(cache.u), cache.alg.initial_trust_radius)
+#     if iszero(cache.trust_r)
+#         cache.trust_r = convert(eltype(cache.u), cache.max_trust_r / 11)
+#     end
+#     return cache
+# end
diff --git a/src/utils.jl b/src/utils.jl
index 9d72e230f..c50d52ad7 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -2,88 +2,64 @@
 @inline UNITLESS_ABS2(x) = real(abs2(x))
 @inline DEFAULT_NORM(u::Union{AbstractFloat, Complex}) = @fastmath abs(u)
 @inline function DEFAULT_NORM(u::Array{T}) where {T <: Union{AbstractFloat, Complex}}
-    sqrt(real(sum(abs2, u)) / length(u))
+    return sqrt(real(sum(abs2, u)) / length(u))
 end
-@inline function DEFAULT_NORM(u::StaticArraysCore.StaticArray{
-    T,
-}) where {
-    T <: Union{
-        AbstractFloat,
-        Complex}}
-    sqrt(real(sum(abs2, u)) / length(u))
+@inline function DEFAULT_NORM(u::StaticArray{<:Union{AbstractFloat, Complex}})
+    return sqrt(real(sum(abs2, u)) / length(u))
 end
-@inline function DEFAULT_NORM(u::RecursiveArrayTools.AbstractVectorOfArray)
-    sum(sqrt(real(sum(UNITLESS_ABS2, _u)) / length(_u)) for _u in u.u)
+@inline function DEFAULT_NORM(u::AbstractVectorOfArray)
+    return sum(sqrt(real(sum(UNITLESS_ABS2, _u)) / length(_u)) for _u in u.u)
 end
 @inline DEFAULT_NORM(u::AbstractArray) = sqrt(real(sum(UNITLESS_ABS2, u)) / length(u))
 @inline DEFAULT_NORM(u) = norm(u)
 
-alg_autodiff(alg::AbstractNewtonAlgorithm{CS, AD}) where {CS, AD} = AD
+alg_autodiff(alg::AbstractNewtonAlgorithm{<:AbstractFiniteDifferencesMode}) = false
+alg_autodiff(alg::AbstractNewtonAlgorithm) = true
 alg_autodiff(alg) = false
 
 """
-value_derivative(f, x)
+    default_adargs_to_adtype(; chunk_size = Val{0}(), autodiff = Val{true}(),
+        standardtag = Val{true}(), diff_type = Val{:forward})
 
-Compute `f(x), d/dx f(x)` in the most efficient way.
+Construct the AD type from the arguments. This is mostly needed for compatibility with older
+code.
 """
-function value_derivative(f::F, x::R) where {F, R}
-    T = typeof(ForwardDiff.Tag(f, R))
-    out = f(ForwardDiff.Dual{T}(x, one(x)))
-    ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out)
+function default_adargs_to_adtype(; chunk_size = Val{0}(), autodiff = Val{true}(),
+    standardtag = Val{true}(), diff_type = Val{:forward}())
+    ad = _unwrap_val(autodiff)
+    # Old API
+    if ad isa Bool
+        # FIXME: standardtag is not the Tag
+        ad && return AutoForwardDiff(; chunksize = _unwrap_val(chunk_size),
+            tag = _unwrap_val(standardtag))
+        return AutoFiniteDiff(; fdtype = diff_type)
+    end
+    return ad
 end
 
-# Todo: improve this dispatch
-function value_derivative(f::F, x::StaticArraysCore.SVector) where {F}
-    f(x), ForwardDiff.jacobian(f, x)
-end
+# """
+# value_derivative(f, x)
 
-value(x) = x
-value(x::Dual) = ForwardDiff.value(x)
-value(x::AbstractArray{<:Dual}) = map(ForwardDiff.value, x)
-
-_vec(v) = vec(v)
-_vec(v::Number) = v
-_vec(v::AbstractVector) = v
-
-function alg_difftype(alg::AbstractNewtonAlgorithm{
-    CS,
-    AD,
-    FDT,
-    ST,
-    CJ,
-}) where {CS, AD, FDT, ST, CJ}
-    FDT
-end
+# Compute `f(x), d/dx f(x)` in the most efficient way.
+# """
+# function value_derivative(f::F, x::R) where {F, R}
+#     T = typeof(ForwardDiff.Tag(f, R))
+#     out = f(ForwardDiff.Dual{T}(x, one(x)))
+#     ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out)
+# end
 
-function concrete_jac(alg::AbstractNewtonAlgorithm{
-    CS,
-    AD,
-    FDT,
-    ST,
-    CJ,
-}) where {CS, AD, FDT, ST, CJ}
-    CJ
-end
+# # Todo: improve this dispatch
+# function value_derivative(f::F, x::StaticArraysCore.SVector) where {F}
+#     f(x), ForwardDiff.jacobian(f, x)
+# end
 
-function get_chunksize(alg::AbstractNewtonAlgorithm{
-    CS,
-    AD,
-    FDT,
-    ST,
-    CJ,
-}) where {CS, AD, FDT, ST, CJ}
-    Val(CS)
-end
+@inline value(x) = x
+@inline value(x::Dual) = ForwardDiff.value(x)
+@inline value(x::AbstractArray{<:Dual}) = map(ForwardDiff.value, x)
 
-function standardtag(alg::AbstractNewtonAlgorithm{
-    CS,
-    AD,
-    FDT,
-    ST,
-    CJ,
-}) where {CS, AD, FDT, ST, CJ}
-    ST
-end
+@inline _vec(v) = vec(v)
+@inline _vec(v::Number) = v
+@inline _vec(v::AbstractVector) = v
 
 DEFAULT_PRECS(W, du, u, p, t, newW, Plprev, Prprev, cachedata) = nothing, nothing
 
@@ -94,10 +70,8 @@ function dolinsolve(precs::P, linsolve; A = nothing, linu = nothing, b = nothing
     b !== nothing && (linsolve.b = b)
     linu !== nothing && (linsolve.u = linu)
 
-    Plprev = linsolve.Pl isa LinearSolve.ComposePreconditioner ? linsolve.Pl.outer :
-             linsolve.Pl
-    Prprev = linsolve.Pr isa LinearSolve.ComposePreconditioner ? linsolve.Pr.outer :
-             linsolve.Pr
+    Plprev = linsolve.Pl isa ComposePreconditioner ? linsolve.Pl.outer : linsolve.Pl
+    Prprev = linsolve.Pr isa ComposePreconditioner ? linsolve.Pr.outer : linsolve.Pr
 
     _Pl, _Pr = precs(linsolve.A, du, u, p, nothing, A !== nothing, Plprev, Prprev,
         cachedata)
@@ -110,29 +84,25 @@ function dolinsolve(precs::P, linsolve; A = nothing, linu = nothing, b = nothing
         linsolve.Pr = Pr
     end
 
-    linres = if reltol === nothing
-        solve!(linsolve)
-    else
-        solve!(linsolve; reltol)
-    end
+    linres = reltol === nothing ? solve!(linsolve) : solve!(linsolve; reltol)
 
     return linres
 end
 
 function wrapprecs(_Pl, _Pr, weight)
     if _Pl !== nothing
-        Pl = LinearSolve.ComposePreconditioner(LinearSolve.InvPreconditioner(Diagonal(_vec(weight))),
-            _Pl)
+        Pl = ComposePreconditioner(InvPreconditioner(Diagonal(_vec(weight))), _Pl)
     else
-        Pl = LinearSolve.InvPreconditioner(Diagonal(_vec(weight)))
+        Pl = InvPreconditioner(Diagonal(_vec(weight)))
     end
 
     if _Pr !== nothing
-        Pr = LinearSolve.ComposePreconditioner(Diagonal(_vec(weight)), _Pr)
+        Pr = ComposePreconditioner(Diagonal(_vec(weight)), _Pr)
     else
         Pr = Diagonal(_vec(weight))
     end
-    Pl, Pr
+
+    return Pl, Pr
 end
 
 function _nfcount(N, ::Type{diff_type}) where {diff_type}
@@ -143,17 +113,18 @@ function _nfcount(N, ::Type{diff_type}) where {diff_type}
     else
         tmp = 2N
     end
-    tmp
+    return tmp
 end
 
-function get_loss(fu)
-    return norm(fu)^2 / 2
-end
+get_loss(fu) = norm(fu)^2 / 2
 
 function rfunc(r::R, c2::R, M::R, γ1::R, γ2::R, β::R) where {R <: Real} # R-function for adaptive trust region method
-    if (r >= c2)
+    if (r ≥ c2)
         return (2 * (M - 1 - γ2) * atan(r - c2) + (1 + γ2)) / π
     else
         return (1 - γ1 - β) * (exp(r - c2) + β / (1 - γ1 - β))
     end
 end
+
+concrete_jac(_) = nothing
+concrete_jac(::AbstractNewtonAlgorithm{CJ}) where {CJ} = CJ

From 4fe75e8a05ebc413e3ebcafd6b43adb38a880b86 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Thu, 7 Sep 2023 17:58:55 -0400
Subject: [PATCH 02/19] Incorporate upstream changes in NonlinearSolve.jl

---
 .gitignore            |   1 +
 Project.toml          |   2 +-
 src/NonlinearSolve.jl |   3 +-
 src/jacobian.jl       |  46 ++--
 src/levenberg.jl      | 616 +++++++++++++++++++++---------------------
 src/raphson.jl        |  10 +-
 6 files changed, 346 insertions(+), 332 deletions(-)

diff --git a/.gitignore b/.gitignore
index aa4ff57e3..2f8d95920 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ Manifest.toml
 docs/src/assets/Project.toml
 
 .vscode
+wip
diff --git a/Project.toml b/Project.toml
index db9ad0d35..b1724f423 100644
--- a/Project.toml
+++ b/Project.toml
@@ -33,7 +33,7 @@ LinearSolve = "2"
 PrecompileTools = "1"
 RecursiveArrayTools = "2"
 Reexport = "0.2, 1"
-SciMLBase = "1.92.4"
+SciMLBase = "1.97"
 SimpleNonlinearSolve = "0.1"
 SparseDiffTools = "1, 2"
 StaticArraysCore = "1.4"
diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl
index 38a4b6142..b774b7953 100644
--- a/src/NonlinearSolve.jl
+++ b/src/NonlinearSolve.jl
@@ -8,7 +8,7 @@ using DiffEqBase, LinearAlgebra, LinearSolve, SparseDiffTools
 import ForwardDiff
 
 import ADTypes: AbstractFiniteDifferencesMode
-import ArrayInterface: undefmatrix
+import ArrayInterface: undefmatrix, matrix_colors
 import ConcreteStructs: @concrete
 import EnumX: @enumx
 import ForwardDiff: Dual
@@ -16,7 +16,6 @@ import LinearSolve: ComposePreconditioner, InvPreconditioner, needs_concrete_A
 import RecursiveArrayTools: AbstractVectorOfArray, recursivecopy!, recursivefill!
 import Reexport: @reexport
 import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace
-import SparseDiffTools: __init_𝒥
 import StaticArraysCore: StaticArray, SVector
 import UnPack: @unpack
 
diff --git a/src/jacobian.jl b/src/jacobian.jl
index dfa8b1212..2a96432d7 100644
--- a/src/jacobian.jl
+++ b/src/jacobian.jl
@@ -6,12 +6,27 @@ end
 (uf::JacobianWrapper)(u) = uf.f(u, uf.p)
 (uf::JacobianWrapper)(res, u) = uf.f(res, u, uf.p)
 
-# function sparsity_colorvec(f, x)
-#     sparsity = f.sparsity
-#     colorvec = DiffEqBase.has_colorvec(f) ? f.colorvec :
-#                (isnothing(sparsity) ? (1:length(x)) : matrix_colors(sparsity))
-#     sparsity, colorvec
-# end
+# FIXME: This is a deviation from older versions. Previously if sparsity and colorvec were
+#        provided we would use a sparse AD. Right now it requires an explicit specification
+sparsity_detection_alg(f, ad) = NoSparsityDetection()
+function sparsity_detection_alg(f, ad::AbstractSparseADType)
+    if f.sparsity === nothing
+        if f.jac_prototype === nothing
+            return SymbolicsSparsityDetection()
+        else
+            jac_prototype = f.jac_prototype
+        end
+    else
+        jac_prototype = f.sparsity
+    end
+
+    if SciMLBase.has_colorvec(f)
+        return PrecomputedJacobianColorvec(; jac_prototype, f.colorvec,
+            partition_by_rows = ad isa ADTypes.AbstractSparseReverseMode)
+    else
+        return JacPrototypeSparsityDetection(; jac_prototype)
+    end
+end
 
 # NoOp for Jacobian if it is not a Abstract Array -- For eg, JacVec Operator
 jacobian!!(J, _) = J
@@ -41,14 +56,13 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
                              needs_concrete_A(alg.linsolve)))))
     alg_wants_jac = (concrete_jac(alg) === nothing && concrete_jac(alg))
 
-    fu = zero(u)  # TODO: Use Prototype
+    # NOTE: The deepcopy is needed here since we are using the resid_prototype elsewhere
+    fu = f.resid_prototype === nothing ? (iip ? zero(u) : f(u, p)) :
+         deepcopy(f.resid_prototype)
     if !has_analytic_jac && (linsolve_needs_jac || alg_wants_jac)
-        # TODO: We need an Upstream Mode to allow using known sparsity and colorvec
-        # TODO: We can use the jacobian prototype here
-        sd = typeof(alg.ad) <: AbstractSparseADType ? SymbolicsSparsityDetection() :
-             NoSparsityDetection()
+        sd = sparsity_detection_alg(f, alg.ad)
         jac_cache = iip ? sparse_jacobian_cache(alg.ad, sd, uf, fu, u) :
-                    sparse_jacobian_cache(alg.ad, sd, uf, u; fx=fu)
+                    sparse_jacobian_cache(alg.ad, sd, uf, u; fx = fu)
     else
         jac_cache = nothing
     end
@@ -60,12 +74,12 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
         if has_analytic_jac
             iip ? undefmatrix(u) : nothing
         else
-            f.jac_prototype === nothing ? __init_𝒥(jac_cache) : f.jac_prototype
+            f.jac_prototype === nothing ? init_jacobian(jac_cache) : f.jac_prototype
         end
     end
 
-    # FIXME: Assumes same sized `u` and `fu` -- Incorrect Assumption for Levenberg
-    linprob = LinearProblem(J, _vec(zero(u)); u0 = _vec(zero(u)))
+    du = zero(u)
+    linprob = LinearProblem(J, _vec(fu); u0 = _vec(du))
 
     weight = similar(u)
     recursivefill!(weight, true)
@@ -74,5 +88,5 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
             nothing)..., weight)
     linsolve = init(linprob, alg.linsolve; alias_A = true, alias_b = true, Pl, Pr)
 
-    return uf, linsolve, J, fu, jac_cache
+    return uf, linsolve, J, fu, jac_cache, du
 end
diff --git a/src/levenberg.jl b/src/levenberg.jl
index 721e08cd3..15956c7df 100644
--- a/src/levenberg.jl
+++ b/src/levenberg.jl
@@ -1,335 +1,335 @@
-"""
-    LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
-        precs = DEFAULT_PRECS, damping_initial::Real = 1.0,
-        damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0,
-        finite_diff_step_geodesic::Real = 0.1, α_geodesic::Real = 0.75,
-        b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, adkwargs...)
+# """
+#     LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+#         precs = DEFAULT_PRECS, damping_initial::Real = 1.0,
+#         damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0,
+#         finite_diff_step_geodesic::Real = 0.1, α_geodesic::Real = 0.75,
+#         b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, adkwargs...)
 
-An advanced Levenberg-Marquardt implementation with the improvements suggested in the
-[paper](https://arxiv.org/abs/1201.5885) "Improvements to the Levenberg-Marquardt
-algorithm for nonlinear least-squares minimization". Designed for large-scale and
-numerically-difficult nonlinear systems.
+# An advanced Levenberg-Marquardt implementation with the improvements suggested in the
+# [paper](https://arxiv.org/abs/1201.5885) "Improvements to the Levenberg-Marquardt
+# algorithm for nonlinear least-squares minimization". Designed for large-scale and
+# numerically-difficult nonlinear systems.
 
-### Keyword Arguments
+# ### Keyword Arguments
 
-  - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
-      ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
-      `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
-  - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
-    then the Jacobian will not be constructed and instead direct Jacobian-vector products
-    `J*v` are computed using forward-mode automatic differentiation or finite differencing
-    tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
-    for example for a preconditioner, `concrete_jac = true` can be passed in order to force
-    the construction of the Jacobian.
-  - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
-    linear solves within the Newton method. Defaults to `nothing`, which means it uses the
-    LinearSolve.jl default algorithm choice. For more information on available algorithm
-    choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-  - `precs`: the choice of preconditioners for the linear solver. Defaults to using no
-    preconditioners. For more information on specifying preconditioners for LinearSolve
-    algorithms, consult the
-    [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-  - `damping_initial`: the starting value for the damping factor. The damping factor is
-    inversely proportional to the step size. The damping factor is adjusted during each
-    iteration. Defaults to `1.0`. For more details, see section 2.1 of
-    [this paper](https://arxiv.org/abs/1201.5885).
-  - `damping_increase_factor`: the factor by which the damping is increased if a step is
-    rejected. Defaults to `2.0`. For more details, see section 2.1 of
-    [this paper](https://arxiv.org/abs/1201.5885).
-  - `damping_decrease_factor`: the factor by which the damping is decreased if a step is
-    accepted. Defaults to `3.0`. For more details, see section 2.1 of
-    [this paper](https://arxiv.org/abs/1201.5885).
-  - `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate
-    the geodesic acceleration. Defaults to `0.1` which means that the step size is
-    approximately 10% of the first-order step. For more details, see section 3 of
-    [this paper](https://arxiv.org/abs/1201.5885).
-  - `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
-    geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
-    that acceptable steps meet the condition
-    ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
-    acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
-    path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
-    is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
-    effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
-    [this paper](https://arxiv.org/abs/1201.5885).
-  - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
-    choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
-    and reject all steps that increase the cost. Although this is a natural and safe choice,
-    it is often not the most efficient. Therefore downhill moves are always accepted, but
-    uphill moves are only conditionally accepted. To decide whether an uphill move will be
-    accepted at each iteration ``i``, we compute
-    ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle
-    between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted
-    step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To
-    specify, uphill moves are accepted if
-    ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
-    iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
-    allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
-    will be accepted. Defaults to `1.0`. For more details, see section 4 of
-    [this paper](https://arxiv.org/abs/1201.5885).
-  - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
-    `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
-    where `J` is the Jacobian. It is suggested by
-    [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
-    `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
-"""
-@concrete struct LevenbergMarquardt{CJ, AD, T} <: AbstractNewtonAlgorithm{CJ, AD}
-    ad::AD
-    linsolve
-    precs
-    damping_initial::T
-    damping_increase_factor::T
-    damping_decrease_factor::T
-    finite_diff_step_geodesic::T
-    α_geodesic::T
-    b_uphill::T
-    min_damping_D::T
-end
+#   - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
+#       ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
+#       `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
+#   - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
+#     then the Jacobian will not be constructed and instead direct Jacobian-vector products
+#     `J*v` are computed using forward-mode automatic differentiation or finite differencing
+#     tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
+#     for example for a preconditioner, `concrete_jac = true` can be passed in order to force
+#     the construction of the Jacobian.
+#   - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
+#     linear solves within the Newton method. Defaults to `nothing`, which means it uses the
+#     LinearSolve.jl default algorithm choice. For more information on available algorithm
+#     choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+#   - `precs`: the choice of preconditioners for the linear solver. Defaults to using no
+#     preconditioners. For more information on specifying preconditioners for LinearSolve
+#     algorithms, consult the
+#     [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+#   - `damping_initial`: the starting value for the damping factor. The damping factor is
+#     inversely proportional to the step size. The damping factor is adjusted during each
+#     iteration. Defaults to `1.0`. For more details, see section 2.1 of
+#     [this paper](https://arxiv.org/abs/1201.5885).
+#   - `damping_increase_factor`: the factor by which the damping is increased if a step is
+#     rejected. Defaults to `2.0`. For more details, see section 2.1 of
+#     [this paper](https://arxiv.org/abs/1201.5885).
+#   - `damping_decrease_factor`: the factor by which the damping is decreased if a step is
+#     accepted. Defaults to `3.0`. For more details, see section 2.1 of
+#     [this paper](https://arxiv.org/abs/1201.5885).
+#   - `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate
+#     the geodesic acceleration. Defaults to `0.1` which means that the step size is
+#     approximately 10% of the first-order step. For more details, see section 3 of
+#     [this paper](https://arxiv.org/abs/1201.5885).
+#   - `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
+#     geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
+#     that acceptable steps meet the condition
+#     ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
+#     acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
+#     path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
+#     is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
+#     effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
+#     [this paper](https://arxiv.org/abs/1201.5885).
+#   - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
+#     choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
+#     and reject all steps that increase the cost. Although this is a natural and safe choice,
+#     it is often not the most efficient. Therefore downhill moves are always accepted, but
+#     uphill moves are only conditionally accepted. To decide whether an uphill move will be
+#     accepted at each iteration ``i``, we compute
+#     ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle
+#     between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted
+#     step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To
+#     specify, uphill moves are accepted if
+#     ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
+#     iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
+#     allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
+#     will be accepted. Defaults to `1.0`. For more details, see section 4 of
+#     [this paper](https://arxiv.org/abs/1201.5885).
+#   - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
+#     `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
+#     where `J` is the Jacobian. It is suggested by
+#     [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
+#     `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
+# """
+# @concrete struct LevenbergMarquardt{CJ, AD, T} <: AbstractNewtonAlgorithm{CJ, AD}
+#     ad::AD
+#     linsolve
+#     precs
+#     damping_initial::T
+#     damping_increase_factor::T
+#     damping_decrease_factor::T
+#     finite_diff_step_geodesic::T
+#     α_geodesic::T
+#     b_uphill::T
+#     min_damping_D::T
+# end
 
-function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
-    precs = DEFAULT_PRECS, damping_initial::Real = 1.0, damping_increase_factor::Real = 2.0,
-    damping_decrease_factor::Real = 3.0, finite_diff_step_geodesic::Real = 0.1,
-    α_geodesic::Real = 0.75, b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8,
-    adkwargs...)
-    ad = default_adargs_to_adtype(adkwargs...)
-    return LevenbergMarquardt{_unwrap_val(concrete_jac)}(ad, linsolve, precs,
-        damping_initial, damping_increase_factor, damping_decrease_factor,
-        finite_diff_step_geodesic, α_geodesic, b_uphill, min_damping_D)
-end
+# function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+#     precs = DEFAULT_PRECS, damping_initial::Real = 1.0, damping_increase_factor::Real = 2.0,
+#     damping_decrease_factor::Real = 3.0, finite_diff_step_geodesic::Real = 0.1,
+#     α_geodesic::Real = 0.75, b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8,
+#     adkwargs...)
+#     ad = default_adargs_to_adtype(adkwargs...)
+#     return LevenbergMarquardt{_unwrap_val(concrete_jac)}(ad, linsolve, precs,
+#         damping_initial, damping_increase_factor, damping_decrease_factor,
+#         finite_diff_step_geodesic, α_geodesic, b_uphill, min_damping_D)
+# end
 
-@concrete mutable struct LevenbergMarquardtCache{iip, uType, jType, λType, lossType}
-    f
-    alg
-    u::uType
-    fu1
-    fu2
-    du
-    p
-    uf
-    linsolve
-    J::jType
-    jac_cache
-    force_stop::Bool
-    maxiters::Int
-    internalnorm
-    retcode::ReturnCode.T
-    abstol
-    prob
-    DᵀD
-    JᵀJ::jType
-    λ::λType
-    λ_factor::λType
-    damping_increase_factor::λType
-    damping_decrease_factor::λType
-    h::λType
-    α_geodesic::λType
-    b_uphill::λType
-    min_damping_D::λType
-    v::uType
-    a::uType
-    tmp_vec::uType
-    v_old::uType
-    norm_v_old::lossType
-    δ::uType
-    loss_old::lossType
-    make_new_J::Bool
-    fu_tmp
-    mat_tmp::jType
-    stats::NLStats
-end
+# @concrete mutable struct LevenbergMarquardtCache{iip, uType, jType, λType, lossType}
+#     f
+#     alg
+#     u::uType
+#     fu1
+#     fu2
+#     du
+#     p
+#     uf
+#     linsolve
+#     J::jType
+#     jac_cache
+#     force_stop::Bool
+#     maxiters::Int
+#     internalnorm
+#     retcode::ReturnCode.T
+#     abstol
+#     prob
+#     DᵀD
+#     JᵀJ::jType
+#     λ::λType
+#     λ_factor::λType
+#     damping_increase_factor::λType
+#     damping_decrease_factor::λType
+#     h::λType
+#     α_geodesic::λType
+#     b_uphill::λType
+#     min_damping_D::λType
+#     v::uType
+#     a::uType
+#     tmp_vec::uType
+#     v_old::uType
+#     norm_v_old::lossType
+#     δ::uType
+#     loss_old::lossType
+#     make_new_J::Bool
+#     fu_tmp
+#     mat_tmp::jType
+#     stats::NLStats
+# end
 
-isinplace(::LevenbergMarquardtCache{iip}) where {iip} = iip
+# isinplace(::LevenbergMarquardtCache{iip}) where {iip} = iip
 
-function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarquardt,
-    args...; alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
-    kwargs...) where {uType, iip}
-    @unpack f, u0, p = prob
-    u = alias_u0 ? u0 : deepcopy(u0)
-    if iip
-        fu1 = zero(u)  # TODO: Use Prototype
-        f(fu1, u, p)
-    else
-        fu1 = f(u, p)
-    end
-    uf, linsolve, J, fu2, jac_cache = jacobian_caches(alg, f, u, p, Val(iip))
+# function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarquardt,
+#     args...; alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
+#     kwargs...) where {uType, iip}
+#     @unpack f, u0, p = prob
+#     u = alias_u0 ? u0 : deepcopy(u0)
+#     if iip
+#         fu1 = zero(u)  # TODO: Use Prototype
+#         f(fu1, u, p)
+#     else
+#         fu1 = f(u, p)
+#     end
+#     uf, linsolve, J, fu2, jac_cache = jacobian_caches(alg, f, u, p, Val(iip))
 
-    λ = convert(eltype(u), alg.damping_initial)
-    λ_factor = convert(eltype(u), alg.damping_increase_factor)
-    damping_increase_factor = convert(eltype(u), alg.damping_increase_factor)
-    damping_decrease_factor = convert(eltype(u), alg.damping_decrease_factor)
-    h = convert(eltype(u), alg.finite_diff_step_geodesic)
-    α_geodesic = convert(eltype(u), alg.α_geodesic)
-    b_uphill = convert(eltype(u), alg.b_uphill)
-    min_damping_D = convert(eltype(u), alg.min_damping_D)
+#     λ = convert(eltype(u), alg.damping_initial)
+#     λ_factor = convert(eltype(u), alg.damping_increase_factor)
+#     damping_increase_factor = convert(eltype(u), alg.damping_increase_factor)
+#     damping_decrease_factor = convert(eltype(u), alg.damping_decrease_factor)
+#     h = convert(eltype(u), alg.finite_diff_step_geodesic)
+#     α_geodesic = convert(eltype(u), alg.α_geodesic)
+#     b_uphill = convert(eltype(u), alg.b_uphill)
+#     min_damping_D = convert(eltype(u), alg.min_damping_D)
 
-    if u isa Number
-        DᵀD = min_damping_D
-    else
-        d = similar(u)
-        d .= min_damping_D
-        DᵀD = Diagonal(d)
-    end
+#     if u isa Number
+#         DᵀD = min_damping_D
+#     else
+#         d = similar(u)
+#         d .= min_damping_D
+#         DᵀD = Diagonal(d)
+#     end
 
-    loss = internalnorm(fu1)
-    JᵀJ = zero(J)
-    v = zero(u)
-    a = zero(u)
-    tmp_vec = zero(u)
-    v_old = zero(u)
-    δ = zero(u)
-    make_new_J = true
-    fu_tmp = zero(fu1)
-    mat_tmp = zero(J)
+#     loss = internalnorm(fu1)
+#     JᵀJ = zero(J)
+#     v = zero(u)
+#     a = zero(u)
+#     tmp_vec = zero(u)
+#     v_old = zero(u)
+#     δ = zero(u)
+#     make_new_J = true
+#     fu_tmp = zero(fu1)
+#     mat_tmp = zero(J)
 
-    return LevenbergMarquardtCache{iip}(f, alg, u, fu1, fu2, zero(u), p, uf, linsolve, J,
-        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob, DᵀD,
-        JᵀJ, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, α_geodesic,
-        b_uphill, min_damping_D, v, a, tmp_vec, v_old, loss, δ, loss, make_new_J, fu_tmp,
-        mat_tmp, NLStats(1, 0, 0, 0, 0))
-end
+#     return LevenbergMarquardtCache{iip}(f, alg, u, fu1, fu2, zero(u), p, uf, linsolve, J,
+#         jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob, DᵀD,
+#         JᵀJ, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, α_geodesic,
+#         b_uphill, min_damping_D, v, a, tmp_vec, v_old, loss, δ, loss, make_new_J, fu_tmp,
+#         mat_tmp, NLStats(1, 0, 0, 0, 0))
+# end
 
-function perform_step!(cache::LevenbergMarquardtCache{true})
-    @unpack fu1, f, make_new_J = cache
-    if iszero(fu1)
-        cache.force_stop = true
-        return nothing
-    end
+# function perform_step!(cache::LevenbergMarquardtCache{true})
+#     @unpack fu1, f, make_new_J = cache
+#     if iszero(fu1)
+#         cache.force_stop = true
+#         return nothing
+#     end
 
-    if make_new_J
-        jacobian!!(cache.J, cache)
-        mul!(cache.JᵀJ, cache.J', cache.J)
-        cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
-        cache.make_new_J = false
-        cache.stats.njacs += 1
-    end
-    @unpack u, p, λ, JᵀJ, DᵀD, J, alg, linsolve = cache
+#     if make_new_J
+#         jacobian!!(cache.J, cache)
+#         mul!(cache.JᵀJ, cache.J', cache.J)
+#         cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
+#         cache.make_new_J = false
+#         cache.stats.njacs += 1
+#     end
+#     @unpack u, p, λ, JᵀJ, DᵀD, J, alg, linsolve = cache
 
-    # Usual Levenberg-Marquardt step ("velocity").
-    # The following lines do: cache.v = -cache.mat_tmp \ cache.fu_tmp
-    mul!(cache.fu_tmp, J', fu1)
-    @. cache.mat_tmp = JᵀJ + λ * DᵀD
-    linres = dolinsolve(alg.precs, linsolve, A = cache.mat_tmp, b = _vec(cache.fu_tmp),
-        linu = _vec(cache.du), p = p, reltol = cache.abstol)
-    cache.linsolve = linres.cache
-    @. cache.v = -cache.du
+#     # Usual Levenberg-Marquardt step ("velocity").
+#     # The following lines do: cache.v = -cache.mat_tmp \ cache.fu_tmp
+#     mul!(cache.fu_tmp, J', fu1)
+#     @. cache.mat_tmp = JᵀJ + λ * DᵀD
+#     linres = dolinsolve(alg.precs, linsolve, A = cache.mat_tmp, b = _vec(cache.fu_tmp),
+#         linu = _vec(cache.du), p = p, reltol = cache.abstol)
+#     cache.linsolve = linres.cache
+#     @. cache.v = -cache.du
 
-    # Geodesic acceleration (step_size = v + a / 2).
-    @unpack v, α_geodesic, h = cache
-    f(cache.fu_tmp, u .+ h .* v, p)
+#     # Geodesic acceleration (step_size = v + a / 2).
+#     @unpack v, α_geodesic, h = cache
+#     f(cache.fu_tmp, u .+ h .* v, p)
 
-    # The following lines do: cache.a = -J \ cache.fu_tmp
-    mul!(cache.du, J, v)
-    @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.du)
-    linres = dolinsolve(alg.precs, linsolve, A = J, b = _vec(cache.fu_tmp),
-        linu = _vec(cache.du), p = p, reltol = cache.abstol)
-    cache.linsolve = linres.cache
-    @. cache.a = -cache.du
-    cache.stats.nsolve += 2
-    cache.stats.nfactors += 2
+#     # The following lines do: cache.a = -J \ cache.fu_tmp
+#     mul!(cache.du, J, v)
+#     @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.du)
+#     linres = dolinsolve(alg.precs, linsolve, A = J, b = _vec(cache.fu_tmp),
+#         linu = _vec(cache.du), p = p, reltol = cache.abstol)
+#     cache.linsolve = linres.cache
+#     @. cache.a = -cache.du
+#     cache.stats.nsolve += 2
+#     cache.stats.nfactors += 2
 
-    # Require acceptable steps to satisfy the following condition.
-    norm_v = norm(v)
-    if (2 * norm(cache.a) / norm_v) < α_geodesic
-        @. cache.δ = v + cache.a / 2
-        @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
-        f(cache.fu_tmp, u .+ δ, p)
-        cache.stats.nf += 1
-        loss = cache.internalnorm(cache.fu_tmp)
+#     # Require acceptable steps to satisfy the following condition.
+#     norm_v = norm(v)
+#     if (2 * norm(cache.a) / norm_v) < α_geodesic
+#         @. cache.δ = v + cache.a / 2
+#         @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
+#         f(cache.fu_tmp, u .+ δ, p)
+#         cache.stats.nf += 1
+#         loss = cache.internalnorm(cache.fu_tmp)
 
-        # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
-        β = dot(v, v_old) / (norm_v * norm_v_old)
-        if (1 - β)^b_uphill * loss ≤ loss_old
-            # Accept step.
-            cache.u .+= δ
-            if loss < cache.abstol
-                cache.force_stop = true
-                return nothing
-            end
-            cache.fu1 .= cache.fu_tmp
-            cache.v_old .= v
-            cache.norm_v_old = norm_v
-            cache.loss_old = loss
-            cache.λ_factor = 1 / cache.damping_decrease_factor
-            cache.make_new_J = true
-        end
-    end
-    cache.λ *= cache.λ_factor
-    cache.λ_factor = cache.damping_increase_factor
-    return nothing
-end
+#         # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
+#         β = dot(v, v_old) / (norm_v * norm_v_old)
+#         if (1 - β)^b_uphill * loss ≤ loss_old
+#             # Accept step.
+#             cache.u .+= δ
+#             if loss < cache.abstol
+#                 cache.force_stop = true
+#                 return nothing
+#             end
+#             cache.fu1 .= cache.fu_tmp
+#             cache.v_old .= v
+#             cache.norm_v_old = norm_v
+#             cache.loss_old = loss
+#             cache.λ_factor = 1 / cache.damping_decrease_factor
+#             cache.make_new_J = true
+#         end
+#     end
+#     cache.λ *= cache.λ_factor
+#     cache.λ_factor = cache.damping_increase_factor
+#     return nothing
+# end
 
-function perform_step!(cache::LevenbergMarquardtCache{false})
-    @unpack fu1, f, make_new_J = cache
-    if iszero(fu1)
-        cache.force_stop = true
-        return nothing
-    end
+# function perform_step!(cache::LevenbergMarquardtCache{false})
+#     @unpack fu1, f, make_new_J = cache
+#     if iszero(fu1)
+#         cache.force_stop = true
+#         return nothing
+#     end
 
-    if make_new_J
-        cache.J = jacobian!!(cache.J, cache)
-        cache.JᵀJ = cache.J' * cache.J
-        if cache.JᵀJ isa Number
-            cache.DᵀD = max(cache.DᵀD, cache.JᵀJ)
-        else
-            cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
-        end
-        cache.make_new_J = false
-        cache.stats.njacs += 1
-    end
-    @unpack u, p, λ, JᵀJ, DᵀD, J = cache
+#     if make_new_J
+#         cache.J = jacobian!!(cache.J, cache)
+#         cache.JᵀJ = cache.J' * cache.J
+#         if cache.JᵀJ isa Number
+#             cache.DᵀD = max(cache.DᵀD, cache.JᵀJ)
+#         else
+#             cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
+#         end
+#         cache.make_new_J = false
+#         cache.stats.njacs += 1
+#     end
+#     @unpack u, p, λ, JᵀJ, DᵀD, J = cache
 
-    # Usual Levenberg-Marquardt step ("velocity").
-    cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu1)
+#     # Usual Levenberg-Marquardt step ("velocity").
+#     cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu1)
 
-    @unpack v, h, α_geodesic = cache
-    # Geodesic acceleration (step_size = v + a / 2).
-    cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu1) ./ h .- J * v))
-    cache.stats.nsolve += 1
-    cache.stats.nfactors += 1
+#     @unpack v, h, α_geodesic = cache
+#     # Geodesic acceleration (step_size = v + a / 2).
+#     cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu1) ./ h .- J * v))
+#     cache.stats.nsolve += 1
+#     cache.stats.nfactors += 1
 
-    # Require acceptable steps to satisfy the following condition.
-    norm_v = norm(v)
-    if (2 * norm(cache.a) / norm_v) < α_geodesic
-        cache.δ = v .+ cache.a ./ 2
-        @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
-        fu_new = f(u .+ δ, p)
-        cache.stats.nf += 1
-        loss = cache.internalnorm(fu_new)
+#     # Require acceptable steps to satisfy the following condition.
+#     norm_v = norm(v)
+#     if (2 * norm(cache.a) / norm_v) < α_geodesic
+#         cache.δ = v .+ cache.a ./ 2
+#         @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
+#         fu_new = f(u .+ δ, p)
+#         cache.stats.nf += 1
+#         loss = cache.internalnorm(fu_new)
 
-        # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
-        β = dot(v, v_old) / (norm_v * norm_v_old)
-        if (1 - β)^b_uphill * loss ≤ loss_old
-            # Accept step.
-            cache.u += δ
-            if loss < cache.abstol
-                cache.force_stop = true
-                return nothing
-            end
-            cache.fu1 = fu_new
-            cache.v_old = v
-            cache.norm_v_old = norm_v
-            cache.loss_old = loss
-            cache.λ_factor = 1 / cache.damping_decrease_factor
-            cache.make_new_J = true
-        end
-    end
-    cache.λ *= cache.λ_factor
-    cache.λ_factor = cache.damping_increase_factor
-    return nothing
-end
+#         # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
+#         β = dot(v, v_old) / (norm_v * norm_v_old)
+#         if (1 - β)^b_uphill * loss ≤ loss_old
+#             # Accept step.
+#             cache.u += δ
+#             if loss < cache.abstol
+#                 cache.force_stop = true
+#                 return nothing
+#             end
+#             cache.fu1 = fu_new
+#             cache.v_old = v
+#             cache.norm_v_old = norm_v
+#             cache.loss_old = loss
+#             cache.λ_factor = 1 / cache.damping_decrease_factor
+#             cache.make_new_J = true
+#         end
+#     end
+#     cache.λ *= cache.λ_factor
+#     cache.λ_factor = cache.damping_increase_factor
+#     return nothing
+# end
 
-function SciMLBase.solve!(cache::LevenbergMarquardtCache)
-    while !cache.force_stop && cache.stats.nsteps < cache.maxiters
-        perform_step!(cache)
-        cache.stats.nsteps += 1
-    end
+# function SciMLBase.solve!(cache::LevenbergMarquardtCache)
+#     while !cache.force_stop && cache.stats.nsteps < cache.maxiters
+#         perform_step!(cache)
+#         cache.stats.nsteps += 1
+#     end
 
-    if cache.stats.nsteps == cache.maxiters
-        cache.retcode = ReturnCode.MaxIters
-    else
-        cache.retcode = ReturnCode.Success
-    end
+#     if cache.stats.nsteps == cache.maxiters
+#         cache.retcode = ReturnCode.MaxIters
+#     else
+#         cache.retcode = ReturnCode.Success
+#     end
 
-    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
-        cache.retcode, cache.stats)
-end
+#     return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
+#         cache.retcode, cache.stats)
+# end
diff --git a/src/raphson.jl b/src/raphson.jl
index d780d5077..9f7c1fb87 100644
--- a/src/raphson.jl
+++ b/src/raphson.jl
@@ -36,7 +36,7 @@ concrete_jac(::NewtonRaphson{CJ}) where {CJ} = CJ
 
 function NewtonRaphson(; concrete_jac = nothing, linsolve = nothing,
     precs = DEFAULT_PRECS, adkwargs...)
-    ad = default_adargs_to_adtype(adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)
     return NewtonRaphson{_unwrap_val(concrete_jac)}(ad, linsolve, precs)
 end
 
@@ -69,14 +69,14 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::NewtonRaphson
     @unpack f, u0, p = prob
     u = alias_u0 ? u0 : deepcopy(u0)
     if iip
-        fu1 = zero(u)  # TODO: Use Prototype
+        fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
         f(fu1, u, p)
     else
-        fu1 = f(u, p)
+        fu1 = f.resid_prototype === nothing ? f(u, p) : f.resid_prototype
     end
-    uf, linsolve, J, fu2, jac_cache = jacobian_caches(alg, f, u, p, Val(iip))
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
 
-    return NewtonRaphsonCache{iip}(f, alg, u, fu1, fu2, zero(u), p, uf, linsolve, J,
+    return NewtonRaphsonCache{iip}(f, alg, u, fu1, fu2, du, p, uf, linsolve, J,
         jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob,
         NLStats(1, 0, 0, 0, 0))
 end

From e87a82d213b0cf48dfccfc8bdff2b0d5531c56e6 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Fri, 8 Sep 2023 18:05:52 -0400
Subject: [PATCH 03/19] Patch broken solvers + better testing

---
 Project.toml          |    7 +-
 src/NonlinearSolve.jl |    9 +-
 src/jacobian.jl       |   31 +-
 src/levenberg.jl      |  616 +++++++++---------
 src/raphson.jl        |   18 +-
 src/trustRegion.jl    |  913 +++++++++++++--------------
 src/utils.jl          |   44 +-
 test/23_test_cases.jl |  510 ---------------
 test/basictests.jl    | 1388 +++++++++++++++++++----------------------
 test/runtests.jl      |   10 +-
 10 files changed, 1476 insertions(+), 2070 deletions(-)
 delete mode 100644 test/23_test_cases.jl

diff --git a/Project.toml b/Project.toml
index b1724f423..5033ab24a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -27,6 +27,7 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 ArrayInterface = "6.0.24, 7"
 DiffEqBase = "6"
 EnumX = "1"
+Enzyme = "0.11"
 FiniteDiff = "2"
 ForwardDiff = "0.10.3"
 LinearSolve = "2"
@@ -38,19 +39,23 @@ SimpleNonlinearSolve = "0.1"
 SparseDiffTools = "1, 2"
 StaticArraysCore = "1.4"
 UnPack = "1.0"
+Zygote = "0.6"
 julia = "1.6"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
+SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["BenchmarkTools", "SafeTestsets", "Pkg", "Test", "ForwardDiff", "StaticArrays", "Symbolics", "LinearSolve", "Random", "LinearAlgebra"]
+test = ["Enzyme", "BenchmarkTools", "SafeTestsets", "Pkg", "Test", "ForwardDiff", "StaticArrays", "Symbolics", "LinearSolve", "Random", "LinearAlgebra", "Zygote", "SparseDiffTools"]
diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl
index b774b7953..9fd4bb31d 100644
--- a/src/NonlinearSolve.jl
+++ b/src/NonlinearSolve.jl
@@ -8,15 +8,16 @@ using DiffEqBase, LinearAlgebra, LinearSolve, SparseDiffTools
 import ForwardDiff
 
 import ADTypes: AbstractFiniteDifferencesMode
-import ArrayInterface: undefmatrix, matrix_colors
+import ArrayInterface: undefmatrix, matrix_colors, parameterless_type, ismutable
 import ConcreteStructs: @concrete
 import EnumX: @enumx
 import ForwardDiff: Dual
 import LinearSolve: ComposePreconditioner, InvPreconditioner, needs_concrete_A
-import RecursiveArrayTools: AbstractVectorOfArray, recursivecopy!, recursivefill!
+import RecursiveArrayTools: ArrayPartition,
+    AbstractVectorOfArray, recursivecopy!, recursivefill!
 import Reexport: @reexport
 import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace
-import StaticArraysCore: StaticArray, SVector
+import StaticArraysCore: StaticArray, SVector, SArray, MArray
 import UnPack: @unpack
 
 @reexport using ADTypes, SciMLBase, SimpleNonlinearSolve
@@ -33,8 +34,6 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::AbstractNonlinearSolveAl
     return solve!(cache)
 end
 
-# FIXME: Scalar Case is Completely Broken
-
 include("utils.jl")
 include("raphson.jl")
 include("trustRegion.jl")
diff --git a/src/jacobian.jl b/src/jacobian.jl
index 2a96432d7..9c7f6e721 100644
--- a/src/jacobian.jl
+++ b/src/jacobian.jl
@@ -36,12 +36,16 @@ function jacobian!!(J::Union{AbstractMatrix{<:Number}, Nothing}, cache)
     @unpack f, uf, u, p, jac_cache, alg, fu2 = cache
     iip = isinplace(cache)
     if iip
-        has_jac(f) ? f.jac(J, u, p) : sparse_jacobian!(J, alg.ad, jac_cache, uf, fu2, u)
+        has_jac(f) ? f.jac(J, u, p) :
+        sparse_jacobian!(J, alg.ad, jac_cache, uf, fu2, _maybe_mutable(u, alg.ad))
     else
-        return has_jac(f) ? f.jac(u, p) : sparse_jacobian!(J, alg.ad, jac_cache, uf, u)
+        return has_jac(f) ? f.jac(u, p) :
+               sparse_jacobian!(J, alg.ad, jac_cache, uf, _maybe_mutable(u, alg.ad))
     end
-    return nothing
+    return J
 end
+# Scalar case
+jacobian!!(::Number, cache) = last(value_derivative(cache.uf, cache.u))
 
 # Build Jacobian Caches
 function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
@@ -54,15 +58,16 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
     linsolve_needs_jac = (concrete_jac(alg) === nothing &&
                           (!haslinsolve || (haslinsolve && (alg.linsolve === nothing ||
                              needs_concrete_A(alg.linsolve)))))
-    alg_wants_jac = (concrete_jac(alg) === nothing && concrete_jac(alg))
+    alg_wants_jac = (concrete_jac(alg) !== nothing && concrete_jac(alg))
 
     # NOTE: The deepcopy is needed here since we are using the resid_prototype elsewhere
-    fu = f.resid_prototype === nothing ? (iip ? zero(u) : f(u, p)) :
-         deepcopy(f.resid_prototype)
+    fu = f.resid_prototype === nothing ? (iip ? _mutable_zero(u) : _mutable(f(u, p))) :
+         (iip ? deepcopy(f.resid_prototype) : f.resid_prototype)
     if !has_analytic_jac && (linsolve_needs_jac || alg_wants_jac)
         sd = sparsity_detection_alg(f, alg.ad)
-        jac_cache = iip ? sparse_jacobian_cache(alg.ad, sd, uf, fu, u) :
-                    sparse_jacobian_cache(alg.ad, sd, uf, u; fx = fu)
+        ad = alg.ad
+        jac_cache = iip ? sparse_jacobian_cache(ad, sd, uf, fu, _maybe_mutable(u, ad)) :
+                    sparse_jacobian_cache(ad, sd, uf, _maybe_mutable(u, ad); fx = fu)
     else
         jac_cache = nothing
     end
@@ -78,7 +83,7 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
         end
     end
 
-    du = zero(u)
+    du = _mutable_zero(u)
     linprob = LinearProblem(J, _vec(fu); u0 = _vec(du))
 
     weight = similar(u)
@@ -90,3 +95,11 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
 
     return uf, linsolve, J, fu, jac_cache, du
 end
+
+## Special Handling for Scalars
+function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u::Number, p,
+    ::Val{false})
+    # NOTE: Scalar `u` assumes scalar output from `f`
+    uf = JacobianWrapper(f, p)
+    return uf, nothing, u, nothing, nothing, u
+end
diff --git a/src/levenberg.jl b/src/levenberg.jl
index 15956c7df..6265eba3f 100644
--- a/src/levenberg.jl
+++ b/src/levenberg.jl
@@ -1,335 +1,335 @@
-# """
-#     LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
-#         precs = DEFAULT_PRECS, damping_initial::Real = 1.0,
-#         damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0,
-#         finite_diff_step_geodesic::Real = 0.1, α_geodesic::Real = 0.75,
-#         b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, adkwargs...)
+"""
+    LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+        precs = DEFAULT_PRECS, damping_initial::Real = 1.0,
+        damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0,
+        finite_diff_step_geodesic::Real = 0.1, α_geodesic::Real = 0.75,
+        b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, adkwargs...)
 
-# An advanced Levenberg-Marquardt implementation with the improvements suggested in the
-# [paper](https://arxiv.org/abs/1201.5885) "Improvements to the Levenberg-Marquardt
-# algorithm for nonlinear least-squares minimization". Designed for large-scale and
-# numerically-difficult nonlinear systems.
+An advanced Levenberg-Marquardt implementation with the improvements suggested in the
+[paper](https://arxiv.org/abs/1201.5885) "Improvements to the Levenberg-Marquardt
+algorithm for nonlinear least-squares minimization". Designed for large-scale and
+numerically-difficult nonlinear systems.
 
-# ### Keyword Arguments
+### Keyword Arguments
 
-#   - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
-#       ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
-#       `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
-#   - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
-#     then the Jacobian will not be constructed and instead direct Jacobian-vector products
-#     `J*v` are computed using forward-mode automatic differentiation or finite differencing
-#     tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
-#     for example for a preconditioner, `concrete_jac = true` can be passed in order to force
-#     the construction of the Jacobian.
-#   - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
-#     linear solves within the Newton method. Defaults to `nothing`, which means it uses the
-#     LinearSolve.jl default algorithm choice. For more information on available algorithm
-#     choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-#   - `precs`: the choice of preconditioners for the linear solver. Defaults to using no
-#     preconditioners. For more information on specifying preconditioners for LinearSolve
-#     algorithms, consult the
-#     [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-#   - `damping_initial`: the starting value for the damping factor. The damping factor is
-#     inversely proportional to the step size. The damping factor is adjusted during each
-#     iteration. Defaults to `1.0`. For more details, see section 2.1 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `damping_increase_factor`: the factor by which the damping is increased if a step is
-#     rejected. Defaults to `2.0`. For more details, see section 2.1 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `damping_decrease_factor`: the factor by which the damping is decreased if a step is
-#     accepted. Defaults to `3.0`. For more details, see section 2.1 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate
-#     the geodesic acceleration. Defaults to `0.1` which means that the step size is
-#     approximately 10% of the first-order step. For more details, see section 3 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
-#     geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
-#     that acceptable steps meet the condition
-#     ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
-#     acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
-#     path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
-#     is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
-#     effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
-#     choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
-#     and reject all steps that increase the cost. Although this is a natural and safe choice,
-#     it is often not the most efficient. Therefore downhill moves are always accepted, but
-#     uphill moves are only conditionally accepted. To decide whether an uphill move will be
-#     accepted at each iteration ``i``, we compute
-#     ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle
-#     between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted
-#     step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To
-#     specify, uphill moves are accepted if
-#     ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
-#     iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
-#     allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
-#     will be accepted. Defaults to `1.0`. For more details, see section 4 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
-#     `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
-#     where `J` is the Jacobian. It is suggested by
-#     [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
-#     `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
-# """
-# @concrete struct LevenbergMarquardt{CJ, AD, T} <: AbstractNewtonAlgorithm{CJ, AD}
-#     ad::AD
-#     linsolve
-#     precs
-#     damping_initial::T
-#     damping_increase_factor::T
-#     damping_decrease_factor::T
-#     finite_diff_step_geodesic::T
-#     α_geodesic::T
-#     b_uphill::T
-#     min_damping_D::T
-# end
+  - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
+      ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
+      `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
+  - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
+    then the Jacobian will not be constructed and instead direct Jacobian-vector products
+    `J*v` are computed using forward-mode automatic differentiation or finite differencing
+    tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
+    for example for a preconditioner, `concrete_jac = true` can be passed in order to force
+    the construction of the Jacobian.
+  - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) solver used
+    for the linear solves within each iteration. Defaults to `nothing`, which means it uses
+    the LinearSolve.jl default algorithm choice. For more information on available algorithm
+    choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+  - `precs`: the choice of preconditioners for the linear solver. Defaults to using no
+    preconditioners. For more information on specifying preconditioners for LinearSolve
+    algorithms, consult the
+    [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+  - `damping_initial`: the starting value for the damping factor. The damping factor is
+    inversely proportional to the step size. The damping factor is adjusted during each
+    iteration. Defaults to `1.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `damping_increase_factor`: the factor by which the damping is increased if a step is
+    rejected. Defaults to `2.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `damping_decrease_factor`: the factor by which the damping is decreased if a step is
+    accepted. Defaults to `3.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `finite_diff_step_geodesic`: the step size of the finite difference used to calculate
+    the geodesic acceleration. Defaults to `0.1`, which means that the step size is
+    approximately 10% of the first-order step. For more details, see section 3 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
+    geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
+    that acceptable steps meet the condition
+    ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
+    acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
+    path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
+    is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
+    effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
+    choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
+    and reject all steps that increase the cost. Although this is a natural and safe choice,
+    it is often not the most efficient. Therefore downhill moves are always accepted, but
+    uphill moves are only conditionally accepted. To decide whether an uphill move will be
+    accepted at each iteration ``i``, we compute
+    ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine of the
+    angle between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last
+    accepted step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is
+    small. Specifically, uphill moves are accepted if
+    ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
+    iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
+    allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
+    will be accepted. Defaults to `1.0`. For more details, see section 4 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
+    `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
+    where `J` is the Jacobian. It is suggested by
+    [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
+    `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
+"""
+@concrete struct LevenbergMarquardt{CJ, AD, T} <: AbstractNewtonAlgorithm{CJ, AD}
+    ad::AD
+    linsolve
+    precs
+    damping_initial::T
+    damping_increase_factor::T
+    damping_decrease_factor::T
+    finite_diff_step_geodesic::T
+    α_geodesic::T
+    b_uphill::T
+    min_damping_D::T
+end
 
-# function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
-#     precs = DEFAULT_PRECS, damping_initial::Real = 1.0, damping_increase_factor::Real = 2.0,
-#     damping_decrease_factor::Real = 3.0, finite_diff_step_geodesic::Real = 0.1,
-#     α_geodesic::Real = 0.75, b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8,
-#     adkwargs...)
-#     ad = default_adargs_to_adtype(adkwargs...)
-#     return LevenbergMarquardt{_unwrap_val(concrete_jac)}(ad, linsolve, precs,
-#         damping_initial, damping_increase_factor, damping_decrease_factor,
-#         finite_diff_step_geodesic, α_geodesic, b_uphill, min_damping_D)
-# end
+function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+    precs = DEFAULT_PRECS, damping_initial::Real = 1.0, damping_increase_factor::Real = 2.0,
+    damping_decrease_factor::Real = 3.0, finite_diff_step_geodesic::Real = 0.1,
+    α_geodesic::Real = 0.75, b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8,
+    adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)
+    return LevenbergMarquardt{_unwrap_val(concrete_jac)}(ad, linsolve, precs,
+        damping_initial, damping_increase_factor, damping_decrease_factor,
+        finite_diff_step_geodesic, α_geodesic, b_uphill, min_damping_D)
+end
 
-# @concrete mutable struct LevenbergMarquardtCache{iip, uType, jType, λType, lossType}
-#     f
-#     alg
-#     u::uType
-#     fu1
-#     fu2
-#     du
-#     p
-#     uf
-#     linsolve
-#     J::jType
-#     jac_cache
-#     force_stop::Bool
-#     maxiters::Int
-#     internalnorm
-#     retcode::ReturnCode.T
-#     abstol
-#     prob
-#     DᵀD
-#     JᵀJ::jType
-#     λ::λType
-#     λ_factor::λType
-#     damping_increase_factor::λType
-#     damping_decrease_factor::λType
-#     h::λType
-#     α_geodesic::λType
-#     b_uphill::λType
-#     min_damping_D::λType
-#     v::uType
-#     a::uType
-#     tmp_vec::uType
-#     v_old::uType
-#     norm_v_old::lossType
-#     δ::uType
-#     loss_old::lossType
-#     make_new_J::Bool
-#     fu_tmp
-#     mat_tmp::jType
-#     stats::NLStats
-# end
+@concrete mutable struct LevenbergMarquardtCache{iip, uType, jType, λType, lossType}
+    f
+    alg
+    u::uType
+    fu1
+    fu2
+    du
+    p
+    uf
+    linsolve
+    J::jType
+    jac_cache
+    force_stop::Bool
+    maxiters::Int
+    internalnorm
+    retcode::ReturnCode.T
+    abstol
+    prob
+    DᵀD
+    JᵀJ::jType
+    λ::λType
+    λ_factor::λType
+    damping_increase_factor::λType
+    damping_decrease_factor::λType
+    h::λType
+    α_geodesic::λType
+    b_uphill::λType
+    min_damping_D::λType
+    v::uType
+    a::uType
+    tmp_vec::uType
+    v_old::uType
+    norm_v_old::lossType
+    δ::uType
+    loss_old::lossType
+    make_new_J::Bool
+    fu_tmp
+    mat_tmp::jType
+    stats::NLStats
+end
 
-# isinplace(::LevenbergMarquardtCache{iip}) where {iip} = iip
+isinplace(::LevenbergMarquardtCache{iip}) where {iip} = iip
 
-# function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarquardt,
-#     args...; alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
-#     kwargs...) where {uType, iip}
-#     @unpack f, u0, p = prob
-#     u = alias_u0 ? u0 : deepcopy(u0)
-#     if iip
-#         fu1 = zero(u)  # TODO: Use Prototype
-#         f(fu1, u, p)
-#     else
-#         fu1 = f(u, p)
-#     end
-#     uf, linsolve, J, fu2, jac_cache = jacobian_caches(alg, f, u, p, Val(iip))
+function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarquardt,
+    args...; alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
+    kwargs...) where {uType, iip}
+    @unpack f, u0, p = prob
+    u = alias_u0 ? u0 : deepcopy(u0)
+    if iip
+        fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
+        f(fu1, u, p)
+    else
+        fu1 = f(u, p)
+    end
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
 
-#     λ = convert(eltype(u), alg.damping_initial)
-#     λ_factor = convert(eltype(u), alg.damping_increase_factor)
-#     damping_increase_factor = convert(eltype(u), alg.damping_increase_factor)
-#     damping_decrease_factor = convert(eltype(u), alg.damping_decrease_factor)
-#     h = convert(eltype(u), alg.finite_diff_step_geodesic)
-#     α_geodesic = convert(eltype(u), alg.α_geodesic)
-#     b_uphill = convert(eltype(u), alg.b_uphill)
-#     min_damping_D = convert(eltype(u), alg.min_damping_D)
+    λ = convert(eltype(u), alg.damping_initial)
+    λ_factor = convert(eltype(u), alg.damping_increase_factor)
+    damping_increase_factor = convert(eltype(u), alg.damping_increase_factor)
+    damping_decrease_factor = convert(eltype(u), alg.damping_decrease_factor)
+    h = convert(eltype(u), alg.finite_diff_step_geodesic)
+    α_geodesic = convert(eltype(u), alg.α_geodesic)
+    b_uphill = convert(eltype(u), alg.b_uphill)
+    min_damping_D = convert(eltype(u), alg.min_damping_D)
 
-#     if u isa Number
-#         DᵀD = min_damping_D
-#     else
-#         d = similar(u)
-#         d .= min_damping_D
-#         DᵀD = Diagonal(d)
-#     end
+    if u isa Number
+        DᵀD = min_damping_D
+    else
+        d = similar(u)
+        d .= min_damping_D
+        DᵀD = Diagonal(d)
+    end
 
-#     loss = internalnorm(fu1)
-#     JᵀJ = zero(J)
-#     v = zero(u)
-#     a = zero(u)
-#     tmp_vec = zero(u)
-#     v_old = zero(u)
-#     δ = zero(u)
-#     make_new_J = true
-#     fu_tmp = zero(fu1)
-#     mat_tmp = zero(J)
+    loss = internalnorm(fu1)
+    JᵀJ = zero(J)
+    v = zero(u)
+    a = zero(u)
+    tmp_vec = zero(u)
+    v_old = zero(u)
+    δ = zero(u)
+    make_new_J = true
+    fu_tmp = zero(fu1)
+    mat_tmp = zero(J)
 
-#     return LevenbergMarquardtCache{iip}(f, alg, u, fu1, fu2, zero(u), p, uf, linsolve, J,
-#         jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob, DᵀD,
-#         JᵀJ, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, α_geodesic,
-#         b_uphill, min_damping_D, v, a, tmp_vec, v_old, loss, δ, loss, make_new_J, fu_tmp,
-#         mat_tmp, NLStats(1, 0, 0, 0, 0))
-# end
+    return LevenbergMarquardtCache{iip}(f, alg, u, fu1, fu2, du, p, uf, linsolve, J,
+        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob, DᵀD,
+        JᵀJ, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, α_geodesic,
+        b_uphill, min_damping_D, v, a, tmp_vec, v_old, loss, δ, loss, make_new_J, fu_tmp,
+        mat_tmp, NLStats(1, 0, 0, 0, 0))
+end
 
-# function perform_step!(cache::LevenbergMarquardtCache{true})
-#     @unpack fu1, f, make_new_J = cache
-#     if iszero(fu1)
-#         cache.force_stop = true
-#         return nothing
-#     end
+function perform_step!(cache::LevenbergMarquardtCache{true})  # one in-place LM iteration
+    @unpack fu1, f, make_new_J = cache
+    if _iszero(fu1)  # residual reported as zero: stop without taking a step
+        cache.force_stop = true
+        return nothing
+    end
 
-#     if make_new_J
-#         jacobian!!(cache.J, cache)
-#         mul!(cache.JᵀJ, cache.J', cache.J)
-#         cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
-#         cache.make_new_J = false
-#         cache.stats.njacs += 1
-#     end
-#     @unpack u, p, λ, JᵀJ, DᵀD, J, alg, linsolve = cache
+    if make_new_J  # flag is raised whenever a step is accepted (see below)
+        jacobian!!(cache.J, cache)
+        mul!(cache.JᵀJ, cache.J', cache.J)
+        cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))  # damping scales never shrink
+        cache.make_new_J = false
+        cache.stats.njacs += 1
+    end
+    @unpack u, p, λ, JᵀJ, DᵀD, J, alg, linsolve = cache
 
-#     # Usual Levenberg-Marquardt step ("velocity").
-#     # The following lines do: cache.v = -cache.mat_tmp \ cache.fu_tmp
-#     mul!(cache.fu_tmp, J', fu1)
-#     @. cache.mat_tmp = JᵀJ + λ * DᵀD
-#     linres = dolinsolve(alg.precs, linsolve, A = cache.mat_tmp, b = _vec(cache.fu_tmp),
-#         linu = _vec(cache.du), p = p, reltol = cache.abstol)
-#     cache.linsolve = linres.cache
-#     @. cache.v = -cache.du
+    # Usual Levenberg-Marquardt step ("velocity").
+    # The following lines do: cache.v = -cache.mat_tmp \ cache.fu_tmp
+    mul!(cache.fu_tmp, J', fu1)
+    @. cache.mat_tmp = JᵀJ + λ * DᵀD
+    linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, b = _vec(cache.fu_tmp),
+        linu = _vec(cache.du), p = p, reltol = cache.abstol)
+    cache.linsolve = linres.cache
+    @. cache.v = -cache.du
 
-#     # Geodesic acceleration (step_size = v + a / 2).
-#     @unpack v, α_geodesic, h = cache
-#     f(cache.fu_tmp, u .+ h .* v, p)
+    # Geodesic acceleration (step_size = v + a / 2).
+    @unpack v, α_geodesic, h = cache
+    f(cache.fu_tmp, u .+ h .* v, p)  # residual at the probe point u + h * v
 
-#     # The following lines do: cache.a = -J \ cache.fu_tmp
-#     mul!(cache.du, J, v)
-#     @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.du)
-#     linres = dolinsolve(alg.precs, linsolve, A = J, b = _vec(cache.fu_tmp),
-#         linu = _vec(cache.du), p = p, reltol = cache.abstol)
-#     cache.linsolve = linres.cache
-#     @. cache.a = -cache.du
-#     cache.stats.nsolve += 2
-#     cache.stats.nfactors += 2
+    # The following lines do: cache.a = -J \ cache.fu_tmp
+    mul!(cache.du, J, v)
+    @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.du)
+    linres = dolinsolve(alg.precs, linsolve; A = J, b = _vec(cache.fu_tmp),
+        linu = _vec(cache.du), p = p, reltol = cache.abstol)
+    cache.linsolve = linres.cache
+    @. cache.a = -cache.du
+    cache.stats.nsolve += 2
+    cache.stats.nfactors += 2
 
-#     # Require acceptable steps to satisfy the following condition.
-#     norm_v = norm(v)
-#     if (2 * norm(cache.a) / norm_v) < α_geodesic
-#         @. cache.δ = v + cache.a / 2
-#         @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
-#         f(cache.fu_tmp, u .+ δ, p)
-#         cache.stats.nf += 1
-#         loss = cache.internalnorm(cache.fu_tmp)
+    # Require acceptable steps to satisfy the following condition.
+    norm_v = norm(v)
+    if (2 * norm(cache.a) / norm_v) < α_geodesic
+        @. cache.δ = v + cache.a / 2
+        @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
+        f(cache.fu_tmp, u .+ δ, p)  # residual at the candidate iterate u + δ
+        cache.stats.nf += 1
+        loss = cache.internalnorm(cache.fu_tmp)
 
-#         # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
-#         β = dot(v, v_old) / (norm_v * norm_v_old)
-#         if (1 - β)^b_uphill * loss ≤ loss_old
-#             # Accept step.
-#             cache.u .+= δ
-#             if loss < cache.abstol
-#                 cache.force_stop = true
-#                 return nothing
-#             end
-#             cache.fu1 .= cache.fu_tmp
-#             cache.v_old .= v
-#             cache.norm_v_old = norm_v
-#             cache.loss_old = loss
-#             cache.λ_factor = 1 / cache.damping_decrease_factor
-#             cache.make_new_J = true
-#         end
-#     end
-#     cache.λ *= cache.λ_factor
-#     cache.λ_factor = cache.damping_increase_factor
-#     return nothing
-# end
+        # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
+        β = dot(v, v_old) / (norm_v * norm_v_old)
+        if (1 - β)^b_uphill * loss ≤ loss_old
+            # Accept step.
+            cache.u .+= δ
+            cache.fu1 .= cache.fu_tmp  # keep fu1 consistent with u even if we stop below
+            if loss < cache.abstol
+                cache.force_stop = true
+                return nothing
+            end
+            cache.v_old .= v
+            cache.norm_v_old = norm_v
+            cache.loss_old = loss
+            cache.λ_factor = 1 / cache.damping_decrease_factor
+            cache.make_new_J = true
+        end
+    end
+    cache.λ *= cache.λ_factor  # 1 / decrease factor if accepted above, else increase factor
+    cache.λ_factor = cache.damping_increase_factor  # default for the next call: "rejected"
+    return nothing
+end
 
-# function perform_step!(cache::LevenbergMarquardtCache{false})
-#     @unpack fu1, f, make_new_J = cache
-#     if iszero(fu1)
-#         cache.force_stop = true
-#         return nothing
-#     end
+function perform_step!(cache::LevenbergMarquardtCache{false})  # one out-of-place LM iteration
+    @unpack fu1, f, make_new_J = cache
+    if _iszero(fu1)  # residual reported as zero: stop without taking a step
+        cache.force_stop = true
+        return nothing
+    end
 
-#     if make_new_J
-#         cache.J = jacobian!!(cache.J, cache)
-#         cache.JᵀJ = cache.J' * cache.J
-#         if cache.JᵀJ isa Number
-#             cache.DᵀD = max(cache.DᵀD, cache.JᵀJ)
-#         else
-#             cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
-#         end
-#         cache.make_new_J = false
-#         cache.stats.njacs += 1
-#     end
-#     @unpack u, p, λ, JᵀJ, DᵀD, J = cache
+    if make_new_J  # flag is raised whenever a step is accepted (see below)
+        cache.J = jacobian!!(cache.J, cache)
+        cache.JᵀJ = cache.J' * cache.J
+        if cache.JᵀJ isa Number  # scalar problem: plain `max`, nothing to broadcast over
+            cache.DᵀD = max(cache.DᵀD, cache.JᵀJ)
+        else
+            cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
+        end
+        cache.make_new_J = false
+        cache.stats.njacs += 1
+    end
+    @unpack u, p, λ, JᵀJ, DᵀD, J = cache
 
-#     # Usual Levenberg-Marquardt step ("velocity").
-#     cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu1)
+    # Usual Levenberg-Marquardt step ("velocity").
+    cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu1)
 
-#     @unpack v, h, α_geodesic = cache
-#     # Geodesic acceleration (step_size = v + a / 2).
-#     cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu1) ./ h .- J * v))
-#     cache.stats.nsolve += 1
-#     cache.stats.nfactors += 1
+    @unpack v, h, α_geodesic = cache
+    # Geodesic acceleration (step_size = v + a / 2).
+    cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu1) ./ h .- J * v))
+    cache.stats.nsolve += 1
+    cache.stats.nfactors += 1
 
-#     # Require acceptable steps to satisfy the following condition.
-#     norm_v = norm(v)
-#     if (2 * norm(cache.a) / norm_v) < α_geodesic
-#         cache.δ = v .+ cache.a ./ 2
-#         @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
-#         fu_new = f(u .+ δ, p)
-#         cache.stats.nf += 1
-#         loss = cache.internalnorm(fu_new)
+    # Require acceptable steps to satisfy the following condition.
+    norm_v = norm(v)
+    if (2 * norm(cache.a) / norm_v) < α_geodesic
+        cache.δ = v .+ cache.a ./ 2
+        @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
+        fu_new = f(u .+ δ, p)  # residual at the candidate iterate u + δ
+        cache.stats.nf += 1
+        loss = cache.internalnorm(fu_new)
 
-#         # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
-#         β = dot(v, v_old) / (norm_v * norm_v_old)
-#         if (1 - β)^b_uphill * loss ≤ loss_old
-#             # Accept step.
-#             cache.u += δ
-#             if loss < cache.abstol
-#                 cache.force_stop = true
-#                 return nothing
-#             end
-#             cache.fu1 = fu_new
-#             cache.v_old = v
-#             cache.norm_v_old = norm_v
-#             cache.loss_old = loss
-#             cache.λ_factor = 1 / cache.damping_decrease_factor
-#             cache.make_new_J = true
-#         end
-#     end
-#     cache.λ *= cache.λ_factor
-#     cache.λ_factor = cache.damping_increase_factor
-#     return nothing
-# end
+        # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
+        β = dot(v, v_old) / (norm_v * norm_v_old)
+        if (1 - β)^b_uphill * loss ≤ loss_old
+            # Accept step.
+            cache.u += δ
+            cache.fu1 = fu_new  # keep fu1 consistent with u even if we stop below
+            if loss < cache.abstol
+                cache.force_stop = true
+                return nothing
+            end
+            cache.v_old = v
+            cache.norm_v_old = norm_v
+            cache.loss_old = loss
+            cache.λ_factor = 1 / cache.damping_decrease_factor
+            cache.make_new_J = true
+        end
+    end
+    cache.λ *= cache.λ_factor  # 1 / decrease factor if accepted above, else increase factor
+    cache.λ_factor = cache.damping_increase_factor  # default for the next call: "rejected"
+    return nothing
+end
 
-# function SciMLBase.solve!(cache::LevenbergMarquardtCache)
-#     while !cache.force_stop && cache.stats.nsteps < cache.maxiters
-#         perform_step!(cache)
-#         cache.stats.nsteps += 1
-#     end
+function SciMLBase.solve!(cache::LevenbergMarquardtCache)  # iterate, then build the solution
+    while !cache.force_stop && cache.stats.nsteps < cache.maxiters
+        perform_step!(cache)
+        cache.stats.nsteps += 1
+    end
 
-#     if cache.stats.nsteps == cache.maxiters
-#         cache.retcode = ReturnCode.MaxIters
-#     else
-#         cache.retcode = ReturnCode.Success
-#     end
+    if cache.force_stop  # set by perform_step! on convergence, even on the very last step
+        cache.retcode = ReturnCode.Success
+    else  # loop ended (or never ran) because the iteration budget was exhausted
+        cache.retcode = ReturnCode.MaxIters
+    end
 
-#     return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
-#         cache.retcode, cache.stats)
-# end
+    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
+        cache.retcode, cache.stats)
+end
diff --git a/src/raphson.jl b/src/raphson.jl
index 9f7c1fb87..33d12c4ba 100644
--- a/src/raphson.jl
+++ b/src/raphson.jl
@@ -72,7 +72,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::NewtonRaphson
         fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
         f(fu1, u, p)
     else
-        fu1 = f.resid_prototype === nothing ? f(u, p) : f.resid_prototype
+        fu1 = _mutable(f(u, p))
     end
     uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
 
@@ -101,15 +101,19 @@ function perform_step!(cache::NewtonRaphsonCache{true})
 end
 
 function perform_step!(cache::NewtonRaphsonCache{false})
-    @unpack u, fu1, f, p, alg, linsolve, du = cache
+    @unpack u, fu1, f, p, alg, linsolve = cache
 
     cache.J = jacobian!!(cache.J, cache)
     # u = u - J \ fu
-    linres = dolinsolve(alg.precs, linsolve; A = cache.J, b = _vec(fu1), linu = _vec(du),
-        p, reltol = cache.abstol)
-    cache.linsolve = linres.cache
-    @. u = u - du
-    cache.fu1 = f(u, p)
+    if linsolve === nothing
+        cache.du = fu1 / cache.J
+    else
+        linres = dolinsolve(alg.precs, linsolve; A = cache.J, b = _vec(fu1),
+            linu = _vec(cache.du), p, reltol = cache.abstol)
+        cache.linsolve = linres.cache
+    end
+    cache.u = @. u - cache.du  # `u` might not support mutation
+    cache.fu1 = f(cache.u, p)
 
     cache.internalnorm(fu1) < cache.abstol && (cache.force_stop = true)
     cache.stats.nf += 1
diff --git a/src/trustRegion.jl b/src/trustRegion.jl
index c43b86699..41ccb994e 100644
--- a/src/trustRegion.jl
+++ b/src/trustRegion.jl
@@ -8,7 +8,7 @@ scheme are provided below.
 
 ## Using `RadiusUpdateSchemes`
 
-`RadiusUpdateSchemes` uses the standard EnumX interface (https://github.com/fredrikekre/EnumX.jl), 
+`RadiusUpdateSchemes` uses the standard EnumX interface (https://github.com/fredrikekre/EnumX.jl),
 and hence inherits all properties of being an EnumX, including the type of each constituent enum
 states as `RadiusUpdateSchemes.T`. Simply put the desired scheme as follows:
 `TrustRegion(radius_update_scheme = your desired update scheme)`. For example,
@@ -99,7 +99,7 @@ for large-scale and numerically-difficult nonlinear systems.
     algorithms, consult the
     [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
   - `radius_update_scheme`: the choice of radius update scheme to be used. Defaults to `RadiusUpdateSchemes.Simple`
-    which follows the conventional approach. Other available schemes are `RadiusUpdateSchemes.Hei`, 
+    which follows the conventional approach. Other available schemes are `RadiusUpdateSchemes.Hei`,
     `RadiusUpdateSchemes.Yuan`, `RadiusUpdateSchemes.Bastin`, `RadiusUpdateSchemes.Fan`. These schemes
     have the trust region radius converging to zero that is seen to improve convergence. For more details, see the
     [Yuan, Yx](https://link.springer.com/article/10.1007/s10107-015-0893-2#Sec4).
@@ -149,471 +149,454 @@ function TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAU
     step_threshold::Real = 1 // 10, shrink_threshold::Real = 1 // 4,
     expand_threshold::Real = 3 // 4, shrink_factor::Real = 1 // 4,
     expand_factor::Real = 2 // 1, max_shrink_times::Int = 32, adkwargs...)
-    ad = default_adargs_to_adtype(adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)
     return TrustRegion{_unwrap_val(concrete_jac)}(ad, linsolve, precs, radius_update_scheme,
         max_trust_radius, initial_trust_radius, step_threshold, shrink_threshold,
         expand_threshold, shrink_factor, expand_factor, max_shrink_times)
 end
 
-# @concrete mutable struct TrustRegionCache{iip}
-#     f
-#     alg
-#     u_prev::uType
-#     u::uType
-#     fu_prev::resType
-#     fu::resType
-#     p
-#     uf
-#     linsolve
-#     J::jType
-#     jac_cache
-#     force_stop::Bool
-#     maxiters::Int
-#     internalnorm
-#     retcode::ReturnCode.T
-#     abstol
-#     prob
-#     radius_update_scheme::RadiusUpdateSchemes.T
-#     trust_r::trustType
-#     max_trust_r::trustType
-#     step_threshold
-#     shrink_threshold::trustType
-#     expand_threshold::trustType
-#     shrink_factor::trustType
-#     expand_factor::trustType
-#     loss::floatType
-#     loss_new::floatType
-#     H::jType
-#     g::resType
-#     shrink_counter::Int
-#     step_size
-#     u_tmp
-#     fu_new::resType
-#     make_new_J::Bool
-#     r::floatType
-#     p1::floatType
-#     p2::floatType
-#     p3::floatType
-#     p4::floatType
-#     ϵ::floatType
-#     stats::NLStats
-# end
-
-# function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion,
-#     args...;
-#     alias_u0 = false,
-#     maxiters = 1000,
-#     abstol = 1e-8,
-#     internalnorm = DEFAULT_NORM,
-#     kwargs...) where {uType, iip}
-#     if alias_u0
-#         u = prob.u0
-#     else
-#         u = deepcopy(prob.u0)
-#     end
-#     u_prev = zero(u)
-#     f = prob.f
-#     p = prob.p
-#     if iip
-#         fu = zero(u)
-#         f(fu, u, p)
-#     else
-#         fu = f(u, p)
-#     end
-#     fu_prev = zero(fu)
-
-#     loss = get_loss(fu)
-#     uf, linsolve, J, u_tmp, jac_config = jacobian_caches(alg, f, u, p, Val(iip))
-
-#     radius_update_scheme = alg.radius_update_scheme
-#     max_trust_radius = convert(eltype(u), alg.max_trust_radius)
-#     initial_trust_radius = convert(eltype(u), alg.initial_trust_radius)
-#     step_threshold = convert(eltype(u), alg.step_threshold)
-#     shrink_threshold = convert(eltype(u), alg.shrink_threshold)
-#     expand_threshold = convert(eltype(u), alg.expand_threshold)
-#     shrink_factor = convert(eltype(u), alg.shrink_factor)
-#     expand_factor = convert(eltype(u), alg.expand_factor)
-#     # Set default trust region radius if not specified
-#     if iszero(max_trust_radius)
-#         max_trust_radius = convert(eltype(u), max(norm(fu), maximum(u) - minimum(u)))
-#     end
-#     if iszero(initial_trust_radius)
-#         initial_trust_radius = convert(eltype(u), max_trust_radius / 11)
-#     end
-
-#     loss_new = loss
-#     H = ArrayInterface.undefmatrix(u)
-#     g = zero(fu)
-#     shrink_counter = 0
-#     step_size = zero(u)
-#     fu_new = zero(fu)
-#     make_new_J = true
-#     r = loss
-
-#     # Parameters for the Schemes
-#     p1 = convert(eltype(u), 0.0)
-#     p2 = convert(eltype(u), 0.0)
-#     p3 = convert(eltype(u), 0.0)
-#     p4 = convert(eltype(u), 0.0)
-#     ϵ = convert(eltype(u), 1.0e-8)
-#     if radius_update_scheme === RadiusUpdateSchemes.Hei
-#         step_threshold = convert(eltype(u), 0.0)
-#         shrink_threshold = convert(eltype(u), 0.25)
-#         expand_threshold = convert(eltype(u), 0.25)
-#         p1 = convert(eltype(u), 5.0) # M
-#         p2 = convert(eltype(u), 0.1) # β
-#         p3 = convert(eltype(u), 0.15) # γ1
-#         p4 = convert(eltype(u), 0.15) # γ2
-#         initial_trust_radius = convert(eltype(u), 1.0)
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
-#         step_threshold = convert(eltype(u), 0.0001)
-#         shrink_threshold = convert(eltype(u), 0.25)
-#         expand_threshold = convert(eltype(u), 0.25)
-#         p1 = convert(eltype(u), 2.0) # μ
-#         p2 = convert(eltype(u), 1 / 6) # c5
-#         p3 = convert(eltype(u), 6.0) # c6
-#         p4 = convert(eltype(u), 0.0)
-#         if iip
-#             auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
-#         else
-#             if isa(u, Number)
-#                 g = ForwardDiff.derivative(x -> f(x, p), u)
-#             else
-#                 g = auto_jacvec(x -> f(x, p), u, fu)
-#             end
-#         end
-#         initial_trust_radius = convert(eltype(u), p1 * norm(g))
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Fan
-#         step_threshold = convert(eltype(u), 0.0001)
-#         shrink_threshold = convert(eltype(u), 0.25)
-#         expand_threshold = convert(eltype(u), 0.75)
-#         p1 = convert(eltype(u), 0.1) # μ
-#         p2 = convert(eltype(u), 1 / 4) # c5
-#         p3 = convert(eltype(u), 12) # c6
-#         p4 = convert(eltype(u), 1.0e18) # M
-#         initial_trust_radius = convert(eltype(u), p1 * (norm(fu)^0.99))
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
-#         step_threshold = convert(eltype(u), 0.05)
-#         shrink_threshold = convert(eltype(u), 0.05)
-#         expand_threshold = convert(eltype(u), 0.9)
-#         p1 = convert(eltype(u), 2.5)  #alpha_1
-#         p2 = convert(eltype(u), 0.25) # alpha_2
-#         p3 = convert(eltype(u), 0) # not required
-#         p4 = convert(eltype(u), 0) # not required
-#         initial_trust_radius = convert(eltype(u), 1.0)
-#     end
-
-#     return TrustRegionCache{iip}(f, alg, u_prev, u, fu_prev, fu, p, uf, linsolve, J,
-#         jac_config,
-#         false, maxiters, internalnorm,
-#         ReturnCode.Default, abstol, prob, radius_update_scheme,
-#         initial_trust_radius,
-#         max_trust_radius, step_threshold, shrink_threshold,
-#         expand_threshold, shrink_factor, expand_factor, loss,
-#         loss_new, H, g, shrink_counter, step_size, u_tmp, fu_new,
-#         make_new_J, r, p1, p2, p3, p4, ϵ, NLStats(1, 0, 0, 0, 0))
-# end
-
-# function perform_step!(cache::TrustRegionCache{true})
-#     @unpack make_new_J, J, fu, f, u, p, u_tmp, alg, linsolve = cache
-#     if cache.make_new_J
-#         jacobian!(J, cache)
-#         mul!(cache.H, J, J)
-#         mul!(cache.g, J, fu)
-#         cache.stats.njacs += 1
-#     end
-
-#     linres = dolinsolve(alg.precs, linsolve, A = cache.H, b = _vec(cache.g),
-#         linu = _vec(u_tmp),
-#         p = p, reltol = cache.abstol)
-#     cache.linsolve = linres.cache
-#     cache.u_tmp .= -1 .* u_tmp
-#     dogleg!(cache)
-
-#     # Compute the potentially new u
-#     cache.u_tmp .= u .+ cache.step_size
-#     f(cache.fu_new, cache.u_tmp, p)
-#     trust_region_step!(cache)
-#     cache.stats.nf += 1
-#     cache.stats.nsolve += 1
-#     cache.stats.nfactors += 1
-#     return nothing
-# end
-
-# function perform_step!(cache::TrustRegionCache{false})
-#     @unpack make_new_J, fu, f, u, p = cache
-
-#     if make_new_J
-#         J = jacobian(cache, f)
-#         cache.H = J * J
-#         cache.g = J * fu
-#         cache.stats.njacs += 1
-#     end
-
-#     @unpack g, H = cache
-#     # Compute the Newton step.
-#     cache.u_tmp = -H \ g
-#     dogleg!(cache)
-
-#     # Compute the potentially new u
-#     cache.u_tmp = u .+ cache.step_size
-#     cache.fu_new = f(cache.u_tmp, p)
-#     trust_region_step!(cache)
-#     cache.stats.nf += 1
-#     cache.stats.nsolve += 1
-#     cache.stats.nfactors += 1
-#     return nothing
-# end
-
-# function retrospective_step!(cache::TrustRegionCache{true})
-#     @unpack J, fu_prev, fu, u_prev, u = cache
-#     jacobian!(J, cache)
-#     mul!(cache.H, J, J)
-#     mul!(cache.g, J, fu)
-#     cache.stats.njacs += 1
-#     @unpack H, g, step_size = cache
-
-#     return -(get_loss(fu_prev) - get_loss(fu)) /
-#            (step_size' * g + step_size' * H * step_size / 2)
-# end
-
-# function retrospective_step!(cache::TrustRegionCache{false})
-#     @unpack J, fu_prev, fu, u_prev, u, f = cache
-#     J = jacobian(cache, f)
-#     cache.H = J * J
-#     cache.g = J * fu
-#     cache.stats.njacs += 1
-#     @unpack H, g, step_size = cache
-
-#     return -(get_loss(fu_prev) - get_loss(fu)) /
-#            (step_size' * g + step_size' * H * step_size / 2)
-# end
-
-# function trust_region_step!(cache::TrustRegionCache)
-#     @unpack fu_new, step_size, g, H, loss, max_trust_r, radius_update_scheme = cache
-#     cache.loss_new = get_loss(fu_new)
-
-#     # Compute the ratio of the actual reduction to the predicted reduction.
-#     cache.r = -(loss - cache.loss_new) / (step_size' * g + step_size' * H * step_size / 2)
-#     @unpack r = cache
-
-#     if radius_update_scheme === RadiusUpdateSchemes.Simple
-#         # Update the trust region radius.
-#         if r < cache.shrink_threshold
-#             cache.trust_r *= cache.shrink_factor
-#             cache.shrink_counter += 1
-#         else
-#             cache.shrink_counter = 0
-#         end
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-
-#             # Update the trust region radius.
-#             if r > cache.expand_threshold
-#                 cache.trust_r = min(cache.expand_factor * cache.trust_r, max_trust_r)
-#             end
-
-#             cache.make_new_J = true
-#         else
-#             # No need to make a new J, no step was taken, so we try again with a smaller trust_r
-#             cache.make_new_J = false
-#         end
-
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
-#             cache.force_stop = true
-#         end
-
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Hei
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-#             cache.make_new_J = true
-#         else
-#             cache.make_new_J = false
-#         end
-#         # Hei's radius update scheme
-#         @unpack shrink_threshold, p1, p2, p3, p4 = cache
-#         if rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(step_size) <
-#            cache.trust_r
-#             cache.shrink_counter += 1
-#         else
-#             cache.shrink_counter = 0
-#         end
-#         cache.trust_r = rfunc(r, shrink_threshold, p1, p3, p4, p2) *
-#                         cache.internalnorm(step_size)
-
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
-#            cache.internalnorm(g) < cache.ϵ
-#             cache.force_stop = true
-#         end
-
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
-#         if r < cache.shrink_threshold
-#             cache.p1 = cache.p2 * cache.p1
-#             cache.shrink_counter += 1
-#         elseif r >= cache.expand_threshold &&
-#                cache.internalnorm(step_size) > cache.trust_r / 2
-#             cache.p1 = cache.p3 * cache.p1
-#             cache.shrink_counter = 0
-#         end
-
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-#             cache.make_new_J = true
-#         else
-#             cache.make_new_J = false
-#         end
-
-#         @unpack p1 = cache
-#         cache.trust_r = p1 * cache.internalnorm(jvp!(cache))
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
-#            cache.internalnorm(g) < cache.ϵ
-#             cache.force_stop = true
-#         end
-#         #Fan's update scheme
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Fan
-#         if r < cache.shrink_threshold
-#             cache.p1 *= cache.p2
-#             cache.shrink_counter += 1
-#         elseif r > cache.expand_threshold
-#             cache.p1 = min(cache.p1 * cache.p3, cache.p4)
-#             cache.shrink_counter = 0
-#         end
-
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-#             cache.make_new_J = true
-#         else
-#             cache.make_new_J = false
-#         end
-
-#         @unpack p1 = cache
-#         cache.trust_r = p1 * (cache.internalnorm(cache.fu)^0.99)
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
-#            cache.internalnorm(g) < cache.ϵ
-#             cache.force_stop = true
-#         end
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-#             cache.make_new_J = true
-#             if retrospective_step!(cache) >= cache.expand_threshold
-#                 cache.trust_r = max(cache.p1 * cache.internalnorm(step_size), cache.trust_r)
-#             end
-
-#         else
-#             cache.make_new_J = false
-#             cache.trust_r *= cache.p2
-#             cache.shrink_counter += 1
-#         end
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
-#             cache.force_stop = true
-#         end
-#     end
-# end
-
-# function dogleg!(cache::TrustRegionCache)
-#     @unpack u_tmp, trust_r = cache
-
-#     # Test if the full step is within the trust region.
-#     if norm(u_tmp) ≤ trust_r
-#         cache.step_size = deepcopy(u_tmp)
-#         return
-#     end
-
-#     # Calcualte Cauchy point, optimum along the steepest descent direction.
-#     δsd = -cache.g
-#     norm_δsd = norm(δsd)
-#     if norm_δsd ≥ trust_r
-#         cache.step_size = δsd .* trust_r / norm_δsd
-#         return
-#     end
-
-#     # Find the intersection point on the boundary.
-#     N_sd = u_tmp - δsd
-#     dot_N_sd = dot(N_sd, N_sd)
-#     dot_sd_N_sd = dot(δsd, N_sd)
-#     dot_sd = dot(δsd, δsd)
-#     fact = dot_sd_N_sd^2 - dot_N_sd * (dot_sd - trust_r^2)
-#     τ = (-dot_sd_N_sd + sqrt(fact)) / dot_N_sd
-#     cache.step_size = δsd + τ * N_sd
-# end
-
-# function take_step!(cache::TrustRegionCache{true})
-#     cache.u_prev .= cache.u
-#     cache.u .= cache.u_tmp
-#     cache.fu_prev .= cache.fu
-#     cache.fu .= cache.fu_new
-# end
-
-# function take_step!(cache::TrustRegionCache{false})
-#     cache.u_prev = cache.u
-#     cache.u = cache.u_tmp
-#     cache.fu_prev = cache.fu
-#     cache.fu = cache.fu_new
-# end
-
-# function jvp!(cache::TrustRegionCache{false})
-#     @unpack f, u, fu, p = cache
-#     if isa(u, Number)
-#         return value_derivative(x -> f(x, p), u)
-#     end
-#     return auto_jacvec(x -> f(x, p), u, fu)
-# end
-
-# function jvp!(cache::TrustRegionCache{true})
-#     @unpack g, f, u, fu, p = cache
-#     if isa(u, Number)
-#         return value_derivative(x -> f(x, p), u)
-#     end
-#     auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
-#     g
-# end
-
-# function SciMLBase.solve!(cache::TrustRegionCache)
-#     while !cache.force_stop && cache.stats.nsteps < cache.maxiters &&
-#               cache.shrink_counter < cache.alg.max_shrink_times
-#         perform_step!(cache)
-#         cache.stats.nsteps += 1
-#     end
-
-#     if cache.stats.nsteps == cache.maxiters
-#         cache.retcode = ReturnCode.MaxIters
-#     else
-#         cache.retcode = ReturnCode.Success
-#     end
-
-#     SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu;
-#         retcode = cache.retcode, stats = cache.stats)
-# end
-
-# function SciMLBase.reinit!(cache::TrustRegionCache{iip}, u0 = cache.u; p = cache.p,
-#     abstol = cache.abstol, maxiters = cache.maxiters) where {iip}
-#     cache.p = p
-#     if iip
-#         recursivecopy!(cache.u, u0)
-#         cache.f(cache.fu, cache.u, p)
-#     else
-#         # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter
-#         cache.u = u0
-#         cache.fu = cache.f(cache.u, p)
-#     end
-#     cache.abstol = abstol
-#     cache.maxiters = maxiters
-#     cache.stats.nf = 1
-#     cache.stats.nsteps = 1
-#     cache.force_stop = false
-#     cache.retcode = ReturnCode.Default
-#     cache.make_new_J = true
-#     cache.loss = get_loss(cache.fu)
-#     cache.shrink_counter = 0
-#     cache.trust_r = convert(eltype(cache.u), cache.alg.initial_trust_radius)
-#     if iszero(cache.trust_r)
-#         cache.trust_r = convert(eltype(cache.u), cache.max_trust_r / 11)
-#     end
-#     return cache
-# end
+@concrete mutable struct TrustRegionCache{iip, trustType, floatType}  # solver state
+    f
+    alg
+    u_prev  # `zero(u)` at construction
+    u  # current iterate: `prob.u0` itself if `alias_u0`, otherwise a deepcopy
+    fu_prev  # `zero(fu1)` at construction
+    fu  # residual evaluated at `u` during `__init`
+    fu2  # second value returned by `jacobian_caches` alongside J
+    p
+    uf
+    linsolve
+    J
+    jac_cache
+    force_stop::Bool  # constructed as `false`
+    maxiters::Int
+    internalnorm
+    retcode::ReturnCode.T  # constructed as `ReturnCode.Default`
+    abstol
+    prob
+    radius_update_scheme::RadiusUpdateSchemes.T
+    trust_r::trustType  # starts at the (possibly scheme-specific) initial trust radius
+    max_trust_r::trustType
+    step_threshold
+    shrink_threshold::trustType
+    expand_threshold::trustType
+    shrink_factor::trustType
+    expand_factor::trustType
+    loss::floatType  # `get_loss(fu)` at construction
+    loss_new::floatType
+    H  # `zero(J)` at construction
+    g  # `_mutable_zero(fu)` at construction (overwritten for the Yuan scheme)
+    shrink_counter::Int  # constructed as 0
+    step_size  # `zero(u)` at construction
+    u_tmp  # receives the `du` buffer returned by `jacobian_caches`
+    fu_new  # `zero(fu)` at construction
+    make_new_J::Bool  # constructed as `true`
+    r::floatType  # initialised to `loss`
+    p1::floatType  # p1..p4: radius-update-scheme specific parameters (0 unless overridden)
+    p2::floatType
+    p3::floatType
+    p4::floatType
+    ϵ::floatType  # constructed as 1.0e-8
+    stats::NLStats
+end
+
+function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion, args...;
+    alias_u0 = false, maxiters = 1000, abstol = 1e-8, internalnorm = DEFAULT_NORM,
+    kwargs...) where {uType, iip}
+    @unpack f, u0, p = prob
+    u = alias_u0 ? u0 : deepcopy(u0)  # work on a copy unless the caller opted into aliasing
+    u_prev = zero(u)
+    if iip  # in-place: fill a residual buffer (the prototype itself is reused if given)
+        fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
+        f(fu1, u, p)
+    else
+        fu1 = f(u, p)
+    end
+    fu_prev = zero(fu1)
+
+    loss = get_loss(fu1)
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
+
+    radius_update_scheme = alg.radius_update_scheme
+    max_trust_radius = convert(eltype(u), alg.max_trust_radius)
+    initial_trust_radius = convert(eltype(u), alg.initial_trust_radius)
+    step_threshold = convert(eltype(u), alg.step_threshold)
+    shrink_threshold = convert(eltype(u), alg.shrink_threshold)
+    expand_threshold = convert(eltype(u), alg.expand_threshold)
+    shrink_factor = convert(eltype(u), alg.shrink_factor)
+    expand_factor = convert(eltype(u), alg.expand_factor)
+    # A zero radius means "not specified": derive defaults from the residual norm and `u`.
+    if iszero(max_trust_radius)
+        max_trust_radius = convert(eltype(u), max(norm(fu1), maximum(u) - minimum(u)))
+    end
+    if iszero(initial_trust_radius)
+        initial_trust_radius = convert(eltype(u), max_trust_radius / 11)
+    end
+
+    loss_new = loss
+    H = zero(J)
+    g = _mutable_zero(fu1)
+    shrink_counter = 0
+    step_size = zero(u)
+    fu_new = zero(fu1)
+    make_new_J = true
+    r = loss
+
+    # Scheme parameters; the non-`Simple` branches below also override thresholds/radius.
+    p1 = convert(eltype(u), 0.0)
+    p2 = convert(eltype(u), 0.0)
+    p3 = convert(eltype(u), 0.0)
+    p4 = convert(eltype(u), 0.0)
+    ϵ = convert(eltype(u), 1.0e-8)
+    if radius_update_scheme === RadiusUpdateSchemes.Hei
+        step_threshold = convert(eltype(u), 0.0)
+        shrink_threshold = convert(eltype(u), 0.25)
+        expand_threshold = convert(eltype(u), 0.25)
+        p1 = convert(eltype(u), 5.0) # M
+        p2 = convert(eltype(u), 0.1) # β
+        p3 = convert(eltype(u), 0.15) # γ1
+        p4 = convert(eltype(u), 0.15) # γ2
+        initial_trust_radius = convert(eltype(u), 1.0)
+    elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
+        step_threshold = convert(eltype(u), 0.0001)
+        shrink_threshold = convert(eltype(u), 0.25)
+        expand_threshold = convert(eltype(u), 0.25)
+        p1 = convert(eltype(u), 2.0) # μ
+        p2 = convert(eltype(u), 1 / 6) # c5
+        p3 = convert(eltype(u), 6.0) # c6
+        p4 = convert(eltype(u), 0.0)
+        if iip  # Yuan seeds the radius from `g`, computed here via a Jacobian-vector product
+            auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu1)
+        else
+            if isa(u, Number)
+                g = ForwardDiff.derivative(x -> f(x, p), u)
+            else
+                g = auto_jacvec(x -> f(x, p), u, fu1)
+            end
+        end
+        initial_trust_radius = convert(eltype(u), p1 * norm(g))
+    elseif radius_update_scheme === RadiusUpdateSchemes.Fan
+        step_threshold = convert(eltype(u), 0.0001)
+        shrink_threshold = convert(eltype(u), 0.25)
+        expand_threshold = convert(eltype(u), 0.75)
+        p1 = convert(eltype(u), 0.1) # μ
+        p2 = convert(eltype(u), 1 / 4) # c5
+        p3 = convert(eltype(u), 12) # c6
+        p4 = convert(eltype(u), 1.0e18) # M
+        initial_trust_radius = convert(eltype(u), p1 * (norm(fu1)^0.99))
+    elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
+        step_threshold = convert(eltype(u), 0.05)
+        shrink_threshold = convert(eltype(u), 0.05)
+        expand_threshold = convert(eltype(u), 0.9)
+        p1 = convert(eltype(u), 2.5) # alpha_1
+        p2 = convert(eltype(u), 0.25) # alpha_2
+        p3 = convert(eltype(u), 0) # not required
+        p4 = convert(eltype(u), 0) # not required
+        initial_trust_radius = convert(eltype(u), 1.0)
+    end
+
+    return TrustRegionCache{iip}(f, alg, u_prev, u, fu_prev, fu1, fu2, p, uf, linsolve, J,
+        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob,
+        radius_update_scheme, initial_trust_radius, max_trust_radius, step_threshold,
+        shrink_threshold, expand_threshold, shrink_factor, expand_factor, loss, loss_new,
+        H, g, shrink_counter, step_size, du, fu_new, make_new_J, r, p1, p2, p3, p4, ϵ,
+        NLStats(1, 0, 0, 0, 0))
+end
+
+isinplace(::TrustRegionCache{iip}) where {iip} = iip  # flag is the cache's type parameter
+
+# In-place step: refresh the model, solve for the Newton step, clip it, test the trial.
+function perform_step!(cache::TrustRegionCache{true})
+    @unpack make_new_J, J, fu, f, u, p, u_tmp, alg, linsolve = cache
+    if cache.make_new_J
+        jacobian!!(J, cache)
+        # Gauss-Newton model: H = JᵀJ and g = Jᵀf (`J * J` is only right for symmetric J).
+        mul!(cache.H, J', J)
+        mul!(cache.g, J', fu)
+        cache.stats.njacs += 1
+    end
+    linres = dolinsolve(alg.precs, linsolve; A = cache.H, b = _vec(cache.g),
+        linu = _vec(u_tmp), p, reltol = cache.abstol)
+    cache.linsolve = linres.cache
+    cache.u_tmp .= -1 .* u_tmp  # u_tmp was the solve's `linu` buffer; flip its sign
+    dogleg!(cache)
+    # Compute the potentially new u and its residual; `trust_region_step!` judges it.
+    cache.u_tmp .= u .+ cache.step_size
+    f(cache.fu_new, cache.u_tmp, p)
+    trust_region_step!(cache)
+    cache.stats.nf += 1
+    cache.stats.nsolve += 1
+    cache.stats.nfactors += 1
+    return nothing
+end
+
+# Out-of-place step: refresh the model, take the Newton step, clip it, test the trial.
+function perform_step!(cache::TrustRegionCache{false})
+    @unpack make_new_J, fu, f, u, p = cache
+    if make_new_J
+        J = jacobian!!(cache.J, cache)
+        # Gauss-Newton model: H = JᵀJ and g = Jᵀf (`J * J` is only right for symmetric J).
+        cache.H = J' * J
+        cache.g = J' * fu
+        cache.stats.njacs += 1
+    end
+
+    @unpack g, H = cache
+    # Compute the Newton step.
+    cache.u_tmp = -H \ g
+    dogleg!(cache)
+    # Compute the potentially new u and its residual; `trust_region_step!` judges it.
+    cache.u_tmp = u .+ cache.step_size
+    cache.fu_new = f(cache.u_tmp, p)
+    trust_region_step!(cache)
+    cache.stats.nf += 1
+    cache.stats.nsolve += 1
+    cache.stats.nfactors += 1
+    return nothing
+end
+
+# Re-evaluate the Jacobian and model, then return the reduction ratio for the last step.
+function retrospective_step!(cache::TrustRegionCache)
+    @unpack J, fu_prev, fu = cache
+    J = jacobian!!(deepcopy(J), cache)
+    if isinplace(cache)
+        mul!(cache.H, J', J)  # H = JᵀJ, g = Jᵀf; `J * J` is only right for symmetric J
+        mul!(cache.g, J', fu)
+    else
+        cache.H = J' * J  # rebind: out-of-place H may be a number or an immutable array
+        cache.g = J' * fu
+    end
+    cache.stats.njacs += 1
+    @unpack H, g, step_size = cache
+    return -(get_loss(fu_prev) - get_loss(fu)) /
+           (step_size' * g + step_size' * H * step_size / 2)
+end
+
+# Score the trial step (ratio `r`), accept or reject it, and update `trust_r` per scheme.
+function trust_region_step!(cache::TrustRegionCache)
+    @unpack fu_new, step_size, g, H, loss, max_trust_r, radius_update_scheme = cache
+    cache.loss_new = get_loss(fu_new)
+    # Compute the ratio of the actual reduction to the predicted reduction.
+    cache.r = -(loss - cache.loss_new) / (step_size' * g + step_size' * H * step_size / 2)
+    @unpack r = cache
+
+    if radius_update_scheme === RadiusUpdateSchemes.Simple
+        # Poor agreement with the model: shrink the radius and count the shrink.
+        if r < cache.shrink_threshold
+            cache.trust_r *= cache.shrink_factor
+            cache.shrink_counter += 1
+        else
+            cache.shrink_counter = 0
+        end
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+
+            # Very good agreement: grow the radius, capped at max_trust_r.
+            if r > cache.expand_threshold
+                cache.trust_r = min(cache.expand_factor * cache.trust_r, max_trust_r)
+            end
+
+            cache.make_new_J = true
+        else
+            # No need to make a new J, no step was taken, so we try again with a smaller trust_r
+            cache.make_new_J = false
+        end
+
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
+            cache.force_stop = true
+        end
+
+    elseif radius_update_scheme === RadiusUpdateSchemes.Hei
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+            cache.make_new_J = true
+        else
+            cache.make_new_J = false
+        end
+        # Hei's radius update scheme: trust_r becomes rfunc(...) * internalnorm(step_size).
+        @unpack shrink_threshold, p1, p2, p3, p4 = cache
+        if rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(step_size) <
+           cache.trust_r
+            cache.shrink_counter += 1
+        else
+            cache.shrink_counter = 0
+        end
+        cache.trust_r = rfunc(r, shrink_threshold, p1, p3, p4, p2) *
+                        cache.internalnorm(step_size)
+
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
+           cache.internalnorm(g) < cache.ϵ
+            cache.force_stop = true
+        end
+
+    elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
+        if r < cache.shrink_threshold
+            cache.p1 = cache.p2 * cache.p1  # poor ratio: scale μ (p1) by c5 (p2)
+            cache.shrink_counter += 1
+        elseif r >= cache.expand_threshold &&
+               cache.internalnorm(step_size) > cache.trust_r / 2
+            cache.p1 = cache.p3 * cache.p1  # good ratio and long step: scale μ by c6 (p3)
+            cache.shrink_counter = 0
+        end
+
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+            cache.make_new_J = true
+        else
+            cache.make_new_J = false
+        end
+        # The radius is the scaling factor p1 times the norm of whatever `jvp!` returns.
+        @unpack p1 = cache
+        cache.trust_r = p1 * cache.internalnorm(jvp!(cache))
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
+           cache.internalnorm(g) < cache.ϵ
+            cache.force_stop = true
+        end
+        # Fan's update scheme (next branch)
+    elseif radius_update_scheme === RadiusUpdateSchemes.Fan
+        if r < cache.shrink_threshold
+            cache.p1 *= cache.p2  # poor ratio: scale μ (p1) by c5 (p2)
+            cache.shrink_counter += 1
+        elseif r > cache.expand_threshold
+            cache.p1 = min(cache.p1 * cache.p3, cache.p4)  # grow μ by c6, capped at M
+            cache.shrink_counter = 0
+        end
+
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+            cache.make_new_J = true
+        else
+            cache.make_new_J = false
+        end
+        # The radius is p1 times the residual norm raised to the power 0.99.
+        @unpack p1 = cache
+        cache.trust_r = p1 * (cache.internalnorm(cache.fu)^0.99)
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
+           cache.internalnorm(g) < cache.ϵ
+            cache.force_stop = true
+        end
+    elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+            cache.make_new_J = true  # NOTE(review): shrink_counter is never reset here
+            if retrospective_step!(cache) >= cache.expand_threshold
+                cache.trust_r = max(cache.p1 * cache.internalnorm(step_size), cache.trust_r)
+            end
+        # Otherwise the trial point is rejected: keep J, scale trust_r by p2 (alpha_2).
+        else
+            cache.make_new_J = false
+            cache.trust_r *= cache.p2
+            cache.shrink_counter += 1
+        end
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
+            cache.force_stop = true
+        end
+    end
+end
+
+# Choose `cache.step_size` by the dogleg rule from the Newton step held in `cache.u_tmp`,
+# the negated gradient `-cache.g`, and the current radius `cache.trust_r`.
+function dogleg!(cache::TrustRegionCache)
+    newton_step, radius = cache.u_tmp, cache.trust_r
+    # Case 1: the full Newton step already fits inside the trust region.
+    if norm(newton_step) ≤ radius
+        cache.step_size = deepcopy(newton_step)
+        return
+    end
+
+    # Case 2: the steepest-descent step reaches the boundary; scale it back onto it.
+    descent = -cache.g
+    descent_norm = norm(descent)
+    if descent_norm ≥ radius
+        cache.step_size = descent .* radius / descent_norm
+        return
+    end
+
+    # Case 3: move from the descent step towards the Newton step until the boundary is
+    # hit, i.e. take the larger root τ of ||descent + τ * leg||^2 = radius^2.
+    leg = newton_step - descent
+    leg_sq, cross, descent_sq = dot(leg, leg), dot(descent, leg), dot(descent, descent)
+    discriminant = cross^2 - leg_sq * (descent_sq - radius^2)
+    τ = (-cross + sqrt(discriminant)) / leg_sq
+    cache.step_size = descent + τ * leg
+end
+
+function take_step!(cache::TrustRegionCache{true})  # accept the trial point, in place
+    cache.u_prev .= cache.u  # save the outgoing iterate before it is overwritten
+    cache.u .= cache.u_tmp
+    cache.fu_prev .= cache.fu  # likewise for the residual
+    cache.fu .= cache.fu_new
+end
+# Out-of-place variant: rebinds the cache fields instead of broadcasting into them.
+function take_step!(cache::TrustRegionCache{false})
+    cache.u_prev = cache.u
+    cache.u = cache.u_tmp
+    cache.fu_prev = cache.fu
+    cache.fu = cache.fu_new
+end
+
+function jvp!(cache::TrustRegionCache{false})  # out-of-place; used for the Yuan radius
+    @unpack f, u, fu, uf = cache
+    if isa(u, Number)
+        return value_derivative(uf, u)  # NOTE(review): returns a tuple - intended?
+    end
+    return auto_jacvec(uf, u, fu)
+end
+# In-place variant: `auto_jacvec!` receives `cache.g` as its first argument; g is returned.
+function jvp!(cache::TrustRegionCache{true})
+    @unpack g, f, u, fu, uf = cache
+    if isa(u, Number)  # presumably unreachable for in-place problems - TODO confirm
+        return value_derivative(uf, u)
+    end
+    auto_jacvec!(g, uf, u, fu)
+    return g
+end
+
+# Run `perform_step!` until a stop is requested or a budget runs out; build the solution.
+function SciMLBase.solve!(cache::TrustRegionCache)
+    while true
+        cache.force_stop && break
+        cache.stats.nsteps < cache.maxiters || break
+        cache.shrink_counter < cache.alg.max_shrink_times || break
+        perform_step!(cache)
+        cache.stats.nsteps += 1
+    end
+    # Only an exhausted iteration budget maps to MaxIters; every other exit, including
+    # hitting `max_shrink_times`, is reported as Success.
+    hit_budget = cache.stats.nsteps == cache.maxiters
+    cache.retcode = hit_budget ? ReturnCode.MaxIters : ReturnCode.Success
+    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu; cache.retcode,
+        cache.stats)
+end
+
+# Rewind the cache to solve again from `u0`/`p`. NOTE(review): Yuan/Fan `p1` is not reset.
+function SciMLBase.reinit!(cache::TrustRegionCache{iip}, u0 = cache.u; p = cache.p,
+    abstol = cache.abstol, maxiters = cache.maxiters) where {iip}
+    cache.p = p
+    if iip
+        recursivecopy!(cache.u, u0)
+        cache.f(cache.fu, cache.u, p)
+    else
+        # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter
+        cache.u = u0
+        cache.fu = cache.f(cache.u, p)
+    end
+    cache.abstol, cache.maxiters = abstol, maxiters
+    cache.stats.nf = 1  # the evaluation above; mirrors NLStats(1, 0, 0, 0, 0) at construction
+    cache.stats.njacs = cache.stats.nfactors = cache.stats.nsolve = cache.stats.nsteps = 0
+    cache.force_stop = false
+    cache.retcode = ReturnCode.Default
+    cache.make_new_J = true
+    cache.loss = get_loss(cache.fu)
+    cache.shrink_counter = 0
+    cache.trust_r = convert(eltype(cache.u), cache.alg.initial_trust_radius)
+    if iszero(cache.trust_r)  # zero means "unspecified": fall back to max_trust_r / 11
+        cache.trust_r = convert(eltype(cache.u), cache.max_trust_r / 11)
+    end
+    return cache
+end
diff --git a/src/utils.jl b/src/utils.jl
index c50d52ad7..3df540632 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -37,21 +37,21 @@ function default_adargs_to_adtype(; chunk_size = Val{0}(), autodiff = Val{true}(
     return ad
 end
 
-# """
-# value_derivative(f, x)
-
-# Compute `f(x), d/dx f(x)` in the most efficient way.
-# """
-# function value_derivative(f::F, x::R) where {F, R}
-#     T = typeof(ForwardDiff.Tag(f, R))
-#     out = f(ForwardDiff.Dual{T}(x, one(x)))
-#     ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out)
-# end
-
-# # Todo: improve this dispatch
-# function value_derivative(f::F, x::StaticArraysCore.SVector) where {F}
-#     f(x), ForwardDiff.jacobian(f, x)
-# end
+"""
+    value_derivative(f, x)
+
+Return the tuple `(f(x), d/dx f(x))`, obtained from one call of `f` on a ForwardDiff dual.
+"""
+function value_derivative(f::F, x::R) where {F, R}
+    T = typeof(ForwardDiff.Tag(f, R))
+    out = f(ForwardDiff.Dual{T}(x, one(x)))  # dual number built from x and one(x)
+    ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out)
+end
+
+# Todo: improve this dispatch
+function value_derivative(f::F, x::SVector) where {F}
+    f(x), ForwardDiff.jacobian(f, x)  # static vectors: value plus `ForwardDiff.jacobian`
+end
 
 @inline value(x) = x
 @inline value(x::Dual) = ForwardDiff.value(x)
@@ -128,3 +128,17 @@ end
 
 concrete_jac(_) = nothing
 concrete_jac(::AbstractNewtonAlgorithm{CJ}) where {CJ} = CJ
+
+# Circumventing https://github.com/SciML/RecursiveArrayTools.jl/issues/277
+_iszero(x) = iszero(x)
+_iszero(x::ArrayPartition) = all(_iszero, x.x)  # test each element of `x.x` in turn
+
+_mutable_zero(x) = zero(x)
+_mutable_zero(x::SArray) = MArray(zero(x))  # a mutable array of zeros, not a copy of x
+
+_mutable(x) = x
+_mutable(x::SArray) = MArray(x)  # static arrays are converted with `MArray`
+_maybe_mutable(x, ::AbstractFiniteDifferencesMode) = _mutable(x)
+# The shadow allocated for Enzyme needs to be mutable
+_maybe_mutable(x, ::AutoSparseEnzyme) = _mutable(x)
+_maybe_mutable(x, _) = x  # any other second argument: hand x back untouched
diff --git a/test/23_test_cases.jl b/test/23_test_cases.jl
deleted file mode 100644
index 3cb0eb310..000000000
--- a/test/23_test_cases.jl
+++ /dev/null
@@ -1,510 +0,0 @@
-using NonlinearSolve, NLsolve, LinearAlgebra
-
-# Implementation of the 23 test problems in
-# [test_nonlin](https://people.sc.fsu.edu/~jburkardt/m_src/test_nonlin/test_nonlin.html)
-
-# ------------------------------------- Problem 1 ------------------------------------------
-function p1_f!(out, x, p = nothing)
-    n = length(x)
-    out[1] = 1.0 - x[1]
-    out[2:n] .= 10.0 .* (x[2:n] .- x[1:(n - 1)] .* x[1:(n - 1)])
-    nothing
-end
-
-n = 10
-x_sol = ones(n)
-x_start = ones(n)
-x_start[1] = -1.2
-p1_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Generalized Rosenbrock function")
-
-# ------------------------------------- Problem 2 ------------------------------------------
-function p2_f!(out, x, p = nothing)
-    out[1] = x[1] + 10.0 * x[2]
-    out[2] = sqrt(5.0) * (x[3] - x[4])
-    out[3] = (x[2] - 2.0 * x[3])^2
-    out[4] = sqrt(10.0) * (x[1] - x[4]) * (x[1] - x[4])
-    nothing
-end
-
-n = 4
-x_sol = zeros(n)
-x_start = [3.0, -1.0, 0.0, 1.0]
-p2_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Powell singular function")
-
-# ------------------------------------- Problem 3 ------------------------------------------
-function p3_f!(out, x, p = nothing)
-    out[1] = 10000.0 * x[1] * x[2] - 1.0
-    out[2] = exp(-x[1]) + exp(-x[2]) - 1.0001
-    nothing
-end
-
-n = 2
-x_sol = [1.098159e-05, 9.106146]
-x_start = [0.0, 1.0]
-p3_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Powell badly scaled function")
-
-# ------------------------------------- Problem 4 ------------------------------------------
-function p4_f!(out, x, p = nothing)
-    temp1 = x[2] - x[1] * x[1]
-    temp2 = x[4] - x[3] * x[3]
-
-    out[1] = -200.0 * x[1] * temp1 - (1.0 - x[1])
-    out[2] = 200.0 * temp1 + 20.2 * (x[2] - 1.0) + 19.8 * (x[4] - 1.0)
-    out[3] = -180.0 * x[3] * temp2 - (1.0 - x[3])
-    out[4] = 180.0 * temp2 + 20.2 * (x[4] - 1.0) + 19.8 * (x[2] - 1.0)
-    nothing
-end
-
-n = 4
-x_sol = ones(n)
-x_start = [-3.0, -1.0, -3.0, -1.0]
-p4_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Wood function")
-
-# ------------------------------------- Problem 5 ------------------------------------------
-function p5_f!(out, x, p = nothing)
-    if 0.0 < x[1]
-        temp = atan(x[2] / x[1]) / (2.0 * pi)
-    elseif x[1] < 0.0
-        temp = atan(x[2] / x[1]) / (2.0 * pi) + 0.5
-    else
-        temp = 0.25 * sign(x[2])
-    end
-
-    out[1] = 10.0 * (x[3] - 10.0 * temp)
-    out[2] = 10.0 * (sqrt(x[1] * x[1] + x[2] * x[2]) - 1.0)
-    out[3] = x[3]
-    nothing
-end
-
-n = 3
-x_sol = [1.0, 0.0, 0.0]
-x_start = [-1.0, 0.0, 0.0]
-p5_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Helical valley function")
-
-# ------------------------------------- Problem 6 ------------------------------------------
-function p6_f!(out, x, p = nothing)
-    n = length(x)
-    for i in 1:29
-        ti = i / 29.0
-        sum1 = 0.0
-        temp = 1.0
-        for j in 2:n
-            sum1 = sum1 + j * temp * x[j]
-            temp = ti * temp
-        end
-
-        sum2 = 0.0
-        temp = 1.0
-        for j in 1:n
-            sum2 = sum2 + temp * x[j]
-            temp = ti * temp
-        end
-        temp = 1.0 / ti
-
-        for k in 1:n
-            out[k] = out[k] + temp * (sum1 - sum2 * sum2 - 1.0) * (k - 2.0 * ti * sum2)
-            temp = ti * temp
-        end
-    end
-
-    out[1] = out[1] + 3.0 * x[1] - 2.0 * x[1] * x[1] + 2.0 * x[1]^3
-    out[2] = out[2] + x[2] - x[2]^2 - 1.0
-    nothing
-end
-
-n = 2
-x_sol = []
-x_start = zeros(n)
-p6_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Watson function")
-
-# ------------------------------------- Problem 7 ------------------------------------------
-function p7_f!(out, x, p = nothing)
-    n = length(x)
-    out .= 0.0
-    for j in 1:n
-        t1 = 1.0
-        t2 = x[j]
-        for i in 1:n
-            out[i] += t2
-            t3 = 2.0 * x[j] * t2 - t1
-            t1 = t2
-            t2 = t3
-        end
-    end
-    out ./= n
-
-    for i in 1:n
-        ip1 = i
-        if ip1 % 2 == 0
-            out[i] = out[i] + 1.0 / (ip1 * ip1 - 1)
-        end
-    end
-    nothing
-end
-
-n = 2
-x_sol = [0.2113248654051871, 0.7886751345948129]
-x_sol .= 2.0 .* x_sol .- 1.0
-x_start = zeros(n)
-for i in 1:n
-    x_start[i] = (2 * i - n) / (n + 1)
-end
-p7_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Chebyquad function")
-
-# ------------------------------------- Problem 8 ------------------------------------------
-function p8_f!(out, x, p = nothing)
-    n = length(x)
-    out[1:(n - 1)] .= x[1:(n - 1)] .+ sum(x) .- (n + 1)
-    out[n] = prod(x) - 1.0
-    nothing
-end
-
-n = 10
-x_sol = ones(n)
-x_start = ones(n) ./ 2
-p8_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Brown almost linear function")
-
-# ------------------------------------- Problem 9 ------------------------------------------
-function p9_f!(out, x, p = nothing)
-    n = length(x)
-    h = 1.0 / (n + 1)
-    for k in 1:n
-        out[k] = 2.0 * x[k] + 0.5 * h^2 * (x[k] + k * h + 1.0)^3
-        if 1 < k
-            out[k] = out[k] - x[k - 1]
-        end
-        if k < n
-            out[k] = out[k] - x[k + 1]
-        end
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n)
-for i in 1:n
-    x_start[i] = (i * (i - n - 1)) / (n + 1)^2
-end
-p9_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Discrete boundary value function")
-
-# ------------------------------------- Problem 10 -----------------------------------------
-function p10_f!(out, x, p = nothing)
-    n = length(x)
-    h = 1.0 / (n + 1)
-    for k in 1:n
-        tk = k / (n + 1)
-        sum1 = 0.0
-        for j in 1:k
-            tj = j * h
-            sum1 = sum1 + tj * (x[j] + tj + 1.0)^3
-        end
-        sum2 = 0.0
-        for j in k:n
-            tj = j * h
-            sum2 = sum2 + (1.0 - tj) * (x[j] + tj + 1.0)^3
-        end
-
-        out[k] = x[k] + h * ((1.0 - tk) * sum1 + tk * sum2) / 2.0
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = zeros(n)
-for i in 1:n
-    x_start[i] = (i * (i - n - 1)) / (n + 1)^2
-end
-p10_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Discrete integral equation function")
-
-# ------------------------------------- Problem 11 -----------------------------------------
-function p11_f!(out, x, p = nothing)
-    n = length(x)
-    c_sum = sum(cos.(x))
-    for k in 1:n
-        out[k] = n - c_sum + k * (1.0 - cos(x[k])) - sin(x[k])
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n) / n
-p11_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Trigonometric function")
-
-# ------------------------------------- Problem 12 -----------------------------------------
-function p12_f!(out, x, p = nothing)
-    n = length(x)
-    sum1 = 0.0
-    for j in 1:n
-        sum1 += j * (x[j] - 1.0)
-    end
-    for k in 1:n
-        out[k] = x[k] - 1.0 + k * sum1 * (1.0 + 2.0 * sum1 * sum1)
-    end
-    nothing
-end
-
-n = 10
-x_sol = ones(n)
-x_start = zeros(n)
-for i in 1:n
-    x_start[i] = 1.0 - i / n
-end
-p12_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Variably dimensioned function")
-
-# ------------------------------------- Problem 13 -----------------------------------------
-function p13_f!(out, x, p = nothing)
-    n = length(x)
-    for k in 1:n
-        out[k] = (3.0 - 2.0 * x[k]) * x[k] + 1.0
-        if 1 < k
-            out[k] -= x[k - 1]
-        end
-        if k < n
-            out[k] -= 2.0 * x[k + 1]
-        end
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n) .* (-1.0)
-p13_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Broyden tridiagonal function")
-
-# ------------------------------------- Problem 14 -----------------------------------------
-function p14_f!(out, x, p = nothing)
-    n = length(x)
-    ml = 5
-    mu = 1
-    for k in 1:n
-        k1 = max(1, k - ml)
-        k2 = min(n, k + mu)
-
-        temp = 0.0
-        for j in k1:k2
-            if j != k
-                temp += x[j] * (1.0 + x[j])
-            end
-        end
-        out[k] = x[k] * (2.0 + 5.0 * x[k] * x[k]) + 1.0 - temp
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n) .* (-1.0)
-p14_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Broyden banded function")
-
-# ------------------------------------- Problem 15 -----------------------------------------
-function p15_f!(out, x, p = nothing)
-    out[1] = (x[1] * x[1] + x[2] * x[3]) - 0.0001
-    out[2] = (x[1] * x[2] + x[2] * x[4]) - 1.0
-    out[3] = (x[3] * x[1] + x[4] * x[3]) - 0.0
-    out[4] = (x[3] * x[2] + x[4] * x[4]) - 0.0001
-    nothing
-end
-
-n = 4
-x_sol = [0.01, 50.0, 0.0, 0.01]
-x_start = [1.0, 0.0, 0.0, 1.0]
-p15_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Hammarling 2 by 2 matrix square root problem")
-
-# ------------------------------------- Problem 16 -----------------------------------------
-function p16_f!(out, x, p = nothing)
-    out[1] = (x[1] * x[1] + x[2] * x[4] + x[3] * x[7]) - 0.0001
-    out[2] = (x[1] * x[2] + x[2] * x[5] + x[3] * x[8]) - 1.0
-    out[3] = x[1] * x[3] + x[2] * x[6] + x[3] * x[9]
-    out[4] = x[4] * x[1] + x[5] * x[4] + x[6] * x[7]
-    out[5] = (x[4] * x[2] + x[5] * x[5] + x[6] * x[8]) - 0.0001
-    out[6] = x[4] * x[3] + x[5] * x[6] + x[6] * x[9]
-    out[7] = x[7] * x[1] + x[8] * x[4] + x[9] * x[7]
-    out[8] = x[7] * x[2] + x[8] * x[5] + x[9] * x[8]
-    out[9] = (x[7] * x[3] + x[8] * x[6] + x[9] * x[9]) - 0.0001
-    nothing
-end
-
-n = 9
-x_sol = [0.01, 50.0, 0.0, 0.0, 0.01, 0.0, 0.0, 0.0, 0.01]
-x_start = [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]
-p16_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Hammarling 3 by 3 matrix square root problem")
-
-# ------------------------------------- Problem 17 -----------------------------------------
-function p17_f!(out, x, p = nothing)
-    out[1] = x[1] + x[2] - 3.0
-    out[2] = x[1]^2 + x[2]^2 - 9.0
-    nothing
-end
-
-n = 2
-x_sol = [0.0, 3.0]
-x_start = [1.0, 5.0]
-p17_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Dennis and Schnabel 2 by 2 example")
-
-# ------------------------------------- Problem 18 -----------------------------------------
-function p18_f!(out, x, p = nothing)
-    if x[1] != 0.0
-        out[1] = x[2]^2 * (1.0 - exp(-x[1] * x[1])) / x[1]
-    else
-        out[1] = 0.0
-    end
-    if x[2] != 0.0
-        out[2] = x[1] * (1.0 - exp(-x[2] * x[2])) / x[2]
-    else
-        out[2] = 0.0
-    end
-    nothing
-end
-
-n = 2
-x_sol = zeros(n)
-x_start = [2.0, 2.0]
-p18_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Sample problem 18")
-
-# ------------------------------------- Problem 19 -----------------------------------------
-function p19_f!(out, x, p = nothing)
-    out[1] = x[1] * (x[1]^2 + x[2]^2)
-    out[2] = x[2] * (x[1]^2 + x[2]^2)
-    nothing
-end
-
-n = 2
-x_sol = zeros(n)
-x_start = [3.0, 3.0]
-p19_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Sample problem 19")
-
-# ------------------------------------- Problem 20 -----------------------------------------
-function p20_f!(out, x, p = nothing)
-    out[1] = x[1] * (x[1] - 5.0)^2
-    nothing
-end
-
-n = 1
-x_sol = [5.0] # OR [0.0]...
-x_start = [1.0]
-p20_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Scalar problem f(x) = x(x - 5)^2")
-
-# ------------------------------------- Problem 21 -----------------------------------------
-function p21_f!(out, x, p = nothing)
-    out[1] = x[1] - x[2]^3 + 5.0 * x[2]^2 - 2.0 * x[2] - 13.0
-    out[2] = x[1] + x[2]^3 + x[2]^2 - 14.0 * x[2] - 29.0
-    nothing
-end
-
-n = 2
-x_sol = [5.0, 4.0]
-x_start = [0.5, -2.0]
-p21_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Freudenstein-Roth function")
-
-# ------------------------------------- Problem 22 -----------------------------------------
-function p22_f!(out, x, p = nothing)
-    out[1] = x[1] * x[1] - x[2] + 1.0
-    out[2] = x[1] - cos(0.5 * pi * x[2])
-    nothing
-end
-
-n = 2
-x_sol = [0.0, 1.0]
-x_start = [1.0, 0.0]
-p22_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Boggs function")
-
-# ------------------------------------- Problem 23 -----------------------------------------
-function p23_f!(out, x, p = nothing)
-    c = 0.9
-    out[1:n] = x[1:n]
-    μ = zeros(n)
-    for i in 1:n
-        μ[i] = (2 * i) / (2 * n)
-    end
-    for i in 1:n
-        s = 0.0
-        for j in 1:n
-            s = s + (μ[i] * x[j]) / (μ[i] + μ[j])
-        end
-        term = 1.0 - c * s / (2 * n)
-        out[i] -= 1.0 / term
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n)
-p23_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Chandrasekhar function")
-
-# ----------------------------------- Solve problems ---------------------------------------
-problems = (p1_f!, p2_f!, p3_f!, p4_f!, p5_f!, p6_f!, p7_f!, p8_f!, p9_f!, p10_f!, p11_f!,
-    p12_f!, p13_f!, p14_f!, p15_f!, p16_f!, p17_f!, p18_f!, p19_f!, p20_f!, p21_f!,
-    p22_f!, p23_f!)
-dicts = (p1_dict, p2_dict, p3_dict, p4_dict, p5_dict, p6_dict, p7_dict, p8_dict, p9_dict,
-    p10_dict, p11_dict, p12_dict, p13_dict, p14_dict, p15_dict, p16_dict, p17_dict,
-    p18_dict, p19_dict, p20_dict, p21_dict, p22_dict, p23_dict)
-algs = (NewtonRaphson(), TrustRegion(), LevenbergMarquardt())
-names = ("NewtonRaphson", "TrustRegion", "LevenbergMarquardt")
-
-for (problem, dict) in zip(problems, dicts)
-    for (alg, name) in zip(algs, names)
-        local x = dict["start"]
-        local nlprob = NonlinearProblem(problem, x)
-        local out = similar(x)
-        try
-            problem(out,
-                solve(nlprob, alg, abstol = 1e-15, reltol = 1e-15).u, nothing)
-            dict["error_" * name] = ""
-        catch
-            # println("Error in $name")
-            dict["error_" * name] = "(Singular error)"
-        end
-        dict["out_" * name] = out
-    end
-    local x = dict["start"]
-    local nlprob = NonlinearProblem(problem, x)
-    sol = nlsolve(problem, x, xtol = 1e-15, ftol = 1e-15)
-    dict["norm_nlsolve"] = sol.residual_norm
-end
-
-# ----------------------------------- Print results ----------------------------------------
-i_str = i_str = rpad("nr", 3, " ")
-title_str = rpad("Problem", 50, " ")
-n_str = rpad("n", 5, " ")
-norm_str = rpad(names[1], 20, " ") * rpad(names[2], 20, " ") * rpad(names[3], 20, " ") *
-           rpad("nlsolve", 20, " ")
-println("$i_str $title_str $n_str $norm_str")
-
-for (i, dict) in enumerate(dicts)
-    local i_str = rpad(string(i), 3, " ")
-    local title_str = rpad(dict["title"], 50, " ")
-    local n_str = rpad(string(dict["n"]), 5, " ")
-    local norm_str = ""
-    for (alg, name) in zip(algs, names)
-        norm_str *= rpad(string(trunc(norm(dict["out_" * name]); sigdigits = 5)), 20, " ")
-    end
-    norm_str *= rpad(string(round(dict["norm_nlsolve"]; sigdigits = 5)), 20, " ")
-    println("$i_str $title_str $n_str $norm_str")
-end
diff --git a/test/basictests.jl b/test/basictests.jl
index 05a0152fa..ee42db9f3 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -1,774 +1,672 @@
-using NonlinearSolve
-using StaticArrays
-using BenchmarkTools
-using LinearSolve
-using Random
-using LinearAlgebra
-using Test
+using BenchmarkTools, LinearSolve, NonlinearSolve, StaticArrays, Random, LinearAlgebra,
+    Test, ForwardDiff, Zygote, Enzyme, SparseDiffTools
 
-# --- NewtonRaphson tests ---
-
-function benchmark_immutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, NewtonRaphson(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_mutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, NewtonRaphson(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_scalar(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    sol = (solve(probN, NewtonRaphson(), abstol = 1e-9))
-end
-
-function ff(u, p)
-    u .* u .- 2
-end
-const cu0 = @SVector[1.0, 1.0]
-function sf(u, p)
-    u * u - 2
-end
-const csu0 = 1.0
-u0 = [1.0, 1.0]
-
-sol = benchmark_immutable(ff, cu0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_mutable(ff, u0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_scalar(sf, csu0)
-@test sol.retcode === ReturnCode.Success
-@test abs(sol.u * sol.u - 2) < 1e-9
-
-# @test (@ballocated benchmark_immutable(ff, cu0)) < 200
-# @test (@ballocated benchmark_mutable(ff, cu0)) < 200
-# @test (@ballocated benchmark_scalar(sf, csu0)) < 400
-
-function benchmark_inplace(f, u0, linsolve, precs)
-    probN = NonlinearProblem{true}(f, u0)
-    solver = init(probN, NewtonRaphson(; linsolve, precs), abstol = 1e-9)
-    sol = solve!(solver)
-end
+_nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
 
-function ffiip(du, u, p)
-    du .= u .* u .- 2
-end
-u0 = [1.0, 1.0]
-
-precs = [
-    NonlinearSolve.DEFAULT_PRECS,
-    (args...) -> (Diagonal(rand!(similar(u0))), nothing)
-]
+# --- NewtonRaphson tests ---
 
-for prec in precs, linsolve in (nothing, KrylovJL_GMRES())
-    sol = benchmark_inplace(ffiip, u0, linsolve, prec)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-end
+@testset "NewtonRaphson" begin
+    function benchmark_nlsolve_oop(f, u0, p = 2.0)  # out-of-place: init, then solve!
+        problem = NonlinearProblem{false}(f, u0, p)
+        alg = NewtonRaphson()
+        return solve!(init(problem, alg; abstol = 1e-9))
+    end
 
-u0 = [1.0, 1.0]
-probN = NonlinearProblem{true}(ffiip, u0)
-solver = init(probN, NewtonRaphson(), abstol = 1e-9)
-@test (@ballocated solve!(solver)) <= 64
+    function benchmark_nlsolve_iip(f, u0, p = 2.0; linsolve, precs)  # in-place variant
+        alg = NewtonRaphson(; linsolve = linsolve, precs = precs)
+        problem = NonlinearProblem{true}(f, u0, p)
+        return solve!(init(problem, alg; abstol = 1e-9))
+    end
 
-# AD Tests
-using ForwardDiff
+    quadratic_f(u, p) = u .* u .- p
+    quadratic_f!(du, u, p) = (du .= u .* u .- p)
 
-# Immutable
-f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
+    @testset "[OOP] u0: $(typeof(u0))" for u0 in ([1.0, 1.0], @SVector[1.0, 1.0], 1.0)
+        sol = benchmark_nlsolve_oop(quadratic_f, u0)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, NewtonRaphson(), abstol = 1e-9)
-    return sol.u[end]
-end
+        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0), NewtonRaphson(),
+            abstol = 1e-9)
+        @test (@ballocated solve!($cache)) < 200
+    end
 
-for p in 1.0:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    precs = [NonlinearSolve.DEFAULT_PRECS, :Random]
 
-# Scalar
-f, u0 = (u, p) -> u * u - p, 1.0
+    @testset "[IIP] u0: $(typeof(u0)) precs: $(_nameof(prec)) linsolve: $(_nameof(linsolve))" for u0 in ([
+            1.0, 1.0],), prec in precs, linsolve in (nothing, KrylovJL_GMRES())
+        if prec === :Random
+            prec = (args...) -> (Diagonal(randn!(similar(u0))), nothing)
+        end
+        sol = benchmark_nlsolve_iip(quadratic_f!, u0; linsolve, precs = prec)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-# NewtonRaphson
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, NewtonRaphson(), abstol = 1e-10)
-    return sol.u
-end
+        cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0),
+            NewtonRaphson(; linsolve, precs = prec), abstol = 1e-9)
+        @test (@ballocated solve!($cache)) ≤ 64  # budget for the in-place solve
+    end
 
-@test ForwardDiff.derivative(g, 1.0) ≈ 0.5
+    # ForwardDiff through the solve, starting from an immutable (SVector) initial guess
+    @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
+        @test begin
+            res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
+            res_true = sqrt(p)
+            all(res.u .≈ res_true)
+        end
+        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+            @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
+    end
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    @testset "[OOP] [Scalar AD] p: $(p)" for p in 1.0:0.1:100.0
+        @test begin
+            res = benchmark_nlsolve_oop(quadratic_f, 1.0, p)
+            res_true = sqrt(p)
+            res.u ≈ res_true
+        end
+        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, 1.0, p).u, p) ≈
+              1 / (2 * sqrt(p))
+    end
 
-f = (u, p) -> p[1] * u * u - p[2]
-t = (p) -> [sqrt(p[2] / p[1])]
-p = [0.9, 50.0]
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, NewtonRaphson())
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# Iterator interface
-f = (u, p) -> u * u - p
-g = function (p_range)
-    probN = NonlinearProblem{false}(f, 0.5, p_range[begin])
-    cache = init(probN, NewtonRaphson(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, cache.u; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u
+    quadratic_f2(u, p) = @. p[1] * u * u - p[2]
+    t = (p) -> [sqrt(p[2] / p[1])]
+    p = [0.9, 50.0]
+    @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
+    @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u], p) ≈
+          ForwardDiff.jacobian(t, p)
+
+    # Iterator interface
+    function nlprob_iterator_interface(f, p_range, ::Val{iip}) where {iip}
+        probN = NonlinearProblem{iip}(f, iip ? [0.5] : 0.5, p_range[begin])
+        cache = init(probN, NewtonRaphson(); maxiters = 100, abstol = 1e-10)
+        sols = zeros(length(p_range))
+        for (i, p) in enumerate(p_range)
+            reinit!(cache, iip ? [cache.u[1]] : cache.u; p = p)
+            sol = solve!(cache)
+            sols[i] = iip ? sol.u[1] : sol.u
+        end
+        return sols
     end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-f = (res, u, p) -> (res[begin] = u[1] * u[1] - p)
-g = function (p_range)
-    probN = NonlinearProblem{true}(f, [0.5], p_range[begin])
-    cache = init(probN, NewtonRaphson(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, [cache.u[1]]; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u[1]
+    p = range(0.01, 2, length = 200)
+    @test nlprob_iterator_interface(quadratic_f, p, Val(false)) ≈ sqrt.(p)
+    @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p)
+
+    # Every autodiff choice must reach sqrt(2) from scalar, Vector and SVector guesses.
+    @testset "ADType: $(autodiff) u0: $(u0)" for autodiff in (false, true,
+        AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(),
+        AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0], @SVector[1.0, 1.0])
+        probN = NonlinearProblem(quadratic_f, u0, 2.0)
+        @test all(solve(probN, NewtonRaphson(; autodiff)).u .≈ sqrt(2.0))
     end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-# Error Checks
-
-f, u0 = (u, p) -> u .* u .- 2.0, @SVector[1.0, 1.0]
-probN = NonlinearProblem(f, u0)
-
-@test solve(probN, NewtonRaphson()).u[end] ≈ sqrt(2.0)
-@test solve(probN, NewtonRaphson(; autodiff = false)).u[end] ≈ sqrt(2.0)
-
-for u0 in [1.0, [1, 1.0]]
-    local f, probN, sol
-    f = (u, p) -> u .* u .- 2.0
-    probN = NonlinearProblem(f, u0)
-    sol = sqrt(2) * u0
-
-    @test solve(probN, NewtonRaphson()).u ≈ sol
-    @test solve(probN, NewtonRaphson()).u ≈ sol
-    @test solve(probN, NewtonRaphson(; autodiff = false)).u ≈ sol
 end
 
 # --- TrustRegion tests ---
-
-function benchmark_immutable(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_mutable(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_scalar(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{false}(f, u0)
-    sol = (solve(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9))
-end
-
-function ff(u, p = nothing)
-    u .* u .- 2
-end
-
-function sf(u, p = nothing)
-    u * u - 2
-end
-
-u0 = [1.0, 1.0]
-radius_update_schemes = [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Hei,
-    RadiusUpdateSchemes.Yuan, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
-
-for radius_update_scheme in radius_update_schemes
-    sol = benchmark_immutable(ff, cu0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-    sol = benchmark_mutable(ff, u0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-    sol = benchmark_scalar(sf, csu0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test abs(sol.u * sol.u - 2) < 1e-9
-end
-
-function benchmark_inplace(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{true}(f, u0)
-    solver = init(probN, TrustRegion(; radius_update_scheme), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function ffiip(du, u, p = nothing)
-    du .= u .* u .- 2
-end
-u0 = [1.0, 1.0]
-
-for radius_update_scheme in radius_update_schemes
-    sol = benchmark_inplace(ffiip, u0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-end
-
-for radius_update_scheme in radius_update_schemes
-    probN = NonlinearProblem{true}(ffiip, u0)
-    solver = init(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    @test (@ballocated solve!(solver)) < 200
-end
-
-# AD Tests
-using ForwardDiff
-
-# Immutable
-f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(), abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-# Scalar
-f, u0 = (u, p) -> u * u - p, 1.0
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(), abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-f = (u, p) -> p[1] * u * u - p[2]
-t = (p) -> [sqrt(p[2] / p[1])]
-p = [0.9, 50.0]
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion())
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# Iterator interface
-f = (u, p) -> u * u - p
-g = function (p_range)
-    probN = NonlinearProblem{false}(f, 0.5, p_range[begin])
-    cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, cache.u; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u
-    end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-f = (res, u, p) -> (res[begin] = u[1] * u[1] - p)
-g = function (p_range)
-    probN = NonlinearProblem{true}(f, [0.5], p_range[begin])
-    cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, [cache.u[1]]; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u[1]
+@testset "TrustRegion" begin
+    function benchmark_nlsolve_oop(f, u0, p = 2.0; radius_update_scheme)
+        alg = TrustRegion(; radius_update_scheme = radius_update_scheme)
+        problem = NonlinearProblem{false}(f, u0, p)  # {false}: out-of-place problem
+        return solve!(init(problem, alg; abstol = 1e-9))
     end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-# Error Checks
-f, u0 = (u, p) -> u .* u .- 2, @SVector[1.0, 1.0]
-probN = NonlinearProblem(f, u0)
-
-@test solve(probN, TrustRegion()).u[end] ≈ sqrt(2.0)
-@test solve(probN, TrustRegion(; autodiff = false)).u[end] ≈ sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Hei, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Yuan, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Fan, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Bastin, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-for u0 in [1.0, [1, 1.0]]
-    local f, probN, sol
-    f = (u, p) -> u .* u .- 2.0
-    probN = NonlinearProblem(f, u0)
-    sol = sqrt(2) * u0
-
-    @test solve(probN, TrustRegion()).u ≈ sol
-    @test solve(probN, TrustRegion()).u ≈ sol
-    @test solve(probN, TrustRegion(; autodiff = false)).u ≈ sol
-end
-
-# Test that `TrustRegion` passes a test that `NewtonRaphson` fails on.
-u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
-global g, f
-f = (u, p) -> 0.010000000000000002 .+
-              10.000000000000002 ./ (1 .+
-               (0.21640425613334457 .+
-                216.40425613334457 ./ (1 .+
-                 (0.21640425613334457 .+
-                  216.40425613334457 ./
-                  (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
-              0.0011552453009332421u .- p
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, TrustRegion(), abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-        abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-        abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-# Test kwars in `TrustRegion`
-max_trust_radius = [10.0, 100.0, 1000.0]
-initial_trust_radius = [10.0, 1.0, 0.1]
-step_threshold = [0.0, 0.01, 0.25]
-shrink_threshold = [0.25, 0.3, 0.5]
-expand_threshold = [0.5, 0.8, 0.9]
-shrink_factor = [0.1, 0.3, 0.5]
-expand_factor = [1.5, 2.0, 3.0]
-max_shrink_times = [10, 20, 30]
-
-list_of_options = zip(max_trust_radius, initial_trust_radius, step_threshold,
-    shrink_threshold, expand_threshold, shrink_factor,
-    expand_factor, max_shrink_times)
-for options in list_of_options
-    local probN, sol, alg
-    alg = TrustRegion(max_trust_radius = options[1],
-        initial_trust_radius = options[2],
-        step_threshold = options[3],
-        shrink_threshold = options[4],
-        expand_threshold = options[5],
-        shrink_factor = options[6],
-        expand_factor = options[7],
-        max_shrink_times = options[8])
-
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, alg, abstol = 1e-10)
-    @test all(abs.(f(u, p)) .< 1e-10)
-end
-
-# Testing consistency of iip vs oop iterations
-
-maxiterations = [2, 3, 4, 5]
-u0 = [1.0, 1.0]
-function iip_oop(f, fip, u0, radius_update_scheme, maxiters)
-    prob_iip = NonlinearProblem{true}(fip, u0)
-    solver = init(prob_iip, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9, maxiters = maxiters)
-    sol_iip = solve!(solver)
-
-    prob_oop = NonlinearProblem{false}(f, u0)
-    solver = init(prob_oop, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9, maxiters = maxiters)
-    sol_oop = solve!(solver)
-
-    return sol_iip.u[end], sol_oop.u[end]
-end
 
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Simple, maxiters)
-    @test iip == oop
-end
-
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Hei, maxiters)
-    @test iip == oop
-end
-
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Yuan, maxiters)
-    @test iip == oop
-end
-
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Fan, maxiters)
-    @test iip == oop
-end
-
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Bastin, maxiters)
-    @test iip == oop
-end
-
-# --- LevenbergMarquardt tests ---
-
-function benchmark_immutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_mutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_scalar(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    sol = (solve(probN, LevenbergMarquardt(), abstol = 1e-9))
-end
-
-function ff(u, p)
-    u .* u .- 2
-end
-
-function sf(u, p)
-    u * u - 2
-end
-u0 = [1.0, 1.0]
-
-sol = benchmark_immutable(ff, cu0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_mutable(ff, u0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_scalar(sf, csu0)
-@test sol.retcode === ReturnCode.Success
-@test abs(sol.u * sol.u - 2) < 1e-9
-
-function benchmark_inplace(f, u0)
-    probN = NonlinearProblem{true}(f, u0)
-    solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function ffiip(du, u, p)
-    du .= u .* u .- 2
-end
-u0 = [1.0, 1.0]
-
-sol = benchmark_inplace(ffiip, u0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-
-u0 = [1.0, 1.0]
-probN = NonlinearProblem{true}(ffiip, u0)
-solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-@test (@ballocated solve!(solver)) < 120
-
-# AD Tests
-using ForwardDiff
-
-# Immutable
-f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, LevenbergMarquardt(), abstol = 1e-9)
-    return sol.u[end]
-end
+    function benchmark_nlsolve_iip(f, u0, p = 2.0; radius_update_scheme)
+        alg = TrustRegion(; radius_update_scheme = radius_update_scheme)
+        problem = NonlinearProblem{true}(f, u0, p)  # {true}: in-place problem
+        return solve!(init(problem, alg; abstol = 1e-9))
+    end
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    quadratic_f(u, p) = u .* u .- p
+    quadratic_f!(du, u, p) = (du .= u .* u .- p)
 
-# Scalar
-f, u0 = (u, p) -> u * u - p, 1.0
+    radius_update_schemes = [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Hei,
+        RadiusUpdateSchemes.Yuan, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
 
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
-    return sol.u
-end
+    @testset "[OOP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([1.0, 1.0], @SVector[1.0, 1.0], 1.0), radius_update_scheme in radius_update_schemes
+        sol = benchmark_nlsolve_oop(quadratic_f, u0; radius_update_scheme)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0),
+            TrustRegion(; radius_update_scheme); abstol = 1e-9)
+        @test (@ballocated solve!($cache)) < 200
+    end
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    @testset "[IIP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([1.0, 1.0],), radius_update_scheme in radius_update_schemes
+        sol = benchmark_nlsolve_iip(quadratic_f!, u0; radius_update_scheme)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-f = (u, p) -> p[1] * u * u - p[2]
-t = (p) -> [sqrt(p[2] / p[1])]
-p = [0.9, 50.0]
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, LevenbergMarquardt())
-    return [sol.u]
+        cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0),
+            TrustRegion(; radius_update_scheme); abstol = 1e-9)
+        @test (@ballocated solve!($cache)) ≤ 64
+    end
 end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# Error Checks
-f, u0 = (u, p) -> u .* u .- 2.0, @SVector[1.0, 1.0]
-probN = NonlinearProblem(f, u0)
 
-@test solve(probN, LevenbergMarquardt()).u[end] ≈ sqrt(2.0)
-@test solve(probN, LevenbergMarquardt(; autodiff = false)).u[end] ≈ sqrt(2.0)
 
-for u0 in [1.0, [1, 1.0]]
-    local f, probN, sol
-    f = (u, p) -> u .* u .- 2.0
-    probN = NonlinearProblem(f, u0)
-    sol = sqrt(2) * u0
-
-    @test solve(probN, LevenbergMarquardt()).u ≈ sol
-    @test solve(probN, LevenbergMarquardt()).u ≈ sol
-    @test solve(probN, LevenbergMarquardt(; autodiff = false)).u ≈ sol
-end
-
-# Test that `LevenbergMarquardt` passes a test that `NewtonRaphson` fails on.
-u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
-global g, f
-f = (u, p) -> 0.010000000000000002 .+
-              10.000000000000002 ./ (1 .+
-               (0.21640425613334457 .+
-                216.40425613334457 ./ (1 .+
-                 (0.21640425613334457 .+
-                  216.40425613334457 ./
-                  (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
-              0.0011552453009332421u .- p
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-# # Test kwars in `LevenbergMarquardt`
-damping_initial = [0.5, 2.0, 5.0]
-damping_increase_factor = [1.5, 3.0, 10.0]
-damping_decrease_factor = [2, 5, 10]
-finite_diff_step_geodesic = [0.02, 0.2, 0.3]
-α_geodesic = [0.6, 0.8, 0.9]
-b_uphill = [0, 1, 2]
-min_damping_D = [1e-12, 1e-9, 1e-4]
-
-list_of_options = zip(damping_initial, damping_increase_factor, damping_decrease_factor,
-    finite_diff_step_geodesic, α_geodesic, b_uphill,
-    min_damping_D)
-for options in list_of_options
-    local probN, sol, alg
-    alg = LevenbergMarquardt(damping_initial = options[1],
-        damping_increase_factor = options[2],
-        damping_decrease_factor = options[3],
-        finite_diff_step_geodesic = options[4],
-        α_geodesic = options[5],
-        b_uphill = options[6],
-        min_damping_D = options[7])
-
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, alg, abstol = 1e-10)
-    @test all(abs.(f(u, p)) .< 1e-10)
-end
+# # Immutable
+# f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(), abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
+#         abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
+#         abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
+#         abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
+#         abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# # Scalar
+# f, u0 = (u, p) -> u * u - p, 1.0
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(), abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# f = (u, p) -> p[1] * u * u - p[2]
+# t = (p) -> [sqrt(p[2] / p[1])]
+# p = [0.9, 50.0]
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion())
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan))
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan))
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin))
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# # Iterator interface
+# f = (u, p) -> u * u - p
+# g = function (p_range)
+#     probN = NonlinearProblem{false}(f, 0.5, p_range[begin])
+#     cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
+#     sols = zeros(length(p_range))
+#     for (i, p) in enumerate(p_range)
+#         reinit!(cache, cache.u; p = p)
+#         sol = solve!(cache)
+#         sols[i] = sol.u
+#     end
+#     return sols
+# end
+# p = range(0.01, 2, length = 200)
+# @test g(p) ≈ sqrt.(p)
+
+# f = (res, u, p) -> (res[begin] = u[1] * u[1] - p)
+# g = function (p_range)
+#     probN = NonlinearProblem{true}(f, [0.5], p_range[begin])
+#     cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
+#     sols = zeros(length(p_range))
+#     for (i, p) in enumerate(p_range)
+#         reinit!(cache, [cache.u[1]]; p = p)
+#         sol = solve!(cache)
+#         sols[i] = sol.u[1]
+#     end
+#     return sols
+# end
+# p = range(0.01, 2, length = 200)
+# @test g(p) ≈ sqrt.(p)
+
+# # Error Checks
+# f, u0 = (u, p) -> u .* u .- 2, @SVector[1.0, 1.0]
+# probN = NonlinearProblem(f, u0)
+
+# @test solve(probN, TrustRegion()).u[end] ≈ sqrt(2.0)
+# @test solve(probN, TrustRegion(; autodiff = false)).u[end] ≈ sqrt(2.0)
+
+# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei)).u[end] ≈
+#       sqrt(2.0)
+# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Hei, autodiff = false)).u[end] ≈
+#       sqrt(2.0)
+
+# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan)).u[end] ≈
+#       sqrt(2.0)
+# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Yuan, autodiff = false)).u[end] ≈
+#       sqrt(2.0)
+
+# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan)).u[end] ≈
+#       sqrt(2.0)
+# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Fan, autodiff = false)).u[end] ≈
+#       sqrt(2.0)
+
+# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin)).u[end] ≈
+#       sqrt(2.0)
+# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Bastin, autodiff = false)).u[end] ≈
+#       sqrt(2.0)
+
+# for u0 in [1.0, [1, 1.0]]
+#     local f, probN, sol
+#     f = (u, p) -> u .* u .- 2.0
+#     probN = NonlinearProblem(f, u0)
+#     sol = sqrt(2) * u0
+
+#     @test solve(probN, TrustRegion()).u ≈ sol
+#     @test solve(probN, TrustRegion()).u ≈ sol
+#     @test solve(probN, TrustRegion(; autodiff = false)).u ≈ sol
+# end
+
+# # Test that `TrustRegion` passes a test that `NewtonRaphson` fails on.
+# u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
+# global g, f
+# f = (u, p) -> 0.010000000000000002 .+
+#               10.000000000000002 ./ (1 .+
+#                (0.21640425613334457 .+
+#                 216.40425613334457 ./ (1 .+
+#                  (0.21640425613334457 .+
+#                   216.40425613334457 ./
+#                   (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
+#               0.0011552453009332421u .- p
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, TrustRegion(), abstol = 1e-10)
+#     return sol.u
+# end
+# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+# u = g(p)
+# f(u, p)
+# @test all(abs.(f(u, p)) .< 1e-10)
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+# u = g(p)
+# f(u, p)
+# @test all(abs.(f(u, p)) .< 1e-10)
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+# u = g(p)
+# f(u, p)
+# @test all(abs.(f(u, p)) .< 1e-10)
+
+# # Test kwars in `TrustRegion`
+# max_trust_radius = [10.0, 100.0, 1000.0]
+# initial_trust_radius = [10.0, 1.0, 0.1]
+# step_threshold = [0.0, 0.01, 0.25]
+# shrink_threshold = [0.25, 0.3, 0.5]
+# expand_threshold = [0.5, 0.8, 0.9]
+# shrink_factor = [0.1, 0.3, 0.5]
+# expand_factor = [1.5, 2.0, 3.0]
+# max_shrink_times = [10, 20, 30]
+
+# list_of_options = zip(max_trust_radius, initial_trust_radius, step_threshold,
+#     shrink_threshold, expand_threshold, shrink_factor,
+#     expand_factor, max_shrink_times)
+# for options in list_of_options
+#     local probN, sol, alg
+#     alg = TrustRegion(max_trust_radius = options[1],
+#         initial_trust_radius = options[2],
+#         step_threshold = options[3],
+#         shrink_threshold = options[4],
+#         expand_threshold = options[5],
+#         shrink_factor = options[6],
+#         expand_factor = options[7],
+#         max_shrink_times = options[8])
+
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, alg, abstol = 1e-10)
+#     @test all(abs.(f(u, p)) .< 1e-10)
+# end
+
+# # Testing consistency of iip vs oop iterations
+
+# maxiterations = [2, 3, 4, 5]
+# u0 = [1.0, 1.0]
+# function iip_oop(f, fip, u0, radius_update_scheme, maxiters)
+#     prob_iip = NonlinearProblem{true}(fip, u0)
+#     solver = init(prob_iip, TrustRegion(radius_update_scheme = radius_update_scheme),
+#         abstol = 1e-9, maxiters = maxiters)
+#     sol_iip = solve!(solver)
+
+#     prob_oop = NonlinearProblem{false}(f, u0)
+#     solver = init(prob_oop, TrustRegion(radius_update_scheme = radius_update_scheme),
+#         abstol = 1e-9, maxiters = maxiters)
+#     sol_oop = solve!(solver)
+
+#     return sol_iip.u[end], sol_oop.u[end]
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Simple, maxiters)
+#     @test iip == oop
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Hei, maxiters)
+#     @test iip == oop
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Yuan, maxiters)
+#     @test iip == oop
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Fan, maxiters)
+#     @test iip == oop
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Bastin, maxiters)
+#     @test iip == oop
+# end
+
+# # --- LevenbergMarquardt tests ---
+
+# function benchmark_immutable(f, u0)
+#     probN = NonlinearProblem{false}(f, u0)
+#     solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
+#     sol = solve!(solver)
+# end
+
+# function benchmark_mutable(f, u0)
+#     probN = NonlinearProblem{false}(f, u0)
+#     solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
+#     sol = solve!(solver)
+# end
+
+# function benchmark_scalar(f, u0)
+#     probN = NonlinearProblem{false}(f, u0)
+#     sol = (solve(probN, LevenbergMarquardt(), abstol = 1e-9))
+# end
+
+# function ff(u, p)
+#     u .* u .- 2
+# end
+
+# function sf(u, p)
+#     u * u - 2
+# end
+# u0 = [1.0, 1.0]
+
+# sol = benchmark_immutable(ff, cu0)
+# @test SciMLBase.successful_retcode(sol)
+# @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+# sol = benchmark_mutable(ff, u0)
+# @test SciMLBase.successful_retcode(sol)
+# @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+# sol = benchmark_scalar(sf, csu0)
+# @test SciMLBase.successful_retcode(sol)
+# @test abs(sol.u * sol.u - 2) < 1e-9
+
+# function benchmark_inplace(f, u0)
+#     probN = NonlinearProblem{true}(f, u0)
+#     solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
+#     sol = solve!(solver)
+# end
+
+# function ffiip(du, u, p)
+#     du .= u .* u .- 2
+# end
+# u0 = [1.0, 1.0]
+
+# sol = benchmark_inplace(ffiip, u0)
+# @test SciMLBase.successful_retcode(sol)
+# @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+
+# u0 = [1.0, 1.0]
+# probN = NonlinearProblem{true}(ffiip, u0)
+# solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
+# @test (@ballocated solve!(solver)) < 120
+
+# # AD Tests
+# using ForwardDiff
+
+# # Immutable
+# f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, LevenbergMarquardt(), abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# # Scalar
+# f, u0 = (u, p) -> u * u - p, 1.0
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# f = (u, p) -> p[1] * u * u - p[2]
+# t = (p) -> [sqrt(p[2] / p[1])]
+# p = [0.9, 50.0]
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, LevenbergMarquardt())
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# # Error Checks
+# f, u0 = (u, p) -> u .* u .- 2.0, @SVector[1.0, 1.0]
+# probN = NonlinearProblem(f, u0)
+
+# @test solve(probN, LevenbergMarquardt()).u[end] ≈ sqrt(2.0)
+# @test solve(probN, LevenbergMarquardt(; autodiff = false)).u[end] ≈ sqrt(2.0)
+
+# for u0 in [1.0, [1, 1.0]]
+#     local f, probN, sol
+#     f = (u, p) -> u .* u .- 2.0
+#     probN = NonlinearProblem(f, u0)
+#     sol = sqrt(2) * u0
+
+#     @test solve(probN, LevenbergMarquardt()).u ≈ sol
+#     @test solve(probN, LevenbergMarquardt()).u ≈ sol
+#     @test solve(probN, LevenbergMarquardt(; autodiff = false)).u ≈ sol
+# end
+
+# # Test that `LevenbergMarquardt` passes a test that `NewtonRaphson` fails on.
+# u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
+# global g, f
+# f = (u, p) -> 0.010000000000000002 .+
+#               10.000000000000002 ./ (1 .+
+#                (0.21640425613334457 .+
+#                 216.40425613334457 ./ (1 .+
+#                  (0.21640425613334457 .+
+#                   216.40425613334457 ./
+#                   (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
+#               0.0011552453009332421u .- p
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
+#     return sol.u
+# end
+# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+# u = g(p)
+# f(u, p)
+# @test all(abs.(f(u, p)) .< 1e-10)
+
+# # # Test kwars in `LevenbergMarquardt`
+# damping_initial = [0.5, 2.0, 5.0]
+# damping_increase_factor = [1.5, 3.0, 10.0]
+# damping_decrease_factor = [2, 5, 10]
+# finite_diff_step_geodesic = [0.02, 0.2, 0.3]
+# α_geodesic = [0.6, 0.8, 0.9]
+# b_uphill = [0, 1, 2]
+# min_damping_D = [1e-12, 1e-9, 1e-4]
+
+# list_of_options = zip(damping_initial, damping_increase_factor, damping_decrease_factor,
+#     finite_diff_step_geodesic, α_geodesic, b_uphill,
+#     min_damping_D)
+# for options in list_of_options
+#     local probN, sol, alg
+#     alg = LevenbergMarquardt(damping_initial = options[1],
+#         damping_increase_factor = options[2],
+#         damping_decrease_factor = options[3],
+#         finite_diff_step_geodesic = options[4],
+#         α_geodesic = options[5],
+#         b_uphill = options[6],
+#         min_damping_D = options[7])
+
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, alg, abstol = 1e-10)
+#     @test all(abs.(f(u, p)) .< 1e-10)
+# end
diff --git a/test/runtests.jl b/test/runtests.jl
index a84fc3cb1..f8cf35db3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -14,11 +14,11 @@ end
 @time begin
     if GROUP == "All" || GROUP == "Core"
         @time @safetestset "Basic Tests + Some AD" include("basictests.jl")
-        @time @safetestset "Sparsity Tests" include("sparse.jl")
+        # @time @safetestset "Sparsity Tests" include("sparse.jl")
     end
 
-    if GROUP == "GPU"
-        activate_downstream_env()
-        @time @safetestset "GPU Tests" include("gpu.jl")
-    end
+    # if GROUP == "GPU"
+    #     activate_downstream_env()
+    #     @time @safetestset "GPU Tests" include("gpu.jl")
+    # end
 end

From 5963ec9b7d7764f817746e93c4bb4f0ba527de37 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 11 Sep 2023 11:57:42 -0400
Subject: [PATCH 04/19] Finish TrustRegion tests

---
 test/basictests.jl | 506 +++++++++++++--------------------------------
 1 file changed, 139 insertions(+), 367 deletions(-)

diff --git a/test/basictests.jl b/test/basictests.jl
index ee42db9f3..b7dc05ae2 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -8,14 +8,12 @@ _nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
 @testset "NewtonRaphson" begin
     function benchmark_nlsolve_oop(f, u0, p = 2.0)
         prob = NonlinearProblem{false}(f, u0, p)
-        cache = init(prob, NewtonRaphson(), abstol = 1e-9)
-        return solve!(cache)
+        return solve(prob, NewtonRaphson(), abstol = 1e-9)
     end
 
     function benchmark_nlsolve_iip(f, u0, p = 2.0; linsolve, precs)
         prob = NonlinearProblem{true}(f, u0, p)
-        cache = init(prob, NewtonRaphson(; linsolve, precs), abstol = 1e-9)
-        return solve!(cache)
+        return solve(prob, NewtonRaphson(; linsolve, precs), abstol = 1e-9)
     end
 
     quadratic_f(u, p) = u .* u .- p
@@ -47,14 +45,15 @@ _nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
         @test (@ballocated solve!($cache)) ≤ 64
     end
 
-    # Immutable
+    # FIXME: The ForwardDiff derivative check below fails for immutable (SVector) u0.
+    #        The old version of this test only passed by accident, due to a typo in it.
     @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
         @test begin
             res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
             res_true = sqrt(p)
             all(res.u .≈ res_true)
         end
-        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+        @test_broken ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
             @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
     end
 
@@ -93,25 +92,25 @@ _nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
 
     probN = NonlinearProblem(quadratic_f, @SVector[1.0, 1.0], 2.0)
     @testset "ADType: $(autodiff) u0: $(u0)" for autodiff in (false, true,
-        AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(),
-        AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0], @SVector[1.0, 1.0])
+            AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),
+            AutoSparseZygote(),
+            AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0], @SVector[1.0, 1.0])
         probN = NonlinearProblem(quadratic_f, u0, 2.0)
         @test all(solve(probN, NewtonRaphson(; autodiff)).u .≈ sqrt(2.0))
     end
 end
 
 # --- TrustRegion tests ---
+
 @testset "TrustRegion" begin
-    function benchmark_nlsolve_oop(f, u0, p = 2.0; radius_update_scheme)
+    function benchmark_nlsolve_oop(f, u0, p = 2.0; radius_update_scheme, kwargs...)
         prob = NonlinearProblem{false}(f, u0, p)
-        cache = init(prob, TrustRegion(; radius_update_scheme), abstol = 1e-9)
-        return solve!(cache)
+        return solve(prob, TrustRegion(; radius_update_scheme); abstol = 1e-9, kwargs...)
     end
 
-    function benchmark_nlsolve_iip(f, u0, p = 2.0; radius_update_scheme)
+    function benchmark_nlsolve_iip(f, u0, p = 2.0; radius_update_scheme, kwargs...)
         prob = NonlinearProblem{true}(f, u0, p)
-        cache = init(prob, TrustRegion(; radius_update_scheme), abstol = 1e-9)
-        return solve!(cache)
+        return solve(prob, TrustRegion(; radius_update_scheme); abstol = 1e-9, kwargs...)
     end
 
     quadratic_f(u, p) = u .* u .- p
@@ -120,7 +119,8 @@ end
     radius_update_schemes = [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Hei,
         RadiusUpdateSchemes.Yuan, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
 
-    @testset "[OOP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([1.0, 1.0], @SVector[1.0, 1.0], 1.0), radius_update_scheme in radius_update_schemes
+    @testset "[OOP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([
+                1.0, 1.0], @SVector[1.0, 1.0], 1.0), radius_update_scheme in radius_update_schemes
         sol = benchmark_nlsolve_oop(quadratic_f, u0; radius_update_scheme)
         @test SciMLBase.successful_retcode(sol)
         @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
@@ -130,7 +130,8 @@ end
         @test (@ballocated solve!($cache)) < 200
     end
 
-    @testset "[IIP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([1.0, 1.0],), radius_update_scheme in radius_update_schemes
+    @testset "[IIP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([
+            1.0, 1.0],), radius_update_scheme in radius_update_schemes
         sol = benchmark_nlsolve_iip(quadratic_f!, u0; radius_update_scheme)
         @test SciMLBase.successful_retcode(sol)
         @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
@@ -139,371 +140,142 @@ end
             TrustRegion(; radius_update_scheme); abstol = 1e-9)
         @test (@ballocated solve!($cache)) ≤ 64
     end
-end
-
-
-# # Immutable
-# f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, csu0, p)
-#     sol = solve(probN, TrustRegion(), abstol = 1e-9)
-#     return sol.u[end]
-# end
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, csu0, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
-#         abstol = 1e-9)
-#     return sol.u[end]
-# end
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, csu0, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
-#         abstol = 1e-9)
-#     return sol.u[end]
-# end
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, csu0, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-#         abstol = 1e-9)
-#     return sol.u[end]
-# end
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, csu0, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-#         abstol = 1e-9)
-#     return sol.u[end]
-# end
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# # Scalar
-# f, u0 = (u, p) -> u * u - p, 1.0
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-#     sol = solve(probN, TrustRegion(), abstol = 1e-10)
-#     return sol.u
-# end
-
-# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
-#         abstol = 1e-10)
-#     return sol.u
-# end
-
-# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
-#         abstol = 1e-10)
-#     return sol.u
-# end
-
-# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-#         abstol = 1e-10)
-#     return sol.u
-# end
-
-# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-#         abstol = 1e-10)
-#     return sol.u
-# end
-
-# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# f = (u, p) -> p[1] * u * u - p[2]
-# t = (p) -> [sqrt(p[2] / p[1])]
-# p = [0.9, 50.0]
-# gnewton = function (p)
-#     probN = NonlinearProblem{false}(f, 0.5, p)
-#     sol = solve(probN, TrustRegion())
-#     return [sol.u]
-# end
-# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# gnewton = function (p)
-#     probN = NonlinearProblem{false}(f, 0.5, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))
-#     return [sol.u]
-# end
-# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# gnewton = function (p)
-#     probN = NonlinearProblem{false}(f, 0.5, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan))
-#     return [sol.u]
-# end
-# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# gnewton = function (p)
-#     probN = NonlinearProblem{false}(f, 0.5, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan))
-#     return [sol.u]
-# end
-# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# gnewton = function (p)
-#     probN = NonlinearProblem{false}(f, 0.5, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin))
-#     return [sol.u]
-# end
-# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# # Iterator interface
-# f = (u, p) -> u * u - p
-# g = function (p_range)
-#     probN = NonlinearProblem{false}(f, 0.5, p_range[begin])
-#     cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
-#     sols = zeros(length(p_range))
-#     for (i, p) in enumerate(p_range)
-#         reinit!(cache, cache.u; p = p)
-#         sol = solve!(cache)
-#         sols[i] = sol.u
-#     end
-#     return sols
-# end
-# p = range(0.01, 2, length = 200)
-# @test g(p) ≈ sqrt.(p)
-
-# f = (res, u, p) -> (res[begin] = u[1] * u[1] - p)
-# g = function (p_range)
-#     probN = NonlinearProblem{true}(f, [0.5], p_range[begin])
-#     cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
-#     sols = zeros(length(p_range))
-#     for (i, p) in enumerate(p_range)
-#         reinit!(cache, [cache.u[1]]; p = p)
-#         sol = solve!(cache)
-#         sols[i] = sol.u[1]
-#     end
-#     return sols
-# end
-# p = range(0.01, 2, length = 200)
-# @test g(p) ≈ sqrt.(p)
-
-# # Error Checks
-# f, u0 = (u, p) -> u .* u .- 2, @SVector[1.0, 1.0]
-# probN = NonlinearProblem(f, u0)
-
-# @test solve(probN, TrustRegion()).u[end] ≈ sqrt(2.0)
-# @test solve(probN, TrustRegion(; autodiff = false)).u[end] ≈ sqrt(2.0)
 
-# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei)).u[end] ≈
-#       sqrt(2.0)
-# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Hei, autodiff = false)).u[end] ≈
-#       sqrt(2.0)
+    # FIXME: The ForwardDiff derivative check below fails for immutable (SVector) u0.
+    #        The old version of this test only passed by accident, due to a typo in it.
+    @testset "[OOP] [Immutable AD] radius_update_scheme: $(radius_update_scheme) p: $(p)" for radius_update_scheme in radius_update_schemes,
+        p in 1.0:0.1:100.0
 
-# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan)).u[end] ≈
-#       sqrt(2.0)
-# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Yuan, autodiff = false)).u[end] ≈
-#       sqrt(2.0)
-
-# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan)).u[end] ≈
-#       sqrt(2.0)
-# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Fan, autodiff = false)).u[end] ≈
-#       sqrt(2.0)
-
-# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin)).u[end] ≈
-#       sqrt(2.0)
-# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Bastin, autodiff = false)).u[end] ≈
-#       sqrt(2.0)
-
-# for u0 in [1.0, [1, 1.0]]
-#     local f, probN, sol
-#     f = (u, p) -> u .* u .- 2.0
-#     probN = NonlinearProblem(f, u0)
-#     sol = sqrt(2) * u0
-
-#     @test solve(probN, TrustRegion()).u ≈ sol
-#     @test solve(probN, TrustRegion()).u ≈ sol
-#     @test solve(probN, TrustRegion(; autodiff = false)).u ≈ sol
-# end
-
-# # Test that `TrustRegion` passes a test that `NewtonRaphson` fails on.
-# u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
-# global g, f
-# f = (u, p) -> 0.010000000000000002 .+
-#               10.000000000000002 ./ (1 .+
-#                (0.21640425613334457 .+
-#                 216.40425613334457 ./ (1 .+
-#                  (0.21640425613334457 .+
-#                   216.40425613334457 ./
-#                   (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
-#               0.0011552453009332421u .- p
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, u0, p)
-#     sol = solve(probN, TrustRegion(), abstol = 1e-10)
-#     return sol.u
-# end
-# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-# u = g(p)
-# f(u, p)
-# @test all(abs.(f(u, p)) .< 1e-10)
+        @test begin
+            res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p;
+                radius_update_scheme)
+            res_true = sqrt(p)
+            all(res.u .≈ res_true)
+        end
+        @test_broken ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+            @SVector[1.0, 1.0], p; radius_update_scheme).u[end], p) ≈ 1 / (2 * sqrt(p))
+    end
 
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, u0, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-#         abstol = 1e-10)
-#     return sol.u
-# end
-# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-# u = g(p)
-# f(u, p)
-# @test all(abs.(f(u, p)) .< 1e-10)
+    @testset "[OOP] [Scalar AD] radius_update_scheme: $(radius_update_scheme)  p: $(p)" for radius_update_scheme in radius_update_schemes,
+        p in 1.0:0.1:100.0
 
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, u0, p)
-#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-#         abstol = 1e-10)
-#     return sol.u
-# end
-# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-# u = g(p)
-# f(u, p)
-# @test all(abs.(f(u, p)) .< 1e-10)
+        @test begin
+            res = benchmark_nlsolve_oop(quadratic_f, oftype(p, 1.0), p;
+                radius_update_scheme)
+            res_true = sqrt(p)
+            res.u ≈ res_true
+        end
+        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, oftype(p, 1.0),
+                p; radius_update_scheme).u, p) ≈ 1 / (2 * sqrt(p))
+    end
 
-# # Test kwars in `TrustRegion`
-# max_trust_radius = [10.0, 100.0, 1000.0]
-# initial_trust_radius = [10.0, 1.0, 0.1]
-# step_threshold = [0.0, 0.01, 0.25]
-# shrink_threshold = [0.25, 0.3, 0.5]
-# expand_threshold = [0.5, 0.8, 0.9]
-# shrink_factor = [0.1, 0.3, 0.5]
-# expand_factor = [1.5, 2.0, 3.0]
-# max_shrink_times = [10, 20, 30]
-
-# list_of_options = zip(max_trust_radius, initial_trust_radius, step_threshold,
-#     shrink_threshold, expand_threshold, shrink_factor,
-#     expand_factor, max_shrink_times)
-# for options in list_of_options
-#     local probN, sol, alg
-#     alg = TrustRegion(max_trust_radius = options[1],
-#         initial_trust_radius = options[2],
-#         step_threshold = options[3],
-#         shrink_threshold = options[4],
-#         expand_threshold = options[5],
-#         shrink_factor = options[6],
-#         expand_factor = options[7],
-#         max_shrink_times = options[8])
+    quadratic_f2(u, p) = @. p[1] * u * u - p[2]
+    t = (p) -> [sqrt(p[2] / p[1])]
+    p = [0.9, 50.0]
+    @testset "[OOP] [Jacobian] radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in radius_update_schemes
+        @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p; radius_update_scheme).u ≈
+              sqrt(p[2] / p[1])
+        @test ForwardDiff.jacobian(p -> [
+                benchmark_nlsolve_oop(quadratic_f2, 0.5, p;
+                    radius_update_scheme).u,
+            ], p) ≈ ForwardDiff.jacobian(t, p)
+    end
 
-#     probN = NonlinearProblem{false}(f, u0, p)
-#     sol = solve(probN, alg, abstol = 1e-10)
-#     @test all(abs.(f(u, p)) .< 1e-10)
-# end
+    # Iterator interface
+    function nlprob_iterator_interface(f, p_range, ::Val{iip}) where {iip}
+        probN = NonlinearProblem{iip}(f, iip ? [0.5] : 0.5, p_range[begin])
+        cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
+        sols = zeros(length(p_range))
+        for (i, p) in enumerate(p_range)
+            reinit!(cache, iip ? [cache.u[1]] : cache.u; p = p)
+            sol = solve!(cache)
+            sols[i] = iip ? sol.u[1] : sol.u
+        end
+        return sols
+    end
+    p = range(0.01, 2, length = 200)
+    @test nlprob_iterator_interface(quadratic_f, p, Val(false)) ≈ sqrt.(p)
+    @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p)
 
-# # Testing consistency of iip vs oop iterations
+    probN = NonlinearProblem(quadratic_f, @SVector[1.0, 1.0], 2.0)
+    @testset "ADType: $(autodiff) u0: $(u0) radius_update_scheme: $(radius_update_scheme)" for autodiff in (false,
+            true, AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),
+            AutoSparseZygote(), AutoSparseEnzyme()),
+        u0 in (1.0, [1.0, 1.0], @SVector[1.0, 1.0]),
+        radius_update_scheme in radius_update_schemes
 
-# maxiterations = [2, 3, 4, 5]
-# u0 = [1.0, 1.0]
-# function iip_oop(f, fip, u0, radius_update_scheme, maxiters)
-#     prob_iip = NonlinearProblem{true}(fip, u0)
-#     solver = init(prob_iip, TrustRegion(radius_update_scheme = radius_update_scheme),
-#         abstol = 1e-9, maxiters = maxiters)
-#     sol_iip = solve!(solver)
-
-#     prob_oop = NonlinearProblem{false}(f, u0)
-#     solver = init(prob_oop, TrustRegion(radius_update_scheme = radius_update_scheme),
-#         abstol = 1e-9, maxiters = maxiters)
-#     sol_oop = solve!(solver)
-
-#     return sol_iip.u[end], sol_oop.u[end]
-# end
+        probN = NonlinearProblem(quadratic_f, u0, 2.0)
+        @test all(solve(probN, NewtonRaphson(; autodiff)).u .≈ sqrt(2.0))
+    end
 
-# for maxiters in maxiterations
-#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Simple, maxiters)
-#     @test iip == oop
-# end
+    # Test that `TrustRegion` passes a test that `NewtonRaphson` fails on.
+    function newton_fails(u, p)
+        return 0.010000000000000002 .+
+               10.000000000000002 ./ (1 .+
+                (0.21640425613334457 .+
+                 216.40425613334457 ./ (1 .+
+                  (0.21640425613334457 .+
+                   216.40425613334457 ./
+                   (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
+               0.0011552453009332421u .- p
+    end
 
-# for maxiters in maxiterations
-#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Hei, maxiters)
-#     @test iip == oop
-# end
+    @testset "Newton Raphson Fails: radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in [
+        RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
+        u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
+        p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+        sol = benchmark_nlsolve_oop(newton_fails, u0, p; radius_update_scheme)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(newton_fails(sol.u, p)) .< 1e-9)
+    end
 
-# for maxiters in maxiterations
-#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Yuan, maxiters)
-#     @test iip == oop
-# end
+    # Test kwargs in `TrustRegion`
+    @testset "Keyword Arguments" begin
+        max_trust_radius = [10.0, 100.0, 1000.0]
+        initial_trust_radius = [10.0, 1.0, 0.1]
+        step_threshold = [0.0, 0.01, 0.25]
+        shrink_threshold = [0.25, 0.3, 0.5]
+        expand_threshold = [0.5, 0.8, 0.9]
+        shrink_factor = [0.1, 0.3, 0.5]
+        expand_factor = [1.5, 2.0, 3.0]
+        max_shrink_times = [10, 20, 30]
+
+        list_of_options = zip(max_trust_radius, initial_trust_radius, step_threshold,
+            shrink_threshold, expand_threshold, shrink_factor,
+            expand_factor, max_shrink_times)
+        for options in list_of_options
+            local probN, sol, alg
+            alg = TrustRegion(max_trust_radius = options[1],
+                initial_trust_radius = options[2], step_threshold = options[3],
+                shrink_threshold = options[4], expand_threshold = options[5],
+                shrink_factor = options[6], expand_factor = options[7],
+                max_shrink_times = options[8])
+
+            probN = NonlinearProblem{false}(f, u0, p)
+            sol = solve(probN, alg, abstol = 1e-10)
+            @test all(abs.(f(u, p)) .< 1e-10)
+        end
+    end
 
-# for maxiters in maxiterations
-#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Fan, maxiters)
-#     @test iip == oop
-# end
+    # Testing consistency of iip vs oop iterations
+    @testset "OOP / IIP Consistency" begin
+        maxiterations = [2, 3, 4, 5]
+        u0 = [1.0, 1.0]
+        @testset "radius_update_scheme: $(radius_update_scheme) maxiters: $(maxiters)" for radius_update_scheme in radius_update_schemes,
+            maxiters in maxiterations
+
+            sol_iip = benchmark_nlsolve_iip(quadratic_f!, u0; radius_update_scheme,
+                maxiters)
+            sol_oop = benchmark_nlsolve_oop(quadratic_f, u0; radius_update_scheme,
+                maxiters)
+            @test sol_iip.u ≈ sol_oop.u  # compare IIP against OOP, not against itself
+        end
+    end
+end
 
-# for maxiters in maxiterations
-#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Bastin, maxiters)
-#     @test iip == oop
-# end
+# --- LevenbergMarquardt tests ---
 
-# # --- LevenbergMarquardt tests ---
+@testset "LevenbergMarquardt" begin end
 
 # function benchmark_immutable(f, u0)
 #     probN = NonlinearProblem{false}(f, u0)

From eb3a6ffdd2eedee09fb0605f95afcbd92c638fec Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 11 Sep 2023 13:55:38 -0400
Subject: [PATCH 05/19] Finalize tests

---
 src/ad.jl                |   1 +
 test/basictests.jl       | 311 +++++++++++++++------------------------
 test/convergencetests.jl |  40 -----
 test/runtests.jl         |  10 +-
 test/sparse.jl           |  14 +-
 5 files changed, 138 insertions(+), 238 deletions(-)
 delete mode 100644 test/convergencetests.jl

diff --git a/src/ad.jl b/src/ad.jl
index faa8c9f04..15e5af285 100644
--- a/src/ad.jl
+++ b/src/ad.jl
@@ -30,6 +30,7 @@ function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector}, iip,
     return SciMLBase.build_solution(prob, alg, Dual{T, V, P}(sol.u, partials), sol.resid;
         sol.retcode)
 end
+
 function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector}, iip,
         <:AbstractArray{<:Dual{T, V, P}}}, alg::AbstractNewtonAlgorithm, args...;
     kwargs...) where {iip, T, V, P}
diff --git a/test/basictests.jl b/test/basictests.jl
index b7dc05ae2..3af807479 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -3,6 +3,21 @@ using BenchmarkTools, LinearSolve, NonlinearSolve, StaticArrays, Random, LinearA
 
 _nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
 
+quadratic_f(u, p) = u .* u .- p
+quadratic_f!(du, u, p) = (du .= u .* u .- p)
+quadratic_f2(u, p) = @. p[1] * u * u - p[2]
+
+function newton_fails(u, p)
+    return 0.010000000000000002 .+
+           10.000000000000002 ./ (1 .+
+            (0.21640425613334457 .+
+             216.40425613334457 ./ (1 .+
+              (0.21640425613334457 .+
+               216.40425613334457 ./
+               (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
+           0.0011552453009332421u .- p
+end
+
 # --- NewtonRaphson tests ---
 
 @testset "NewtonRaphson" begin
@@ -16,9 +31,6 @@ _nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
         return solve(prob, NewtonRaphson(; linsolve, precs), abstol = 1e-9)
     end
 
-    quadratic_f(u, p) = u .* u .- p
-    quadratic_f!(du, u, p) = (du .= u .* u .- p)
-
     @testset "[OOP] u0: $(typeof(u0))" for u0 in ([1.0, 1.0], @SVector[1.0, 1.0], 1.0)
         sol = benchmark_nlsolve_oop(quadratic_f, u0)
         @test SciMLBase.successful_retcode(sol)
@@ -40,7 +52,7 @@ _nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
         @test SciMLBase.successful_retcode(sol)
         @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0),
+        cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0),
             NewtonRaphson(; linsolve, precs = prec), abstol = 1e-9)
         @test (@ballocated solve!($cache)) ≤ 64
     end
@@ -67,7 +79,6 @@ _nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
               1 / (2 * sqrt(p))
     end
 
-    quadratic_f2(u, p) = @. p[1] * u * u - p[2]
     t = (p) -> [sqrt(p[2] / p[1])]
     p = [0.9, 50.0]
     @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
@@ -113,9 +124,6 @@ end
         return solve(prob, TrustRegion(; radius_update_scheme); abstol = 1e-9, kwargs...)
     end
 
-    quadratic_f(u, p) = u .* u .- p
-    quadratic_f!(du, u, p) = (du .= u .* u .- p)
-
     radius_update_schemes = [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Hei,
         RadiusUpdateSchemes.Yuan, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
 
@@ -169,7 +177,6 @@ end
                 p; radius_update_scheme).u, p) ≈ 1 / (2 * sqrt(p))
     end
 
-    quadratic_f2(u, p) = @. p[1] * u * u - p[2]
     t = (p) -> [sqrt(p[2] / p[1])]
     p = [0.9, 50.0]
     @testset "[OOP] [Jacobian] radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in radius_update_schemes
@@ -209,17 +216,6 @@ end
     end
 
     # Test that `TrustRegion` passes a test that `NewtonRaphson` fails on.
-    function newton_fails(u, p)
-        return 0.010000000000000002 .+
-               10.000000000000002 ./ (1 .+
-                (0.21640425613334457 .+
-                 216.40425613334457 ./ (1 .+
-                  (0.21640425613334457 .+
-                   216.40425613334457 ./
-                   (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
-               0.0011552453009332421u .- p
-    end
-
     @testset "Newton Raphson Fails: radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in [
         RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
         u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
@@ -251,9 +247,9 @@ end
                 shrink_factor = options[6], expand_factor = options[7],
                 max_shrink_times = options[8])
 
-            probN = NonlinearProblem{false}(f, u0, p)
+            probN = NonlinearProblem{false}(quadratic_f, [1.0, 1.0], 2.0)
             sol = solve(probN, alg, abstol = 1e-10)
-            @test all(abs.(f(u, p)) .< 1e-10)
+            @test all(abs.(quadratic_f(sol.u, 2.0)) .< 1e-10)
         end
     end
 
@@ -275,170 +271,107 @@ end
 
 # --- LevenbergMarquardt tests ---
 
-@testset "LevenbergMarquardt" begin end
-
-# function benchmark_immutable(f, u0)
-#     probN = NonlinearProblem{false}(f, u0)
-#     solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-#     sol = solve!(solver)
-# end
-
-# function benchmark_mutable(f, u0)
-#     probN = NonlinearProblem{false}(f, u0)
-#     solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-#     sol = solve!(solver)
-# end
-
-# function benchmark_scalar(f, u0)
-#     probN = NonlinearProblem{false}(f, u0)
-#     sol = (solve(probN, LevenbergMarquardt(), abstol = 1e-9))
-# end
-
-# function ff(u, p)
-#     u .* u .- 2
-# end
-
-# function sf(u, p)
-#     u * u - 2
-# end
-# u0 = [1.0, 1.0]
-
-# sol = benchmark_immutable(ff, cu0)
-# @test SciMLBase.successful_retcode(sol)
-# @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-# sol = benchmark_mutable(ff, u0)
-# @test SciMLBase.successful_retcode(sol)
-# @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-# sol = benchmark_scalar(sf, csu0)
-# @test SciMLBase.successful_retcode(sol)
-# @test abs(sol.u * sol.u - 2) < 1e-9
-
-# function benchmark_inplace(f, u0)
-#     probN = NonlinearProblem{true}(f, u0)
-#     solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-#     sol = solve!(solver)
-# end
-
-# function ffiip(du, u, p)
-#     du .= u .* u .- 2
-# end
-# u0 = [1.0, 1.0]
-
-# sol = benchmark_inplace(ffiip, u0)
-# @test SciMLBase.successful_retcode(sol)
-# @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-
-# u0 = [1.0, 1.0]
-# probN = NonlinearProblem{true}(ffiip, u0)
-# solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-# @test (@ballocated solve!(solver)) < 120
-
-# # AD Tests
-# using ForwardDiff
-
-# # Immutable
-# f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, csu0, p)
-#     sol = solve(probN, LevenbergMarquardt(), abstol = 1e-9)
-#     return sol.u[end]
-# end
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# # Scalar
-# f, u0 = (u, p) -> u * u - p, 1.0
-
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-#     sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
-#     return sol.u
-# end
-
-# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-# for p in 1.1:0.1:100.0
-#     @test g(p) ≈ sqrt(p)
-#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-# end
-
-# f = (u, p) -> p[1] * u * u - p[2]
-# t = (p) -> [sqrt(p[2] / p[1])]
-# p = [0.9, 50.0]
-# gnewton = function (p)
-#     probN = NonlinearProblem{false}(f, 0.5, p)
-#     sol = solve(probN, LevenbergMarquardt())
-#     return [sol.u]
-# end
-# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# # Error Checks
-# f, u0 = (u, p) -> u .* u .- 2.0, @SVector[1.0, 1.0]
-# probN = NonlinearProblem(f, u0)
-
-# @test solve(probN, LevenbergMarquardt()).u[end] ≈ sqrt(2.0)
-# @test solve(probN, LevenbergMarquardt(; autodiff = false)).u[end] ≈ sqrt(2.0)
-
-# for u0 in [1.0, [1, 1.0]]
-#     local f, probN, sol
-#     f = (u, p) -> u .* u .- 2.0
-#     probN = NonlinearProblem(f, u0)
-#     sol = sqrt(2) * u0
-
-#     @test solve(probN, LevenbergMarquardt()).u ≈ sol
-#     @test solve(probN, LevenbergMarquardt()).u ≈ sol
-#     @test solve(probN, LevenbergMarquardt(; autodiff = false)).u ≈ sol
-# end
-
-# # Test that `LevenbergMarquardt` passes a test that `NewtonRaphson` fails on.
-# u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
-# global g, f
-# f = (u, p) -> 0.010000000000000002 .+
-#               10.000000000000002 ./ (1 .+
-#                (0.21640425613334457 .+
-#                 216.40425613334457 ./ (1 .+
-#                  (0.21640425613334457 .+
-#                   216.40425613334457 ./
-#                   (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
-#               0.0011552453009332421u .- p
-# g = function (p)
-#     probN = NonlinearProblem{false}(f, u0, p)
-#     sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
-#     return sol.u
-# end
-# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-# u = g(p)
-# f(u, p)
-# @test all(abs.(f(u, p)) .< 1e-10)
-
-# # # Test kwars in `LevenbergMarquardt`
-# damping_initial = [0.5, 2.0, 5.0]
-# damping_increase_factor = [1.5, 3.0, 10.0]
-# damping_decrease_factor = [2, 5, 10]
-# finite_diff_step_geodesic = [0.02, 0.2, 0.3]
-# α_geodesic = [0.6, 0.8, 0.9]
-# b_uphill = [0, 1, 2]
-# min_damping_D = [1e-12, 1e-9, 1e-4]
-
-# list_of_options = zip(damping_initial, damping_increase_factor, damping_decrease_factor,
-#     finite_diff_step_geodesic, α_geodesic, b_uphill,
-#     min_damping_D)
-# for options in list_of_options
-#     local probN, sol, alg
-#     alg = LevenbergMarquardt(damping_initial = options[1],
-#         damping_increase_factor = options[2],
-#         damping_decrease_factor = options[3],
-#         finite_diff_step_geodesic = options[4],
-#         α_geodesic = options[5],
-#         b_uphill = options[6],
-#         min_damping_D = options[7])
-
-#     probN = NonlinearProblem{false}(f, u0, p)
-#     sol = solve(probN, alg, abstol = 1e-10)
-#     @test all(abs.(f(u, p)) .< 1e-10)
-# end
+@testset "LevenbergMarquardt" begin
+    function benchmark_nlsolve_oop(f, u0, p = 2.0)
+        prob = NonlinearProblem{false}(f, u0, p)
+        return solve(prob, LevenbergMarquardt(), abstol = 1e-9)
+    end
+
+    function benchmark_nlsolve_iip(f, u0, p = 2.0)
+        prob = NonlinearProblem{true}(f, u0, p)
+        return solve(prob, LevenbergMarquardt(), abstol = 1e-9)
+    end
+
+    @testset "[OOP] u0: $(typeof(u0))" for u0 in ([1.0, 1.0], @SVector[1.0, 1.0], 1.0)
+        sol = benchmark_nlsolve_oop(quadratic_f, u0)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+
+        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0), LevenbergMarquardt(),
+            abstol = 1e-9)
+        @test (@ballocated solve!($cache)) < 200
+    end
+
+    @testset "[IIP] u0: $(typeof(u0))" for u0 in ([1.0, 1.0],)
+        sol = benchmark_nlsolve_iip(quadratic_f!, u0)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+
+        cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0), LevenbergMarquardt(),
+            abstol = 1e-9)
+        @test (@ballocated solve!($cache)) ≤ 64
+    end
+
+    # FIXME: Even the previous tests were broken, but due to a typo in the tests they
+    #        accidentally passed
+    @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
+        @test begin
+            res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
+            res_true = sqrt(p)
+            all(res.u .≈ res_true)
+        end
+        @test_broken ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+            @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
+    end
+
+    @testset "[OOP] [Scalar AD] p: $(p)" for p in 1.0:0.1:100.0
+        @test begin
+            res = benchmark_nlsolve_oop(quadratic_f, 1.0, p)
+            res_true = sqrt(p)
+            res.u ≈ res_true
+        end
+        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, 1.0, p).u, p) ≈
+              1 / (2 * sqrt(p))
+    end
+
+    t = (p) -> [sqrt(p[2] / p[1])]
+    p = [0.9, 50.0]
+    @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
+    @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u], p) ≈
+          ForwardDiff.jacobian(t, p)
+
+    probN = NonlinearProblem(quadratic_f, @SVector[1.0, 1.0], 2.0)
+    @testset "ADType: $(autodiff) u0: $(u0)" for autodiff in (false, true,
+            AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),
+            AutoSparseZygote(),
+            AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0], @SVector[1.0, 1.0])
+        probN = NonlinearProblem(quadratic_f, u0, 2.0)
+        @test all(solve(probN, LevenbergMarquardt(; autodiff)).u .≈ sqrt(2.0))
+    end
+
+    # Test that `LevenbergMarquardt` passes a test that `NewtonRaphson` fails on.
+    @testset "Newton Raphson Fails" begin
+        u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
+        p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+        sol = benchmark_nlsolve_oop(newton_fails, u0, p)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(newton_fails(sol.u, p)) .< 1e-9)
+    end
+
+    # Test kwargs in `LevenbergMarquardt`
+    @testset "Keyword Arguments" begin
+        damping_initial = [0.5, 2.0, 5.0]
+        damping_increase_factor = [1.5, 3.0, 10.0]
+        damping_decrease_factor = Float64[2, 5, 10]
+        finite_diff_step_geodesic = [0.02, 0.2, 0.3]
+        α_geodesic = [0.6, 0.8, 0.9]
+        b_uphill = Float64[0, 1, 2]
+        min_damping_D = [1e-12, 1e-9, 1e-4]
+
+        list_of_options = zip(damping_initial, damping_increase_factor,
+            damping_decrease_factor, finite_diff_step_geodesic, α_geodesic, b_uphill,
+            min_damping_D)
+        for options in list_of_options
+            local probN, sol, alg
+            alg = LevenbergMarquardt(damping_initial = options[1],
+                    damping_increase_factor = options[2],
+                    damping_decrease_factor = options[3],
+                    finite_diff_step_geodesic = options[4], α_geodesic = options[5],
+                    b_uphill = options[6], min_damping_D = options[7])
+
+            probN = NonlinearProblem{false}(quadratic_f, [1.0, 1.0], 2.0)
+            sol = solve(probN, alg, abstol = 1e-10)
+            @test all(abs.(quadratic_f(sol.u, 2.0)) .< 1e-10)
+        end
+    end
+end
diff --git a/test/convergencetests.jl b/test/convergencetests.jl
deleted file mode 100644
index 751948522..000000000
--- a/test/convergencetests.jl
+++ /dev/null
@@ -1,40 +0,0 @@
-using NonlinearSolve
-using StaticArrays
-using BenchmarkTools
-using Test
-
-using SciMLNLSolve
-
-###-----Trust Region tests-----###
-
-# some simple functions #
-function f_oop(u, p)
-    u .* u .- p
-end
-
-function f_iip(du, u, p)
-    du .= u .* u .- p
-end
-
-function f_scalar(u, p)
-    u * u - p
-end
-
-u0 = [1.0, 1.0]
-csu0 = 1.0
-p = [2.0, 2.0]
-radius_update_scheme = RadiusUpdateSchemes.Simple
-tol = 1e-9
-
-function convergence_test_oop(f, u0, p, radius_update_scheme)
-    prob = NonlinearProblem{false}(f, oftype(p, u0), p)
-    cache = init(prob,
-        TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    sol = solve!(cache)
-    return cache.internalnorm(cache.u_prev - cache.u), cache.iter, sol.retcode
-end
-
-residual, iterations, return_code = convergence_test_oop(f_oop, u0, p, radius_update_scheme)
-@test return_code === ReturnCode.Success
-@test residual ≈ tol
diff --git a/test/runtests.jl b/test/runtests.jl
index f8cf35db3..a84fc3cb1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -14,11 +14,11 @@ end
 @time begin
     if GROUP == "All" || GROUP == "Core"
         @time @safetestset "Basic Tests + Some AD" include("basictests.jl")
-        # @time @safetestset "Sparsity Tests" include("sparse.jl")
+        @time @safetestset "Sparsity Tests" include("sparse.jl")
     end
 
-    # if GROUP == "GPU"
-    #     activate_downstream_env()
-    #     @time @safetestset "GPU Tests" include("gpu.jl")
-    # end
+    if GROUP == "GPU"
+        activate_downstream_env()
+        @time @safetestset "GPU Tests" include("gpu.jl")
+    end
 end
diff --git a/test/sparse.jl b/test/sparse.jl
index 1f4d07161..256ca7530 100644
--- a/test/sparse.jl
+++ b/test/sparse.jl
@@ -2,8 +2,10 @@ using NonlinearSolve, LinearAlgebra, SparseArrays, Symbolics
 
 const N = 32
 const xyd_brusselator = range(0, stop = 1, length = N)
+
 brusselator_f(x, y) = (((x - 0.3)^2 + (y - 0.6)^2) <= 0.1^2) * 5.0
 limit(a, N) = a == N + 1 ? 1 : a == 0 ? N : a
+
 function brusselator_2d_loop(du, u, p)
     A, B, alpha, dx = p
     alpha = alpha / dx^2
@@ -21,6 +23,7 @@ function brusselator_2d_loop(du, u, p)
                       A * u[i, j, 1] - u[i, j, 1]^2 * u[i, j, 2]
     end
 end
+
 p = (3.4, 1.0, 10.0, step(xyd_brusselator))
 
 function init_brusselator_2d(xyd)
@@ -32,8 +35,9 @@ function init_brusselator_2d(xyd)
         u[I, 1] = 22 * (y * (1 - y))^(3 / 2)
         u[I, 2] = 27 * (x * (1 - x))^(3 / 2)
     end
-    u
+    return u
 end
+
 u0 = init_brusselator_2d(xyd_brusselator)
 prob_brusselator_2d = NonlinearProblem(brusselator_2d_loop, u0, p)
 sol = solve(prob_brusselator_2d, NewtonRaphson())
@@ -47,12 +51,14 @@ fill!(jac_prototype, 0)
 
 ff = NonlinearFunction(brusselator_2d_loop; jac_prototype)
 prob_brusselator_2d = NonlinearProblem(ff, u0, p)
+
+# for autodiff in [false, ]
 sol = solve(prob_brusselator_2d, NewtonRaphson())
 @test norm(sol.resid) < 1e-8
 @test !all(iszero, jac_prototype)
 
-sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = false))
+sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = AutoSparseFiniteDiff()))
 @test norm(sol.resid) < 1e-6
 
-cache = init(prob_brusselator_2d, NewtonRaphson())
-@test maximum(cache.jac_config.colorvec) == 12
+cache = init(prob_brusselator_2d, NewtonRaphson(; autodiff = AutoSparseForwardDiff()));
+@test maximum(cache.jac_cache.coloring.colorvec) == 12

From 78beabebd046a0c24a1872734485669498b793c9 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 11 Sep 2023 14:02:59 -0400
Subject: [PATCH 06/19] Formatting

---
 src/NonlinearSolve.jl | 24 ++++++++++++------------
 test/basictests.jl    |  8 ++++----
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl
index 9fd4bb31d..2f851faa3 100644
--- a/src/NonlinearSolve.jl
+++ b/src/NonlinearSolve.jl
@@ -45,23 +45,23 @@ import PrecompileTools
 
 PrecompileTools.@compile_workload begin
     for T in (Float32, Float64)
-        # prob = NonlinearProblem{false}((u, p) -> u .* u .- p, T(0.1), T(2))
+        prob = NonlinearProblem{false}((u, p) -> u .* u .- p, T(0.1), T(2))
 
-        #         precompile_algs = if VERSION ≥ v"1.7"
-        #             (NewtonRaphson(), TrustRegion(), LevenbergMarquardt())
-        #         else
-        #             (NewtonRaphson(),)
-        #         end
+        precompile_algs = if VERSION ≥ v"1.7"
+            (NewtonRaphson(), TrustRegion(), LevenbergMarquardt())
+        else
+            (NewtonRaphson(),)
+        end
 
-        #         for alg in precompile_algs
-        #             solve(prob, alg, abstol = T(1e-2))
-        #         end
+        for alg in precompile_algs
+            solve(prob, alg, abstol = T(1e-2))
+        end
 
         prob = NonlinearProblem{true}((du, u, p) -> du[1] = u[1] * u[1] - p[1], T[0.1],
             T[2])
-        #         for alg in precompile_algs
-        #             solve(prob, alg, abstol = T(1e-2))
-        #         end
+        for alg in precompile_algs
+            solve(prob, alg, abstol = T(1e-2))
+        end
     end
 end
 
diff --git a/test/basictests.jl b/test/basictests.jl
index 3af807479..7b9de6b50 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -364,10 +364,10 @@ end
         for options in list_of_options
             local probN, sol, alg
             alg = LevenbergMarquardt(damping_initial = options[1],
-                    damping_increase_factor = options[2],
-                    damping_decrease_factor = options[3],
-                    finite_diff_step_geodesic = options[4], α_geodesic = options[5],
-                    b_uphill = options[6], min_damping_D = options[7])
+                damping_increase_factor = options[2],
+                damping_decrease_factor = options[3],
+                finite_diff_step_geodesic = options[4], α_geodesic = options[5],
+                b_uphill = options[6], min_damping_D = options[7])
 
             probN = NonlinearProblem{false}(quadratic_f, [1.0, 1.0], 2.0)
             sol = solve(probN, alg, abstol = 1e-10)

From dd66ce102432f84665eb4ccfa4337c9e187b7b44 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 11 Sep 2023 14:36:31 -0400
Subject: [PATCH 07/19] Make it a breaking change: autodiff args have different
 semantics

---
 Project.toml    | 2 +-
 src/jacobian.jl | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/Project.toml b/Project.toml
index 5033ab24a..6bb2fe223 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "NonlinearSolve"
 uuid = "8913a72c-1f9b-4ce2-8d82-65094dcecaec"
 authors = ["SciML"]
-version = "1.11.0"
+version = "2.0.0"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
diff --git a/src/jacobian.jl b/src/jacobian.jl
index 9c7f6e721..dc64d0e08 100644
--- a/src/jacobian.jl
+++ b/src/jacobian.jl
@@ -6,8 +6,6 @@ end
 (uf::JacobianWrapper)(u) = uf.f(u, uf.p)
 (uf::JacobianWrapper)(res, u) = uf.f(res, u, uf.p)
 
-# FIXME: This is a deviation from older versions. Previously if sparsity and colorvec were
-#        provided we would use a sparse AD. Right now it requires an explicit specification
 sparsity_detection_alg(f, ad) = NoSparsityDetection()
 function sparsity_detection_alg(f, ad::AbstractSparseADType)
     if f.sparsity === nothing

From 47135d427cda71f543071bf8289b5e0e1771ee25 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 11 Sep 2023 14:43:43 -0400
Subject: [PATCH 08/19] Update docs compat

---
 docs/Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Project.toml b/docs/Project.toml
index f6889de90..df765bb1d 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -14,7 +14,7 @@ Sundials = "c3572dad-4567-51f8-b174-8c6c989267f4"
 BenchmarkTools = "1"
 Documenter = "0.27"
 LinearSolve = "2"
-NonlinearSolve = "1"
+NonlinearSolve = "1, 2"
 NonlinearSolveMINPACK = "0.1"
 SciMLNLSolve = "0.1"
 SimpleNonlinearSolve = "0.1.5"

From 5e7bafca11c0fa7a33b5c47957a1f2ea386b31c7 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 11 Sep 2023 18:21:59 -0400
Subject: [PATCH 09/19] Fix forward AD

---
 src/ad.jl          | 57 +++++++++++++++++++++++++++++++---------------
 test/basictests.jl | 30 ++++++++----------------
 2 files changed, 48 insertions(+), 39 deletions(-)

diff --git a/src/ad.jl b/src/ad.jl
index 15e5af285..0af33742c 100644
--- a/src/ad.jl
+++ b/src/ad.jl
@@ -1,40 +1,61 @@
 function scalar_nlsolve_ad(prob, alg, args...; kwargs...)
     f = prob.f
     p = value(prob.p)
-
     u0 = value(prob.u0)
     newprob = NonlinearProblem(f, u0, p; prob.kwargs...)
 
     sol = solve(newprob, alg, args...; kwargs...)
 
     uu = sol.u
-    if p isa Number
-        f_p = ForwardDiff.derivative(Base.Fix1(f, uu), p)
-    else
-        f_p = ForwardDiff.gradient(Base.Fix1(f, uu), p)
-    end
+    f_p = scalar_nlsolve_∂f_∂p(f, uu, p)
+    f_x = scalar_nlsolve_∂f_∂u(f, uu, p)
+
+    z_arr = -inv(f_x) * f_p
 
-    f_x = ForwardDiff.derivative(Base.Fix2(f, p), uu)
     pp = prob.p
-    sumfun = let f_x′ = -f_x
-        ((fp, p),) -> (fp / f_x′) * ForwardDiff.partials(p)
+    sumfun = ((z, p),) -> [zᵢ * ForwardDiff.partials(p) for zᵢ in z]
+    if uu isa Number
+        partials = sum(sumfun, zip(z_arr, pp))
+    else
+        partials = sum(sumfun, zip(eachcol(z_arr), pp))
     end
-    partials = sum(sumfun, zip(f_p, pp))
+
     return sol, partials
 end
 
-function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector}, iip,
-        <:Dual{T, V, P}}, alg::AbstractNewtonAlgorithm, args...;
+function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector, <:AbstractArray},
+        iip, <:Dual{T, V, P}}, alg::AbstractNewtonAlgorithm, args...;
     kwargs...) where {iip, T, V, P}
     sol, partials = scalar_nlsolve_ad(prob, alg, args...; kwargs...)
-    return SciMLBase.build_solution(prob, alg, Dual{T, V, P}(sol.u, partials), sol.resid;
-        sol.retcode)
+    dual_soln = scalar_nlsolve_dual_soln(sol.u, partials, prob.p)
+    return SciMLBase.build_solution(prob, alg, dual_soln, sol.resid; sol.retcode)
 end
 
-function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector}, iip,
-        <:AbstractArray{<:Dual{T, V, P}}}, alg::AbstractNewtonAlgorithm, args...;
+function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector, <:AbstractArray},
+        iip, <:AbstractArray{<:Dual{T, V, P}}}, alg::AbstractNewtonAlgorithm, args...;
     kwargs...) where {iip, T, V, P}
     sol, partials = scalar_nlsolve_ad(prob, alg, args...; kwargs...)
-    return SciMLBase.build_solution(prob, alg, Dual{T, V, P}(sol.u, partials), sol.resid;
-        sol.retcode)
+    dual_soln = scalar_nlsolve_dual_soln(sol.u, partials, prob.p)
+    return SciMLBase.build_solution(prob, alg, dual_soln, sol.resid; sol.retcode)
+end
+
+function scalar_nlsolve_∂f_∂p(f, u, p)
+    ff = p isa Number ? ForwardDiff.derivative :
+         (u isa Number ? ForwardDiff.gradient : ForwardDiff.jacobian)
+    return ff(Base.Fix1(f, u), p)
+end
+
+function scalar_nlsolve_∂f_∂u(f, u, p)
+    ff = u isa Number ? ForwardDiff.derivative : ForwardDiff.jacobian
+    return ff(Base.Fix2(f, p), u)
+end
+
+function scalar_nlsolve_dual_soln(u::Number, partials,
+    ::Union{<:AbstractArray{<:Dual{T, V, P}}, Dual{T, V, P}}) where {T, V, P}
+    return Dual{T, V, P}(u, partials[1])
+end
+
+function scalar_nlsolve_dual_soln(u::AbstractArray, partials,
+    ::Union{<:AbstractArray{<:Dual{T, V, P}}, Dual{T, V, P}}) where {T, V, P}
+    return map(((uᵢ, pᵢ),) -> Dual{T, V, P}(uᵢ, pᵢ), zip(u, partials))
 end
diff --git a/test/basictests.jl b/test/basictests.jl
index 7b9de6b50..763ecf2d6 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -57,15 +57,13 @@ end
         @test (@ballocated solve!($cache)) ≤ 64
     end
 
-    # FIXME: Even the previous tests were broken, but due to a typo in the tests they
-    #        accidentally passed
     @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
         @test begin
             res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
             res_true = sqrt(p)
             all(res.u .≈ res_true)
         end
-        @test_broken ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
             @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
     end
 
@@ -101,11 +99,9 @@ end
     @test nlprob_iterator_interface(quadratic_f, p, Val(false)) ≈ sqrt.(p)
     @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p)
 
-    probN = NonlinearProblem(quadratic_f, @SVector[1.0, 1.0], 2.0)
-    @testset "ADType: $(autodiff) u0: $(u0)" for autodiff in (false, true,
+    @testset "ADType: $(autodiff) u0: $(_nameof(u0))" for autodiff in (false, true,
             AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),
-            AutoSparseZygote(),
-            AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0], @SVector[1.0, 1.0])
+            AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0])
         probN = NonlinearProblem(quadratic_f, u0, 2.0)
         @test all(solve(probN, NewtonRaphson(; autodiff)).u .≈ sqrt(2.0))
     end
@@ -149,8 +145,6 @@ end
         @test (@ballocated solve!($cache)) ≤ 64
     end
 
-    # FIXME: Even the previous tests were broken, but due to a typo in the tests they
-    #        accidentally passed
     @testset "[OOP] [Immutable AD] radius_update_scheme: $(radius_update_scheme) p: $(p)" for radius_update_scheme in radius_update_schemes,
         p in 1.0:0.1:100.0
 
@@ -160,7 +154,7 @@ end
             res_true = sqrt(p)
             all(res.u .≈ res_true)
         end
-        @test_broken ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
             @SVector[1.0, 1.0], p; radius_update_scheme).u[end], p) ≈ 1 / (2 * sqrt(p))
     end
 
@@ -204,11 +198,9 @@ end
     @test nlprob_iterator_interface(quadratic_f, p, Val(false)) ≈ sqrt.(p)
     @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p)
 
-    probN = NonlinearProblem(quadratic_f, @SVector[1.0, 1.0], 2.0)
-    @testset "ADType: $(autodiff) u0: $(u0) radius_update_scheme: $(radius_update_scheme)" for autodiff in (false,
+    @testset "ADType: $(autodiff) u0: $(_nameof(u0)) radius_update_scheme: $(radius_update_scheme)" for autodiff in (false,
             true, AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),
-            AutoSparseZygote(), AutoSparseEnzyme()),
-        u0 in (1.0, [1.0, 1.0], @SVector[1.0, 1.0]),
+            AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0]),
         radius_update_scheme in radius_update_schemes
 
         probN = NonlinearProblem(quadratic_f, u0, 2.0)
@@ -302,15 +294,13 @@ end
         @test (@ballocated solve!($cache)) ≤ 64
     end
 
-    # FIXME: Even the previous tests were broken, but due to a typo in the tests they
-    #        accidentally passed
     @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
         @test begin
             res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
             res_true = sqrt(p)
             all(res.u .≈ res_true)
         end
-        @test_broken ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
             @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
     end
 
@@ -330,11 +320,9 @@ end
     @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u], p) ≈
           ForwardDiff.jacobian(t, p)
 
-    probN = NonlinearProblem(quadratic_f, @SVector[1.0, 1.0], 2.0)
-    @testset "ADType: $(autodiff) u0: $(u0)" for autodiff in (false, true,
+    @testset "ADType: $(autodiff) u0: $(_nameof(u0))" for autodiff in (false, true,
             AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),
-            AutoSparseZygote(),
-            AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0], @SVector[1.0, 1.0])
+            AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0])
         probN = NonlinearProblem(quadratic_f, u0, 2.0)
         @test all(solve(probN, LevenbergMarquardt(; autodiff)).u .≈ sqrt(2.0))
     end

From a9fc4b8599f3baaf9db52d64bd02b2f376bcc688 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 11 Sep 2023 18:33:31 -0400
Subject: [PATCH 10/19] Non allocating for scalars

---
 src/ad.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ad.jl b/src/ad.jl
index 0af33742c..9b6567328 100644
--- a/src/ad.jl
+++ b/src/ad.jl
@@ -13,7 +13,7 @@ function scalar_nlsolve_ad(prob, alg, args...; kwargs...)
     z_arr = -inv(f_x) * f_p
 
     pp = prob.p
-    sumfun = ((z, p),) -> [zᵢ * ForwardDiff.partials(p) for zᵢ in z]
+    sumfun = ((z, p),) -> map(zᵢ -> zᵢ * ForwardDiff.partials(p), z)
     if uu isa Number
         partials = sum(sumfun, zip(z_arr, pp))
     else
@@ -52,7 +52,7 @@ end
 
 function scalar_nlsolve_dual_soln(u::Number, partials,
     ::Union{<:AbstractArray{<:Dual{T, V, P}}, Dual{T, V, P}}) where {T, V, P}
-    return Dual{T, V, P}(u, partials[1])
+    return Dual{T, V, P}(u, partials)
 end
 
 function scalar_nlsolve_dual_soln(u::AbstractArray, partials,

From 85f7449293c2df1291fe1005c7d5599cfd46a3b8 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 12 Sep 2023 09:25:29 -0400
Subject: [PATCH 11/19] Non allocating for static vectors

---
 src/ad.jl          |   2 +
 test/basictests.jl | 100 ++++++++++++++++++++++++++-------------------
 2 files changed, 59 insertions(+), 43 deletions(-)

diff --git a/src/ad.jl b/src/ad.jl
index 9b6567328..05fd8bfa9 100644
--- a/src/ad.jl
+++ b/src/ad.jl
@@ -16,6 +16,8 @@ function scalar_nlsolve_ad(prob, alg, args...; kwargs...)
     sumfun = ((z, p),) -> map(zᵢ -> zᵢ * ForwardDiff.partials(p), z)
     if uu isa Number
         partials = sum(sumfun, zip(z_arr, pp))
+    elseif p isa Number
+        partials = sumfun((z_arr, pp))
     else
         partials = sum(sumfun, zip(eachcol(z_arr), pp))
     end
diff --git a/test/basictests.jl b/test/basictests.jl
index 763ecf2d6..d06543efd 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -57,14 +57,16 @@ end
         @test (@ballocated solve!($cache)) ≤ 64
     end
 
-    @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
-        @test begin
-            res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
-            res_true = sqrt(p)
-            all(res.u .≈ res_true)
+    if VERSION ≥ v"1.9"
+        @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
+            @test begin
+                res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
+                res_true = sqrt(p)
+                all(res.u .≈ res_true)
+            end
+            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
         end
-        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
-            @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
     end
 
     @testset "[OOP] [Scalar AD] p: $(p)" for p in 1.0:0.1:100.0
@@ -77,11 +79,14 @@ end
               1 / (2 * sqrt(p))
     end
 
-    t = (p) -> [sqrt(p[2] / p[1])]
-    p = [0.9, 50.0]
-    @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
-    @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u], p) ≈
-          ForwardDiff.jacobian(t, p)
+    if VERSION ≥ v"1.9"
+        t = (p) -> [sqrt(p[2] / p[1])]
+        p = [0.9, 50.0]
+        @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
+        @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u],
+            p) ≈
+              ForwardDiff.jacobian(t, p)
+    end
 
     # Iterator interface
     function nlprob_iterator_interface(f, p_range, ::Val{iip}) where {iip}
@@ -145,17 +150,19 @@ end
         @test (@ballocated solve!($cache)) ≤ 64
     end
 
-    @testset "[OOP] [Immutable AD] radius_update_scheme: $(radius_update_scheme) p: $(p)" for radius_update_scheme in radius_update_schemes,
-        p in 1.0:0.1:100.0
-
-        @test begin
-            res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p;
-                radius_update_scheme)
-            res_true = sqrt(p)
-            all(res.u .≈ res_true)
+    if VERSION ≥ v"1.9"
+        @testset "[OOP] [Immutable AD] radius_update_scheme: $(radius_update_scheme) p: $(p)" for radius_update_scheme in radius_update_schemes,
+            p in 1.0:0.1:100.0
+
+            @test begin
+                res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p;
+                    radius_update_scheme)
+                res_true = sqrt(p)
+                all(res.u .≈ res_true)
+            end
+            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                @SVector[1.0, 1.0], p; radius_update_scheme).u[end], p) ≈ 1 / (2 * sqrt(p))
         end
-        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
-            @SVector[1.0, 1.0], p; radius_update_scheme).u[end], p) ≈ 1 / (2 * sqrt(p))
     end
 
     @testset "[OOP] [Scalar AD] radius_update_scheme: $(radius_update_scheme)  p: $(p)" for radius_update_scheme in radius_update_schemes,
@@ -171,15 +178,17 @@ end
                 p; radius_update_scheme).u, p) ≈ 1 / (2 * sqrt(p))
     end
 
-    t = (p) -> [sqrt(p[2] / p[1])]
-    p = [0.9, 50.0]
-    @testset "[OOP] [Jacobian] radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in radius_update_schemes
-        @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p; radius_update_scheme).u ≈
-              sqrt(p[2] / p[1])
-        @test ForwardDiff.jacobian(p -> [
-                benchmark_nlsolve_oop(quadratic_f2, 0.5, p;
-                    radius_update_scheme).u,
-            ], p) ≈ ForwardDiff.jacobian(t, p)
+    if VERSION ≥ v"1.9"
+        t = (p) -> [sqrt(p[2] / p[1])]
+        p = [0.9, 50.0]
+        @testset "[OOP] [Jacobian] radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in radius_update_schemes
+            @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p; radius_update_scheme).u ≈
+                  sqrt(p[2] / p[1])
+            @test ForwardDiff.jacobian(p -> [
+                    benchmark_nlsolve_oop(quadratic_f2, 0.5, p;
+                        radius_update_scheme).u,
+                ], p) ≈ ForwardDiff.jacobian(t, p)
+        end
     end
 
     # Iterator interface
@@ -294,14 +303,16 @@ end
         @test (@ballocated solve!($cache)) ≤ 64
     end
 
-    @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
-        @test begin
-            res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
-            res_true = sqrt(p)
-            all(res.u .≈ res_true)
+    if VERSION ≥ v"1.9"
+        @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
+            @test begin
+                res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
+                res_true = sqrt(p)
+                all(res.u .≈ res_true)
+            end
+            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
         end
-        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
-            @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
     end
 
     @testset "[OOP] [Scalar AD] p: $(p)" for p in 1.0:0.1:100.0
@@ -314,11 +325,14 @@ end
               1 / (2 * sqrt(p))
     end
 
-    t = (p) -> [sqrt(p[2] / p[1])]
-    p = [0.9, 50.0]
-    @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
-    @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u], p) ≈
-          ForwardDiff.jacobian(t, p)
+    if VERSION ≥ v"1.9"
+        t = (p) -> [sqrt(p[2] / p[1])]
+        p = [0.9, 50.0]
+        @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
+        @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u],
+            p) ≈
+              ForwardDiff.jacobian(t, p)
+    end
 
     @testset "ADType: $(autodiff) u0: $(_nameof(u0))" for autodiff in (false, true,
             AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),

From f7e29aac0f70119b4f97d73189509856f4428c9c Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 12 Sep 2023 11:54:08 -0400
Subject: [PATCH 12/19] Ignore SVector for 1.6

---
 test/basictests.jl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/test/basictests.jl b/test/basictests.jl
index d06543efd..11e64307d 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -31,7 +31,8 @@ end
         return solve(prob, NewtonRaphson(; linsolve, precs), abstol = 1e-9)
     end
 
-    @testset "[OOP] u0: $(typeof(u0))" for u0 in ([1.0, 1.0], @SVector[1.0, 1.0], 1.0)
+    u0s = VERSION ≥ v"1.9" ? ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) : ([1.0, 1.0], 1.0)
+    @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s
         sol = benchmark_nlsolve_oop(quadratic_f, u0)
         @test SciMLBase.successful_retcode(sol)
         @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
@@ -127,9 +128,11 @@ end
 
     radius_update_schemes = [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Hei,
         RadiusUpdateSchemes.Yuan, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
+    u0s = VERSION ≥ v"1.9" ? ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) : ([1.0, 1.0], 1.0)
+
+    @testset "[OOP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in u0s,
+        radius_update_scheme in radius_update_schemes
 
-    @testset "[OOP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([
-                1.0, 1.0], @SVector[1.0, 1.0], 1.0), radius_update_scheme in radius_update_schemes
         sol = benchmark_nlsolve_oop(quadratic_f, u0; radius_update_scheme)
         @test SciMLBase.successful_retcode(sol)
         @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
@@ -283,7 +286,8 @@ end
         return solve(prob, LevenbergMarquardt(), abstol = 1e-9)
     end
 
-    @testset "[OOP] u0: $(typeof(u0))" for u0 in ([1.0, 1.0], @SVector[1.0, 1.0], 1.0)
+    u0s = VERSION ≥ v"1.9" ? ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) : ([1.0, 1.0], 1.0)
+    @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s
         sol = benchmark_nlsolve_oop(quadratic_f, u0)
         @test SciMLBase.successful_retcode(sol)
         @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)

From 6307028494733bb930a30aca1a37a28dd7074331 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 12 Sep 2023 13:55:14 -0400
Subject: [PATCH 13/19] Bump compat entries

---
 Project.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 6bb2fe223..ab1f6a500 100644
--- a/Project.toml
+++ b/Project.toml
@@ -24,7 +24,9 @@ StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 
 [compat]
+ADTypes = "0.2"
 ArrayInterface = "6.0.24, 7"
+ConcreteStructs = "0.2"
 DiffEqBase = "6"
 EnumX = "1"
 Enzyme = "0.11"
@@ -36,7 +38,7 @@ RecursiveArrayTools = "2"
 Reexport = "0.2, 1"
 SciMLBase = "1.97"
 SimpleNonlinearSolve = "0.1"
-SparseDiffTools = "1, 2"
+SparseDiffTools = "2.6"
 StaticArraysCore = "1.4"
 UnPack = "1.0"
 Zygote = "0.6"

From 5b46c2dfd30ae47195fd1224175e5253c0a5b08d Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 13 Sep 2023 10:39:32 -0400
Subject: [PATCH 14/19] Fix jac prototype

---
 src/jacobian.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/jacobian.jl b/src/jacobian.jl
index dc64d0e08..157562752 100644
--- a/src/jacobian.jl
+++ b/src/jacobian.jl
@@ -75,7 +75,7 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
         JacVec(uf, u; autodiff = alg.ad)
     else
         if has_analytic_jac
-            iip ? undefmatrix(u) : nothing
+            f.jac_prototype === nothing ? undefmatrix(u) : f.jac_prototype
         else
             f.jac_prototype === nothing ? init_jacobian(jac_cache) : f.jac_prototype
         end

From de8086c7ce4086e3b3ac135ea88c3e247b3d37f2 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Fri, 15 Sep 2023 17:02:54 -0400
Subject: [PATCH 15/19] Fix JacVec for not inplace problems

---
 src/jacobian.jl | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/jacobian.jl b/src/jacobian.jl
index 157562752..aea7b4270 100644
--- a/src/jacobian.jl
+++ b/src/jacobian.jl
@@ -1,10 +1,13 @@
-@concrete struct JacobianWrapper
+@concrete struct JacobianWrapper{iip}
     f
     p
 end
 
-(uf::JacobianWrapper)(u) = uf.f(u, uf.p)
-(uf::JacobianWrapper)(res, u) = uf.f(res, u, uf.p)
+# Previous Implementation did not hold onto `iip`, but this causes problems in packages
+# where we check for the presence of function signatures to check which dispatch to call
+(uf::JacobianWrapper{false})(u) = uf.f(u, uf.p)
+(uf::JacobianWrapper{false})(res, u) = (vec(res) .= vec(uf.f(u, uf.p)))
+(uf::JacobianWrapper{true})(res, u) = uf.f(res, u, uf.p)
 
 sparsity_detection_alg(f, ad) = NoSparsityDetection()
 function sparsity_detection_alg(f, ad::AbstractSparseADType)
@@ -48,7 +51,7 @@ jacobian!!(::Number, cache) = last(value_derivative(cache.uf, cache.u))
 # Build Jacobian Caches
 function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
     ::Val{iip}) where {iip}
-    uf = JacobianWrapper(f, p)
+    uf = JacobianWrapper{iip}(f, p)
 
     haslinsolve = hasfield(typeof(alg), :linsolve)
 
@@ -98,6 +101,6 @@ end
 function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u::Number, p,
     ::Val{false})
     # NOTE: Scalar `u` assumes scalar output from `f`
-    uf = JacobianWrapper(f, p)
+    uf = JacobianWrapper{false}(f, p)
     return uf, nothing, u, nothing, nothing, u
 end

From 7e26d18c78173f887c7bda5c2a0b1bc20112701d Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 20 Sep 2023 14:26:23 -0400
Subject: [PATCH 16/19] Add support for line search in Newton Raphson

---
 Project.toml          |   2 +
 src/NonlinearSolve.jl |   5 +-
 src/jacobian.jl       |  11 ++--
 src/levenberg.jl      |  12 ++--
 src/linesearch.jl     | 146 ++++++++++++++++++++++++++++++++++++++++++
 src/raphson.jl        |  35 ++++++----
 src/trustRegion.jl    |  11 +---
 src/utils.jl          |  23 +++++++
 test/basictests.jl    |  56 +++++++++-------
 9 files changed, 241 insertions(+), 60 deletions(-)
 create mode 100644 src/linesearch.jl

diff --git a/Project.toml b/Project.toml
index ab1f6a500..f5d4ddcef 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,6 +11,7 @@ DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
 EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
@@ -33,6 +34,7 @@ Enzyme = "0.11"
 FiniteDiff = "2"
 ForwardDiff = "0.10.3"
 LinearSolve = "2"
+LineSearches = "7"
 PrecompileTools = "1"
 RecursiveArrayTools = "2"
 Reexport = "0.2, 1"
diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl
index 2f851faa3..615f96c03 100644
--- a/src/NonlinearSolve.jl
+++ b/src/NonlinearSolve.jl
@@ -20,7 +20,7 @@ import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isi
 import StaticArraysCore: StaticArray, SVector, SArray, MArray
 import UnPack: @unpack
 
-@reexport using ADTypes, SciMLBase, SimpleNonlinearSolve
+@reexport using ADTypes, LineSearches, SciMLBase, SimpleNonlinearSolve
 
 const AbstractSparseADType = Union{ADTypes.AbstractSparseFiniteDifferences,
     ADTypes.AbstractSparseForwardMode, ADTypes.AbstractSparseReverseMode}
@@ -35,6 +35,7 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::AbstractNonlinearSolveAl
 end
 
 include("utils.jl")
+include("linesearch.jl")
 include("raphson.jl")
 include("trustRegion.jl")
 include("levenberg.jl")
@@ -69,4 +70,6 @@ export RadiusUpdateSchemes
 
 export NewtonRaphson, TrustRegion, LevenbergMarquardt
 
+export LineSearch
+
 end # module
diff --git a/src/jacobian.jl b/src/jacobian.jl
index aea7b4270..83d26fee6 100644
--- a/src/jacobian.jl
+++ b/src/jacobian.jl
@@ -9,7 +9,7 @@ end
 (uf::JacobianWrapper{false})(res, u) = (vec(res) .= vec(uf.f(u, uf.p)))
 (uf::JacobianWrapper{true})(res, u) = uf.f(res, u, uf.p)
 
-sparsity_detection_alg(f, ad) = NoSparsityDetection()
+sparsity_detection_alg(_, _) = NoSparsityDetection()
 function sparsity_detection_alg(f, ad::AbstractSparseADType)
     if f.sparsity === nothing
         if f.jac_prototype === nothing
@@ -49,8 +49,8 @@ end
 jacobian!!(::Number, cache) = last(value_derivative(cache.uf, cache.u))
 
 # Build Jacobian Caches
-function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
-    ::Val{iip}) where {iip}
+function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p, ::Val{iip};
+    linsolve_kwargs=(;)) where {iip}
     uf = JacobianWrapper{iip}(f, p)
 
     haslinsolve = hasfield(typeof(alg), :linsolve)
@@ -92,14 +92,15 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
 
     Pl, Pr = wrapprecs(alg.precs(J, nothing, u, p, nothing, nothing, nothing, nothing,
             nothing)..., weight)
-    linsolve = init(linprob, alg.linsolve; alias_A = true, alias_b = true, Pl, Pr)
+    linsolve = init(linprob, alg.linsolve; alias_A = true, alias_b = true, Pl, Pr,
+        linsolve_kwargs...)
 
     return uf, linsolve, J, fu, jac_cache, du
 end
 
 ## Special Handling for Scalars
 function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u::Number, p,
-    ::Val{false})
+    ::Val{false}; kwargs...)
     # NOTE: Scalar `u` assumes scalar output from `f`
     uf = JacobianWrapper{false}(f, p)
     return uf, nothing, u, nothing, nothing, u
diff --git a/src/levenberg.jl b/src/levenberg.jl
index 6265eba3f..17f61475f 100644
--- a/src/levenberg.jl
+++ b/src/levenberg.jl
@@ -142,16 +142,12 @@ isinplace(::LevenbergMarquardtCache{iip}) where {iip} = iip
 
 function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarquardt,
     args...; alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
-    kwargs...) where {uType, iip}
+    linsolve_kwargs=(;), kwargs...) where {uType, iip}
     @unpack f, u0, p = prob
     u = alias_u0 ? u0 : deepcopy(u0)
-    if iip
-        fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
-        f(fu1, u, p)
-    else
-        fu1 = f(u, p)
-    end
-    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
+    fu1 = evaluate_f(prob, u)
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip);
+        linsolve_kwargs)
 
     λ = convert(eltype(u), alg.damping_initial)
     λ_factor = convert(eltype(u), alg.damping_increase_factor)
diff --git a/src/linesearch.jl b/src/linesearch.jl
new file mode 100644
index 000000000..3890f8230
--- /dev/null
+++ b/src/linesearch.jl
@@ -0,0 +1,146 @@
+"""
+    LineSearch(; method = Static(), autodiff = AutoFiniteDiff(), alpha = true)
+
+Wrapper over algorithms from
+[LineSearches.jl](https://github.com/JuliaNLSolvers/LineSearches.jl/). Allows automatic
+construction of the objective functions for the line search algorithms utilizing automatic
+differentiation for fast Vector Jacobian Products.
+
+### Arguments
+
+  - `method`: the line search algorithm to use. Defaults to `Static()`, which means that the
+    step size is fixed to the value of `alpha`.
+  - `autodiff`: the automatic differentiation backend to use for the line search. Defaults to
+    `AutoFiniteDiff()`, which means that finite differencing is used to compute the VJP.
+    `AutoZygote()` will be faster in most cases, but it requires `Zygote.jl` to be manually
+    installed and loaded
+  - `alpha`: the initial step size to use. Defaults to `true` (which is equivalent to `1`).
+"""
+@concrete struct LineSearch
+    method
+    autodiff
+    α
+end
+
+function LineSearch(; method = Static(), autodiff = AutoFiniteDiff(), alpha = true)
+    return LineSearch(method, autodiff, alpha)
+end
+
+@concrete mutable struct LineSearchCache
+    f
+    ϕ
+    dϕ
+    ϕdϕ
+    α
+    ls
+end
+
+function LineSearchCache(ls::LineSearch, f, u::Number, p, _, ::Val{false})
+    eval_f(u, du, α) = eval_f(u - α * du)
+    eval_f(u) = f(u, p)
+
+    ls.method isa Static && return LineSearchCache(eval_f, nothing, nothing, nothing,
+        convert(typeof(u), ls.α), ls)
+
+    g(u, fu) = last(value_derivative(Base.Fix2(f, p), u)) * fu
+
+    function ϕ(u, du)
+        function ϕ_internal(α)
+            u_ = u - α * du
+            _fu = eval_f(u_)
+            return dot(_fu, _fu) / 2
+        end
+        return ϕ_internal
+    end
+
+    function dϕ(u, du)
+        function dϕ_internal(α)
+            u_ = u - α * du
+            _fu = eval_f(u_)
+            g₀ = g(u_, _fu)
+            return dot(g₀, -du)
+        end
+        return dϕ_internal
+    end
+
+    function ϕdϕ(u, du)
+        function ϕdϕ_internal(α)
+            u_ = u - α * du
+            _fu = eval_f(u_)
+            g₀ = g(u_, _fu)
+            return dot(_fu, _fu) / 2, dot(g₀, -du)
+        end
+        return ϕdϕ_internal
+    end
+
+    return LineSearchCache(eval_f, ϕ, dϕ, ϕdϕ, convert(eltype(u), ls.α), ls)
+end
+
+function LineSearchCache(ls::LineSearch, f, u, p, fu1, IIP::Val{iip}) where {iip}
+    fu = iip ? fu1 : nothing
+    u_ = _mutable_zero(u)
+
+    function eval_f(u, du, α)
+        @. u_ = u - α * du
+        return eval_f(u_)
+    end
+    eval_f(u) = evaluate_f(f, u, p, IIP; fu)
+
+    ls.method isa Static && return LineSearchCache(eval_f, nothing, nothing, nothing,
+        convert(eltype(u), ls.α), ls)
+
+    g₀ = _mutable_zero(u)
+
+    function g!(u, fu)
+        op = VecJac((args...) -> f(args..., p), u)
+        if iip
+            mul!(g₀, op, fu)
+            return g₀
+        else
+            return op * fu
+        end
+    end
+
+    function ϕ(u, du)
+        function ϕ_internal(α)
+            @. u_ = u - α * du
+            _fu = eval_f(u_)
+            return dot(_fu, _fu) / 2
+        end
+        return ϕ_internal
+    end
+
+    function dϕ(u, du)
+        function dϕ_internal(α)
+            @. u_ = u - α * du
+            _fu = eval_f(u_)
+            g₀ = g!(u_, _fu)
+            return dot(g₀, -du)
+        end
+        return dϕ_internal
+    end
+
+    function ϕdϕ(u, du)
+        function ϕdϕ_internal(α)
+            @. u_ = u - α * du
+            _fu = eval_f(u_)
+            g₀ = g!(u_, _fu)
+            return dot(_fu, _fu) / 2, dot(g₀, -du)
+        end
+        return ϕdϕ_internal
+    end
+
+    return LineSearchCache(eval_f, ϕ, dϕ, ϕdϕ, convert(eltype(u), ls.α), ls)
+end
+
+# Choose the step along `du`. `Static` returns `(α, cache.f(u, du, α))` with the stored α;
+# any other method is called on the 1-D closures for `(u, du)` and its result is returned.
+function perform_linesearch!(cache::LineSearchCache, u, du)
+    cache.ls.method isa Static && return (cache.α, cache.f(u, du, cache.α))
+
+    # Build each closure once and reuse it instead of rebuilding dϕ and ϕdϕ for the call.
+    ϕ, dϕ, ϕdϕ = cache.ϕ(u, du), cache.dϕ(u, du), cache.ϕdϕ(u, du)
+    ϕ₀, dϕ₀ = ϕdϕ(zero(eltype(u)))
+
+    return cache.ls.method(ϕ, dϕ, ϕdϕ, cache.α, ϕ₀, dϕ₀)
+end
diff --git a/src/raphson.jl b/src/raphson.jl
index 33d12c4ba..d01881dc4 100644
--- a/src/raphson.jl
+++ b/src/raphson.jl
@@ -25,19 +25,24 @@ for large-scale and numerically-difficult nonlinear systems.
     preconditioners. For more information on specifying preconditioners for LinearSolve
     algorithms, consult the
     [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+  - `linesearch`: the line search algorithm to use. Defaults to [`LineSearch()`](@ref),
+    which means that no line search is performed. Algorithms from `LineSearches.jl` can be
+    used here directly, and they will be converted to the correct `LineSearch`.
 """
 @concrete struct NewtonRaphson{CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD}
     ad::AD
     linsolve
     precs
+    linesearch
 end
 
 concrete_jac(::NewtonRaphson{CJ}) where {CJ} = CJ
 
 function NewtonRaphson(; concrete_jac = nothing, linsolve = nothing,
-    precs = DEFAULT_PRECS, adkwargs...)
+    linesearch = LineSearch(), precs = DEFAULT_PRECS, adkwargs...)
     ad = default_adargs_to_adtype(; adkwargs...)
-    return NewtonRaphson{_unwrap_val(concrete_jac)}(ad, linsolve, precs)
+    linesearch = linesearch isa LineSearch ? linesearch : LineSearch(; method=linesearch)
+    return NewtonRaphson{_unwrap_val(concrete_jac)}(ad, linsolve, precs, linesearch)
 end
 
 @concrete mutable struct NewtonRaphsonCache{iip}
@@ -59,26 +64,23 @@ end
     abstol
     prob
     stats::NLStats
+    lscache
 end
 
 isinplace(::NewtonRaphsonCache{iip}) where {iip} = iip
 
 function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::NewtonRaphson, args...;
     alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
-    kwargs...) where {uType, iip}
+    linsolve_kwargs=(;), kwargs...) where {uType, iip}
     @unpack f, u0, p = prob
     u = alias_u0 ? u0 : deepcopy(u0)
-    if iip
-        fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
-        f(fu1, u, p)
-    else
-        fu1 = _mutable(f(u, p))
-    end
-    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
+    fu1 = evaluate_f(prob, u)
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip);
+        linsolve_kwargs)
 
     return NewtonRaphsonCache{iip}(f, alg, u, fu1, fu2, du, p, uf, linsolve, J,
         jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob,
-        NLStats(1, 0, 0, 0, 0))
+        NLStats(1, 0, 0, 0, 0), LineSearchCache(alg.linesearch, f, u, p, fu1, Val(iip)))
 end
 
 function perform_step!(cache::NewtonRaphsonCache{true})
@@ -89,8 +91,10 @@ function perform_step!(cache::NewtonRaphsonCache{true})
     linres = dolinsolve(alg.precs, linsolve; A = J, b = _vec(fu1), linu = _vec(du),
         p, reltol = cache.abstol)
     cache.linsolve = linres.cache
-    @. u = u - du
-    f(fu1, u, p)
+
+    # Line Search
+    α, _ = perform_linesearch!(cache.lscache, u, du)
+    @. u = u - α * du
 
     cache.internalnorm(fu1) < cache.abstol && (cache.force_stop = true)
     cache.stats.nf += 1
@@ -112,7 +116,10 @@ function perform_step!(cache::NewtonRaphsonCache{false})
             linu = _vec(cache.du), p, reltol = cache.abstol)
         cache.linsolve = linres.cache
     end
-    cache.u = @. u - cache.du  # `u` might not support mutation
+
+    # Line Search
+    α, _fu = perform_linesearch!(cache.lscache, u, cache.du)
+    cache.u = @. u - α * cache.du  # `u` might not support mutation
     cache.fu1 = f(cache.u, p)
 
     cache.internalnorm(fu1) < cache.abstol && (cache.force_stop = true)
diff --git a/src/trustRegion.jl b/src/trustRegion.jl
index 41ccb994e..e0892a4da 100644
--- a/src/trustRegion.jl
+++ b/src/trustRegion.jl
@@ -202,20 +202,15 @@ end
 
 function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion, args...;
     alias_u0 = false, maxiters = 1000, abstol = 1e-8, internalnorm = DEFAULT_NORM,
-    kwargs...) where {uType, iip}
+    linsolve_kwargs=(;), kwargs...) where {uType, iip}
     @unpack f, u0, p = prob
     u = alias_u0 ? u0 : deepcopy(u0)
     u_prev = zero(u)
-    if iip
-        fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
-        f(fu1, u, p)
-    else
-        fu1 = f(u, p)
-    end
+    fu1 = evaluate_f(prob, u)
     fu_prev = zero(fu1)
 
     loss = get_loss(fu1)
-    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); linsolve_kwargs)
 
     radius_update_scheme = alg.radius_update_scheme
     max_trust_radius = convert(eltype(u), alg.max_trust_radius)
diff --git a/src/utils.jl b/src/utils.jl
index 3df540632..7498d5afa 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -142,3 +142,26 @@ _maybe_mutable(x, ::AbstractFiniteDifferencesMode) = _mutable(x)
 # The shadow allocated for Enzyme needs to be mutable
 _maybe_mutable(x, ::AutoSparseEnzyme) = _mutable(x)
 _maybe_mutable(x, _) = x
+
+# Helper function to get the value of `f(u, p)` for `prob` at the given `u`.
+function evaluate_f(prob::NonlinearProblem{uType, iip}, u) where {uType, iip}
+    @unpack f, p = prob
+    # Out-of-place: call `f(u, p)` and pass the result through `_mutable`.
+    iip || return _mutable(f(u, p))
+    # In-place: write into `f.resid_prototype` when one is set (that buffer itself is
+    # reused, not copied), otherwise into a fresh `zero(u)`.
+    fu = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
+    f(fu, u, p)
+    return fu
+end
+
+function evaluate_f(cache, u; fu = nothing)
+    return evaluate_f(cache.f, u, cache.p, Val(cache.iip); fu)
+end
+
+# With `iip` true, calls `f(fu, u, p)` and returns `fu`; otherwise returns `f(u, p)`.
+function evaluate_f(f, u, p, ::Val{iip}; fu = nothing) where {iip}
+    iip || return f(u, p)
+    f(fu, u, p)
+    return fu
+end
diff --git a/test/basictests.jl b/test/basictests.jl
index 11e64307d..c31be05fa 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -21,41 +21,49 @@ end
 # --- NewtonRaphson tests ---
 
 @testset "NewtonRaphson" begin
-    function benchmark_nlsolve_oop(f, u0, p = 2.0)
+    function benchmark_nlsolve_oop(f, u0, p = 2.0; linesearch = LineSearch())
         prob = NonlinearProblem{false}(f, u0, p)
-        return solve(prob, NewtonRaphson(), abstol = 1e-9)
+        return solve(prob, NewtonRaphson(; linesearch), abstol = 1e-9)
     end
 
-    function benchmark_nlsolve_iip(f, u0, p = 2.0; linsolve, precs)
+    function benchmark_nlsolve_iip(f, u0, p = 2.0; linsolve, precs,
+        linesearch = LineSearch())
         prob = NonlinearProblem{true}(f, u0, p)
-        return solve(prob, NewtonRaphson(; linsolve, precs), abstol = 1e-9)
+        return solve(prob, NewtonRaphson(; linsolve, precs, linesearch), abstol = 1e-9)
     end
 
-    u0s = VERSION ≥ v"1.9" ? ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) : ([1.0, 1.0], 1.0)
-    @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s
-        sol = benchmark_nlsolve_oop(quadratic_f, u0)
-        @test SciMLBase.successful_retcode(sol)
-        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+    @testset "LineSearch: $(_nameof(lsmethod)) LineSearch AD: $(_nameof(ad))" for lsmethod in (Static(),
+            StrongWolfe(), BackTracking(), HagerZhang(), MoreThuente()),
+        ad in (AutoFiniteDiff(), AutoZygote())
 
-        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0), NewtonRaphson(),
-            abstol = 1e-9)
-        @test (@ballocated solve!($cache)) < 200
-    end
+        linesearch = LineSearch(; method = lsmethod, autodiff = ad)
+        u0s = VERSION ≥ v"1.9" ? ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) : ([1.0, 1.0], 1.0)
 
-    precs = [NonlinearSolve.DEFAULT_PRECS, :Random]
+        @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s
+            sol = benchmark_nlsolve_oop(quadratic_f, u0; linesearch)
+            @test SciMLBase.successful_retcode(sol)
+            @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-    @testset "[IIP] u0: $(typeof(u0)) precs: $(_nameof(prec)) linsolve: $(_nameof(linsolve))" for u0 in ([
-            1.0, 1.0],), prec in precs, linsolve in (nothing, KrylovJL_GMRES())
-        if prec === :Random
-            prec = (args...) -> (Diagonal(randn!(similar(u0))), nothing)
+            cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0), NewtonRaphson(),
+                abstol = 1e-9)
+            @test (@ballocated solve!($cache)) < 200
         end
-        sol = benchmark_nlsolve_iip(quadratic_f!, u0; linsolve, precs = prec)
-        @test SciMLBase.successful_retcode(sol)
-        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-        cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0),
-            NewtonRaphson(; linsolve, precs = prec), abstol = 1e-9)
-        @test (@ballocated solve!($cache)) ≤ 64
+        precs = [NonlinearSolve.DEFAULT_PRECS, :Random]
+
+        @testset "[IIP] u0: $(typeof(u0)) precs: $(_nameof(prec)) linsolve: $(_nameof(linsolve))" for u0 in ([
+                1.0, 1.0],), prec in precs, linsolve in (nothing, KrylovJL_GMRES())
+            if prec === :Random
+                prec = (args...) -> (Diagonal(randn!(similar(u0))), nothing)
+            end
+            sol = benchmark_nlsolve_iip(quadratic_f!, u0; linsolve, precs = prec, linesearch)
+            @test SciMLBase.successful_retcode(sol)
+            @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+
+            cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0),
+                NewtonRaphson(; linsolve, precs = prec), abstol = 1e-9)
+            @test (@ballocated solve!($cache)) ≤ 64
+        end
     end
 
     if VERSION ≥ v"1.9"

From 83c0723d801ac1e65140891948b2173b52beaf87 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 20 Sep 2023 15:41:25 -0400
Subject: [PATCH 17/19] auto switch to finitediff for inplace problems

---
 src/linesearch.jl  |  16 +++++--
 src/raphson.jl     |   5 ++-
 test/basictests.jl | 107 +++++++++++++++++++++++++--------------------
 3 files changed, 76 insertions(+), 52 deletions(-)

diff --git a/src/linesearch.jl b/src/linesearch.jl
index 3890f8230..30861e14b 100644
--- a/src/linesearch.jl
+++ b/src/linesearch.jl
@@ -91,8 +91,15 @@ function LineSearchCache(ls::LineSearch, f, u, p, fu1, IIP::Val{iip}) where {iip
 
     g₀ = _mutable_zero(u)
 
+    autodiff = if iip && (ls.autodiff isa AutoZygote || ls.autodiff isa AutoSparseZygote)
+        @warn "Attempting to use Zygote.jl for linesearch on an in-place problem. Falling back to finite differencing."
+        AutoFiniteDiff()
+    else
+        ls.autodiff
+    end
+
     function g!(u, fu)
-        op = VecJac((args...) -> f(args..., p), u)
+        op = VecJac((args...) -> f(args..., p), u; autodiff)
         if iip
             mul!(g₀, op, fu)
             return g₀
@@ -134,7 +141,7 @@ function LineSearchCache(ls::LineSearch, f, u, p, fu1, IIP::Val{iip}) where {iip
 end
 
 function perform_linesearch!(cache::LineSearchCache, u, du)
-    cache.ls.method isa Static && return (cache.α, cache.f(u, du, cache.α))
+    cache.ls.method isa Static && return cache.α
 
     ϕ = cache.ϕ(u, du)
     dϕ = cache.dϕ(u, du)
@@ -142,5 +149,8 @@ function perform_linesearch!(cache::LineSearchCache, u, du)
 
     ϕ₀, dϕ₀ = ϕdϕ(zero(eltype(u)))
 
-    return cache.ls.method(ϕ, cache.dϕ(u, du), cache.ϕdϕ(u, du), cache.α, ϕ₀, dϕ₀)
+    # This case is sometimes possible for large optimization problems
+    dϕ₀ ≥ 0 && return cache.α
+
+    return first(cache.ls.method(ϕ, cache.dϕ(u, du), cache.ϕdϕ(u, du), cache.α, ϕ₀, dϕ₀))
 end
diff --git a/src/raphson.jl b/src/raphson.jl
index d01881dc4..8297f92fe 100644
--- a/src/raphson.jl
+++ b/src/raphson.jl
@@ -93,8 +93,9 @@ function perform_step!(cache::NewtonRaphsonCache{true})
     cache.linsolve = linres.cache
 
     # Line Search
-    α, _ = perform_linesearch!(cache.lscache, u, du)
+    α = perform_linesearch!(cache.lscache, u, du)
     @. u = u - α * du
+    f(cache.fu1, u, p)
 
     cache.internalnorm(fu1) < cache.abstol && (cache.force_stop = true)
     cache.stats.nf += 1
@@ -118,7 +119,7 @@ function perform_step!(cache::NewtonRaphsonCache{false})
     end
 
     # Line Search
-    α, _fu = perform_linesearch!(cache.lscache, u, cache.du)
+    α = perform_linesearch!(cache.lscache, u, cache.du)
     cache.u = @. u - α * cache.du  # `u` might not support mutation
     cache.fu1 = f(cache.u, p)
 
diff --git a/test/basictests.jl b/test/basictests.jl
index c31be05fa..54e63e93d 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -53,10 +53,12 @@ end
 
         @testset "[IIP] u0: $(typeof(u0)) precs: $(_nameof(prec)) linsolve: $(_nameof(linsolve))" for u0 in ([
                 1.0, 1.0],), prec in precs, linsolve in (nothing, KrylovJL_GMRES())
+            ad isa AutoZygote && continue
             if prec === :Random
                 prec = (args...) -> (Diagonal(randn!(similar(u0))), nothing)
             end
-            sol = benchmark_nlsolve_iip(quadratic_f!, u0; linsolve, precs = prec, linesearch)
+            sol = benchmark_nlsolve_iip(quadratic_f!, u0; linsolve, precs = prec,
+                linesearch)
             @test SciMLBase.successful_retcode(sol)
             @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
@@ -67,25 +69,30 @@ end
     end
 
     if VERSION ≥ v"1.9"
-        @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
-            @test begin
-                res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
-                res_true = sqrt(p)
-                all(res.u .≈ res_true)
+        @testset "[OOP] [Immutable AD]" begin
+            for p in 1.0:0.1:100.0
+                @test begin
+                    res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
+                    res_true = sqrt(p)
+                    all(res.u .≈ res_true)
+                end
+                @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                    @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
             end
-            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
-                @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
         end
     end
 
-    @testset "[OOP] [Scalar AD] p: $(p)" for p in 1.0:0.1:100.0
-        @test begin
-            res = benchmark_nlsolve_oop(quadratic_f, 1.0, p)
-            res_true = sqrt(p)
-            res.u ≈ res_true
+    @testset "[OOP] [Scalar AD]" begin
+        for p in 1.0:0.1:100.0
+            @test begin
+                res = benchmark_nlsolve_oop(quadratic_f, 1.0, p)
+                res_true = sqrt(p)
+                res.u ≈ res_true
+            end
+            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, 1.0, p).u,
+                p) ≈
+                  1 / (2 * sqrt(p))
         end
-        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, 1.0, p).u, p) ≈
-              1 / (2 * sqrt(p))
     end
 
     if VERSION ≥ v"1.9"
@@ -162,33 +169,34 @@ end
     end
 
     if VERSION ≥ v"1.9"
-        @testset "[OOP] [Immutable AD] radius_update_scheme: $(radius_update_scheme) p: $(p)" for radius_update_scheme in radius_update_schemes,
-            p in 1.0:0.1:100.0
+        @testset "[OOP] [Immutable AD] radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in radius_update_schemes
+            for p in 1.0:0.1:100.0
+                @test begin
+                    res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p;
+                        radius_update_scheme)
+                    res_true = sqrt(p)
+                    all(res.u .≈ res_true)
+                end
+                @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                    @SVector[1.0, 1.0], p; radius_update_scheme).u[end], p) ≈ 1 / (2 * sqrt(p))
+            end
+        end
+    end
 
+    @testset "[OOP] [Scalar AD] radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in radius_update_schemes
+        for p in 1.0:0.1:100.0
             @test begin
-                res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p;
+                res = benchmark_nlsolve_oop(quadratic_f, oftype(p, 1.0), p;
                     radius_update_scheme)
                 res_true = sqrt(p)
-                all(res.u .≈ res_true)
+                res.u ≈ res_true
             end
             @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
-                @SVector[1.0, 1.0], p; radius_update_scheme).u[end], p) ≈ 1 / (2 * sqrt(p))
+                    oftype(p, 1.0),
+                    p; radius_update_scheme).u, p) ≈ 1 / (2 * sqrt(p))
         end
     end
 
-    @testset "[OOP] [Scalar AD] radius_update_scheme: $(radius_update_scheme)  p: $(p)" for radius_update_scheme in radius_update_schemes,
-        p in 1.0:0.1:100.0
-
-        @test begin
-            res = benchmark_nlsolve_oop(quadratic_f, oftype(p, 1.0), p;
-                radius_update_scheme)
-            res_true = sqrt(p)
-            res.u ≈ res_true
-        end
-        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, oftype(p, 1.0),
-                p; radius_update_scheme).u, p) ≈ 1 / (2 * sqrt(p))
-    end
-
     if VERSION ≥ v"1.9"
         t = (p) -> [sqrt(p[2] / p[1])]
         p = [0.9, 50.0]
@@ -316,25 +324,30 @@ end
     end
 
     if VERSION ≥ v"1.9"
-        @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
-            @test begin
-                res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
-                res_true = sqrt(p)
-                all(res.u .≈ res_true)
+        @testset "[OOP] [Immutable AD]" begin
+            for p in 1.0:0.1:100.0
+                @test begin
+                    res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
+                    res_true = sqrt(p)
+                    all(res.u .≈ res_true)
+                end
+                @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                    @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
             end
-            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
-                @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
         end
     end
 
-    @testset "[OOP] [Scalar AD] p: $(p)" for p in 1.0:0.1:100.0
-        @test begin
-            res = benchmark_nlsolve_oop(quadratic_f, 1.0, p)
-            res_true = sqrt(p)
-            res.u ≈ res_true
+    @testset "[OOP] [Scalar AD]" begin
+        for p in 1.0:0.1:100.0
+            @test begin
+                res = benchmark_nlsolve_oop(quadratic_f, 1.0, p)
+                res_true = sqrt(p)
+                res.u ≈ res_true
+            end
+            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, 1.0, p).u,
+                p) ≈
+                  1 / (2 * sqrt(p))
         end
-        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, 1.0, p).u, p) ≈
-              1 / (2 * sqrt(p))
     end
 
     if VERSION ≥ v"1.9"

From 0b3eaa1587121d120f9a0158858daafab608ab35 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 20 Sep 2023 18:22:30 -0400
Subject: [PATCH 18/19] Drop 1.6 and require DiffEqBase 6.130

---
 .github/workflows/CI.yml | 1 -
 Project.toml             | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index cf1105bad..33fa3e6e2 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -15,7 +15,6 @@ jobs:
           - Core
         version:
           - '1'
-          - '1.6'
     steps:
       - uses: actions/checkout@v3
       - uses: julia-actions/setup-julia@v1
diff --git a/Project.toml b/Project.toml
index f5d4ddcef..4d474db44 100644
--- a/Project.toml
+++ b/Project.toml
@@ -28,7 +28,7 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 ADTypes = "0.2"
 ArrayInterface = "6.0.24, 7"
 ConcreteStructs = "0.2"
-DiffEqBase = "6"
+DiffEqBase = "6.130"
 EnumX = "1"
 Enzyme = "0.11"
 FiniteDiff = "2"
@@ -44,7 +44,7 @@ SparseDiffTools = "2.6"
 StaticArraysCore = "1.4"
 UnPack = "1.0"
 Zygote = "0.6"
-julia = "1.6"
+julia = "1.9"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"

From 4cd2d979e4d2da5972f69f654e112a4b0e062e6a Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 20 Sep 2023 18:25:16 -0400
Subject: [PATCH 19/19] Remove 1.6 downstream

---
 .github/workflows/Downstream.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/Downstream.yml b/.github/workflows/Downstream.yml
index 0d2a213b4..ffa38dd95 100644
--- a/.github/workflows/Downstream.yml
+++ b/.github/workflows/Downstream.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        julia-version: [1,1.6]
+        julia-version: [1]
         os: [ubuntu-latest]
         package:
           - {user: SciML, repo: ModelingToolkit.jl, group: All}