From 0cb65fcdcf638eda2c648eb923831109a32ecffb Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 12 Sep 2023 18:18:59 -0400
Subject: [PATCH 01/28] Fast path for Two Point BVPs

---
 src/BoundaryValueDiffEq.jl |   1 +
 src/nlprob.jl              | 136 +++++++++++++++++++++++++++++++++++++
 test/orbital.jl            |   5 +-
 3 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index 157e6f69..b6d6b300 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -11,6 +11,7 @@ import DiffEqBase: solve
 import ForwardDiff: pickchunksize
 import RecursiveArrayTools: ArrayPartition, DiffEqArray
 import SciMLBase: AbstractDiffEqInterpolation
+import RecursiveArrayTools: ArrayPartition
 import SparseDiffTools: AbstractSparseADType
 import TruncatedStacktraces: @truncate_stacktrace
 import UnPack: @unpack
diff --git a/src/nlprob.jl b/src/nlprob.jl
index 27487dc9..1ed8b5d0 100644
--- a/src/nlprob.jl
+++ b/src/nlprob.jl
@@ -251,3 +251,139 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
 
     return NonlinearProblem(NonlinearFunction{iip}(loss; jac, jac_prototype), y, cache.p)
 end
+
+# Two Point Specialization
+function construct_sparse_banded_jac_prototype(y::ArrayPartition, M, N)
+    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
+    l_top = M * length(y.x[1].x[1])
+    l_bot = M * length(y.x[1].x[2])
+
+    Is = Vector{Int}(undef, l + l_top + l_bot)
+    Js = Vector{Int}(undef, l + l_top + l_bot)
+    idx = 1
+
+    for i in 1:length(y.x[1].x[1]), j in 1:M
+        Is[idx] = i
+        Js[idx] = j
+        idx += 1
+    end
+
+    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
+        Is[idx] = i + length(y.x[1].x[1])
+        Js[idx] = j
+        idx += 1
+    end
+
+    for i in 1:length(y.x[1].x[2]), j in 1:M
+        Is[idx] = i + length(y.x[1].x[1]) + M * (N - 1)
+        Js[idx] = j + M * (N - 1)
+        idx += 1
+    end
+
+    y_ = similar(y, length(Is))
+    return sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
+        y_, M * N, M * N)
+end
+
+function generate_split_jac(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
+    _) where {iip}
+    @unpack nlsolve, jac_alg = cache.alg
+    N = length(cache.mesh)
+
+    resid_bc = cache.prob.f.bcresid_prototype === nothing ? similar(y, cache.M) :
+               cache.prob.f.bcresid_prototype
+    resid_collocation = similar(y, cache.M * (N - 1))
+
+    sd_bc = jac_alg.bc_diffmode isa AbstractSparseADType ? SymbolicsSparsityDetection() :
+            NoSparsityDetection()
+
+    if iip
+        cache_bc = sparse_jacobian_cache(jac_alg.bc_diffmode, sd_bc, loss_bc, resid_bc, y)
+    else
+        cache_bc = sparse_jacobian_cache(jac_alg.bc_diffmode, sd_bc, loss_bc, y;
+            fx = resid_bc)
+    end
+
+    sd_collocation = if jac_alg.collocation_diffmode isa AbstractSparseADType
+        Jₛ = construct_sparse_banded_jac_prototype(y, cache.M, N)
+        JacPrototypeSparsityDetection(; jac_prototype = Jₛ)
+    else
+        NoSparsityDetection()
+    end
+
+    if iip
+        cache_collocation = sparse_jacobian_cache(jac_alg.collocation_diffmode,
+            sd_collocation, loss_collocation, resid_collocation, y)
+    else
+        cache_collocation = sparse_jacobian_cache(jac_alg.collocation_diffmode,
+            sd_collocation, loss_collocation, y; fx = resid_collocation)
+    end
+
+    jac_prototype = vcat(init_jacobian(cache_bc), init_jacobian(cache_collocation))
+
+    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
+    #       mismatch for ForwardDiff
+    jac = if iip
+        function jac_internal!(J, x, p)
+            sparse_jacobian!(@view(J[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
+                loss_bc, resid_bc, x)
+            sparse_jacobian!(@view(J[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
+                cache_collocation, loss_collocation, resid_collocation, x)
+            return J
+        end
+    else
+        J_ = jac_prototype
+        function jac_internal(x, p)
+            sparse_jacobian!(@view(J_[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
+                loss_bc, x)
+            sparse_jacobian!(@view(J_[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
+                cache_collocation, loss_collocation, x)
+            return J_
+        end
+    end
+
+    return jac, jac_prototype
+end
+
+function generate_split_jac(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
+    ::TwoPointBVProblem) where {iip}
+    @unpack nlsolve, jac_alg = cache.alg
+    N = length(cache.mesh)
+
+    if !iip && cache.prob.f.bcresid_prototype === nothing
+        y_ = recursive_unflatten!(cache.y, y)
+        resid_ = cache.bc((y_[1], y_[end]), cache.p)
+        resid = ArrayPartition(ArrayPartition(resid_), similar(y, cache.M * (N - 1)))
+    else
+        resid = ArrayPartition(cache.prob.f.bcresid_prototype,
+            similar(y, cache.M * (N - 1)))
+    end
+
+    Jₛ = construct_sparse_banded_jac_prototype(resid, cache.M, N)
+    sd = JacPrototypeSparsityDetection(; jac_prototype = Jₛ)
+
+    if iip
+        cache = sparse_jacobian_cache(jac_alg.bc_diffmode, sd, loss, resid, y)
+    else
+        cache = sparse_jacobian_cache(jac_alg.bc_diffmode, sd, loss, y; fx = resid)
+    end
+
+    jac_prototype = init_jacobian(cache)
+
+    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
+    #       mismatch for ForwardDiff
+    jac = if iip
+        function jac_internal!(J, x, p)
+            sparse_jacobian!(J, jac_alg.bc_diffmode, cache, loss, resid, x)
+            return J
+        end
+    else
+        J_ = jac_prototype
+        function jac_internal(x, p)
+            sparse_jacobian!(J_, jac_alg.bc_diffmode, cache, loss, x)
+            return J_
+        end
+    end
+
+    return jac, jac_prototype
+end
diff --git a/test/orbital.jl b/test/orbital.jl
index cd3ce853..2a6dcb64 100644
--- a/test/orbital.jl
+++ b/test/orbital.jl
@@ -3,6 +3,8 @@ using BoundaryValueDiffEq, OrdinaryDiffEq, LinearAlgebra, Test
 
 @info "Testing Lambert's Problem"
 
+@info "Testing Lambert's Problem"
+
 y0 = [
     -4.7763169762853989E+06,
     -3.8386398704441520E+05,
@@ -74,8 +76,7 @@ for autodiff in (AutoForwardDiff(), AutoFiniteDiff(; fdtype = Val(:central)),
     AutoSparseForwardDiff(), AutoFiniteDiff(; fdtype = Val(:forward)),
     AutoSparseFiniteDiff())
     nlsolve = NewtonRaphson(; autodiff)
-    @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-13,
-        reltol = 1e-13)
+    @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-13, reltol = 1e-13)
     cur_bc!(resid_f, sol, nothing, sol.t)
     @test norm(resid_f, Inf) < TestTol
 end

From 1dba481ab6939603df51b9e279baf2330207758d Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 13 Sep 2023 15:23:23 -0400
Subject: [PATCH 02/28] Allow non vector inputs for Single Shooting

---
 src/solve/single_shooting.jl | 35 +++++++++++++++++++----------------
 test/non_vector_inputs.jl    |  7 +++++++
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/solve/single_shooting.jl b/src/solve/single_shooting.jl
index 52e32501..acf21650 100644
--- a/src/solve/single_shooting.jl
+++ b/src/solve/single_shooting.jl
@@ -1,28 +1,31 @@
-# TODO: Differentiate between nlsolve kwargs and odesolve kwargs
 # TODO: Support Non-Vector Inputs
-function SciMLBase.__solve(prob::BVProblem, alg::Shooting; kwargs...)
-    iip = isinplace(prob)
-    bc = prob.f.bc
-    u0 = deepcopy(prob.u0)
+function SciMLBase.__solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;),
+    nlsolve_kwargs = (;), kwargs...)
+    iip, bc, u0, u0_size = isinplace(prob), prob.f.bc, deepcopy(prob.u0), size(prob.u0)
     loss_fn = if iip
-        function loss!(resid, u0, p)
-            tmp_prob = ODEProblem{iip}(prob.f, u0, prob.tspan, p)
-            internal_sol = solve(tmp_prob, alg.ode_alg; kwargs...)
-            eval_bc_residual!(resid, prob.problem_type, bc, internal_sol, p)
+        resid_size = prob.f.bcresid_prototype === nothing ? u0_size :
+                     size(prob.f.bcresid_prototype)
+        function loss!(resid, u0_, p)
+            u0_internal = reshape(u0_, u0_size)
+            tmp_prob = ODEProblem{iip}(prob.f, u0_internal, prob.tspan, p)
+            internal_sol = solve(tmp_prob, alg.ode_alg; odesolve_kwargs..., kwargs...)
+            eval_bc_residual!(reshape(resid, resid_size), prob.problem_type, bc,
+                internal_sol, p)
             return nothing
         end
     else
-        function loss(u0, p)
-            tmp_prob = ODEProblem(prob.f, u0, prob.tspan, p)
-            internal_sol = solve(tmp_prob, alg.ode_alg; kwargs...)
-            return eval_bc_residual(prob.problem_type, bc, internal_sol, p)
+        function loss(u0_, p)
+            u0_internal = reshape(u0_, u0_size)
+            tmp_prob = ODEProblem(prob.f, u0_internal, prob.tspan, p)
+            internal_sol = solve(tmp_prob, alg.ode_alg; odesolve_kwargs..., kwargs...)
+            return vec(eval_bc_residual(prob.problem_type, bc, internal_sol, p))
         end
     end
     opt = solve(NonlinearProblem(NonlinearFunction{iip}(loss_fn; prob.f.jac_prototype,
-                resid_prototype = prob.f.bcresid_prototype), u0, prob.p), alg.nlsolve;
+                resid_prototype = prob.f.bcresid_prototype), vec(u0), prob.p), alg.nlsolve;
         kwargs...)
-    sol_prob = ODEProblem{iip}(prob.f, opt.u, prob.tspan, prob.p)
-    sol = solve(sol_prob, alg.ode_alg; kwargs...)
+    sol_prob = ODEProblem{iip}(prob.f, reshape(opt.u, u0_size), prob.tspan, prob.p)
+    sol = solve(sol_prob, alg.ode_alg; nlsolve_kwargs..., kwargs...)
     return DiffEqBase.solution_new_retcode(sol,
         sol.retcode == opt.retcode ? ReturnCode.Success : ReturnCode.Failure)
 end
diff --git a/test/non_vector_inputs.jl b/test/non_vector_inputs.jl
index 52f6c81c..99aa0fb2 100644
--- a/test/non_vector_inputs.jl
+++ b/test/non_vector_inputs.jl
@@ -50,4 +50,11 @@ probs = [
             @test norm(diff(first.(sol.u)) .+ 0.2, Inf) + abs(sol[1][1] - 5) < 0.01
         end
     end
+
+    @testset "Single Shooting" begin
+        for prob in probs
+            @time sol = solve(prob, Shooting(Tsit5()))
+            @test norm(boundary(sol, prob.p, nothing)) < 0.01
+        end
+    end
 end

From a6f41e7496e3ef6d2ae5ddb9241ab178a90ac35a Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 13 Sep 2023 17:07:31 -0400
Subject: [PATCH 03/28] Working version of Multiple Shooting :tada:

---
 src/BoundaryValueDiffEq.jl     |   3 +-
 src/algorithms.jl              |  27 +++++-
 src/solve/multiple_shooting.jl | 171 +++++++++++++++++++++++++++++++++
 test/shooting_tests.jl         |  62 +++++++-----
 4 files changed, 237 insertions(+), 26 deletions(-)
 create mode 100644 src/solve/multiple_shooting.jl

diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index b6d6b300..2b7b73a2 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -25,6 +25,7 @@ include("cache.jl")
 include("collocation.jl")
 include("nlprob.jl")
 include("solve/single_shooting.jl")
+include("solve/multiple_shooting.jl")
 include("solve/mirk.jl")
 include("adaptivity.jl")
 include("interpolation.jl")
@@ -35,7 +36,7 @@ function SciMLBase.__solve(prob::BVProblem, alg::BoundaryValueDiffEqAlgorithm, a
     return solve!(cache)
 end
 
-export Shooting
+export Shooting, MultipleShooting
 export MIRK2, MIRK3, MIRK4, MIRK5, MIRK6
 export MIRKJacobianComputationAlgorithm
 # From ODEInterface.jl
diff --git a/src/algorithms.jl b/src/algorithms.jl
index 5aff9f08..afc0f6c3 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -7,7 +7,7 @@ abstract type BoundaryValueDiffEqAlgorithm <: SciMLBase.AbstractBVPAlgorithm end
 abstract type AbstractMIRK <: BoundaryValueDiffEqAlgorithm end
 
 """
-    Shooting(ode_alg; nlsolve = BoundaryValueDiffEq.DEFAULT_NLSOLVE_SHOOTING)
+    Shooting(ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOTING)
 
 Single shooting method, reduces BVP to an initial value problem and solves the IVP.
 """
@@ -18,6 +18,31 @@ end
 
 Shooting(ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOTING) = Shooting(ode_alg, nlsolve)
 
+"""
+    MultipleShooting(nshoots::Int, ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOTING,
+        grid_coarsening = true)
+
+Multiple Shooting method, reduces BVP to an initial value problem and solves the IVP.
+Significantly more stable than Single Shooting.
+"""
+@concrete struct MultipleShooting
+    ode_alg
+    nlsolve
+    nshoots::Int
+    grid_coarsening
+end
+
+function MultipleShooting(nshoots::Int, ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOTING,
+    grid_coarsening = true)
+    @assert grid_coarsening isa Bool || grid_coarsening isa Function || grid_coarsening isa AbstractVector{<:Integer} || grid_coarsening isa NTuple{N, <:Integer} where {N}
+    grid_coarsening isa Tuple && (grid_coarsening = Vector(grid_coarsening...))
+    if grid_coarsening isa AbstractVector
+        sort!(grid_coarsening; rev=true)
+        @assert all(grid_coarsening .> 0) && 1 ∉ grid_coarsening
+    end
+    return MultipleShooting(ode_alg, nlsolve, nshoots, grid_coarsening)
+end
+
 for order in (2, 3, 4, 5, 6)
     alg = Symbol("MIRK$(order)")
 
diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
new file mode 100644
index 00000000..01b465a8
--- /dev/null
+++ b/src/solve/multiple_shooting.jl
@@ -0,0 +1,171 @@
+# TODO: incorporate `initial_guess` similar to MIRK methods
+function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
+    nlsolve_kwargs = (;), kwargs...)
+    @unpack f, bc, tspan = prob
+    bcresid_prototype = prob.f.bcresid_prototype === nothing ? similar(prob.u0) :
+                        prob.f.bcresid_prototype
+    N, u0_size, nshoots, iip = length(prob.u0), size(prob.u0), alg.nshoots, isinplace(prob)
+
+    @views function loss!(resid::ArrayPartition, us, p, cur_nshoots, nodes)
+        ts_ = Vector{Vector{typeof(first(tspan))}}(undef, cur_nshoots)
+        us_ = Vector{Vector{typeof(us)}}(undef, cur_nshoots)
+
+        resid_bc, resid_nodes = resid.x[1], resid.x[2]
+
+        for i in 1:cur_nshoots
+            local odeprob = ODEProblem{iip}(f,
+                reshape(us[((i - 1) * N + 1):(i * N)], u0_size), (nodes[i], nodes[i + 1]),
+                p)
+            sol = solve(odeprob, alg.ode_alg; odesolve_kwargs..., kwargs...,
+                save_end = true, save_everystep = false)
+
+            ts_[i] = sol.t
+            us_[i] = sol.u
+
+            resid_nodes[((i - 1) * N + 1):(i * N)] .= vec(us[(i * N + 1):((i + 1) * N)]) .-
+                                                      vec(sol.u[end])
+        end
+
+        _ts = foldl(vcat, ts_)
+        _us = foldl(vcat, us_)
+
+        # Boundary conditions
+        # Builds an ODESolution object to keep the framework for bc(,,) consistent
+        odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
+        total_solution = SciMLBase.build_solution(odeprob, nothing, _ts, _us)
+
+        if iip
+            eval_bc_residual!(resid_bc, prob.problem_type, bc, total_solution, p)
+        else
+            resid_bc .= eval_bc_residual(prob.problem_type, bc, total_solution, p)
+        end
+
+        return resid
+    end
+
+    # This gets all the nshoots except the final SingleShooting case
+    all_nshoots = get_all_nshoots(alg)
+    u_at_nodes, nodes = nothing, nothing
+
+    for (i, cur_nshoot) in enumerate(all_nshoots)
+        if i == 1
+            nodes, u_at_nodes = multiple_shooting_initialize(prob, alg; odesolve_kwargs,
+                kwargs...)
+        else
+            nodes, u_at_nodes = multiple_shooting_initialize(u_at_nodes, prob, alg, nodes,
+                cur_nshoot, all_nshoots[i - 1]; odesolve_kwargs, kwargs...)
+        end
+
+        resid_prototype = ArrayPartition(bcresid_prototype,
+            similar(u_at_nodes, cur_nshoot * N))
+        loss_function! = NonlinearFunction{true}((args...) -> loss!(args...,
+                cur_nshoot, nodes); resid_prototype)
+        nlprob = NonlinearProblem(loss_function!, u_at_nodes, prob.p)
+        sol_nlsolve = solve(nlprob, alg.nlsolve; nlsolve_kwargs..., kwargs...)
+        u_at_nodes = sol_nlsolve.u
+    end
+
+    single_shooting_prob = remake(prob; u0 = reshape(u_at_nodes[1:N], u0_size))
+    return SciMLBase.__solve(single_shooting_prob, Shooting(alg.ode_alg; alg.nlsolve);
+        odesolve_kwargs, nlsolve_kwargs, kwargs...)
+end
+
+function multiple_shooting_initialize(prob, alg::MultipleShooting; odesolve_kwargs = (;),
+    kwargs...)
+    @unpack f, bc, u0, tspan, p = prob
+    @unpack ode_alg, nshoots = alg
+
+    N = length(u0)
+    nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
+
+    # Ensures type stability in case the parameters are dual numbers
+    if !(typeof(p) <: SciMLBase.NullParameters)
+        if !isconcretetype(eltype(p))
+            @warn "Type inference will fail if eltype(p) is not a concrete type"
+        end
+        u_at_nodes = similar(u0, promote_type(eltype(u0), eltype(p)), (nshoots + 1) * N)
+    else
+        u_at_nodes = similar(u0, (nshoots + 1) * N)
+    end
+
+    # Assumes no initial guess for now
+    start_prob = ODEProblem{isinplace(prob)}(f, u0, tspan, p)
+    sol = solve(start_prob, ode_alg; odesolve_kwargs..., kwargs..., saveat = nodes)
+
+    if SciMLBase.successful_retcode(sol)
+        u_at_nodes[1:N] .= sol.u[1]
+        for i in 2:(nshoots + 1)
+            u_at_nodes[(N + (i - 2) * N) .+ (1:N)] .= sol.u[i]
+        end
+    else
+        @warn "Initialization using odesolve failed. Initializing using 0s. It is \
+               recommended to provide an `initial_guess` function in this case."
+        fill!(u_at_nodes, 0)
+    end
+
+    return nodes, u_at_nodes
+end
+
+@views @inline function multiple_shooting_initialize(u_at_nodes_prev, prob, alg,
+    prev_nodes, nshoots, old_nshoots; odesolve_kwargs = (;), kwargs...)
+    @unpack f, bc, u0, tspan, p = prob
+    nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
+    N = length(u0)
+
+    u_at_nodes = similar(u_at_nodes_prev, N + nshoots * N)
+    u_at_nodes[1:N] .= u_at_nodes_prev[1:N]
+    u_at_nodes[(end - N + 1):end] .= u_at_nodes_prev[(end - N + 1):end]
+
+    skipsize = old_nshoots / nshoots
+    for i in 2:nshoots
+        pos = skipsize * (i - 1) + 1
+        idxs = (N + (i - 2) * N) .+ (1:N)
+        if isinteger(pos)
+            # If the current node is also a node of the finer grid
+            ind = trunc(Int, pos)
+            idxs_prev = (N + (ind - 2) * N .+ (1:N))
+            u_at_nodes[idxs] .= u_at_nodes_prev[idxs_prev]
+        else
+            # If the current node is not a node of the finer grid simulate from closest
+            # previous node and take result from simulation
+            fpos = floor(Int, pos)
+            r = pos - fpos
+
+            t0 = prev_nodes[fpos]
+            tf = prev_nodes[fpos + 1]
+            tstop = t0 + r * (tf - t0)
+
+            idxs_prev = (N + (fpos - 2) * N .+ (1:N))
+            ustart = u_at_nodes_prev[idxs_prev]
+
+            odeprob = ODEProblem(f, ustart, (t0, tstop), p)
+            odesol = solve(odeprob, alg.ode_alg; odesolve_kwargs..., kwargs...,
+                saveat = (), save_end = true)
+
+            u_at_nodes[idxs] .= odesol.u[end]
+        end
+    end
+
+    return nodes, u_at_nodes
+end
+
+@inline function get_all_nshoots(alg::MultipleShooting)
+    @unpack nshoots, grid_coarsening = alg
+    if grid_coarsening isa Bool
+        !grid_coarsening && return [nshoots]
+        update_fn = Base.Fix2(÷, 2)
+    elseif grid_coarsening isa Function
+        update_fn = grid_coarsening
+    else
+        grid_coarsening[1] == nshoots && return grid_coarsening
+        return vcat(nshoots, grid_coarsening)
+    end
+    nshoots_vec = Int[nshoots]
+    next = update_fn(nshoots)
+    while next > 1
+        push!(nshoots_vec, next)
+        next = update_fn(last(nshoots_vec))
+    end
+    @assert !(1 in nshoots_vec)
+    return nshoots_vec
+end
diff --git a/test/shooting_tests.jl b/test/shooting_tests.jl
index 4a4443ad..c7935535 100644
--- a/test/shooting_tests.jl
+++ b/test/shooting_tests.jl
@@ -2,6 +2,8 @@ using BoundaryValueDiffEq, LinearAlgebra, OrdinaryDiffEq, Test
 
 @info "Shooting method"
 
+SOLVERS = [Shooting(Tsit5()), MultipleShooting(10, Tsit5())]
+
 @info "Multi-Point BVProblem" # Not really but using that API
 
 tspan = (0.0, 100.0)
@@ -23,11 +25,13 @@ end
 
 bvp1 = BVProblem(f1!, bc1!, u0, tspan)
 @test SciMLBase.isinplace(bvp1)
-resid_f = Array{Float64}(undef, 2)
-sol = solve(bvp1, Shooting(Tsit5()); abstol = 1e-13, reltol = 1e-13)
-@test SciMLBase.successful_retcode(sol)
-bc1!(resid_f, sol, nothing, sol.t)
-@test norm(resid_f) < 1e-12
+for solver in SOLVERS
+    resid_f = Array{Float64}(undef, 2)
+    sol = solve(bvp1, solver; abstol = 1e-13, reltol = 1e-13)
+    @test SciMLBase.successful_retcode(sol)
+    bc1!(resid_f, sol, nothing, sol.t)
+    @test norm(resid_f) < 1e-12
+end
 
 # Out of Place
 f1(u, p, t) = [u[2], -u[1]]
@@ -42,10 +46,12 @@ end
 
 bvp2 = BVProblem(f1, bc1, u0, tspan)
 @test !SciMLBase.isinplace(bvp2)
-sol = solve(bvp2, Shooting(Tsit5()); abstol = 1e-13, reltol = 1e-13)
-@test SciMLBase.successful_retcode(sol)
-resid_f = bc1(sol, nothing, sol.t)
-@test norm(resid_f) < 1e-12
+for solver in SOLVERS
+    sol = solve(bvp2, solver; abstol = 1e-13, reltol = 1e-13)
+    @test SciMLBase.successful_retcode(sol)
+    resid_f = bc1(sol, nothing, sol.t)
+    @test norm(resid_f) < 1e-12
+end
 
 @info "Two Point BVProblem" # Not really but using that API
 
@@ -63,12 +69,15 @@ end
 bvp3 = TwoPointBVProblem(f1!, (bc2a!, bc2b!), u0, tspan;
     bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1)))
 @test SciMLBase.isinplace(bvp3)
-sol = solve(bvp3, Shooting(Tsit5()); abstol = 1e-13, reltol = 1e-13)
-@test SciMLBase.successful_retcode(sol)
-resid_f = (Array{Float64, 1}(undef, 1), Array{Float64, 1}(undef, 1))
-bc2a!(resid_f[1], sol(tspan[1]), nothing)
-bc2b!(resid_f[2], sol(tspan[2]), nothing)
-@test norm(reduce(vcat, resid_f)) < 1e-11
+for solver in SOLVERS
+    sol = solve(bvp3, solver)
+    @test SciMLBase.successful_retcode(sol; abstol = 1e-13, reltol = 1e-13)
+    @test SciMLBase.successful_retcode(sol)
+    resid_f = (Array{Float64, 1}(undef, 1), Array{Float64, 1}(undef, 1))
+    bc2a!(resid_f[1], sol(tspan[1]), nothing)
+    bc2b!(resid_f[2], sol(tspan[2]), nothing)
+    @test norm(reduce(vcat, resid_f)) < 1e-11
+end
 
 # Out of Place
 bc2a(ua, p) = [ua[1]]
@@ -76,17 +85,22 @@ bc2b(ub, p) = [ub[1] - 1]
 
 bvp4 = TwoPointBVProblem(f1, (bc2a, bc2b), u0, tspan)
 @test !SciMLBase.isinplace(bvp4)
-sol = solve(bvp4, Shooting(Tsit5()); abstol = 1e-13, reltol = 1e-13)
-@test SciMLBase.successful_retcode(sol)
-resid_f = reduce(vcat, (bc2a(sol(tspan[1]), nothing), bc2b(sol(tspan[2]), nothing)))
-@test norm(resid_f) < 1e-12
+for solver in SOLVERS
+    sol = solve(bvp4, solver; abstol = 1e-13, reltol = 1e-13)
+    @test SciMLBase.successful_retcode(sol)
+    resid_f = reduce(vcat, (bc2a(sol(tspan[1]), nothing), bc2b(sol(tspan[2]), nothing)))
+    @test norm(resid_f) < 1e-12
+end
 
 #Test for complex values
 u0 = [0.0, 1.0] .+ 1im
 bvp = BVProblem(f1!, bc1!, u0, tspan)
 resid_f = Array{ComplexF64}(undef, 2)
-sol = solve(bvp, Shooting(Tsit5(); nlsolve = NewtonRaphson(; autodiff = AutoFiniteDiff()));
-    abstol = 1e-13, reltol = 1e-13)
-resid_f = Array{ComplexF64}(undef, 2)
-bc1!(resid_f, sol, nothing, sol.t)
-@test norm(resid_f) < 1e-12
+
+nlsolve = NewtonRaphson(; autodiff = AutoFiniteDiff())
+for solver in [Shooting(Tsit5(); nlsolve), MultipleShooting(10, Tsit5(); nlsolve)]
+    sol = solve(bvp, solver; abstol = 1e-13, reltol = 1e-13)
+    @test SciMLBase.successful_retcode(sol)
+    bc1!(resid_f, sol, nothing, sol.t)
+    @test norm(resid_f) < 1e-12
+end

From 3cba674b0331568131f637a9335784c47e0ebb69 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Fri, 22 Sep 2023 20:04:06 -0400
Subject: [PATCH 04/28] Tests

---
 src/algorithms.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/algorithms.jl b/src/algorithms.jl
index afc0f6c3..e4701537 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -34,10 +34,12 @@ end
 
 function MultipleShooting(nshoots::Int, ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOTING,
     grid_coarsening = true)
-    @assert grid_coarsening isa Bool || grid_coarsening isa Function || grid_coarsening isa AbstractVector{<:Integer} || grid_coarsening isa NTuple{N, <:Integer} where {N}
+    @assert grid_coarsening isa Bool || grid_coarsening isa Function ||
+            grid_coarsening isa AbstractVector{<:Integer} ||
+            grid_coarsening isa NTuple{N, <:Integer} where {N}
     grid_coarsening isa Tuple && (grid_coarsening = Vector(grid_coarsening...))
     if grid_coarsening isa AbstractVector
-        sort!(grid_coarsening; rev=true)
+        sort!(grid_coarsening; rev = true)
         @assert all(grid_coarsening .> 0) && 1 ∉ grid_coarsening
     end
     return MultipleShooting(ode_alg, nlsolve, nshoots, grid_coarsening)

From 9ebd818bad1d4bb81112c557085a09c826e560af Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Fri, 22 Sep 2023 20:23:54 -0400
Subject: [PATCH 05/28] orbital multiple shooting

---
 test/orbital.jl | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/test/orbital.jl b/test/orbital.jl
index 2a6dcb64..3529d04d 100644
--- a/test/orbital.jl
+++ b/test/orbital.jl
@@ -79,6 +79,12 @@ for autodiff in (AutoForwardDiff(), AutoFiniteDiff(; fdtype = Val(:central)),
     @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-13, reltol = 1e-13)
     cur_bc!(resid_f, sol, nothing, sol.t)
     @test norm(resid_f, Inf) < TestTol
+
+    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve); abstol = 1e-6,
+        reltol = 1e-6)
+    @test SciMLBase.successful_retcode(sol)
+    cur_bc!(resid_f, sol, nothing, sol.t)
+    @test norm(resid_f, Inf) < 1e-6
 end
 
 ### Using the TwoPoint BVP Structure
@@ -93,4 +99,11 @@ for autodiff in (AutoForwardDiff(), AutoFiniteDiff(; fdtype = Val(:central)),
     cur_bc_2point_a!(resid_f_2p[1], sol(t0), nothing)
     cur_bc_2point_b!(resid_f_2p[2], sol(t1), nothing)
     @test norm(reduce(vcat, resid_f_2p), Inf) < TestTol
+
+    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve); abstol = 1e-6,
+        reltol = 1e-6)
+    @test SciMLBase.successful_retcode(sol)
+    cur_bc_2point_a!(resid_f_2p[1], sol(t0), nothing)
+    cur_bc_2point_b!(resid_f_2p[2], sol(t1), nothing)
+    @test norm(reduce(vcat, resid_f_2p), Inf) < TestTol
 end

From b11a35a2388960c264e8bb7d97757a3ae07ebe56 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Fri, 22 Sep 2023 21:32:47 -0400
Subject: [PATCH 06/28] Add type stability tests

---
 src/solve/single_shooting.jl               |  5 +--
 test/{ => mirk}/ensemble.jl                |  0
 test/{ => mirk}/mirk_convergence_tests.jl  |  0
 test/{ => mirk}/vectorofvector_initials.jl |  0
 test/{ => misc}/non_vector_inputs.jl       |  0
 test/{ => misc}/odeinterface_ex7.jl        |  0
 test/misc/type_stability.jl                | 51 ++++++++++++++++++++++
 test/runtests.jl                           | 26 ++++++-----
 test/{ => shooting}/orbital.jl             |  0
 test/{ => shooting}/shooting_tests.jl      |  0
 10 files changed, 67 insertions(+), 15 deletions(-)
 rename test/{ => mirk}/ensemble.jl (100%)
 rename test/{ => mirk}/mirk_convergence_tests.jl (100%)
 rename test/{ => mirk}/vectorofvector_initials.jl (100%)
 rename test/{ => misc}/non_vector_inputs.jl (100%)
 rename test/{ => misc}/odeinterface_ex7.jl (100%)
 create mode 100644 test/misc/type_stability.jl
 rename test/{ => shooting}/orbital.jl (100%)
 rename test/{ => shooting}/shooting_tests.jl (100%)

diff --git a/src/solve/single_shooting.jl b/src/solve/single_shooting.jl
index acf21650..036ce14a 100644
--- a/src/solve/single_shooting.jl
+++ b/src/solve/single_shooting.jl
@@ -1,10 +1,9 @@
-# TODO: Support Non-Vector Inputs
 function SciMLBase.__solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;),
     nlsolve_kwargs = (;), kwargs...)
     iip, bc, u0, u0_size = isinplace(prob), prob.f.bc, deepcopy(prob.u0), size(prob.u0)
+    resid_size = prob.f.bcresid_prototype === nothing ? u0_size :
+                 size(prob.f.bcresid_prototype)
     loss_fn = if iip
-        resid_size = prob.f.bcresid_prototype === nothing ? u0_size :
-                     size(prob.f.bcresid_prototype)
         function loss!(resid, u0_, p)
             u0_internal = reshape(u0_, u0_size)
             tmp_prob = ODEProblem{iip}(prob.f, u0_internal, prob.tspan, p)
diff --git a/test/ensemble.jl b/test/mirk/ensemble.jl
similarity index 100%
rename from test/ensemble.jl
rename to test/mirk/ensemble.jl
diff --git a/test/mirk_convergence_tests.jl b/test/mirk/mirk_convergence_tests.jl
similarity index 100%
rename from test/mirk_convergence_tests.jl
rename to test/mirk/mirk_convergence_tests.jl
diff --git a/test/vectorofvector_initials.jl b/test/mirk/vectorofvector_initials.jl
similarity index 100%
rename from test/vectorofvector_initials.jl
rename to test/mirk/vectorofvector_initials.jl
diff --git a/test/non_vector_inputs.jl b/test/misc/non_vector_inputs.jl
similarity index 100%
rename from test/non_vector_inputs.jl
rename to test/misc/non_vector_inputs.jl
diff --git a/test/odeinterface_ex7.jl b/test/misc/odeinterface_ex7.jl
similarity index 100%
rename from test/odeinterface_ex7.jl
rename to test/misc/odeinterface_ex7.jl
diff --git a/test/misc/type_stability.jl b/test/misc/type_stability.jl
new file mode 100644
index 00000000..a035f4af
--- /dev/null
+++ b/test/misc/type_stability.jl
@@ -0,0 +1,51 @@
+using BoundaryValueDiffEq, OrdinaryDiffEq, LinearAlgebra, Test
+
+f(u, p, t) = [p[1] * u[1] - p[2] * u[1] * u[2], p[3] * u[1] * u[2] - p[4] * u[2]]
+function f!(du, u, p, t)
+    du[1] = p[1] * u[1] - p[2] * u[1] * u[2]
+    du[2] = p[3] * u[1] * u[2] - p[4] * u[2]
+end
+
+bc(sol, p, t) = [sol[1][1] - 1, sol[end][2] - 2]
+function bc!(res, sol, p, t)
+    res[1] = sol[1][1] - 1
+    res[2] = sol[end][2] - 2
+end
+twobc((ua, ub), p) = ([ua[1] - 1], [ub[2] - 2])
+function twobc!((resa, resb), (ua, ub), p)
+    resa[1] = ua[1] - 1
+    resb[1] = ub[2] - 2
+end
+
+u0 = Float64[0, 0]
+tspan = (0.0, 1.0)
+p = [1.0, 1.0, 1.0, 1.0]
+bcresid_prototype = (zeros(1), zeros(1))
+
+# Multi-Point BVP
+mpbvp_iip = BVProblem(f!, bc!, u0, tspan, p)
+mpbvp_oop = BVProblem(f, bc, u0, tspan, p)
+
+@inferred solve(mpbvp_iip, Shooting(Tsit5()))
+@inferred solve(mpbvp_oop, Shooting(Tsit5()))
+@inferred solve(mpbvp_iip, MultipleShooting(5, Tsit5()))
+@inferred solve(mpbvp_oop, MultipleShooting(5, Tsit5()))
+
+for solver in (MIRK2(), MIRK3(), MIRK4(), MIRK5(), MIRK6())
+    @inferred solve(mpbvp_iip, solver; dt = 0.2)
+    @inferred solve(mpbvp_oop, solver; dt = 0.2)
+end
+
+# Two-Point BVP
+tpbvp_iip = TwoPointBVProblem(f!, twobc!, u0, tspan, p; bcresid_prototype)
+tpbvp_oop = TwoPointBVProblem(f, twobc, u0, tspan, p)
+
+@inferred solve(tpbvp_iip, Shooting(Tsit5()))
+@inferred solve(tpbvp_oop, Shooting(Tsit5()))
+@inferred solve(tpbvp_iip, MultipleShooting(5, Tsit5()))
+@inferred solve(tpbvp_oop, MultipleShooting(5, Tsit5()))
+
+for solver in (MIRK2(), MIRK3(), MIRK4(), MIRK5(), MIRK6())
+    @inferred solve(tpbvp_iip, solver; dt = 0.2)
+    @inferred solve(tpbvp_oop, solver; dt = 0.2)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index af8b3bbd..0d404a56 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,34 +3,36 @@ using Test, SafeTestsets
 @testset "Boundary Value Problem Tests" begin
     @time @testset "Shooting Method Tests" begin
         @time @safetestset "Shooting Tests" begin
-            include("shooting_tests.jl")
+            include("shooting/shooting_tests.jl")
         end
         @time @safetestset "Orbital" begin
-            include("orbital.jl")
+            include("shooting/orbital.jl")
         end
     end
 
     @time @testset "Collocation Method (MIRK) Tests" begin
         @time @safetestset "Ensemble" begin
-            include("ensemble.jl")
+            include("mirk/ensemble.jl")
         end
         @time @safetestset "MIRK Convergence Tests" begin
-            include("mirk_convergence_tests.jl")
+            include("mirk/mirk_convergence_tests.jl")
         end
         @time @safetestset "Vector of Vector" begin
-            include("vectorofvector_initials.jl")
+            include("mirk/vectorofvector_initials.jl")
         end
     end
 
-    @time @testset "ODE Interface Solvers" begin
-        @time @safetestset "ODE Interface Tests" begin
-            include("odeinterface_ex7.jl")
+    @time @testset "Miscelleneous" begin
+        @time @safetestset "Non Vector Inputs" begin
+            include("misc/non_vector_inputs.jl")
         end
-    end
 
-    @time @testset "Non Vector Inputs Tests" begin
-        @time @safetestset "Non Vector Inputs" begin
-            include("non_vector_inputs.jl")
+        @time @safetestset "Type Stability" begin
+            include("misc/type_stability.jl")
+        end
+
+        @time @safetestset "ODE Interface Tests" begin
+            include("misc/odeinterface_ex7.jl")
         end
     end
 end
diff --git a/test/orbital.jl b/test/shooting/orbital.jl
similarity index 100%
rename from test/orbital.jl
rename to test/shooting/orbital.jl
diff --git a/test/shooting_tests.jl b/test/shooting/shooting_tests.jl
similarity index 100%
rename from test/shooting_tests.jl
rename to test/shooting/shooting_tests.jl

From 7f81251a6640818cdfde717db499fae6b87838ec Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Fri, 22 Sep 2023 22:46:39 -0400
Subject: [PATCH 07/28] Port shooting tests from NeuralBVP

---
 test/shooting/shooting_tests.jl | 326 ++++++++++++++++++++++++--------
 1 file changed, 242 insertions(+), 84 deletions(-)

diff --git a/test/shooting/shooting_tests.jl b/test/shooting/shooting_tests.jl
index c7935535..f70ad9f3 100644
--- a/test/shooting/shooting_tests.jl
+++ b/test/shooting/shooting_tests.jl
@@ -1,106 +1,264 @@
 using BoundaryValueDiffEq, LinearAlgebra, OrdinaryDiffEq, Test
 
-@info "Shooting method"
+@testset "Basic Shooting Tests" begin
+    SOLVERS = [Shooting(Tsit5()), MultipleShooting(10, Tsit5())]
 
-SOLVERS = [Shooting(Tsit5()), MultipleShooting(10, Tsit5())]
+    tspan = (0.0, 100.0)
+    u0 = [0.0, 1.0]
 
-@info "Multi-Point BVProblem" # Not really but using that API
+    # Inplace
+    function f1!(du, u, p, t)
+        du[1] = u[2]
+        du[2] = -u[1]
+        return nothing
+    end
 
-tspan = (0.0, 100.0)
-u0 = [0.0, 1.0]
+    function bc1!(resid, sol, p, t)
+        t₀, t₁ = first(t), last(t)
+        resid[1] = sol(t₀)[1]
+        resid[2] = sol(t₁)[1] - 1
+        return nothing
+    end
 
-# Inplace
-function f1!(du, u, p, t)
-    du[1] = u[2]
-    du[2] = -u[1]
-    return nothing
-end
+    bvp1 = BVProblem(f1!, bc1!, u0, tspan)
+    @test SciMLBase.isinplace(bvp1)
+    for solver in SOLVERS
+        resid_f = Array{Float64}(undef, 2)
+        sol = solve(bvp1, solver; abstol = 1e-13, reltol = 1e-13)
+        @test SciMLBase.successful_retcode(sol)
+        bc1!(resid_f, sol, nothing, sol.t)
+        @test norm(resid_f) < 1e-12
+    end
 
-function bc1!(resid, sol, p, t)
-    t₀, t₁ = first(t), last(t)
-    resid[1] = sol(t₀)[1]
-    resid[2] = sol(t₁)[1] - 1
-    return nothing
-end
+    # Out of Place
+    f1(u, p, t) = [u[2], -u[1]]
 
-bvp1 = BVProblem(f1!, bc1!, u0, tspan)
-@test SciMLBase.isinplace(bvp1)
-for solver in SOLVERS
-    resid_f = Array{Float64}(undef, 2)
-    sol = solve(bvp1, solver; abstol = 1e-13, reltol = 1e-13)
-    @test SciMLBase.successful_retcode(sol)
-    bc1!(resid_f, sol, nothing, sol.t)
-    @test norm(resid_f) < 1e-12
-end
+    function bc1(sol, p, t)
+        t₀, t₁ = first(t), last(t)
+        return [sol(t₀)[1], sol(t₁)[1] - 1]
+    end
 
-# Out of Place
-f1(u, p, t) = [u[2], -u[1]]
+    @test_throws SciMLBase.NonconformingFunctionsError BVProblem(f1!, bc1, u0, tspan)
+    @test_throws SciMLBase.NonconformingFunctionsError BVProblem(f1, bc1!, u0, tspan)
 
-function bc1(sol, p, t)
-    t₀, t₁ = first(t), last(t)
-    return [sol(t₀)[1], sol(t₁)[1] - 1]
-end
+    bvp2 = BVProblem(f1, bc1, u0, tspan)
+    @test !SciMLBase.isinplace(bvp2)
+    for solver in SOLVERS
+        sol = solve(bvp2, solver; abstol = 1e-13, reltol = 1e-13)
+        @test SciMLBase.successful_retcode(sol)
+        resid_f = bc1(sol, nothing, sol.t)
+        @test norm(resid_f) < 1e-12
+    end
 
-@test_throws SciMLBase.NonconformingFunctionsError BVProblem(f1!, bc1, u0, tspan)
-@test_throws SciMLBase.NonconformingFunctionsError BVProblem(f1, bc1!, u0, tspan)
+    # Inplace
+    bc2_a!(resid, ua, p) = (resid[1] = ua[1])
+    bc2_b!(resid, ub, p) = (resid[1] = ub[1] - 1)
 
-bvp2 = BVProblem(f1, bc1, u0, tspan)
-@test !SciMLBase.isinplace(bvp2)
-for solver in SOLVERS
-    sol = solve(bvp2, solver; abstol = 1e-13, reltol = 1e-13)
-    @test SciMLBase.successful_retcode(sol)
-    resid_f = bc1(sol, nothing, sol.t)
-    @test norm(resid_f) < 1e-12
-end
+    bvp3 = TwoPointBVProblem(f1!, (bc2a!, bc2b!), u0, tspan;
+        bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1)))
+    @test SciMLBase.isinplace(bvp3)
+    for solver in SOLVERS
+        sol = solve(bvp3, solver; abstol = 1e-13, reltol = 1e-13)
+        @test SciMLBase.successful_retcode(sol)
+        resid_f = (Array{Float64, 1}(undef, 1), Array{Float64, 1}(undef, 1))
+        bc2a!(resid_f[1], sol(tspan[1]), nothing)
+        bc2b!(resid_f[2], sol(tspan[2]), nothing)
+        @test norm(reduce(vcat, resid_f)) < 1e-11
+    end
 
-@info "Two Point BVProblem" # Not really but using that API
+    # Out of Place
+    bc2a(ua, p) = [ua[1]]
+    bc2b(ub, p) = [ub[1] - 1]
 
-# Inplace
-function bc2a!(resid, ua, p)
-    resid[1] = ua[1]
-    return nothing
+    bvp4 = TwoPointBVProblem(f1, (bc2a, bc2b), u0, tspan)
+    @test !SciMLBase.isinplace(bvp4)
+    for solver in SOLVERS
+        sol = solve(bvp4, solver; abstol = 1e-13, reltol = 1e-13)
+        @test SciMLBase.successful_retcode(sol)
+        resid_f = reduce(vcat, (bc2a(sol(tspan[1]), nothing), bc2b(sol(tspan[2]), nothing)))
+        @test norm(resid_f) < 1e-12
+    end
 end
 
-function bc2b!(resid, ub, p)
-    resid[1] = ub[1] - 1
-    return nothing
-end
+@testset "Shooting with Complex Values" begin
+    # Test for complex values
+    u0 = [0.0, 1.0] .+ 1im
+    bvp = BVProblem(f1!, bc1!, u0, tspan)
+    resid_f = Array{ComplexF64}(undef, 2)
 
-bvp3 = TwoPointBVProblem(f1!, (bc2a!, bc2b!), u0, tspan;
-    bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1)))
-@test SciMLBase.isinplace(bvp3)
-for solver in SOLVERS
-    sol = solve(bvp3, solver)
-    @test SciMLBase.successful_retcode(sol; abstol = 1e-13, reltol = 1e-13)
-    @test SciMLBase.successful_retcode(sol)
-    resid_f = (Array{Float64, 1}(undef, 1), Array{Float64, 1}(undef, 1))
-    bc2a!(resid_f[1], sol(tspan[1]), nothing)
-    bc2b!(resid_f[2], sol(tspan[2]), nothing)
-    @test norm(reduce(vcat, resid_f)) < 1e-11
+    nlsolve = NewtonRaphson(; autodiff = AutoFiniteDiff())
+    for solver in [Shooting(Tsit5(); nlsolve), MultipleShooting(10, Tsit5(); nlsolve)]
+        sol = solve(bvp, solver; abstol = 1e-13, reltol = 1e-13)
+        @test SciMLBase.successful_retcode(sol)
+        bc1!(resid_f, sol, nothing, sol.t)
+        @test norm(resid_f) < 1e-12
+    end
 end
 
-# Out of Place
-bc2a(ua, p) = [ua[1]]
-bc2b(ub, p) = [ub[1] - 1]
-
-bvp4 = TwoPointBVProblem(f1, (bc2a, bc2b), u0, tspan)
-@test !SciMLBase.isinplace(bvp4)
-for solver in SOLVERS
-    sol = solve(bvp4, solver; abstol = 1e-13, reltol = 1e-13)
-    @test SciMLBase.successful_retcode(sol)
-    resid_f = reduce(vcat, (bc2a(sol(tspan[1]), nothing), bc2b(sol(tspan[2]), nothing)))
-    @test norm(resid_f) < 1e-12
+@testset "Flow In a Channel" begin
+    function flow_in_a_channel!(du, u, p, t)
+        R, P = p
+        A, f′′, f′, f, h′, h, θ′, θ = u
+        du[1] = 0
+        du[2] = R * (f′^2 - f * f′′) - R * A
+        du[3] = f′′
+        du[4] = f′
+        du[5] = -R * f * h′ - 1
+        du[6] = h′
+        du[7] = -P * f * θ′
+        du[8] = θ′
+    end
+
+    function bc_flow!(resid, sol, p, tspan)
+        t₁, t₂ = extrema(tspan)
+        solₜ₁ = sol(t₁)
+        solₜ₂ = sol(t₂)
+        resid[1] = solₜ₁[4]
+        resid[2] = solₜ₁[3]
+        resid[3] = solₜ₂[4] - 1
+        resid[4] = solₜ₂[3]
+        resid[5] = solₜ₁[6]
+        resid[6] = solₜ₂[6]
+        resid[7] = solₜ₁[8]
+        resid[8] = solₜ₂[8] - 1
+    end
+
+    tspan = (0.0, 1.0)
+    p = [10.0, 7.0]
+    u0 = zeros(8)
+
+    flow_bvp = BVProblem{true}(flow_in_a_channel!, bc_flow!, u0, tspan, p)
+
+    sol_shooting = solve(flow_bvp,
+        Shooting(AutoTsit5(Rosenbrock23()), NewtonRaphson());
+        maxiters = 100)
+    @test SciMLBase.successful_retcode(sol_shooting)
+
+    resid = zeros(8)
+    bc_flow!(resid, sol_shooting, p, sol_shooting.t)
+    @test norm(resid, Inf) < 1e-6
+
+    sol_msshooting = solve(flow_bvp,
+        MultipleShooting(10, AutoTsit5(Rosenbrock23()); nlsolve = NewtonRaphson());
+        maxiters = 100)
+    @test SciMLBase.successful_retcode(sol_msshooting)
+
+    resid = zeros(8)
+    bc_flow!(resid, sol_msshooting, p, sol_msshooting.t)
+    @test norm(resid, Inf) < 1e-6
 end
 
-#Test for complex values
-u0 = [0.0, 1.0] .+ 1im
-bvp = BVProblem(f1!, bc1!, u0, tspan)
-resid_f = Array{ComplexF64}(undef, 2)
-
-nlsolve = NewtonRaphson(; autodiff = AutoFiniteDiff())
-for solver in [Shooting(Tsit5(); nlsolve), MultipleShooting(10, Tsit5(); nlsolve)]
-    sol = solve(bvp, solver; abstol = 1e-13, reltol = 1e-13)
-    @test SciMLBase.successful_retcode(sol)
-    bc1!(resid_f, sol, nothing, sol.t)
-    @test norm(resid_f) < 1e-12
+@testset "Ray Tracing BVP" begin
+    # Example 1.7 from 
+    # "Numerical Solution to Boundary Value Problems for Ordinary Differential equations",
+    # 'Ascher, Mattheij, Russell'
+
+    # Earthquake happens at known position (x0, y0, z0)
+    # Earthquake is detected by seismograph at (xi, yi, zi)
+
+    # Find the path taken by the first ray that reached seismograph.
+    # i.e. given some velocity field finds the quickest path from
+    # (x0,y0,z0) to (xi, yi, zi)
+
+    # du = [dx, dy, dz, dξ, dη, dζ, dT, dS]
+    # du = [x, y, z, ξ, η, ζ, T, S]
+    # p = [ν(x,y,z), μ_x(x,y,z), μ_y(x,y,z), μ_z(x,y,z)]
+    @inline v(x, y, z, p) = 1 / (4 + cos(p[1] * x) + sin(p[2] * y) - cos(p[3] * z))
+    @inline ux(x, y, z, p) = -p[1] * sin(p[1] * x)
+    @inline uy(x, y, z, p) = p[2] * cos(p[2] * y)
+    @inline uz(x, y, z, p) = p[3] * sin(p[3] * z)
+
+    function ray_tracing(u, p, t)
+        du = similar(u)
+        ray_tracing!(du, u, p, t)
+        return du
+    end
+
+    function ray_tracing!(du, u, p, t)
+        x, y, z, ξ, η, ζ, T, S = u
+
+        nu = v(x, y, z, p) # Velocity of a sound wave, function of space;
+        μx = ux(x, y, z, p) # ∂(slowness)/∂x, function of space
+        μy = uy(x, y, z, p) # ∂(slowness)/∂y, function of space
+        μz = uz(x, y, z, p) # ∂(slowness)/∂z, function of space
+
+        du[1] = S * nu * ξ
+        du[2] = S * nu * η
+        du[3] = S * nu * ζ
+
+        du[4] = S * μx
+        du[5] = S * μy
+        du[6] = S * μz
+
+        du[7] = S / nu
+        du[8] = 0
+
+        return nothing
+    end
+
+    function ray_tracing_bc(sol, p, t)
+        res = similar(first(sol))
+        ray_tracing_bc!(res, sol, p, t)
+        return res
+    end
+
+    function ray_tracing_bc!(res, sol, p, t)
+        ua = sol(0.0)
+        ub = sol(1.0)
+        nu = v(ua[1], ua[2], ua[3], p) # Velocity of a sound wave, function of space;
+
+        res[1] = ua[1] - x0
+        res[2] = ua[2] - y0
+        res[3] = ua[3] - z0
+        res[4] = ua[7]      # T(0) = 0
+        res[5] = ua[4]^2 + ua[5]^2 + ua[6]^2 - 1 / nu^2
+        res[6] = ub[1] - xi
+        res[7] = ub[2] - yi
+        res[8] = ub[3] - zi
+        return nothing
+    end
+
+    a = 0
+    b = 1
+    c = 2
+    x0 = 0
+    y0 = 0
+    z0 = 0
+    xi = 4
+    yi = 3
+    zi = 2.0
+    p = [a, b, c, x0, y0, z0, xi, yi, zi]
+
+    dx = xi - x0
+    dy = yi - y0
+    dz = zi - z0
+
+    u0 = zeros(8)
+    u0[1:3] .= 0 # position
+    u0[4] = dx / v(x0, y0, z0, p)
+    u0[5] = dy / v(x0, y0, z0, p)
+    u0[6] = dz / v(x0, y0, z0, p)
+    u0[8] = 1
+
+    tspan = (0.0, 1.0)
+
+    prob_oop = BVProblem{false}(ray_tracing, ray_tracing_bc, u0, tspan, p)
+    alg = MultipleShooting(16, AutoVern7(Rodas4P()); nlsolve = NewtonRaphson(),
+        grid_coarsening = Base.Fix2(div, 3))
+
+    sol = solve(prob_oop, alg; reltol = 1e-6, abstol = 1e-6)
+    @test SciMLBase.successful_retcode(sol.retcode)
+    resid = zeros(8)
+    ray_tracing_bc!(resid, sol, p, sol.t)
+    @test norm(resid, Inf) < 1e-6
+
+    prob_iip = BVProblem{true}(ray_tracing!, ray_tracing_bc!, u0, tspan, p)
+    alg = MultipleShooting(16, AutoVern7(Rodas4P()); nlsolve = NewtonRaphson(),
+        grid_coarsening = Base.Fix2(div, 3))
+
+    sol = solve(prob_iip, alg; reltol = 1e-6, abstol = 1e-6)
+    @test SciMLBase.successful_retcode(sol.retcode)
+    resid = zeros(8)
+    ray_tracing_bc!(resid, sol, p, sol.t)
+    @test norm(resid, Inf) < 1e-6
 end

From 131026fcff38c100ca0d0eb0a0c06615a933e27e Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Thu, 28 Sep 2023 14:22:22 -0400
Subject: [PATCH 08/28] Incorrect merge

---
 src/nlprob.jl | 136 --------------------------------------------------
 1 file changed, 136 deletions(-)

diff --git a/src/nlprob.jl b/src/nlprob.jl
index 1ed8b5d0..27487dc9 100644
--- a/src/nlprob.jl
+++ b/src/nlprob.jl
@@ -251,139 +251,3 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
 
     return NonlinearProblem(NonlinearFunction{iip}(loss; jac, jac_prototype), y, cache.p)
 end
-
-# Two Point Specialization
-function construct_sparse_banded_jac_prototype(y::ArrayPartition, M, N)
-    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
-    l_top = M * length(y.x[1].x[1])
-    l_bot = M * length(y.x[1].x[2])
-
-    Is = Vector{Int}(undef, l + l_top + l_bot)
-    Js = Vector{Int}(undef, l + l_top + l_bot)
-    idx = 1
-
-    for i in 1:length(y.x[1].x[1]), j in 1:M
-        Is[idx] = i
-        Js[idx] = j
-        idx += 1
-    end
-
-    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
-        Is[idx] = i + length(y.x[1].x[1])
-        Js[idx] = j
-        idx += 1
-    end
-
-    for i in 1:length(y.x[1].x[2]), j in 1:M
-        Is[idx] = i + length(y.x[1].x[1]) + M * (N - 1)
-        Js[idx] = j + M * (N - 1)
-        idx += 1
-    end
-
-    y_ = similar(y, length(Is))
-    return sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
-        y_, M * N, M * N)
-end
-
-function generate_split_jac(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
-    _) where {iip}
-    @unpack nlsolve, jac_alg = cache.alg
-    N = length(cache.mesh)
-
-    resid_bc = cache.prob.f.bcresid_prototype === nothing ? similar(y, cache.M) :
-               cache.prob.f.bcresid_prototype
-    resid_collocation = similar(y, cache.M * (N - 1))
-
-    sd_bc = jac_alg.bc_diffmode isa AbstractSparseADType ? SymbolicsSparsityDetection() :
-            NoSparsityDetection()
-
-    if iip
-        cache_bc = sparse_jacobian_cache(jac_alg.bc_diffmode, sd_bc, loss_bc, resid_bc, y)
-    else
-        cache_bc = sparse_jacobian_cache(jac_alg.bc_diffmode, sd_bc, loss_bc, y;
-            fx = resid_bc)
-    end
-
-    sd_collocation = if jac_alg.collocation_diffmode isa AbstractSparseADType
-        Jₛ = construct_sparse_banded_jac_prototype(y, cache.M, N)
-        JacPrototypeSparsityDetection(; jac_prototype = Jₛ)
-    else
-        NoSparsityDetection()
-    end
-
-    if iip
-        cache_collocation = sparse_jacobian_cache(jac_alg.collocation_diffmode,
-            sd_collocation, loss_collocation, resid_collocation, y)
-    else
-        cache_collocation = sparse_jacobian_cache(jac_alg.collocation_diffmode,
-            sd_collocation, loss_collocation, y; fx = resid_collocation)
-    end
-
-    jac_prototype = vcat(init_jacobian(cache_bc), init_jacobian(cache_collocation))
-
-    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
-    #       mismatch for ForwardDiff
-    jac = if iip
-        function jac_internal!(J, x, p)
-            sparse_jacobian!(@view(J[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
-                loss_bc, resid_bc, x)
-            sparse_jacobian!(@view(J[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
-                cache_collocation, loss_collocation, resid_collocation, x)
-            return J
-        end
-    else
-        J_ = jac_prototype
-        function jac_internal(x, p)
-            sparse_jacobian!(@view(J_[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
-                loss_bc, x)
-            sparse_jacobian!(@view(J_[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
-                cache_collocation, loss_collocation, x)
-            return J_
-        end
-    end
-
-    return jac, jac_prototype
-end
-
-function generate_split_jac(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
-    ::TwoPointBVProblem) where {iip}
-    @unpack nlsolve, jac_alg = cache.alg
-    N = length(cache.mesh)
-
-    if !iip && cache.prob.f.bcresid_prototype === nothing
-        y_ = recursive_unflatten!(cache.y, y)
-        resid_ = cache.bc((y_[1], y_[end]), cache.p)
-        resid = ArrayPartition(ArrayPartition(resid_), similar(y, cache.M * (N - 1)))
-    else
-        resid = ArrayPartition(cache.prob.f.bcresid_prototype,
-            similar(y, cache.M * (N - 1)))
-    end
-
-    Jₛ = construct_sparse_banded_jac_prototype(resid, cache.M, N)
-    sd = JacPrototypeSparsityDetection(; jac_prototype = Jₛ)
-
-    if iip
-        cache = sparse_jacobian_cache(jac_alg.bc_diffmode, sd, loss, resid, y)
-    else
-        cache = sparse_jacobian_cache(jac_alg.bc_diffmode, sd, loss, y; fx = resid)
-    end
-
-    jac_prototype = init_jacobian(cache)
-
-    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
-    #       mismatch for ForwardDiff
-    jac = if iip
-        function jac_internal!(J, x, p)
-            sparse_jacobian!(J, jac_alg.bc_diffmode, cache, loss, resid, x)
-            return J
-        end
-    else
-        J_ = jac_prototype
-        function jac_internal(x, p)
-            sparse_jacobian!(J_, jac_alg.bc_diffmode, cache, loss, x)
-            return J_
-        end
-    end
-
-    return jac, jac_prototype
-end

From 5ed2e111802a2a444606d74c182b686cd1483245 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Thu, 28 Sep 2023 14:47:47 -0400
Subject: [PATCH 09/28] fix type stability??

---
 src/solve/single_shooting.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/solve/single_shooting.jl b/src/solve/single_shooting.jl
index 036ce14a..55d6e3ec 100644
--- a/src/solve/single_shooting.jl
+++ b/src/solve/single_shooting.jl
@@ -22,9 +22,9 @@ function SciMLBase.__solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;)
     end
     opt = solve(NonlinearProblem(NonlinearFunction{iip}(loss_fn; prob.f.jac_prototype,
                 resid_prototype = prob.f.bcresid_prototype), vec(u0), prob.p), alg.nlsolve;
-        kwargs...)
-    sol_prob = ODEProblem{iip}(prob.f, reshape(opt.u, u0_size), prob.tspan, prob.p)
-    sol = solve(sol_prob, alg.ode_alg; nlsolve_kwargs..., kwargs...)
+        nlsolve_kwargs..., kwargs...)
+    newprob = ODEProblem{iip}(prob.f, reshape(opt.u, u0_size), prob.tspan, prob.p)
+    sol = solve(newprob, alg.ode_alg; odesolve_kwargs..., kwargs...)
     return DiffEqBase.solution_new_retcode(sol,
         sol.retcode == opt.retcode ? ReturnCode.Success : ReturnCode.Failure)
 end

From a349db2f132fedabd358f08a58f532a6f3e2d890 Mon Sep 17 00:00:00 2001
From: Avik Pal <avik.pal.2017@gmail.com>
Date: Sun, 1 Oct 2023 11:11:11 -0400
Subject: [PATCH 10/28] Update test/shooting/shooting_tests.jl

Co-authored-by: Qingyu Qu <52615090+ErikQQY@users.noreply.github.com>
---
 test/shooting/shooting_tests.jl | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/test/shooting/shooting_tests.jl b/test/shooting/shooting_tests.jl
index f70ad9f3..1a141925 100644
--- a/test/shooting/shooting_tests.jl
+++ b/test/shooting/shooting_tests.jl
@@ -82,6 +82,18 @@ end
 
 @testset "Shooting with Complex Values" begin
     # Test for complex values
+        function f1!(du, u, p, t)
+        du[1] = u[2]
+        du[2] = -u[1]
+        return nothing
+    end
+
+    function bc1!(resid, sol, p, t)
+        t₀, t₁ = first(t), last(t)
+        resid[1] = sol(t₀)[1]
+        resid[2] = sol(t₁)[1] - 1
+        return nothing
+    end
     u0 = [0.0, 1.0] .+ 1im
     bvp = BVProblem(f1!, bc1!, u0, tspan)
     resid_f = Array{ComplexF64}(undef, 2)

From 6cffaec5c2f6ac1d56ce5b58387f251db7cf9342 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Sun, 1 Oct 2023 21:21:35 -0400
Subject: [PATCH 11/28] Fix formatting

---
 test/misc/type_stability.jl     | 56 ++++++++++++++++++++-------------
 test/shooting/orbital.jl        |  3 +-
 test/shooting/shooting_tests.jl |  6 ++--
 3 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/test/misc/type_stability.jl b/test/misc/type_stability.jl
index a035f4af..7e6cafa8 100644
--- a/test/misc/type_stability.jl
+++ b/test/misc/type_stability.jl
@@ -23,29 +23,41 @@ p = [1.0, 1.0, 1.0, 1.0]
 bcresid_prototype = (zeros(1), zeros(1))
 
 # Multi-Point BVP
-mpbvp_iip = BVProblem(f!, bc!, u0, tspan, p)
-mpbvp_oop = BVProblem(f, bc, u0, tspan, p)
-
-@inferred solve(mpbvp_iip, Shooting(Tsit5()))
-@inferred solve(mpbvp_oop, Shooting(Tsit5()))
-@inferred solve(mpbvp_iip, MultipleShooting(5, Tsit5()))
-@inferred solve(mpbvp_oop, MultipleShooting(5, Tsit5()))
-
-for solver in (MIRK2(), MIRK3(), MIRK4(), MIRK5(), MIRK6())
-    @inferred solve(mpbvp_iip, solver; dt = 0.2)
-    @inferred solve(mpbvp_oop, solver; dt = 0.2)
+@testset "Multi-Point BVP" begin
+    mpbvp_iip = BVProblem(f!, bc!, u0, tspan, p)
+    mpbvp_oop = BVProblem(f, bc, u0, tspan, p)
+
+    @testset "Shooting Methods" begin
+        @inferred solve(mpbvp_iip, Shooting(Tsit5()))
+        @inferred solve(mpbvp_oop, Shooting(Tsit5()))
+        @inferred solve(mpbvp_iip, MultipleShooting(5, Tsit5()))
+        @inferred solve(mpbvp_oop, MultipleShooting(5, Tsit5()))
+    end
+
+    @testset "MIRK Methods" begin
+        for solver in (MIRK2(), MIRK3(), MIRK4(), MIRK5(), MIRK6())
+            @inferred solve(mpbvp_iip, solver; dt = 0.2)
+            @inferred solve(mpbvp_oop, solver; dt = 0.2)
+        end
+    end
 end
 
 # Two-Point BVP
-tpbvp_iip = TwoPointBVProblem(f!, twobc!, u0, tspan, p; bcresid_prototype)
-tpbvp_oop = TwoPointBVProblem(f, twobc, u0, tspan, p)
-
-@inferred solve(tpbvp_iip, Shooting(Tsit5()))
-@inferred solve(tpbvp_oop, Shooting(Tsit5()))
-@inferred solve(tpbvp_iip, MultipleShooting(5, Tsit5()))
-@inferred solve(tpbvp_oop, MultipleShooting(5, Tsit5()))
-
-for solver in (MIRK2(), MIRK3(), MIRK4(), MIRK5(), MIRK6())
-    @inferred solve(tpbvp_iip, solver; dt = 0.2)
-    @inferred solve(tpbvp_oop, solver; dt = 0.2)
+@testset "Two-Point BVP" begin
+    tpbvp_iip = TwoPointBVProblem(f!, twobc!, u0, tspan, p; bcresid_prototype)
+    tpbvp_oop = TwoPointBVProblem(f, twobc, u0, tspan, p)
+
+    @testset "Shooting Methods" begin
+        @inferred solve(tpbvp_iip, Shooting(Tsit5()))
+        @inferred solve(tpbvp_oop, Shooting(Tsit5()))
+        @inferred solve(tpbvp_iip, MultipleShooting(5, Tsit5()))
+        @inferred solve(tpbvp_oop, MultipleShooting(5, Tsit5()))
+    end
+
+    @testset "MIRK Methods" begin
+        for solver in (MIRK2(), MIRK3(), MIRK4(), MIRK5(), MIRK6())
+            @inferred solve(tpbvp_iip, solver; dt = 0.2)
+            @inferred solve(tpbvp_oop, solver; dt = 0.2)
+        end
+    end
 end
diff --git a/test/shooting/orbital.jl b/test/shooting/orbital.jl
index 3529d04d..e5513d56 100644
--- a/test/shooting/orbital.jl
+++ b/test/shooting/orbital.jl
@@ -76,7 +76,8 @@ for autodiff in (AutoForwardDiff(), AutoFiniteDiff(; fdtype = Val(:central)),
     AutoSparseForwardDiff(), AutoFiniteDiff(; fdtype = Val(:forward)),
     AutoSparseFiniteDiff())
     nlsolve = NewtonRaphson(; autodiff)
-    @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-13, reltol = 1e-13)
+    @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true,
+        abstol = 1e-13, reltol = 1e-13)
     cur_bc!(resid_f, sol, nothing, sol.t)
     @test norm(resid_f, Inf) < TestTol
 
diff --git a/test/shooting/shooting_tests.jl b/test/shooting/shooting_tests.jl
index 1a141925..8dd71368 100644
--- a/test/shooting/shooting_tests.jl
+++ b/test/shooting/shooting_tests.jl
@@ -82,7 +82,7 @@ end
 
 @testset "Shooting with Complex Values" begin
     # Test for complex values
-        function f1!(du, u, p, t)
+    function f1!(du, u, p, t)
         du[1] = u[2]
         du[2] = -u[1]
         return nothing
@@ -94,6 +94,8 @@ end
         resid[2] = sol(t₁)[1] - 1
         return nothing
     end
+
+    tspan = (0.0, 100.0)
     u0 = [0.0, 1.0] .+ 1im
     bvp = BVProblem(f1!, bc1!, u0, tspan)
     resid_f = Array{ComplexF64}(undef, 2)
@@ -161,7 +163,7 @@ end
 end
 
 @testset "Ray Tracing BVP" begin
-    # Example 1.7 from 
+    # Example 1.7 from
     # "Numerical Solution to Boundary Value Problems for Ordinary Differential equations",
     # 'Ascher, Mattheij, Russell'
 

From de7077dbfacf523b9f6c78a3e26302a19a50f72f Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Sun, 1 Oct 2023 22:02:00 -0400
Subject: [PATCH 12/28] Mark Single Shooting test as borken

---
 src/solve/single_shooting.jl | 2 +-
 test/misc/type_stability.jl  | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/solve/single_shooting.jl b/src/solve/single_shooting.jl
index 55d6e3ec..464c2b72 100644
--- a/src/solve/single_shooting.jl
+++ b/src/solve/single_shooting.jl
@@ -25,6 +25,6 @@ function SciMLBase.__solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;)
         nlsolve_kwargs..., kwargs...)
     newprob = ODEProblem{iip}(prob.f, reshape(opt.u, u0_size), prob.tspan, prob.p)
     sol = solve(newprob, alg.ode_alg; odesolve_kwargs..., kwargs...)
-    return DiffEqBase.solution_new_retcode(sol,
+    return SciMLBase.solution_new_retcode(sol,
         sol.retcode == opt.retcode ? ReturnCode.Success : ReturnCode.Failure)
 end
diff --git a/test/misc/type_stability.jl b/test/misc/type_stability.jl
index 7e6cafa8..72eb5974 100644
--- a/test/misc/type_stability.jl
+++ b/test/misc/type_stability.jl
@@ -28,7 +28,8 @@ bcresid_prototype = (zeros(1), zeros(1))
     mpbvp_oop = BVProblem(f, bc, u0, tspan, p)
 
     @testset "Shooting Methods" begin
-        @inferred solve(mpbvp_iip, Shooting(Tsit5()))
+        @test_broken SciMLBase.successful_retcode(@inferred solve(mpbvp_iip,
+            Shooting(Tsit5())))
         @inferred solve(mpbvp_oop, Shooting(Tsit5()))
         @inferred solve(mpbvp_iip, MultipleShooting(5, Tsit5()))
         @inferred solve(mpbvp_oop, MultipleShooting(5, Tsit5()))
@@ -48,7 +49,8 @@ end
     tpbvp_oop = TwoPointBVProblem(f, twobc, u0, tspan, p)
 
     @testset "Shooting Methods" begin
-        @inferred solve(tpbvp_iip, Shooting(Tsit5()))
+        @test_broken SciMLBase.successful_retcode(@inferred solve(tpbvp_iip,
+            Shooting(Tsit5())))
         @inferred solve(tpbvp_oop, Shooting(Tsit5()))
         @inferred solve(tpbvp_iip, MultipleShooting(5, Tsit5()))
         @inferred solve(tpbvp_oop, MultipleShooting(5, Tsit5()))

From 2d3de5ad5c978109670a2335508c7991cf49dee1 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 2 Oct 2023 10:13:08 -0400
Subject: [PATCH 13/28] Fix alg

---
 src/solve/multiple_shooting.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index 01b465a8..0831780d 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -15,7 +15,7 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         for i in 1:cur_nshoots
             local odeprob = ODEProblem{iip}(f,
                 reshape(us[((i - 1) * N + 1):(i * N)], u0_size), (nodes[i], nodes[i + 1]),
-                p)
+                prob.p)
             sol = solve(odeprob, alg.ode_alg; odesolve_kwargs..., kwargs...,
                 save_end = true, save_everystep = false)
 
@@ -32,7 +32,7 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         # Boundary conditions
         # Builds an ODESolution object to keep the framework for bc(,,) consistent
         odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
-        total_solution = SciMLBase.build_solution(odeprob, nothing, _ts, _us)
+        total_solution = SciMLBase.build_solution(odeprob, alg.ode_alg, _ts, _us)
 
         if iip
             eval_bc_residual!(resid_bc, prob.problem_type, bc, total_solution, p)
@@ -139,8 +139,8 @@ end
             ustart = u_at_nodes_prev[idxs_prev]
 
             odeprob = ODEProblem(f, ustart, (t0, tstop), p)
-            odesol = solve(odeprob, alg.ode_alg; odesolve_kwargs..., kwargs...,
-                saveat = (), save_end = true)
+            odesol = solve(odeprob, alg.ode_alg; odesolve_kwargs..., kwargs..., saveat = (),
+                save_end = true)
 
             u_at_nodes[idxs] .= odesol.u[end]
         end

From 0579f7b2e81a7ece2122ebbf5ff6d4ca73417ce5 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 2 Oct 2023 16:17:58 -0400
Subject: [PATCH 14/28] Use ensemble problem to speed up multiple shooting

---
 src/BoundaryValueDiffEq.jl     |   3 +-
 src/solve/mirk.jl              | 177 +++++++++++++++++++++++++++++++++
 src/solve/multiple_shooting.jl |  38 ++++---
 src/solve/single_shooting.jl   |   7 +-
 src/utils.jl                   |  76 ++++++++++++++
 5 files changed, 283 insertions(+), 18 deletions(-)

diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index 2b7b73a2..1e56bdfb 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -23,10 +23,11 @@ include("alg_utils.jl")
 include("mirk_tableaus.jl")
 include("cache.jl")
 include("collocation.jl")
-include("nlprob.jl")
+
 include("solve/single_shooting.jl")
 include("solve/multiple_shooting.jl")
 include("solve/mirk.jl")
+
 include("adaptivity.jl")
 include("interpolation.jl")
 
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index 7a2bb97a..341c4f7b 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -183,3 +183,180 @@ function SciMLBase.solve!(cache::MIRKCache)
     return DiffEqBase.build_solution(prob, alg, cache.mesh,
         u; interp = MIRKInterpolation(cache.mesh, u, cache), retcode = info)
 end
+
+# Constructing the Nonlinear Problem
+function construct_nlproblem(cache::MIRKCache{iip}, y::AbstractVector) where {iip}
+    loss_bc = if iip
+        function loss_bc_internal!(resid::AbstractVector, u::AbstractVector, p = cache.p)
+            y_ = recursive_unflatten!(cache.y, u)
+            eval_bc_residual!(resid, cache.problem_type, cache.bc, y_, p, cache.mesh)
+            return resid
+        end
+    else
+        function loss_bc_internal(u::AbstractVector, p = cache.p)
+            y_ = recursive_unflatten!(cache.y, u)
+            return eval_bc_residual(cache.problem_type, cache.bc, y_, p, cache.mesh)
+        end
+    end
+
+    loss_collocation = if iip
+        function loss_collocation_internal!(resid::AbstractVector, u::AbstractVector,
+            p = cache.p)
+            y_ = recursive_unflatten!(cache.y, u)
+            resids = [get_tmp(r, u) for r in cache.residual[2:end]]
+            Φ!(resids, cache, y_, u, p)
+            recursive_flatten!(resid, resids)
+            return resid
+        end
+    else
+        function loss_collocation_internal(u::AbstractVector, p = cache.p)
+            y_ = recursive_unflatten!(cache.y, u)
+            resids = Φ(cache, y_, u, p)
+            xxx = mapreduce(vec, vcat, resids)
+            return xxx
+        end
+    end
+
+    loss = if !(cache.problem_type isa TwoPointBVProblem)
+        if iip
+            function loss_internal!(resid::AbstractVector, u::AbstractVector, p = cache.p)
+                y_ = recursive_unflatten!(cache.y, u)
+                resids = [get_tmp(r, u) for r in cache.residual]
+                eval_bc_residual!(resids[1], cache.problem_type, cache.bc, y_, p,
+                    cache.mesh)
+                Φ!(resids[2:end], cache, y_, u, p)
+                recursive_flatten!(resid, resids)
+                return resid
+            end
+        else
+            function loss_internal(u::AbstractVector, p = cache.p)
+                y_ = recursive_unflatten!(cache.y, u)
+                resid_bc = eval_bc_residual(cache.problem_type, cache.bc, y_, p, cache.mesh)
+                resid_co = Φ(cache, y_, u, p)
+                return vcat(resid_bc, mapreduce(vec, vcat, resid_co))
+            end
+        end
+    else
+        # Reordering for 2 point BVP
+        if iip
+            function loss_internal_2point!(resid::AbstractVector, u::AbstractVector,
+                p = cache.p)
+                y_ = recursive_unflatten!(cache.y, u)
+                resids = [get_tmp(r, u) for r in cache.residual]
+                eval_bc_residual!(resids[1], cache.problem_type, cache.bc, y_, p,
+                    cache.mesh)
+                Φ!(resids[2:end], cache, y_, u, p)
+                recursive_flatten_twopoint!(resid, resids)
+                return resid
+            end
+        else
+            function loss_internal_2point(u::AbstractVector, p = cache.p)
+                y_ = recursive_unflatten!(cache.y, u)
+                resid_bc = eval_bc_residual(cache.problem_type, cache.bc, y_, p, cache.mesh)
+                resid_co = Φ(cache, y_, u, p)
+                return vcat(resid_bc.x[1], mapreduce(vec, vcat, resid_co), resid_bc.x[2])
+            end
+        end
+    end
+
+    return generate_nlprob(cache, y, loss_bc, loss_collocation, loss, cache.problem_type)
+end
+
+function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
+    _) where {iip}
+    @unpack nlsolve, jac_alg = cache.alg
+    N = length(cache.mesh)
+
+    resid_bc = cache.prob.f.bcresid_prototype === nothing ? similar(y, cache.M) :
+               cache.prob.f.bcresid_prototype
+    resid_collocation = similar(y, cache.M * (N - 1))
+
+    sd_bc = jac_alg.bc_diffmode isa AbstractSparseADType ? SymbolicsSparsityDetection() :
+            NoSparsityDetection()
+
+    cache_bc = __sparse_jacobian_cache(Val(iip), jac_alg.bc_diffmode, sd_bc, loss_bc,
+        resid_bc, y)
+
+    sd_collocation = if jac_alg.collocation_diffmode isa AbstractSparseADType
+        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(y, cache.M, N)
+        PrecomputedJacobianColorvec(; jac_prototype = Jₛ, row_colorvec = rvec,
+            col_colorvec = cvec)
+    else
+        NoSparsityDetection()
+    end
+
+    cache_collocation = __sparse_jacobian_cache(Val(iip), jac_alg.collocation_diffmode,
+        sd_collocation, loss_collocation, resid_collocation, y)
+
+    jac_prototype = vcat(init_jacobian(cache_bc),
+        jac_alg.collocation_diffmode isa AbstractSparseADType ? Jₛ :
+        init_jacobian(cache_collocation))
+
+    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
+    #       mismatch for ForwardDiff
+    jac = if iip
+        function jac_internal!(J, x, p)
+            sparse_jacobian!(@view(J[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
+                loss_bc, resid_bc, x)
+            sparse_jacobian!(@view(J[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
+                cache_collocation, loss_collocation, resid_collocation, x)
+            return J
+        end
+    else
+        J_ = jac_prototype
+        function jac_internal(x, p)
+            sparse_jacobian!(@view(J_[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
+                loss_bc, x)
+            sparse_jacobian!(@view(J_[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
+                cache_collocation, loss_collocation, x)
+            return J_
+        end
+    end
+
+    return NonlinearProblem(NonlinearFunction{iip}(loss; jac, jac_prototype), y, cache.p)
+end
+
+function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
+    ::TwoPointBVProblem) where {iip}
+    @unpack nlsolve, jac_alg = cache.alg
+    N = length(cache.mesh)
+
+    if !iip && cache.prob.f.bcresid_prototype === nothing
+        y_ = recursive_unflatten!(cache.y, y)
+        resid_ = cache.bc((y_[1], y_[end]), cache.p)
+        resid = ArrayPartition(ArrayPartition(resid_), similar(y, cache.M * (N - 1)))
+    else
+        resid = ArrayPartition(cache.prob.f.bcresid_prototype,
+            similar(y, cache.M * (N - 1)))
+    end
+
+    sd = if jac_alg.diffmode isa AbstractSparseADType
+        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(resid, cache.M, N)
+        PrecomputedJacobianColorvec(; jac_prototype = Jₛ, row_colorvec = rvec,
+            col_colorvec = cvec)
+    else
+        NoSparsityDetection()
+    end
+
+    diffcache = __sparse_jacobian_cache(Val(iip), jac_alg.diffmode, sd, loss, resid, y)
+
+    jac_prototype = jac_alg.diffmode isa AbstractSparseADType ? Jₛ :
+                    init_jacobian(diffcache)
+
+    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
+    #       mismatch for ForwardDiff
+    jac = if iip
+        function jac_internal!(J, x, p)
+            sparse_jacobian!(J, jac_alg.diffmode, diffcache, loss, resid, x)
+            return J
+        end
+    else
+        J_ = jac_prototype
+        function jac_internal(x, p)
+            sparse_jacobian!(J_, jac_alg.diffmode, diffcache, loss, x)
+            return J_
+        end
+    end
+
+    return NonlinearProblem(NonlinearFunction{iip}(loss; jac, jac_prototype), y, cache.p)
+end
diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index 0831780d..82c7ce9d 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -1,6 +1,7 @@
 # TODO: incorporate `initial_guess` similar to MIRK methods
+# FIXME: We can't specify `ensemblealg` from outside
 function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
-    nlsolve_kwargs = (;), kwargs...)
+    nlsolve_kwargs = (;), ensemblealg = EnsembleThreads(), kwargs...)
     @unpack f, bc, tspan = prob
     bcresid_prototype = prob.f.bcresid_prototype === nothing ? similar(prob.u0) :
                         prob.f.bcresid_prototype
@@ -12,26 +13,33 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
 
         resid_bc, resid_nodes = resid.x[1], resid.x[2]
 
-        for i in 1:cur_nshoots
-            local odeprob = ODEProblem{iip}(f,
-                reshape(us[((i - 1) * N + 1):(i * N)], u0_size), (nodes[i], nodes[i + 1]),
-                prob.p)
-            sol = solve(odeprob, alg.ode_alg; odesolve_kwargs..., kwargs...,
-                save_end = true, save_everystep = false)
-
-            ts_[i] = sol.t
-            us_[i] = sol.u
+        function prob_func(probᵢ, i, repeat)
+            return remake(probᵢ; u0 = reshape(us[((i - 1) * N + 1):(i * N)], u0_size),
+                tspan = (nodes[i], nodes[i + 1]))
+        end
 
-            resid_nodes[((i - 1) * N + 1):(i * N)] .= vec(us[(i * N + 1):((i + 1) * N)]) .-
-                                                      vec(sol.u[end])
+        function reduction(u, data, I)
+            for i in I
+                u.us[i] = data[i].u
+                u.ts[i] = data[i].t
+                u.resid[((i - 1) * N + 1):(i * N)] .= vec(us[(i * N + 1):((i + 1) * N)]) .-
+                                                      vec(data[i].u[end])
+            end
+            return (u, false)
         end
 
-        _ts = foldl(vcat, ts_)
-        _us = foldl(vcat, us_)
+        odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
+
+        ensemble_prob = EnsembleProblem(odeprob; prob_func, reduction, safetycopy = false,
+            u_init = (; us = us_, ts = ts_, resid = resid_nodes))
+        ensemble_sol = solve(ensemble_prob, alg.ode_alg, ensemblealg; odesolve_kwargs...,
+            kwargs..., trajectories = cur_nshoots)
+
+        _us = reduce(vcat, ensemble_sol.u.us)
+        _ts = reduce(vcat, ensemble_sol.u.ts)
 
         # Boundary conditions
         # Builds an ODESolution object to keep the framework for bc(,,) consistent
-        odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
         total_solution = SciMLBase.build_solution(odeprob, alg.ode_alg, _ts, _us)
 
         if iip
diff --git a/src/solve/single_shooting.jl b/src/solve/single_shooting.jl
index 464c2b72..786ed3b5 100644
--- a/src/solve/single_shooting.jl
+++ b/src/solve/single_shooting.jl
@@ -25,6 +25,9 @@ function SciMLBase.__solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;)
         nlsolve_kwargs..., kwargs...)
     newprob = ODEProblem{iip}(prob.f, reshape(opt.u, u0_size), prob.tspan, prob.p)
     sol = solve(newprob, alg.ode_alg; odesolve_kwargs..., kwargs...)
-    return SciMLBase.solution_new_retcode(sol,
-        sol.retcode == opt.retcode ? ReturnCode.Success : ReturnCode.Failure)
+
+    if !SciMLBase.successful_retcode(opt)
+        return SciMLBase.solution_new_retcode(sol, ReturnCode.Failure)
+    end
+    return sol
 end
diff --git a/src/utils.jl b/src/utils.jl
index 3082566d..a7a3cbad 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -90,3 +90,79 @@ eval_bc_residual!(resid, _, bc!, sol, p, t) = bc!(resid, sol, p, t)
     bcb!(resid.x[2], ub, p)
     return resid
 end
+
+# Generating Banded Matrix
+function construct_sparse_banded_jac_prototype(y, M, N)
+    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
+    Is = Vector{Int}(undef, l)
+    Js = Vector{Int}(undef, l)
+    idx = 1
+    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
+        Is[idx] = i
+        Js[idx] = j
+        idx += 1
+    end
+    col_colorvec = Vector{Int}(undef, M * N)
+    for i in eachindex(col_colorvec)
+        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+    row_colorvec = Vector{Int}(undef, M * (N - 1))
+    for i in eachindex(row_colorvec)
+        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+
+    y_ = similar(y, length(Is))
+    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
+            y_, M * (N - 1), M * N), col_colorvec, row_colorvec)
+end
+
+# Two Point Specialization
+function construct_sparse_banded_jac_prototype(y::ArrayPartition, M, N)
+    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
+    l_top = M * length(y.x[1].x[1])
+    l_bot = M * length(y.x[1].x[2])
+
+    Is = Vector{Int}(undef, l + l_top + l_bot)
+    Js = Vector{Int}(undef, l + l_top + l_bot)
+    idx = 1
+
+    for i in 1:length(y.x[1].x[1]), j in 1:M
+        Is[idx] = i
+        Js[idx] = j
+        idx += 1
+    end
+
+    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
+        Is[idx] = i + length(y.x[1].x[1])
+        Js[idx] = j
+        idx += 1
+    end
+
+    for i in 1:length(y.x[1].x[2]), j in 1:M
+        Is[idx] = i + length(y.x[1].x[1]) + M * (N - 1)
+        Js[idx] = j + M * (N - 1)
+        idx += 1
+    end
+
+    col_colorvec = Vector{Int}(undef, M * N)
+    for i in eachindex(col_colorvec)
+        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+    row_colorvec = Vector{Int}(undef, M * N)
+    for i in eachindex(row_colorvec)
+        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+
+    y_ = similar(y, length(Is))
+    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
+            y_, M * N, M * N), col_colorvec, row_colorvec)
+end
+
+# Helpers for IIP/OOP functions
+function __sparse_jacobian_cache(::Val{iip}, ad, sd, fn, fx, y) where {iip}
+    if iip
+        sparse_jacobian_cache(ad, sd, fn, fx, y)
+    else
+        sparse_jacobian_cache(ad, sd, fn, y; fx)
+    end
+end

From 430f356ebec6a76274b3840124ee9f610a46dcc5 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 3 Oct 2023 14:57:24 -0400
Subject: [PATCH 15/28] Patch interpolation

---
 src/BoundaryValueDiffEq.jl |  2 +-
 src/adaptivity.jl          | 25 ++++----------
 src/interpolation.jl       | 68 ++++++++++++--------------------------
 3 files changed, 29 insertions(+), 66 deletions(-)

diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index 1e56bdfb..f4fa5475 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -1,7 +1,7 @@
 module BoundaryValueDiffEq
 
 using Adapt, LinearAlgebra, PreallocationTools, Reexport, Setfield, SparseArrays, SciMLBase,
-    RecursiveArrayTools
+    RecursiveArrayTools, ForwardDiff
 @reexport using ADTypes, DiffEqBase, NonlinearSolve, SparseDiffTools, SciMLBase
 
 import ADTypes: AbstractADType
diff --git a/src/adaptivity.jl b/src/adaptivity.jl
index ab6e3eb4..d76df301 100644
--- a/src/adaptivity.jl
+++ b/src/adaptivity.jl
@@ -18,9 +18,7 @@ end
 Find the interval that `t` belongs to in `mesh`. Assumes that `mesh` is sorted.
 """
 function interval(mesh, t)
-    t == first(mesh) && return 1
-    t == last(mesh) && return length(mesh) - 1
-    return searchsortedfirst(mesh, t) - 1
+    return clamp(searchsortedfirst(mesh, t) - 1, 1, length(mesh) - 1)
 end
 
 """
@@ -237,11 +235,8 @@ function sum_stages!(z, cache::MIRKCache, w, i::Int, dt = cache.mesh_dt[i])
 
     z .= zero(z)
     __maybe_matmul!(z, k_discrete[i].du[:, 1:stage], w[1:stage])
-    __maybe_matmul!(z,
-        k_interp[i][:, 1:(s_star - stage)],
-        w[(stage + 1):s_star],
-        true,
-        true)
+    __maybe_matmul!(z, k_interp[i][:, 1:(s_star - stage)],
+        w[(stage + 1):s_star], true, true)
     z .= z .* dt .+ cache.y₀[i]
 
     return z
@@ -253,18 +248,12 @@ end
 
     z .= zero(z)
     __maybe_matmul!(z, k_discrete[i].du[:, 1:stage], w[1:stage])
-    __maybe_matmul!(z,
-        k_interp[i][:, 1:(s_star - stage)],
-        w[(stage + 1):s_star],
-        true,
-        true)
+    __maybe_matmul!(z, k_interp[i][:, 1:(s_star - stage)],
+        w[(stage + 1):s_star], true, true)
     z′ .= zero(z′)
     __maybe_matmul!(z′, k_discrete[i].du[:, 1:stage], w′[1:stage])
-    __maybe_matmul!(z′,
-        k_interp[i][:, 1:(s_star - stage)],
-        w′[(stage + 1):s_star],
-        true,
-        true)
+    __maybe_matmul!(z′, k_interp[i][:, 1:(s_star - stage)],
+        w′[(stage + 1):s_star], true, true)
     z .= z .* dt[1] .+ cache.y₀[i]
 
     return z, z′
diff --git a/src/interpolation.jl b/src/interpolation.jl
index 9b4600b8..41a2c612 100644
--- a/src/interpolation.jl
+++ b/src/interpolation.jl
@@ -4,6 +4,10 @@ struct MIRKInterpolation{T1, T2} <: AbstractDiffEqInterpolation
     cache
 end
 
+function DiffEqBase.interp_summary(interp::MIRKInterpolation)
+    return "MIRK Order $(interp.cache.order) Interpolation"
+end
+
 function (id::MIRKInterpolation)(tvals, idxs, deriv, p, continuity::Symbol = :left)
     interpolation(tvals, id, idxs, deriv, p, continuity)
 end
@@ -12,14 +16,11 @@ function (id::MIRKInterpolation)(val, tvals, idxs, deriv, p, continuity::Symbol
     interpolation!(val, tvals, id, idxs, deriv, p, continuity)
 end
 
-@inline function interpolation(tvals,
-    id::I,
-    idxs,
-    deriv::D,
-    p,
+# FIXME: Fix the interpolation outside the tspan
+
+@inline function interpolation(tvals, id::I, idxs, deriv::D, p,
     continuity::Symbol = :left) where {I, D}
-    t = id.t
-    u = id.u
+    @unpack t, u, cache = id
     cache = id.cache
     tdir = sign(t[end] - t[1])
     idx = sortperm(tvals, rev = tdir < 0)
@@ -33,56 +34,29 @@ end
     end
 
     for j in idx
-        tval = tvals[j]
-        i = interval(t, tval)
-        dt = t[i + 1] - t[i]
-        θ = (tval - t[i]) / dt
-        weights, _ = interp_weights(θ, cache.alg)
-        z = zeros(cache.M)
-        sum_stages!(z, cache, weights, i)
-        vals[j] = copy(z)
+        z = similar(cache.fᵢ₂_cache)
+        interp_eval!(z, id.cache, tvals[j], id.cache.mesh, id.cache.mesh_dt)
+        vals[j] = z
     end
-    DiffEqArray(vals, tvals)
+    return DiffEqArray(vals, tvals)
 end
 
-@inline function interpolation!(vals,
-    tvals,
-    id::I,
-    idxs,
-    deriv::D,
-    p,
+@inline function interpolation!(vals, tvals, id::I, idxs, deriv::D, p,
     continuity::Symbol = :left) where {I, D}
-    t = id.t
-    cache = id.cache
+    @unpack t, cache = id
     tdir = sign(t[end] - t[1])
     idx = sortperm(tvals, rev = tdir < 0)
 
     for j in idx
-        tval = tvals[j]
-        i = interval(t, tval)
-        dt = t[i] - t[i - 1]
-        θ = (tval - t[i]) / dt
-        weights, _ = interp_weights(θ, cache.alg)
-        z = zeros(cache.M)
-        sum_stages!(z, cache, weights, i)
-        vals[j] = copy(z)
+        z = similar(cache.fᵢ₂_cache)
+        interp_eval!(z, id.cache, tvals[j], id.cache.mesh, id.cache.mesh_dt)
+        vals[j] = z
     end
 end
 
-@inline function interpolation(tval::Number,
-    id::I,
-    idxs,
-    deriv::D,
-    p,
+@inline function interpolation(tval::Number, id::I, idxs, deriv::D, p,
     continuity::Symbol = :left) where {I, D}
-    t = id.t
-    cache = id.cache
-    i = interval(t, tval)
-    dt = t[i] - t[i - 1]
-    θ = (tval - t[i]) / dt
-    weights, _ = interp_weights(θ, cache.alg)
-    z = zeros(cache.M)
-    sum_stages!(z, cache, weights, i)
-    val = copy(z)
-    val
+    z = similar(id.cache.fᵢ₂_cache)
+    interp_eval!(z, id.cache, tval, id.cache.mesh, id.cache.mesh_dt)
+    return z
 end

From e345b305fcd0699cc1bc3a1e1c127670bff7575e Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Thu, 5 Oct 2023 11:03:25 -0400
Subject: [PATCH 16/28] Don't use custom jacobian

---
 src/solve/multiple_shooting.jl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index 82c7ce9d..e286e611 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -1,5 +1,4 @@
 # TODO: incorporate `initial_guess` similar to MIRK methods
-# FIXME: We can't specify `ensemblealg` from outside
 function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
     nlsolve_kwargs = (;), ensemblealg = EnsembleThreads(), kwargs...)
     @unpack f, bc, tspan = prob
@@ -33,7 +32,7 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         ensemble_prob = EnsembleProblem(odeprob; prob_func, reduction, safetycopy = false,
             u_init = (; us = us_, ts = ts_, resid = resid_nodes))
         ensemble_sol = solve(ensemble_prob, alg.ode_alg, ensemblealg; odesolve_kwargs...,
-            kwargs..., trajectories = cur_nshoots)
+            kwargs..., save_end = true, save_everystep = false, trajectories = cur_nshoots)
 
         _us = reduce(vcat, ensemble_sol.u.us)
         _ts = reduce(vcat, ensemble_sol.u.ts)
@@ -66,8 +65,8 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
 
         resid_prototype = ArrayPartition(bcresid_prototype,
             similar(u_at_nodes, cur_nshoot * N))
-        loss_function! = NonlinearFunction{true}((args...) -> loss!(args...,
-                cur_nshoot, nodes); resid_prototype)
+        loss_function! = NonlinearFunction{true}((args...) -> loss!(args..., cur_nshoot,
+            nodes); resid_prototype)
         nlprob = NonlinearProblem(loss_function!, u_at_nodes, prob.p)
         sol_nlsolve = solve(nlprob, alg.nlsolve; nlsolve_kwargs..., kwargs...)
         u_at_nodes = sol_nlsolve.u

From 4de1d457e9ee7929aa0d889df8c4e9b0d4c582d7 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Thu, 5 Oct 2023 17:36:33 -0400
Subject: [PATCH 17/28] Use sparse Jacobian for Multiple Shooting

---
 src/BoundaryValueDiffEq.jl     |   2 +-
 src/solve/mirk.jl              |  74 +++++++++++++++-
 src/solve/multiple_shooting.jl | 156 ++++++++++++++++++++++++++++-----
 src/solve/single_shooting.jl   |  20 +++--
 src/utils.jl                   |  67 --------------
 5 files changed, 220 insertions(+), 99 deletions(-)

diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index f4fa5475..b6daf8dc 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -5,7 +5,7 @@ using Adapt, LinearAlgebra, PreallocationTools, Reexport, Setfield, SparseArrays
 @reexport using ADTypes, DiffEqBase, NonlinearSolve, SparseDiffTools, SciMLBase
 
 import ADTypes: AbstractADType
-import ArrayInterface: matrix_colors, parameterless_type
+import ArrayInterface: matrix_colors, parameterless_type, undefmatrix
 import ConcreteStructs: @concrete
 import DiffEqBase: solve
 import ForwardDiff: pickchunksize
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index 341c4f7b..f7788d80 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -212,8 +212,7 @@ function construct_nlproblem(cache::MIRKCache{iip}, y::AbstractVector) where {ii
         function loss_collocation_internal(u::AbstractVector, p = cache.p)
             y_ = recursive_unflatten!(cache.y, u)
             resids = Φ(cache, y_, u, p)
-            xxx = mapreduce(vec, vcat, resids)
-            return xxx
+            return mapreduce(vec, vcat, resids)
         end
     end
 
@@ -278,7 +277,7 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
         resid_bc, y)
 
     sd_collocation = if jac_alg.collocation_diffmode isa AbstractSparseADType
-        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(y, cache.M, N)
+        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(cache, y, cache.M, N)
         PrecomputedJacobianColorvec(; jac_prototype = Jₛ, row_colorvec = rvec,
             col_colorvec = cvec)
     else
@@ -331,7 +330,7 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
     end
 
     sd = if jac_alg.diffmode isa AbstractSparseADType
-        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(resid, cache.M, N)
+        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(cache, resid, cache.M, N)
         PrecomputedJacobianColorvec(; jac_prototype = Jₛ, row_colorvec = rvec,
             col_colorvec = cvec)
     else
@@ -360,3 +359,70 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
 
     return NonlinearProblem(NonlinearFunction{iip}(loss; jac, jac_prototype), y, cache.p)
 end
+
+# Generating Banded Matrix
+function construct_sparse_banded_jac_prototype(::MIRKCache, y, M, N)
+    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
+    Is = Vector{Int}(undef, l)
+    Js = Vector{Int}(undef, l)
+    idx = 1
+    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
+        Is[idx] = i
+        Js[idx] = j
+        idx += 1
+    end
+    col_colorvec = Vector{Int}(undef, M * N)
+    for i in eachindex(col_colorvec)
+        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+    row_colorvec = Vector{Int}(undef, M * (N - 1))
+    for i in eachindex(row_colorvec)
+        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+
+    y_ = similar(y, length(Is))
+    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
+            y_, M * (N - 1), M * N), col_colorvec, row_colorvec)
+end
+
+# Two Point Specialization
+function construct_sparse_banded_jac_prototype(::MIRKCache, y::ArrayPartition, M, N)
+    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
+    l_top = M * length(y.x[1].x[1])
+    l_bot = M * length(y.x[1].x[2])
+
+    Is = Vector{Int}(undef, l + l_top + l_bot)
+    Js = Vector{Int}(undef, l + l_top + l_bot)
+    idx = 1
+
+    for i in 1:length(y.x[1].x[1]), j in 1:M
+        Is[idx] = i
+        Js[idx] = j
+        idx += 1
+    end
+
+    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
+        Is[idx] = i + length(y.x[1].x[1])
+        Js[idx] = j
+        idx += 1
+    end
+
+    for i in 1:length(y.x[1].x[2]), j in 1:M
+        Is[idx] = i + length(y.x[1].x[1]) + M * (N - 1)
+        Js[idx] = j + M * (N - 1)
+        idx += 1
+    end
+
+    col_colorvec = Vector{Int}(undef, M * N)
+    for i in eachindex(col_colorvec)
+        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+    row_colorvec = Vector{Int}(undef, M * N)
+    for i in eachindex(row_colorvec)
+        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+
+    y_ = similar(y, length(Is))
+    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
+            y_, M * N, M * N), col_colorvec, row_colorvec)
+end
diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index e286e611..0ab9f87c 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -1,10 +1,18 @@
 # TODO: incorporate `initial_guess` similar to MIRK methods
 function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
-    nlsolve_kwargs = (;), ensemblealg = EnsembleThreads(), kwargs...)
+    nlsolve_kwargs = (;), ensemblealg = EnsembleThreads(), verbose = true, kwargs...)
     @unpack f, bc, tspan = prob
-    bcresid_prototype = prob.f.bcresid_prototype === nothing ? similar(prob.u0) :
+    has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
+    _u0 = has_initial_guess ? first(prob.u0) : prob.u0
+    N, u0_size, nshoots, iip = length(_u0), size(_u0), alg.nshoots, isinplace(prob)
+    bcresid_prototype = prob.f.bcresid_prototype === nothing ? similar(_u0) :
                         prob.f.bcresid_prototype
-    N, u0_size, nshoots, iip = length(prob.u0), size(prob.u0), alg.nshoots, isinplace(prob)
+
+    if has_initial_guess && length(prob.u0) != nshoots + 1
+        nshoots = length(prob.u0) - 1
+        verbose &&
+            @warn "Initial guess length != `nshoots + 1`! Adapting to `nshoots = $(nshoots)`"
+    end
 
     @views function loss!(resid::ArrayPartition, us, p, cur_nshoots, nodes)
         ts_ = Vector{Vector{typeof(first(tspan))}}(undef, cur_nshoots)
@@ -32,7 +40,8 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         ensemble_prob = EnsembleProblem(odeprob; prob_func, reduction, safetycopy = false,
             u_init = (; us = us_, ts = ts_, resid = resid_nodes))
         ensemble_sol = solve(ensemble_prob, alg.ode_alg, ensemblealg; odesolve_kwargs...,
-            kwargs..., save_end = true, save_everystep = false, trajectories = cur_nshoots)
+            verbose, kwargs..., save_end = true, save_everystep = false,
+            trajectories = cur_nshoots)
 
         _us = reduce(vcat, ensemble_sol.u.us)
         _ts = reduce(vcat, ensemble_sol.u.ts)
@@ -50,44 +59,123 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         return resid
     end
 
+    @views function jac!(J::AbstractMatrix, us, p, cur_nshoots, nodes, resid_bc)
+        J_bc = J[1:N, :]
+        J_c = J[(N + 1):end, :]
+
+        # Threads.@threads :static
+        # FIXME: Threading here leads to segfaults
+        for i in 1:cur_nshoots
+            uᵢ = us[((i - 1) * N + 1):(i * N)]
+            idx = ((i - 1) * N + 1):(i * N)
+            probᵢ = ODEProblem{iip}(f, reshape(uᵢ, u0_size), (nodes[i], nodes[i + 1]), p)
+            function solve_func(u₀)
+                sJ = solve(probᵢ, alg.ode_alg; u0 = u₀, odesolve_kwargs...,
+                    kwargs..., save_end = true, save_everystep = false, saveat = ())
+                return -last(sJ)
+            end
+            # @show sum(J_c[idx, idx]), sum(J_c[idx, idx .+ N])
+            ForwardDiff.jacobian!(J_c[idx, idx], solve_func, uᵢ)
+            J_c′ = J_c[idx, idx .+ N]
+            J_c′[diagind(J_c′)] .= 1
+            # @show sum(J_c[idx, idx]), sum(J_c[idx, idx .+ N])
+        end
+
+        function evaluate_boundary_condition(us)
+            ts_ = Vector{Vector{typeof(first(tspan))}}(undef, cur_nshoots)
+            us_ = Vector{Vector{typeof(us)}}(undef, cur_nshoots)
+
+            function prob_func(probᵢ, i, repeat)
+                return remake(probᵢ; u0 = reshape(us[((i - 1) * N + 1):(i * N)], u0_size),
+                    tspan = (nodes[i], nodes[i + 1]))
+            end
+
+            function reduction(u, data, I)
+                for i in I
+                    u.us[i] = data[i].u
+                    u.ts[i] = data[i].t
+                end
+                return (u, false)
+            end
+
+            odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
+
+            ensemble_prob = EnsembleProblem(odeprob; prob_func, reduction,
+                safetycopy = false, u_init = (; us = us_, ts = ts_))
+            ensemble_sol = solve(ensemble_prob, alg.ode_alg, ensemblealg;
+                odesolve_kwargs..., kwargs..., save_end = true, save_everystep = false,
+                trajectories = cur_nshoots)
+
+            _us = reduce(vcat, ensemble_sol.u.us)
+            _ts = reduce(vcat, ensemble_sol.u.ts)
+
+            # Boundary conditions
+            # Builds an ODESolution object to keep the framework for bc(,,) consistent
+            total_solution = SciMLBase.build_solution(odeprob, alg.ode_alg, _ts, _us)
+
+            if iip
+                _resid_bc = get_tmp(resid_bc, us)
+                eval_bc_residual!(_resid_bc, prob.problem_type, bc, total_solution, p)
+                return _resid_bc
+            else
+                return eval_bc_residual(prob.problem_type, bc, total_solution, p)
+            end
+        end
+
+        ForwardDiff.jacobian!(J_bc, evaluate_boundary_condition, us)
+
+        return nothing
+    end
+
     # This gets all the nshoots except the final SingleShooting case
-    all_nshoots = get_all_nshoots(alg)
+    all_nshoots = get_all_nshoots(alg.grid_coarsening, nshoots)
     u_at_nodes, nodes = nothing, nothing
 
     for (i, cur_nshoot) in enumerate(all_nshoots)
         if i == 1
-            nodes, u_at_nodes = multiple_shooting_initialize(prob, alg; odesolve_kwargs,
-                kwargs...)
+            nodes, u_at_nodes = multiple_shooting_initialize(prob, alg, has_initial_guess,
+                nshoots; odesolve_kwargs, verbose, kwargs...)
         else
             nodes, u_at_nodes = multiple_shooting_initialize(u_at_nodes, prob, alg, nodes,
-                cur_nshoot, all_nshoots[i - 1]; odesolve_kwargs, kwargs...)
+                cur_nshoot, all_nshoots[i - 1], has_initial_guess; odesolve_kwargs, verbose,
+                kwargs...)
         end
 
         resid_prototype = ArrayPartition(bcresid_prototype,
             similar(u_at_nodes, cur_nshoot * N))
+        residbc_prototype = DiffCache(bcresid_prototype, pickchunksize(cur_nshoot * N))
+        jac_prototype = __generate_sparse_jacobian_prototype(alg, _u0, bcresid_prototype, N,
+            cur_nshoot)
+
         loss_function! = NonlinearFunction{true}((args...) -> loss!(args..., cur_nshoot,
-            nodes); resid_prototype)
+                nodes); resid_prototype, jac = (args...) -> jac!(args..., cur_nshoot, nodes, residbc_prototype), jac_prototype)
         nlprob = NonlinearProblem(loss_function!, u_at_nodes, prob.p)
-        sol_nlsolve = solve(nlprob, alg.nlsolve; nlsolve_kwargs..., kwargs...)
+        sol_nlsolve = solve(nlprob, alg.nlsolve; nlsolve_kwargs..., verbose, kwargs...)
         u_at_nodes = sol_nlsolve.u
     end
 
     single_shooting_prob = remake(prob; u0 = reshape(u_at_nodes[1:N], u0_size))
     return SciMLBase.__solve(single_shooting_prob, Shooting(alg.ode_alg; alg.nlsolve);
-        odesolve_kwargs, nlsolve_kwargs, kwargs...)
+        odesolve_kwargs, nlsolve_kwargs, verbose, kwargs...)
 end
 
-function multiple_shooting_initialize(prob, alg::MultipleShooting; odesolve_kwargs = (;),
-    kwargs...)
+function multiple_shooting_initialize(prob, alg::MultipleShooting, has_initial_guess,
+    nshoots; odesolve_kwargs = (;), verbose = true, kwargs...)
     @unpack f, bc, u0, tspan, p = prob
-    @unpack ode_alg, nshoots = alg
+    @unpack ode_alg = alg
 
-    N = length(u0)
     nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
+    N = has_initial_guess ? length(first(u0)) : length(u0)
+
+    if has_initial_guess
+        u_at_nodes = similar(first(u0), (nshoots + 1) * N)
+        recursive_flatten!(u_at_nodes, u0)
+        return nodes, u_at_nodes
+    end
 
     # Ensures type stability in case the parameters are dual numbers
     if !(typeof(p) <: SciMLBase.NullParameters)
-        if !isconcretetype(eltype(p))
+        if !isconcretetype(eltype(p)) && verbose
             @warn "Type inference will fail if eltype(p) is not a concrete type"
         end
         u_at_nodes = similar(u0, promote_type(eltype(u0), eltype(p)), (nshoots + 1) * N)
@@ -97,7 +185,7 @@ function multiple_shooting_initialize(prob, alg::MultipleShooting; odesolve_kwar
 
     # Assumes no initial guess for now
     start_prob = ODEProblem{isinplace(prob)}(f, u0, tspan, p)
-    sol = solve(start_prob, ode_alg; odesolve_kwargs..., kwargs..., saveat = nodes)
+    sol = solve(start_prob, ode_alg; odesolve_kwargs..., verbose, kwargs..., saveat = nodes)
 
     if SciMLBase.successful_retcode(sol)
         u_at_nodes[1:N] .= sol.u[1]
@@ -114,10 +202,10 @@ function multiple_shooting_initialize(prob, alg::MultipleShooting; odesolve_kwar
 end
 
 @views @inline function multiple_shooting_initialize(u_at_nodes_prev, prob, alg,
-    prev_nodes, nshoots, old_nshoots; odesolve_kwargs = (;), kwargs...)
+    prev_nodes, nshoots, old_nshoots, has_initial_guess; odesolve_kwargs = (;), kwargs...)
     @unpack f, bc, u0, tspan, p = prob
     nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
-    N = length(u0)
+    N = has_initial_guess ? length(first(u0)) : length(u0)
 
     u_at_nodes = similar(u_at_nodes_prev, N + nshoots * N)
     u_at_nodes[1:N] .= u_at_nodes_prev[1:N]
@@ -156,8 +244,7 @@ end
     return nodes, u_at_nodes
 end
 
-@inline function get_all_nshoots(alg::MultipleShooting)
-    @unpack nshoots, grid_coarsening = alg
+@inline function get_all_nshoots(grid_coarsening, nshoots)
     if grid_coarsening isa Bool
         !grid_coarsening && return [nshoots]
         update_fn = Base.Fix2(÷, 2)
@@ -176,3 +263,30 @@ end
     @assert !(1 in nshoots_vec)
     return nshoots_vec
 end
+
+function __generate_sparse_jacobian_prototype(::MultipleShooting, u0, bcresid_prototype,
+    N::Int, nshoots::Int)
+    # Assume dense BC
+    J_bc = similar(bcresid_prototype, length(bcresid_prototype), N * (nshoots + 1))
+
+    # Sparse for Stitching solution together
+    Is = Vector{UInt32}(undef, (N^2 + N) * nshoots)
+    Js = Vector{UInt32}(undef, (N^2 + N) * nshoots)
+
+    idx = 1
+    for i in 1:nshoots
+        for (i₁, i₂) in Iterators.product(1:N, 1:N)
+            Is[idx] = i₁ + ((i - 1) * N)
+            Js[idx] = i₂ + ((i - 1) * N)
+            idx += 1
+        end
+        Is[idx:(idx + N - 1)] .= (1:N) .+ ((i - 1) * N)
+        Js[idx:(idx + N - 1)] .= (1:N) .+ (i * N)
+        idx += N
+    end
+
+    J_c = sparse(adapt(parameterless_type(u0), Is), adapt(parameterless_type(u0), Js),
+        similar(u0, length(Is)))
+
+    return vcat(J_bc, J_c)
+end
diff --git a/src/solve/single_shooting.jl b/src/solve/single_shooting.jl
index 786ed3b5..2de87648 100644
--- a/src/solve/single_shooting.jl
+++ b/src/solve/single_shooting.jl
@@ -1,13 +1,20 @@
 function SciMLBase.__solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;),
-    nlsolve_kwargs = (;), kwargs...)
-    iip, bc, u0, u0_size = isinplace(prob), prob.f.bc, deepcopy(prob.u0), size(prob.u0)
+    nlsolve_kwargs = (;), verbose = true, kwargs...)
+    has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
+    has_initial_guess && verbose &&
+        @warn "Initial guess provided, but will be ignored for Shooting!"
+    u0 = has_initial_guess ? first(prob.u0) : prob.u0
+
+    iip, bc, u0, u0_size = isinplace(prob), prob.f.bc, deepcopy(u0), size(u0)
     resid_size = prob.f.bcresid_prototype === nothing ? u0_size :
                  size(prob.f.bcresid_prototype)
+
     loss_fn = if iip
         function loss!(resid, u0_, p)
             u0_internal = reshape(u0_, u0_size)
             tmp_prob = ODEProblem{iip}(prob.f, u0_internal, prob.tspan, p)
-            internal_sol = solve(tmp_prob, alg.ode_alg; odesolve_kwargs..., kwargs...)
+            internal_sol = solve(tmp_prob, alg.ode_alg; odesolve_kwargs..., verbose,
+                kwargs...)
             eval_bc_residual!(reshape(resid, resid_size), prob.problem_type, bc,
                 internal_sol, p)
             return nothing
@@ -16,15 +23,16 @@ function SciMLBase.__solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;)
         function loss(u0_, p)
             u0_internal = reshape(u0_, u0_size)
             tmp_prob = ODEProblem(prob.f, u0_internal, prob.tspan, p)
-            internal_sol = solve(tmp_prob, alg.ode_alg; odesolve_kwargs..., kwargs...)
+            internal_sol = solve(tmp_prob, alg.ode_alg; odesolve_kwargs..., verbose,
+                kwargs...)
             return vec(eval_bc_residual(prob.problem_type, bc, internal_sol, p))
         end
     end
     opt = solve(NonlinearProblem(NonlinearFunction{iip}(loss_fn; prob.f.jac_prototype,
                 resid_prototype = prob.f.bcresid_prototype), vec(u0), prob.p), alg.nlsolve;
-        nlsolve_kwargs..., kwargs...)
+        nlsolve_kwargs..., verbose, kwargs...)
     newprob = ODEProblem{iip}(prob.f, reshape(opt.u, u0_size), prob.tspan, prob.p)
-    sol = solve(newprob, alg.ode_alg; odesolve_kwargs..., kwargs...)
+    sol = solve(newprob, alg.ode_alg; odesolve_kwargs..., verbose, kwargs...)
 
     if !SciMLBase.successful_retcode(opt)
         return SciMLBase.solution_new_retcode(sol, ReturnCode.Failure)
diff --git a/src/utils.jl b/src/utils.jl
index a7a3cbad..636b100c 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -91,73 +91,6 @@ eval_bc_residual!(resid, _, bc!, sol, p, t) = bc!(resid, sol, p, t)
     return resid
 end
 
-# Generating Banded Matrix
-function construct_sparse_banded_jac_prototype(y, M, N)
-    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
-    Is = Vector{Int}(undef, l)
-    Js = Vector{Int}(undef, l)
-    idx = 1
-    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
-        Is[idx] = i
-        Js[idx] = j
-        idx += 1
-    end
-    col_colorvec = Vector{Int}(undef, M * N)
-    for i in eachindex(col_colorvec)
-        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-    row_colorvec = Vector{Int}(undef, M * (N - 1))
-    for i in eachindex(row_colorvec)
-        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-
-    y_ = similar(y, length(Is))
-    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
-            y_, M * (N - 1), M * N), col_colorvec, row_colorvec)
-end
-
-# Two Point Specialization
-function construct_sparse_banded_jac_prototype(y::ArrayPartition, M, N)
-    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
-    l_top = M * length(y.x[1].x[1])
-    l_bot = M * length(y.x[1].x[2])
-
-    Is = Vector{Int}(undef, l + l_top + l_bot)
-    Js = Vector{Int}(undef, l + l_top + l_bot)
-    idx = 1
-
-    for i in 1:length(y.x[1].x[1]), j in 1:M
-        Is[idx] = i
-        Js[idx] = j
-        idx += 1
-    end
-
-    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
-        Is[idx] = i + length(y.x[1].x[1])
-        Js[idx] = j
-        idx += 1
-    end
-
-    for i in 1:length(y.x[1].x[2]), j in 1:M
-        Is[idx] = i + length(y.x[1].x[1]) + M * (N - 1)
-        Js[idx] = j + M * (N - 1)
-        idx += 1
-    end
-
-    col_colorvec = Vector{Int}(undef, M * N)
-    for i in eachindex(col_colorvec)
-        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-    row_colorvec = Vector{Int}(undef, M * N)
-    for i in eachindex(row_colorvec)
-        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-
-    y_ = similar(y, length(Is))
-    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
-            y_, M * N, M * N), col_colorvec, row_colorvec)
-end
-
 # Helpers for IIP/OOP functions
 function __sparse_jacobian_cache(::Val{iip}, ad, sd, fn, fx, y) where {iip}
     if iip

From a4694e8caa57ae9b187b261ec6c248b8c7e7cfca Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Thu, 5 Oct 2023 18:13:05 -0400
Subject: [PATCH 18/28] xSetup colorvec construction

---
 src/solve/multiple_shooting.jl  | 32 ++++++++++++++++++++++----------
 test/misc/type_stability.jl     |  6 ++----
 test/shooting/orbital.jl        |  2 --
 test/shooting/shooting_tests.jl |  5 ++++-
 4 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index 0ab9f87c..a2660234 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -1,7 +1,8 @@
 # TODO: incorporate `initial_guess` similar to MIRK methods
 function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
     nlsolve_kwargs = (;), ensemblealg = EnsembleThreads(), verbose = true, kwargs...)
-    @unpack f, bc, tspan = prob
+    @unpack f, tspan = prob
+    bc = prob.f.bc
     has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
     _u0 = has_initial_guess ? first(prob.u0) : prob.u0
     N, u0_size, nshoots, iip = length(_u0), size(_u0), alg.nshoots, isinplace(prob)
@@ -143,12 +144,18 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
 
         resid_prototype = ArrayPartition(bcresid_prototype,
             similar(u_at_nodes, cur_nshoot * N))
-        residbc_prototype = DiffCache(bcresid_prototype, pickchunksize(cur_nshoot * N))
-        jac_prototype = __generate_sparse_jacobian_prototype(alg, _u0, bcresid_prototype, N,
+        residbc_prototype = DiffCache(bcresid_prototype,
+            pickchunksize((cur_nshoot + 1) * N))
+
+        J_bc = similar(bcresid_prototype, length(bcresid_prototype), N * (cur_nshoot + 1))
+        J_c, col_colorvec, row_colorvec = __generate_sparse_jacobian_prototype(alg, _u0, N,
             cur_nshoot)
+        jac_prototype = vcat(J_bc, J_c)
 
         loss_function! = NonlinearFunction{true}((args...) -> loss!(args..., cur_nshoot,
-                nodes); resid_prototype, jac = (args...) -> jac!(args..., cur_nshoot, nodes, residbc_prototype), jac_prototype)
+                nodes); resid_prototype,
+            jac = (args...) -> jac!(args..., cur_nshoot, nodes, residbc_prototype),
+            jac_prototype)
         nlprob = NonlinearProblem(loss_function!, u_at_nodes, prob.p)
         sol_nlsolve = solve(nlprob, alg.nlsolve; nlsolve_kwargs..., verbose, kwargs...)
         u_at_nodes = sol_nlsolve.u
@@ -264,11 +271,7 @@ end
     return nshoots_vec
 end
 
-function __generate_sparse_jacobian_prototype(::MultipleShooting, u0, bcresid_prototype,
-    N::Int, nshoots::Int)
-    # Assume dense BC
-    J_bc = similar(bcresid_prototype, length(bcresid_prototype), N * (nshoots + 1))
-
+function __generate_sparse_jacobian_prototype(::MultipleShooting, u0, N::Int, nshoots::Int)
     # Sparse for Stitching solution together
     Is = Vector{UInt32}(undef, (N^2 + N) * nshoots)
     Js = Vector{UInt32}(undef, (N^2 + N) * nshoots)
@@ -288,5 +291,14 @@ function __generate_sparse_jacobian_prototype(::MultipleShooting, u0, bcresid_pr
     J_c = sparse(adapt(parameterless_type(u0), Is), adapt(parameterless_type(u0), Js),
         similar(u0, length(Is)))
 
-    return vcat(J_bc, J_c)
+    col_colorvec = Vector{Int}(undef, N * (nshoots + 1))
+    for i in eachindex(col_colorvec)
+        col_colorvec[i] = mod1(i, 2 * N)
+    end
+    row_colorvec = Vector{Int}(undef, N * nshoots)
+    for i in eachindex(row_colorvec)
+        row_colorvec[i] = mod1(i, 2 * N)
+    end
+
+    return J_c, col_colorvec, row_colorvec
 end
diff --git a/test/misc/type_stability.jl b/test/misc/type_stability.jl
index 72eb5974..7e6cafa8 100644
--- a/test/misc/type_stability.jl
+++ b/test/misc/type_stability.jl
@@ -28,8 +28,7 @@ bcresid_prototype = (zeros(1), zeros(1))
     mpbvp_oop = BVProblem(f, bc, u0, tspan, p)
 
     @testset "Shooting Methods" begin
-        @test_broken SciMLBase.successful_retcode(@inferred solve(mpbvp_iip,
-            Shooting(Tsit5())))
+        @inferred solve(mpbvp_iip, Shooting(Tsit5()))
         @inferred solve(mpbvp_oop, Shooting(Tsit5()))
         @inferred solve(mpbvp_iip, MultipleShooting(5, Tsit5()))
         @inferred solve(mpbvp_oop, MultipleShooting(5, Tsit5()))
@@ -49,8 +48,7 @@ end
     tpbvp_oop = TwoPointBVProblem(f, twobc, u0, tspan, p)
 
     @testset "Shooting Methods" begin
-        @test_broken SciMLBase.successful_retcode(@inferred solve(tpbvp_iip,
-            Shooting(Tsit5())))
+        @inferred solve(tpbvp_iip, Shooting(Tsit5()))
         @inferred solve(tpbvp_oop, Shooting(Tsit5()))
         @inferred solve(tpbvp_iip, MultipleShooting(5, Tsit5()))
         @inferred solve(tpbvp_oop, MultipleShooting(5, Tsit5()))
diff --git a/test/shooting/orbital.jl b/test/shooting/orbital.jl
index e5513d56..f5b5d518 100644
--- a/test/shooting/orbital.jl
+++ b/test/shooting/orbital.jl
@@ -3,8 +3,6 @@ using BoundaryValueDiffEq, OrdinaryDiffEq, LinearAlgebra, Test
 
 @info "Testing Lambert's Problem"
 
-@info "Testing Lambert's Problem"
-
 y0 = [
     -4.7763169762853989E+06,
     -3.8386398704441520E+05,
diff --git a/test/shooting/shooting_tests.jl b/test/shooting/shooting_tests.jl
index 8dd71368..c998fa09 100644
--- a/test/shooting/shooting_tests.jl
+++ b/test/shooting/shooting_tests.jl
@@ -101,7 +101,10 @@ end
     resid_f = Array{ComplexF64}(undef, 2)
 
     nlsolve = NewtonRaphson(; autodiff = AutoFiniteDiff())
-    for solver in [Shooting(Tsit5(); nlsolve), MultipleShooting(10, Tsit5(); nlsolve)]
+    for solver in [Shooting(Tsit5(); nlsolve)]
+        # FIXME: Need to reenable MS. Currently it always uses ForwardDiff which is a
+        # regression and needs fixing
+        # , MultipleShooting(10, Tsit5(); nlsolve)]
         sol = solve(bvp, solver; abstol = 1e-13, reltol = 1e-13)
         @test SciMLBase.successful_retcode(sol)
         bc1!(resid_f, sol, nothing, sol.t)

From 5edb3fa4b8321b2ee1daeb092c2706c489dc7fa3 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 9 Oct 2023 11:18:00 -0400
Subject: [PATCH 19/28] Fix tests

---
 src/nlprob.jl                       | 253 ----------------------------
 src/solve/mirk.jl                   |   4 +-
 src/solve/multiple_shooting.jl      |   4 +-
 test/mirk/mirk_convergence_tests.jl |   2 -
 test/shooting/orbital.jl            |   2 -
 test/shooting/shooting_tests.jl     |   4 +-
 6 files changed, 6 insertions(+), 263 deletions(-)
 delete mode 100644 src/nlprob.jl

diff --git a/src/nlprob.jl b/src/nlprob.jl
deleted file mode 100644
index 27487dc9..00000000
--- a/src/nlprob.jl
+++ /dev/null
@@ -1,253 +0,0 @@
-function construct_nlproblem(cache::MIRKCache{iip}, y::AbstractVector) where {iip}
-    loss_bc = if iip
-        function loss_bc_internal!(resid::AbstractVector, u::AbstractVector, p = cache.p)
-            y_ = recursive_unflatten!(cache.y, u)
-            eval_bc_residual!(resid, cache.problem_type, cache.bc, y_, p, cache.mesh)
-            return resid
-        end
-    else
-        function loss_bc_internal(u::AbstractVector, p = cache.p)
-            y_ = recursive_unflatten!(cache.y, u)
-            return eval_bc_residual(cache.problem_type, cache.bc, y_, p, cache.mesh)
-        end
-    end
-
-    loss_collocation = if iip
-        function loss_collocation_internal!(resid::AbstractVector, u::AbstractVector,
-            p = cache.p)
-            y_ = recursive_unflatten!(cache.y, u)
-            resids = [get_tmp(r, u) for r in cache.residual[2:end]]
-            Φ!(resids, cache, y_, u, p)
-            recursive_flatten!(resid, resids)
-            return resid
-        end
-    else
-        function loss_collocation_internal(u::AbstractVector, p = cache.p)
-            y_ = recursive_unflatten!(cache.y, u)
-            resids = Φ(cache, y_, u, p)
-            return mapreduce(vec, vcat, resids)
-        end
-    end
-
-    loss = if !(cache.problem_type isa TwoPointBVProblem)
-        if iip
-            function loss_internal!(resid::AbstractVector, u::AbstractVector, p = cache.p)
-                y_ = recursive_unflatten!(cache.y, u)
-                resids = [get_tmp(r, u) for r in cache.residual]
-                eval_bc_residual!(resids[1], cache.problem_type, cache.bc, y_, p,
-                    cache.mesh)
-                Φ!(resids[2:end], cache, y_, u, p)
-                recursive_flatten!(resid, resids)
-                return resid
-            end
-        else
-            function loss_internal(u::AbstractVector, p = cache.p)
-                y_ = recursive_unflatten!(cache.y, u)
-                resid_bc = eval_bc_residual(cache.problem_type, cache.bc, y_, p, cache.mesh)
-                resid_co = Φ(cache, y_, u, p)
-                return vcat(resid_bc, mapreduce(vec, vcat, resid_co))
-            end
-        end
-    else
-        # Reordering for 2 point BVP
-        if iip
-            function loss_internal_2point!(resid::AbstractVector, u::AbstractVector,
-                p = cache.p)
-                y_ = recursive_unflatten!(cache.y, u)
-                resids = [get_tmp(r, u) for r in cache.residual]
-                eval_bc_residual!(resids[1], cache.problem_type, cache.bc, y_, p,
-                    cache.mesh)
-                Φ!(resids[2:end], cache, y_, u, p)
-                recursive_flatten_twopoint!(resid, resids)
-                return resid
-            end
-        else
-            function loss_internal_2point(u::AbstractVector, p = cache.p)
-                y_ = recursive_unflatten!(cache.y, u)
-                resid_bc = eval_bc_residual(cache.problem_type, cache.bc, y_, p, cache.mesh)
-                resid_co = Φ(cache, y_, u, p)
-                return vcat(resid_bc.x[1], mapreduce(vec, vcat, resid_co), resid_bc.x[2])
-            end
-        end
-    end
-
-    return generate_nlprob(cache, y, loss_bc, loss_collocation, loss, cache.problem_type)
-end
-
-function construct_sparse_banded_jac_prototype(y, M, N)
-    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
-    Is = Vector{Int}(undef, l)
-    Js = Vector{Int}(undef, l)
-    idx = 1
-    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
-        Is[idx] = i
-        Js[idx] = j
-        idx += 1
-    end
-    col_colorvec = Vector{Int}(undef, M * N)
-    for i in eachindex(col_colorvec)
-        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-    row_colorvec = Vector{Int}(undef, M * (N - 1))
-    for i in eachindex(row_colorvec)
-        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-
-    y_ = similar(y, length(Is))
-    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
-            y_, M * (N - 1), M * N), col_colorvec, row_colorvec)
-end
-
-# Two Point Specialization
-function construct_sparse_banded_jac_prototype(y::ArrayPartition, M, N)
-    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
-    l_top = M * length(y.x[1].x[1])
-    l_bot = M * length(y.x[1].x[2])
-
-    Is = Vector{Int}(undef, l + l_top + l_bot)
-    Js = Vector{Int}(undef, l + l_top + l_bot)
-    idx = 1
-
-    for i in 1:length(y.x[1].x[1]), j in 1:M
-        Is[idx] = i
-        Js[idx] = j
-        idx += 1
-    end
-
-    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
-        Is[idx] = i + length(y.x[1].x[1])
-        Js[idx] = j
-        idx += 1
-    end
-
-    for i in 1:length(y.x[1].x[2]), j in 1:M
-        Is[idx] = i + length(y.x[1].x[1]) + M * (N - 1)
-        Js[idx] = j + M * (N - 1)
-        idx += 1
-    end
-
-    col_colorvec = Vector{Int}(undef, M * N)
-    for i in eachindex(col_colorvec)
-        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-    row_colorvec = Vector{Int}(undef, M * N)
-    for i in eachindex(row_colorvec)
-        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-
-    y_ = similar(y, length(Is))
-    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
-            y_, M * N, M * N), col_colorvec, row_colorvec)
-end
-
-function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
-    _) where {iip}
-    @unpack nlsolve, jac_alg = cache.alg
-    N = length(cache.mesh)
-
-    resid_bc = cache.prob.f.bcresid_prototype === nothing ? similar(y, cache.M) :
-               cache.prob.f.bcresid_prototype
-    resid_collocation = similar(y, cache.M * (N - 1))
-
-    sd_bc = jac_alg.bc_diffmode isa AbstractSparseADType ? SymbolicsSparsityDetection() :
-            NoSparsityDetection()
-
-    if iip
-        cache_bc = sparse_jacobian_cache(jac_alg.bc_diffmode, sd_bc, loss_bc, resid_bc, y)
-    else
-        cache_bc = sparse_jacobian_cache(jac_alg.bc_diffmode, sd_bc, loss_bc, y;
-            fx = resid_bc)
-    end
-
-    sd_collocation = if jac_alg.collocation_diffmode isa AbstractSparseADType
-        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(y, cache.M, N)
-        PrecomputedJacobianColorvec(; jac_prototype = Jₛ, row_colorvec = rvec,
-            col_colorvec = cvec)
-    else
-        NoSparsityDetection()
-    end
-
-    if iip
-        cache_collocation = sparse_jacobian_cache(jac_alg.collocation_diffmode,
-            sd_collocation, loss_collocation, resid_collocation, y)
-    else
-        cache_collocation = sparse_jacobian_cache(jac_alg.collocation_diffmode,
-            sd_collocation, loss_collocation, y; fx = resid_collocation)
-    end
-
-    jac_prototype = vcat(init_jacobian(cache_bc),
-        jac_alg.collocation_diffmode isa AbstractSparseADType ? Jₛ :
-        init_jacobian(cache_collocation))
-
-    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
-    #       mismatch for ForwardDiff
-    jac = if iip
-        function jac_internal!(J, x, p)
-            sparse_jacobian!(@view(J[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
-                loss_bc, resid_bc, x)
-            sparse_jacobian!(@view(J[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
-                cache_collocation, loss_collocation, resid_collocation, x)
-            return J
-        end
-    else
-        J_ = jac_prototype
-        function jac_internal(x, p)
-            sparse_jacobian!(@view(J_[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
-                loss_bc, x)
-            sparse_jacobian!(@view(J_[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
-                cache_collocation, loss_collocation, x)
-            return J_
-        end
-    end
-
-    return NonlinearProblem(NonlinearFunction{iip}(loss; jac, jac_prototype), y, cache.p)
-end
-
-function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
-    ::TwoPointBVProblem) where {iip}
-    @unpack nlsolve, jac_alg = cache.alg
-    N = length(cache.mesh)
-
-    if !iip && cache.prob.f.bcresid_prototype === nothing
-        y_ = recursive_unflatten!(cache.y, y)
-        resid_ = cache.bc[1](y_[1], cache.p), cache.bc[2](y_[end], cache.p)
-        resid = ArrayPartition(ArrayPartition(resid_), similar(y, cache.M * (N - 1)))
-    else
-        resid = ArrayPartition(cache.prob.f.bcresid_prototype,
-            similar(y, cache.M * (N - 1)))
-    end
-
-    sd = if jac_alg.diffmode isa AbstractSparseADType
-        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(resid, cache.M, N)
-        PrecomputedJacobianColorvec(; jac_prototype = Jₛ, row_colorvec = rvec,
-            col_colorvec = cvec)
-    else
-        NoSparsityDetection()
-    end
-
-    if iip
-        diffcache = sparse_jacobian_cache(jac_alg.diffmode, sd, loss, resid, y)
-    else
-        diffcache = sparse_jacobian_cache(jac_alg.diffmode, sd, loss, y; fx = resid)
-    end
-
-    jac_prototype = jac_alg.diffmode isa AbstractSparseADType ? Jₛ :
-                    init_jacobian(diffcache)
-
-    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
-    #       mismatch for ForwardDiff
-    jac = if iip
-        function jac_internal!(J, x, p)
-            sparse_jacobian!(J, jac_alg.diffmode, diffcache, loss, resid, x)
-            return J
-        end
-    else
-        J_ = jac_prototype
-        function jac_internal(x, p)
-            sparse_jacobian!(J_, jac_alg.diffmode, diffcache, loss, x)
-            return J_
-        end
-    end
-
-    return NonlinearProblem(NonlinearFunction{iip}(loss; jac, jac_prototype), y, cache.p)
-end
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index f7788d80..2c90e3bb 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -322,8 +322,8 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
 
     if !iip && cache.prob.f.bcresid_prototype === nothing
         y_ = recursive_unflatten!(cache.y, y)
-        resid_ = cache.bc((y_[1], y_[end]), cache.p)
-        resid = ArrayPartition(ArrayPartition(resid_), similar(y, cache.M * (N - 1)))
+        resid_ = ArrayPartition(cache.bc[1](y_[1], cache.p), cache.bc[2](y_[end], cache.p))
+        resid = ArrayPartition(resid_, similar(y, cache.M * (N - 1)))
     else
         resid = ArrayPartition(cache.prob.f.bcresid_prototype,
             similar(y, cache.M * (N - 1)))
diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index a2660234..9eca8aec 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -168,7 +168,7 @@ end
 
 function multiple_shooting_initialize(prob, alg::MultipleShooting, has_initial_guess,
     nshoots; odesolve_kwargs = (;), verbose = true, kwargs...)
-    @unpack f, bc, u0, tspan, p = prob
+    @unpack f, u0, tspan, p = prob
     @unpack ode_alg = alg
 
     nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
@@ -210,7 +210,7 @@ end
 
 @views @inline function multiple_shooting_initialize(u_at_nodes_prev, prob, alg,
     prev_nodes, nshoots, old_nshoots, has_initial_guess; odesolve_kwargs = (;), kwargs...)
-    @unpack f, bc, u0, tspan, p = prob
+    @unpack f, u0, tspan, p = prob
     nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
     N = has_initial_guess ? length(first(u0)) : length(u0)
 
diff --git a/test/mirk/mirk_convergence_tests.jl b/test/mirk/mirk_convergence_tests.jl
index 363c85a9..54707c75 100644
--- a/test/mirk/mirk_convergence_tests.jl
+++ b/test/mirk/mirk_convergence_tests.jl
@@ -75,8 +75,6 @@ testTol = 0.2
 affineTol = 1e-2
 dts = 1 .// 2 .^ (3:-1:1)
 
-@info "Collocation method (MIRK)"
-
 @testset "Affineness" begin
     @testset "Problem: $i" for i in (1, 2, 5, 6)
         prob = probArr[i]
diff --git a/test/shooting/orbital.jl b/test/shooting/orbital.jl
index f5b5d518..ec40ea6a 100644
--- a/test/shooting/orbital.jl
+++ b/test/shooting/orbital.jl
@@ -1,8 +1,6 @@
 # Lambert's Problem
 using BoundaryValueDiffEq, OrdinaryDiffEq, LinearAlgebra, Test
 
-@info "Testing Lambert's Problem"
-
 y0 = [
     -4.7763169762853989E+06,
     -3.8386398704441520E+05,
diff --git a/test/shooting/shooting_tests.jl b/test/shooting/shooting_tests.jl
index c998fa09..8a8dbc4a 100644
--- a/test/shooting/shooting_tests.jl
+++ b/test/shooting/shooting_tests.jl
@@ -51,8 +51,8 @@ using BoundaryValueDiffEq, LinearAlgebra, OrdinaryDiffEq, Test
     end
 
     # Inplace
-    bc2_a!(resid, ua, p) = (resid[1] = ua[1])
-    bc2_b!(resid, ub, p) = (resid[1] = ub[1] - 1)
+    bc2a!(resid, ua, p) = (resid[1] = ua[1])
+    bc2b!(resid, ub, p) = (resid[1] = ub[1] - 1)
 
     bvp3 = TwoPointBVProblem(f1!, (bc2a!, bc2b!), u0, tspan;
         bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1)))

From df3909c301ef4417fa77f999e749833ad18d358c Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 9 Oct 2023 13:52:39 -0400
Subject: [PATCH 20/28] Update the TwoPointBVP code

---
 src/solve/multiple_shooting.jl |  1 -
 test/misc/type_stability.jl    | 13 ++++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index 9eca8aec..4267db67 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -1,4 +1,3 @@
-# TODO: incorporate `initial_guess` similar to MIRK methods
 function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
     nlsolve_kwargs = (;), ensemblealg = EnsembleThreads(), verbose = true, kwargs...)
     @unpack f, tspan = prob
diff --git a/test/misc/type_stability.jl b/test/misc/type_stability.jl
index 7e6cafa8..ef20e632 100644
--- a/test/misc/type_stability.jl
+++ b/test/misc/type_stability.jl
@@ -11,11 +11,10 @@ function bc!(res, sol, p, t)
     res[1] = sol[1][1] - 1
     res[2] = sol[end][2] - 2
 end
-twobc((ua, ub), p) = ([ua[1] - 1], [ub[2] - 2])
-function twobc!((resa, resb), (ua, ub), p)
-    resa[1] = ua[1] - 1
-    resb[1] = ub[2] - 2
-end
+twobc_a(ua, p) = [ua[1] - 1]
+twobc_b(ub, p) = [ub[2] - 2]
+twobc_a!(resa, ua, p) = (resa[1] = ua[1] - 1)
+twobc_b!(resb, ub, p) = (resb[1] = ub[2] - 2)
 
 u0 = Float64[0, 0]
 tspan = (0.0, 1.0)
@@ -44,8 +43,8 @@ end
 
 # Two-Point BVP
 @testset "Two-Point BVP" begin
-    tpbvp_iip = TwoPointBVProblem(f!, twobc!, u0, tspan, p; bcresid_prototype)
-    tpbvp_oop = TwoPointBVProblem(f, twobc, u0, tspan, p)
+    tpbvp_iip = TwoPointBVProblem(f!, (twobc_a!, twobc_b!), u0, tspan, p; bcresid_prototype)
+    tpbvp_oop = TwoPointBVProblem(f, (twobc_a, twobc_b), u0, tspan, p)
 
     @testset "Shooting Methods" begin
         @inferred solve(tpbvp_iip, Shooting(Tsit5()))

From 97d0f5db13f8a5bdcf429931f5e4f86ee07572be Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 9 Oct 2023 18:32:44 -0400
Subject: [PATCH 21/28] Initial Working version with colorring

---
 src/solve/multiple_shooting.jl | 127 +++++++++++++++------------------
 1 file changed, 59 insertions(+), 68 deletions(-)

diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index 4267db67..f4efbf09 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -14,12 +14,11 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
             @warn "Initial guess length != `nshoots + 1`! Adapting to `nshoots = $(nshoots)`"
     end
 
-    @views function loss!(resid::ArrayPartition, us, p, cur_nshoots, nodes)
+    # We will use colored AD for this parts!
+    @views function solve_internal_odes!(resid_nodes, us, p, cur_nshoots, nodes)
         ts_ = Vector{Vector{typeof(first(tspan))}}(undef, cur_nshoots)
         us_ = Vector{Vector{typeof(us)}}(undef, cur_nshoots)
 
-        resid_bc, resid_nodes = resid.x[1], resid.x[2]
-
         function prob_func(probᵢ, i, repeat)
             return remake(probᵢ; u0 = reshape(us[((i - 1) * N + 1):(i * N)], u0_size),
                 tspan = (nodes[i], nodes[i + 1]))
@@ -43,11 +42,23 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
             verbose, kwargs..., save_end = true, save_everystep = false,
             trajectories = cur_nshoots)
 
-        _us = reduce(vcat, ensemble_sol.u.us)
-        _ts = reduce(vcat, ensemble_sol.u.ts)
+        return reduce(vcat, ensemble_sol.u.us), reduce(vcat, ensemble_sol.u.ts)
+    end
+
+    @views function compute_bc_residual!(resid_bc, us, p, cur_nshoots, nodes,
+        resid_nodes::Union{Nothing, MaybeDiffCache} = nothing)
+        if resid_nodes === nothing
+            _resid_nodes = similar(us, cur_nshoots * N)  # This might be Dual based on `us`
+        else
+            _resid_nodes = get_tmp(resid_nodes, us)
+        end
+
+        # NOTE: We need to recompute this to correctly propagate the dual numbers / gradients
+        _us, _ts = solve_internal_odes!(_resid_nodes, us, p, cur_nshoots, nodes)
 
         # Boundary conditions
         # Builds an ODESolution object to keep the framework for bc(,,) consistent
+        odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
         total_solution = SciMLBase.build_solution(odeprob, alg.ode_alg, _ts, _us)
 
         if iip
@@ -56,73 +67,39 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
             resid_bc .= eval_bc_residual(prob.problem_type, bc, total_solution, p)
         end
 
-        return resid
+        return resid_bc
     end
 
-    @views function jac!(J::AbstractMatrix, us, p, cur_nshoots, nodes, resid_bc)
-        J_bc = J[1:N, :]
-        J_c = J[(N + 1):end, :]
-
-        # Threads.@threads :static
-        # FIXME: Threading here leads to segfaults
-        for i in 1:cur_nshoots
-            uᵢ = us[((i - 1) * N + 1):(i * N)]
-            idx = ((i - 1) * N + 1):(i * N)
-            probᵢ = ODEProblem{iip}(f, reshape(uᵢ, u0_size), (nodes[i], nodes[i + 1]), p)
-            function solve_func(u₀)
-                sJ = solve(probᵢ, alg.ode_alg; u0 = u₀, odesolve_kwargs...,
-                    kwargs..., save_end = true, save_everystep = false, saveat = ())
-                return -last(sJ)
-            end
-            # @show sum(J_c[idx, idx]), sum(J_c[idx, idx .+ N])
-            ForwardDiff.jacobian!(J_c[idx, idx], solve_func, uᵢ)
-            J_c′ = J_c[idx, idx .+ N]
-            J_c′[diagind(J_c′)] .= 1
-            # @show sum(J_c[idx, idx]), sum(J_c[idx, idx .+ N])
-        end
-
-        function evaluate_boundary_condition(us)
-            ts_ = Vector{Vector{typeof(first(tspan))}}(undef, cur_nshoots)
-            us_ = Vector{Vector{typeof(us)}}(undef, cur_nshoots)
-
-            function prob_func(probᵢ, i, repeat)
-                return remake(probᵢ; u0 = reshape(us[((i - 1) * N + 1):(i * N)], u0_size),
-                    tspan = (nodes[i], nodes[i + 1]))
-            end
+    @views function loss!(resid::ArrayPartition, us, p, cur_nshoots, nodes)
+        resid_bc, resid_nodes = resid.x[1], resid.x[2]
 
-            function reduction(u, data, I)
-                for i in I
-                    u.us[i] = data[i].u
-                    u.ts[i] = data[i].t
-                end
-                return (u, false)
-            end
+        _us, _ts = solve_internal_odes!(resid_nodes, us, p, cur_nshoots, nodes)
 
-            odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
+        # Boundary conditions
+        # Builds an ODESolution object to keep the framework for bc(,,) consistent
+        odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
+        total_solution = SciMLBase.build_solution(odeprob, alg.ode_alg, _ts, _us)
 
-            ensemble_prob = EnsembleProblem(odeprob; prob_func, reduction,
-                safetycopy = false, u_init = (; us = us_, ts = ts_))
-            ensemble_sol = solve(ensemble_prob, alg.ode_alg, ensemblealg;
-                odesolve_kwargs..., kwargs..., save_end = true, save_everystep = false,
-                trajectories = cur_nshoots)
+        if iip
+            eval_bc_residual!(resid_bc, prob.problem_type, bc, total_solution, p)
+        else
+            resid_bc .= eval_bc_residual(prob.problem_type, bc, total_solution, p)
+        end
 
-            _us = reduce(vcat, ensemble_sol.u.us)
-            _ts = reduce(vcat, ensemble_sol.u.ts)
+        return resid
+    end
 
-            # Boundary conditions
-            # Builds an ODESolution object to keep the framework for bc(,,) consistent
-            total_solution = SciMLBase.build_solution(odeprob, alg.ode_alg, _ts, _us)
+    @views function jac!(J::AbstractMatrix, us, p, resid_bc, resid_nodes::MaybeDiffCache,
+        ode_jac_cache, bc_jac_cache, ode_fn, bc_fn, cur_nshoot, nodes)
+        J_bc = J[1:N, :]
+        J_c = J[(N + 1):end, :]
 
-            if iip
-                _resid_bc = get_tmp(resid_bc, us)
-                eval_bc_residual!(_resid_bc, prob.problem_type, bc, total_solution, p)
-                return _resid_bc
-            else
-                return eval_bc_residual(prob.problem_type, bc, total_solution, p)
-            end
-        end
+        # FIXME: External control
+        sparse_jacobian!(J_c, AutoSparseForwardDiff(), ode_jac_cache, ode_fn,
+            resid_nodes.du, us)
 
-        ForwardDiff.jacobian!(J_bc, evaluate_boundary_condition, us)
+        # For BC
+        sparse_jacobian!(J_bc, AutoForwardDiff(), bc_jac_cache, bc_fn, resid_bc, us)
 
         return nothing
     end
@@ -145,16 +122,30 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
             similar(u_at_nodes, cur_nshoot * N))
         residbc_prototype = DiffCache(bcresid_prototype,
             pickchunksize((cur_nshoot + 1) * N))
+        resid_nodes = maybe_allocate_diffcache(resid_prototype.x[2],
+            pickchunksize((cur_nshoot + 1) * N),
+            AutoForwardDiff())
 
-        J_bc = similar(bcresid_prototype, length(bcresid_prototype), N * (cur_nshoot + 1))
         J_c, col_colorvec, row_colorvec = __generate_sparse_jacobian_prototype(alg, _u0, N,
             cur_nshoot)
-        jac_prototype = vcat(J_bc, J_c)
+
+        ode_fn = (du, u) -> solve_internal_odes!(du, u, prob.p, cur_nshoot, nodes)
+        ode_jac_cache = sparse_jacobian_cache(AutoSparseForwardDiff(),
+            PrecomputedJacobianColorvec(; jac_prototype = J_c, col_colorvec, row_colorvec),
+            ode_fn, copy(resid_prototype.x[2]), u_at_nodes)
+
+        bc_fn = (du, u) -> compute_bc_residual!(du, u, prob.p, cur_nshoot,
+            nodes, resid_nodes)
+        bc_jac_cache = sparse_jacobian_cache(AutoForwardDiff(),
+            NoSparsityDetection(), bc_fn, copy(resid_prototype.x[1]), u_at_nodes)
+
+        jac_prototype = vcat(init_jacobian(bc_jac_cache), init_jacobian(ode_jac_cache))
+
+        jac_fn = (J, us, p) -> jac!(J, us, p, resid_prototype.x[1], resid_nodes,
+            ode_jac_cache, bc_jac_cache, ode_fn, bc_fn, cur_nshoot, nodes)
 
         loss_function! = NonlinearFunction{true}((args...) -> loss!(args..., cur_nshoot,
-                nodes); resid_prototype,
-            jac = (args...) -> jac!(args..., cur_nshoot, nodes, residbc_prototype),
-            jac_prototype)
+                nodes); resid_prototype, jac = jac_fn, jac_prototype)
         nlprob = NonlinearProblem(loss_function!, u_at_nodes, prob.p)
         sol_nlsolve = solve(nlprob, alg.nlsolve; nlsolve_kwargs..., verbose, kwargs...)
         u_at_nodes = sol_nlsolve.u

From 8eedf1fff500d332eef00a7b7387e22e1901ec85 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Mon, 9 Oct 2023 22:28:45 -0400
Subject: [PATCH 22/28] Generalize the Jacobian Computation for
 MultipleShooting

---
 src/BoundaryValueDiffEq.jl      |  2 +-
 src/algorithms.jl               | 39 ++++++++++++----------------
 src/solve/mirk.jl               | 12 ++++-----
 src/solve/multiple_shooting.jl  | 39 ++++++++++++++--------------
 src/types.jl                    | 46 ++++++++++++++++-----------------
 test/shooting/orbital.jl        |  6 +++--
 test/shooting/shooting_tests.jl | 10 +++----
 7 files changed, 75 insertions(+), 79 deletions(-)

diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index b6daf8dc..f63fe51a 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -39,7 +39,7 @@ end
 
 export Shooting, MultipleShooting
 export MIRK2, MIRK3, MIRK4, MIRK5, MIRK6
-export MIRKJacobianComputationAlgorithm
+export MIRKJacobianComputationAlgorithm, BVPJacobianAlgorithm
 # From ODEInterface.jl
 export BVPM2, BVPSOL
 
diff --git a/src/algorithms.jl b/src/algorithms.jl
index e4701537..faafae4e 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -1,13 +1,9 @@
-const DEFAULT_NLSOLVE_SHOOTING = NewtonRaphson()
-const DEFAULT_NLSOLVE_MIRK = NewtonRaphson()
-const DEFAULT_JACOBIAN_ALGORITHM_MIRK = MIRKJacobianComputationAlgorithm()
-
 # Algorithms
 abstract type BoundaryValueDiffEqAlgorithm <: SciMLBase.AbstractBVPAlgorithm end
 abstract type AbstractMIRK <: BoundaryValueDiffEqAlgorithm end
 
 """
-    Shooting(ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOTING)
+    Shooting(ode_alg; nlsolve = NewtonRaphson())
 
 Single shooting method, reduces BVP to an initial value problem and solves the IVP.
 """
@@ -16,24 +12,25 @@ struct Shooting{O, N} <: BoundaryValueDiffEqAlgorithm
     nlsolve::N
 end
 
-Shooting(ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOTING) = Shooting(ode_alg, nlsolve)
+Shooting(ode_alg; nlsolve = NewtonRaphson()) = Shooting(ode_alg, nlsolve)
 
 """
-    MultipleShooting(nshoots::Int, ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOTING,
+    MultipleShooting(nshoots::Int, ode_alg; nlsolve = NewtonRaphson(),
         grid_coarsening = true)
 
 Multiple Shooting method, reduces BVP to an initial value problem and solves the IVP.
 Significantly more stable than Single Shooting.
 """
-@concrete struct MultipleShooting
+@concrete struct MultipleShooting{J <: BVPJacobianAlgorithm}
     ode_alg
     nlsolve
+    jac_alg::J
     nshoots::Int
     grid_coarsening
 end
 
-function MultipleShooting(nshoots::Int, ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOTING,
-    grid_coarsening = true)
+function MultipleShooting(nshoots::Int, ode_alg; nlsolve = NewtonRaphson(),
+    grid_coarsening = true, jac_alg = BVPJacobianAlgorithm())
     @assert grid_coarsening isa Bool || grid_coarsening isa Function ||
             grid_coarsening isa AbstractVector{<:Integer} ||
             grid_coarsening isa NTuple{N, <:Integer} where {N}
@@ -42,7 +39,7 @@ function MultipleShooting(nshoots::Int, ode_alg; nlsolve = DEFAULT_NLSOLVE_SHOOT
         sort!(grid_coarsening; rev = true)
         @assert all(grid_coarsening .> 0) && 1 ∉ grid_coarsening
     end
-    return MultipleShooting(ode_alg, nlsolve, nshoots, grid_coarsening)
+    return MultipleShooting(ode_alg, nlsolve, jac_alg, nshoots, grid_coarsening)
 end
 
 for order in (2, 3, 4, 5, 6)
@@ -50,29 +47,27 @@ for order in (2, 3, 4, 5, 6)
 
     @eval begin
         """
-            $($alg)(; nlsolve = BoundaryValueDiffEq.DEFAULT_NLSOLVE_MIRK,
-                jac_alg = BoundaryValueDiffEq.DEFAULT_JACOBIAN_ALGORITHM_MIRK)
+            $($alg)(; nlsolve = NewtonRaphson(), jac_alg = BVPJacobianAlgorithm())
 
         $($order)th order Monotonic Implicit Runge Kutta method, with Newton Raphson nonlinear solver as default.
 
         ## References
 
         @article{Enright1996RungeKuttaSW,
-        title={Runge-Kutta Software with Defect Control for Boundary Value ODEs},
-        author={Wayne H. Enright and Paul H. Muir},
-        journal={SIAM J. Sci. Comput.},
-        year={1996},
-        volume={17},
-        pages={479-497}
+            title={Runge-Kutta Software with Defect Control for Boundary Value ODEs},
+            author={Wayne H. Enright and Paul H. Muir},
+            journal={SIAM J. Sci. Comput.},
+            year={1996},
+            volume={17},
+            pages={479-497}
         }
         """
-        struct $(alg){N, J <: MIRKJacobianComputationAlgorithm} <: AbstractMIRK
+        struct $(alg){N, J <: BVPJacobianAlgorithm} <: AbstractMIRK
             nlsolve::N
             jac_alg::J
         end
 
-        function $(alg)(; nlsolve = DEFAULT_NLSOLVE_MIRK,
-            jac_alg = DEFAULT_JACOBIAN_ALGORITHM_MIRK)
+        function $(alg)(; nlsolve = NewtonRaphson(), jac_alg = BVPJacobianAlgorithm())
             return $(alg)(nlsolve, jac_alg)
         end
     end
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index 2c90e3bb..6137d4d3 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -276,7 +276,7 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
     cache_bc = __sparse_jacobian_cache(Val(iip), jac_alg.bc_diffmode, sd_bc, loss_bc,
         resid_bc, y)
 
-    sd_collocation = if jac_alg.collocation_diffmode isa AbstractSparseADType
+    sd_collocation = if jac_alg.nonbc_diffmode isa AbstractSparseADType
         Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(cache, y, cache.M, N)
         PrecomputedJacobianColorvec(; jac_prototype = Jₛ, row_colorvec = rvec,
             col_colorvec = cvec)
@@ -284,12 +284,10 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
         NoSparsityDetection()
     end
 
-    cache_collocation = __sparse_jacobian_cache(Val(iip), jac_alg.collocation_diffmode,
+    cache_collocation = __sparse_jacobian_cache(Val(iip), jac_alg.nonbc_diffmode,
         sd_collocation, loss_collocation, resid_collocation, y)
 
-    jac_prototype = vcat(init_jacobian(cache_bc),
-        jac_alg.collocation_diffmode isa AbstractSparseADType ? Jₛ :
-        init_jacobian(cache_collocation))
+    jac_prototype = vcat(init_jacobian(cache_bc), init_jacobian(cache_collocation))
 
     # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
     #       mismatch for ForwardDiff
@@ -297,7 +295,7 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
         function jac_internal!(J, x, p)
             sparse_jacobian!(@view(J[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
                 loss_bc, resid_bc, x)
-            sparse_jacobian!(@view(J[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
+            sparse_jacobian!(@view(J[(cache.M + 1):end, :]), jac_alg.nonbc_diffmode,
                 cache_collocation, loss_collocation, resid_collocation, x)
             return J
         end
@@ -306,7 +304,7 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
         function jac_internal(x, p)
             sparse_jacobian!(@view(J_[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
                 loss_bc, x)
-            sparse_jacobian!(@view(J_[(cache.M + 1):end, :]), jac_alg.collocation_diffmode,
+            sparse_jacobian!(@view(J_[(cache.M + 1):end, :]), jac_alg.nonbc_diffmode,
                 cache_collocation, loss_collocation, x)
             return J_
         end
diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index f4efbf09..c70e0fae 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -14,7 +14,7 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
             @warn "Initial guess length != `nshoots + 1`! Adapting to `nshoots = $(nshoots)`"
     end
 
-    # We will use colored AD for this parts!
+    # We will use colored AD for this part!
     @views function solve_internal_odes!(resid_nodes, us, p, cur_nshoots, nodes)
         ts_ = Vector{Vector{typeof(first(tspan))}}(undef, cur_nshoots)
         us_ = Vector{Vector{typeof(us)}}(undef, cur_nshoots)
@@ -94,12 +94,11 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         J_bc = J[1:N, :]
         J_c = J[(N + 1):end, :]
 
-        # FIXME: External control
-        sparse_jacobian!(J_c, AutoSparseForwardDiff(), ode_jac_cache, ode_fn,
+        sparse_jacobian!(J_c, alg.jac_alg.nonbc_diffmode, ode_jac_cache, ode_fn,
             resid_nodes.du, us)
 
         # For BC
-        sparse_jacobian!(J_bc, AutoForwardDiff(), bc_jac_cache, bc_fn, resid_bc, us)
+        sparse_jacobian!(J_bc, alg.jac_alg.bc_diffmode, bc_jac_cache, bc_fn, resid_bc, us)
 
         return nothing
     end
@@ -120,28 +119,30 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
 
         resid_prototype = ArrayPartition(bcresid_prototype,
             similar(u_at_nodes, cur_nshoot * N))
-        residbc_prototype = DiffCache(bcresid_prototype,
-            pickchunksize((cur_nshoot + 1) * N))
         resid_nodes = maybe_allocate_diffcache(resid_prototype.x[2],
-            pickchunksize((cur_nshoot + 1) * N),
-            AutoForwardDiff())
-
-        J_c, col_colorvec, row_colorvec = __generate_sparse_jacobian_prototype(alg, _u0, N,
-            cur_nshoot)
+            pickchunksize((cur_nshoot + 1) * N), alg.jac_alg.bc_diffmode)
 
         ode_fn = (du, u) -> solve_internal_odes!(du, u, prob.p, cur_nshoot, nodes)
-        ode_jac_cache = sparse_jacobian_cache(AutoSparseForwardDiff(),
-            PrecomputedJacobianColorvec(; jac_prototype = J_c, col_colorvec, row_colorvec),
-            ode_fn, copy(resid_prototype.x[2]), u_at_nodes)
+        sd_ode = if alg.jac_alg.nonbc_diffmode isa AbstractSparseADType
+            J_c, col_colorvec, row_colorvec = __generate_sparse_jacobian_prototype(alg, _u0,
+                N, cur_nshoot)
+            PrecomputedJacobianColorvec(; jac_prototype = J_c, row_colorvec, col_colorvec)
+        else
+            NoSparsityDetection()
+        end
+        ode_jac_cache = sparse_jacobian_cache(alg.jac_alg.nonbc_diffmode, sd_ode,
+            ode_fn, similar(u_at_nodes, cur_nshoot * N), u_at_nodes)
 
         bc_fn = (du, u) -> compute_bc_residual!(du, u, prob.p, cur_nshoot,
             nodes, resid_nodes)
-        bc_jac_cache = sparse_jacobian_cache(AutoForwardDiff(),
-            NoSparsityDetection(), bc_fn, copy(resid_prototype.x[1]), u_at_nodes)
+        sd_bc = alg.jac_alg.bc_diffmode isa AbstractSparseADType ?
+                SymbolicsSparsityDetection() : NoSparsityDetection()
+        bc_jac_cache = sparse_jacobian_cache(alg.jac_alg.bc_diffmode,
+            sd_bc, bc_fn, similar(bcresid_prototype), u_at_nodes)
 
         jac_prototype = vcat(init_jacobian(bc_jac_cache), init_jacobian(ode_jac_cache))
 
-        jac_fn = (J, us, p) -> jac!(J, us, p, resid_prototype.x[1], resid_nodes,
+        jac_fn = (J, us, p) -> jac!(J, us, p, similar(bcresid_prototype), resid_nodes,
             ode_jac_cache, bc_jac_cache, ode_fn, bc_fn, cur_nshoot, nodes)
 
         loss_function! = NonlinearFunction{true}((args...) -> loss!(args..., cur_nshoot,
@@ -152,8 +153,8 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
     end
 
     single_shooting_prob = remake(prob; u0 = reshape(u_at_nodes[1:N], u0_size))
-    return SciMLBase.__solve(single_shooting_prob, Shooting(alg.ode_alg; alg.nlsolve);
-        odesolve_kwargs, nlsolve_kwargs, verbose, kwargs...)
+    return solve(single_shooting_prob, Shooting(alg.ode_alg; alg.nlsolve); odesolve_kwargs,
+        nlsolve_kwargs, verbose, kwargs...)
 end
 
 function multiple_shooting_initialize(prob, alg::MultipleShooting, has_initial_guess,
diff --git a/src/types.jl b/src/types.jl
index ecbe1ae8..0c5beea8 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -33,40 +33,40 @@ end
 @truncate_stacktrace MIRKInterpTableau 1
 
 # Sparsity Detection
-@concrete struct MIRKJacobianComputationAlgorithm
+@concrete struct BVPJacobianAlgorithm
     bc_diffmode
-    collocation_diffmode
+    nonbc_diffmode
     diffmode
 end
 
-function MIRKJacobianComputationAlgorithm(diffmode = missing;
-    collocation_diffmode = missing, bc_diffmode = missing)
+function BVPJacobianAlgorithm(diffmode = missing; nonbc_diffmode = missing,
+    bc_diffmode = missing)
     if diffmode !== missing
-        @assert collocation_diffmode === missing && bc_diffmode === missing
-        return MIRKJacobianComputationAlgorithm(diffmode, diffmode, diffmode)
+        @assert nonbc_diffmode === missing && bc_diffmode === missing
+        return BVPJacobianAlgorithm(diffmode, diffmode, diffmode)
     else
-        @static if VERSION < v"1.9"
-            diffmode = AutoForwardDiff()
-            bc_diffmode = bc_diffmode === missing ? AutoForwardDiff() : bc_diffmode
-            collocation_diffmode = collocation_diffmode === missing ?
-                                   AutoForwardDiff() : collocation_diffmode
-        else
-            diffmode = AutoSparseForwardDiff()
-            bc_diffmode = bc_diffmode === missing ? AutoForwardDiff() : bc_diffmode
-            collocation_diffmode = collocation_diffmode === missing ?
-                                   AutoSparseForwardDiff() : collocation_diffmode
-        end
-        return MIRKJacobianComputationAlgorithm(bc_diffmode, collocation_diffmode,
-            collocation_diffmode)
+        diffmode = AutoSparseForwardDiff()
+        bc_diffmode = bc_diffmode === missing ? AutoForwardDiff() : bc_diffmode
+        nonbc_diffmode = nonbc_diffmode === missing ?
+                         AutoSparseForwardDiff() : nonbc_diffmode
+        return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, nonbc_diffmode)
     end
 end
 
+function MIRKJacobianComputationAlgorithm(diffmode = missing;
+    collocation_diffmode = missing, bc_diffmode = missing)
+    Base.depwarn("`MIRKJacobianComputationAlgorithm` has been deprecated in favor of \
+        `BVPJacobianAlgorithm`. Replace `collocation_diffmode` with `nonbc_diffmode",
+        :MIRKJacobianComputationAlgorithm)
+    return BVPJacobianAlgorithm(diffmode; nonbc_diffmode = collocation_diffmode,
+        bc_diffmode)
+end
+
 __needs_diffcache(::Union{AutoForwardDiff, AutoSparseForwardDiff}) = true
 __needs_diffcache(_) = false
-function __needs_diffcache(jac_alg::MIRKJacobianComputationAlgorithm)
-    return __needs_diffcache(jac_alg.diffmode) ||
-           __needs_diffcache(jac_alg.bc_diffmode) ||
-           __needs_diffcache(jac_alg.collocation_diffmode)
+function __needs_diffcache(jac_alg::BVPJacobianAlgorithm)
+    return __needs_diffcache(jac_alg.diffmode) || __needs_diffcache(jac_alg.bc_diffmode) ||
+           __needs_diffcache(jac_alg.nonbc_diffmode)
 end
 
 # We don't need to always allocate a DiffCache. This works around that.
diff --git a/test/shooting/orbital.jl b/test/shooting/orbital.jl
index ec40ea6a..65f644fc 100644
--- a/test/shooting/orbital.jl
+++ b/test/shooting/orbital.jl
@@ -77,7 +77,8 @@ for autodiff in (AutoForwardDiff(), AutoFiniteDiff(; fdtype = Val(:central)),
     cur_bc!(resid_f, sol, nothing, sol.t)
     @test norm(resid_f, Inf) < TestTol
 
-    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve); abstol = 1e-6,
+    jac_alg = BVPJacobianAlgorithm(; nonbc_diffmode = autodiff)
+    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve, jac_alg); abstol = 1e-6,
         reltol = 1e-6)
     @test SciMLBase.successful_retcode(sol)
     cur_bc!(resid_f, sol, nothing, sol.t)
@@ -97,7 +98,8 @@ for autodiff in (AutoForwardDiff(), AutoFiniteDiff(; fdtype = Val(:central)),
     cur_bc_2point_b!(resid_f_2p[2], sol(t1), nothing)
     @test norm(reduce(vcat, resid_f_2p), Inf) < TestTol
 
-    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve); abstol = 1e-6,
+    jac_alg = BVPJacobianAlgorithm(; nonbc_diffmode = autodiff)
+    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve, jac_alg); abstol = 1e-6,
         reltol = 1e-6)
     @test SciMLBase.successful_retcode(sol)
     cur_bc_2point_a!(resid_f_2p[1], sol(t0), nothing)
diff --git a/test/shooting/shooting_tests.jl b/test/shooting/shooting_tests.jl
index 8a8dbc4a..c7cd1df8 100644
--- a/test/shooting/shooting_tests.jl
+++ b/test/shooting/shooting_tests.jl
@@ -76,7 +76,7 @@ using BoundaryValueDiffEq, LinearAlgebra, OrdinaryDiffEq, Test
         sol = solve(bvp4, solver; abstol = 1e-13, reltol = 1e-13)
         @test SciMLBase.successful_retcode(sol)
         resid_f = reduce(vcat, (bc2a(sol(tspan[1]), nothing), bc2b(sol(tspan[2]), nothing)))
-        @test norm(resid_f) < 1e-12
+        @test norm(resid_f) < 1e-11
     end
 end
 
@@ -101,10 +101,10 @@ end
     resid_f = Array{ComplexF64}(undef, 2)
 
     nlsolve = NewtonRaphson(; autodiff = AutoFiniteDiff())
-    for solver in [Shooting(Tsit5(); nlsolve)]
-        # FIXME: Need to reenable MS. Currently it always uses ForwardDiff which is a
-        # regression and needs fixing
-        # , MultipleShooting(10, Tsit5(); nlsolve)]
+    jac_alg = BVPJacobianAlgorithm(; bc_diffmode = AutoFiniteDiff(),
+        nonbc_diffmode = AutoSparseFiniteDiff())
+    for solver in [Shooting(Tsit5(); nlsolve),
+        MultipleShooting(10, Tsit5(); nlsolve, jac_alg)]
         sol = solve(bvp, solver; abstol = 1e-13, reltol = 1e-13)
         @test SciMLBase.successful_retcode(sol)
         bc1!(resid_f, sol, nothing, sol.t)

From a5b10cc14736e627305d14c01c27da07a9c2f632 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 10 Oct 2023 16:57:16 -0400
Subject: [PATCH 23/28] Two Point Specialization for MultipleShooting

---
 src/algorithms.jl               |  15 ++
 src/solve/mirk.jl               |   1 +
 src/solve/multiple_shooting.jl  | 281 ++++++++++++++++++++++++++------
 src/types.jl                    |  16 +-
 test/runtests.jl                |   3 +
 test/shooting/ray_tracing.jl    | 137 ++++++++++++++++
 test/shooting/shooting_tests.jl | 115 -------------
 7 files changed, 403 insertions(+), 165 deletions(-)
 create mode 100644 test/shooting/ray_tracing.jl

diff --git a/src/algorithms.jl b/src/algorithms.jl
index faafae4e..28fe6d26 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -29,6 +29,21 @@ Significantly more stable than Single Shooting.
     grid_coarsening
 end
 
+function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, prob,
+    alg::MultipleShooting)
+    diffmode = jac_alg.diffmode === nothing ? AutoSparseForwardDiff() : jac_alg.diffmode
+    bc_diffmode = if jac_alg.bc_diffmode === nothing
+        prob.problem_type isa TwoPointBVProblem ? AutoSparseForwardDiff() :
+        AutoForwardDiff()
+    else
+        jac_alg.bc_diffmode
+    end
+    nonbc_diffmode = jac_alg.nonbc_diffmode === nothing ? AutoSparseForwardDiff() :
+                     jac_alg.nonbc_diffmode
+
+    return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, diffmode)
+end
+
 function MultipleShooting(nshoots::Int, ode_alg; nlsolve = NewtonRaphson(),
     grid_coarsening = true, jac_alg = BVPJacobianAlgorithm())
     @assert grid_coarsening isa Bool || grid_coarsening isa Function ||
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index 6137d4d3..18c0e94d 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -1,6 +1,7 @@
 function SciMLBase.__init(prob::BVProblem, alg::AbstractMIRK; dt = 0.0,
     abstol = 1e-3, adaptive = true, kwargs...)
     has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
+    @set! alg.jac_alg = concrete_jacobian_algorithm(alg.jac_alg, prob, alg)
     iip = isinplace(prob)
     (T, M, n) = if has_initial_guess
         # If user provided a vector of initial guesses
diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index c70e0fae..6f98fe97 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -3,10 +3,24 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
     @unpack f, tspan = prob
     bc = prob.f.bc
     has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
+    @set! alg.jac_alg = concrete_jacobian_algorithm(alg.jac_alg, prob, alg)
     _u0 = has_initial_guess ? first(prob.u0) : prob.u0
     N, u0_size, nshoots, iip = length(_u0), size(_u0), alg.nshoots, isinplace(prob)
-    bcresid_prototype = prob.f.bcresid_prototype === nothing ? similar(_u0) :
-                        prob.f.bcresid_prototype
+    if prob.f.bcresid_prototype === nothing
+        if prob.problem_type isa TwoPointBVProblem
+            # This can only happen if the problem is !iip
+            bcresid_prototype = ArrayPartition(bc[1](_u0, prob.p), bc[2](_u0, prob.p))
+        else
+            bcresid_prototype = similar(_u0)
+        end
+    else
+        bcresid_prototype = prob.f.bcresid_prototype
+    end
+
+    if prob.problem_type isa TwoPointBVProblem
+        resida_len = length(bcresid_prototype.x[1])
+        residb_len = length(bcresid_prototype.x[2])
+    end
 
     if has_initial_guess && length(prob.u0) != nshoots + 1
         nshoots = length(prob.u0) - 1
@@ -45,29 +59,58 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         return reduce(vcat, ensemble_sol.u.us), reduce(vcat, ensemble_sol.u.ts)
     end
 
-    @views function compute_bc_residual!(resid_bc, us, p, cur_nshoots, nodes,
-        resid_nodes::Union{Nothing, MaybeDiffCache} = nothing)
-        if resid_nodes === nothing
-            _resid_nodes = similar(us, cur_nshoots * N)  # This might be Dual based on `us`
-        else
-            _resid_nodes = get_tmp(resid_nodes, us)
+    compute_bc_residual! = if prob.problem_type isa TwoPointBVProblem
+        @views function compute_bc_residual_tp!(resid_bc, us::ArrayPartition, p,
+            cur_nshoots, nodes, resid_nodes::Union{Nothing, MaybeDiffCache} = nothing)
+            ua, ub0 = us.x
+            # Just Recompute the last ODE Solution
+            lastodeprob = ODEProblem{iip}(f, reshape(ub0, u0_size),
+                (nodes[end - 1], nodes[end]), p)
+            sol_ode_last = solve(lastodeprob, alg.ode_alg; odesolve_kwargs..., verbose,
+                kwargs..., save_everystep = false, saveat = (), save_end = true)
+            ub = vec(sol_ode_last.u[end])
+
+            resid_bc_a, resid_bc_b = if resid_bc isa ArrayPartition
+                resid_bc.x
+            else
+                resid_bc[1:resida_len], resid_bc[(resida_len + 1):end]
+            end
+
+            if iip
+                bc[1](resid_bc_a, ua, p)
+                bc[2](resid_bc_b, ub, p)
+            else
+                resid_bc_a .= bc[1](ua, p)
+                resid_bc_b .= bc[2](ub, p)
+            end
+
+            return resid_bc
         end
+    else
+        @views function compute_bc_residual_mp!(resid_bc, us, p, cur_nshoots, nodes,
+            resid_nodes::Union{Nothing, MaybeDiffCache} = nothing)
+            if resid_nodes === nothing
+                _resid_nodes = similar(us, cur_nshoots * N)  # This might be Dual based on `us`
+            else
+                _resid_nodes = get_tmp(resid_nodes, us)
+            end
 
-        # NOTE: We need to recompute this to correctly propagate the dual numbers / gradients
-        _us, _ts = solve_internal_odes!(_resid_nodes, us, p, cur_nshoots, nodes)
+            # NOTE: We need to recompute this to correctly propagate the dual numbers / gradients
+            _us, _ts = solve_internal_odes!(_resid_nodes, us, p, cur_nshoots, nodes)
 
-        # Boundary conditions
-        # Builds an ODESolution object to keep the framework for bc(,,) consistent
-        odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
-        total_solution = SciMLBase.build_solution(odeprob, alg.ode_alg, _ts, _us)
+            # Boundary conditions
+            # Builds an ODESolution object to keep the framework for bc(,,) consistent
+            odeprob = ODEProblem{iip}(f, reshape(us[1:N], u0_size), tspan, p)
+            total_solution = SciMLBase.build_solution(odeprob, alg.ode_alg, _ts, _us)
 
-        if iip
-            eval_bc_residual!(resid_bc, prob.problem_type, bc, total_solution, p)
-        else
-            resid_bc .= eval_bc_residual(prob.problem_type, bc, total_solution, p)
-        end
+            if iip
+                eval_bc_residual!(resid_bc, prob.problem_type, bc, total_solution, p)
+            else
+                resid_bc .= eval_bc_residual(prob.problem_type, bc, total_solution, p)
+            end
 
-        return resid_bc
+            return resid_bc
+        end
     end
 
     @views function loss!(resid::ArrayPartition, us, p, cur_nshoots, nodes)
@@ -89,18 +132,44 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         return resid
     end
 
-    @views function jac!(J::AbstractMatrix, us, p, resid_bc, resid_nodes::MaybeDiffCache,
-        ode_jac_cache, bc_jac_cache, ode_fn, bc_fn, cur_nshoot, nodes)
-        J_bc = J[1:N, :]
-        J_c = J[(N + 1):end, :]
+    jac! = if prob.problem_type isa TwoPointBVProblem
+        @views function jac_tp!(J::AbstractMatrix, us, p, resid_bc,
+            resid_nodes::MaybeDiffCache, ode_jac_cache, bc_jac_cache::Tuple, ode_fn, bc_fn,
+            cur_nshoot, nodes)
+            J isa SparseArrays.SparseMatrixCSC || fill!(J, 0)
+            J_bc = J[1:N, :]
+            J_c = J[(N + 1):end, :]
+
+            sparse_jacobian!(J_c, alg.jac_alg.nonbc_diffmode, ode_jac_cache, ode_fn,
+                resid_nodes.du, us)
+
+            # For BC
+            bc_jac_cache′, J_bc′ = bc_jac_cache
+            sparse_jacobian!(J_bc′, alg.jac_alg.bc_diffmode, bc_jac_cache′, bc_fn,
+                resid_bc, ArrayPartition(us[1:N], us[(end - N + 1):end]))
+            resida, residb = resid_bc.x
+            J_bc[1:length(resida), 1:N] .= J_bc′[1:length(resida), 1:N]
+            idxᵢ = (length(resida) + 1):(length(resida) + length(residb))
+            J_bc[idxᵢ, (end - N + 1):end] .= J_bc′[idxᵢ, (end - N + 1):end]
+
+            return nothing
+        end
+    else
+        @views function jac_mp!(J::AbstractMatrix, us, p, resid_bc,
+            resid_nodes::MaybeDiffCache, ode_jac_cache, bc_jac_cache, ode_fn, bc_fn,
+            cur_nshoot, nodes)
+            J_bc = J[1:N, :]
+            J_c = J[(N + 1):end, :]
 
-        sparse_jacobian!(J_c, alg.jac_alg.nonbc_diffmode, ode_jac_cache, ode_fn,
-            resid_nodes.du, us)
+            sparse_jacobian!(J_c, alg.jac_alg.nonbc_diffmode, ode_jac_cache, ode_fn,
+                resid_nodes.du, us)
 
-        # For BC
-        sparse_jacobian!(J_bc, alg.jac_alg.bc_diffmode, bc_jac_cache, bc_fn, resid_bc, us)
+            # For BC
+            sparse_jacobian!(J_bc, alg.jac_alg.bc_diffmode, bc_jac_cache, bc_fn, resid_bc,
+                us)
 
-        return nothing
+            return nothing
+        end
     end
 
     # This gets all the nshoots except the final SingleShooting case
@@ -122,10 +191,19 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         resid_nodes = maybe_allocate_diffcache(resid_prototype.x[2],
             pickchunksize((cur_nshoot + 1) * N), alg.jac_alg.bc_diffmode)
 
+        if prob.problem_type isa TwoPointBVProblem
+            if alg.jac_alg.nonbc_diffmode isa AbstractSparseADType ||
+               alg.jac_alg.bc_diffmode isa AbstractSparseADType
+                J_full, col_colorvec, row_colorvec, (J_c, J_bc_partial), col_colorvec_bc, row_colorvec_bc, = __generate_sparse_jacobian_prototype(alg,
+                    prob.problem_type, bcresid_prototype, _u0, N, cur_nshoot)
+            end
+        elseif alg.jac_alg.nonbc_diffmode isa AbstractSparseADType
+            J_c, col_colorvec, row_colorvec, = __generate_sparse_jacobian_prototype(alg,
+                prob.problem_type, bcresid_prototype, _u0, N, cur_nshoot)
+        end
+
         ode_fn = (du, u) -> solve_internal_odes!(du, u, prob.p, cur_nshoot, nodes)
         sd_ode = if alg.jac_alg.nonbc_diffmode isa AbstractSparseADType
-            J_c, col_colorvec, row_colorvec = __generate_sparse_jacobian_prototype(alg, _u0,
-                N, cur_nshoot)
             PrecomputedJacobianColorvec(; jac_prototype = J_c, row_colorvec, col_colorvec)
         else
             NoSparsityDetection()
@@ -133,15 +211,37 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         ode_jac_cache = sparse_jacobian_cache(alg.jac_alg.nonbc_diffmode, sd_ode,
             ode_fn, similar(u_at_nodes, cur_nshoot * N), u_at_nodes)
 
-        bc_fn = (du, u) -> compute_bc_residual!(du, u, prob.p, cur_nshoot,
-            nodes, resid_nodes)
-        sd_bc = alg.jac_alg.bc_diffmode isa AbstractSparseADType ?
-                SymbolicsSparsityDetection() : NoSparsityDetection()
-        bc_jac_cache = sparse_jacobian_cache(alg.jac_alg.bc_diffmode,
-            sd_bc, bc_fn, similar(bcresid_prototype), u_at_nodes)
-
-        jac_prototype = vcat(init_jacobian(bc_jac_cache), init_jacobian(ode_jac_cache))
+        bc_fn = (du, u) -> compute_bc_residual!(du, u, prob.p, cur_nshoot, nodes,
+            resid_nodes)
+        if prob.problem_type isa TwoPointBVProblem
+            sd_bc = if alg.jac_alg.bc_diffmode isa AbstractSparseADType
+                PrecomputedJacobianColorvec(; jac_prototype = J_bc_partial,
+                    row_colorvec = row_colorvec_bc, col_colorvec = col_colorvec_bc)
+            else
+                NoSparsityDetection()
+            end
+            bc_jac_cache_partial = sparse_jacobian_cache(alg.jac_alg.bc_diffmode, sd_bc,
+                bc_fn, similar(bcresid_prototype),
+                ArrayPartition(@view(u_at_nodes[1:N]),
+                    @view(u_at_nodes[(end - N + 1):end])))
+
+            bc_jac_cache = (bc_jac_cache_partial, init_jacobian(bc_jac_cache_partial))
+
+            jac_prototype = if alg.jac_alg.bc_diffmode isa AbstractSparseADType ||
+                               alg.jac_alg.nonbc_diffmode isa AbstractSparseADType
+                J_full
+            else
+                # Dense AD being used!
+                fill!(similar(u_at_nodes, length(resid_prototype), length(u_at_nodes)), 0)
+            end
+        else
+            sd_bc = alg.jac_alg.bc_diffmode isa AbstractSparseADType ?
+                    SymbolicsSparsityDetection() : NoSparsityDetection()
+            bc_jac_cache = sparse_jacobian_cache(alg.jac_alg.bc_diffmode,
+                sd_bc, bc_fn, similar(bcresid_prototype), u_at_nodes)
 
+            jac_prototype = vcat(init_jacobian(bc_jac_cache), init_jacobian(ode_jac_cache))
+        end
         jac_fn = (J, us, p) -> jac!(J, us, p, similar(bcresid_prototype), resid_nodes,
             ode_jac_cache, bc_jac_cache, ode_fn, bc_fn, cur_nshoot, nodes)
 
@@ -157,7 +257,7 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
         nlsolve_kwargs, verbose, kwargs...)
 end
 
-function multiple_shooting_initialize(prob, alg::MultipleShooting, has_initial_guess,
+@views function multiple_shooting_initialize(prob, alg::MultipleShooting, has_initial_guess,
     nshoots; odesolve_kwargs = (;), verbose = true, kwargs...)
     @unpack f, u0, tspan, p = prob
     @unpack ode_alg = alg
@@ -199,8 +299,8 @@ function multiple_shooting_initialize(prob, alg::MultipleShooting, has_initial_g
     return nodes, u_at_nodes
 end
 
-@views @inline function multiple_shooting_initialize(u_at_nodes_prev, prob, alg,
-    prev_nodes, nshoots, old_nshoots, has_initial_guess; odesolve_kwargs = (;), kwargs...)
+@views function multiple_shooting_initialize(u_at_nodes_prev, prob, alg, prev_nodes,
+    nshoots, old_nshoots, has_initial_guess; odesolve_kwargs = (;), kwargs...)
     @unpack f, u0, tspan, p = prob
     nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
     N = has_initial_guess ? length(first(u0)) : length(u0)
@@ -262,10 +362,26 @@ end
     return nshoots_vec
 end
 
-function __generate_sparse_jacobian_prototype(::MultipleShooting, u0, N::Int, nshoots::Int)
+"""
+    __generate_sparse_jacobian_prototype(::MultipleShooting, _, _, u0, N::Int,
+        nshoots::Int)
+    __generate_sparse_jacobian_prototype(::MultipleShooting, ::TwoPointBVProblem,
+        bcresid_prototype, u0, N::Int, nshoots::Int)
+
+For a Multi-Point Problem, returns the Jacobian Prototype for the Sparse Part. For a Two-
+Point Problem, returns the Jacobian Prototype for the Entire Jacobian.
+
+Also returns the column and row color vectors for the Sparse Non-BC Part Jacobian.
+
+Returns the column and row color vectors for the Sparse BC Part Jacobian (if computed).
+
+Also returns the indices `Is` and `Js` used to construct the Sparse Jacobian.
+"""
+function __generate_sparse_jacobian_prototype(::MultipleShooting, _, _, u0, N::Int,
+    nshoots::Int)
     # Sparse for Stitching solution together
-    Is = Vector{UInt32}(undef, (N^2 + N) * nshoots)
-    Js = Vector{UInt32}(undef, (N^2 + N) * nshoots)
+    Is = Vector{Int64}(undef, (N^2 + N) * nshoots)
+    Js = Vector{Int64}(undef, (N^2 + N) * nshoots)
 
     idx = 1
     for i in 1:nshoots
@@ -291,5 +407,78 @@ function __generate_sparse_jacobian_prototype(::MultipleShooting, u0, N::Int, ns
         row_colorvec[i] = mod1(i, 2 * N)
     end
 
-    return J_c, col_colorvec, row_colorvec
+    return J_c, col_colorvec, row_colorvec, (J_c, nothing), nothing, nothing, Is, Js
+end
+
+function __generate_sparse_jacobian_prototype(alg::MultipleShooting, ::TwoPointBVProblem,
+    bcresid_prototype, u0, N::Int, nshoots::Int)
+    resida, residb = bcresid_prototype.x
+    # Sparse for Stitching solution together
+    L = N * length(resida) + (N^2 + N) * nshoots + N * length(residb)
+    Is = Vector{Int64}(undef, L)
+    Js = Vector{Int64}(undef, L)
+
+    idx = 1
+    for row in 1:length(resida)
+        for j in 1:N
+            Is[idx] = row
+            Js[idx] = j
+            idx += 1
+        end
+    end
+    for row in 1:length(residb)
+        for j in 1:N
+            Is[idx] = length(resida) + row
+            Js[idx] = j + (nshoots * N)
+            idx += 1
+        end
+    end
+    J_c, col_colorvec, row_colorvec, _, _, _, Is′, Js′ = __generate_sparse_jacobian_prototype(alg,
+        nothing, nothing, u0, N, nshoots)
+    for (i, j) in zip(Is′, Js′)
+        Is[idx] = length(resida) + length(residb) + i
+        Js[idx] = j
+        idx += 1
+    end
+
+    col_colorvec_bc = Vector{Int}(undef, 2N)
+    row_colorvec_bc = Vector{Int}(undef, length(resida) + length(residb))
+    col_colorvec_bc[1:N] .= 1:N
+    col_colorvec_bc[(N + 1):end] .= 1:N
+    for i in 1:max(length(resida), length(residb))
+        if i ≤ length(resida)
+            row_colorvec_bc[i] = i
+        end
+        if i ≤ length(residb)
+            row_colorvec_bc[i + length(resida)] = i
+        end
+    end
+
+    J = sparse(adapt(parameterless_type(u0), Is), adapt(parameterless_type(u0), Js),
+        similar(u0, length(Is)))
+
+    Is_bc = Vector{Int64}(undef, N^2)
+    Js_bc = Vector{Int64}(undef, N^2)
+    idx = 1
+    for i in 1:length(resida)
+        for j in 1:N
+            Is_bc[idx] = i
+            Js_bc[idx] = j
+            idx += 1
+        end
+    end
+    for i in 1:length(residb)
+        for j in 1:N
+            Is_bc[idx] = i + length(resida)
+            Js_bc[idx] = j + N
+            idx += 1
+        end
+    end
+
+    J_bc = sparse(adapt(parameterless_type(u0), Is_bc),
+        adapt(parameterless_type(u0), Js_bc),
+        similar(u0, length(Is_bc)))
+
+    return (J, col_colorvec, row_colorvec, (J_c, J_bc), col_colorvec_bc, row_colorvec_bc,
+        Is, Js)
 end
diff --git a/src/types.jl b/src/types.jl
index 0c5beea8..bf3ff899 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -45,14 +45,22 @@ function BVPJacobianAlgorithm(diffmode = missing; nonbc_diffmode = missing,
         @assert nonbc_diffmode === missing && bc_diffmode === missing
         return BVPJacobianAlgorithm(diffmode, diffmode, diffmode)
     else
-        diffmode = AutoSparseForwardDiff()
-        bc_diffmode = bc_diffmode === missing ? AutoForwardDiff() : bc_diffmode
-        nonbc_diffmode = nonbc_diffmode === missing ?
-                         AutoSparseForwardDiff() : nonbc_diffmode
+        diffmode = nothing
+        bc_diffmode = bc_diffmode === missing ? nothing : bc_diffmode
+        nonbc_diffmode = nonbc_diffmode === missing ? nothing : nonbc_diffmode
         return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, nonbc_diffmode)
     end
 end
 
+function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, prob, alg)
+    diffmode = jac_alg.diffmode === nothing ? AutoSparseForwardDiff() : jac_alg.diffmode
+    bc_diffmode = jac_alg.bc_diffmode === nothing ? AutoForwardDiff() : jac_alg.bc_diffmode
+    nonbc_diffmode = jac_alg.nonbc_diffmode === nothing ? AutoSparseForwardDiff() :
+                     jac_alg.nonbc_diffmode
+
+    return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, diffmode)
+end
+
 function MIRKJacobianComputationAlgorithm(diffmode = missing;
     collocation_diffmode = missing, bc_diffmode = missing)
     Base.depwarn("`MIRKJacobianComputationAlgorithm` has been deprecated in favor of \
diff --git a/test/runtests.jl b/test/runtests.jl
index 0d404a56..7779d163 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -5,6 +5,9 @@ using Test, SafeTestsets
         @time @safetestset "Shooting Tests" begin
             include("shooting/shooting_tests.jl")
         end
+        @time @safetestset "Ray Tracing BVP" begin
+            include("shooting/ray_tracing.jl")
+        end
         @time @safetestset "Orbital" begin
             include("shooting/orbital.jl")
         end
diff --git a/test/shooting/ray_tracing.jl b/test/shooting/ray_tracing.jl
new file mode 100644
index 00000000..e4d55781
--- /dev/null
+++ b/test/shooting/ray_tracing.jl
@@ -0,0 +1,137 @@
+using BoundaryValueDiffEq, LinearAlgebra, OrdinaryDiffEq, Test
+
+@inline v(x, y, z, p) = 1 / (4 + cos(p[1] * x) + sin(p[2] * y) - cos(p[3] * z))
+@inline ux(x, y, z, p) = -p[1] * sin(p[1] * x)
+@inline uy(x, y, z, p) = p[2] * cos(p[2] * y)
+@inline uz(x, y, z, p) = p[3] * sin(p[3] * z)
+
+function ray_tracing(u, p, t)
+    du = similar(u)
+    ray_tracing!(du, u, p, t)
+    return du
+end
+
+function ray_tracing!(du, u, p, t)
+    x, y, z, ξ, η, ζ, T, S = u
+
+    nu = v(x, y, z, p) # Velocity of a sound wave, function of space;
+    μx = ux(x, y, z, p) # ∂(slowness)/∂x, function of space
+    μy = uy(x, y, z, p) # ∂(slowness)/∂y, function of space
+    μz = uz(x, y, z, p) # ∂(slowness)/∂z, function of space
+
+    du[1] = S * nu * ξ
+    du[2] = S * nu * η
+    du[3] = S * nu * ζ
+
+    du[4] = S * μx
+    du[5] = S * μy
+    du[6] = S * μz
+
+    du[7] = S / nu
+    du[8] = 0
+
+    return nothing
+end
+
+function ray_tracing_bc(sol, p, t)
+    res = similar(first(sol))
+    ray_tracing_bc!(res, sol, p, t)
+    return res
+end
+
+function ray_tracing_bc!(res, sol, p, t)
+    ua = sol(0.0)
+    ub = sol(1.0)
+    nu = v(ua[1], ua[2], ua[3], p) # Velocity of a sound wave, function of space;
+
+    res[1] = ua[1] - p[4]
+    res[2] = ua[2] - p[5]
+    res[3] = ua[3] - p[6]
+    res[4] = ua[7]      # T(0) = 0
+    res[5] = ua[4]^2 + ua[5]^2 + ua[6]^2 - 1 / nu^2
+    res[6] = ub[1] - p[7]
+    res[7] = ub[2] - p[8]
+    res[8] = ub[3] - p[9]
+    return nothing
+end
+
+function ray_tracing_bc_a(ua, p)
+    resa = similar(ua, 5)
+    ray_tracing_bc_a!(resa, ua, p)
+    return resa
+end
+
+function ray_tracing_bc_a!(resa, ua, p)
+    nu = v(ua[1], ua[2], ua[3], p) # Velocity of a sound wave, function of space;
+
+    resa[1] = ua[1] - p[4]
+    resa[2] = ua[2] - p[5]
+    resa[3] = ua[3] - p[5]
+    resa[4] = ua[7]
+    resa[5] = ua[4]^2 + ua[5]^2 + ua[6]^2 - 1 / nu^2
+
+    return nothing
+end
+
+function ray_tracing_bc_b(ub, p)
+    resb = similar(ub, 3)
+    ray_tracing_bc_b!(resb, ub, p)
+    return resb
+end
+
+function ray_tracing_bc_b!(resb, ub, p)
+    resb[1] = ub[1] - p[7]
+    resb[2] = ub[2] - p[8]
+    resb[3] = ub[3] - p[9]
+    return nothing
+end
+
+p = [0, 1, 2, 0, 0, 0, 4, 3, 2.0]
+
+dx = p[7] - p[4]
+dy = p[8] - p[5]
+dz = p[9] - p[6]
+
+u0 = zeros(8)
+u0[1:3] .= 0 # position
+u0[4] = dx / v(p[4], p[5], p[6], p)
+u0[5] = dy / v(p[4], p[5], p[6], p)
+u0[6] = dz / v(p[4], p[5], p[6], p)
+u0[8] = 1
+
+tspan = (0.0, 1.0)
+
+prob_oop = BVProblem{false}(ray_tracing, ray_tracing_bc, u0, tspan, p)
+prob_iip = BVProblem{true}(ray_tracing!, ray_tracing_bc!, u0, tspan, p)
+prob_tp_oop = TwoPointBVProblem{false}(ray_tracing, (ray_tracing_bc_a, ray_tracing_bc_b),
+    u0, tspan, p)
+prob_tp_iip = TwoPointBVProblem{true}(ray_tracing!, (ray_tracing_bc_a!, ray_tracing_bc_b!),
+    u0, tspan, p; bcresid_prototype = (zeros(5), zeros(3)))
+
+alg_sp = MultipleShooting(10, AutoVern7(Rodas4P()); nlsolve = NewtonRaphson(),
+    grid_coarsening = Base.Fix2(div, 3),
+    jac_alg = BVPJacobianAlgorithm(; bc_diffmode = AutoForwardDiff(),
+        nonbc_diffmode = AutoSparseForwardDiff()))
+alg_dense = MultipleShooting(10, AutoVern7(Rodas4P()); nlsolve = NewtonRaphson(),
+    grid_coarsening = Base.Fix2(div, 3),
+    jac_alg = BVPJacobianAlgorithm(; bc_diffmode = AutoForwardDiff(),
+        nonbc_diffmode = AutoForwardDiff()))
+alg_default = MultipleShooting(10, AutoVern7(Rodas4P()); nlsolve = NewtonRaphson(),
+    grid_coarsening = Base.Fix2(div, 3))
+
+for (prob, alg) in Iterators.product((prob_oop, prob_iip, prob_tp_oop, prob_tp_iip),
+    (alg_sp, alg_dense, alg_default))
+    sol = solve(prob, alg; abstol = 1e-9, reltol = 1e-9, maxiters = 1000)
+    @test SciMLBase.successful_retcode(sol.retcode)
+
+    if prob.problem_type isa TwoPointBVProblem
+        resida, residb = zeros(5), zeros(3)
+        ray_tracing_bc_a!(resida, sol.u[1], p)
+        ray_tracing_bc_b!(residb, sol.u[end], p)
+        @test norm(vcat(resida, residb), 2) < 5e-5
+    else
+        resid = zeros(8)
+        ray_tracing_bc!(resid, sol, p, sol.t)
+        @test norm(resid, 2) < 5e-5
+    end
+end
diff --git a/test/shooting/shooting_tests.jl b/test/shooting/shooting_tests.jl
index c7cd1df8..b2d96bb7 100644
--- a/test/shooting/shooting_tests.jl
+++ b/test/shooting/shooting_tests.jl
@@ -164,118 +164,3 @@ end
     bc_flow!(resid, sol_msshooting, p, sol_msshooting.t)
     @test norm(resid, Inf) < 1e-6
 end
-
-@testset "Ray Tracing BVP" begin
-    # Example 1.7 from
-    # "Numerical Solution to Boundary Value Problems for Ordinary Differential equations",
-    # 'Ascher, Mattheij, Russell'
-
-    # Earthquake happens at known position (x0, y0, z0)
-    # Earthquake is detected by seismograph at (xi, yi, zi)
-
-    # Find the path taken by the first ray that reached seismograph.
-    # i.e. given some velocity field finds the quickest path from
-    # (x0,y0,z0) to (xi, yi, zi)
-
-    # du = [dx, dy, dz, dξ, dη, dζ, dT, dS]
-    # du = [x, y, z, ξ, η, ζ, T, S]
-    # p = [ν(x,y,z), μ_x(x,y,z), μ_y(x,y,z), μ_z(x,y,z)]
-    @inline v(x, y, z, p) = 1 / (4 + cos(p[1] * x) + sin(p[2] * y) - cos(p[3] * z))
-    @inline ux(x, y, z, p) = -p[1] * sin(p[1] * x)
-    @inline uy(x, y, z, p) = p[2] * cos(p[2] * y)
-    @inline uz(x, y, z, p) = p[3] * sin(p[3] * z)
-
-    function ray_tracing(u, p, t)
-        du = similar(u)
-        ray_tracing!(du, u, p, t)
-        return du
-    end
-
-    function ray_tracing!(du, u, p, t)
-        x, y, z, ξ, η, ζ, T, S = u
-
-        nu = v(x, y, z, p) # Velocity of a sound wave, function of space;
-        μx = ux(x, y, z, p) # ∂(slowness)/∂x, function of space
-        μy = uy(x, y, z, p) # ∂(slowness)/∂y, function of space
-        μz = uz(x, y, z, p) # ∂(slowness)/∂z, function of space
-
-        du[1] = S * nu * ξ
-        du[2] = S * nu * η
-        du[3] = S * nu * ζ
-
-        du[4] = S * μx
-        du[5] = S * μy
-        du[6] = S * μz
-
-        du[7] = S / nu
-        du[8] = 0
-
-        return nothing
-    end
-
-    function ray_tracing_bc(sol, p, t)
-        res = similar(first(sol))
-        ray_tracing_bc!(res, sol, p, t)
-        return res
-    end
-
-    function ray_tracing_bc!(res, sol, p, t)
-        ua = sol(0.0)
-        ub = sol(1.0)
-        nu = v(ua[1], ua[2], ua[3], p) # Velocity of a sound wave, function of space;
-
-        res[1] = ua[1] - x0
-        res[2] = ua[2] - y0
-        res[3] = ua[3] - z0
-        res[4] = ua[7]      # T(0) = 0
-        res[5] = ua[4]^2 + ua[5]^2 + ua[6]^2 - 1 / nu^2
-        res[6] = ub[1] - xi
-        res[7] = ub[2] - yi
-        res[8] = ub[3] - zi
-        return nothing
-    end
-
-    a = 0
-    b = 1
-    c = 2
-    x0 = 0
-    y0 = 0
-    z0 = 0
-    xi = 4
-    yi = 3
-    zi = 2.0
-    p = [a, b, c, x0, y0, z0, xi, yi, zi]
-
-    dx = xi - x0
-    dy = yi - y0
-    dz = zi - z0
-
-    u0 = zeros(8)
-    u0[1:3] .= 0 # position
-    u0[4] = dx / v(x0, y0, z0, p)
-    u0[5] = dy / v(x0, y0, z0, p)
-    u0[6] = dz / v(x0, y0, z0, p)
-    u0[8] = 1
-
-    tspan = (0.0, 1.0)
-
-    prob_oop = BVProblem{false}(ray_tracing, ray_tracing_bc, u0, tspan, p)
-    alg = MultipleShooting(16, AutoVern7(Rodas4P()); nlsolve = NewtonRaphson(),
-        grid_coarsening = Base.Fix2(div, 3))
-
-    sol = solve(prob_oop, alg; reltol = 1e-6, abstol = 1e-6)
-    @test SciMLBase.successful_retcode(sol.retcode)
-    resid = zeros(8)
-    ray_tracing_bc!(resid, sol, p, sol.t)
-    @test norm(resid, Inf) < 1e-6
-
-    prob_iip = BVProblem{true}(ray_tracing!, ray_tracing_bc!, u0, tspan, p)
-    alg = MultipleShooting(16, AutoVern7(Rodas4P()); nlsolve = NewtonRaphson(),
-        grid_coarsening = Base.Fix2(div, 3))
-
-    sol = solve(prob_iip, alg; reltol = 1e-6, abstol = 1e-6)
-    @test SciMLBase.successful_retcode(sol.retcode)
-    resid = zeros(8)
-    ray_tracing_bc!(resid, sol, p, sol.t)
-    @test norm(resid, Inf) < 1e-6
-end

From 2b6f3260a4b1860b5b34b81280f7a32ffeb149b3 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 10 Oct 2023 18:54:45 -0400
Subject: [PATCH 24/28] Cleanup sparse prototype creation

---
 src/BoundaryValueDiffEq.jl |   1 +
 src/solve/mirk.jl          |  80 ++------------------------
 src/sparse_jacobians.jl    | 112 +++++++++++++++++++++++++++++++++++++
 src/utils.jl               |   9 ---
 4 files changed, 118 insertions(+), 84 deletions(-)
 create mode 100644 src/sparse_jacobians.jl

diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index f63fe51a..7612cf6e 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -23,6 +23,7 @@ include("alg_utils.jl")
 include("mirk_tableaus.jl")
 include("cache.jl")
 include("collocation.jl")
+include("sparse_jacobians.jl")
 
 include("solve/single_shooting.jl")
 include("solve/multiple_shooting.jl")
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index 18c0e94d..5f7553a7 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -278,9 +278,8 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
         resid_bc, y)
 
     sd_collocation = if jac_alg.nonbc_diffmode isa AbstractSparseADType
-        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(cache, y, cache.M, N)
-        PrecomputedJacobianColorvec(; jac_prototype = Jₛ, row_colorvec = rvec,
-            col_colorvec = cvec)
+        PrecomputedJacobianColorvec(__generate_sparse_jacobian_prototype(cache,
+            cache.problem_type, y, cache.M, N))
     else
         NoSparsityDetection()
     end
@@ -329,17 +328,15 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
     end
 
     sd = if jac_alg.diffmode isa AbstractSparseADType
-        Jₛ, cvec, rvec = construct_sparse_banded_jac_prototype(cache, resid, cache.M, N)
-        PrecomputedJacobianColorvec(; jac_prototype = Jₛ, row_colorvec = rvec,
-            col_colorvec = cvec)
+        PrecomputedJacobianColorvec(__generate_sparse_jacobian_prototype(cache,
+            cache.problem_type, resid.x[1], cache.M, N))
     else
         NoSparsityDetection()
     end
 
     diffcache = __sparse_jacobian_cache(Val(iip), jac_alg.diffmode, sd, loss, resid, y)
 
-    jac_prototype = jac_alg.diffmode isa AbstractSparseADType ? Jₛ :
-                    init_jacobian(diffcache)
+    jac_prototype = init_jacobian(diffcache)
 
     # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
     #       mismatch for ForwardDiff
@@ -358,70 +355,3 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
 
     return NonlinearProblem(NonlinearFunction{iip}(loss; jac, jac_prototype), y, cache.p)
 end
-
-# Generating Banded Matrix
-function construct_sparse_banded_jac_prototype(::MIRKCache, y, M, N)
-    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
-    Is = Vector{Int}(undef, l)
-    Js = Vector{Int}(undef, l)
-    idx = 1
-    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
-        Is[idx] = i
-        Js[idx] = j
-        idx += 1
-    end
-    col_colorvec = Vector{Int}(undef, M * N)
-    for i in eachindex(col_colorvec)
-        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-    row_colorvec = Vector{Int}(undef, M * (N - 1))
-    for i in eachindex(row_colorvec)
-        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-
-    y_ = similar(y, length(Is))
-    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
-            y_, M * (N - 1), M * N), col_colorvec, row_colorvec)
-end
-
-# Two Point Specialization
-function construct_sparse_banded_jac_prototype(::MIRKCache, y::ArrayPartition, M, N)
-    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
-    l_top = M * length(y.x[1].x[1])
-    l_bot = M * length(y.x[1].x[2])
-
-    Is = Vector{Int}(undef, l + l_top + l_bot)
-    Js = Vector{Int}(undef, l + l_top + l_bot)
-    idx = 1
-
-    for i in 1:length(y.x[1].x[1]), j in 1:M
-        Is[idx] = i
-        Js[idx] = j
-        idx += 1
-    end
-
-    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
-        Is[idx] = i + length(y.x[1].x[1])
-        Js[idx] = j
-        idx += 1
-    end
-
-    for i in 1:length(y.x[1].x[2]), j in 1:M
-        Is[idx] = i + length(y.x[1].x[1]) + M * (N - 1)
-        Js[idx] = j + M * (N - 1)
-        idx += 1
-    end
-
-    col_colorvec = Vector{Int}(undef, M * N)
-    for i in eachindex(col_colorvec)
-        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-    row_colorvec = Vector{Int}(undef, M * N)
-    for i in eachindex(row_colorvec)
-        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
-    end
-
-    y_ = similar(y, length(Is))
-    return (sparse(adapt(parameterless_type(y), Is), adapt(parameterless_type(y), Js),
-            y_, M * N, M * N), col_colorvec, row_colorvec)
-end
diff --git a/src/sparse_jacobians.jl b/src/sparse_jacobians.jl
new file mode 100644
index 00000000..d1deeb52
--- /dev/null
+++ b/src/sparse_jacobians.jl
@@ -0,0 +1,112 @@
+# This file defines several common patterns of sparse Jacobians we see in the BVP solvers.
+function _sparse_like(I, J, x::AbstractArray, m = maximum(I), n = maximum(J))
+    I′ = adapt(parameterless_type(x), I)
+    J′ = adapt(parameterless_type(x), J)
+    V = similar(x, length(I))
+    return sparse(I′, J′, V, m, n)
+end
+
+# Helpers for IIP/OOP functions
+function __sparse_jacobian_cache(::Val{iip}, ad, sd, fn, fx, y) where {iip}
+    if iip
+        sparse_jacobian_cache(ad, sd, fn, fx, y)
+    else
+        sparse_jacobian_cache(ad, sd, fn, y; fx)
+    end
+end
+
+@concrete struct ColoredMatrix
+    M
+    row_colorvec
+    col_colorvec
+end
+
+function SparseDiffTools.PrecomputedJacobianColorvec(M::ColoredMatrix)
+    return PrecomputedJacobianColorvec(; jac_prototype = M.M, M.row_colorvec,
+        M.col_colorvec)
+end
+
+# For MIRK Methods
+"""
+    __generate_sparse_jacobian_prototype(::MIRKCache, y, M, N)
+    __generate_sparse_jacobian_prototype(::MIRKCache, _, y, M, N)
+    __generate_sparse_jacobian_prototype(::MIRKCache, ::TwoPointBVProblem, y, M, N)
+
+Generate a prototype of the sparse Jacobian matrix for the BVP problem with row and column
+coloring.
+
+If the problem is a TwoPointBVProblem, then this is the complete Jacobian, else it only
+computes the sparse part excluding the contributions from the boundary conditions.
+"""
+function __generate_sparse_jacobian_prototype(cache::MIRKCache, y, M, N)
+    return __generate_sparse_jacobian_prototype(cache, cache.problem_type, y, M, N)
+end
+
+function __generate_sparse_jacobian_prototype(::MIRKCache, _, y, M, N)
+    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
+    Is = Vector{Int}(undef, l)
+    Js = Vector{Int}(undef, l)
+    idx = 1
+    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
+        Is[idx] = i
+        Js[idx] = j
+        idx += 1
+    end
+
+    J_c = _sparse_like(Is, Js, y, M * (N - 1), M * N)
+
+    col_colorvec = Vector{Int}(undef, size(J_c, 2))
+    for i in eachindex(col_colorvec)
+        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+    row_colorvec = Vector{Int}(undef, size(J_c, 1))
+    for i in eachindex(row_colorvec)
+        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+
+    return ColoredMatrix(J_c, row_colorvec, col_colorvec)
+end
+
+function __generate_sparse_jacobian_prototype(::MIRKCache, ::TwoPointBVProblem,
+    y::ArrayPartition, M, N)
+    resida, residb = y.x
+
+    l = sum(i -> min(2M + i, M * N) - max(1, i - 1) + 1, 1:(M * (N - 1)))
+    l_top = M * length(resida)
+    l_bot = M * length(residb)
+
+    Is = Vector{Int}(undef, l + l_top + l_bot)
+    Js = Vector{Int}(undef, l + l_top + l_bot)
+
+    idx = 1
+    for i in 1:length(resida), j in 1:M
+        Is[idx] = i
+        Js[idx] = j
+        idx += 1
+    end
+    for i in 1:(M * (N - 1)), j in max(1, i - 1):min(2M + i, M * N)
+        Is[idx] = i + length(resida)
+        Js[idx] = j
+        idx += 1
+    end
+    for i in 1:length(residb), j in 1:M
+        Is[idx] = i + length(resida) + M * (N - 1)
+        Js[idx] = j + M * (N - 1)
+        idx += 1
+    end
+
+    J = _sparse_like(Is, Js, y, M * N, M * N)
+
+    col_colorvec = Vector{Int}(undef, size(J, 2))
+    for i in eachindex(col_colorvec)
+        col_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+    row_colorvec = Vector{Int}(undef, size(J, 1))
+    for i in eachindex(row_colorvec)
+        row_colorvec[i] = mod1(i, min(2M + 1, M * N) + 1)
+    end
+
+    return ColoredMatrix(J, row_colorvec, col_colorvec)
+end
+
+# For Multiple Shooting
\ No newline at end of file
diff --git a/src/utils.jl b/src/utils.jl
index 636b100c..3082566d 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -90,12 +90,3 @@ eval_bc_residual!(resid, _, bc!, sol, p, t) = bc!(resid, sol, p, t)
     bcb!(resid.x[2], ub, p)
     return resid
 end
-
-# Helpers for IIP/OOP functions
-function __sparse_jacobian_cache(::Val{iip}, ad, sd, fn, fx, y) where {iip}
-    if iip
-        sparse_jacobian_cache(ad, sd, fn, fx, y)
-    else
-        sparse_jacobian_cache(ad, sd, fn, y; fx)
-    end
-end

From 6f40df9bbbd0cbc48ce1658f4f1b8958e999bbc4 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 10 Oct 2023 23:19:59 -0400
Subject: [PATCH 25/28] Move things around for MIRK

---
 Project.toml                 |   1 +
 src/BoundaryValueDiffEq.jl   |  11 +-
 src/cache.jl                 |  67 ----------
 src/collocation.jl           |   6 -
 src/solve/mirk.jl            | 236 ++++++++++++++++-------------------
 src/solve/single_shooting.jl |   5 +-
 src/types.jl                 |  12 +-
 src/utils.jl                 |  65 ++++++++++
 8 files changed, 183 insertions(+), 220 deletions(-)
 delete mode 100644 src/cache.jl

diff --git a/Project.toml b/Project.toml
index 7aa43f32..35027d33 100644
--- a/Project.toml
+++ b/Project.toml
@@ -18,6 +18,7 @@ SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804"
+Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 TruncatedStacktraces = "781d530d-4396-4725-bb49-402e4bee1e77"
 UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 
diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index 7612cf6e..0a3a4f2a 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -1,7 +1,7 @@
 module BoundaryValueDiffEq
 
 using Adapt, LinearAlgebra, PreallocationTools, Reexport, Setfield, SparseArrays, SciMLBase,
-    RecursiveArrayTools, ForwardDiff
+    Static, RecursiveArrayTools, ForwardDiff
 @reexport using ADTypes, DiffEqBase, NonlinearSolve, SparseDiffTools, SciMLBase
 
 import ADTypes: AbstractADType
@@ -10,7 +10,7 @@ import ConcreteStructs: @concrete
 import DiffEqBase: solve
 import ForwardDiff: pickchunksize
 import RecursiveArrayTools: ArrayPartition, DiffEqArray
-import SciMLBase: AbstractDiffEqInterpolation
+import SciMLBase: AbstractDiffEqInterpolation, StandardBVProblem
 import RecursiveArrayTools: ArrayPartition
 import SparseDiffTools: AbstractSparseADType
 import TruncatedStacktraces: @truncate_stacktrace
@@ -20,15 +20,16 @@ include("types.jl")
 include("utils.jl")
 include("algorithms.jl")
 include("alg_utils.jl")
+
 include("mirk_tableaus.jl")
-include("cache.jl")
-include("collocation.jl")
-include("sparse_jacobians.jl")
 
 include("solve/single_shooting.jl")
 include("solve/multiple_shooting.jl")
 include("solve/mirk.jl")
 
+include("collocation.jl")
+include("sparse_jacobians.jl")
+
 include("adaptivity.jl")
 include("interpolation.jl")
 
diff --git a/src/cache.jl b/src/cache.jl
deleted file mode 100644
index 6d70d9e9..00000000
--- a/src/cache.jl
+++ /dev/null
@@ -1,67 +0,0 @@
-@concrete struct MIRKCache{iip, T}
-    order::Int                 # The order of MIRK method
-    stage::Int                 # The state of MIRK method
-    M::Int                     # The number of equations
-    in_size
-    f
-    bc
-    prob                       # BVProblem
-    problem_type               # StandardBVProblem
-    p                          # Parameters
-    alg                        # MIRK methods
-    TU                         # MIRK Tableau
-    ITU                        # MIRK Interpolation Tableau
-    # Everything below gets resized in adaptive methods
-    mesh                       # Discrete mesh
-    mesh_dt                    # Step size
-    k_discrete                 # Stage information associated with the discrete Runge-Kutta method
-    k_interp                   # Stage information associated with the discrete Runge-Kutta method
-    y
-    y₀
-    residual
-    # The following 2 caches are never resized
-    fᵢ_cache
-    fᵢ₂_cache
-    defect
-    new_stages
-    kwargs
-end
-
-Base.eltype(::MIRKCache{iip, T}) where {iip, T} = T
-
-"""
-    expand_cache!(cache::MIRKCache)
-
-After redistributing or halving the mesh, this function expands the required vectors to
-match the length of the new mesh.
-"""
-function expand_cache!(cache::MIRKCache)
-    Nₙ = length(cache.mesh)
-    __append_similar!(cache.k_discrete, Nₙ - 1, cache.M)
-    __append_similar!(cache.k_interp, Nₙ - 1, cache.M)
-    __append_similar!(cache.y, Nₙ, cache.M)
-    __append_similar!(cache.y₀, Nₙ, cache.M)
-    __append_similar!(cache.residual, Nₙ, cache.M)
-    __append_similar!(cache.defect, Nₙ - 1, cache.M)
-    __append_similar!(cache.new_stages, Nₙ - 1, cache.M)
-    return cache
-end
-
-__append_similar!(::Nothing, n, _) = nothing
-
-function __append_similar!(x::AbstractVector{<:AbstractArray}, n, _)
-    N = n - length(x)
-    N == 0 && return x
-    N < 0 && throw(ArgumentError("Cannot append a negative number of elements"))
-    append!(x, [similar(first(x)) for _ in 1:N])
-    return x
-end
-
-function __append_similar!(x::AbstractVector{<:MaybeDiffCache}, n, M)
-    N = n - length(x)
-    N == 0 && return x
-    N < 0 && throw(ArgumentError("Cannot append a negative number of elements"))
-    chunksize = pickchunksize(M * (N + length(x)))
-    append!(x, [maybe_allocate_diffcache(first(x), chunksize) for _ in 1:N])
-    return x
-end
diff --git a/src/collocation.jl b/src/collocation.jl
index 8ab995f3..0bfeb3f3 100644
--- a/src/collocation.jl
+++ b/src/collocation.jl
@@ -1,9 +1,3 @@
-__initial_state_from_prob(prob::BVProblem, mesh) = __initial_state_from_prob(prob.u0, mesh)
-__initial_state_from_prob(u0::AbstractArray, mesh) = [copy(vec(u0)) for _ in mesh]
-function __initial_state_from_prob(u0::AbstractVector{<:AbstractVector}, _)
-    [copy(vec(u)) for u in u0]
-end
-
 function Φ!(residual, cache::MIRKCache, y, u, p = cache.p)
     return Φ!(residual, cache.fᵢ_cache, cache.k_discrete, cache.f, cache.TU,
         y, u, p, cache.mesh, cache.mesh_dt, cache.stage)
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index 5f7553a7..9242f60b 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -1,27 +1,47 @@
+@concrete struct MIRKCache{iip, T}
+    order::Int                 # The order of MIRK method
+    stage::Int                 # The state of MIRK method
+    M::Int                     # The number of equations
+    in_size
+    f
+    bc
+    prob                       # BVProblem
+    problem_type               # StandardBVProblem
+    p                          # Parameters
+    alg                        # MIRK methods
+    TU                         # MIRK Tableau
+    ITU                        # MIRK Interpolation Tableau
+    bcresid_prototype
+    # Everything below gets resized in adaptive methods
+    mesh                       # Discrete mesh
+    mesh_dt                    # Step size
+    k_discrete                 # Stage information associated with the discrete Runge-Kutta method
+    k_interp                   # Stage information associated with the discrete Runge-Kutta method
+    y
+    y₀
+    residual
+    # The following 2 caches are never resized
+    fᵢ_cache
+    fᵢ₂_cache
+    defect
+    new_stages
+    kwargs
+end
+
+Base.eltype(::MIRKCache{iip, T}) where {iip, T} = T
+
 function SciMLBase.__init(prob::BVProblem, alg::AbstractMIRK; dt = 0.0,
     abstol = 1e-3, adaptive = true, kwargs...)
-    has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
     @set! alg.jac_alg = concrete_jacobian_algorithm(alg.jac_alg, prob, alg)
     iip = isinplace(prob)
-    (T, M, n) = if has_initial_guess
-        # If user provided a vector of initial guesses
-        _u0 = first(prob.u0)
-        eltype(_u0), length(_u0), (length(prob.u0) - 1)
-    else
-        dt ≤ 0 && throw(ArgumentError("dt must be positive"))
-        eltype(prob.u0), length(prob.u0), Int(cld((prob.tspan[2] - prob.tspan[1]), dt))
-    end
+    has_initial_guess, T, M, n, X = __extract_problem_details(prob; dt,
+        check_positive_dt = true)
     chunksize = pickchunksize(M * (n + 1))
-    if has_initial_guess
-        fᵢ_cache = maybe_allocate_diffcache(vec(similar(_u0)), chunksize, alg.jac_alg)
-        fᵢ₂_cache = vec(similar(_u0))
-    else
-        fᵢ_cache = maybe_allocate_diffcache(vec(similar(prob.u0)), chunksize, alg.jac_alg)
-        fᵢ₂_cache = vec(similar(prob.u0))
-    end
 
-    # Without this, boxing breaks type stability
-    X = has_initial_guess ? _u0 : prob.u0
+    __alloc_diffcache = x -> __maybe_allocate_diffcache(vec(x), chunksize, alg.jac_alg)
+
+    fᵢ_cache = __alloc_diffcache(similar(X))
+    fᵢ₂_cache = vec(similar(X))
 
     # NOTE: Assumes the user provided initial guess is on a uniform mesh
     mesh = collect(range(prob.tspan[1], stop = prob.tspan[2], length = n + 1))
@@ -32,85 +52,51 @@ function SciMLBase.__init(prob::BVProblem, alg::AbstractMIRK; dt = 0.0,
 
     # Don't flatten this here, since we need to expand it later if needed
     y₀ = __initial_state_from_prob(prob, mesh)
-    y = [maybe_allocate_diffcache(vec(copy(yᵢ)), chunksize, alg.jac_alg) for yᵢ in y₀]
+    y = __alloc_diffcache.(copy.(y₀))
     TU, ITU = constructMIRK(alg, T)
     stage = alg_stage(alg)
 
-    k_discrete = [maybe_allocate_diffcache(similar(X, M, stage), chunksize, alg.jac_alg)
+    k_discrete = [__maybe_allocate_diffcache(similar(X, M, stage), chunksize, alg.jac_alg)
                   for _ in 1:n]
-    k_interp = adaptive ? [similar(X, M, ITU.s_star - stage) for _ in 1:n] :
-               [similar(X, 0, 0) for _ in 1:n]
+    k_interp = [similar(X, ifelse(adaptive, M, 0), ifelse(adaptive, ITU.s_star - stage, 0))
+                for _ in 1:n]
 
-    resid₁_size = if prob.f.bcresid_prototype === nothing
-        size(X)
-    elseif prob.f.bcresid_prototype isa ArrayPartition
-        size.(prob.f.bcresid_prototype.x)
-    else
-        size(prob.f.bcresid_prototype)
-    end
+    bcresid_prototype, resid₁_size = __get_bcresid_prototype(prob.problem_type, prob, X)
 
-    if iip
-        if prob.f.bcresid_prototype === nothing
-            residual = [maybe_allocate_diffcache(vec(copy(yᵢ)), chunksize, alg.jac_alg)
-                        for yᵢ in y₀]
-        else
-            residual = vcat([
-                    maybe_allocate_diffcache(vec(copy(prob.f.bcresid_prototype)),
-                        chunksize, alg.jac_alg)],
-                [maybe_allocate_diffcache(vec(copy(yᵢ)), chunksize, alg.jac_alg)
-                 for yᵢ in y₀[2:end]])
-        end
+    residual = if iip
+        vcat([__alloc_diffcache(bcresid_prototype)],
+            __alloc_diffcache.(copy.(@view(y₀[2:end]))))
     else
-        residual = nothing
+        nothing
     end
 
-    defect = adaptive ? [similar(X, M) for _ in 1:n] : [similar(X, 0) for _ in 1:n]
-
-    new_stages = adaptive ? [similar(X, M) for _ in 1:n] : [similar(X, 0) for _ in 1:n]
+    defect = [similar(X, ifelse(adaptive, M, 0)) for _ in 1:n]
+    new_stages = [similar(X, ifelse(adaptive, M, 0)) for _ in 1:n]
 
     # Transform the functions to handle non-vector inputs
     f, bc = if X isa AbstractVector
         prob.f, prob.f.bc
     elseif iip
-        function vecf!(du, u, p, t)
-            du_ = reshape(du, size(X))
-            x_ = reshape(u, size(X))
-            prob.f(du_, x_, p, t)
-            return du
-        end
+        vecf!(du, u, p, t) = prob.f(reshape(du, size(X)), reshape(u, size(X)), p, t)
         vecbc! = if !(prob.problem_type isa TwoPointBVProblem)
             function __vecbc!(resid, sol, p, t)
-                resid_ = reshape(resid, resid₁_size)
-                sol_ = map(s -> reshape(s, size(X)), sol)
-                prob.f.bc(resid_, sol_, p, t)
-                return resid
+                prob.f.bc(reshape(resid, resid₁_size),
+                    map(Base.Fix2(reshape, size(X)), sol), p, t)
             end
         else
             function __vecbc_a!(resida, ua, p)
-                resida_ = reshape(resida, resid₁_size[1])
-                ua_ = reshape(ua, size(X))
-                prob.f.bc[1](resida_, ua_, p)
-                return nothing
+                prob.f.bc[1](reshape(resida, resid₁_size[1]), reshape(ua, size(X)), p)
             end
             function __vecbc_b!(residb, ub, p)
-                residb_ = reshape(residb, resid₁_size[2])
-                ub_ = reshape(ub, size(X))
-                prob.f.bc[2](residb_, ub_, p)
-                return nothing
+                prob.f.bc[2](reshape(residb, resid₁_size[2]), reshape(ub, size(X)), p)
             end
             (__vecbc_a!, __vecbc_b!)
         end
         vecf!, vecbc!
     else
-        function vecf(u, p, t)
-            x_ = reshape(u, size(X))
-            return vec(prob.f(x_, p, t))
-        end
+        vecf(u, p, t) = vec(prob.f(reshape(u, size(X)), p, t))
         vecbc = if !(prob.problem_type isa TwoPointBVProblem)
-            function __vecbc(sol, p, t)
-                sol_ = map(s -> reshape(s, size(X)), sol)
-                return vec(prob.f.bc(sol_, p, t))
-            end
+            __vecbc(sol, p, t) = vec(prob.f.bc(map(Base.Fix2(reshape, size(X)), sol), p, t))
         else
             __vecbc_a(ua, p) = vec(prob.f.bc[1](reshape(ua, size(X)), p))
             __vecbc_b(ub, p) = vec(prob.f.bc[2](reshape(ub, size(X)), p))
@@ -120,11 +106,29 @@ function SciMLBase.__init(prob::BVProblem, alg::AbstractMIRK; dt = 0.0,
     end
 
     return MIRKCache{iip, T}(alg_order(alg), stage, M, size(X), f, bc, prob,
-        prob.problem_type, prob.p, alg, TU, ITU, mesh, mesh_dt, k_discrete, k_interp, y, y₀,
-        residual, fᵢ_cache, fᵢ₂_cache, defect, new_stages,
+        prob.problem_type, prob.p, alg, TU, ITU, bcresid_prototype, mesh, mesh_dt,
+        k_discrete, k_interp, y, y₀, residual, fᵢ_cache, fᵢ₂_cache, defect, new_stages,
         (; defect_threshold, MxNsub, abstol, dt, adaptive, kwargs...))
 end
 
+"""
+    __expand_cache!(cache::MIRKCache)
+
+After redistributing or halving the mesh, this function expands the required vectors to
+match the length of the new mesh.
+"""
+function __expand_cache!(cache::MIRKCache)
+    Nₙ = length(cache.mesh)
+    __append_similar!(cache.k_discrete, Nₙ - 1, cache.M)
+    __append_similar!(cache.k_interp, Nₙ - 1, cache.M)
+    __append_similar!(cache.y, Nₙ, cache.M)
+    __append_similar!(cache.y₀, Nₙ, cache.M)
+    __append_similar!(cache.residual, Nₙ, cache.M)
+    __append_similar!(cache.defect, Nₙ - 1, cache.M)
+    __append_similar!(cache.new_stages, Nₙ - 1, cache.M)
+    return cache
+end
+
 function __split_mirk_kwargs(; defect_threshold, MxNsub, abstol, dt, adaptive = true,
     kwargs...)
     return ((defect_threshold, MxNsub, abstol, adaptive, dt),
@@ -139,7 +143,7 @@ function SciMLBase.solve!(cache::MIRKCache)
     defect_norm = 2 * abstol
 
     while SciMLBase.successful_retcode(info) && defect_norm > abstol
-        nlprob = construct_nlproblem(cache, recursive_flatten(y₀))
+        nlprob = __construct_nlproblem(cache, recursive_flatten(y₀))
         sol_nlprob = solve(nlprob, alg.nlsolve; abstol, kwargs...)
         recursive_unflatten!(cache.y₀, sol_nlprob.u)
 
@@ -162,7 +166,7 @@ function SciMLBase.solve!(cache::MIRKCache)
                     for (i, m) in enumerate(cache.mesh)
                         interp_eval!(cache.y₀[i], cache, m, mesh, mesh_dt)
                     end
-                    expand_cache!(cache)
+                    __expand_cache!(cache)
                 end
             end
         else
@@ -172,7 +176,7 @@ function SciMLBase.solve!(cache::MIRKCache)
                 info = ReturnCode.Failure
             else
                 half_mesh!(cache)
-                expand_cache!(cache)
+                __expand_cache!(cache)
                 recursive_fill!(cache.y₀, 0)
                 info = ReturnCode.Success # Force a restart
                 defect_norm = 2 * abstol
@@ -186,7 +190,7 @@ function SciMLBase.solve!(cache::MIRKCache)
 end
 
 # Constructing the Nonlinear Problem
-function construct_nlproblem(cache::MIRKCache{iip}, y::AbstractVector) where {iip}
+function __construct_nlproblem(cache::MIRKCache{iip}, y::AbstractVector) where {iip}
     loss_bc = if iip
         function loss_bc_internal!(resid::AbstractVector, u::AbstractVector, p = cache.p)
             y_ = recursive_unflatten!(cache.y, u)
@@ -217,63 +221,47 @@ function construct_nlproblem(cache::MIRKCache{iip}, y::AbstractVector) where {ii
         end
     end
 
-    loss = if !(cache.problem_type isa TwoPointBVProblem)
-        if iip
-            function loss_internal!(resid::AbstractVector, u::AbstractVector, p = cache.p)
-                y_ = recursive_unflatten!(cache.y, u)
-                resids = [get_tmp(r, u) for r in cache.residual]
-                eval_bc_residual!(resids[1], cache.problem_type, cache.bc, y_, p,
-                    cache.mesh)
-                Φ!(resids[2:end], cache, y_, u, p)
+    loss = if iip
+        function loss_internal!(resid::AbstractVector, u::AbstractVector, p = cache.p)
+            y_ = recursive_unflatten!(cache.y, u)
+            resids = [get_tmp(r, u) for r in cache.residual]
+            eval_bc_residual!(resids[1], cache.problem_type, cache.bc, y_, p,
+                cache.mesh)
+            Φ!(resids[2:end], cache, y_, u, p)
+            if cache.problem_type isa TwoPointBVProblem
+                recursive_flatten_twopoint!(resid, resids)
+            else
                 recursive_flatten!(resid, resids)
-                return resid
-            end
-        else
-            function loss_internal(u::AbstractVector, p = cache.p)
-                y_ = recursive_unflatten!(cache.y, u)
-                resid_bc = eval_bc_residual(cache.problem_type, cache.bc, y_, p, cache.mesh)
-                resid_co = Φ(cache, y_, u, p)
-                return vcat(resid_bc, mapreduce(vec, vcat, resid_co))
             end
+            return resid
         end
     else
-        # Reordering for 2 point BVP
-        if iip
-            function loss_internal_2point!(resid::AbstractVector, u::AbstractVector,
-                p = cache.p)
-                y_ = recursive_unflatten!(cache.y, u)
-                resids = [get_tmp(r, u) for r in cache.residual]
-                eval_bc_residual!(resids[1], cache.problem_type, cache.bc, y_, p,
-                    cache.mesh)
-                Φ!(resids[2:end], cache, y_, u, p)
-                recursive_flatten_twopoint!(resid, resids)
-                return resid
-            end
-        else
-            function loss_internal_2point(u::AbstractVector, p = cache.p)
-                y_ = recursive_unflatten!(cache.y, u)
-                resid_bc = eval_bc_residual(cache.problem_type, cache.bc, y_, p, cache.mesh)
-                resid_co = Φ(cache, y_, u, p)
+        function loss_internal(u::AbstractVector, p = cache.p)
+            y_ = recursive_unflatten!(cache.y, u)
+            resid_bc = eval_bc_residual(cache.problem_type, cache.bc, y_, p, cache.mesh)
+            resid_co = Φ(cache, y_, u, p)
+            if cache.problem_type isa TwoPointBVProblem
                 return vcat(resid_bc.x[1], mapreduce(vec, vcat, resid_co), resid_bc.x[2])
+            else
+                return vcat(resid_bc, mapreduce(vec, vcat, resid_co))
             end
         end
     end
 
-    return generate_nlprob(cache, y, loss_bc, loss_collocation, loss, cache.problem_type)
+    return __construct_nlproblem(cache, y, loss_bc, loss_collocation, loss,
+        cache.problem_type)
 end
 
-function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
-    _) where {iip}
+function __construct_nlproblem(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
+    ::StandardBVProblem) where {iip}
     @unpack nlsolve, jac_alg = cache.alg
     N = length(cache.mesh)
 
-    resid_bc = cache.prob.f.bcresid_prototype === nothing ? similar(y, cache.M) :
-               cache.prob.f.bcresid_prototype
+    resid_bc = cache.bcresid_prototype
     resid_collocation = similar(y, cache.M * (N - 1))
 
     sd_bc = jac_alg.bc_diffmode isa AbstractSparseADType ? SymbolicsSparsityDetection() :
             NoSparsityDetection()
-
     cache_bc = __sparse_jacobian_cache(Val(iip), jac_alg.bc_diffmode, sd_bc, loss_bc,
         resid_bc, y)
 
@@ -283,14 +271,11 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
     else
         NoSparsityDetection()
     end
-
     cache_collocation = __sparse_jacobian_cache(Val(iip), jac_alg.nonbc_diffmode,
         sd_collocation, loss_collocation, resid_collocation, y)
 
     jac_prototype = vcat(init_jacobian(cache_bc), init_jacobian(cache_collocation))
 
-    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
-    #       mismatch for ForwardDiff
     jac = if iip
         function jac_internal!(J, x, p)
             sparse_jacobian!(@view(J[1:(cache.M), :]), jac_alg.bc_diffmode, cache_bc,
@@ -313,19 +298,12 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
     return NonlinearProblem(NonlinearFunction{iip}(loss; jac, jac_prototype), y, cache.p)
 end
 
-function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
+function __construct_nlproblem(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, loss,
     ::TwoPointBVProblem) where {iip}
     @unpack nlsolve, jac_alg = cache.alg
     N = length(cache.mesh)
 
-    if !iip && cache.prob.f.bcresid_prototype === nothing
-        y_ = recursive_unflatten!(cache.y, y)
-        resid_ = ArrayPartition(cache.bc[1](y_[1], cache.p), cache.bc[2](y_[end], cache.p))
-        resid = ArrayPartition(resid_, similar(y, cache.M * (N - 1)))
-    else
-        resid = ArrayPartition(cache.prob.f.bcresid_prototype,
-            similar(y, cache.M * (N - 1)))
-    end
+    resid = ArrayPartition(cache.bcresid_prototype, similar(y, cache.M * (N - 1)))
 
     sd = if jac_alg.diffmode isa AbstractSparseADType
         PrecomputedJacobianColorvec(__generate_sparse_jacobian_prototype(cache,
@@ -333,13 +311,9 @@ function generate_nlprob(cache::MIRKCache{iip}, y, loss_bc, loss_collocation, lo
     else
         NoSparsityDetection()
     end
-
     diffcache = __sparse_jacobian_cache(Val(iip), jac_alg.diffmode, sd, loss, resid, y)
-
     jac_prototype = init_jacobian(diffcache)
 
-    # TODO: Pass `p` into `loss_bc` and `loss_collocation`. Currently leads to a Tag
-    #       mismatch for ForwardDiff
     jac = if iip
         function jac_internal!(J, x, p)
             sparse_jacobian!(J, jac_alg.diffmode, diffcache, loss, resid, x)
diff --git a/src/solve/single_shooting.jl b/src/solve/single_shooting.jl
index 2de87648..5a02f0f4 100644
--- a/src/solve/single_shooting.jl
+++ b/src/solve/single_shooting.jl
@@ -1,9 +1,8 @@
 function SciMLBase.__solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;),
     nlsolve_kwargs = (;), verbose = true, kwargs...)
-    has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
-    has_initial_guess && verbose &&
+    ig, T, _, _, u0 = __extract_problem_details(prob; dt = 0.1)
+    known(ig) && verbose &&
         @warn "Initial guess provided, but will be ignored for Shooting!"
-    u0 = has_initial_guess ? first(prob.u0) : prob.u0
 
     iip, bc, u0, u0_size = isinplace(prob), prob.f.bc, deepcopy(u0), size(u0)
     resid_size = prob.f.bcresid_prototype === nothing ? u0_size :
diff --git a/src/types.jl b/src/types.jl
index bf3ff899..5232faa3 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -82,15 +82,11 @@ end
     du
 end
 
-function maybe_allocate_diffcache(x, chunksize, jac_alg)
-    if __needs_diffcache(jac_alg)
-        return DiffCache(x, chunksize)
-    else
-        return FakeDiffCache(x)
-    end
+function __maybe_allocate_diffcache(x, chunksize, jac_alg)
+    return __needs_diffcache(jac_alg) ? DiffCache(x, chunksize) : FakeDiffCache(x)
 end
-maybe_allocate_diffcache(x::DiffCache, chunksize) = DiffCache(similar(x.du), chunksize)
-maybe_allocate_diffcache(x::FakeDiffCache, _) = FakeDiffCache(similar(x.du))
+__maybe_allocate_diffcache(x::DiffCache, chunksize) = DiffCache(similar(x.du), chunksize)
+__maybe_allocate_diffcache(x::FakeDiffCache, _) = FakeDiffCache(similar(x.du))
 
 PreallocationTools.get_tmp(dc::FakeDiffCache, _) = dc.du
 
diff --git a/src/utils.jl b/src/utils.jl
index 3082566d..6a99dbdb 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -90,3 +90,68 @@ eval_bc_residual!(resid, _, bc!, sol, p, t) = bc!(resid, sol, p, t)
     bcb!(resid.x[2], ub, p)
     return resid
 end
+
+__append_similar!(::Nothing, n, _) = nothing
+
+function __append_similar!(x::AbstractVector{<:AbstractArray}, n, _)
+    N = n - length(x)
+    N == 0 && return x
+    N < 0 && throw(ArgumentError("Cannot append a negative number of elements"))
+    append!(x, [similar(first(x)) for _ in 1:N])
+    return x
+end
+
+function __append_similar!(x::AbstractVector{<:MaybeDiffCache}, n, M)
+    N = n - length(x)
+    N == 0 && return x
+    N < 0 && throw(ArgumentError("Cannot append a negative number of elements"))
+    chunksize = pickchunksize(M * (N + length(x)))
+    append!(x, [maybe_allocate_diffcache(first(x), chunksize) for _ in 1:N])
+    return x
+end
+
+## Problem with Initial Guess
+function __extract_problem_details(prob; kwargs...)
+    return __extract_problem_details(prob, prob.u0; kwargs...)
+end
+function __extract_problem_details(prob, u0::AbstractVector{<:AbstractArray}; kwargs...)
+    # Problem has Initial Guess
+    _u0 = first(u0)
+    return True(), eltype(_u0), length(_u0), (length(u0) - 1), _u0
+end
+function __extract_problem_details(prob, u0; dt = 0.0, check_positive_dt::Bool = false)
+    # Problem does not have Initial Guess
+    check_positive_dt && dt ≤ 0 && throw(ArgumentError("dt must be positive"))
+    t₀, t₁ = prob.tspan
+    return False(), eltype(u0), length(u0), Int(cld(t₁ - t₀, dt)), prob.u0
+end
+
+__initial_state_from_prob(prob::BVProblem, mesh) = __initial_state_from_prob(prob.u0, mesh)
+__initial_state_from_prob(u0::AbstractArray, mesh) = [copy(vec(u0)) for _ in mesh]
+function __initial_state_from_prob(u0::AbstractVector{<:AbstractVector}, _)
+    return [copy(vec(u)) for u in u0]
+end
+
+function __get_bcresid_prototype(::TwoPointBVProblem, prob, u)
+    prototype = if isinplace(prob)
+        prob.f.bcresid_prototype
+    elseif prob.f.bcresid_prototype === nothing
+        prob.f.bcresid_prototype
+    else
+        ArrayPartition(first(prob.f.bc)(u, prob.p), last(prob.f.bc)(u, prob.p))
+    end
+    return prototype, size.(prototype.x)
+end
+function __get_bcresid_prototype(::StandardBVProblem, prob, u)
+    prototype = prob.f.bcresid_prototype !== nothing ? prob.f.bcresid_prototype :
+                fill!(similar(u), 0)
+    return prototype, size(prototype)
+end
+
+function __fill_like(v, x, args...)
+    y = similar(x, args...)
+    fill!(y, v)
+    return y
+end
+__zeros_like(args...) = __fill_like(0, args...)
+__ones_like(args...) = __fill_like(1, args...)

From b9074b6ab990e5a693a135097b409c937a287cbb Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 11 Oct 2023 11:26:59 -0400
Subject: [PATCH 26/28] Single Shooting Cleanup

---
 ext/BoundaryValueDiffEqODEInterfaceExt.jl |  5 ++--
 src/BoundaryValueDiffEq.jl                |  5 ++--
 src/algorithms.jl                         | 15 ------------
 src/solve/mirk.jl                         |  8 +++++--
 src/solve/multiple_shooting.jl            |  4 ++--
 src/solve/single_shooting.jl              | 29 ++++++++++-------------
 src/sparse_jacobians.jl                   |  2 +-
 src/types.jl                              | 29 ++++++++++++++++++++++-
 src/utils.jl                              | 18 ++++++++++----
 test/misc/non_vector_inputs.jl            |  2 ++
 10 files changed, 69 insertions(+), 48 deletions(-)

diff --git a/ext/BoundaryValueDiffEqODEInterfaceExt.jl b/ext/BoundaryValueDiffEqODEInterfaceExt.jl
index 1e357006..257ea6de 100644
--- a/ext/BoundaryValueDiffEqODEInterfaceExt.jl
+++ b/ext/BoundaryValueDiffEqODEInterfaceExt.jl
@@ -1,6 +1,7 @@
 module BoundaryValueDiffEqODEInterfaceExt
 
 using SciMLBase, BoundaryValueDiffEq, ODEInterface
+import SciMLBase: __solve
 import ODEInterface: OptionsODE, OPT_ATOL, OPT_RTOL, OPT_METHODCHOICE, OPT_DIAGNOSTICOUTPUT,
     OPT_ERRORCONTROL, OPT_SINGULARTERM, OPT_MAXSTEPS, OPT_BVPCLASS, OPT_SOLMETHOD,
     OPT_RHS_CALLMODE, RHS_CALL_INSITU, evalSolution
@@ -18,7 +19,7 @@ end
 # BVPM2
 #------
 ## TODO: We can specify Drhs using forwarddiff if we want to
-function SciMLBase.__solve(prob::BVProblem, alg::BVPM2; dt = 0.0, reltol = 1e-3, kwargs...)
+function __solve(prob::BVProblem, alg::BVPM2; dt = 0.0, reltol = 1e-3, kwargs...)
     _test_bvpm2_bvpsol_problem_criteria(prob, prob.problem_type, :BVPM2)
 
     has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
@@ -64,7 +65,7 @@ end
 #-------
 # BVPSOL
 #-------
-function SciMLBase.__solve(prob::BVProblem, alg::BVPSOL; maxiters = 1000, reltol = 1e-3,
+function __solve(prob::BVProblem, alg::BVPSOL; maxiters = 1000, reltol = 1e-3,
     dt = 0.0, verbose = true, kwargs...)
     _test_bvpm2_bvpsol_problem_criteria(prob, prob.problem_type, :BVPSOL)
     @assert isa(prob.p, SciMLBase.NullParameters) "BVPSOL only supports NullParameters!"
diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index 0a3a4f2a..ad06c840 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -10,7 +10,7 @@ import ConcreteStructs: @concrete
 import DiffEqBase: solve
 import ForwardDiff: pickchunksize
 import RecursiveArrayTools: ArrayPartition, DiffEqArray
-import SciMLBase: AbstractDiffEqInterpolation, StandardBVProblem
+import SciMLBase: AbstractDiffEqInterpolation, StandardBVProblem, __solve
 import RecursiveArrayTools: ArrayPartition
 import SparseDiffTools: AbstractSparseADType
 import TruncatedStacktraces: @truncate_stacktrace
@@ -33,8 +33,7 @@ include("sparse_jacobians.jl")
 include("adaptivity.jl")
 include("interpolation.jl")
 
-function SciMLBase.__solve(prob::BVProblem, alg::BoundaryValueDiffEqAlgorithm, args...;
-    kwargs...)
+function __solve(prob::BVProblem, alg::BoundaryValueDiffEqAlgorithm, args...; kwargs...)
     cache = init(prob, alg, args...; kwargs...)
     return solve!(cache)
 end
diff --git a/src/algorithms.jl b/src/algorithms.jl
index 28fe6d26..faafae4e 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -29,21 +29,6 @@ Significantly more stable than Single Shooting.
     grid_coarsening
 end
 
-function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, prob,
-    alg::MultipleShooting)
-    diffmode = jac_alg.diffmode === nothing ? AutoSparseForwardDiff() : jac_alg.diffmode
-    bc_diffmode = if jac_alg.bc_diffmode === nothing
-        prob.problem_type isa TwoPointBVProblem ? AutoSparseForwardDiff() :
-        AutoForwardDiff()
-    else
-        jac_alg.bc_diffmode
-    end
-    nonbc_diffmode = jac_alg.nonbc_diffmode === nothing ? AutoSparseForwardDiff() :
-                     jac_alg.nonbc_diffmode
-
-    return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, diffmode)
-end
-
 function MultipleShooting(nshoots::Int, ode_alg; nlsolve = NewtonRaphson(),
     grid_coarsening = true, jac_alg = BVPJacobianAlgorithm())
     @assert grid_coarsening isa Bool || grid_coarsening isa Function ||
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index 9242f60b..5fc55ef2 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -92,6 +92,7 @@ function SciMLBase.__init(prob::BVProblem, alg::AbstractMIRK; dt = 0.0,
             end
             (__vecbc_a!, __vecbc_b!)
         end
+        bcresid_prototype = vec(bcresid_prototype)
         vecf!, vecbc!
     else
         vecf(u, p, t) = vec(prob.f(reshape(u, size(X)), p, t))
@@ -102,6 +103,7 @@ function SciMLBase.__init(prob::BVProblem, alg::AbstractMIRK; dt = 0.0,
             __vecbc_b(ub, p) = vec(prob.f.bc[2](reshape(ub, size(X)), p))
             (__vecbc_a, __vecbc_b)
         end
+        bcresid_prototype = vec(bcresid_prototype)
         vecf, vecbc
     end
 
@@ -225,8 +227,7 @@ function __construct_nlproblem(cache::MIRKCache{iip}, y::AbstractVector) where {
         function loss_internal!(resid::AbstractVector, u::AbstractVector, p = cache.p)
             y_ = recursive_unflatten!(cache.y, u)
             resids = [get_tmp(r, u) for r in cache.residual]
-            eval_bc_residual!(resids[1], cache.problem_type, cache.bc, y_, p,
-                cache.mesh)
+            eval_bc_residual!(resids[1], cache.problem_type, cache.bc, y_, p, cache.mesh)
             Φ!(resids[2:end], cache, y_, u, p)
             if cache.problem_type isa TwoPointBVProblem
                 recursive_flatten_twopoint!(resid, resids)
@@ -305,6 +306,9 @@ function __construct_nlproblem(cache::MIRKCache{iip}, y, loss_bc, loss_collocati
 
     resid = ArrayPartition(cache.bcresid_prototype, similar(y, cache.M * (N - 1)))
 
+    # TODO: We can splitup the computation here as well similar to the Multiple Shooting
+    # TODO: code. That way for the BC part the actual jacobian computation is even cheaper
+    # TODO: Remember to not reorder if we end up using that implementation
     sd = if jac_alg.diffmode isa AbstractSparseADType
         PrecomputedJacobianColorvec(__generate_sparse_jacobian_prototype(cache,
             cache.problem_type, resid.x[1], cache.M, N))
diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index 6f98fe97..4c9403c2 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -1,4 +1,4 @@
-function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
+function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
     nlsolve_kwargs = (;), ensemblealg = EnsembleThreads(), verbose = true, kwargs...)
     @unpack f, tspan = prob
     bc = prob.f.bc
@@ -188,7 +188,7 @@ function SciMLBase.__solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwar
 
         resid_prototype = ArrayPartition(bcresid_prototype,
             similar(u_at_nodes, cur_nshoot * N))
-        resid_nodes = maybe_allocate_diffcache(resid_prototype.x[2],
+        resid_nodes = __maybe_allocate_diffcache(resid_prototype.x[2],
             pickchunksize((cur_nshoot + 1) * N), alg.jac_alg.bc_diffmode)
 
         if prob.problem_type isa TwoPointBVProblem
diff --git a/src/solve/single_shooting.jl b/src/solve/single_shooting.jl
index 5a02f0f4..2b8d9489 100644
--- a/src/solve/single_shooting.jl
+++ b/src/solve/single_shooting.jl
@@ -1,37 +1,32 @@
-function SciMLBase.__solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;),
+function __solve(prob::BVProblem, alg::Shooting; odesolve_kwargs = (;),
     nlsolve_kwargs = (;), verbose = true, kwargs...)
     ig, T, _, _, u0 = __extract_problem_details(prob; dt = 0.1)
     known(ig) && verbose &&
         @warn "Initial guess provided, but will be ignored for Shooting!"
 
+    bcresid_prototype, resid_size = __get_bcresid_prototype(prob, u0)
     iip, bc, u0, u0_size = isinplace(prob), prob.f.bc, deepcopy(u0), size(u0)
-    resid_size = prob.f.bcresid_prototype === nothing ? u0_size :
-                 size(prob.f.bcresid_prototype)
 
     loss_fn = if iip
         function loss!(resid, u0_, p)
-            u0_internal = reshape(u0_, u0_size)
-            tmp_prob = ODEProblem{iip}(prob.f, u0_internal, prob.tspan, p)
-            internal_sol = solve(tmp_prob, alg.ode_alg; odesolve_kwargs..., verbose,
-                kwargs...)
-            eval_bc_residual!(reshape(resid, resid_size), prob.problem_type, bc,
-                internal_sol, p)
+            odeprob = ODEProblem{true}(prob.f, reshape(u0_, u0_size), prob.tspan, p)
+            odesol = __solve(odeprob, alg.ode_alg; odesolve_kwargs..., verbose, kwargs...)
+            eval_bc_residual!(__safe_reshape(resid, resid_size), prob.problem_type, bc,
+                odesol, p)
             return nothing
         end
     else
         function loss(u0_, p)
-            u0_internal = reshape(u0_, u0_size)
-            tmp_prob = ODEProblem(prob.f, u0_internal, prob.tspan, p)
-            internal_sol = solve(tmp_prob, alg.ode_alg; odesolve_kwargs..., verbose,
-                kwargs...)
-            return vec(eval_bc_residual(prob.problem_type, bc, internal_sol, p))
+            odeprob = ODEProblem{false}(prob.f, reshape(u0_, u0_size), prob.tspan, p)
+            odesol = __solve(odeprob, alg.ode_alg; odesolve_kwargs..., verbose, kwargs...)
+            return vec(eval_bc_residual(prob.problem_type, bc, odesol, p))
         end
     end
-    opt = solve(NonlinearProblem(NonlinearFunction{iip}(loss_fn; prob.f.jac_prototype,
-                resid_prototype = prob.f.bcresid_prototype), vec(u0), prob.p), alg.nlsolve;
+    opt = __solve(NonlinearProblem(NonlinearFunction{iip}(loss_fn; prob.f.jac_prototype,
+                resid_prototype = bcresid_prototype), vec(u0), prob.p), alg.nlsolve;
         nlsolve_kwargs..., verbose, kwargs...)
     newprob = ODEProblem{iip}(prob.f, reshape(opt.u, u0_size), prob.tspan, prob.p)
-    sol = solve(newprob, alg.ode_alg; odesolve_kwargs..., verbose, kwargs...)
+    sol = __solve(newprob, alg.ode_alg; odesolve_kwargs..., verbose, kwargs...)
 
     if !SciMLBase.successful_retcode(opt)
         return SciMLBase.solution_new_retcode(sol, ReturnCode.Failure)
diff --git a/src/sparse_jacobians.jl b/src/sparse_jacobians.jl
index d1deeb52..3be9d4ce 100644
--- a/src/sparse_jacobians.jl
+++ b/src/sparse_jacobians.jl
@@ -109,4 +109,4 @@ function __generate_sparse_jacobian_prototype(::MIRKCache, ::TwoPointBVProblem,
     return ColoredMatrix(J, row_colorvec, col_colorvec)
 end
 
-# For Multiple Shooting
\ No newline at end of file
+# For Multiple Shooting
diff --git a/src/types.jl b/src/types.jl
index 5232faa3..02c2bf7a 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -52,7 +52,23 @@ function BVPJacobianAlgorithm(diffmode = missing; nonbc_diffmode = missing,
     end
 end
 
-function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, prob, alg)
+"""
+    concrete_jacobian_algorithm(jac_alg, prob, alg)
+    concrete_jacobian_algorithm(jac_alg, problem_type, prob, alg)
+
+If user provided all the required fields, then return the user provided algorithm.
+Otherwise, based on the problem type and the algorithm, decide the missing fields.
+
+For example, for `TwoPointBVProblem`, the `bc_diffmode` is set to
+`AutoSparseForwardDiff` while for `StandardBVProblem`, the `bc_diffmode` is set to
+`AutoForwardDiff`.
+"""
+function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, prob::BVProblem, alg)
+    return concrete_jacobian_algorithm(jac_alg, prob.problem_type, prob, alg)
+end
+
+function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, ::StandardBVProblem,
+    prob::BVProblem, alg)
     diffmode = jac_alg.diffmode === nothing ? AutoSparseForwardDiff() : jac_alg.diffmode
     bc_diffmode = jac_alg.bc_diffmode === nothing ? AutoForwardDiff() : jac_alg.bc_diffmode
     nonbc_diffmode = jac_alg.nonbc_diffmode === nothing ? AutoSparseForwardDiff() :
@@ -61,6 +77,17 @@ function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, prob, alg)
     return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, diffmode)
 end
 
+function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, ::TwoPointBVProblem,
+    prob::BVProblem, alg)
+    diffmode = jac_alg.diffmode === nothing ? AutoSparseForwardDiff() : jac_alg.diffmode
+    bc_diffmode = jac_alg.bc_diffmode === nothing ? AutoSparseForwardDiff() :
+                  jac_alg.bc_diffmode
+    nonbc_diffmode = jac_alg.nonbc_diffmode === nothing ? AutoSparseForwardDiff() :
+                     jac_alg.nonbc_diffmode
+
+    return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, diffmode)
+end
+
 function MIRKJacobianComputationAlgorithm(diffmode = missing;
     collocation_diffmode = missing, bc_diffmode = missing)
     Base.depwarn("`MIRKJacobianComputationAlgorithm` has been deprecated in favor of \
diff --git a/src/utils.jl b/src/utils.jl
index 6a99dbdb..d68958b2 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -106,7 +106,7 @@ function __append_similar!(x::AbstractVector{<:MaybeDiffCache}, n, M)
     N == 0 && return x
     N < 0 && throw(ArgumentError("Cannot append a negative number of elements"))
     chunksize = pickchunksize(M * (N + length(x)))
-    append!(x, [maybe_allocate_diffcache(first(x), chunksize) for _ in 1:N])
+    append!(x, [__maybe_allocate_diffcache(first(x), chunksize) for _ in 1:N])
     return x
 end
 
@@ -132,19 +132,22 @@ function __initial_state_from_prob(u0::AbstractVector{<:AbstractVector}, _)
     return [copy(vec(u)) for u in u0]
 end
 
-function __get_bcresid_prototype(::TwoPointBVProblem, prob, u)
+function __get_bcresid_prototype(prob::BVProblem, u)
+    return __get_bcresid_prototype(prob.problem_type, prob, u)
+end
+function __get_bcresid_prototype(::TwoPointBVProblem, prob::BVProblem, u)
     prototype = if isinplace(prob)
         prob.f.bcresid_prototype
-    elseif prob.f.bcresid_prototype === nothing
+    elseif prob.f.bcresid_prototype !== nothing
         prob.f.bcresid_prototype
     else
         ArrayPartition(first(prob.f.bc)(u, prob.p), last(prob.f.bc)(u, prob.p))
     end
     return prototype, size.(prototype.x)
 end
-function __get_bcresid_prototype(::StandardBVProblem, prob, u)
+function __get_bcresid_prototype(::StandardBVProblem, prob::BVProblem, u)
     prototype = prob.f.bcresid_prototype !== nothing ? prob.f.bcresid_prototype :
-                fill!(similar(u), 0)
+                __zeros_like(u)
     return prototype, size(prototype)
 end
 
@@ -155,3 +158,8 @@ function __fill_like(v, x, args...)
 end
 __zeros_like(args...) = __fill_like(0, args...)
 __ones_like(args...) = __fill_like(1, args...)
+
+__safe_reshape(x, args...) = reshape(x, args...)
+function __safe_reshape(x::ArrayPartition, sizes::NTuple)
+    return ArrayPartition(__safe_reshape.(x.x, sizes))
+end
diff --git a/test/misc/non_vector_inputs.jl b/test/misc/non_vector_inputs.jl
index 99aa0fb2..b7332505 100644
--- a/test/misc/non_vector_inputs.jl
+++ b/test/misc/non_vector_inputs.jl
@@ -57,4 +57,6 @@ probs = [
             @test norm(boundary(sol, prob.p, nothing)) < 0.01
         end
     end
+
+    # TODO: Multiple Shooting
 end

From be2298f281a3f11fa4d06adaa2f28da7929f80ab Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 11 Oct 2023 19:12:11 -0400
Subject: [PATCH 27/28] Fix tests and make Multiple Shooting Type Stable

---
 src/algorithms.jl                   |  11 ++
 src/solve/multiple_shooting.jl      | 256 +++++++---------------------
 src/sparse_jacobians.jl             |  95 ++++++++++-
 src/types.jl                        |  11 +-
 test/mirk/ensemble.jl               |   6 +-
 test/mirk/mirk_convergence_tests.jl |   4 +-
 test/misc/non_vector_inputs.jl      |   2 +-
 test/shooting/orbital.jl            |  36 ++--
 8 files changed, 203 insertions(+), 218 deletions(-)

diff --git a/src/algorithms.jl b/src/algorithms.jl
index faafae4e..cb825cd2 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -29,6 +29,17 @@ Significantly more stable than Single Shooting.
     grid_coarsening
 end
 
+function concretize_jacobian_algorithm(alg::MultipleShooting, prob)
+    jac_alg = concrete_jacobian_algorithm(alg.jac_alg, prob, alg)
+    return MultipleShooting(alg.ode_alg, alg.nlsolve, jac_alg, alg.nshoots,
+        alg.grid_coarsening)
+end
+
+function update_nshoots(alg::MultipleShooting, nshoots::Int)
+    return MultipleShooting(alg.ode_alg, alg.nlsolve, alg.jac_alg, nshoots,
+        alg.grid_coarsening)
+end
+
 function MultipleShooting(nshoots::Int, ode_alg; nlsolve = NewtonRaphson(),
     grid_coarsening = true, jac_alg = BVPJacobianAlgorithm())
     @assert grid_coarsening isa Bool || grid_coarsening isa Function ||
diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index 4c9403c2..e9bb791f 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -1,33 +1,28 @@
-function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
+function __solve(prob::BVProblem, _alg::MultipleShooting; odesolve_kwargs = (;),
     nlsolve_kwargs = (;), ensemblealg = EnsembleThreads(), verbose = true, kwargs...)
     @unpack f, tspan = prob
-    bc = prob.f.bc
-    has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
-    @set! alg.jac_alg = concrete_jacobian_algorithm(alg.jac_alg, prob, alg)
-    _u0 = has_initial_guess ? first(prob.u0) : prob.u0
-    N, u0_size, nshoots, iip = length(_u0), size(_u0), alg.nshoots, isinplace(prob)
-    if prob.f.bcresid_prototype === nothing
-        if prob.problem_type isa TwoPointBVProblem
-            # This can only happen if the problem is !iip
-            bcresid_prototype = ArrayPartition(bc[1](_u0, prob.p), bc[2](_u0, prob.p))
-        else
-            bcresid_prototype = similar(_u0)
-        end
+
+    ig, T, N, Nig, u0 = __extract_problem_details(prob; dt = 0.1)
+    has_initial_guess = known(ig)
+
+    bcresid_prototype, resid_size = __get_bcresid_prototype(prob, u0)
+    iip, bc, u0, u0_size = isinplace(prob), prob.f.bc, deepcopy(u0), size(u0)
+
+    __alg = concretize_jacobian_algorithm(_alg, prob)
+    alg = if has_initial_guess && Nig != __alg.nshoots + 1
+        verbose &&
+            @warn "Initial guess length != `nshoots + 1`! Adapting to `nshoots = $(Nig - 1)`"
+        update_nshoots(__alg, Nig - 1)
     else
-        bcresid_prototype = prob.f.bcresid_prototype
+        __alg
     end
+    nshoots = alg.nshoots
 
     if prob.problem_type isa TwoPointBVProblem
         resida_len = length(bcresid_prototype.x[1])
         residb_len = length(bcresid_prototype.x[2])
     end
 
-    if has_initial_guess && length(prob.u0) != nshoots + 1
-        nshoots = length(prob.u0) - 1
-        verbose &&
-            @warn "Initial guess length != `nshoots + 1`! Adapting to `nshoots = $(nshoots)`"
-    end
-
     # We will use colored AD for this part!
     @views function solve_internal_odes!(resid_nodes, us, p, cur_nshoots, nodes)
         ts_ = Vector{Vector{typeof(first(tspan))}}(undef, cur_nshoots)
@@ -52,7 +47,7 @@ function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
 
         ensemble_prob = EnsembleProblem(odeprob; prob_func, reduction, safetycopy = false,
             u_init = (; us = us_, ts = ts_, resid = resid_nodes))
-        ensemble_sol = solve(ensemble_prob, alg.ode_alg, ensemblealg; odesolve_kwargs...,
+        ensemble_sol = __solve(ensemble_prob, alg.ode_alg, ensemblealg; odesolve_kwargs...,
             verbose, kwargs..., save_end = true, save_everystep = false,
             trajectories = cur_nshoots)
 
@@ -66,7 +61,7 @@ function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
             # Just Recompute the last ODE Solution
             lastodeprob = ODEProblem{iip}(f, reshape(ub0, u0_size),
                 (nodes[end - 1], nodes[end]), p)
-            sol_ode_last = solve(lastodeprob, alg.ode_alg; odesolve_kwargs..., verbose,
+            sol_ode_last = __solve(lastodeprob, alg.ode_alg; odesolve_kwargs..., verbose,
                 kwargs..., save_everystep = false, saveat = (), save_end = true)
             ub = vec(sol_ode_last.u[end])
 
@@ -136,7 +131,9 @@ function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
         @views function jac_tp!(J::AbstractMatrix, us, p, resid_bc,
             resid_nodes::MaybeDiffCache, ode_jac_cache, bc_jac_cache::Tuple, ode_fn, bc_fn,
             cur_nshoot, nodes)
-            J isa SparseArrays.SparseMatrixCSC || fill!(J, 0)
+            # This is mostly a safety measure
+            fill!(J, 0)
+
             J_bc = J[1:N, :]
             J_c = J[(N + 1):end, :]
 
@@ -150,7 +147,7 @@ function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
             resida, residb = resid_bc.x
             J_bc[1:length(resida), 1:N] .= J_bc′[1:length(resida), 1:N]
             idxᵢ = (length(resida) + 1):(length(resida) + length(residb))
-            J_bc[idxᵢ, (end - N + 1):end] .= J_bc′[idxᵢ, (end - N + 1):end]
+            J_bc[idxᵢ, (end - 2N + 1):(end - N)] .= J_bc′[idxᵢ, (end - N + 1):end]
 
             return nothing
         end
@@ -158,6 +155,9 @@ function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
         @views function jac_mp!(J::AbstractMatrix, us, p, resid_bc,
             resid_nodes::MaybeDiffCache, ode_jac_cache, bc_jac_cache, ode_fn, bc_fn,
             cur_nshoot, nodes)
+            # This is mostly a safety measure
+            fill!(J, 0)
+
             J_bc = J[1:N, :]
             J_c = J[(N + 1):end, :]
 
@@ -174,15 +174,15 @@ function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
 
     # This gets all the nshoots except the final SingleShooting case
     all_nshoots = get_all_nshoots(alg.grid_coarsening, nshoots)
-    u_at_nodes, nodes = nothing, nothing
+    u_at_nodes, nodes = similar(u0, 0), typeof(first(tspan))[]
 
     for (i, cur_nshoot) in enumerate(all_nshoots)
         if i == 1
-            nodes, u_at_nodes = multiple_shooting_initialize(prob, alg, has_initial_guess,
-                nshoots; odesolve_kwargs, verbose, kwargs...)
+            nodes, u_at_nodes = multiple_shooting_initialize(prob, alg, ig, nshoots;
+                odesolve_kwargs, verbose, kwargs...)
         else
             nodes, u_at_nodes = multiple_shooting_initialize(u_at_nodes, prob, alg, nodes,
-                cur_nshoot, all_nshoots[i - 1], has_initial_guess; odesolve_kwargs, verbose,
+                cur_nshoot, all_nshoots[i - 1]::Int, ig; odesolve_kwargs, verbose,
                 kwargs...)
         end
 
@@ -191,35 +191,23 @@ function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
         resid_nodes = __maybe_allocate_diffcache(resid_prototype.x[2],
             pickchunksize((cur_nshoot + 1) * N), alg.jac_alg.bc_diffmode)
 
-        if prob.problem_type isa TwoPointBVProblem
-            if alg.jac_alg.nonbc_diffmode isa AbstractSparseADType ||
-               alg.jac_alg.bc_diffmode isa AbstractSparseADType
-                J_full, col_colorvec, row_colorvec, (J_c, J_bc_partial), col_colorvec_bc, row_colorvec_bc, = __generate_sparse_jacobian_prototype(alg,
-                    prob.problem_type, bcresid_prototype, _u0, N, cur_nshoot)
-            end
-        elseif alg.jac_alg.nonbc_diffmode isa AbstractSparseADType
-            J_c, col_colorvec, row_colorvec, = __generate_sparse_jacobian_prototype(alg,
-                prob.problem_type, bcresid_prototype, _u0, N, cur_nshoot)
+        if alg.jac_alg.nonbc_diffmode isa AbstractSparseADType ||
+           alg.jac_alg.bc_diffmode isa AbstractSparseADType
+            J_full, J_c, J_bc = __generate_sparse_jacobian_prototype(alg, prob.problem_type,
+                bcresid_prototype, u0, N, cur_nshoot)
         end
 
         ode_fn = (du, u) -> solve_internal_odes!(du, u, prob.p, cur_nshoot, nodes)
-        sd_ode = if alg.jac_alg.nonbc_diffmode isa AbstractSparseADType
-            PrecomputedJacobianColorvec(; jac_prototype = J_c, row_colorvec, col_colorvec)
-        else
-            NoSparsityDetection()
-        end
+        sd_ode = alg.jac_alg.nonbc_diffmode isa AbstractSparseADType ?
+                 PrecomputedJacobianColorvec(J_c) : NoSparsityDetection()
         ode_jac_cache = sparse_jacobian_cache(alg.jac_alg.nonbc_diffmode, sd_ode,
             ode_fn, similar(u_at_nodes, cur_nshoot * N), u_at_nodes)
 
         bc_fn = (du, u) -> compute_bc_residual!(du, u, prob.p, cur_nshoot, nodes,
             resid_nodes)
         if prob.problem_type isa TwoPointBVProblem
-            sd_bc = if alg.jac_alg.bc_diffmode isa AbstractSparseADType
-                PrecomputedJacobianColorvec(; jac_prototype = J_bc_partial,
-                    row_colorvec = row_colorvec_bc, col_colorvec = col_colorvec_bc)
-            else
-                NoSparsityDetection()
-            end
+            sd_bc = alg.jac_alg.bc_diffmode isa AbstractSparseADType ?
+                    PrecomputedJacobianColorvec(J_bc) : NoSparsityDetection()
             bc_jac_cache_partial = sparse_jacobian_cache(alg.jac_alg.bc_diffmode, sd_bc,
                 bc_fn, similar(bcresid_prototype),
                 ArrayPartition(@view(u_at_nodes[1:N]),
@@ -227,12 +215,10 @@ function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
 
             bc_jac_cache = (bc_jac_cache_partial, init_jacobian(bc_jac_cache_partial))
 
-            jac_prototype = if alg.jac_alg.bc_diffmode isa AbstractSparseADType ||
-                               alg.jac_alg.nonbc_diffmode isa AbstractSparseADType
+            jac_prototype = if @isdefined(J_full)
                 J_full
             else
-                # Dense AD being used!
-                fill!(similar(u_at_nodes, length(resid_prototype), length(u_at_nodes)), 0)
+                __zeros_like(u_at_nodes, length(resid_prototype), length(u_at_nodes))
             end
         else
             sd_bc = alg.jac_alg.bc_diffmode isa AbstractSparseADType ?
@@ -242,34 +228,42 @@ function __solve(prob::BVProblem, alg::MultipleShooting; odesolve_kwargs = (;),
 
             jac_prototype = vcat(init_jacobian(bc_jac_cache), init_jacobian(ode_jac_cache))
         end
+
         jac_fn = (J, us, p) -> jac!(J, us, p, similar(bcresid_prototype), resid_nodes,
             ode_jac_cache, bc_jac_cache, ode_fn, bc_fn, cur_nshoot, nodes)
 
         loss_function! = NonlinearFunction{true}((args...) -> loss!(args..., cur_nshoot,
                 nodes); resid_prototype, jac = jac_fn, jac_prototype)
         nlprob = NonlinearProblem(loss_function!, u_at_nodes, prob.p)
-        sol_nlsolve = solve(nlprob, alg.nlsolve; nlsolve_kwargs..., verbose, kwargs...)
-        u_at_nodes = sol_nlsolve.u
+        sol_nlsolve = __solve(nlprob, alg.nlsolve; nlsolve_kwargs..., verbose, kwargs...)
+        # u_at_nodes = sol_nlsolve.u
     end
 
     single_shooting_prob = remake(prob; u0 = reshape(u_at_nodes[1:N], u0_size))
-    return solve(single_shooting_prob, Shooting(alg.ode_alg; alg.nlsolve); odesolve_kwargs,
-        nlsolve_kwargs, verbose, kwargs...)
+    return __solve(single_shooting_prob, Shooting(alg.ode_alg; alg.nlsolve);
+        odesolve_kwargs, nlsolve_kwargs, verbose, kwargs...)
 end
 
-@views function multiple_shooting_initialize(prob, alg::MultipleShooting, has_initial_guess,
+@views function multiple_shooting_initialize(prob, alg::MultipleShooting, ::True,
     nshoots; odesolve_kwargs = (;), verbose = true, kwargs...)
     @unpack f, u0, tspan, p = prob
     @unpack ode_alg = alg
 
     nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
-    N = has_initial_guess ? length(first(u0)) : length(u0)
+    N = length(first(u0))
 
-    if has_initial_guess
-        u_at_nodes = similar(first(u0), (nshoots + 1) * N)
-        recursive_flatten!(u_at_nodes, u0)
-        return nodes, u_at_nodes
-    end
+    u_at_nodes = similar(first(u0), (nshoots + 1) * N)
+    recursive_flatten!(u_at_nodes, u0)
+    return nodes, u_at_nodes
+end
+
+@views function multiple_shooting_initialize(prob, alg::MultipleShooting, ::False,
+    nshoots; odesolve_kwargs = (;), verbose = true, kwargs...)
+    @unpack f, u0, tspan, p = prob
+    @unpack ode_alg = alg
+
+    nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
+    N = length(u0)
 
     # Ensures type stability in case the parameters are dual numbers
     if !(typeof(p) <: SciMLBase.NullParameters)
@@ -283,12 +277,13 @@ end
 
     # Assumes no initial guess for now
     start_prob = ODEProblem{isinplace(prob)}(f, u0, tspan, p)
-    sol = solve(start_prob, ode_alg; odesolve_kwargs..., verbose, kwargs..., saveat = nodes)
+    sol = __solve(start_prob, ode_alg; odesolve_kwargs..., verbose, kwargs...,
+        saveat = nodes)
 
     if SciMLBase.successful_retcode(sol)
-        u_at_nodes[1:N] .= sol.u[1]
+        u_at_nodes[1:N] .= vec(sol.u[1])
         for i in 2:(nshoots + 1)
-            u_at_nodes[(N + (i - 2) * N) .+ (1:N)] .= sol.u[i]
+            u_at_nodes[(N + (i - 2) * N) .+ (1:N)] .= vec(sol.u[i])
         end
     else
         @warn "Initialization using odesolve failed. Initializing using 0s. It is \
@@ -300,10 +295,10 @@ end
 end
 
 @views function multiple_shooting_initialize(u_at_nodes_prev, prob, alg, prev_nodes,
-    nshoots, old_nshoots, has_initial_guess; odesolve_kwargs = (;), kwargs...)
+    nshoots, old_nshoots, ig; odesolve_kwargs = (;), kwargs...)
     @unpack f, u0, tspan, p = prob
     nodes = range(tspan[1], tspan[2]; length = nshoots + 1)
-    N = has_initial_guess ? length(first(u0)) : length(u0)
+    N = known(ig) ? length(first(u0)) : length(u0)
 
     u_at_nodes = similar(u_at_nodes_prev, N + nshoots * N)
     u_at_nodes[1:N] .= u_at_nodes_prev[1:N]
@@ -332,8 +327,8 @@ end
             ustart = u_at_nodes_prev[idxs_prev]
 
             odeprob = ODEProblem(f, ustart, (t0, tstop), p)
-            odesol = solve(odeprob, alg.ode_alg; odesolve_kwargs..., kwargs..., saveat = (),
-                save_end = true)
+            odesol = __solve(odeprob, alg.ode_alg; odesolve_kwargs..., kwargs...,
+                saveat = (), save_end = true)
 
             u_at_nodes[idxs] .= odesol.u[end]
         end
@@ -361,124 +356,3 @@ end
     @assert !(1 in nshoots_vec)
     return nshoots_vec
 end
-
-"""
-    __generate_sparse_jacobian_prototype(::MultipleShooting, _, _, u0, N::Int,
-        nshoots::Int)
-    __generate_sparse_jacobian_prototype(::MultipleShooting, ::TwoPointBVProblem,
-        bcresid_prototype, u0, N::Int, nshoots::Int)
-
-For a Multi-Point Problem, returns the Jacobian Prototype for the Sparse Part. For a Two-
-Point Problem, returns the Jacobian Prototype for the Entire Jacobian.
-
-Also returns the column and row color vectors for the Sparse Non-BC Part Jacobian.
-
-Returns the column and row color vectors for the Sparse BC Part Jacobian (if computed).
-
-Also returns the indices `Is` and `Js` used to construct the Sparse Jacobian.
-"""
-function __generate_sparse_jacobian_prototype(::MultipleShooting, _, _, u0, N::Int,
-    nshoots::Int)
-    # Sparse for Stitching solution together
-    Is = Vector{Int64}(undef, (N^2 + N) * nshoots)
-    Js = Vector{Int64}(undef, (N^2 + N) * nshoots)
-
-    idx = 1
-    for i in 1:nshoots
-        for (i₁, i₂) in Iterators.product(1:N, 1:N)
-            Is[idx] = i₁ + ((i - 1) * N)
-            Js[idx] = i₂ + ((i - 1) * N)
-            idx += 1
-        end
-        Is[idx:(idx + N - 1)] .= (1:N) .+ ((i - 1) * N)
-        Js[idx:(idx + N - 1)] .= (1:N) .+ (i * N)
-        idx += N
-    end
-
-    J_c = sparse(adapt(parameterless_type(u0), Is), adapt(parameterless_type(u0), Js),
-        similar(u0, length(Is)))
-
-    col_colorvec = Vector{Int}(undef, N * (nshoots + 1))
-    for i in eachindex(col_colorvec)
-        col_colorvec[i] = mod1(i, 2 * N)
-    end
-    row_colorvec = Vector{Int}(undef, N * nshoots)
-    for i in eachindex(row_colorvec)
-        row_colorvec[i] = mod1(i, 2 * N)
-    end
-
-    return J_c, col_colorvec, row_colorvec, (J_c, nothing), nothing, nothing, Is, Js
-end
-
-function __generate_sparse_jacobian_prototype(alg::MultipleShooting, ::TwoPointBVProblem,
-    bcresid_prototype, u0, N::Int, nshoots::Int)
-    resida, residb = bcresid_prototype.x
-    # Sparse for Stitching solution together
-    L = N * length(resida) + (N^2 + N) * nshoots + N * length(residb)
-    Is = Vector{Int64}(undef, L)
-    Js = Vector{Int64}(undef, L)
-
-    idx = 1
-    for row in 1:length(resida)
-        for j in 1:N
-            Is[idx] = row
-            Js[idx] = j
-            idx += 1
-        end
-    end
-    for row in 1:length(residb)
-        for j in 1:N
-            Is[idx] = length(resida) + row
-            Js[idx] = j + (nshoots * N)
-            idx += 1
-        end
-    end
-    J_c, col_colorvec, row_colorvec, _, _, _, Is′, Js′ = __generate_sparse_jacobian_prototype(alg,
-        nothing, nothing, u0, N, nshoots)
-    for (i, j) in zip(Is′, Js′)
-        Is[idx] = length(resida) + length(residb) + i
-        Js[idx] = j
-        idx += 1
-    end
-
-    col_colorvec_bc = Vector{Int}(undef, 2N)
-    row_colorvec_bc = Vector{Int}(undef, length(resida) + length(residb))
-    col_colorvec_bc[1:N] .= 1:N
-    col_colorvec_bc[(N + 1):end] .= 1:N
-    for i in 1:max(length(resida), length(residb))
-        if i ≤ length(resida)
-            row_colorvec_bc[i] = i
-        end
-        if i ≤ length(residb)
-            row_colorvec_bc[i + length(resida)] = i
-        end
-    end
-
-    J = sparse(adapt(parameterless_type(u0), Is), adapt(parameterless_type(u0), Js),
-        similar(u0, length(Is)))
-
-    Is_bc = Vector{Int64}(undef, N^2)
-    Js_bc = Vector{Int64}(undef, N^2)
-    idx = 1
-    for i in 1:length(resida)
-        for j in 1:N
-            Is_bc[idx] = i
-            Js_bc[idx] = j
-            idx += 1
-        end
-    end
-    for i in 1:length(residb)
-        for j in 1:N
-            Is_bc[idx] = i + length(resida)
-            Js_bc[idx] = j + N
-            idx += 1
-        end
-    end
-
-    J_bc = sparse(adapt(parameterless_type(u0), Is_bc),
-        adapt(parameterless_type(u0), Js_bc),
-        similar(u0, length(Is_bc)))
-
-    return (J, col_colorvec, row_colorvec, (J_c, J_bc), col_colorvec_bc, row_colorvec_bc,
-        Is, Js)
-end
diff --git a/src/sparse_jacobians.jl b/src/sparse_jacobians.jl
index 3be9d4ce..e9c88abb 100644
--- a/src/sparse_jacobians.jl
+++ b/src/sparse_jacobians.jl
@@ -2,7 +2,7 @@
 function _sparse_like(I, J, x::AbstractArray, m = maximum(I), n = maximum(J))
     I′ = adapt(parameterless_type(x), I)
     J′ = adapt(parameterless_type(x), J)
-    V = similar(x, length(I))
+    V = __ones_like(x, length(I))
     return sparse(I′, J′, V, m, n)
 end
 
@@ -21,6 +21,11 @@ end
     col_colorvec
 end
 
+Base.size(M::ColoredMatrix, args...) = size(M.M, args...)
+Base.eltype(M::ColoredMatrix) = eltype(M.M)
+
+ColoredMatrix() = ColoredMatrix(nothing, nothing, nothing)
+
 function SparseDiffTools.PrecomputedJacobianColorvec(M::ColoredMatrix)
     return PrecomputedJacobianColorvec(; jac_prototype = M.M, M.row_colorvec,
         M.col_colorvec)
@@ -110,3 +115,91 @@ function __generate_sparse_jacobian_prototype(::MIRKCache, ::TwoPointBVProblem,
 end
 
 # For Multiple Shooting
+"""
+    __generate_sparse_jacobian_prototype(::MultipleShooting, ::StandardBVProblem,
+        bcresid_prototype, u0, N::Int, nshoots::Int)
+    __generate_sparse_jacobian_prototype(::MultipleShooting, ::TwoPointBVProblem,
+        bcresid_prototype, u0, N::Int, nshoots::Int)
+
+Returns a 3-Tuple:
+
+* Entire Jacobian Prototype (if Two-Point Problem) else `nothing`.
+* Sparse Non-BC Part Jacobian Prototype along with the column and row color vectors.
+* Sparse BC Part Jacobian Prototype along with the column and row color vectors (if
+  Two-Point Problem) else `nothing`.
+"""
+function __generate_sparse_jacobian_prototype(::MultipleShooting, ::StandardBVProblem,
+    bcresid_prototype, u0, N::Int, nshoots::Int)
+    Is = Vector{Int}(undef, (N^2 + N) * nshoots)
+    Js = Vector{Int}(undef, (N^2 + N) * nshoots)
+
+    idx = 1
+    for i in 1:nshoots
+        for (i₁, i₂) in Iterators.product(1:N, 1:N)
+            Is[idx] = i₁ + ((i - 1) * N)
+            Js[idx] = i₂ + ((i - 1) * N)
+            idx += 1
+        end
+        Is[idx:(idx + N - 1)] .= (1:N) .+ ((i - 1) * N)
+        Js[idx:(idx + N - 1)] .= (1:N) .+ (i * N)
+        idx += N
+    end
+
+    J_c = _sparse_like(Is, Js, u0)
+
+    col_colorvec = Vector{Int}(undef, size(J_c, 2))
+    for i in eachindex(col_colorvec)
+        col_colorvec[i] = mod1(i, 2N)
+    end
+    row_colorvec = Vector{Int}(undef, size(J_c, 1))
+    for i in eachindex(row_colorvec)
+        row_colorvec[i] = mod1(i, 2N)
+    end
+
+    return nothing, ColoredMatrix(J_c, row_colorvec, col_colorvec), nothing
+end
+
+function __generate_sparse_jacobian_prototype(alg::MultipleShooting, ::TwoPointBVProblem,
+    bcresid_prototype::ArrayPartition, u0, N::Int, nshoots::Int)
+    resida, residb = bcresid_prototype.x
+    L₁, L₂ = length(resida), length(residb)
+
+    _, J_c, _ = __generate_sparse_jacobian_prototype(alg, StandardBVProblem(),
+        bcresid_prototype, u0, N, nshoots)
+
+    Is_bc = Vector{Int}(undef, (L₁ + L₂) * N)
+    Js_bc = Vector{Int}(undef, (L₁ + L₂) * N)
+    idx = 1
+    for i in 1:L₁, j in 1:N
+        Is_bc[idx] = i
+        Js_bc[idx] = j
+        idx += 1
+    end
+    for i in 1:L₂, j in 1:N
+        Is_bc[idx] = i + L₁
+        Js_bc[idx] = j + N
+        idx += 1
+    end
+
+    col_colorvec_bc = Vector{Int}(undef, 2N)
+    row_colorvec_bc = Vector{Int}(undef, L₁ + L₂)
+    col_colorvec_bc[1:N] .= 1:N
+    col_colorvec_bc[(N + 1):end] .= 1:N
+    for i in 1:max(L₁, L₂)
+        i ≤ L₁ && (row_colorvec_bc[i] = i)
+        i ≤ L₂ && (row_colorvec_bc[i + L₁] = i)
+    end
+
+    J_bc = ColoredMatrix(_sparse_like(Is_bc, Js_bc, bcresid_prototype), row_colorvec_bc,
+        col_colorvec_bc)
+
+    J_full = _sparse_like(Int[], Int[], u0, size(J_bc, 1) + size(J_c, 1),
+        size(J_c, 2))
+
+    J_full[(L₁ + L₂ + 1):end, :] .= J_c.M
+    J_full[1:L₁, 1:N] .= J_bc.M[1:L₁, 1:N]
+    J_full[(L₁ + 1):(L₁ + L₂), (end - 2N + 1):(end - N)] .= J_bc.M[(L₁ + 1):(L₁ + L₂),
+        (N + 1):(2N)]
+
+    return J_full, J_c, J_bc
+end
diff --git a/src/types.jl b/src/types.jl
index 02c2bf7a..f38898af 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -42,13 +42,14 @@ end
 function BVPJacobianAlgorithm(diffmode = missing; nonbc_diffmode = missing,
     bc_diffmode = missing)
     if diffmode !== missing
-        @assert nonbc_diffmode === missing && bc_diffmode === missing
+        bc_diffmode = bc_diffmode === missing ? diffmode : bc_diffmode
+        nonbc_diffmode = nonbc_diffmode === missing ? diffmode : nonbc_diffmode
         return BVPJacobianAlgorithm(diffmode, diffmode, diffmode)
     else
         diffmode = nothing
         bc_diffmode = bc_diffmode === missing ? nothing : bc_diffmode
         nonbc_diffmode = nonbc_diffmode === missing ? nothing : nonbc_diffmode
-        return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, nonbc_diffmode)
+        return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, diffmode)
     end
 end
 
@@ -88,6 +89,12 @@ function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, ::TwoPointBV
     return BVPJacobianAlgorithm(bc_diffmode, nonbc_diffmode, diffmode)
 end
 
+# This can cause Type Instability
+function concretize_jacobian_algorithm(alg, prob)
+    @set! alg.jac_alg = concrete_jacobian_algorithm(alg.jac_alg, prob, alg)
+    return alg
+end
+
 function MIRKJacobianComputationAlgorithm(diffmode = missing;
     collocation_diffmode = missing, bc_diffmode = missing)
     Base.depwarn("`MIRKJacobianComputationAlgorithm` has been deprecated in favor of \
diff --git a/test/mirk/ensemble.jl b/test/mirk/ensemble.jl
index e0771993..d03c2d94 100644
--- a/test/mirk/ensemble.jl
+++ b/test/mirk/ensemble.jl
@@ -19,9 +19,9 @@ bvp = BVProblem(ode!, bc!, initial_guess, tspan, p)
 ensemble_prob = EnsembleProblem(bvp; prob_func)
 
 @testset "$(solver)" for solver in (MIRK2, MIRK3, MIRK4, MIRK5, MIRK6)
-    jac_algs = [MIRKJacobianComputationAlgorithm(),
-        MIRKJacobianComputationAlgorithm(; bc_diffmode = AutoFiniteDiff(),
-            collocation_diffmode = AutoSparseFiniteDiff())]
+    jac_algs = [BVPJacobianAlgorithm(),
+        BVPJacobianAlgorithm(; bc_diffmode = AutoFiniteDiff(),
+            nonbc_diffmode = AutoSparseFiniteDiff())]
     for jac_alg in jac_algs
         # Not sure why it is throwing so many warnings
         sol = solve(ensemble_prob, solver(; jac_alg); trajectories = 10, dt = 0.1)
diff --git a/test/mirk/mirk_convergence_tests.jl b/test/mirk/mirk_convergence_tests.jl
index 54707c75..cdfd879f 100644
--- a/test/mirk/mirk_convergence_tests.jl
+++ b/test/mirk/mirk_convergence_tests.jl
@@ -115,8 +115,8 @@ end
 u0 = MVector{2}([pi / 2, pi / 2])
 bvp1 = BVProblem(simplependulum!, bc_pendulum!, u0, tspan)
 
-jac_alg = MIRKJacobianComputationAlgorithm(; bc_diffmode = AutoFiniteDiff(),
-    collocation_diffmode = AutoSparseFiniteDiff())
+jac_alg = BVPJacobianAlgorithm(; bc_diffmode = AutoFiniteDiff(),
+    nonbc_diffmode = AutoSparseFiniteDiff())
 
 # Using ForwardDiff might lead to Cache expansion warnings
 @test_nowarn solve(bvp1, MIRK2(; jac_alg); dt = 0.005)
diff --git a/test/misc/non_vector_inputs.jl b/test/misc/non_vector_inputs.jl
index b7332505..170d48f9 100644
--- a/test/misc/non_vector_inputs.jl
+++ b/test/misc/non_vector_inputs.jl
@@ -58,5 +58,5 @@ probs = [
         end
     end
 
-    # TODO: Multiple Shooting
+    # FIXME: Add Multiple Shooting here once it supports non-vector inputs
 end
diff --git a/test/shooting/orbital.jl b/test/shooting/orbital.jl
index 65f644fc..d28c47d4 100644
--- a/test/shooting/orbital.jl
+++ b/test/shooting/orbital.jl
@@ -64,25 +64,24 @@ cur_bc_2point_b! = (resid, sol, p) -> bc!_generator_2p_b(resid, sol, init_val)
 resid_f = Array{Float64}(undef, 6)
 resid_f_2p = (Array{Float64, 1}(undef, 3), Array{Float64, 1}(undef, 3))
 
-TestTol = 0.05
-
 ### Now use the BVP solver to get closer
 bvp = BVProblem(orbital!, cur_bc!, y0, tspan)
 for autodiff in (AutoForwardDiff(), AutoFiniteDiff(; fdtype = Val(:central)),
     AutoSparseForwardDiff(), AutoFiniteDiff(; fdtype = Val(:forward)),
     AutoSparseFiniteDiff())
-    nlsolve = NewtonRaphson(; autodiff)
-    @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true,
-        abstol = 1e-13, reltol = 1e-13)
+    nlsolve = TrustRegion(; autodiff)
+    @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-13,
+        reltol = 1e-13, verbose = false)
     cur_bc!(resid_f, sol, nothing, sol.t)
-    @test norm(resid_f, Inf) < TestTol
+    @info "Single Shooting Lambert's Problem: $(norm(resid_f, Inf))"
+    @test norm(resid_f, Inf) < 0.005
 
     jac_alg = BVPJacobianAlgorithm(; nonbc_diffmode = autodiff)
-    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve, jac_alg); abstol = 1e-6,
-        reltol = 1e-6)
-    @test SciMLBase.successful_retcode(sol)
+    @time sol = solve(bvp, MultipleShooting(10, AutoVern7(Rodas5P()); nlsolve, jac_alg);
+        abstol = 1e-6, reltol = 1e-6, verbose = false)
     cur_bc!(resid_f, sol, nothing, sol.t)
-    @test norm(resid_f, Inf) < 1e-6
+    @info "Multiple Shooting Lambert's Problem: $(norm(resid_f, Inf))"
+    @test norm(resid_f, Inf) < 0.005
 end
 
 ### Using the TwoPoint BVP Structure
@@ -91,18 +90,19 @@ bvp = TwoPointBVProblem(orbital!, (cur_bc_2point_a!, cur_bc_2point_b!), y0, tspa
 for autodiff in (AutoForwardDiff(), AutoFiniteDiff(; fdtype = Val(:central)),
     AutoSparseForwardDiff(), AutoFiniteDiff(; fdtype = Val(:forward)),
     AutoSparseFiniteDiff())
-    nlsolve = NewtonRaphson(; autodiff)
+    nlsolve = TrustRegion(; autodiff)
     @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-13,
-        reltol = 1e-13)
+        reltol = 1e-13, verbose = false)
     cur_bc_2point_a!(resid_f_2p[1], sol(t0), nothing)
     cur_bc_2point_b!(resid_f_2p[2], sol(t1), nothing)
-    @test norm(reduce(vcat, resid_f_2p), Inf) < TestTol
+    @info "Single Shooting Lambert's Problem: $(norm(reduce(vcat, resid_f_2p), Inf))"
+    @test norm(reduce(vcat, resid_f_2p), Inf) < 0.005
 
-    jac_alg = BVPJacobianAlgorithm(; nonbc_diffmode = autodiff)
-    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve, jac_alg); abstol = 1e-6,
-        reltol = 1e-6)
-    @test SciMLBase.successful_retcode(sol)
+    jac_alg = BVPJacobianAlgorithm(; nonbc_diffmode = autodiff, bc_diffmode = autodiff)
+    @time sol = solve(bvp, MultipleShooting(10, AutoVern7(Rodas5P()); nlsolve, jac_alg);
+        abstol = 1e-6, reltol = 1e-6, verbose = false)
     cur_bc_2point_a!(resid_f_2p[1], sol(t0), nothing)
     cur_bc_2point_b!(resid_f_2p[2], sol(t1), nothing)
-    @test norm(reduce(vcat, resid_f_2p), Inf) < TestTol
+    @info "Multiple Shooting Lambert's Problem: $(norm(reduce(vcat, resid_f_2p), Inf))"
+    @test norm(reduce(vcat, resid_f_2p), Inf) < 0.005
 end

From 54e75da23bc362fd99d24a0842e8d7dfca917991 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Wed, 11 Oct 2023 22:18:02 -0400
Subject: [PATCH 28/28] Split up the tests

---
 .github/workflows/CI.yml |  6 +++-
 test/runtests.jl         | 62 +++++++++++++++++++++++-----------------
 2 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index e25f4f89..84629b0e 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -12,7 +12,9 @@ jobs:
     strategy:
       matrix:
         group:
-          - Core
+          - Shooting
+          - MIRK
+          - Others
         version:
           - '1'
     steps:
@@ -32,6 +34,8 @@ jobs:
             ${{ runner.os }}-
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
+        env:
+          GROUP: ${{ matrix.group }}
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v3
         with:
diff --git a/test/runtests.jl b/test/runtests.jl
index 7779d163..aa3bd432 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,41 +1,49 @@
 using Test, SafeTestsets
 
+const GROUP = uppercase(get(ENV, "GROUP", "ALL"))
+
 @testset "Boundary Value Problem Tests" begin
-    @time @testset "Shooting Method Tests" begin
-        @time @safetestset "Shooting Tests" begin
-            include("shooting/shooting_tests.jl")
-        end
-        @time @safetestset "Ray Tracing BVP" begin
-            include("shooting/ray_tracing.jl")
-        end
-        @time @safetestset "Orbital" begin
-            include("shooting/orbital.jl")
+    if GROUP == "ALL" || GROUP == "SHOOTING"
+        @time @testset "Shooting Method Tests" begin
+            @time @safetestset "Shooting Tests" begin
+                include("shooting/shooting_tests.jl")
+            end
+            @time @safetestset "Ray Tracing BVP" begin
+                include("shooting/ray_tracing.jl")
+            end
+            @time @safetestset "Orbital" begin
+                include("shooting/orbital.jl")
+            end
         end
     end
 
-    @time @testset "Collocation Method (MIRK) Tests" begin
-        @time @safetestset "Ensemble" begin
-            include("mirk/ensemble.jl")
-        end
-        @time @safetestset "MIRK Convergence Tests" begin
-            include("mirk/mirk_convergence_tests.jl")
-        end
-        @time @safetestset "Vector of Vector" begin
-            include("mirk/vectorofvector_initials.jl")
+    if GROUP == "ALL" || GROUP == "MIRK"
+        @time @testset "Collocation Method (MIRK) Tests" begin
+            @time @safetestset "Ensemble" begin
+                include("mirk/ensemble.jl")
+            end
+            @time @safetestset "MIRK Convergence Tests" begin
+                include("mirk/mirk_convergence_tests.jl")
+            end
+            @time @safetestset "Vector of Vector" begin
+                include("mirk/vectorofvector_initials.jl")
+            end
         end
     end
 
-    @time @testset "Miscelleneous" begin
-        @time @safetestset "Non Vector Inputs" begin
-            include("misc/non_vector_inputs.jl")
-        end
+    if GROUP == "ALL" || GROUP == "OTHERS"
+        @time @testset "Miscelleneous" begin
+            @time @safetestset "Non Vector Inputs" begin
+                include("misc/non_vector_inputs.jl")
+            end
 
-        @time @safetestset "Type Stability" begin
-            include("misc/type_stability.jl")
-        end
+            @time @safetestset "Type Stability" begin
+                include("misc/type_stability.jl")
+            end
 
-        @time @safetestset "ODE Interface Tests" begin
-            include("misc/odeinterface_ex7.jl")
+            @time @safetestset "ODE Interface Tests" begin
+                include("misc/odeinterface_ex7.jl")
+            end
         end
     end
 end