diff --git a/docs/pages.jl b/docs/pages.jl
index 7f97e0083..c79eccd76 100644
--- a/docs/pages.jl
+++ b/docs/pages.jl
@@ -14,9 +14,6 @@ pages = ["index.md",
         "Hybrid and Jump Equations" => Any["examples/hybrid_jump/hybrid_diffeq.md",
             "examples/hybrid_jump/bouncing_ball.md"],
         "Bayesian Estimation" => Any["examples/bayesian/turing_bayesian.md"],
-        "Optimal and Model Predictive Control" => Any[
-            "examples/optimal_control/optimal_control.md",
-            "examples/optimal_control/feedback_control.md"]],
     "Manual and APIs" => Any[
         "manual/differential_equation_sensitivities.md",
         "manual/nonlinear_solve_sensitivities.md",
diff --git a/docs/src/examples/optimal_control/feedback_control.md b/docs/src/examples/optimal_control/feedback_control.md
deleted file mode 100644
index a2d9b8e33..000000000
--- a/docs/src/examples/optimal_control/feedback_control.md
+++ /dev/null
@@ -1,86 +0,0 @@
-# Universal Differential Equations for Neural Feedback Control
-
-You can also mix a known differential equation and a neural differential
-equation, so that the parameters and the neural network are estimated
-simultaneously!
-
-We will assume that we know the dynamics of the second equation
-(linear dynamics), and our goal is to find a neural network that is dependent
-on the current state of the dynamical system that will control the second
-equation to stay close to 1.
-
-```@example udeneuralcontrol
-using Lux, Optimization, OptimizationPolyalgorithms, ComponentArrays,
-    SciMLSensitivity, Zygote, OrdinaryDiffEq, Plots, Random
-
-rng = Random.default_rng()
-u0 = [1.1]
-tspan = (0.0, 25.0)
-tsteps = 0.0:1.0:25.0
-
-model_univ = Chain(Dense(2, 16, tanh), Dense(16, 16, tanh), Dense(16, 1))
-ps, st = Lux.setup(Random.default_rng(), model_univ)
-p_model = ComponentArray(ps)
-
-# Parameters of the second equation (linear dynamics)
-p_system = [0.5, -0.5]
-p_all = ComponentArray(; p_model, p_system)
-θ = ComponentArray(; u0, p_all)
-
-function dudt_univ!(du, u, p, t)
-    # Destructure the parameters
-    model_weights = p.p_model
-    α, β = p.p_system
-
-    # The neural network outputs a control taken by the system
-    # The system then produces an output
-    model_control, system_output = u
-
-    # Dynamics of the control and system
-    dmodel_control = first(model_univ(u, model_weights, st))[1]
-    dsystem_output = α * system_output + β * model_control
-
-    # Update in place
-    du[1] = dmodel_control
-    du[2] = dsystem_output
-end
-
-prob_univ = ODEProblem(dudt_univ!, [0.0, u0[1]], tspan, p_all)
-sol_univ = solve(prob_univ, Tsit5(), abstol = 1e-8, reltol = 1e-6)
-
-function predict_univ(θ)
-    return Array(solve(prob_univ, Tsit5(), u0 = [0.0, θ.u0[1]], p = θ.p_all,
-        sensealg = InterpolatingAdjoint(autojacvec = ReverseDiffVJP(true)),
-        saveat = tsteps))
-end
-
-loss_univ(θ) = sum(abs2, predict_univ(θ)[2, :] .- 1)
-l = loss_univ(θ)
-```
-
-```@example udeneuralcontrol
-list_plots = []
-iter = 0
-cb = function (state, l)
-    global list_plots, iter
-
-    if iter == 0
-        list_plots = []
-    end
-    iter += 1
-
-    println(l)
-
-    plt = plot(predict_univ(state.u)', ylim = (0, 6))
-    push!(list_plots, plt)
-    display(plt)
-    return false
-end
-```
-
-```@example udeneuralcontrol
-adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x, p) -> loss_univ(x), adtype)
-optprob = Optimization.OptimizationProblem(optf, θ)
-result_univ = Optimization.solve(optprob, PolyOpt(), callback = cb)
-```
diff --git a/docs/src/examples/optimal_control/optimal_control.md b/docs/src/examples/optimal_control/optimal_control.md
deleted file mode 100644
index 49f3ba2a0..000000000
--- a/docs/src/examples/optimal_control/optimal_control.md
+++ /dev/null
@@ -1,128 +0,0 @@
-# [Solving Optimal Control Problems with Universal Differential Equations](@id optcontrol)
-
-Here we will solve a classic optimal control problem with a universal differential
-equation. Let
-
-```math
-x^{\prime\prime} = u^3(t)
-```
-
-where we want to optimize our controller `u(t)` such that the following is
-minimized:
-
-```math
-L(\theta) = \sum_i \Vert 4 - x(t_i) \Vert + 2 \Vert x^\prime(t_i) \Vert + \Vert u(t_i) \Vert
-```
-
-where ``i`` is measured on (0,8) at 0.01 intervals. To do this, we rewrite the
-ODE in first order form:
-
-```math
-\begin{aligned}
-x^\prime &= v \\
-v^\prime &= u^3(t) \\
-\end{aligned}
-```
-
-and thus
-
-```math
-L(\theta) = \sum_i \Vert 4 - x(t_i) \Vert + 2 \Vert v(t_i) \Vert + \Vert u(t_i) \Vert
-```
-
-is our loss function on the first order system. We thus choose a neural network
-form for ``u`` and optimize the equation with respect to this loss. Note that we
-will first reduce control cost (the last term) by 10x in order to bump the network out
-of a local minimum. This looks like:
-
-```@example neuraloptimalcontrol
-using Lux, ComponentArrays, OrdinaryDiffEq, Optimization, OptimizationOptimJL,
-    OptimizationOptimisers, SciMLSensitivity, Zygote, Plots, Statistics, Random
-
-rng = Random.default_rng()
-tspan = (0.0f0, 8.0f0)
-
-ann = Chain(Dense(1, 32, tanh), Dense(32, 32, tanh), Dense(32, 1))
-ps, st = Lux.setup(rng, ann)
-p = ComponentArray(ps)
-
-θ, _ax = getdata(p), getaxes(p)
-const ax = _ax
-
-function dxdt_(dx, x, p, t)
-    ps = ComponentArray(p, ax)
-    x1, x2 = x
-    dx[1] = x[2]
-    dx[2] = first(ann([t], ps, st))[1]^3
-end
-x0 = [-4.0f0, 0.0f0]
-ts = Float32.(collect(0.0:0.01:tspan[2]))
-prob = ODEProblem(dxdt_, x0, tspan, θ)
-solve(prob, Vern9(), abstol = 1e-10, reltol = 1e-10)
-
-function predict_adjoint(θ)
-    Array(solve(prob, Vern9(), p = θ, saveat = ts))
-end
-function loss_adjoint(θ)
-    x = predict_adjoint(θ)
-    ps = ComponentArray(θ, ax)
-    mean(abs2, 4.0f0 .- x[1, :]) + 2mean(abs2, x[2, :]) +
-    mean(abs2, [first(first(ann([t], ps, st))) for t in ts]) / 10
-end
-
-l = loss_adjoint(θ)
-cb = function (state, l; doplot = true)
-    println(l)
-
-    ps = ComponentArray(state.u, ax)
-
-    if doplot
-        p = plot(solve(remake(prob, p = state.u), Tsit5(), saveat = 0.01),
-            ylim = (-6, 6), lw = 3)
-        plot!(p, ts, [first(first(ann([t], ps, st))) for t in ts], label = "u(t)", lw = 3)
-        display(p)
-    end
-
-    return false
-end
-
-# Setup and run the optimization
-
-loss1 = loss_adjoint(θ)
-adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x, p) -> loss_adjoint(x), adtype)
-
-optprob = Optimization.OptimizationProblem(optf, θ)
-res1 = Optimization.solve(
-    optprob, OptimizationOptimisers.Adam(0.01), callback = cb, maxiters = 100)
-
-optprob2 = Optimization.OptimizationProblem(optf, res1.u)
-res2 = Optimization.solve(
-    optprob2, OptimizationOptimJL.BFGS(), callback = cb, maxiters = 100)
-```
-
-Now that the system is in a better behaved part of parameter space, we return to
-the original loss function to finish the optimization:
-
-```@example neuraloptimalcontrol
-function loss_adjoint(θ)
-    x = predict_adjoint(θ)
-    ps = ComponentArray(θ, ax)
-    mean(abs2, 4.0 .- x[1, :]) + 2mean(abs2, x[2, :]) +
-    mean(abs2, [first(first(ann([t], ps, st))) for t in ts])
-end
-optf3 = Optimization.OptimizationFunction((x, p) -> loss_adjoint(x), adtype)
-
-optprob3 = Optimization.OptimizationProblem(optf3, res2.u)
-res3 = Optimization.solve(optprob3, OptimizationOptimJL.BFGS(), maxiters = 100)
-```
-
-Now let's see what we received:
-
-```@example neuraloptimalcontrol
-l = loss_adjoint(res3.u)
-cb(res3, l)
-p = plot(solve(remake(prob, p = res3.u), Tsit5(), saveat = 0.01), ylim = (-6, 6), lw = 3)
-plot!(p, ts, [first(first(ann([t], ComponentArray(res3.u, ax), st))) for t in ts],
-    label = "u(t)", lw = 3)
-```