From e4924d5f053ecc44f82205a09520549e3e8fbe88 Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Wed, 18 Oct 2023 12:02:44 -0400
Subject: [PATCH 1/4] [WIP] Setup DiffEqGPU integration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MWE:

```py
from diffeqpy import de

def f(u,p,t):
    x, y, z = u
    sigma, rho, beta = p
    return [sigma * (y - x), x * (rho - z) - y, x * y - beta * z]

u0 = [1.0,0.0,0.0]
tspan = (0., 100.)
p = [10.0,28.0,8/3]
prob = de.ODEProblem(f, u0, tspan, p)
fast_prob = de.jit(prob)
sol = de.solve(fast_prob,saveat=0.01)

import random
def prob_func(prob,i,rep):
  de.remake(prob,u0=[random.uniform(0, 1)*u0[i] for i in range(0,3)],
            p=[random.uniform(0, 1)*p[i] for i in range(0,3)])

ensembleprob = de.EnsembleProblem(fast_prob, prob_func = prob_func, safetycopy=False)
sol = de.solve(ensembleprob,de.Tsit5(),de.EnsembleSerial(),trajectories=10000,saveat=0.01)

from diffeqpy import cuda

sol = de.solve(ensembleprob,cuda.GPUTsit5(),cuda.EnsembleGPUKernel(cuda.CUDABackend()),trajectories=10000,saveat=0.01)
```

Currently fails at the first ensemble solve:

```py
sol = de.solve(ensembleprob,de.Tsit5(),de.EnsembleSerial(),trajectories=10000,saveat=0.01)
```

```
>>> sol = de.solve(ensembleprob,de.Tsit5(),de.EnsembleSerial(),trajectories=10000,saveat=0.01)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\Users\accou\.julia\packages\PythonCall\qTEA1\src\jlwrap\any.jl", line 208, in __call__
    return self._jl_callmethod($(pyjl_methodnum(pyjlany_call)), args, kwargs)
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
juliacall.JuliaError: MethodError: no method matching init(::Py, ::OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}; saveat::Float64)

Closest candidates are:
  init(!Matched::SciMLBase.OptimizationProblem, ::Any, !Matched::Any...; kwargs...)
   @ SciMLBase C:\Users\accou\.julia\packages\SciMLBase\McEqc\src\solve.jl:146
  init(!Matched::SciMLBase.PDEProblem, ::SciMLBase.AbstractDEAlgorithm, !Matched::Any...; kwargs...)
   @ DiffEqBase C:\Users\accou\.julia\packages\DiffEqBase\xSmHR\src\solve.jl:1116
  init(!Matched::SciMLBase.AbstractJumpProblem, ::Any...; kwargs...)
   @ DiffEqBase C:\Users\accou\.julia\packages\DiffEqBase\xSmHR\src\solve.jl:499
  ...

Stacktrace:
  [1] solve(::Py, ::Vararg{Any}; kwargs::Base.Pairs{Symbol, Float64, Tuple{Symbol}, NamedTuple{(:saveat,), Tuple{Float64}}})
    @ CommonSolve C:\Users\accou\.julia\packages\CommonSolve\JfpfI\src\CommonSolve.jl:23
  [2] batch_func(i::Int64, prob::SciMLBase.EnsembleProblem{SciMLBase.ODEProblem{Vector{Float64}, Tuple{Float64, Float64}, true, Vector{Float64}, SciMLBase.ODEFunction{true, SciMLBase.AutoSpecialize, ModelingToolkit.var"#k#545"{RuntimeGeneratedFunctions.RuntimeGeneratedFunction{(:ˍ₋arg1, :ˍ₋arg2, :t), ModelingToolkit.var"#_RGF_ModTag", ModelingToolkit.var"#_RGF_ModTag", (0x946926fe, 0xab7b8dc2, 0x707dbba2, 0x8b060826, 0xe63fdf5b), Nothing}, RuntimeGeneratedFunctions.RuntimeGeneratedFunction{(:ˍ₋out, :ˍ₋arg1, :ˍ₋arg2, :t), ModelingToolkit.var"#_RGF_ModTag", ModelingToolkit.var"#_RGF_ModTag", (0xaf9b47a5, 0x8d86fb18, 0x65c64d40, 0xf8480c5a, 0x8614cffa), Nothing}}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Vector{Symbol}, Symbol, Vector{Symbol}, ModelingToolkit.var"#637#generated_observed#555"{Bool, ModelingToolkit.ODESystem, Dict{Any, Any}, Vector{SymbolicUtils.BasicSymbolic{Real}}}, Nothing, ModelingToolkit.ODESystem}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, Py, typeof(SciMLBase.DEFAULT_OUTPUT_FUNC), typeof(SciMLBase.DEFAULT_REDUCTION), Nothing}, alg::OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}; kwargs::Base.Pairs{Symbol, Float64, Tuple{Symbol}, NamedTuple{(:saveat,), Tuple{Float64}}})
    @ SciMLBase C:\Users\accou\.julia\packages\SciMLBase\McEqc\src\ensemble\basic_ensemble_solve.jl:100
  [3] (::SciMLBase.var"#604#605"{Base.Pairs{Symbol, Float64, Tuple{Symbol}, NamedTuple{(:saveat,), Tuple{Float64}}}, SciMLBase.EnsembleProblem{SciMLBase.ODEProblem{Vector{Float64}, Tuple{Float64, Float64}, true, Vector{Float64}, SciMLBase.ODEFunction{true, SciMLBase.AutoSpecialize, ModelingToolkit.var"#k#545"{RuntimeGeneratedFunctions.RuntimeGeneratedFunction{(:ˍ₋arg1, :ˍ₋arg2, :t), ModelingToolkit.var"#_RGF_ModTag", ModelingToolkit.var"#_RGF_ModTag", (0x946926fe, 0xab7b8dc2, 0x707dbba2, 0x8b060826, 0xe63fdf5b), Nothing}, RuntimeGeneratedFunctions.RuntimeGeneratedFunction{(:ˍ₋out, :ˍ₋arg1, :ˍ₋arg2, :t), ModelingToolkit.var"#_RGF_ModTag", ModelingToolkit.var"#_RGF_ModTag", (0xaf9b47a5, 0x8d86fb18, 0x65c64d40, 0xf8480c5a, 0x8614cffa), Nothing}}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Vector{Symbol}, Symbol, Vector{Symbol}, ModelingToolkit.var"#637#generated_observed#555"{Bool, ModelingToolkit.ODESystem, Dict{Any, Any}, Vector{SymbolicUtils.BasicSymbolic{Real}}}, Nothing, ModelingToolkit.ODESystem}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, Py, typeof(SciMLBase.DEFAULT_OUTPUT_FUNC), typeof(SciMLBase.DEFAULT_REDUCTION), Nothing}, OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}})(i::Int64)
    @ SciMLBase C:\Users\accou\.julia\packages\SciMLBase\McEqc\src\ensemble\basic_ensemble_solve.jl:154
  [4] responsible_map
    @ C:\Users\accou\.julia\packages\SciMLBase\McEqc\src\ensemble\basic_ensemble_solve.jl:147 [inlined]
  [5] #solve_batch#603
    @ C:\Users\accou\.julia\packages\SciMLBase\McEqc\src\ensemble\basic_ensemble_solve.jl:153 [inlined]
  [6] solve_batch
    @ C:\Users\accou\.julia\packages\SciMLBase\McEqc\src\ensemble\basic_ensemble_solve.jl:152 [inlined]
  [7] macro expansion
    @ .\timing.jl:393 [inlined]
  [8] __solve(prob::SciMLBase.EnsembleProblem{SciMLBase.ODEProblem{Vector{Float64}, Tuple{Float64, Float64}, true, Vector{Float64}, SciMLBase.ODEFunction{true, SciMLBase.AutoSpecialize, ModelingToolkit.var"#k#545"{RuntimeGeneratedFunctions.RuntimeGeneratedFunction{(:ˍ₋arg1, :ˍ₋arg2, :t), ModelingToolkit.var"#_RGF_ModTag", ModelingToolkit.var"#_RGF_ModTag", (0x946926fe, 0xab7b8dc2, 0x707dbba2, 0x8b060826, 0xe63fdf5b), Nothing}, RuntimeGeneratedFunctions.RuntimeGeneratedFunction{(:ˍ₋out, :ˍ₋arg1, :ˍ₋arg2, :t), ModelingToolkit.var"#_RGF_ModTag", ModelingToolkit.var"#_RGF_ModTag", (0xaf9b47a5, 0x8d86fb18, 0x65c64d40, 0xf8480c5a, 0x8614cffa), Nothing}}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Vector{Symbol}, Symbol, Vector{Symbol}, ModelingToolkit.var"#637#generated_observed#555"{Bool, ModelingToolkit.ODESystem, Dict{Any, Any}, Vector{SymbolicUtils.BasicSymbolic{Real}}}, Nothing, ModelingToolkit.ODESystem}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, Py, typeof(SciMLBase.DEFAULT_OUTPUT_FUNC), typeof(SciMLBase.DEFAULT_REDUCTION), Nothing}, alg::OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}, ensemblealg::SciMLBase.EnsembleSerial; trajectories::Int64, batch_size::Int64, pmap_batch_size::Int64, kwargs::Base.Pairs{Symbol, Float64, Tuple{Symbol}, NamedTuple{(:saveat,), Tuple{Float64}}})
    @ SciMLBase C:\Users\accou\.julia\packages\SciMLBase\McEqc\src\ensemble\basic_ensemble_solve.jl:64
  [9] solve(::SciMLBase.EnsembleProblem{SciMLBase.ODEProblem{Vector{Float64}, Tuple{Float64, Float64}, true, Vector{Float64}, SciMLBase.ODEFunction{true, SciMLBase.AutoSpecialize, ModelingToolkit.var"#k#545"{RuntimeGeneratedFunctions.RuntimeGeneratedFunction{(:ˍ₋arg1, :ˍ₋arg2, :t), ModelingToolkit.var"#_RGF_ModTag", ModelingToolkit.var"#_RGF_ModTag", (0x946926fe, 0xab7b8dc2, 0x707dbba2, 0x8b060826, 0xe63fdf5b), Nothing}, RuntimeGeneratedFunctions.RuntimeGeneratedFunction{(:ˍ₋out, :ˍ₋arg1, :ˍ₋arg2, :t), ModelingToolkit.var"#_RGF_ModTag", ModelingToolkit.var"#_RGF_ModTag", (0xaf9b47a5, 0x8d86fb18, 0x65c64d40, 0xf8480c5a, 0x8614cffa), Nothing}}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Vector{Symbol}, Symbol, Vector{Symbol}, ModelingToolkit.var"#637#generated_observed#555"{Bool, ModelingToolkit.ODESystem, Dict{Any, Any}, Vector{SymbolicUtils.BasicSymbolic{Real}}}, Nothing, ModelingToolkit.ODESystem}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, Py, typeof(SciMLBase.DEFAULT_OUTPUT_FUNC), typeof(SciMLBase.DEFAULT_REDUCTION), Nothing}, ::OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}, ::Vararg{Any}; kwargs::Base.Pairs{Symbol, Real, Tuple{Symbol, Symbol}, NamedTuple{(:saveat, :trajectories), Tuple{Float64, Int64}}})
    @ DiffEqBase C:\Users\accou\.julia\packages\DiffEqBase\xSmHR\src\solve.jl:1053
 [10] pyjlany_call(self::typeof(CommonSolve.solve), args_::Py, kwargs_::Py)
    @ PythonCall C:\Users\accou\.julia\packages\PythonCall\qTEA1\src\jlwrap\any.jl:34
 [11] _pyjl_callmethod(f::Any, self_::Ptr{PythonCall.C.PyObject}, args_::Ptr{PythonCall.C.PyObject}, nargs::Int64)
    @ PythonCall C:\Users\accou\.julia\packages\PythonCall\qTEA1\src\jlwrap\base.jl:69
 [12] _pyjl_callmethod(o::Ptr{PythonCall.C.PyObject}, args::Ptr{PythonCall.C.PyObject})
    @ PythonCall.C C:\Users\accou\.julia\packages\PythonCall\qTEA1\src\cpython\jlwrap.jl:47
```
---
 diffeqpy/cuda.py | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 diffeqpy/cuda.py
diff --git a/diffeqpy/cuda.py b/diffeqpy/cuda.py
new file mode 100644
index 0000000..c46a300
--- /dev/null
+++ b/diffeqpy/cuda.py
@@ -0,0 +1,4 @@
+import sys
+from . import load_julia_packages
+cuda, _ = load_julia_packages("DiffEqGPU, CUDA")
+sys.modules[__name__] = cuda # mutate myself

From 00e53cf5c67c5082415e270ce37ba58cff9c824b Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Thu, 19 Oct 2023 18:18:49 -0400
Subject: [PATCH 2/4] import CUDABackend

---
 diffeqpy/cuda.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/diffeqpy/cuda.py b/diffeqpy/cuda.py
index c46a300..c10ac81 100644
--- a/diffeqpy/cuda.py
+++ b/diffeqpy/cuda.py
@@ -1,4 +1,6 @@
 import sys
 from . import load_julia_packages
 cuda, _ = load_julia_packages("DiffEqGPU, CUDA")
+from juliacall import Main
+de.CUDABackend = Main.seval("CUDA.CUDABackend") # kinda hacky
 sys.modules[__name__] = cuda # mutate myself

From b0ebe3f2636711052e593a2de8a6c46f07c3dfaf Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Thu, 19 Oct 2023 23:44:03 -0400
Subject: [PATCH 3/4] support other GPU backends and write the README with
 jit32

---
 README.md          | 264 +++++++++++++++++++++++++++++++++++++++++++++
 diffeqpy/amdgpu.py |   6 ++
 diffeqpy/de.py     |   1 +
 diffeqpy/metal.py  |   6 ++
 diffeqpy/oneapi.py |   6 ++
 5 files changed, 283 insertions(+)
 create mode 100644 diffeqpy/amdgpu.py
 create mode 100644 diffeqpy/metal.py
 create mode 100644 diffeqpy/oneapi.py

diff --git a/README.md b/README.md
index a42a5eb..362b670 100644
--- a/README.md
+++ b/README.md
@@ -456,6 +456,270 @@ Notice that the solver accurately is able to simulate the kink (discontinuity)
 at `t=20` due to the discontinuity of the derivative at the initial time point!
 This is why declaring discontinuities can enhance the solver accuracy.
 
+## GPU-Accelerated ODE Solving of Ensembles
+
+In many cases one is interested in solving the same ODE many times over many
+different initial conditions and parameters. In diffeqpy parlance this is called
+an ensemble solve. diffeqpy inherits the parallelism tools of the
+[SciML ecosystem](https://sciml.ai/) that are used for things like
+[automated equation discovery and acceleration](https://arxiv.org/abs/2001.04385).
+Here we will demonstrate using these parallel tools to accelerate the solving
+of an ensemble.
+
+First, let's define the JIT-accelerated Lorenz equation like before:
+
+```py
+from diffeqpy import de
+
+def f(u,p,t):
+    x, y, z = u
+    sigma, rho, beta = p
+    return [sigma * (y - x), x * (rho - z) - y, x * y - beta * z]
+
+u0 = [1.0,0.0,0.0]
+tspan = (0., 100.)
+p = [10.0,28.0,8/3]
+prob = de.ODEProblem(f, u0, tspan, p)
+fast_prob = de.jit32(prob)
+sol = de.solve(fast_prob,saveat=0.01)
+```
+
+Note that here we used `de.jit32` to JIT-compile the problem into a `Float32` form in order to make it more
+efficient on most GPUs.
+
+Now we use the `EnsembleProblem` as defined on the
+[ensemble parallelism page of the documentation](https://diffeq.sciml.ai/stable/features/ensemble/):
+Let's build an ensemble by utilizing uniform random numbers to randomize the
+initial conditions and parameters:
+
+```py
+import random
+def prob_func(prob,i,rep):
+  de.remake(prob,u0=[random.uniform(0, 1)*u0[i] for i in range(0,3)],
+            p=[random.uniform(0, 1)*p[i] for i in range(0,3)])
+
+ensembleprob = de.EnsembleProblem(fast_prob, safetycopy=False)
+```
+
+Now we solve the ensemble in serial:
+
+```py
+sol = de.solve(ensembleprob,de.Tsit5(),de.EnsembleSerial(),trajectories=10000,saveat=0.01)
+```
+
+To add GPUs to the mix, we need to bring in [DiffEqGPU](https://github.com/SciML/DiffEqGPU.jl).
+The `cuda` submodule will install CUDA for you and bring all of the bindings into the returned object:
+
+```py
+from diffeqpy import cuda
+```
+
+#### Note: `from diffeqpy import cuda` can take awhile to run the first time as it installs the drivers!
+
+Now we simply use `EnsembleGPUKernel(cuda.CUDABackend())` with a
+GPU-specialized ODE solver `cuda.GPUTsit5()` to solve 10,000 ODEs on the GPU in 
+parallel:
+
+```py
+sol = de.solve(ensembleprob,cuda.GPUTsit5(),cuda.EnsembleGPUKernel(CUDABackend()),trajectories=10000,saveat=0.01)
+```
+
+For the full list of choices for specialized GPU solvers, see 
+[the DiffEqGPU.jl documentation](https://docs.sciml.ai/DiffEqGPU/stable/manual/ensemblegpukernel/).
+
+Note that `EnsembleGPUArray` can be used as well, like:
+
+```py
+sol = de.solve(ensembleprob,de.Tsit5(),cuda.EnsembleGPUArray(CUDABackend()),trajectories=10000,saveat=0.01)
+```
+
+though we highly recommend the `EnsembleGPUKernel` methods for more speed. Given
+the way the JIT compilation performed will also ensure that the faster kernel
+generation methods work, `EnsembleGPUKernel` is almost certainly the
+better choice in most applications.
+
+### Benchmark
+
+To see how much of an effect the parallelism has, let's test this against R's
+deSolve package. This is exactly the same problem as the documentation example
+for deSolve, so let's copy that verbatim and then add a function to do the
+ensemble generation:
+
+```py
+import numpy as np
+from scipy.integrate import odeint
+ 
+def lorenz(state, t, sigma, beta, rho):
+    x, y, z = state
+     
+    dx = sigma * (y - x)
+    dy = x * (rho - z) - y
+    dz = x * y - beta * z
+     
+    return [dx, dy, dz]
+
+sigma = 10.0
+beta = 8.0 / 3.0
+rho = 28.0 
+p = (sigma, beta, rho)	
+y0 = [1.0, 1.0, 1.0]
+
+t = np.arange(0.0, 100.0, 0.01)
+result = odeint(lorenz, y0, t, p)
+```
+
+Using `lapply` to generate the ensemble we get:
+
+```py
+import timeit
+def time_func():
+    for itr in range(1, 1001):
+        result = odeint(lorenz, y0, t, p)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 38.08861699999761 seconds
+```
+
+Now let's see how the JIT-accelerated serial Julia version stacks up against that:
+
+```py
+def time_func():
+    sol = de.solve(ensembleprob,de.Tsit5(),de.EnsembleSerial(),trajectories=1000,saveat=0.01)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 3.1903300999983912
+```
+
+Julia is already about 12x faster than the pure Python solvers here! Now let's add
+GPU-acceleration to the mix:
+
+```py
+def time_func():
+    sol = de.solve(ensembleprob,cuda.GPUTsit5(),cuda.EnsembleGPUKernel(CUDABackend()),trajectories=1000,saveat=0.01)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 0.013322799997695256
+```
+
+Already 2900x faster than SciPy! But the GPU acceleration is made for massively
+parallel problems, so let's up the trajectories a bit. We will not use more
+trajectories from R because that would take too much computing power, so let's
+see what happens to the Julia serial and GPU at 10,000 trajectories:
+
+```py
+def time_func():
+    sol = de.solve(ensembleprob,de.Tsit5(),de.EnsembleSerial(),trajectories=10000,saveat=0.01)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 68.80795999999827
+```
+
+```py
+def time_func():
+    sol = de.solve(ensembleprob,cuda.GPUTsit5(),cuda.EnsembleGPUKernel(CUDABackend()),trajectories=10000,saveat=0.01)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 0.10774460000175168
+```
+
+To compare this to the pure Julia code:
+
+```julia
+using OrdinaryDiffEq, DiffEqGPU, CUDA, StaticArrays
+function lorenz(u, p, t)
+    σ = p[1]
+    ρ = p[2]
+    β = p[3]
+    du1 = σ * (u[2] - u[1])
+    du2 = u[1] * (ρ - u[3]) - u[2]
+    du3 = u[1] * u[2] - β * u[3]
+    return SVector{3}(du1, du2, du3)
+end
+
+u0 = SA[1.0f0; 0.0f0; 0.0f0]
+tspan = (0.0f0, 10.0f0)
+p = SA[10.0f0, 28.0f0, 8 / 3.0f0]
+prob = ODEProblem{false}(lorenz, u0, tspan, p)
+prob_func = (prob, i, repeat) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
+monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
+@time sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
+    trajectories = 10_000,
+    saveat = 0.01);
+
+# 0.014481 seconds (257.64 k allocations: 13.130 MiB)
+```
+
+which is about an order of magnitude faster for computing 10,000 trajectories,
+note that the major factors are that we cannot define 32-bit floating point values
+from Python and the `prob_func` for generating the initial conditions and parameters
+is a major bottleneck since this function is written in Python.
+
+To see how this scales in Julia, let's take it to insane heights. First, let's
+reduce the amount we're saving:
+
+```julia
+@time sol = solve(monteprob,GPUTsit5(),EnsembleGPUKernel(CUDA.CUDABackend()),trajectories=10_000,saveat=1.0f0)
+0.015040 seconds (257.64 k allocations: 13.130 MiB)
+```
+
+This highlights that controlling memory pressure is key with GPU usage: you will
+get much better performance when requiring less saved points on the GPU.
+
+```julia
+@time sol = solve(monteprob,GPUTsit5(),EnsembleGPUKernel(CUDA.CUDABackend()),trajectories=100_000,saveat=1.0f0)
+# 0.150901 seconds (2.60 M allocations: 131.576 MiB)
+```
+
+compared to serial:
+
+```julia
+@time sol = solve(monteprob,Tsit5(),EnsembleSerial(),trajectories=100_000,saveat=1.0f0)
+# 22.136743 seconds (16.40 M allocations: 1.628 GiB, 42.98% gc time)
+```
+
+And now we start to see that scaling power! Let's solve 1 million trajectories:
+
+```julia
+@time sol = solve(monteprob,GPUTsit5(),EnsembleGPUKernel(CUDA.CUDABackend()),trajectories=1_000_000,saveat=1.0f0)
+# 1.031295 seconds (3.40 M allocations: 241.075 MiB)
+```
+
+For reference, let's look at deSolve with the change to only save that much:
+
+```py
+t = np.arange(0.0, 100.0, 1.0)
+def time_func():
+    for itr in range(1, 1001):
+        result = odeint(lorenz, y0, t, p)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 37.42609280000033
+```
+
+The GPU version is solving 1000x as many trajectories, 37x as fast! So conclusion,
+if you need the most speed, you may want to move to the Julia version to get the
+most out of your GPU due to Float32's, and when using GPUs make sure it's a problem
+with a relatively average or low memory pressure, and these methods will give
+orders of magnitude acceleration compared to what you might be used to.
+
+## GPU Backend Choices
+
+Just like DiffEqGPU.jl, diffeqpy supports many different GPU venders. `from diffeqpy import cuda`
+is just for cuda, but the following others are supported:
+
+- `from diffeqpy import cuda` with `cuda.CUDABackend` for NVIDIA GPUs via CUDA
+- `from diffeqpy import amdgpu` with `amdgpu.AMDGPUBackend` for AMD GPUs
+- `from diffeqpy import oneapi` with `oneapi.oneAPIBackend` for Intel's oneAPI GPUs
+- `from diffeqpy import metal` with `metal.MetalBackend` for Apple's Metal GPUs (on M-series processors)
+
+For more information, see [the DiffEqGPU.jl documentation](https://docs.sciml.ai/DiffEqGPU/stable/manual/backends/).
+
 ## Known Limitations
 
 - Autodiff does not work on Python functions. When applicable, either define the derivative function
diff --git a/diffeqpy/amdgpu.py b/diffeqpy/amdgpu.py
new file mode 100644
index 0000000..8c97cb9
--- /dev/null
+++ b/diffeqpy/amdgpu.py
@@ -0,0 +1,6 @@
+import sys
+from . import load_julia_packages
+amdgpu, _ = load_julia_packages("DiffEqGPU, AMDGPU")
+from juliacall import Main
+de.AMDGPUBackend = Main.seval("AMDGPU.AMDGPUBackend") # kinda hacky
+sys.modules[__name__] = amdgpu # mutate myself
diff --git a/diffeqpy/de.py b/diffeqpy/de.py
index 0e3376a..b049fa8 100644
--- a/diffeqpy/de.py
+++ b/diffeqpy/de.py
@@ -3,4 +3,5 @@
 de, _ = load_julia_packages("DifferentialEquations, ModelingToolkit")
 from juliacall import Main
 de.jit = Main.seval("jit(x) = typeof(x).name.wrapper(ModelingToolkit.modelingtoolkitize(x), x.u0, x.tspan, x.p)") # kinda hackey
+de.jit32 = Main.seval("jit(x) = typeof(x).name.wrapper(ModelingToolkit.modelingtoolkitize(x), Float32.(x.u0), Float32.(x.tspan), Float32.(x.p))") # kinda hackey
 sys.modules[__name__] = de # mutate myself
diff --git a/diffeqpy/metal.py b/diffeqpy/metal.py
new file mode 100644
index 0000000..9c61925
--- /dev/null
+++ b/diffeqpy/metal.py
@@ -0,0 +1,6 @@
+import sys
+from . import load_julia_packages
+metal, _ = load_julia_packages("DiffEqGPU, Metal")
+from juliacall import Main
+de.MetalBackend = Main.seval("Metal.MetalBackend") # kinda hacky
+sys.modules[__name__] = metal # mutate myself
\ No newline at end of file
diff --git a/diffeqpy/oneapi.py b/diffeqpy/oneapi.py
new file mode 100644
index 0000000..d79283e
--- /dev/null
+++ b/diffeqpy/oneapi.py
@@ -0,0 +1,6 @@
+import sys
+from . import load_julia_packages
+oneapi, _ = load_julia_packages("DiffEqGPU, oneAPI")
+from juliacall import Main
+de.oneAPIBackend = Main.seval("oneAPI.oneAPIBackend") # kinda hacky
+sys.modules[__name__] = oneapi # mutate myself

From d0c5ab1212e7ecd2c367b9415549447c9a8a8f5e Mon Sep 17 00:00:00 2001
From: Chris Rackauckas <accounts@chrisrackauckas.com>
Date: Thu, 19 Oct 2023 23:44:57 -0400
Subject: [PATCH 4/4] version bump

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 74d0d7b..a148f22 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ def readme():
         return f.read()
 
 setup(name='diffeqpy',
-      version='2.1.0',
+      version='2.2.0',
       description='Solving Differential Equations in Python',
       long_description=readme(),
       long_description_content_type="text/markdown",
@@ -19,7 +19,7 @@ def readme():
         'Topic :: Scientific/Engineering :: Physics'
       ],
       url='http://github.com/SciML/diffeqpy',
-      keywords='differential equations stochastic ordinary delay differential-algebraic dae ode sde dde',
+      keywords='differential equations stochastic ordinary delay differential-algebraic dae ode sde dde oneapi AMD ROCm CUDA Metal GPU',
       author='Chris Rackauckas and Takafumi Arakaki',
       author_email='contact@juliadiffeq.org',
       license='MIT',