diff --git a/README.md b/README.md
index a42a5eb..362b670 100644
--- a/README.md
+++ b/README.md
@@ -456,6 +456,270 @@ Notice that the solver accurately is able to simulate the kink (discontinuity)
 at `t=20` due to the discontinuity of the derivative at the initial time point!
 This is why declaring discontinuities can enhance the solver accuracy.
 
+## GPU-Accelerated ODE Solving of Ensembles
+
+In many cases one is interested in solving the same ODE many times over many
+different initial conditions and parameters. In diffeqpy parlance this is called
+an ensemble solve. diffeqpy inherits the parallelism tools of the
+[SciML ecosystem](https://sciml.ai/) that are used for things like
+[automated equation discovery and acceleration](https://arxiv.org/abs/2001.04385).
+Here we will demonstrate using these parallel tools to accelerate the solving
+of an ensemble.
+
+First, let's define the JIT-accelerated Lorenz equation like before:
+
+```py
+from diffeqpy import de
+
+def f(u,p,t):
+    x, y, z = u
+    sigma, rho, beta = p
+    return [sigma * (y - x), x * (rho - z) - y, x * y - beta * z]
+
+u0 = [1.0,0.0,0.0]
+tspan = (0., 100.)
+p = [10.0,28.0,8/3]
+prob = de.ODEProblem(f, u0, tspan, p)
+fast_prob = de.jit32(prob)
+sol = de.solve(fast_prob,saveat=0.01)
+```
+
+Note that here we used `de.jit32` to JIT-compile the problem into a `Float32` form in order to make it more
+efficient on most GPUs.
+
+Now we use the `EnsembleProblem` as defined on the
+[ensemble parallelism page of the documentation](https://diffeq.sciml.ai/stable/features/ensemble/):
+Let's build an ensemble by utilizing uniform random numbers to randomize the
+initial conditions and parameters:
+
+```py
+import random
+def prob_func(prob,i,rep):
+  de.remake(prob,u0=[random.uniform(0, 1)*u0[i] for i in range(0,3)],
+            p=[random.uniform(0, 1)*p[i] for i in range(0,3)])
+
+ensembleprob = de.EnsembleProblem(fast_prob, safetycopy=False)
+```
+
+Now we solve the ensemble in serial:
+
+```py
+sol = de.solve(ensembleprob,de.Tsit5(),de.EnsembleSerial(),trajectories=10000,saveat=0.01)
+```
+
+To add GPUs to the mix, we need to bring in [DiffEqGPU](https://github.com/SciML/DiffEqGPU.jl).
+The `cuda` submodule will install CUDA for you and bring all of the bindings into the returned object:
+
+```py
+from diffeqpy import cuda
+```
+
+#### Note: `from diffeqpy import cuda` can take awhile to run the first time as it installs the drivers!
+
+Now we simply use `EnsembleGPUKernel(cuda.CUDABackend())` with a
+GPU-specialized ODE solver `cuda.GPUTsit5()` to solve 10,000 ODEs on the GPU in 
+parallel:
+
+```py
+sol = de.solve(ensembleprob,cuda.GPUTsit5(),cuda.EnsembleGPUKernel(CUDABackend()),trajectories=10000,saveat=0.01)
+```
+
+For the full list of choices for specialized GPU solvers, see 
+[the DiffEqGPU.jl documentation](https://docs.sciml.ai/DiffEqGPU/stable/manual/ensemblegpukernel/).
+
+Note that `EnsembleGPUArray` can be used as well, like:
+
+```py
+sol = de.solve(ensembleprob,de.Tsit5(),cuda.EnsembleGPUArray(CUDABackend()),trajectories=10000,saveat=0.01)
+```
+
+though we highly recommend the `EnsembleGPUKernel` methods for more speed. Given
+the way the JIT compilation performed will also ensure that the faster kernel
+generation methods work, `EnsembleGPUKernel` is almost certainly the
+better choice in most applications.
+
+### Benchmark
+
+To see how much of an effect the parallelism has, let's test this against R's
+deSolve package. This is exactly the same problem as the documentation example
+for deSolve, so let's copy that verbatim and then add a function to do the
+ensemble generation:
+
+```py
+import numpy as np
+from scipy.integrate import odeint
+ 
+def lorenz(state, t, sigma, beta, rho):
+    x, y, z = state
+     
+    dx = sigma * (y - x)
+    dy = x * (rho - z) - y
+    dz = x * y - beta * z
+     
+    return [dx, dy, dz]
+
+sigma = 10.0
+beta = 8.0 / 3.0
+rho = 28.0 
+p = (sigma, beta, rho)	
+y0 = [1.0, 1.0, 1.0]
+
+t = np.arange(0.0, 100.0, 0.01)
+result = odeint(lorenz, y0, t, p)
+```
+
+Using `lapply` to generate the ensemble we get:
+
+```py
+import timeit
+def time_func():
+    for itr in range(1, 1001):
+        result = odeint(lorenz, y0, t, p)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 38.08861699999761 seconds
+```
+
+Now let's see how the JIT-accelerated serial Julia version stacks up against that:
+
+```py
+def time_func():
+    sol = de.solve(ensembleprob,de.Tsit5(),de.EnsembleSerial(),trajectories=1000,saveat=0.01)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 3.1903300999983912
+```
+
+Julia is already about 12x faster than the pure Python solvers here! Now let's add
+GPU-acceleration to the mix:
+
+```py
+def time_func():
+    sol = de.solve(ensembleprob,cuda.GPUTsit5(),cuda.EnsembleGPUKernel(CUDABackend()),trajectories=1000,saveat=0.01)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 0.013322799997695256
+```
+
+Already 2900x faster than SciPy! But the GPU acceleration is made for massively
+parallel problems, so let's up the trajectories a bit. We will not use more
+trajectories from R because that would take too much computing power, so let's
+see what happens to the Julia serial and GPU at 10,000 trajectories:
+
+```py
+def time_func():
+    sol = de.solve(ensembleprob,de.Tsit5(),de.EnsembleSerial(),trajectories=10000,saveat=0.01)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 68.80795999999827
+```
+
+```py
+def time_func():
+    sol = de.solve(ensembleprob,cuda.GPUTsit5(),cuda.EnsembleGPUKernel(CUDABackend()),trajectories=10000,saveat=0.01)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 0.10774460000175168
+```
+
+To compare this to the pure Julia code:
+
+```julia
+using OrdinaryDiffEq, DiffEqGPU, CUDA, StaticArrays
+function lorenz(u, p, t)
+    σ = p[1]
+    ρ = p[2]
+    β = p[3]
+    du1 = σ * (u[2] - u[1])
+    du2 = u[1] * (ρ - u[3]) - u[2]
+    du3 = u[1] * u[2] - β * u[3]
+    return SVector{3}(du1, du2, du3)
+end
+
+u0 = SA[1.0f0; 0.0f0; 0.0f0]
+tspan = (0.0f0, 10.0f0)
+p = SA[10.0f0, 28.0f0, 8 / 3.0f0]
+prob = ODEProblem{false}(lorenz, u0, tspan, p)
+prob_func = (prob, i, repeat) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
+monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
+@time sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
+    trajectories = 10_000,
+    saveat = 0.01);
+
+# 0.014481 seconds (257.64 k allocations: 13.130 MiB)
+```
+
+which is about an order of magnitude faster for computing 10,000 trajectories,
+note that the major factors are that we cannot define 32-bit floating point values
+from Python and the `prob_func` for generating the initial conditions and parameters
+is a major bottleneck since this function is written in Python.
+
+To see how this scales in Julia, let's take it to insane heights. First, let's
+reduce the amount we're saving:
+
+```julia
+@time sol = solve(monteprob,GPUTsit5(),EnsembleGPUKernel(CUDA.CUDABackend()),trajectories=10_000,saveat=1.0f0)
+0.015040 seconds (257.64 k allocations: 13.130 MiB)
+```
+
+This highlights that controlling memory pressure is key with GPU usage: you will
+get much better performance when requiring less saved points on the GPU.
+
+```julia
+@time sol = solve(monteprob,GPUTsit5(),EnsembleGPUKernel(CUDA.CUDABackend()),trajectories=100_000,saveat=1.0f0)
+# 0.150901 seconds (2.60 M allocations: 131.576 MiB)
+```
+
+compared to serial:
+
+```julia
+@time sol = solve(monteprob,Tsit5(),EnsembleSerial(),trajectories=100_000,saveat=1.0f0)
+# 22.136743 seconds (16.40 M allocations: 1.628 GiB, 42.98% gc time)
+```
+
+And now we start to see that scaling power! Let's solve 1 million trajectories:
+
+```julia
+@time sol = solve(monteprob,GPUTsit5(),EnsembleGPUKernel(CUDA.CUDABackend()),trajectories=1_000_000,saveat=1.0f0)
+# 1.031295 seconds (3.40 M allocations: 241.075 MiB)
+```
+
+For reference, let's look at deSolve with the change to only save that much:
+
+```py
+t = np.arange(0.0, 100.0, 1.0)
+def time_func():
+    for itr in range(1, 1001):
+        result = odeint(lorenz, y0, t, p)
+
+timeit.Timer(time_func).timeit(number=1)
+
+# 37.42609280000033
+```
+
+The GPU version is solving 1000x as many trajectories, 37x as fast! So conclusion,
+if you need the most speed, you may want to move to the Julia version to get the
+most out of your GPU due to Float32's, and when using GPUs make sure it's a problem
+with a relatively average or low memory pressure, and these methods will give
+orders of magnitude acceleration compared to what you might be used to.
+
+## GPU Backend Choices
+
+Just like DiffEqGPU.jl, diffeqpy supports many different GPU venders. `from diffeqpy import cuda`
+is just for cuda, but the following others are supported:
+
+- `from diffeqpy import cuda` with `cuda.CUDABackend` for NVIDIA GPUs via CUDA
+- `from diffeqpy import amdgpu` with `amdgpu.AMDGPUBackend` for AMD GPUs
+- `from diffeqpy import oneapi` with `oneapi.oneAPIBackend` for Intel's oneAPI GPUs
+- `from diffeqpy import metal` with `metal.MetalBackend` for Apple's Metal GPUs (on M-series processors)
+
+For more information, see [the DiffEqGPU.jl documentation](https://docs.sciml.ai/DiffEqGPU/stable/manual/backends/).
+
 ## Known Limitations
 
 - Autodiff does not work on Python functions. When applicable, either define the derivative function
diff --git a/diffeqpy/amdgpu.py b/diffeqpy/amdgpu.py
new file mode 100644
index 0000000..8c97cb9
--- /dev/null
+++ b/diffeqpy/amdgpu.py
@@ -0,0 +1,6 @@
+import sys
+from . import load_julia_packages
+amdgpu, _ = load_julia_packages("DiffEqGPU, AMDGPU")
+from juliacall import Main
+de.AMDGPUBackend = Main.seval("AMDGPU.AMDGPUBackend") # kinda hacky
+sys.modules[__name__] = amdgpu # mutate myself
diff --git a/diffeqpy/cuda.py b/diffeqpy/cuda.py
new file mode 100644
index 0000000..c10ac81
--- /dev/null
+++ b/diffeqpy/cuda.py
@@ -0,0 +1,6 @@
+import sys
+from . import load_julia_packages
+cuda, _ = load_julia_packages("DiffEqGPU, CUDA")
+from juliacall import Main
+de.CUDABackend = Main.seval("CUDA.CUDABackend") # kinda hacky
+sys.modules[__name__] = cuda # mutate myself
diff --git a/diffeqpy/de.py b/diffeqpy/de.py
index 0e3376a..b049fa8 100644
--- a/diffeqpy/de.py
+++ b/diffeqpy/de.py
@@ -3,4 +3,5 @@
 de, _ = load_julia_packages("DifferentialEquations, ModelingToolkit")
 from juliacall import Main
 de.jit = Main.seval("jit(x) = typeof(x).name.wrapper(ModelingToolkit.modelingtoolkitize(x), x.u0, x.tspan, x.p)") # kinda hackey
+de.jit32 = Main.seval("jit(x) = typeof(x).name.wrapper(ModelingToolkit.modelingtoolkitize(x), Float32.(x.u0), Float32.(x.tspan), Float32.(x.p))") # kinda hackey
 sys.modules[__name__] = de # mutate myself
diff --git a/diffeqpy/metal.py b/diffeqpy/metal.py
new file mode 100644
index 0000000..9c61925
--- /dev/null
+++ b/diffeqpy/metal.py
@@ -0,0 +1,6 @@
+import sys
+from . import load_julia_packages
+metal, _ = load_julia_packages("DiffEqGPU, Metal")
+from juliacall import Main
+de.MetalBackend = Main.seval("Metal.MetalBackend") # kinda hacky
+sys.modules[__name__] = metal # mutate myself
\ No newline at end of file
diff --git a/diffeqpy/oneapi.py b/diffeqpy/oneapi.py
new file mode 100644
index 0000000..d79283e
--- /dev/null
+++ b/diffeqpy/oneapi.py
@@ -0,0 +1,6 @@
+import sys
+from . import load_julia_packages
+oneapi, _ = load_julia_packages("DiffEqGPU, oneAPI")
+from juliacall import Main
+de.oneAPIBackend = Main.seval("oneAPI.oneAPIBackend") # kinda hacky
+sys.modules[__name__] = oneapi # mutate myself
diff --git a/setup.py b/setup.py
index 74d0d7b..a148f22 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ def readme():
         return f.read()
 
 setup(name='diffeqpy',
-      version='2.1.0',
+      version='2.2.0',
       description='Solving Differential Equations in Python',
       long_description=readme(),
       long_description_content_type="text/markdown",
@@ -19,7 +19,7 @@ def readme():
         'Topic :: Scientific/Engineering :: Physics'
       ],
       url='http://github.com/SciML/diffeqpy',
-      keywords='differential equations stochastic ordinary delay differential-algebraic dae ode sde dde',
+      keywords='differential equations stochastic ordinary delay differential-algebraic dae ode sde dde oneapi AMD ROCm CUDA Metal GPU',
       author='Chris Rackauckas and Takafumi Arakaki',
       author_email='contact@juliadiffeq.org',
       license='MIT',