diff --git a/Project.toml b/Project.toml
index 08a6e6de6..140fb7cf3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Lux"
 uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
 authors = ["Avik Pal and contributors"]
-version = "1.2.2"
+version = "1.2.3"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -95,7 +95,7 @@ MacroTools = "0.5.13"
 Markdown = "1.10"
 NCCL = "0.1.1"
 NNlib = "0.9.24"
-Optimisers = "0.3.3"
+Optimisers = "0.3.3, 0.4"
 Preferences = "1.4.3"
 Random = "1.10"
 Reactant = "0.2.4"
diff --git a/docs/Project.toml b/docs/Project.toml
index a48d7c818..7ef3bb514 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -50,7 +50,7 @@ LuxCore = "1"
 LuxLib = "1.3.4"
 LuxTestUtils = "1.5"
 MLDataDevices = "1.4"
-Optimisers = "0.3.3"
+Optimisers = "0.3.3, 0.4"
 Pkg = "1.10"
 Printf = "1.10"
 Random = "1.10"
diff --git a/examples/Basics/Project.toml b/examples/Basics/Project.toml
index f24bb08ca..4e44d404c 100644
--- a/examples/Basics/Project.toml
+++ b/examples/Basics/Project.toml
@@ -13,5 +13,5 @@ ComponentArrays = "0.15"
 ForwardDiff = "0.10"
 Lux = "1"
 LuxCUDA = "0.3"
-Optimisers = "0.3"
+Optimisers = "0.3.3, 0.4"
 Zygote = "0.6"
diff --git a/examples/ConvMixer/Project.toml b/examples/ConvMixer/Project.toml
index acfbd4758..d1ffac2cd 100644
--- a/examples/ConvMixer/Project.toml
+++ b/examples/ConvMixer/Project.toml
@@ -31,7 +31,7 @@ LuxCUDA = "0.3.2"
 MLDatasets = "0.7.14"
 MLUtils = "0.4.4"
 OneHotArrays = "0.2.5"
-Optimisers = "0.3.3"
+Optimisers = "0.4"
 PreferenceTools = "0.1.2"
 Printf = "1.10"
 ProgressBars = "1.5.1"
diff --git a/examples/ConvMixer/README.md b/examples/ConvMixer/README.md
index f16d8850d..f072c1074 100644
--- a/examples/ConvMixer/README.md
+++ b/examples/ConvMixer/README.md
@@ -17,7 +17,7 @@ julia --startup-file=no \
     --threads=auto \
     main.jl \
     --lr-max=0.05 \
-    --weight-decay=0.000005
+    --weight-decay=0.0001
 ```
 
 Here's an example of the output of the above command (on a V100 32GB GPU):
@@ -76,11 +76,7 @@ Flags
 
 ## Notes
 
-  1. Weight-Decay with Adam in Optimisers.jl works differently from `torch.optim.AdamW`,
-     so you might need to adjust the value of `--weight-decay` to get the same results.
-     Pytorch multiplies the weight decay with the learning rate, whereas in Optimisers.jl
-     the learning rate is decoupled from the weight decay.
-  2. To match the results from the original repo, we need more augmentation strategies, that
+  1. To match the results from the original repo, we need more augmentation strategies that
     are currently not implemented in DataAugmentation.jl.
-  3. Don't compare the reported timings in that repo against the numbers here. They time the
+  2. Don't compare the reported timings in that repo against the numbers here. They time the
     entire loop. We only time the training part of the loop.
diff --git a/examples/ConvMixer/main.jl b/examples/ConvMixer/main.jl
index 56ca4115f..03ddc63a5 100644
--- a/examples/ConvMixer/main.jl
+++ b/examples/ConvMixer/main.jl
@@ -22,17 +22,17 @@ function get_dataloaders(batchsize)
     cifar10_std = (0.2471, 0.2435, 0.2616)
 
     train_transform = RandomResizeCrop((32, 32)) |>
-                      Maybe(FlipX()) |>
+                      Maybe(FlipX{2}()) |>
                       ImageToTensor() |>
                       Normalize(cifar10_mean, cifar10_std)
 
     test_transform = ImageToTensor() |> Normalize(cifar10_mean, cifar10_std)
 
     trainset = TensorDataset(CIFAR10(:train), train_transform)
-    trainloader = DataLoader(trainset; batchsize, shuffle=true, buffer=true, parallel=true)
+    trainloader = DataLoader(trainset; batchsize, shuffle=true, parallel=true)
 
     testset = TensorDataset(CIFAR10(:test), test_transform)
-    testloader = DataLoader(testset; batchsize, shuffle=false, buffer=true, parallel=true)
+    testloader = DataLoader(testset; batchsize, shuffle=false, parallel=true)
 
     return trainloader, testloader
 end
diff --git a/examples/DDIM/Project.toml b/examples/DDIM/Project.toml
index 2f76e047c..4608d02b2 100644
--- a/examples/DDIM/Project.toml
+++ b/examples/DDIM/Project.toml
@@ -36,7 +36,7 @@ JLD2 = "0.4.48, 0.5"
 Lux = "1"
 LuxCUDA = "0.3"
 MLUtils = "0.4"
-Optimisers = " 0.3"
+Optimisers = "0.3, 0.4"
 ParameterSchedulers = "0.4.1"
 ProgressBars = "1"
 Random = "1.10"
diff --git a/examples/HyperNet/Project.toml b/examples/HyperNet/Project.toml
index 9213cd35f..da572377e 100644
--- a/examples/HyperNet/Project.toml
+++ b/examples/HyperNet/Project.toml
@@ -20,8 +20,8 @@ Lux = "1"
 LuxCUDA = "0.3"
 MLDatasets = "0.7"
 MLUtils = "0.4"
-OneHotArrays = "0.2"
-Optimisers = "0.3"
+OneHotArrays = "0.2.5"
+Optimisers = "0.3.3, 0.4"
 Setfield = "1"
 Statistics = "1"
 Zygote = "0.6"
diff --git a/examples/ImageNet/Project.toml b/examples/ImageNet/Project.toml
index 7abae248c..792a1341c 100644
--- a/examples/ImageNet/Project.toml
+++ b/examples/ImageNet/Project.toml
@@ -38,7 +38,7 @@ MLUtils = "0.4.4"
 MPI = "0.20.21"
 NCCL = "0.1.1"
 OneHotArrays = "0.2.5"
-Optimisers = "0.3.3"
+Optimisers = "0.3.3, 0.4"
 ParameterSchedulers = "0.4.2"
 Random = "1.10"
 Setfield = "1.1.1"
diff --git a/examples/NeuralODE/Project.toml b/examples/NeuralODE/Project.toml
index 69f354cfb..e9aa48aa6 100644
--- a/examples/NeuralODE/Project.toml
+++ b/examples/NeuralODE/Project.toml
@@ -20,8 +20,8 @@ Lux = "1"
 LuxCUDA = "0.3"
 MLDatasets = "0.7"
 MLUtils = "0.4"
-OneHotArrays = "0.2"
-Optimisers = "0.3"
+OneHotArrays = "0.2.5"
+Optimisers = "0.3.3, 0.4"
 OrdinaryDiffEqTsit5 = "1"
 SciMLSensitivity = "7.63"
 Statistics = "1"
diff --git a/examples/PINN2DPDE/Project.toml b/examples/PINN2DPDE/Project.toml
index 7b1a5787a..03e427a64 100644
--- a/examples/PINN2DPDE/Project.toml
+++ b/examples/PINN2DPDE/Project.toml
@@ -18,7 +18,7 @@ Lux = "1"
 LuxCUDA = "0.3.3"
 MLUtils = "0.4.4"
 OnlineStats = "1.7.1"
-Optimisers = "0.3.3"
+Optimisers = "0.3.3, 0.4"
 Printf = "1.10"
 Random = "1.10"
 Statistics = "1.10"
diff --git a/examples/PolynomialFitting/Project.toml b/examples/PolynomialFitting/Project.toml
index b607ee600..5cf0394ef 100644
--- a/examples/PolynomialFitting/Project.toml
+++ b/examples/PolynomialFitting/Project.toml
@@ -14,6 +14,6 @@ ADTypes = "1"
 CairoMakie = "0.12"
 Lux = "1"
 LuxCUDA = "0.3"
-Optimisers = "0.3"
+Optimisers = "0.3.3, 0.4"
 Statistics = "1"
 Zygote = "0.6"
diff --git a/examples/SimpleChains/Project.toml b/examples/SimpleChains/Project.toml
index 8d504559f..33304a7dc 100644
--- a/examples/SimpleChains/Project.toml
+++ b/examples/SimpleChains/Project.toml
@@ -16,7 +16,7 @@ Lux = "1"
 MLDatasets = "0.7.14"
MLUtils = "0.4" OneHotArrays = "0.2.5" -Optimisers = "0.3.2" +Optimisers = "0.3.3, 0.4" Random = "1" SimpleChains = "0.4.6" Zygote = "0.6.69" diff --git a/examples/SimpleRNN/Project.toml b/examples/SimpleRNN/Project.toml index 2bd4f5864..02dff511c 100644 --- a/examples/SimpleRNN/Project.toml +++ b/examples/SimpleRNN/Project.toml @@ -16,6 +16,6 @@ JLD2 = "0.5" Lux = "1" LuxCUDA = "0.3" MLUtils = "0.4" -Optimisers = "0.3" +Optimisers = "0.3.3, 0.4" Statistics = "1" Zygote = "0.6" diff --git a/lib/LuxCore/test/Project.toml b/lib/LuxCore/test/Project.toml index 6d3c3d7f7..1088992ba 100644 --- a/lib/LuxCore/test/Project.toml +++ b/lib/LuxCore/test/Project.toml @@ -15,6 +15,6 @@ EnzymeCore = "0.8.5" ExplicitImports = "1.9.0" Functors = "0.4.12" MLDataDevices = "1.0.0" -Optimisers = "0.3.3" +Optimisers = "0.3.3, 0.4" Random = "1.10" Test = "1.10" diff --git a/lib/MLDataDevices/Project.toml b/lib/MLDataDevices/Project.toml index 96bc0fd0c..4d4f67433 100644 --- a/lib/MLDataDevices/Project.toml +++ b/lib/MLDataDevices/Project.toml @@ -1,7 +1,7 @@ name = "MLDataDevices" uuid = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" authors = ["Avik Pal and contributors"] -version = "1.5.0" +version = "1.5.1" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -19,6 +19,7 @@ FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" Metal = "dde4c033-4e86-420c-a63e-0dd931031962" +OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" Reactant = "3c362404-f566-11ee-1572-e11a4b42c853" RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" @@ -36,6 +37,7 @@ MLDataDevicesFillArraysExt = "FillArrays" MLDataDevicesGPUArraysExt = "GPUArrays" MLDataDevicesMLUtilsExt = "MLUtils" MLDataDevicesMetalExt = ["GPUArrays", "Metal"] +MLDataDevicesOneHotArraysExt = "OneHotArrays" MLDataDevicesReactantExt = "Reactant" MLDataDevicesRecursiveArrayToolsExt = "RecursiveArrayTools" MLDataDevicesReverseDiffExt = "ReverseDiff" @@ -57,6 +59,7 @@ Functors = "0.4.8" GPUArrays = "10, 11" MLUtils = "0.4.4" Metal = "1" +OneHotArrays = "0.2.5" Preferences = "1.4" Random = "1.10" Reactant = "0.2.4" diff --git a/lib/MLDataDevices/ext/MLDataDevicesOneHotArraysExt.jl b/lib/MLDataDevices/ext/MLDataDevicesOneHotArraysExt.jl new file mode 100644 index 000000000..ceb6d6bde --- /dev/null +++ b/lib/MLDataDevices/ext/MLDataDevicesOneHotArraysExt.jl @@ -0,0 +1,17 @@ +module MLDataDevicesOneHotArraysExt + +using Adapt: Adapt +using MLDataDevices: MLDataDevices, Internal, ReactantDevice, CPUDevice +using OneHotArrays: OneHotArray + +for op in (:get_device, :get_device_type) + @eval Internal.$(op)(x::OneHotArray) = Internal.$(op)(x.indices) +end + +# Reactant doesn't pay very nicely with OneHotArrays at the moment +function Adapt.adapt_structure(dev::ReactantDevice, x::OneHotArray) + x_cpu = Adapt.adapt_structure(CPUDevice(), x) + return Adapt.adapt_storage(dev, convert(Array, x_cpu)) +end + +end diff --git a/lib/MLDataDevices/test/Project.toml b/lib/MLDataDevices/test/Project.toml index 9914e0f57..1fb732d37 100644 --- a/lib/MLDataDevices/test/Project.toml +++ b/lib/MLDataDevices/test/Project.toml @@ -9,6 +9,7 @@ FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" +OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" @@ -30,6 +31,7 @@ FillArrays = "1" ForwardDiff = "0.10.36" Functors = "0.4.8" MLUtils = "0.4" +OneHotArrays = "0.2.5" Pkg = "1.10" Random = "1.10" RecursiveArrayTools = "3.8" diff --git a/lib/MLDataDevices/test/misc_tests.jl b/lib/MLDataDevices/test/misc_tests.jl index d9b3f8bd4..42d27cf00 100644 --- a/lib/MLDataDevices/test/misc_tests.jl +++ b/lib/MLDataDevices/test/misc_tests.jl @@ -5,6 +5,8 @@ using ReverseDiff, Tracker, ForwardDiff using SparseArrays, FillArrays, Zygote, RecursiveArrayTools using Functors: Functors +const BACKEND_GROUP = lowercase(get(ENV, "BACKEND_GROUP", "none")) + @testset "Issues Patches" begin @testset "#10 patch" begin dev = CPUDevice() @@ -231,3 +233,23 @@ end g = Zygote.gradient(x -> cpu(gpu(x) * gpu(x))[1,2], Float32[1 2 3; 4 5 6; 7 8 9])[1] @test g isa Matrix{Float32} end + +@testset "OneHotArrays" begin + using OneHotArrays + + x = onehotbatch("abracadabra", 'a':'e', 'e') + @test get_device(x) isa CPUDevice + + gdev = gpu_device() + x_g = gdev(x) + @test get_device(x_g) isa parameterless_type(typeof(gdev)) + + if BACKEND_GROUP == "none" || BACKEND_GROUP == "reactant" + using Reactant + + rdev = reactant_device() + x_rd = rdev(x) + @test get_device(x_rd) isa ReactantDevice + @test x_rd isa Reactant.ConcreteRArray{Bool, 2} + end +end diff --git a/test/Project.toml b/test/Project.toml index 90ee23de9..1a2b73f0c 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -65,7 +65,7 @@ MLUtils = "0.4.3" NNlib = "0.9.24" Octavian = "0.3.28" OneHotArrays = "0.2.5" -Optimisers = "0.3.3" +Optimisers = "0.3.3, 0.4" Pkg = "1.10" Preferences = "1.4.3" Random = "1.10"