diff --git a/Project.toml b/Project.toml
index d29553f72..f242ed495 100644
--- a/Project.toml
+++ b/Project.toml
@@ -33,7 +33,6 @@ Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 WeightInitializers = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d"

 [weakdeps]
@@ -88,8 +87,8 @@ GPUArraysCore = "0.1.6, 0.2"
 LinearAlgebra = "1.10"
 LossFunctions = "0.11.1"
 LuxCore = "1"
-LuxLib = "1.3"
-MLDataDevices = "1.2"
+LuxLib = "1.3.4"
+MLDataDevices = "1.3"
 MLUtils = "0.4.4"
 MPI = "0.20.19"
 MacroTools = "0.5.13"
@@ -109,7 +108,6 @@ Static = "1.1.1"
 StaticArraysCore = "1.4.3"
 Statistics = "1.10"
 Tracker = "0.2.34"
-VectorizationBase = "0.21.70"
 WeightInitializers = "1"
 Zygote = "0.6.70"
 julia = "1.10"
diff --git a/docs/Project.toml b/docs/Project.toml
index 832aa2429..9f3c4beab 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -43,9 +43,9 @@ Literate = "2.18.0"
 Lux = "1"
 LuxCUDA = "0.3.2"
 LuxCore = "1"
-LuxLib = "1"
+LuxLib = "1.3.4"
 LuxTestUtils = "1.2"
-MLDataDevices = "1.2"
+MLDataDevices = "1.3"
 Optimisers = "0.3.3"
 Pkg = "1.10"
 Printf = "1.10"
diff --git a/docs/src/manual/performance_pitfalls.md b/docs/src/manual/performance_pitfalls.md
index 45ccf87af..68683efdc 100644
--- a/docs/src/manual/performance_pitfalls.md
+++ b/docs/src/manual/performance_pitfalls.md
@@ -83,6 +83,18 @@ Prefer to use deep learning primitives and their fused variants from `LuxLib.jl`
 5. Replace uses of `σ.(x .+ b)` with [`LuxLib.bias_activation`](@ref) or
    [`LuxLib.bias_activation!!`](@ref) (the latter one is often faster).

+## Optional Dependencies for Performance
+
+For faster performance on CPUs, load the following packages:
+
+1. `LoopVectorization.jl`
+2. `Octavian.jl`
+
+If these packages are available, we automatically use optimized versions of the layers.
+However, there are cases where this might cause issues (see
+[#980](https://github.com/LuxDL/Lux.jl/issues/980) and
+[disabling loop vectorization](@ref disable_loop_vectorization)).
+
 ## Data Loading and Device Transfer

 A common pattern for loading data and transferring data to GPUs looks like this:
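As a rough sketch of what the new "Optional Dependencies for Performance" section describes (the model and input sizes below are made up for illustration; the only requirement is that the optional packages are loaded in the same Julia session as Lux):

```julia
using LoopVectorization, Octavian  # optional CPU-performance dependencies
using Lux, Random

# With the packages above loaded, LuxLib automatically switches to its
# accelerated CPU code paths; user-facing code does not change.
model = Chain(Dense(32 => 64, relu), Dense(64 => 4))
ps, st = Lux.setup(Random.default_rng(), model)
y, _ = model(randn(Float32, 32, 16), ps, st)
```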
diff --git a/docs/src/manual/preferences.md b/docs/src/manual/preferences.md
index 357c77acb..b1385c3e9 100644
--- a/docs/src/manual/preferences.md
+++ b/docs/src/manual/preferences.md
@@ -13,6 +13,8 @@

 ```julia
 pkg> preference add Lux <preference-name>=<value>
+pkg> preference add LuxLib <preference-name>=<value>
+pkg> preference add LuxCore <preference-name>=<value>
 ```

 Lux.jl relies on several preferences to make decision on how to run your code. Here is an
@@ -57,3 +59,12 @@ By default, both of these preferences are set to `false`.
 - Setting the `LuxLib` preference sets the check at the level of functional layer of
   Lux, for example, [`fused_dense_bias_activation`](@ref). These functions are supposed
   to be type stable for common input types and can be used to guarantee type stability.
+
+## [Disabling Loop Vectorization / Octavian](@id disable_loop_vectorization)
+
+`LoopVectorization.jl` and `Octavian.jl` are optional dependencies that are used to
+accelerate certain CPU operations. However, these packages are tightly coupled with Julia
+internals and might not work across all Julia versions and systems. If these packages are
+loaded, LuxLib automatically uses the optimized versions of the functions, but it might be
+desirable to disable them and fall back to the default implementations instead. This can be
+done by setting the `disable_loop_vectorization` preference to `true` for `LuxLib`.
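For the `disable_loop_vectorization` preference introduced above, a minimal sketch of setting it directly through Preferences.jl (LuxLib's UUID is taken from `test/Project.toml` later in this diff; the preference only takes effect after restarting Julia):

```julia
using Preferences, UUIDs

# Writes `disable_loop_vectorization = true` under the `[LuxLib]` table of the
# active project's LocalPreferences.toml.
luxlib = UUID("82251201-b29d-42c6-8e01-566dec8acb11")
set_preferences!(luxlib, "disable_loop_vectorization" => true; force=true)
```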
diff --git a/examples/Basics/main.jl b/examples/Basics/main.jl
index 2ce2fd9df..6bd6f21a8 100644
--- a/examples/Basics/main.jl
+++ b/examples/Basics/main.jl
@@ -95,7 +95,8 @@ W = randn(5, 10)
 x = rand(10)

 W * x

-# Julia's arrays are very powerful, and you can learn more about what they can do [here](https://docs.julialang.org/en/v1/manual/arrays/).
+# Julia's arrays are very powerful, and you can learn more about what they can do
+# [here](https://docs.julialang.org/en/v1/manual/arrays/).

 # ### CUDA Arrays
@@ -206,18 +207,20 @@ println("Computed Gradient via Forward Mode AD (ForwardDiff): ", ForwardDiff.gra
 # ### Jacobian-Vector Product

 # I will defer the discussion on forward-mode AD to
-# [https://book.sciml.ai/notes/08-Forward-Mode_Automatic_Differentiation_(AD)_via_High_Dimensional_Algebras/](https://book.sciml.ai/notes/08-Forward-Mode_Automatic_Differentiation_(AD)_via_High_Dimensional_Algebras/). Here let us just look
-# at a mini example on how to use it.
+# [https://book.sciml.ai/notes/08-Forward-Mode_Automatic_Differentiation_(AD)_via_High_Dimensional_Algebras/](https://book.sciml.ai/notes/08-Forward-Mode_Automatic_Differentiation_(AD)_via_High_Dimensional_Algebras/).
+# Here let us just look at a mini example on how to use it.

 f(x) = x .* x ./ 2
 x = randn(rng, Float32, 5)
 v = ones(Float32, 5)

 # Construct the pushforward function. We will write out the function here but in
-# practice we recommend using [SparseDiffTools.auto_jacvec](https://docs.sciml.ai/SparseDiffTools/stable/#Jacobian-Vector-and-Hessian-Vector-Products)!
+# practice we recommend using
+# [SparseDiffTools.auto_jacvec](https://docs.sciml.ai/SparseDiffTools/stable/#Jacobian-Vector-and-Hessian-Vector-Products)!

 # First we need to create a Tag for ForwardDiff. It is enough to know that this is something
-# that you must do. For more details, see the [ForwardDiff Documentation](https://juliadiff.org/ForwardDiff.jl/dev/user/advanced/#Custom-tags-and-tag-checking)!
+# that you must do. For more details, see the
+# [ForwardDiff Documentation](https://juliadiff.org/ForwardDiff.jl/dev/user/advanced/#Custom-tags-and-tag-checking)!
 struct TestTag end

 # Going in the details of what is function is doing is beyond the scope of this tutorial.
diff --git a/examples/BayesianNN/main.jl b/examples/BayesianNN/main.jl
index aa850d2ed..3bc500dae 100644
--- a/examples/BayesianNN/main.jl
+++ b/examples/BayesianNN/main.jl
@@ -20,7 +20,9 @@ Turing.setprogress!(true);

 # ## Generating data

-# Our goal here is to use a Bayesian neural network to classify points in an artificial dataset. The code below generates data points arranged in a box-like pattern and displays a graph of the dataset we'll be working with.
+# Our goal here is to use a Bayesian neural network to classify points in an artificial
+# dataset. The code below generates data points arranged in a box-like pattern and displays
+# a graph of the dataset we'll be working with.

 ## Number of points to generate
 N = 80
diff --git a/examples/HyperNet/main.jl b/examples/HyperNet/main.jl
index df0f6b93f..33163e9fe 100644
--- a/examples/HyperNet/main.jl
+++ b/examples/HyperNet/main.jl
@@ -81,10 +81,10 @@ function train()
     rng = Xoshiro(0)
     ps, st = Lux.setup(rng, model) |> dev

-    train_state = Training.TrainState(model, ps, st, Adam(3.0f-4))
+    train_state = Training.TrainState(model, ps, st, Adam(0.001f0))

     ### Lets train the model
-    nepochs = 25
+    nepochs = 50
     for epoch in 1:nepochs, data_idx in 1:2
         train_dataloader, test_dataloader = dataloaders[data_idx] .|> dev

@@ -106,8 +106,8 @@ function train()
         data_name = data_idx == 1 ? "MNIST" : "FashionMNIST"

-        @printf "[%3d/%3d] \t %12s \t Time %.5fs \t Training Accuracy: %.2f%% \t Test \
-            Accuracy: %.2f%%\n" epoch nepochs data_name ttime train_acc test_acc
+        @printf "[%3d/%3d]\t%12s\tTime %3.5fs\tTraining Accuracy: %3.2f%%\tTest \
+            Accuracy: %3.2f%%\n" epoch nepochs data_name ttime train_acc test_acc
     end

     println()

@@ -126,13 +126,13 @@ function train()
         data_name = data_idx == 1 ? "MNIST" : "FashionMNIST"

-        @printf "[FINAL] \t %12s \t Training Accuracy: %.2f%% \t Test Accuracy: \
-            %.2f%%\n" data_name train_acc test_acc
+        @printf "[FINAL]\t%12s\tTraining Accuracy: %3.2f%%\tTest Accuracy: \
+            %3.2f%%\n" data_name train_acc test_acc
         test_acc_list[data_idx] = test_acc
     end
     return test_acc_list
 end

 test_acc_list = train()
-@assert test_acc_list[1] > 0.90 && test_acc_list[2] > 0.70 #hide
+@assert test_acc_list[1] > 60 && test_acc_list[2] > 60 #hide

 nothing #hide
diff --git a/examples/ImageNet/Project.toml b/examples/ImageNet/Project.toml
index bcf96be5e..b474e9e26 100644
--- a/examples/ImageNet/Project.toml
+++ b/examples/ImageNet/Project.toml
@@ -33,7 +33,7 @@ ImageMagick = "1"
 JLD2 = "0.5.1"
 Lux = "1"
 LuxCUDA = "0.3.3"
-MLDataDevices = "1"
+MLDataDevices = "1.3"
 MLUtils = "0.4.4"
 MPI = "0.20.21"
 NCCL = "0.1.1"
diff --git a/examples/NeuralODE/Project.toml b/examples/NeuralODE/Project.toml
index 6ee61e610..cc869ad93 100644
--- a/examples/NeuralODE/Project.toml
+++ b/examples/NeuralODE/Project.toml
@@ -7,7 +7,7 @@ MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
-OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
+OrdinaryDiffEqTsit5 = "b1df2697-797e-41e3-8120-5422d3b24e4a"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1"
@@ -17,12 +17,12 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 [compat]
 ComponentArrays = "0.15"
 Lux = "1"
-LuxCUDA = "0.2, 0.3"
-MLDatasets = "0.5, 0.7"
-MLUtils = "0.2, 0.3, 0.4"
-OneHotArrays = "0.1, 0.2"
-Optimisers = "0.2, 0.3"
-OrdinaryDiffEq = "6"
+LuxCUDA = "0.3"
+MLDatasets = "0.7"
+MLUtils = "0.4"
+OneHotArrays = "0.2"
+Optimisers = "0.3"
+OrdinaryDiffEqTsit5 = "1"
 SciMLSensitivity = "7.63"
 Statistics = "1"
 Zygote = "0.6"
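The example now depends on the `OrdinaryDiffEqTsit5` sub-package rather than the full `OrdinaryDiffEq`, since only the `Tsit5` solver is used; loading the sub-package should be enough for `solve` to dispatch. A standalone sketch with a made-up toy ODE:

```julia
using OrdinaryDiffEqTsit5           # ships only the Tsit5 solver
using SciMLBase: ODEProblem, solve  # problem type and the common solve interface

# Exponential decay du/dt = -u on t ∈ (0, 1), solved with Tsit5.
prob = ODEProblem((u, p, t) -> -u, [1.0f0], (0.0f0, 1.0f0))
sol = solve(prob, Tsit5(); save_everystep=false)
```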
diff --git a/examples/NeuralODE/main.jl b/examples/NeuralODE/main.jl
index ae63407c2..9640b6a2c 100644
--- a/examples/NeuralODE/main.jl
+++ b/examples/NeuralODE/main.jl
@@ -7,8 +7,8 @@

 # ## Package Imports

-using Lux, ComponentArrays, SciMLSensitivity, LuxCUDA, Optimisers, OrdinaryDiffEq, Random,
-    Statistics, Zygote, OneHotArrays, InteractiveUtils, Printf
+using Lux, ComponentArrays, SciMLSensitivity, LuxCUDA, Optimisers, OrdinaryDiffEqTsit5,
+    Random, Statistics, Zygote, OneHotArrays, InteractiveUtils, Printf
 using MLDatasets: MNIST
 using MLUtils: DataLoader, splitobs

@@ -139,9 +139,9 @@ function train(model_function; cpu::Bool=false, kwargs...)
         end
         ttime = time() - stime

-        tr_acc = accuracy(model, tstate.parameters, tstate.states, train_dataloader)
-        te_acc = accuracy(model, tstate.parameters, tstate.states, test_dataloader)
-        @printf "[%d/%d] \t Time %.2fs \t Training Accuracy: %.5f%% \t Test \
+        tr_acc = accuracy(model, tstate.parameters, tstate.states, train_dataloader) * 100
+        te_acc = accuracy(model, tstate.parameters, tstate.states, test_dataloader) * 100
+        @printf "[%d/%d]\tTime %.4fs\tTraining Accuracy: %.5f%%\tTest \
             Accuracy: %.5f%%\n" epoch nepochs ttime tr_acc te_acc
     end
 end
diff --git a/src/helpers/size_propagator.jl b/src/helpers/size_propagator.jl
index 954065e53..f6e37b4c1 100644
--- a/src/helpers/size_propagator.jl
+++ b/src/helpers/size_propagator.jl
@@ -9,7 +9,6 @@ using Static: Static, StaticBool
 # We need these to avoid ambiguities
 using SIMDTypes: SIMDTypes
 using StaticArraysCore: StaticArraysCore
-using VectorizationBase: VectorizationBase

 const VecT = Union{Bool, Float16, Float32, Float64, Int16, Int32, Int64, Int8, UInt16,
     UInt32, UInt64, UInt8, SIMDTypes.Bit}
@@ -43,17 +42,6 @@ function Base.convert(::Type{ForwardDiff.Dual{T, V, Tag}}, ::Nil) where {T, V, T
     throw(ArgumentError(NIL_DUAL_ERROR_MSG))
 end

-const NIL_VEC_ERROR_MSG = "`Nil` is incompatible with `VectorizationBase` numbers."
-
-VectorizationBase.Vec{W, T}(::Nil) where {T, W} = throw(ArgumentError(NIL_VEC_ERROR_MSG))
-function VectorizationBase.VecUnroll{
-    N, W, T, V}(::Nil) where {T, W, V <: VectorizationBase.AbstractSIMDVector{W, T}, N}
-    throw(ArgumentError(NIL_VEC_ERROR_MSG))
-end
-function VectorizationBase.VecUnroll{N, 1, T, T}(::Nil) where {T <: VecT, N}
-    throw(ArgumentError(NIL_VEC_ERROR_MSG))
-end
-
 const NIL_STATIC_ERROR_MSG = "`Nil` is incompatible with `Static` numbers."

 function Base.convert(::Type{Nil},
diff --git a/test/Project.toml b/test/Project.toml
index 998901f64..b83006940 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -14,12 +14,14 @@ Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
 LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
 LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531"
 MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -55,12 +57,14 @@ Hwloc = "3.2.0"
 InteractiveUtils = "<0.0.1, 1"
 LinearAlgebra = "1.10"
 Logging = "1.10"
+LoopVectorization = "0.12.171"
 LuxCore = "1.0"
-LuxLib = "1.3"
+LuxLib = "1.3.4"
 LuxTestUtils = "1.3"
-MLDataDevices = "1.1"
+MLDataDevices = "1.3"
 MLUtils = "0.4.3"
 NNlib = "0.9.24"
+Octavian = "0.3.28"
 OneHotArrays = "0.2.5"
 Optimisers = "0.3.3"
 Pkg = "1.10"
diff --git a/test/qa_tests.jl b/test/qa_tests.jl
index 074f464b0..49977d1f6 100644
--- a/test/qa_tests.jl
+++ b/test/qa_tests.jl
@@ -10,7 +10,7 @@
     Aqua.test_piracies(Lux; treat_as_own=[Lux.outputsize])
 end

-@testitem "Explicit Imports: Quality Assurance" setup=[SharedTestSetup] tags=[:others] begin
+@testitem "Explicit Imports: Quality Assurance" tags=[:others] begin
     # Load all trigger packages
     import Lux, ComponentArrays, ReverseDiff, SimpleChains, Tracker, Zygote, Enzyme
     using ExplicitImports
diff --git a/test/reactant/loss_tests.jl b/test/reactant/loss_tests.jl
index a98bf1a71..b70af9f63 100644
--- a/test/reactant/loss_tests.jl
+++ b/test/reactant/loss_tests.jl
@@ -1,4 +1,4 @@
-@testitem "Compiled Loss Functions" tags=[:reactant] setup=[SharedTestSetup] begin
+@testitem "Compiled Loss Functions" tags=[:reactant] setup=[SharedTestSetup] skip=:(Sys.iswindows()) begin
     using Reactant, Lux, OneHotArrays

     rng = StableRNG(123)
diff --git a/test/reactant/training_tests.jl b/test/reactant/training_tests.jl
index b3b27969c..cc9d0f9b0 100644
--- a/test/reactant/training_tests.jl
+++ b/test/reactant/training_tests.jl
@@ -1,4 +1,4 @@
-@testitem "Reactant: Training API" tags=[:reactant] setup=[SharedTestSetup] begin
+@testitem "Reactant: Training API" tags=[:reactant] setup=[SharedTestSetup] skip=:(Sys.iswindows()) begin
     using Reactant, Optimisers

     @testset "$(mode)" for (mode, atype, dev, ongpu) in MODES
diff --git a/test/shared_testsetup.jl b/test/shared_testsetup.jl
index aba3646de..66cac2715 100644
--- a/test/shared_testsetup.jl
+++ b/test/shared_testsetup.jl
@@ -13,6 +13,8 @@ using MLDataDevices: default_device_rng, CPUDevice, CUDADevice, AMDGPUDevice
 using LuxTestUtils: check_approx
 using Static: True

+using Octavian, LoopVectorization
+
 LuxTestUtils.jet_target_modules!(["Lux", "LuxCore", "LuxLib"])

 LinearAlgebra.BLAS.set_num_threads(Threads.nthreads())