handle data movement with MLDataDevices.jl (#2492)
* removed Flux devices

* fix gpu extensions

* ported MPI extension

* docs

* docs

* skip enzyme tests

* fix docs

* more enzyme fixes

* fix metal

* fix gpu

* doc project

* fix buildkite preference

* fix docs

* fix docs

* fix docs

* fix docs

* some tests are broken

* cleanup

* fix tests

* buildkite

* rework rng_from_array
CarloLucibello authored Oct 11, 2024
1 parent b0c6653 commit aa035e9
Showing 37 changed files with 400 additions and 676 deletions.
8 changes: 4 additions & 4 deletions .buildkite/pipeline.yml
@@ -26,10 +26,10 @@ steps:
# cuda: "*"
# timeout_in_minutes: 60

- label: "Metal with julia {{matrix.julia}}"
- label: "Metal with julia v1"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.julia}}"
version: "1"
- JuliaCI/julia-test#v1:
test_args: "--quickfail"
- JuliaCI/julia-coverage#v1:
@@ -46,7 +46,7 @@
using Pkg
Pkg.resolve()'
commands: |
printf "[Flux]\ngpu_backend = \"Metal\"" > LocalPreferences.toml
printf "[MLDataDevices]\ngpu_backend = \"Metal\"\n" > LocalPreferences.toml
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 60
@@ -74,7 +74,7 @@
rocm: "*"
rocmgpu: "*"
commands: |
printf "[Flux]\ngpu_backend = \"AMDGPU\"" > LocalPreferences.toml
printf "[MLDataDevices]\ngpu_backend = \"AMDGPU\"\n" > LocalPreferences.toml
timeout_in_minutes: 60
env:
JULIA_AMDGPU_CORE_MUST_LOAD: "1"
3 changes: 3 additions & 0 deletions NEWS.md
@@ -2,6 +2,9 @@

See also [github's page](https://github.com/FluxML/Flux.jl/releases) for a complete list of PRs merged before each release.

## v0.14.22
* Data movement between devices is now provided by [MLDataDevices.jl](https://github.com/LuxDL/MLDataDevices.jl).
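  A minimal sketch of the replacement workflow (assuming a GPU package such as CUDA.jl is loaded when a GPU transfer is wanted; otherwise the calls fall back to the CPU):

  ```julia
  using Flux, MLDataDevices  # MLDataDevices is re-exported by Flux, so `using Flux` alone may suffice

  gdev = gpu_device()            # preferred functional GPU backend, or a CPU device
  model = Dense(2 => 3) |> gdev  # move parameters to the selected device
  x = gdev(rand(Float32, 2, 8))  # device objects also move plain arrays
  y = model(x)
  model = model |> cpu_device()  # back to host memory
  ```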

## v0.14.18
* Add [support for distributed data parallel training](https://github.com/FluxML/Flux.jl/pull/2446).
* MPI and NCCL backend available with `FluxMPIExt` and `FluxMPINCCLExt` extensions respectively.
2 changes: 2 additions & 0 deletions Project.toml
@@ -8,6 +8,7 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
@@ -49,6 +50,7 @@ ChainRulesCore = "1.12"
Compat = "4.10.0"
Enzyme = "0.12, 0.13"
Functors = "0.4"
MLDataDevices = "1.2.0"
MLUtils = "0.4"
MPI = "0.20.19"
MacroTools = "0.5"
1 change: 1 addition & 0 deletions docs/Project.toml
@@ -6,6 +6,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
4 changes: 2 additions & 2 deletions docs/make.jl
@@ -1,11 +1,11 @@
using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers,
OneHotArrays, Zygote, ChainRulesCore, Plots, MLDatasets, Statistics,
DataFrames, JLD2
DataFrames, JLD2, MLDataDevices

DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true)

makedocs(
modules = [Flux, NNlib, Functors, MLUtils, Zygote, OneHotArrays, Optimisers, ChainRulesCore],
modules = [Flux, NNlib, Functors, MLUtils, Zygote, OneHotArrays, Optimisers, ChainRulesCore, MLDataDevices],
sitename = "Flux",
pages = [
"Welcome" => "index.md",
113 changes: 45 additions & 68 deletions docs/src/guide/gpu.md
@@ -232,19 +232,17 @@ More information for conditional use of GPUs in CUDA.jl can be found in its [doc

## Using device objects

As a more convenient syntax, Flux allows the usage of GPU `device` objects which can be used to easily transfer models to GPUs (and defaulting to using the CPU if no GPU backend is available). This syntax has a few advantages including automatic selection of the GPU backend and type stability of data movement. To do this, the [`Flux.get_device`](@ref) function can be used.
As a more convenient syntax, Flux allows the usage of GPU `device` objects which can be used to easily transfer models to GPUs (and defaulting to using the CPU if no GPU backend is available). This syntax has a few advantages including automatic selection of the GPU backend and type stability of data movement.
These features are provided by the [MLDataDevices.jl](https://github.com/LuxDL/MLDataDevices.jl) package, which Flux uses internally and re-exports.

`Flux.get_device` first checks for a GPU preference, and if possible returns a device for the preference backend. For instance, consider the following example, where we load the [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) package to use an NVIDIA GPU (`"CUDA"` is the default preference):
A `device` object can be created using the [`gpu_device`](@ref MLDataDevices.gpu_device) function.
`gpu_device` first checks for a GPU preference, and if possible returns a device for the preference backend. For instance, consider the following example, where we load the [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) package to use an NVIDIA GPU (`"CUDA"` is the default preference):

```julia-repl
julia> using Flux, CUDA;
julia> device = Flux.get_device(; verbose=true) # returns handle to an NVIDIA GPU
[ Info: Using backend set in preferences: CUDA.
(::Flux.FluxCUDADevice) (generic function with 1 method)
julia> device.deviceID # check the id of the GPU
CuDevice(0): NVIDIA GeForce GTX 1650
julia> device = gpu_device() # returns handle to an NVIDIA GPU if available
(::CUDADevice{Nothing}) (generic function with 4 methods)
julia> model = Dense(2 => 3);
@@ -262,77 +260,57 @@ julia> model.weight
-0.984794 -0.904345
0.720379 -0.486398
0.851011 -0.586942
```

The device preference can also be set via the [`Flux.gpu_backend!`](@ref) function. For instance, below we first set our device preference to `"CPU"`:
The device preference can also be set via the [`gpu_backend!`](@ref MLDataDevices.gpu_backend!) function. For instance, below we first set our device preference to `"AMDGPU"`:

```julia-repl
julia> using Flux; Flux.gpu_backend!("CPU")
┌ Info: New GPU backend set: CPU.
└ Restart your Julia session for this change to take effect!
julia> gpu_backend!("AMDGPU")
[ Info: GPU backend has been set to AMDGPU. Restart Julia to use the new backend.
```

Then, after restarting the Julia session, `Flux.get_device` returns a handle to the `"CPU"`:
If no functional GPU backend is available, the device will default to a CPU device.
You can also explicitly request a CPU device by calling the [`cpu_device`](@ref MLDataDevices.cpu_device) function.

```julia-repl
julia> using Flux, CUDA; # even if CUDA is loaded, we'll still get a CPU device
julia> device = Flux.get_device(; verbose=true) # get a CPU device
[ Info: Using backend set in preferences: CPU.
(::Flux.FluxCPUDevice) (generic function with 1 method)
julia> using Flux, MLDataDevices
julia> model = Dense(2 => 3);
julia> model = model |> device
Dense(2 => 3) # 9 parameters
julia> cdev = cpu_device()
(::CPUDevice{Nothing}) (generic function with 4 methods)
julia> model.weight # no change; model still lives on CPU
3×2 Matrix{Float32}:
-0.942968 0.856258
0.440009 0.714106
-0.419192 -0.471838
```
Clearly, this means that the same code will work for any GPU backend and the CPU.
julia> gdev = gpu_device(force=true) # force GPU device, error if no GPU is available
(::CUDADevice{Nothing}) (generic function with 4 methods)
If the preference backend isn't available or isn't functional, then [`Flux.get_device`](@ref) looks for a CUDA, AMDGPU or Metal backend, and returns a corresponding device (if the backend is available and functional). Otherwise, a CPU device is returned. In the below example, the GPU preference is `"CUDA"`:
julia> model = Dense(2 => 3); # model in CPU memory
```julia-repl
julia> using Flux; # preference is CUDA, but CUDA.jl not loaded
julia> gmodel = model |> gdev; # transfer model to GPU
julia> device = Flux.get_device(; verbose=true) # this will resort to automatic device selection
[ Info: Using backend set in preferences: CUDA.
┌ Warning: Trying to use backend: CUDA but it's trigger package is not loaded.
│ Please load the package and call this function again to respect the preferences backend.
└ @ Flux ~/fluxml/Flux.jl/src/functor.jl:637
[ Info: Using backend: CPU.
(::Flux.FluxCPUDevice) (generic function with 1 method)
julia> cmodel = gmodel |> cdev; # transfer model back to CPU
```
For detailed information about how the backend is selected, check the documentation for [`Flux.get_device`](@ref).
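
To make the backend-agnostic point concrete, here is a small sketch (assuming only the `gpu_device`/`cpu_device` handles introduced above); the same lines run unchanged whether the returned device is a `CUDADevice`, another GPU backend, or the CPU fallback:

```julia
device = gpu_device()              # CUDADevice, AMDGPUDevice, MetalDevice or CPUDevice

model = Chain(Dense(2 => 16, relu), Dense(16 => 1)) |> device
x = device(rand(Float32, 2, 32))   # the same handle moves input batches
y = model(x)                       # identical code on every backend
model = model |> cpu_device()      # and back to host memory when needed
```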

## Data movement across GPU devices

Flux also supports getting handles to specific GPU devices, and transferring models from one GPU device to another GPU
device from the same backend. Let's try it out for NVIDIA GPUs. First, we list all the available devices:
Flux also supports getting handles to specific GPU devices, and transferring models from one GPU device to another GPU device from the same backend. Let's try it out for NVIDIA GPUs. First, we list all the available devices:

```julia-repl
julia> using Flux, CUDA;
julia> CUDA.devices()
CUDA.DeviceIterator() for 3 devices:
0. GeForce RTX 2080 Ti
1. GeForce RTX 2080 Ti
2. TITAN X (Pascal)
0. NVIDIA TITAN RTX
1. NVIDIA TITAN RTX
2. NVIDIA TITAN RTX
```

Then, let's select the device with id `0`:

```julia-repl
julia> device0 = Flux.get_device("CUDA", 0) # the currently supported values for backend are "CUDA" and "AMDGPU"
(::Flux.FluxCUDADevice) (generic function with 1 method)
julia> device0 = gpu_device(1)
(::CUDADevice{CuDevice}) (generic function with 4 methods)
julia> device0.device
CuDevice(0): NVIDIA TITAN RTX
```
Notice that indexing starts from `0` in the `CUDA.devices()` output, but `gpu_device` expects device ids starting from `1`.
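
As a sketch of that off-by-one mapping (device names as in the listing above):

```julia
# CUDA.devices() enumerates GPUs starting at 0, while gpu_device takes a 1-based index:
device0 = gpu_device(1)   # wraps CuDevice(0)
device1 = gpu_device(2)   # wraps CuDevice(1)
```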

Then, let's move a simple dense layer to the GPU represented by `device0`:

@@ -343,27 +321,25 @@ Dense(2 => 3) # 9 parameters
julia> dense_model = dense_model |> device0;
julia> dense_model.weight
3×2 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
0.695662 0.816299
-0.204763 -0.10232
-0.955829 0.538412
3×2 CuArray{Float32, 2, CUDA.DeviceMemory}:
-0.142062 -0.131455
-0.828134 -1.06552
0.608595 -1.05375
julia> CUDA.device(dense_model.weight) # check the GPU to which dense_model is attached
CuDevice(0): GeForce RTX 2080 Ti
CuDevice(0): NVIDIA TITAN RTX
```

Next, we'll get a handle to the device with id `1`, and move `dense_model` to that device:

```julia-repl
julia> device1 = Flux.get_device("CUDA", 1)
(::Flux.FluxCUDADevice) (generic function with 1 method)
julia> device1 = gpu_device(2)
(::CUDADevice{CuDevice}) (generic function with 4 methods)
julia> dense_model = dense_model |> device1; # don't directly print the model; see warning below
julia> CUDA.device(dense_model.weight)
CuDevice(1): GeForce RTX 2080 Ti
CuDevice(1): NVIDIA TITAN RTX
```
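
The same handles move data as well as models; a short sketch, assuming input arrays follow the same transfer path as the model parameters above:

```julia
x = device1(rand(Float32, 2, 4))   # place an input batch on the second GPU
CUDA.device(x)                     # should report CuDevice(1), matching dense_model
y = dense_model(x)                 # model and data now live on the same device
```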

Due to a limitation in `Metal.jl`, currently this kind of data movement across devices is only supported for `CUDA` and `AMDGPU` backends.
Expand All @@ -376,14 +352,15 @@ Due to a limitation in `Metal.jl`, currently this kind of data movement across d


```@docs
Flux.AbstractDevice
Flux.FluxCPUDevice
Flux.FluxCUDADevice
Flux.FluxAMDGPUDevice
Flux.FluxMetalDevice
Flux.supported_devices
Flux.get_device
Flux.gpu_backend!
MLDataDevices.cpu_device
MLDataDevices.default_device_rng
MLDataDevices.get_device
MLDataDevices.gpu_device
MLDataDevices.gpu_backend!
MLDataDevices.get_device_type
MLDataDevices.reset_gpu_device!
MLDataDevices.supported_gpu_backends
MLDataDevices.DeviceIterator
```
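
A hedged sketch of two of the utilities listed above, `get_device` and `get_device_type`, assuming the standard MLDataDevices semantics:

```julia
using Flux, MLDataDevices

dev = gpu_device()
x = dev(rand(Float32, 3))

get_device(x)        # the device object currently holding `x`
get_device_type(x)   # its type, e.g. CUDADevice or CPUDevice
```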

## Distributed data parallel training
4 changes: 2 additions & 2 deletions docs/src/guide/models/recurrence.md
@@ -71,7 +71,7 @@ julia> RNN(2, 5) # or equivalently RNN(2 => 5)
Recur(
RNNCell(2 => 5, tanh), # 45 parameters
) # Total: 4 trainable arrays, 45 parameters,
# plus 1 non-trainable, 5 parameters, summarysize 412 bytes.
# plus 1 non-trainable, 5 parameters, summarysize 404 bytes.
```

Equivalent to the `RNN` stateful constructor, `LSTM` and `GRU` are also available.
@@ -86,7 +86,7 @@ Chain(
),
Dense(5 => 1), # 6 parameters
) # Total: 6 trainable arrays, 51 parameters,
# plus 1 non-trainable, 5 parameters, summarysize 580 bytes.
# plus 1 non-trainable, 5 parameters, summarysize 540 bytes.
```
In this example, each output has only one component.

4 changes: 2 additions & 2 deletions docs/src/guide/saving.md
@@ -62,7 +62,7 @@ julia> m = Chain(Dense(10 => 5, relu), Dense(5 => 2))
Chain(
Dense(10 => 5, relu), # 55 parameters
Dense(5 => 2), # 12 parameters
) # Total: 4 arrays, 67 parameters, 524 bytes.
) # Total: 4 arrays, 67 parameters, 476 bytes.
julia> for epoch in 1:10
# ... train model ...
@@ -131,7 +131,7 @@ julia> model
Chain(
Dense(10 => 5, relu), # 55 parameters
Dense(5 => 2), # 12 parameters
) # Total: 4 arrays, 67 parameters, 524 bytes.
) # Total: 4 arrays, 67 parameters, 476 bytes.
```
!!! warning
Saving models this way could lead to compatibility issues across julia versions
3 changes: 2 additions & 1 deletion docs/src/reference/destructure.md
@@ -94,4 +94,5 @@ Flux.loadmodel!
Functors.KeyPath
Functors.getkeypath
Functors.haskeypath
```
Functors.setkeypath!
```
11 changes: 0 additions & 11 deletions ext/FluxAMDGPUExt/FluxAMDGPUExt.jl
@@ -17,16 +17,6 @@ const MIOPENFloat = AMDGPU.MIOpen.MIOPENFloat
# Set to boolean on the first call to check_use_amdgpu
const USE_AMDGPU = Ref{Union{Nothing, Bool}}(nothing)

function (device::Flux.FluxAMDGPUDevice)(x)
if device.deviceID === nothing
Flux.gpu(Flux.FluxAMDGPUAdaptor(), x)
else
return Flux.gpu(Flux.FluxAMDGPUAdaptor(AMDGPU.device_id(device.deviceID) - 1), x) # subtracting 1, because device_id returns a positive integer
end
end
Flux._get_device_name(::Flux.FluxAMDGPUDevice) = "AMDGPU"
Flux._isavailable(::Flux.FluxAMDGPUDevice) = true
Flux._isfunctional(::Flux.FluxAMDGPUDevice) = AMDGPU.functional()

function check_use_amdgpu()
if !isnothing(USE_AMDGPU[])
@@ -55,7 +45,6 @@ include("conv.jl")

function __init__()
Flux.AMDGPU_LOADED[] = true
Flux.DEVICES[][Flux.GPU_BACKEND_ORDER["AMDGPU"]] = AMDGPU.functional() ? Flux.FluxAMDGPUDevice(AMDGPU.device()) : Flux.FluxAMDGPUDevice(nothing)
end

# TODO
8 changes: 2 additions & 6 deletions ext/FluxAMDGPUExt/functor.jl
@@ -108,10 +108,6 @@ function Adapt.adapt_structure(to::FluxCPUAdaptor, m::AMDGPU_CONV)
Adapt.adapt(to, m.bias), _other_args(m)...)
end

function Flux.get_device(::Val{:AMDGPU}, id::Int) # id should start from 0
old_id = AMDGPU.device_id(AMDGPU.device()) - 1 # subtracting 1 because ids start from 0
AMDGPU.device!(AMDGPU.devices()[id + 1]) # adding 1 because ids start from 0
device = Flux.FluxAMDGPUDevice(AMDGPU.device())
AMDGPU.device!(AMDGPU.devices()[old_id + 1])
return device
function Flux._get_device(::Val{:AMDGPU}, id::Int) # id should start from 0
return MLDataDevices.gpu_device(id+1, force=true)
end
15 changes: 0 additions & 15 deletions ext/FluxCUDAExt/FluxCUDAExt.jl
@@ -14,17 +14,6 @@ import Adapt: adapt_storage

const USE_CUDA = Ref{Union{Nothing, Bool}}(nothing)

function (device::Flux.FluxCUDADevice)(x)
if device.deviceID === nothing
return Flux.gpu(Flux.FluxCUDAAdaptor(), x)
else
return Flux.gpu(Flux.FluxCUDAAdaptor(device.deviceID.handle), x)
end
end
Flux._get_device_name(::Flux.FluxCUDADevice) = "CUDA"
Flux._isavailable(::Flux.FluxCUDADevice) = true
Flux._isfunctional(::Flux.FluxCUDADevice) = CUDA.functional()

function check_use_cuda()
if !isnothing(USE_CUDA[])
return
@@ -43,14 +32,10 @@ end
ChainRulesCore.@non_differentiable check_use_cuda()

include("functor.jl")
include("utils.jl")

function __init__()
Flux.CUDA_LOADED[] = true

## add device to available devices
Flux.DEVICES[][Flux.GPU_BACKEND_ORDER["CUDA"]] = CUDA.functional() ? Flux.FluxCUDADevice(CUDA.device()) : Flux.FluxCUDADevice(nothing)

try
Base.require(Main, :cuDNN)
catch
8 changes: 2 additions & 6 deletions ext/FluxCUDAExt/functor.jl
@@ -56,10 +56,6 @@ function _cuda(id::Union{Nothing, Int}, x)
fmap(x -> Adapt.adapt(FluxCUDAAdaptor(id), x), x; exclude=Flux._isleaf)
end

function Flux.get_device(::Val{:CUDA}, id::Int)
old_id = CUDA.device().handle
CUDA.device!(id)
device = Flux.FluxCUDADevice(CUDA.device())
CUDA.device!(old_id)
return device
function Flux._get_device(::Val{:CUDA}, id::Int)
return MLDataDevices.gpu_device(id+1, force=true)
end
1 change: 0 additions & 1 deletion ext/FluxCUDAExt/utils.jl

This file was deleted.

