From 4cea75c331da0da8ff506e75f47634bad5f12dfb Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Wed, 28 Aug 2024 23:22:48 -0400 Subject: [PATCH 1/4] fix: remove uses of NNPACK --- benchmark/perf_report.jl | 15 ---- src/NNlib.jl | 13 ---- src/conv.jl | 27 +------ src/nnpack/NNPACK.jl | 55 -------------- src/nnpack/error.jl | 83 --------------------- src/nnpack/impl.jl | 50 ------------- src/nnpack/interface.jl | 44 ----------- src/nnpack/libnnpack.jl | 135 ---------------------------------- src/nnpack/libnnpack_types.jl | 85 --------------------- src/nnpack/performance.jl | 31 -------- src/pooling.jl | 12 +-- test/conv.jl | 18 ----- test/inference.jl | 3 - test/pooling.jl | 20 ----- 14 files changed, 3 insertions(+), 588 deletions(-) delete mode 100644 src/nnpack/NNPACK.jl delete mode 100644 src/nnpack/error.jl delete mode 100644 src/nnpack/impl.jl delete mode 100644 src/nnpack/interface.jl delete mode 100644 src/nnpack/libnnpack.jl delete mode 100644 src/nnpack/libnnpack_types.jl delete mode 100644 src/nnpack/performance.jl diff --git a/benchmark/perf_report.jl b/benchmark/perf_report.jl index 5c06515eb..9b861e869 100644 --- a/benchmark/perf_report.jl +++ b/benchmark/perf_report.jl @@ -37,10 +37,6 @@ for rank in (2,), (NNlib.depthwiseconv_im2col!, NNlib.∇depthwiseconv_data_im2col!, NNlib.∇depthwiseconv_filter_im2col!, DepthwiseConvDims, "im2col"), ] - if NNlib.is_nnpack_available() - push!(benchmark_items, (NNlib.conv_nnpack!, NNlib.∇conv_data_nnpack!, NNlib.∇conv_filter_nnpack!, DenseConvDims, "nnpack")) - end - for (conv!, ∇conv_data!, ∇conv_filter!, cT, backend) in benchmark_items x = zeros(Float32, repeat([N], rank)..., C_in, 1) @@ -105,15 +101,4 @@ for rank in (2,), @show(pdims) @save "results.jld2" results end - - if NNlib.is_nnpack_available() - if NNlib.nnpack_supported_operation(pdims) - t_fwd = @benchmark NNlib.maxpool_nnpack!($y, $x, $pdims) - - add_result(t_fwd, "maxpool2d", "nnpack", pdims) - - @show(pdims) - @save "results.jld2" results - end - end end diff --git a/src/NNlib.jl b/src/NNlib.jl index 8cf66370f..c1ac58933 100644 --- a/src/NNlib.jl +++ b/src/NNlib.jl @@ -24,19 +24,6 @@ const Numeric = Union{AbstractArray{<:T}, T} where {T<:Number} include("dim_helpers.jl") export ConvDims, DenseConvDims, PoolDims, DepthwiseConvDims -is_nnpack_available() = false - -@init @require NNPACK_jll="a6bfbf70-4841-5cb9-aa18-3a8ad3c413ee" begin - if isdefined(NNPACK_jll, :libnnpack) - include("nnpack/NNPACK.jl") - else - @warn "NNPACK not available for your platform: " * - "$( Pkg.BinaryPlatforms.platform_name(Pkg.BinaryPlatforms.platform_key_abi()))" * - "($( Pkg.BinaryPlatforms.triplet(Pkg.BinaryPlatforms.platform_key_abi()))) - You will be able to use only the default Julia NNlib backend" - end -end - include("activations.jl") for f in ACTIVATIONS @eval export $(f) diff --git a/src/conv.jl b/src/conv.jl index 3fecb9151..fead2ee21 100644 --- a/src/conv.jl +++ b/src/conv.jl @@ -76,7 +76,7 @@ end # Let's generate auto-allocating versions of all our functions, for all backends. # We `@timeit` these methods separately, as we want to know how much time is spent in # allocation. :P -for backend in (Symbol(), :_direct, :_im2col, :_nnpack) +for backend in (Symbol(), :_direct, :_im2col) # First make auto-allocating versions of the conv()-like calls: for name in (:conv, :depthwiseconv) @eval begin @@ -134,7 +134,7 @@ end # since we can specialize on sizes. for front_name in (:conv, :∇conv_data, :∇conv_filter, :depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter) - for backend in (Symbol(), :_direct, :_im2col) ## NNPACK is only for 2d conv + for backend in (Symbol(), :_direct, :_im2col) for N in (3, 4) @eval begin function $(Symbol("$(front_name)$(backend)!"))( @@ -381,26 +381,3 @@ function rrule(::typeof(∇conv_filter), x, dy, cdims; kw...) end return ∇conv_filter(x, dy, cdims; kw...), ∇conv_filter_pullback end - -# Use NNPACK if it is available and the operation is supported -# commented out 'till proper benchmarking and more correctness test are performed -# if is_nnpack_available() -# function conv(x::Array{Float32, 4}, w::Array{Float32, 4}, -# cdims::DenseConvDims{2, K, C_in, C_out, (1, 1), P, (1, 1), F}; -# kwargs...) where {K, C_in, C_out, P, F} -# return conv_nnpack(x, w, cdims; kwargs...) -# end - -# function ∇conv_data(dy::Array{Float32, 4}, w::Array{Float32, 4}, -# cdims::DenseConvDims{2, K, C_in, C_out, (1, 1), P, (1, 1), F}; -# kwargs...) where {K, C_in, C_out, P, F} -# return ∇conv_data_nnpack(dy, w, cdims; kwargs...) -# end - -# function ∇conv_filter(x::Array{Float32, 4}, dy::Array{Float32, 4}, -# cdims::DenseConvDims{2, K, C_in, C_out, (1, 1), P, (1, 1), F}; -# kwargs...) where {K, C_in, C_out, P, F} -# return ∇conv_filter_nnpack(x, dy, cdims; kwargs...) -# end -# end -######################################################## diff --git a/src/nnpack/NNPACK.jl b/src/nnpack/NNPACK.jl deleted file mode 100644 index 685415a7e..000000000 --- a/src/nnpack/NNPACK.jl +++ /dev/null @@ -1,55 +0,0 @@ -using NNPACK_jll - -include("libnnpack_types.jl") -include("error.jl") -include("libnnpack.jl") -include("performance.jl") -include("interface.jl") - - -const shared_threadpool_dict = Dict{UInt64, Base.RefValue}() - -""" - is_nnpack_available() - -Checks if the current hardware is supported by NNPACK. -""" -function is_nnpack_available() - status = nnp_initialize() - if status == nnp_status_unsupported_hardware - return false - else - return true - end -end - -""" - allocate_threadpool() - -Allocates several threadpool based on the upper limit on the number of threads for the machine. -Allows NNPACK to intelligently choose which threadpool to use for getting the best -performance. -""" -function allocate_threadpool() - global NNPACK_CPU_THREADS = NNPACK_CPU_THREADS > 8 ? UInt64(8) : UInt64(exp2(floor(log2(NNPACK_CPU_THREADS)))) - for i in 0:Int(log2(NNPACK_CPU_THREADS)) - threads = UInt64(2^i) - push!(shared_threadpool_dict, threads => Ref(pthreadpool_create(threads))) - end -end - -@init begin - status = nnp_initialize() - if status == nnp_status_unsupported_hardware - @warn "Hardware is unsupported by NNPACK so falling back to default NNlib" - end - try - global NNPACK_CPU_THREADS = parse(UInt64, ENV["NNPACK_CPU_THREADS"]) - catch - # Sys.CPU_THREADS should be a better default if we are tuning the benchmark suite on - # a particular machine. However, we fix the runtime threadpool here to have a max of - # 4 threads so anything above will be ignored anyways - global NNPACK_CPU_THREADS = UInt64(4) - end - allocate_threadpool() -end diff --git a/src/nnpack/error.jl b/src/nnpack/error.jl deleted file mode 100644 index 83522c37d..000000000 --- a/src/nnpack/error.jl +++ /dev/null @@ -1,83 +0,0 @@ -struct NNPACKError <: Exception - code::nnp_status - msg::AbstractString -end - -Base.show(io::IO, err::NNPACKError) = print(io, "NNPACKError(code $(err.code), $(err.msg))") - -function NNPACKError(status::nnp_status) - msg = "NNPACK STATUS SUCCESS" - if status == nnp_status_invalid_batch_size - msg = "NNPACK STATUS INVALID BATCH SIZE" - elseif status == nnp_status_invalid_channels - msg = "NNPACK STATUS INVALID CHANNELS" - elseif status == nnp_status_invalid_input_channels - msg = "NNPACK STATUS INVALID INPUT CHANNELS" - elseif status == nnp_status_invalid_output_channels - msg = "NNPACK STATUS INVALID OUTPUT CHANNELS" - elseif status == nnp_status_invalid_input_size - msg = "NNPACK STATUS INVALID INPUT SIZE" - elseif status == nnp_status_invalid_input_stride - msg = "NNPACK STATUS INVALID INPUT STRIDE" - elseif status == nnp_status_invalid_input_padding - msg = "NNPACK STATUS INVALID INPUT PADDING" - elseif status == nnp_status_invalid_kernel_size - msg = "NNPACK STATUS INVALID KERNEL SIZE" - elseif status == nnp_status_invalid_pooling_size - msg = "NNPACK STATUS INVALID POOLING SIZE" - elseif status == nnp_status_invalid_pooling_stride - msg = "NNPACK STATUS INVALID POOLING STRIDE" - elseif status == nnp_status_invalid_algorithm - msg = "NNPACK STATUS INVALID ALGORITHM" - elseif status == nnp_status_invalid_transform_strategy - msg = "NNPACK STATUS INVALID TRANSFORM STRATEGY" - elseif status == nnp_status_invalid_output_subsampling - msg = "NNPACK STATUS INVALID OUTPUT SUBSAMPLING" - elseif status == nnp_status_invalid_activation - msg = "NNPACK STATUS INVALID ACTIVATION" - elseif status == nnp_status_invalid_activation_parameters - msg = "NNPACK STATUS INVALID ACTIVATION PARAMETERS" - elseif status == nnp_status_unsupported_input_size - msg = "NNPACK STATUS UNSUPPORTED INPUT SIZE" - elseif status == nnp_status_unsupported_input_stride - msg = "NNPACK STATUS UNSUPPORTED INPUT STRIDE" - elseif status == nnp_status_unsupported_input_padding - msg = "NNPACK STATUS UNSUPPORTED INPUT PADDING" - elseif status == nnp_status_unsupported_kernel_size - msg = "NNPACK STATUS UNSUPPORTED KERNEL SIZE" - elseif status == nnp_status_unsupported_pooling_size - msg = "NNPACK STATUS UNSUPPORTED POOLING SIZE" - elseif status == nnp_status_unsupported_pooling_stride - msg = "NNPACK STATUS UNSUPPORTED POOLING STRIDE" - elseif status == nnp_status_unsupported_algorithm - msg = "NNPACK STATUS UNSUPPORTED ALGORITHM" - elseif status == nnp_status_unsupported_transform_strategy - msg = "NNPACK STATUS UNSUPPORTED TRANSFORM STRATEGY" - elseif status == nnp_status_unsupported_activation - msg = "NNPACK STATUS UNSUPPORTED ACTIVATION" - elseif status == nnp_status_unsupported_activation_parameters - msg = "NNPACK STATUS UNSUPPORTED ACTIVATION PARAMETERS" - elseif status == nnp_status_uninitialized - msg = "NNPACK STATUS UNINITIALIZED" - elseif status == nnp_status_unsupported_hardware - msg = "NNPACK STATUS UNSUPPORTED HARDWARE" - elseif status == nnp_status_out_of_memory - msg = "NNPACK STATUS OUT OF MEMORY" - elseif status == nnp_status_insufficient_buffer - msg = "NNPACK STATUS INSUFFICIENT BUFFER" - elseif status == nnp_status_misaligned_buffer - msg = "NNPACK STATUS MISALIGNED BUFFER" - end - NNPACKError(status, msg) -end - -macro nnpack_check(nnp_func) - quote - local err::nnp_status - err = $(esc(nnp_func)) - if err != nnp_status_success - throw(NNPACKError(err)) - end - err - end -end diff --git a/src/nnpack/impl.jl b/src/nnpack/impl.jl deleted file mode 100644 index 3309404e1..000000000 --- a/src/nnpack/impl.jl +++ /dev/null @@ -1,50 +0,0 @@ -function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}} - check_dims(size(x), size(y), pdims) - threadpool = select_threadpool(pdims, size(y, 4)) - nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims), - stride = stride(pdims), threadpool = threadpool) -end - -function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims; - b::A2 = zeros(Float32, size(x, 3)), - algo = UInt32(0)) where {A1<:Array{Float32, 4}, - A2<:Array{Float32, 1}} - check_dims(size(x), size(w), size(y), cdims) - threadpool = select_threadpool(cdims, size(y, 4)) - - if flipkernel(cdims) == 0 - w = flipweight(w) - end - - nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims), - stride = stride(cdims), threadpool = threadpool) -end - -function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims; - algo = UInt32(0)) where{A<:Array{Float32, 4}} - check_dims(size(dx), size(w), size(dy), cdims) - threadpool = select_threadpool(cdims, size(dy, 4)) - - if flipkernel(cdims) == 0 - w = flipweight(w) - end - - nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims), - stride = stride(cdims), threadpool = threadpool) -end - -function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims; - algo = UInt32(0)) where{A<:Array{Float32, 4}} - check_dims(size(x), size(dw), size(dy), cdims) - threadpool = select_threadpool(cdims, size(dy, 4)) - - nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims), - stride = stride(cdims), threadpool = threadpool) - - if flipkernel(cdims) == 0 - dw .= flipweight(dw) - end - - dw -end - diff --git a/src/nnpack/interface.jl b/src/nnpack/interface.jl deleted file mode 100644 index 5cdaccb4d..000000000 --- a/src/nnpack/interface.jl +++ /dev/null @@ -1,44 +0,0 @@ -include("impl.jl") - -## NNPACK supports only Float32 -for (front_name, backend) in ( - :conv => :_nnpack, - :∇conv_data => :_nnpack, - :∇conv_filter => :_nnpack, - ) - @eval begin - function $(Symbol("$(front_name)$(backend)!"))( - out::Array{T1,4}, in1::Array{T2,4}, in2::Array{T3,4}, - cdims::ConvDims; kwargs...) where {T1, T2, T3} - @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1 - # Output must of the same type as in the function signature - T1.($(Symbol("$(front_name)$(backend)!"))(Float32.(out), Float32.(in1), - Float32.(in2), cdims; kwargs...)) - end - end -end - -function maxpool_nnpack!(y::Array{T1, 4}, x::Array{T2, 4}, pdims::PoolDims; - kwargs...) where {T1, T2} - @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1 - # We want the output to be of the same type as desired - T1.(maxpool_nnpack!(Float32.(y), Float32.(x), pdims; kwargs...)) -end - -""" - nnpack_supported_operation(cdims::ConvDims) - nnpack_supported_operation(pdims::PoolDims) - -Returns `true` if nnpack supports the convolution/pooling operation for the given parameters. -""" -function nnpack_supported_operation(pdims::PoolDims{2, K, S, P, (1, 1)}) where {K, S, P} - val = input_size(pdims)[1:2] .+ (P[1] + P[2], P[3] + P[4]) .- K - return val .% S == (0, 0) ? true : false -end - -function nnpack_supported_operation(cdims::ConvDims{2, K, (1, 1), P, (1, 1)}) where {K, S, P} - return true -end - -# Return false for everything else -nnpack_supported_operation(dims) = false diff --git a/src/nnpack/libnnpack.jl b/src/nnpack/libnnpack.jl deleted file mode 100644 index 2f3996c32..000000000 --- a/src/nnpack/libnnpack.jl +++ /dev/null @@ -1,135 +0,0 @@ -#NOTE: We do the error handling of nnp_initialize while loading NNPACK -function nnp_initialize() - ccall((:nnp_initialize, libnnpack), nnp_status, (),) -end - -function nnp_deinitialize() - @nnpack_check ccall((:nnp_deinitialize, libnnpack), nnp_status, (),) -end - -function pthreadpool_create(n = 0) - ccall((:pthreadpool_create, libnnpack), Ptr{Cvoid}, (Csize_t,), n) -end - -function nnp_relu_output(batch_size, channels, input, output, negative_slope, threadpool) - @nnpack_check ccall((:nnp_relu_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, input, output, negative_slope, threadpool) -end - -function nnp_relu_output(x::Array{Float32,N}, y::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N} - # Investigate why the channel and batch dims need to specified like this - nnp_relu_output(prod(size(x)[N-1:N]), prod(size(x)[1:N-2]), x, y, negative_slope, threadpool) - y -end - -function nnp_relu_input_gradient(batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool) - @nnpack_check ccall((:nnp_relu_input_gradient, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool) -end - -function nnp_relu_input_gradient(x::Array{Float32,N}, dy::Array{Float32,N}, dx::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N} - # Investigate why the channel and batch dims need to specified like this - nnp_relu_input_gradient(Csize_t(prod(size(x)[N-1:N])), prod(size(x)[1:N-2]), dy, x, dx, negative_slope, threadpool) - dx -end - -function nnp_softmax_output(batch_size, channels, input, output, threadpool) - @nnpack_check ccall((:nnp_softmax_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input, output, threadpool) -end - -function nnp_softmax_output(x::VecOrMat{Float32}, y::VecOrMat{Float32}; threadpool = C_NULL) - nnp_softmax_output(ndims(x) == 2 ? size(x, 2) : 1, size(x, 1), x, y, threadpool) - y -end - -#FIXME: Output of fully connected not consistent with `kernel * input` -#NOTE: This most likely due to nnpack being row major. Investigate this. - -function nnp_fully_connected_output(batch_size, input_channels, output_channels, input, kernel, output, threadpool, profile) - @nnpack_check ccall((:nnp_fully_connected_output, libnnpack), nnp_status, (Csize_t, Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t, Ptr{Cvoid}), batch_size, input_channels, output_channels, input, kernel, output, threadpool, C_NULL) -end - -function nnp_fully_connected_output(x::Array{Float32,2}, w::Array{Float32,2}, y::Array{Float32,2}; profile = nothing, threadpool = C_NULL) - profile = profile == nothing ? nnp_profile() : profile - nnp_fully_connected_output(size(x, 2), size(x, 1), size(w, 1), x, w, y, threadpool, profile) - y -end - -function nnp_fully_connected_inference_f16f32(input_channels, output_channels, input, kernel, output, threadpool) - @nnpack_check ccall((:nnp_fully_connected_inference_f16f32, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cvoid}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool) -end - -nnp_fully_connected_inference_f16f32(x::Array{Float32, 1}, w::Array{Float16,2}, y::Array{Float32, 1}; threadpool = C_NULL) = - nnp_fully_connected_inference(reshape(x, size(x), 1), w, reshape(y, size(y), 1), threadpool = threadpool) - -function nnp_fully_connected_inference_f16f32(x::Array{Float32, 2}, w::Array{Float16,2}, y::Array{Float32, 2}; threadpool = C_NULL) - nnp_fully_connected_inference(size(x, 1), size(y, 1), x, w, y, threadpool) - y -end - -function nnp_fully_connected_inference(input_channels, output_channels, input, kernel, output, threadpool) - @nnpack_check ccall((:nnp_fully_connected_inference, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool) -end - -nnp_fully_connected_inference(x::Array{Float32, 1}, w::Array{Float32,2}; threadpool = C_NULL) = - nnp_fully_connected_inference(reshape(x, size(x), 1), w, threadpool = threadpool) - -function nnp_fully_connected_inference(x::Array{Float32, 2}, w::Array{Float32, 2}, y::Array{Float32, 2}; threadpool = C_NULL) - nnp_fully_connected_inference(size(x, 1), size(y, 1), x, w, y, threadpool) - y -end - -function nnp_max_pooling_output(batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool) - @nnpack_check ccall((:nnp_max_pooling_output, libnnpack), nnp_status, (Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool) -end - -function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel::Tuple; padding = 0, stride = 1, threadpool = C_NULL) - input_size = nnp_size(Csize_t.((size(x, 1), size(x, 2)))...) - pooling_size = nnp_size(Csize_t.(kernel)...) - input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1])) - pooling_stride = nnp_size(Csize_t.(stride)...) - nnp_max_pooling_output(size(x, 4), size(x, 3), input_size, input_padding, pooling_size, pooling_stride, x, y, threadpool) - y -end - -#TODO: Add wrapper for convolution inference - -function nnp_convolution_input_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile) - @nnpack_check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL) -end - -function nnp_convolution_input_gradient(dx::Array{Float32,4}, dy::Array{Float32,4}, w::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing) - input_size = nnp_size(Csize_t.((size(dx,1), size(dx,2)))...) - kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...) - input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1])) - profile = profile == nothing ? nnp_profile() : profile - workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer - nnp_convolution_input_gradient(UInt32(algo), size(dx,4), size(dx,3), size(w,4), input_size, input_padding, kernel_size, dy, w, dx, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile) - dx -end - -function nnp_convolution_kernel_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile) - @nnpack_check ccall((:nnp_convolution_kernel_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL) -end - -function nnp_convolution_kernel_gradient(dw::Array{Float32,4}, x::Array{Float32,4}, dy::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing) - input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...) - kernel_size = nnp_size(Csize_t.((size(dw,1),size(dw,2)))...) - input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1])) - profile = profile == nothing ? nnp_profile() : profile - workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer - nnp_convolution_kernel_gradient(UInt32(algo), size(x,4), size(x,3), size(dw,4), input_size, input_padding, kernel_size, x, dy, dw, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile) - dw -end - -function nnp_convolution_output(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile) - @nnpack_check ccall((:nnp_convolution_output, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL) -end - -function nnp_convolution_output(y::Array{Float32,4}, x::Array{Float32,4}, w::Array{Float32,4}, b::Array{Float32,1}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing) - input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...) - kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...) - input_padding = nnp_padding(Csize_t(padding[3]), Csize_t(padding[2]), Csize_t(padding[4]), Csize_t(padding[1])) - profile = profile == nothing ? nnp_profile() : profile - workspace_buffer = workspace_buffer === nothing ? C_NULL : workspace_buffer - nnp_convolution_output(UInt32(algo), size(x,4), size(x,3), size(w,4), input_size, input_padding, kernel_size, x, w, b, y, workspace_buffer, workspace_size, UInt32(0), C_NULL, threadpool, profile) - y -end diff --git a/src/nnpack/libnnpack_types.jl b/src/nnpack/libnnpack_types.jl deleted file mode 100644 index 6e7b23c16..000000000 --- a/src/nnpack/libnnpack_types.jl +++ /dev/null @@ -1,85 +0,0 @@ -const nnp_status = UInt32 -const nnp_status_success = (UInt32)(0) -const nnp_status_invalid_batch_size = (UInt32)(2) -const nnp_status_invalid_channels = (UInt32)(3) -const nnp_status_invalid_input_channels = (UInt32)(4) -const nnp_status_invalid_output_channels = (UInt32)(5) -const nnp_status_invalid_input_size = (UInt32)(10) -const nnp_status_invalid_input_stride = (UInt32)(11) -const nnp_status_invalid_input_padding = (UInt32)(12) -const nnp_status_invalid_kernel_size = (UInt32)(13) -const nnp_status_invalid_pooling_size = (UInt32)(14) -const nnp_status_invalid_pooling_stride = (UInt32)(15) -const nnp_status_invalid_algorithm = (UInt32)(16) -const nnp_status_invalid_transform_strategy = (UInt32)(17) -const nnp_status_invalid_output_subsampling = (UInt32)(13) -const nnp_status_invalid_activation = (UInt32)(14) -const nnp_status_invalid_activation_parameters = (UInt32)(15) -const nnp_status_unsupported_input_size = (UInt32)(20) -const nnp_status_unsupported_input_stride = (UInt32)(21) -const nnp_status_unsupported_input_padding = (UInt32)(22) -const nnp_status_unsupported_kernel_size = (UInt32)(23) -const nnp_status_unsupported_pooling_size = (UInt32)(24) -const nnp_status_unsupported_pooling_stride = (UInt32)(25) -const nnp_status_unsupported_algorithm = (UInt32)(26) -const nnp_status_unsupported_transform_strategy = (UInt32)(57) -const nnp_status_unsupported_activation = (UInt32)(28) -const nnp_status_unsupported_activation_parameters = (UInt32)(29) -const nnp_status_uninitialized = (UInt32)(50) -const nnp_status_unsupported_hardware = (UInt32)(51) -const nnp_status_out_of_memory = (UInt32)(52) -const nnp_status_insufficient_buffer = (UInt32)(53) -const nnp_status_misaligned_buffer = (UInt32)(54) - -const nnp_activation = UInt32 -const nnp_activation_identity = (UInt32)(0) -const nnp_activation_relu = (UInt32)(1) - -const nnp_convolution_algorithm = UInt32 -const nnp_convolution_algorithm_auto = (UInt32)(0) -const nnp_convolution_algorithm_ft8x8 = (UInt32)(1) -const nnp_convolution_algorithm_ft16x16 = (UInt32)(2) -const nnp_convolution_algorithm_wt8x8 = (UInt32)(3) -const nnp_convolution_algorithm_implicit_gemm = (UInt32)(4) -const nnp_convolution_algorithm_direct = (UInt32)(5) -const nnp_convolution_algorithm_wt8x8_fp16 = (UInt32)(6) - -const nnp_convolution_transform_strategy = UInt32 -const nnp_convolution_transform_strategy_compute = (UInt32)(1) -const nnp_convolution_transform_strategy_precompute = (UInt32)(2) -const nnp_convolution_transform_strategy_reuse = (UInt32)(3) - -const pthreadpool_t = Ptr{Nothing} - -mutable struct nnp_size - width::Csize_t - height::Csize_t - nnp_size() = new(Csize_t(0), Csize_t(0)) - nnp_size(w, h) = new(Csize_t(w), Csize_t(h)) -end - -Base.unsafe_convert(::Type{Ptr{nnp_size}}, a::nnp_size) = Ptr{a} - -mutable struct nnp_padding - top::Csize_t - right::Csize_t - bottom::Csize_t - left::Csize_t - nnp_padding() = new(Csize_t(0), Csize_t(0), Csize_t(0), Csize_t(0)) - nnp_padding(val) = new(Csize_t(val), Csize_t(val), Csize_t(val), Csize_t(val)) - nnp_padding(t, r, b, l) = new(Csize_t(t), Csize_t(r), Csize_t(b), Csize_t(l)) -end - -Base.unsafe_convert(::Type{Ptr{nnp_padding}}, a::nnp_padding) = Ptr{a} - -mutable struct nnp_profile - total::Cdouble - input_transform::Cdouble - kernel_transform::Cdouble - output_transform::Cdouble - block_multiplication::Cdouble - nnp_profile() = new(Cdouble(0.0), Cdouble(0.0), Cdouble(0.0), Cdouble(0.0), Cdouble(0.0)) - nnp_profile(t, it, kt, ot, bm) = new(Cdouble(t), Cdouble(it), Cdouble(kt), Cdouble(ot), Cdouble(bm)) -end - -Base.unsafe_convert(::Type{Ptr{nnp_profile}}, a::nnp_profile) = Ptr{a} diff --git a/src/nnpack/performance.jl b/src/nnpack/performance.jl deleted file mode 100644 index 24abdb411..000000000 --- a/src/nnpack/performance.jl +++ /dev/null @@ -1,31 +0,0 @@ -function select_threadpool(cdims::DenseConvDims, batch_size::Int) - inp_size = input_size(cdims)[1] - if batch_size >= 32 - return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - elseif batch_size >= 16 && inp_size >= 64 - return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - elseif inp_size <= 32 - return C_NULL - elseif inp_size >= 128 - return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - elseif inp_size * batch_size >= 256 - return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - end - return C_NULL -end - -function select_threadpool(pdims::PoolDims, batch_size::Int) - inp_size = input_size(pdims)[1] - if batch_size >= 32 - return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - elseif batch_size >= 16 && inp_size >= 64 - return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - elseif inp_size <= 32 - return C_NULL - elseif inp_size >= 128 - return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - elseif inp_size * batch_size >= 256 - return shared_threadpool_dict[Int(NNPACK_CPU_THREADS)][] - end - return C_NULL -end diff --git a/src/pooling.jl b/src/pooling.jl index 1cf666f54..6caa1045d 100644 --- a/src/pooling.jl +++ b/src/pooling.jl @@ -107,7 +107,7 @@ end # Finally, let's generate auto-allocating versions of all our functions, for all backends: -for backend in (Symbol(), :_direct, :_nnpack) +for backend in (Symbol(), :_direct) # First make auto-allocating versions of the basic pooling calls: for name in (:maxpool, :meanpool, :lpnormpool) @eval begin @@ -132,16 +132,6 @@ for backend in (Symbol(), :_direct, :_nnpack) end end -## Use NNPACK if it is available and operation is supported. -## The corresponding gradient is not available in NNPACK -## Commented out due to #210 -# if is_nnpack_available() -# function maxpool(x::Array{Float32, 4}, pdims::PoolDims{2, K, S, P, (1, 1)}; kwargs...) where {T, K, S, P} -# func = nnpack_supported_operation(pdims) ? maxpool_nnpack : maxpool_direct -# return func(x, pdims; kwargs...) -# end -# end - expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) diff --git a/test/conv.jl b/test/conv.jl index 492de2cc7..badb2c5f0 100644 --- a/test/conv.jl +++ b/test/conv.jl @@ -277,13 +277,7 @@ ddims(x) = dropdims(x, dims=(ndims(x)-1, ndims(x))) w = reshape(Float64[1:prod(size(dw));], size(dw)..., 1, 1) convs = [NNlib.conv, NNlib.conv_im2col, NNlib.conv_direct,] - NNlib.is_nnpack_available() && push!(convs, NNlib.conv_nnpack) for conv in convs - if NNlib.is_nnpack_available() - if conv == NNlib.conv_nnpack && !NNlib.nnpack_supported_operation(DenseConvDims(x, w)) - continue - end - end @testset "$(conv)" begin cdims = DenseConvDims(x, w) # First, your basic convolution with no parameters @@ -313,13 +307,7 @@ ddims(x) = dropdims(x, dims=(ndims(x)-1, ndims(x))) # Test all in-place implementations/interfaces convs = [NNlib.conv!, NNlib.conv_im2col!, NNlib.conv_direct!,] - NNlib.is_nnpack_available() && push!(convs, NNlib.conv_nnpack!) for conv! in convs - if NNlib.is_nnpack_available() - if conv! == NNlib.conv_nnpack! && !NNlib.nnpack_supported_operation(DenseConvDims(x, w)) - continue - end - end α, β = 2e0, -1e0 @testset "$(conv!)" begin @@ -470,13 +458,7 @@ end w = reshape(complex.(Float64[1:4;] .+ 2, Float64[1:4;] .+ 3), 1, 4, 1) cdims = DenseConvDims(x, w) convs = [NNlib.conv, NNlib.conv_im2col, NNlib.conv_direct,] - NNlib.is_nnpack_available() && push!(convs, NNlib.conv_nnpack) for conv in convs - if NNlib.is_nnpack_available() - if conv == NNlib.conv_nnpack && !NNlib.nnpack_supported_operation(cdims) - continue - end - end @testset "$(conv)" begin @test isapprox(ddims(conv(x, w, cdims)), [transpose(vec(w)) * vec(x)], rtol = 1.0e-7) end diff --git a/test/inference.jl b/test/inference.jl index 31785597c..9b3e74db8 100644 --- a/test/inference.jl +++ b/test/inference.jl @@ -3,9 +3,6 @@ import NNlib: conv_direct, conv_im2col, channels_in, channels_out @testset "Conv Inference" begin for T in (Float32, Float64) impl = [conv, conv_direct, conv_im2col] - if NNlib.is_nnpack_available() && T == Float32 - push!(impl, NNlib.conv_nnpack) - end x = rand(T, 10, 10, 3, 2) w = rand(T, 3, 3, 3, 1) diff --git a/test/pooling.jl b/test/pooling.jl index 97d014d52..f9d57ade7 100644 --- a/test/pooling.jl +++ b/test/pooling.jl @@ -869,16 +869,6 @@ maxpool_answer_nature = Dict( @test y_maxpool_dir ≈ y_maxpool atol = 1e-6 @test isapprox(config.dx_maxpool, NNlib.∇maxpool_direct(dy, y_maxpool_dir, x, pdims), rtol=1e-5) @test isapprox(config.dx_meanpool, NNlib.∇meanpool_direct(dy, y_meanpool_dir, x, pdims), rtol=1e-5) - - # CHECK NNPACK - if NNlib.is_nnpack_available() && T == Float32 - if NNlib.nnpack_supported_operation(pdims) - y_maxpool_nnp = NNlib.maxpool_nnpack(x, pdims) - @test y_maxpool_nnp ≈ y_maxpool atol = 1e-6 - # NNPACK maxpool gradient still missing - # @test isapprox(config.dx_maxpool, NNlib.∇maxpool_nnpack(dy, y_maxpool_nnp, config.x, pdims), rtol=1e-5) - end - end end for (rank_name, config_dict) in maxpool_answer_nature @@ -940,16 +930,6 @@ maxpool_answer_nature = Dict( @test RD.gradient(_x -> only(maxpool(_x,(2,2))), x)[:,:,1,1] == [0 0; 0 1] @test only(meanpool(x, (2,2))) == 2.5 @test all(==(0.25), RD.gradient(_x -> only(meanpool(_x,(2,2))), x)) - - # if NNlib.is_nnpack_available() - # if NNlib.nnpack_supported_operation(pdims1) - # @test NNlib.maxpool_nnpack(x, pdims1) isa Array{Float32, 4} - # end - # if NNlib.nnpack_supported_operation(pdims2) - # print("you should not see this") - # @test NNlib.maxpool_nnpack(x, pdims2) isa Array{Float32, 4} - # end - # end end @testset "AutoDiff: spatial_rank=$spatial_rank" for spatial_rank in (1, 2) From 4e957ada078048f681b0b563e91ed5e1d34861aa Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Wed, 28 Aug 2024 23:26:24 -0400 Subject: [PATCH 2/4] refactor: move ForwardDiff to an ext --- Project.toml | 3 +++ ext/NNlibForwardDiffExt.jl | 9 +++++++++ src/NNlib.jl | 5 ----- 3 files changed, 12 insertions(+), 5 deletions(-) create mode 100644 ext/NNlibForwardDiffExt.jl diff --git a/Project.toml b/Project.toml index f19d965b1..4033d2e9b 100644 --- a/Project.toml +++ b/Project.toml @@ -19,6 +19,7 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [extensions] @@ -27,6 +28,7 @@ NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] NNlibCUDAExt = "CUDA" NNlibEnzymeCoreExt = "EnzymeCore" NNlibFFTWExt = "FFTW" +NNlibForwardDiffExt = "ForwardDiff" [compat] AMDGPU = "0.9.4, 1" @@ -36,6 +38,7 @@ CUDA = "4, 5" ChainRulesCore = "1.13" EnzymeCore = "0.5, 0.6, 0.7" FFTW = "1.8.0" +ForwardDiff = "0.10.36" GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" LinearAlgebra = "<0.0.1, 1" diff --git a/ext/NNlibForwardDiffExt.jl b/ext/NNlibForwardDiffExt.jl new file mode 100644 index 000000000..84351bc17 --- /dev/null +++ b/ext/NNlibForwardDiffExt.jl @@ -0,0 +1,9 @@ +module NNlibForwardDiffExt + +using ForwardDiff: ForwardDiff +using NNlib: NNlib + +NNlib.within_gradient(x::ForwardDiff.Dual) = true +NNlib.within_gradient(x::AbstractArray{<:ForwardDiff.Dual}) = true + +end diff --git a/src/NNlib.jl b/src/NNlib.jl index c1ac58933..4ab643157 100644 --- a/src/NNlib.jl +++ b/src/NNlib.jl @@ -82,11 +82,6 @@ export upsample_nearest, ∇upsample_nearest, include("gather.jl") include("scatter.jl") include("utils.jl") -@init @require ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" begin - using .ForwardDiff - within_gradient(x::ForwardDiff.Dual) = true - within_gradient(x::AbstractArray{<:ForwardDiff.Dual}) = true -end include("sampling.jl") include("functions.jl") From 36e3b96082687a3d855d699670bc77b5c20f8f7a Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Wed, 28 Aug 2024 23:27:08 -0400 Subject: [PATCH 3/4] fix: remove uses of Requires --- Project.toml | 2 -- src/NNlib.jl | 1 - 2 files changed, 3 deletions(-) diff --git a/Project.toml b/Project.toml index 4033d2e9b..bf2e68e50 100644 --- a/Project.toml +++ b/Project.toml @@ -11,7 +11,6 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Requires = "ae029012-a4dd-5104-9daa-d747884805df" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [weakdeps] @@ -44,7 +43,6 @@ KernelAbstractions = "0.9.2" LinearAlgebra = "<0.0.1, 1" Pkg = "<0.0.1, 1" Random = "<0.0.1, 1" -Requires = "1.0" Statistics = "1" cuDNN = "1" julia = "1.9" diff --git a/src/NNlib.jl b/src/NNlib.jl index 4ab643157..2f2c95e69 100644 --- a/src/NNlib.jl +++ b/src/NNlib.jl @@ -14,7 +14,6 @@ using LinearAlgebra.BLAS: @blasfunc, BlasInt using LinearAlgebra: AdjOrTransAbsMat, Adjoint, BlasFloat, Transpose using Pkg using Random -using Requires using Statistics using Statistics: mean From 393e8307047e864b582a3f6742ba463cf0015e62 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Wed, 28 Aug 2024 23:41:42 -0400 Subject: [PATCH 4/4] fix: remove usage of Pkg --- Project.toml | 2 -- src/NNlib.jl | 1 - 2 files changed, 3 deletions(-) diff --git a/Project.toml b/Project.toml index bf2e68e50..e9880b23a 100644 --- a/Project.toml +++ b/Project.toml @@ -9,7 +9,6 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -41,7 +40,6 @@ ForwardDiff = "0.10.36" GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" LinearAlgebra = "<0.0.1, 1" -Pkg = "<0.0.1, 1" Random = "<0.0.1, 1" Statistics = "1" cuDNN = "1" diff --git a/src/NNlib.jl b/src/NNlib.jl index 2f2c95e69..687206fca 100644 --- a/src/NNlib.jl +++ b/src/NNlib.jl @@ -12,7 +12,6 @@ using KernelAbstractions: @atomic using LinearAlgebra using LinearAlgebra.BLAS: @blasfunc, BlasInt using LinearAlgebra: AdjOrTransAbsMat, Adjoint, BlasFloat, Transpose -using Pkg using Random using Statistics using Statistics: mean