
Commit

Merge pull request #515 from JuliaGPU/tb/cudnn_broadcast
Fix CUDNN-optimized activation broadcasts
maleadt authored Oct 29, 2020
2 parents 9465f3b + beffb7d commit 836de1d
Showing 4 changed files with 58 additions and 23 deletions.
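
For context (not part of the commit itself): before this change, the CUDNN-optimized activation broadcasts mutated their argument, so `tanh.(x)` silently overwrote `x`. The sketch below mirrors the new test added in test/cudnn.jl and shows the intended semantics after the fix; it assumes a CUDA-capable setup with CUDA.jl loaded.

    using CUDA

    x = CUDA.ones(Float32, 4)
    y = tanh.(x)                 # out of place: allocates y, leaves x untouched
    @assert Array(x) == ones(Float32, 4)
    @assert Array(y) ≈ fill(tanh(1f0), 4)

    x .= tanh.(y)                # overwrites x only when explicitly requested with .=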
9 changes: 6 additions & 3 deletions lib/cudnn/activation.jl
@@ -22,16 +22,19 @@ end

function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}=x;
mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
coeff=0.0, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0) where {T,N}
coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=true,
beta=false) where {T,N}
cudnnActivationForward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
scalingParameter(T, alpha), TensorDesc(x), x,
scalingParameter(T, beta ), TensorDesc(y), y)
return y
end

function cudnnActivationBackward(x::DenseCuArray{T,N}, dx::DenseCuArray{T,N}, y::DenseCuArray{T,N}, dy::DenseCuArray{T,N}=dx;
function cudnnActivationBackward(x::DenseCuArray{T,N}, dx::DenseCuArray{T,N},
y::DenseCuArray{T,N}, dy::DenseCuArray{T,N}=dx;
mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
coeff=0.0, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0) where {T,N}
coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1,
beta=false) where {T,N}
cudnnActivationBackward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
scalingParameter(T, alpha), TensorDesc( y), y,
TensorDesc(dy), dy,
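A note on the changed defaults, here and in lib/cudnn/tensor.jl below: `alpha`, `beta`, and `coeff` now default to the Bool literals `true`/`false` instead of `1`/`0.0`. In Julia, Bool is the narrowest numeric type, so these defaults act as neutral one/zero values that never widen the element type they are combined with; as before, the values are still routed through `scalingParameter(T, ...)`. A minimal, CUDNN-independent illustration:

    promote_type(Bool, Float32)   # Float32 -- a Bool default never widens the result type
    true  * 1.5f0                 # 1.5f0   -- true behaves as an exact one
    false * 1.5f0                 # 0.0f0   -- false behaves as an exact zero
    false * NaN32                 # 0.0f0   -- false acts as a "strong zero"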
53 changes: 35 additions & 18 deletions lib/cudnn/nnlib.jl
@@ -233,24 +233,41 @@ meanpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDN

# Activation

# in-place for x
Base.broadcasted(::typeof(NNlib.σ), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
(cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_SIGMOID, coeff=0.0); return x)

Base.broadcasted(::typeof(NNlib.relu), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
(cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_RELU, coeff=0.0); return x)

Base.broadcasted(::typeof(NNlib.tanh), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
(cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_TANH, coeff=0.0); return x)

Base.broadcasted(::typeof(NNlib.relu6), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
(cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_CLIPPED_RELU, coeff=6.0); return x)
using Base.Broadcast

for (f, op) in [
CUDA.tanh => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
mode=CUDNN_ACTIVATION_TANH),
NNlib.σ => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
mode=CUDNN_ACTIVATION_SIGMOID),
NNlib.elu => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
mode=CUDNN_ACTIVATION_ELU),
NNlib.relu => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
mode=CUDNN_ACTIVATION_RELU),
NNlib.relu6 => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
mode=CUDNN_ACTIVATION_CLIPPED_RELU,
coeff=6.0),
NNlib.leakyrelu => (src,dst)->cudnnOpTensor(CUDNN_OP_TENSOR_MAX, reshape4D(src),
reshape4D(src), reshape4D(dst),
alpha1=0.01)]
@eval begin
# in-place
function Base.materialize!(dst::DenseCuArray{<:CUDNNFloat},
bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}})
$op(bc.args[1], dst)
return dst
end

Base.broadcasted(::typeof(NNlib.elu), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
(cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_ELU, coeff=1.0); return x)
# out of place
function Base.materialize(bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}})
ElType = Broadcast.combine_eltypes(bc.f, bc.args)
dst = similar(bc, ElType)
$op(bc.args[1], dst)
return dst
end
end
end

# CUDNN_ACTIVATION_IDENTITY does not work with cudnnActivationForward
Base.broadcasted(::typeof(NNlib.identity), x::DenseCuArray{T}) where {T<:CUDNNFloat} = x

Base.broadcasted(::typeof(NNlib.leakyrelu), x::DenseCuArray{T}, a=T(0.01)) where {T<:CUDNNFloat} =
(cudnnOpTensor(CUDNN_OP_TENSOR_MAX, reshape4D(x), reshape4D(x), reshape4D(x), alpha1=a); return x)
# FIXME: put this optimization in GPUArrays' `copyto!` (like Base.Broadcast's `copyto!`)
Base.broadcasted(::typeof(identity), x::DenseCuArray{T}) where {T<:CUDNNFloat} = x
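
For orientation on the machinery being overloaded above (not part of the commit): Julia lowers dot syntax to `Broadcast.broadcasted` followed by `Base.materialize` or `Base.materialize!`. Hooking those two entry points, instead of `Base.broadcasted` as before, is what lets `tanh.(x)` allocate its result while `y .= tanh.(x)` still takes the in-place path. A CPU-only sketch of that lowering, with no CUDNN involved:

    using Base.Broadcast: broadcasted, materialize, materialize!

    x  = rand(Float32, 4)
    bc = broadcasted(tanh, x)              # lazy Broadcasted container; x untouched
    y  = materialize(bc)                   # what `y = tanh.(x)` calls: allocates y
    z  = similar(x)
    materialize!(z, broadcasted(tanh, x))  # what `z .= tanh.(x)` calls: fills z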
4 changes: 2 additions & 2 deletions lib/cudnn/tensor.jl
@@ -62,7 +62,7 @@ OpTensorDesc(op::cudnnOpTensorOp_t, a::DenseCuArray) = OpTensorDesc(op, eltype(a

function cudnnOpTensor(op::cudnnOpTensorOp_t,
A::DenseCuArray{T,N}, B::DenseCuArray{T,N}, C::DenseCuArray{T,N};
alpha1=1, alpha2=1, beta=0) where {T,N}
alpha1=true, alpha2=true, beta=false) where {T,N}
cudnnOpTensor(handle(), OpTensorDesc(op, T),
scalingParameter(T, alpha1), TensorDesc(A), A,
scalingParameter(T, alpha2), TensorDesc(B), B,
@@ -113,7 +113,7 @@ end

function cudnnReduceTensor(op::cudnnReduceTensorOp_t,
A::DenseCuArray{T,N}, C::DenseCuArray{T,N};
alpha=1, beta=0) where {T,N}
alpha=true, beta=false) where {T,N}
# indices = Array{UInt64, 1}(undef, N)
indicesSizeInBytes = cudnnGetReductionIndicesSize(op, A, C)
@workspace size=@argout(
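Relatedly, the `NNlib.leakyrelu` entry in the nnlib.jl table above goes through `cudnnOpTensor` with `CUDNN_OP_TENSOR_MAX`. Per the cuDNN documentation, that routine computes roughly C = op(alpha1*A, alpha2*B) + beta*C, so with A = B = x, alpha1 = 0.01, alpha2 = 1, and beta = 0 it yields max(0.01x, x), i.e. leakyrelu(x, 0.01). A plain-Julia reference for the element-wise formula (this only mirrors the computation, it does not call cuDNN):

    # element-wise reference for the MAX op-tensor mapping
    optensor_max_ref(a, b; alpha1=true, alpha2=true) = max(alpha1 * a, alpha2 * b)
    leaky_ref(x, slope=0.01f0) = optensor_max_ref(x, x; alpha1=slope)

    leaky_ref(-2f0)   # -0.02f0
    leaky_ref( 3f0)   #  3.0f0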
15 changes: 15 additions & 0 deletions test/cudnn.jl
@@ -82,9 +82,24 @@ end
@test testf(x -> f.(x), rand(Float64, dims))
end
end

# softplus does not give `Inf` for large arguments
x = CuArray([1000.])
@test all(softplus.(x) .== x)

# optimized activation overwrote inputs
let
x = CUDA.ones(1)
@test Array(x) == [1f0]
tanh.(x)
@test Array(x) == [1f0]
y = tanh.(x)
@test Array(x) == [1f0]
@test Array(y) == [tanh(1f0)]
x .= tanh.(y)
@test Array(y) == [tanh(1f0)]
@test Array(x) == [tanh(tanh(1f0))]
end
end

@testset "Batchnorm" begin
