From b0ce77aa36b4ec4e88eec424fffbae860027923e Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 3 Aug 2017 16:42:37 +0100 Subject: [PATCH 1/7] overall testsets --- test/runtests.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index c661e5c..6837419 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,6 +3,8 @@ using CUBLAS using CUDArt using Base.Test +@testset "CUBLAS" begin + m = 20 n = 35 k = 13 @@ -15,6 +17,8 @@ end # level 1 tests # ################# +@testset "Level 1" begin + @testset "blascopy!" begin @testset for elty in [Float32, Float64, Complex64, Complex128] A = convert(Vector{elty}, collect(1:m)) @@ -244,10 +248,14 @@ end end end +end # level 1 testset + ################# # level 2 tests # ################# +@testset "Level 2" begin + @testset "gemv!" begin @testset for elty in [Float32, Float64, Complex64, Complex128] alpha = convert(elty,1) @@ -1711,3 +1719,6 @@ end end end +end # level 2 testset + +end From c63faa21773df939806faeaa0c7d48a9e4e5dc17 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 3 Aug 2017 16:47:43 +0100 Subject: [PATCH 2/7] scale doesn't exist in Base --- src/highlevel.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/highlevel.jl b/src/highlevel.jl index 51b1954..df6853d 100644 --- a/src/highlevel.jl +++ b/src/highlevel.jl @@ -1,6 +1,6 @@ import Base.Operators.(*) -import Base: scale!, scale, norm, vecdot +import Base: scale!, norm, vecdot import Base: A_mul_B!, At_mul_B, Ac_mul_B, A_mul_Bc, At_mul_Bt, Ac_mul_Bc, At_mul_Bt, At_mul_B!, Ac_mul_B!, A_mul_Bc!, At_mul_Bt!, Ac_mul_Bc!, At_mul_Bt! @@ -17,7 +17,6 @@ cublas_size(t::Char, M::CudaVecOrMat) = (size(M, t=='N' ? 1:2), size(M, t=='N' ? # SCAL ####### scale!{T<:CublasFloat}(x::CudaArray{T}, k::Number) = CUBLAS.scal!(length(x), k, x, 1) -scale{T<:CublasFloat}(x::CudaArray{T}, k::Number) = CUBLAS.scal!(length(x), k, copy(x), 1) ####### # DOT From 500e88df8916d7ead721e431df97ea2c572360f3 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 3 Aug 2017 16:48:28 +0100 Subject: [PATCH 3/7] trim whitespace --- src/blas.jl | 2 +- src/highlevel.jl | 2 -- src/libcublas.jl | 2 +- src/libcublas_types.jl | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/blas.jl b/src/blas.jl index 1d07303..9371b1f 100644 --- a/src/blas.jl +++ b/src/blas.jl @@ -1614,7 +1614,7 @@ for (fname, elty) in Cptrs = CudaArray(map( (x) -> pointer(x).ptr, C )) info = CudaArray(zeros(Cint,length(A))) statuscheck(ccall(($(string(fname)),libcublas), cublasStatus_t, - (cublasHandle_t, Cint, Ptr{Ptr{$elty}}, Cint, + (cublasHandle_t, Cint, Ptr{Ptr{$elty}}, Cint, Ptr{Cint}, Ptr{Ptr{$elty}}, Cint, Ptr{Cint}, Cint), cublashandle[1], n, Aptrs, lda, pivotArray, Cptrs, ldc, info, length(A))) diff --git a/src/highlevel.jl b/src/highlevel.jl index df6853d..8071e02 100644 --- a/src/highlevel.jl +++ b/src/highlevel.jl @@ -175,5 +175,3 @@ end function Ac_mul_Bc{T,S}(A::CudaMatrix{T}, B::CudaMatrix{S}) Ac_mul_Bc!(similar(B, T, (size(A,2), size(B,1))), A, B) end - - diff --git a/src/libcublas.jl b/src/libcublas.jl index 45848fb..ea97816 100644 --- a/src/libcublas.jl +++ b/src/libcublas.jl @@ -773,4 +773,4 @@ try catch exception Base.show_backtrace(STDOUT, backtrace()); println(); -end \ No newline at end of file +end diff --git a/src/libcublas_types.jl b/src/libcublas_types.jl index 8856d65..cd587e5 100644 --- a/src/libcublas_types.jl +++ b/src/libcublas_types.jl @@ -100,4 +100,4 @@ try catch exception Base.show_backtrace(STDOUT, backtrace()); 
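# If whatever the `try` block above attempted throws, this `catch` branch
# prints the captured backtrace to STDOUT via `show_backtrace`, and the
# `println()` below simply terminates that output line.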
println(); -end \ No newline at end of file +end From 6d8247dda14eaa043751cad6b26c16b23bc41eec Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 3 Aug 2017 17:36:20 +0100 Subject: [PATCH 4/7] remove cudaStream_t dependency --- src/CUBLAS.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/CUBLAS.jl b/src/CUBLAS.jl index 63455a7..2b23f4c 100644 --- a/src/CUBLAS.jl +++ b/src/CUBLAS.jl @@ -12,7 +12,8 @@ module CUBLAS importall Base.LinAlg.BLAS using CUDArt -using CUDArt.rt.cudaStream_t + +const cudaStream_t = Ptr{Void} const BlasChar = Char #import Base.LinAlg.BlasChar import Base.one From c2da846cb061370bca18a31203ab1f9446a22698 Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 3 Aug 2017 18:20:53 +0100 Subject: [PATCH 5/7] cudart -> cudadrv --- REQUIRE | 2 +- src/CUBLAS.jl | 4 +- src/blas.jl | 390 ++++++++++++++++---------------- src/highlevel.jl | 68 +++--- test/runtests.jl | 562 ++++++++++++++++++++++++----------------------- 5 files changed, 515 insertions(+), 511 deletions(-) diff --git a/REQUIRE b/REQUIRE index 0cf5fd9..f721415 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,2 +1,2 @@ julia 0.5 -CUDArt 0.3.0 +CUDAdrv 0.5.1 diff --git a/src/CUBLAS.jl b/src/CUBLAS.jl index 2b23f4c..4ca81ac 100644 --- a/src/CUBLAS.jl +++ b/src/CUBLAS.jl @@ -11,7 +11,9 @@ module CUBLAS importall Base.LinAlg.BLAS -using CUDArt +using CUDAdrv: OwnedPtr, CuArray, CuVector, CuMatrix + +CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}} const cudaStream_t = Ptr{Void} diff --git a/src/blas.jl b/src/blas.jl index 9371b1f..aa2bfa2 100644 --- a/src/blas.jl +++ b/src/blas.jl @@ -66,9 +66,9 @@ for (fname, elty) in ((:cublasDcopy_v2,:Float64), @eval begin # SUBROUTINE DCOPY(N,DX,INCX,DY,INCY) function blascopy!(n::Integer, - DX::Union{CudaPtr{$elty},CudaArray{$elty}}, + DX::Union{OwnedPtr{$elty},CuArray{$elty}}, incx::Integer, - DY::Union{CudaPtr{$elty},CudaArray{$elty}}, + DY::Union{OwnedPtr{$elty},CuArray{$elty}}, incy::Integer) statuscheck(ccall(($(string(fname)), libcublas), cublasStatus_t, (cublasHandle_t, Cint, Ptr{$elty}, Cint, @@ -88,7 +88,7 @@ for (fname, elty) in ((:cublasDscal_v2,:Float64), # SUBROUTINE DSCAL(N,DA,DX,INCX) function scal!(n::Integer, DA::$elty, - DX::Union{CudaPtr{$elty},CudaArray{$elty}}, + DX::Union{OwnedPtr{$elty},CuArray{$elty}}, incx::Integer) statuscheck(ccall(($(string(fname)), libcublas), cublasStatus_t, (cublasHandle_t, Cint, Ptr{$elty}, Ptr{$elty}, @@ -99,7 +99,7 @@ for (fname, elty) in ((:cublasDscal_v2,:Float64), end end # TODO: uncomment and test the following method -#scal{T}(n::Integer, DA::T, DX::CudaArray{T}, incx::Integer) = scal!(n, DA, copy(DX), incx) +#scal{T}(n::Integer, DA::T, DX::CuArray{T}, incx::Integer) = scal!(n, DA, copy(DX), incx) # In case DX is complex, and DA is real, use dscal/sscal to save flops for (fname, elty, celty) in ((:cublasSscal_v2, :Float32, :Complex64), (:cublasDscal_v2, :Float64, :Complex128)) @@ -107,7 +107,7 @@ for (fname, elty, celty) in ((:cublasSscal_v2, :Float32, :Complex64), # SUBROUTINE DSCAL(N,DA,DX,INCX) function scal!(n::Integer, DA::$elty, - DX::Union{CudaPtr{$celty},CudaArray{$celty}}, + DX::Union{OwnedPtr{$celty},CuArray{$celty}}, incx::Integer) #DY = reinterpret($elty,DX,(2*n,)) #$(cublascall(fname))(cublashandle[1],2*n,[DA],DY,incx) @@ -135,9 +135,9 @@ for (jname, fname, elty) in ((:dot,:cublasDdot_v2,:Float64), (:dotu,:cublasCdotu_v2,:Complex64)) @eval begin function $jname(n::Integer, - DX::Union{CudaPtr{$elty},CudaArray{$elty}}, + DX::Union{OwnedPtr{$elty},CuArray{$elty}}, incx::Integer, - 
DY::Union{CudaPtr{$elty},CudaArray{$elty}}, + DY::Union{OwnedPtr{$elty},CuArray{$elty}}, incy::Integer) result = Array{$elty}(1) statuscheck(ccall(($(string(fname)), libcublas), cublasStatus_t, @@ -149,17 +149,17 @@ for (jname, fname, elty) in ((:dot,:cublasDdot_v2,:Float64), end end # TODO: inspect blas.jl in julia to correct types here (dot{c,u}) -function dot{T<:Union{Float32,Float64}}(DX::CudaArray{T}, DY::CudaArray{T}) +function dot{T<:Union{Float32,Float64}}(DX::CuArray{T}, DY::CuArray{T}) n = length(DX) n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) dot(n, DX, 1, DY, 1) end -function dotc{T<:Union{Complex64,Complex128}}(DX::CudaArray{T}, DY::CudaArray{T}) +function dotc{T<:Union{Complex64,Complex128}}(DX::CuArray{T}, DY::CuArray{T}) n = length(DX) n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) dotc(n, DX, 1, DY, 1) end -function dotu{T<:Union{Complex64,Complex128}}(DX::CudaArray{T}, DY::CudaArray{T}) +function dotu{T<:Union{Complex64,Complex128}}(DX::CuArray{T}, DY::CuArray{T}) n = length(DX) n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) dotu(n, DX, 1, DY, 1) @@ -173,7 +173,7 @@ for (fname, elty, ret_type) in ((:cublasDnrm2_v2,:Float64,:Float64), @eval begin # SUBROUTINE DNRM2(N,X,INCX) function nrm2(n::Integer, - X::Union{CudaPtr{$elty},CudaArray{$elty}}, + X::Union{OwnedPtr{$elty},CuArray{$elty}}, incx::Integer) result = Array{$ret_type}(1) statuscheck(ccall(($(string(fname)), libcublas), cublasStatus_t, @@ -184,9 +184,9 @@ for (fname, elty, ret_type) in ((:cublasDnrm2_v2,:Float64,:Float64), end end end -# TODO: consider CudaVector and CudaStridedVector +# TODO: consider CuVector and CudaStridedVector #nrm2(x::StridedVector) = nrm2(length(x), x, stride(x,1)) -nrm2(x::CudaArray) = nrm2(length(x), x, 1) +nrm2(x::CuArray) = nrm2(length(x), x, 1) ## asum for (fname, elty, ret_type) in ((:cublasDasum_v2,:Float64,:Float64), @@ -196,7 +196,7 @@ for (fname, elty, ret_type) in ((:cublasDasum_v2,:Float64,:Float64), @eval begin # SUBROUTINE ASUM(N, X, INCX) function asum(n::Integer, - X::Union{CudaPtr{$elty},CudaArray{$elty}}, + X::Union{OwnedPtr{$elty},CuArray{$elty}}, incx::Integer) result = Array{$ret_type}(1) statuscheck(ccall(($(string(fname)), libcublas), cublasStatus_t, @@ -208,7 +208,7 @@ for (fname, elty, ret_type) in ((:cublasDasum_v2,:Float64,:Float64), end end #asum(x::StridedVector) = asum(length(x), x, stride(x,1)) -asum(x::CudaArray) = asum(length(x), pointer(x), 1) +asum(x::CuArray) = asum(length(x), pointer(x), 1) ## axpy for (fname, elty) in ((:cublasDaxpy_v2,:Float64), @@ -228,9 +228,9 @@ for (fname, elty) in ((:cublasDaxpy_v2,:Float64), # int incy); function axpy!(n::Integer, alpha::($elty), - dx::Union{CudaPtr{$elty},CudaArray{$elty}}, + dx::Union{OwnedPtr{$elty},CuArray{$elty}}, incx::Integer, - dy::Union{CudaPtr{$elty},CudaArray{$elty}}, + dy::Union{OwnedPtr{$elty},CuArray{$elty}}, incy::Integer) statuscheck(ccall(($(string(fname)), libcublas), cublasStatus_t, (cublasHandle_t, Cint, Ptr{$elty}, Ptr{$elty}, @@ -243,16 +243,16 @@ for (fname, elty) in ((:cublasDaxpy_v2,:Float64), end function axpy!{T<:CublasFloat,Ta<:Number}(alpha::Ta, - x::CudaArray{T}, - y::CudaArray{T}) + x::CuArray{T}, + y::CuArray{T}) length(x)==length(y) || throw(DimensionMismatch("")) axpy!(length(x), convert(T,alpha), x, 1, y, 1) end function axpy!{T<:CublasFloat,Ta<:Number,Ti<:Integer}(alpha::Ta, - 
x::CudaArray{T}, + x::CuArray{T}, rx::Union{UnitRange{Ti},Range{Ti}}, - y::CudaArray{T}, + y::CuArray{T}, ry::Union{UnitRange{Ti},Range{Ti}}) length(rx)==length(ry) || throw(DimensionMismatch("")) if minimum(rx) < 1 || maximum(rx) > length(x) || minimum(ry) < 1 || maximum(ry) > length(y) @@ -271,7 +271,7 @@ for (fname, elty) in ((:cublasIdamax_v2,:Float64), (:cublasIcamax_v2,:Complex64)) @eval begin function iamax(n::Integer, - dx::Union{CudaPtr{$elty}, CudaArray{$elty}}, + dx::Union{OwnedPtr{$elty}, CuArray{$elty}}, incx::Integer) result = Array{Cint}(1) statuscheck(ccall(($(string(fname)), libcublas), cublasStatus_t, @@ -282,7 +282,7 @@ for (fname, elty) in ((:cublasIdamax_v2,:Float64), end end end -iamax(dx::CudaArray) = iamax(length(dx), dx, 1) +iamax(dx::CuArray) = iamax(length(dx), dx, 1) ## iamin # iamin is not in standard blas is a CUBLAS extension @@ -292,7 +292,7 @@ for (fname, elty) in ((:cublasIdamin_v2,:Float64), (:cublasIcamin_v2,:Complex64)) @eval begin function iamin(n::Integer, - dx::Union{CudaPtr{$elty}, CudaArray{$elty}}, + dx::Union{OwnedPtr{$elty}, CuArray{$elty}}, incx::Integer) result = Array{Cint}(1) statuscheck(ccall(($(string(fname)), libcublas), cublasStatus_t, @@ -303,7 +303,7 @@ for (fname, elty) in ((:cublasIdamin_v2,:Float64), end end end -iamin(dx::CudaArray) = iamin(length(dx), dx, 1) +iamin(dx::CuArray) = iamin(length(dx), dx, 1) # Level 2 ## mv @@ -323,10 +323,10 @@ for (fname, elty) in ((:cublasDgemv_v2,:Float64), # double *y, int incy) function gemv!(trans::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - X::CudaVector{$elty}, + A::CuMatrix{$elty}, + X::CuVector{$elty}, beta::($elty), - Y::CudaVector{$elty}) + Y::CuVector{$elty}) # handle trans cutrans = cublasop(trans) m,n = size(A) @@ -344,10 +344,10 @@ for (fname, elty) in ((:cublasDgemv_v2,:Float64), incy)) Y end - function gemv(trans::BlasChar, alpha::($elty), A::CudaMatrix{$elty}, X::CudaVector{$elty}) + function gemv(trans::BlasChar, alpha::($elty), A::CuMatrix{$elty}, X::CuVector{$elty}) gemv!(trans, alpha, A, X, zero($elty), similar(X, $elty, size(A, (trans == 'N' ? 1 : 2)))) end - function gemv(trans::BlasChar, A::CudaMatrix{$elty}, X::CudaVector{$elty}) + function gemv(trans::BlasChar, A::CuMatrix{$elty}, X::CuVector{$elty}) gemv!(trans, one($elty), A, X, zero($elty), similar(X, $elty, size(A, (trans == 'N' ? 1 : 2)))) end end @@ -370,10 +370,10 @@ for (fname, elty) in ((:cublasDgbmv_v2,:Float64), kl::Integer, ku::Integer, alpha::($elty), - A::CudaMatrix{$elty}, - x::CudaVector{$elty}, + A::CuMatrix{$elty}, + x::CuVector{$elty}, beta::($elty), - y::CudaVector{$elty}) + y::CuVector{$elty}) # handle trans cutrans = cublasop(trans) n = size(A,2) @@ -396,8 +396,8 @@ for (fname, elty) in ((:cublasDgbmv_v2,:Float64), kl::Integer, ku::Integer, alpha::($elty), - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) # TODO: fix gbmv bug in julia n = size(A,2) leny = trans == 'N' ? 
m : n @@ -407,8 +407,8 @@ for (fname, elty) in ((:cublasDgbmv_v2,:Float64), m::Integer, kl::Integer, ku::Integer, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) gbmv(trans, m, kl, ku, one($elty), A, x) end end @@ -428,10 +428,10 @@ for (fname, elty) in ((:cublasDsymv_v2,:Float64), # const double *beta, double *y, int incy) function symv!(uplo::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - x::CudaVector{$elty}, + A::CuMatrix{$elty}, + x::CuVector{$elty}, beta::($elty), - y::CudaVector{$elty}) + y::CuVector{$elty}) cuuplo = cublasfill(uplo) m, n = size(A) if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end @@ -448,10 +448,10 @@ for (fname, elty) in ((:cublasDsymv_v2,:Float64), A, lda, x, incx, [beta], y, incy)) y end - function symv(uplo::BlasChar, alpha::($elty), A::CudaMatrix{$elty}, x::CudaVector{$elty}) + function symv(uplo::BlasChar, alpha::($elty), A::CuMatrix{$elty}, x::CuVector{$elty}) symv!(uplo, alpha, A, x, zero($elty), similar(x)) end - function symv(uplo::BlasChar, A::CudaMatrix{$elty}, x::CudaVector{$elty}) + function symv(uplo::BlasChar, A::CuMatrix{$elty}, x::CuVector{$elty}) symv(uplo, one($elty), A, x) end end @@ -469,10 +469,10 @@ for (fname, elty) in ((:cublasZhemv_v2,:Complex128), # const cuComplex *beta, cuComplex *y, int incy) function hemv!(uplo::BlasChar, alpha::$elty, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}, + A::CuMatrix{$elty}, + x::CuVector{$elty}, beta::$elty, - y::CudaVector{$elty}) + y::CuVector{$elty}) # TODO: fix dimension check bug in julia cuuplo = cublasfill(uplo) m, n = size(A) @@ -490,12 +490,12 @@ for (fname, elty) in ((:cublasZhemv_v2,:Complex128), A, lda, x, incx, [beta], y, incy)) y end - function hemv(uplo::BlasChar, alpha::($elty), A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + function hemv(uplo::BlasChar, alpha::($elty), A::CuMatrix{$elty}, + x::CuVector{$elty}) hemv!(uplo, alpha, A, x, zero($elty), similar(x)) end - function hemv(uplo::BlasChar, A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + function hemv(uplo::BlasChar, A::CuMatrix{$elty}, + x::CuVector{$elty}) hemv(uplo, one($elty), A, x) end end @@ -515,10 +515,10 @@ for (fname, elty) in ((:cublasDsbmv_v2,:Float64), function sbmv!(uplo::BlasChar, k::Integer, alpha::($elty), - A::CudaMatrix{$elty}, - x::CudaVector{$elty}, + A::CuMatrix{$elty}, + x::CuVector{$elty}, beta::($elty), - y::CudaVector{$elty}) + y::CuVector{$elty}) cuuplo = cublasfill(uplo) m, n = size(A) #if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end @@ -537,12 +537,12 @@ for (fname, elty) in ((:cublasDsbmv_v2,:Float64), y end function sbmv(uplo::BlasChar, k::Integer, alpha::($elty), - A::CudaMatrix{$elty}, x::CudaVector{$elty}) + A::CuMatrix{$elty}, x::CuVector{$elty}) n = size(A,2) sbmv!(uplo, k, alpha, A, x, zero($elty), similar(x, $elty, n)) end - function sbmv(uplo::BlasChar, k::Integer, A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + function sbmv(uplo::BlasChar, k::Integer, A::CuMatrix{$elty}, + x::CuVector{$elty}) sbmv(uplo, k, one($elty), A, x) end end @@ -560,10 +560,10 @@ for (fname, elty) in ((:cublasZhbmv_v2,:Complex128), function hbmv!(uplo::BlasChar, k::Integer, alpha::($elty), - A::CudaMatrix{$elty}, - x::CudaVector{$elty}, + A::CuMatrix{$elty}, + x::CuVector{$elty}, beta::($elty), - y::CudaVector{$elty}) + y::CuVector{$elty}) cuuplo = cublasfill(uplo) m, n = size(A) if !(1<=(1+k)<=n) throw(DimensionMismatch("Incorrect number of bands")) end @@ -581,12 +581,12 @@ for (fname, elty) in 
((:cublasZhbmv_v2,:Complex128), y end function hbmv(uplo::BlasChar, k::Integer, alpha::($elty), - A::CudaMatrix{$elty}, x::CudaVector{$elty}) + A::CuMatrix{$elty}, x::CuVector{$elty}) n = size(A,2) hbmv!(uplo, k, alpha, A, x, zero($elty), similar(x, $elty, n)) end - function hbmv(uplo::BlasChar, k::Integer, A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + function hbmv(uplo::BlasChar, k::Integer, A::CuMatrix{$elty}, + x::CuVector{$elty}) hbmv(uplo, k, one($elty), A, x) end end @@ -607,8 +607,8 @@ for (fname, elty) in ((:cublasStbmv_v2,:Float32), trans::BlasChar, diag::BlasChar, k::Integer, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) cuuplo = cublasfill(uplo) cutrans = cublasop(trans) cudiag = cublasdiag(diag) @@ -628,8 +628,8 @@ for (fname, elty) in ((:cublasStbmv_v2,:Float32), function tbmv(uplo::BlasChar, trans::BlasChar, diag::BlasChar, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) tbmv!(uplo, trans, diag, A, copy(x)) end end @@ -650,8 +650,8 @@ for (fname, elty) in ((:cublasStbsv_v2,:Float32), trans::BlasChar, diag::BlasChar, k::Integer, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) cuuplo = cublasfill(uplo) cutrans = cublasop(trans) cudiag = cublasdiag(diag) @@ -672,8 +672,8 @@ for (fname, elty) in ((:cublasStbsv_v2,:Float32), trans::BlasChar, diag::BlasChar, k::Integer, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) tbsv!(uplo, trans, diag, k, A, copy(x)) end end @@ -693,8 +693,8 @@ for (fname, elty) in ((:cublasDtrmv_v2,:Float64), function trmv!(uplo::BlasChar, trans::BlasChar, diag::BlasChar, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) m, n = size(A) if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end if n != length(x) @@ -715,8 +715,8 @@ for (fname, elty) in ((:cublasDtrmv_v2,:Float64), function trmv(uplo::BlasChar, trans::BlasChar, diag::BlasChar, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) trmv!(uplo, trans, diag, A, copy(x)) end end @@ -736,8 +736,8 @@ for (fname, elty) in ((:cublasDtrsv_v2,:Float64), function trsv!(uplo::BlasChar, trans::BlasChar, diag::BlasChar, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) m, n = size(A) if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end if n != length(x) @@ -758,8 +758,8 @@ for (fname, elty) in ((:cublasDtrsv_v2,:Float64), function trsv(uplo::BlasChar, trans::BlasChar, diag::BlasChar, - A::CudaMatrix{$elty}, - x::CudaVector{$elty}) + A::CuMatrix{$elty}, + x::CuVector{$elty}) trsv!(uplo, trans, diag, A, copy(x)) end end @@ -777,9 +777,9 @@ for (fname, elty) in ((:cublasDger_v2,:Float64), # const double *y, int incy, # double *A, int lda) function ger!(alpha::$elty, - x::CudaVector{$elty}, - y::CudaVector{$elty}, - A::CudaMatrix{$elty}) + x::CuVector{$elty}, + y::CuVector{$elty}, + A::CuMatrix{$elty}) m, n = size(A) m == length(x) || throw(DimensionMismatch("")) n == length(y) || throw(DimensionMismatch("")) @@ -809,8 +809,8 @@ for (fname, elty) in ((:cublasDsyr_v2,:Float64), # double *A, int lda) function syr!(uplo::BlasChar, alpha::$elty, - x::CudaVector{$elty}, - A::CudaMatrix{$elty}) + x::CuVector{$elty}, + A::CuMatrix{$elty}) cuuplo = cublasfill(uplo) m, n = size(A) m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square")) @@ -833,8 
+833,8 @@ for (fname, elty) in ((:cublasZher_v2,:Complex128), @eval begin function her!(uplo::BlasChar, alpha::$elty, - x::CudaVector{$elty}, - A::CudaMatrix{$elty}) + x::CuVector{$elty}, + A::CuMatrix{$elty}) cuuplo = cublasfill(uplo) m, n = size(A) m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square")) @@ -857,9 +857,9 @@ for (fname, elty) in ((:cublasZher2_v2,:Complex128), @eval begin function her2!(uplo::BlasChar, alpha::$elty, - x::CudaVector{$elty}, - y::CudaVector{$elty}, - A::CudaMatrix{$elty}) + x::CuVector{$elty}, + y::CuVector{$elty}, + A::CuMatrix{$elty}) cuuplo = cublasfill(uplo) m, n = size(A) m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square")) @@ -897,10 +897,10 @@ for (fname, elty) in function gemm!(transA::BlasChar, transB::BlasChar, alpha::($elty), - A::CudaVecOrMat{$elty}, - B::CudaVecOrMat{$elty}, + A::CuVecOrMat{$elty}, + B::CuVecOrMat{$elty}, beta::($elty), - C::CudaVecOrMat{$elty}) + C::CuVecOrMat{$elty}) m = size(A, transA == 'N' ? 1 : 2) k = size(A, transA == 'N' ? 2 : 1) n = size(B, transB == 'N' ? 2 : 1) @@ -924,16 +924,16 @@ for (fname, elty) in function gemm(transA::BlasChar, transB::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + B::CuMatrix{$elty}) gemm!(transA, transB, alpha, A, B, zero($elty), similar(B, $elty, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1)))) end function gemm(transA::BlasChar, transB::BlasChar, - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + B::CuMatrix{$elty}) gemm(transA, transB, one($elty), A, B) end end @@ -955,10 +955,10 @@ for (fname, elty) in function gemm_batched!(transA::BlasChar, transB::BlasChar, alpha::($elty), - A::Array{CudaMatrix{$elty},1}, - B::Array{CudaMatrix{$elty},1}, + A::Array{CuMatrix{$elty},1}, + B::Array{CuMatrix{$elty},1}, beta::($elty), - C::Array{CudaMatrix{$elty},1}) + C::Array{CuMatrix{$elty},1}) if( length(A) != length(B) || length(A) != length(C) ) throw(DimensionMismatch("")) end @@ -978,9 +978,9 @@ for (fname, elty) in lda = max(1,stride(A[1],2)) ldb = max(1,stride(B[1],2)) ldc = max(1,stride(C[1],2)) - Aptrs = CudaArray(map( (x) -> pointer(x).ptr, A )) - Bptrs = CudaArray(map( (x) -> pointer(x).ptr, B )) - Cptrs = CudaArray(map( (x) -> pointer(x).ptr, C )) + Aptrs = CuArray(map( (x) -> pointer(x).ptr, A )) + Bptrs = CuArray(map( (x) -> pointer(x).ptr, B )) + Cptrs = CuArray(map( (x) -> pointer(x).ptr, C )) statuscheck(ccall(($(string(fname)),libcublas), cublasStatus_t, (cublasHandle_t, cublasOperation_t, cublasOperation_t, Cint, Cint, Cint, Ptr{$elty}, @@ -993,15 +993,15 @@ for (fname, elty) in function gemm_batched(transA::BlasChar, transB::BlasChar, alpha::($elty), - A::Array{CudaMatrix{$elty},1}, - B::Array{CudaMatrix{$elty},1}) - C = CudaMatrix{$elty}[similar( B[1], $elty, (size(A[1], transA == 'N' ? 1 : 2),size(B[1], transB == 'N' ? 2 : 1))) for i in 1:length(A)] + A::Array{CuMatrix{$elty},1}, + B::Array{CuMatrix{$elty},1}) + C = CuMatrix{$elty}[similar( B[1], $elty, (size(A[1], transA == 'N' ? 1 : 2),size(B[1], transB == 'N' ? 
2 : 1))) for i in 1:length(A)] gemm_batched!(transA, transB, alpha, A, B, zero($elty), C ) end function gemm_batched(transA::BlasChar, transB::BlasChar, - A::Array{CudaMatrix{$elty},1}, - B::Array{CudaMatrix{$elty},1}) + A::Array{CuMatrix{$elty},1}, + B::Array{CuMatrix{$elty},1}) gemm_batched(transA, transB, one($elty), A, B) end end @@ -1023,10 +1023,10 @@ for (fname, elty) in ((:cublasDsymm_v2,:Float64), function symm!(side::BlasChar, uplo::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}, + A::CuMatrix{$elty}, + B::CuMatrix{$elty}, beta::($elty), - C::CudaMatrix{$elty}) + C::CuMatrix{$elty}) cuside = cublasside(side) cuuplo = cublasfill(uplo) k, nA = size(A) @@ -1051,14 +1051,14 @@ for (fname, elty) in ((:cublasDsymm_v2,:Float64), function symm(side::BlasChar, uplo::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + B::CuMatrix{$elty}) symm!(side, uplo, alpha, A, B, zero($elty), similar(B)) end function symm(side::BlasChar, uplo::BlasChar, - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + B::CuMatrix{$elty}) symm(side, uplo, one($elty), A, B) end end @@ -1078,9 +1078,9 @@ for (fname, elty) in ((:cublasDsyrk_v2,:Float64), function syrk!(uplo::BlasChar, trans::BlasChar, alpha::($elty), - A::CudaVecOrMat{$elty}, + A::CuVecOrMat{$elty}, beta::($elty), - C::CudaMatrix{$elty}) + C::CuMatrix{$elty}) cuuplo = cublasfill(uplo) cutrans = cublasop(trans) mC, n = size(C) @@ -1103,12 +1103,12 @@ end function syrk(uplo::BlasChar, trans::BlasChar, alpha::Number, - A::CudaVecOrMat) + A::CuVecOrMat) T = eltype(A) n = size(A, trans == 'N' ? 1 : 2) syrk!(uplo, trans, convert(T,alpha), A, zero(T), similar(A, T, (n, n))) end -syrk(uplo::BlasChar, trans::BlasChar, A::CudaVecOrMat) = syrk(uplo, trans, +syrk(uplo::BlasChar, trans::BlasChar, A::CuVecOrMat) = syrk(uplo, trans, one(eltype(A)), A) @@ -1127,10 +1127,10 @@ for (fname, elty) in ((:cublasZhemm_v2,:Complex128), function hemm!(side::BlasChar, uplo::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}, + A::CuMatrix{$elty}, + B::CuMatrix{$elty}, beta::($elty), - C::CudaMatrix{$elty}) + C::CuMatrix{$elty}) cuside = cublasside(side) cuuplo = cublasfill(uplo) mA, nA = size(A) @@ -1153,12 +1153,12 @@ for (fname, elty) in ((:cublasZhemm_v2,:Complex128), function hemm(uplo::BlasChar, trans::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + B::CuMatrix{$elty}) m,n = size(B) hemm!( uplo, trans, alpha, A, B, zero($elty), similar(B, $elty, (m,n) ) ) end - hemm( uplo::BlasChar, trans::BlasChar, A::CudaMatrix{$elty}, B::CudaMatrix{$elty}) = hemm( uplo, trans, one($elty), A, B) + hemm( uplo::BlasChar, trans::BlasChar, A::CuMatrix{$elty}, B::CuMatrix{$elty}) = hemm( uplo, trans, one($elty), A, B) end end @@ -1174,9 +1174,9 @@ for (fname, elty) in ((:cublasZherk_v2,:Complex128), function herk!(uplo::BlasChar, trans::BlasChar, alpha::($elty), - A::CudaVecOrMat{$elty}, + A::CuVecOrMat{$elty}, beta::($elty), - C::CudaMatrix{$elty}) + C::CuMatrix{$elty}) cuuplo = cublasfill(uplo) cutrans = cublasop(trans) mC, n = size(C) @@ -1194,11 +1194,11 @@ for (fname, elty) in ((:cublasZherk_v2,:Complex128), lda, [beta], C, ldc)) C end - function herk(uplo::BlasChar, trans::BlasChar, alpha::($elty), A::CudaVecOrMat{$elty}) + function herk(uplo::BlasChar, trans::BlasChar, alpha::($elty), A::CuVecOrMat{$elty}) n = size(A, trans == 'N' ? 
1 : 2) herk!(uplo, trans, alpha, A, zero($elty), similar(A, $elty, (n,n))) end - herk(uplo::BlasChar, trans::BlasChar, A::CudaVecOrMat{$elty}) = herk(uplo, trans, one($elty), A) + herk(uplo::BlasChar, trans::BlasChar, A::CuVecOrMat{$elty}) = herk(uplo, trans, one($elty), A) end end @@ -1220,10 +1220,10 @@ for (fname, elty) in ((:cublasDsyr2k_v2,:Float64), function syr2k!(uplo::BlasChar, trans::BlasChar, alpha::($elty), - A::CudaVecOrMat{$elty}, - B::CudaVecOrMat{$elty}, + A::CuVecOrMat{$elty}, + B::CuVecOrMat{$elty}, beta::($elty), - C::CudaMatrix{$elty}) + C::CuMatrix{$elty}) # TODO: check size of B in julia (syr2k!) cuuplo = cublasfill(uplo) cutrans = cublasop(trans) @@ -1253,13 +1253,13 @@ end function syr2k(uplo::BlasChar, trans::BlasChar, alpha::Number, - A::CudaVecOrMat, - B::CudaVecOrMat) + A::CuVecOrMat, + B::CuVecOrMat) T = eltype(A) n = size(A, trans == 'N' ? 1 : 2) syr2k!(uplo, trans, convert(T,alpha), A, B, zero(T), similar(A, T, (n, n))) end -syr2k(uplo::BlasChar, trans::BlasChar, A::CudaVecOrMat, B::CudaVecOrMat) = syr2k(uplo, trans, one(eltype(A)), A, B) +syr2k(uplo::BlasChar, trans::BlasChar, A::CuVecOrMat, B::CuVecOrMat) = syr2k(uplo, trans, one(eltype(A)), A, B) ## her2k for (fname, elty1, elty2) in ((:cublasZher2k_v2,:Complex128,:Float64), @@ -1274,10 +1274,10 @@ for (fname, elty1, elty2) in ((:cublasZher2k_v2,:Complex128,:Float64), function her2k!(uplo::BlasChar, trans::BlasChar, alpha::($elty1), - A::CudaVecOrMat{$elty1}, - B::CudaVecOrMat{$elty1}, + A::CuVecOrMat{$elty1}, + B::CuVecOrMat{$elty1}, beta::($elty2), - C::CudaMatrix{$elty1}) + C::CuMatrix{$elty1}) # TODO: check size of B in julia (her2k!) cuuplo = cublasfill(uplo) cutrans = cublasop(trans) @@ -1305,15 +1305,15 @@ for (fname, elty1, elty2) in ((:cublasZher2k_v2,:Complex128,:Float64), function her2k(uplo::BlasChar, trans::BlasChar, alpha::($elty1), - A::CudaVecOrMat{$elty1}, - B::CudaVecOrMat{$elty1}) + A::CuVecOrMat{$elty1}, + B::CuVecOrMat{$elty1}) n = size(A, trans == 'N' ? 
1 : 2) her2k!(uplo, trans, alpha, A, B, zero($elty2), similar(A, $elty1, (n,n))) end her2k(uplo::BlasChar, trans::BlasChar, - A::CudaVecOrMat{$elty1}, - B::CudaVecOrMat{$elty1}) = her2k(uplo, trans, one($elty1), A, B) + A::CuVecOrMat{$elty1}, + B::CuVecOrMat{$elty1}) = her2k(uplo, trans, one($elty1), A, B) end end @@ -1339,9 +1339,9 @@ for (mmname, smname, elty) in transa::BlasChar, diag::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}, - C::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + B::CuMatrix{$elty}, + C::CuMatrix{$elty}) cuside = cublasside(side) cuuplo = cublasfill(uplo) cutransa = cublasop(transa) @@ -1370,8 +1370,8 @@ for (mmname, smname, elty) in transa::BlasChar, diag::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + B::CuMatrix{$elty}) trmm!(side, uplo, transa, diag, alpha, A, B, similar(B)) end # cublasStatus_t cublasDtrsm(cublasHandle_t handle, @@ -1386,8 +1386,8 @@ for (mmname, smname, elty) in transa::BlasChar, diag::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + B::CuMatrix{$elty}) cuside = cublasside(side) cuuplo = cublasfill(uplo) cutransa = cublasop(transa) @@ -1413,8 +1413,8 @@ for (mmname, smname, elty) in transa::BlasChar, diag::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, - B::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + B::CuMatrix{$elty}) trsm!(side, uplo, transa, diag, alpha, A, copy(B)) end end @@ -1440,8 +1440,8 @@ for (fname, elty) in transa::BlasChar, diag::BlasChar, alpha::($elty), - A::Array{CudaMatrix{$elty},1}, - B::Array{CudaMatrix{$elty},1}) + A::Array{CuMatrix{$elty},1}, + B::Array{CuMatrix{$elty},1}) cuside = cublasside(side) cuuplo = cublasfill(uplo) cutransa = cublasop(transa) @@ -1458,8 +1458,8 @@ for (fname, elty) in m,n = size(B[1]) lda = max(1,stride(A[1],2)) ldb = max(1,stride(B[1],2)) - Aptrs = CudaArray(map( (x) -> pointer(x).ptr, A )) - Bptrs = CudaArray(map( (x) -> pointer(x).ptr, B )) + Aptrs = CuArray(map( (x) -> pointer(x).ptr, A )) + Bptrs = CuArray(map( (x) -> pointer(x).ptr, B )) statuscheck(ccall(($(string(fname)),libcublas), cublasStatus_t, (cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, Cint, Cint, @@ -1474,8 +1474,8 @@ for (fname, elty) in transa::BlasChar, diag::BlasChar, alpha::($elty), - A::Array{CudaMatrix{$elty},1}, - B::Array{CudaMatrix{$elty},1}) + A::Array{CuMatrix{$elty},1}, + B::Array{CuMatrix{$elty},1}) trsm_batched!(side, uplo, transa, diag, alpha, A, copy(B) ) end end @@ -1502,10 +1502,10 @@ for (fname, elty) in ((:cublasDgeam,:Float64), function geam!(transa::BlasChar, transb::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, + A::CuMatrix{$elty}, beta::($elty), - B::CudaMatrix{$elty}, - C::CudaMatrix{$elty}) + B::CuMatrix{$elty}, + C::CuMatrix{$elty}) cutransa = cublasop(transa) cutransb = cublasop(transb) mA, nA = size(A) @@ -1528,9 +1528,9 @@ for (fname, elty) in ((:cublasDgeam,:Float64), function geam(transa::BlasChar, transb::BlasChar, alpha::($elty), - A::CudaMatrix{$elty}, + A::CuMatrix{$elty}, beta::($elty), - B::CudaMatrix{$elty}) + B::CuMatrix{$elty}) m,n = size(B) if ((transb == 'T' || transb == 'C')) geam!( transa, transb, alpha, A, beta, B, similar(B, $elty, (n,m) ) ) @@ -1539,7 +1539,7 @@ for (fname, elty) in ((:cublasDgeam,:Float64), geam!( transa, transb, alpha, A, beta, B, similar(B, $elty, (m,n) ) ) end end - geam( uplo::BlasChar, trans::BlasChar, A::CudaMatrix{$elty}, B::CudaMatrix{$elty}) = geam( uplo, trans, one($elty), A, 
one($elty), B) + geam( uplo::BlasChar, trans::BlasChar, A::CuMatrix{$elty}, B::CuMatrix{$elty}) = geam( uplo, trans, one($elty), A, one($elty), B) end end @@ -1555,7 +1555,7 @@ for (fname, elty) in # cublasHandle_t handle, int n, double **A, # int lda, int *PivotArray, int *infoArray, # int batchSize) - function getrf_batched!(A::Array{CudaMatrix{$elty},1}, + function getrf_batched!(A::Array{CuMatrix{$elty},1}, Pivot::Bool) for As in A m,n = size(As) @@ -1565,19 +1565,19 @@ for (fname, elty) in end m,n = size(A[1]) lda = max(1,stride(A[1],2)) - Aptrs = CudaArray(map( (x) -> pointer(x).ptr, A )) - info = CudaArray(Cint, (length(A))) - pivotArray = Pivot ? CudaArray(Cint, (n, length(A))) : C_NULL + Aptrs = CuArray(map( (x) -> pointer(x).ptr, A )) + info = CuArray{Cint}(length(A)) + pivotArray = Pivot ? CuArray{Int32}((n, length(A))) : C_NULL statuscheck(ccall(($(string(fname)),libcublas), cublasStatus_t, (cublasHandle_t, Cint, Ptr{Ptr{$elty}}, Cint, Ptr{Cint}, Ptr{Cint}, Cint), cublashandle[1], n, Aptrs, lda, pivotArray, info, length(A))) if( !Pivot ) - pivotArray = CudaArray(zeros(Cint, (n, length(A)))) + pivotArray = CuArray(zeros(Cint, (n, length(A)))) end pivotArray, info, A end - function getrf_batched(A::Array{CudaMatrix{$elty},1}, + function getrf_batched(A::Array{CuMatrix{$elty},1}, Pivot::Bool) newA = copy(A) pivotarray, info = getrf_batched!(newA, Pivot) @@ -1598,21 +1598,21 @@ for (fname, elty) in # cublasHandle_t handle, int n, double **A, # int lda, int *PivotArray, double **C, # int ldc, int *info, int batchSize) - function getri_batched(A::Array{CudaMatrix{$elty},1}, - pivotArray::CudaMatrix{Cint}) + function getri_batched(A::Array{CuMatrix{$elty},1}, + pivotArray::CuMatrix{Cint}) for As in A m,n = size(As) if m != n throw(DimensionMismatch("All A matrices must be square!")) end end - C = CudaMatrix{$elty}[similar(A[1]) for i in 1:length(A)] + C = CuMatrix{$elty}[similar(A[1]) for i in 1:length(A)] n = size(A[1])[1] lda = max(1,stride(A[1],2)) ldc = max(1,stride(C[1],2)) - Aptrs = CudaArray(map( (x) -> pointer(x).ptr, A )) - Cptrs = CudaArray(map( (x) -> pointer(x).ptr, C )) - info = CudaArray(zeros(Cint,length(A))) + Aptrs = CuArray(map( (x) -> pointer(x).ptr, A )) + Cptrs = CuArray(map( (x) -> pointer(x).ptr, C )) + info = CuArray(zeros(Cint,length(A))) statuscheck(ccall(($(string(fname)),libcublas), cublasStatus_t, (cublasHandle_t, Cint, Ptr{Ptr{$elty}}, Cint, Ptr{Cint}, Ptr{Ptr{$elty}}, Cint, Ptr{Cint}, Cint), @@ -1635,7 +1635,7 @@ for (fname, elty) in # cublasHandle_t handle, int n, double **A, # int lda, double **C, int ldc, # int *info, int batchSize) - function matinv_batched(A::Array{CudaMatrix{$elty},1}) + function matinv_batched(A::Array{CuMatrix{$elty},1}) for As in A m,n = size(As) if m != n @@ -1645,13 +1645,13 @@ for (fname, elty) in throw(ArgumentError("matinv requires all matrices be smaller than 32 x 32")) end end - C = CudaMatrix{$elty}[similar(A[1]) for i in 1:length(A)] + C = CuMatrix{$elty}[similar(A[1]) for i in 1:length(A)] n = size(A[1])[1] lda = max(1,stride(A[1],2)) ldc = max(1,stride(C[1],2)) - Aptrs = CudaArray(map( (x) -> pointer(x).ptr, A )) - Cptrs = CudaArray(map( (x) -> pointer(x).ptr, C )) - info = CudaArray(zeros(Cint,length(A))) + Aptrs = CuArray(map( (x) -> pointer(x).ptr, A )) + Cptrs = CuArray(map( (x) -> pointer(x).ptr, C )) + info = CuArray(zeros(Cint,length(A))) statuscheck(ccall(($(string(fname)),libcublas), cublasStatus_t, (cublasHandle_t, Cint, Ptr{Ptr{$elty}}, Cint, Ptr{Ptr{$elty}}, Cint, Ptr{Cint}, Cint), @@ -1674,16 +1674,16 @@ 
for (fname, elty) in # cublasHandle_t handle, int n, int m, # double **A, int lda, double **TauArray, # int *infoArray, int batchSize) - function geqrf_batched!(A::Array{CudaMatrix{$elty},1}) + function geqrf_batched!(A::Array{CuMatrix{$elty},1}) m,n = size(A[1]) lda = max(1,stride(A[1],2)) - Aptrs = CudaArray(map( (x) -> pointer(x).ptr, A )) + Aptrs = CuArray(map( (x) -> pointer(x).ptr, A )) hTauArray = [zeros($elty, min(m,n)) for i in 1:length(A)] - TauArray = CudaArray{$elty,1}[] + TauArray = CuArray{$elty,1}[] for i in 1:length(A) - push!(TauArray,CudaArray(hTauArray[i])) + push!(TauArray,CuArray(hTauArray[i])) end - Tauptrs = CudaArray(map( (x) -> pointer(x).ptr, TauArray )) + Tauptrs = CuArray(map( (x) -> pointer(x).ptr, TauArray )) info = zero(Cint) statuscheck(ccall(($(string(fname)),libcublas), cublasStatus_t, (cublasHandle_t, Cint, Cint, Ptr{Ptr{$elty}}, @@ -1695,7 +1695,7 @@ for (fname, elty) in end TauArray, A end - function geqrf_batched(A::Array{CudaMatrix{$elty},1}) + function geqrf_batched(A::Array{CuMatrix{$elty},1}) geqrf_batched!(copy(A)) end end @@ -1715,8 +1715,8 @@ for (fname, elty) in # double **C, int ldc, int *infoArray, # int *devInfoArray, int batchSize) function gels_batched!(trans::BlasChar, - A::Array{CudaMatrix{$elty},1}, - C::Array{CudaMatrix{$elty},1}) + A::Array{CuMatrix{$elty},1}, + C::Array{CuMatrix{$elty},1}) cutrans = cublasop(trans) if( length(A) != length(C) ) throw(DimensionMismatch("")) @@ -1735,10 +1735,10 @@ for (fname, elty) in nrhs = size(C[1])[2] lda = max(1,stride(A[1],2)) ldc = max(1,stride(A[1],2)) - Aptrs = CudaArray(map((x) -> pointer(x).ptr, A )) - Cptrs = CudaArray(map((x) -> pointer(x).ptr, C )) + Aptrs = CuArray(map((x) -> pointer(x).ptr, A )) + Cptrs = CuArray(map((x) -> pointer(x).ptr, C )) info = zero(Cint) - infoarray = CudaArray(zeros(Cint, length(A))) + infoarray = CuArray(zeros(Cint, length(A))) statuscheck(ccall(($(string(fname)),libcublas), cublasStatus_t, (cublasHandle_t, cublasOperation_t, Cint, Cint, Cint, Ptr{Ptr{$elty}}, Cint, Ptr{Ptr{$elty}}, @@ -1751,8 +1751,8 @@ for (fname, elty) in A, C, infoarray end function gels_batched(trans::BlasChar, - A::Array{CudaMatrix{$elty},1}, - C::Array{CudaMatrix{$elty},1}) + A::Array{CuMatrix{$elty},1}, + C::Array{CuMatrix{$elty},1}) gels_batched!(trans, copy(A), copy(C)) end end @@ -1771,9 +1771,9 @@ for (fname, elty) in ((:cublasDdgmm,:Float64), # const cuComplex *X, int incx, # cuComplex *C, int ldc) function dgmm!(mode::BlasChar, - A::CudaMatrix{$elty}, - X::CudaVector{$elty}, - C::CudaMatrix{$elty}) + A::CuMatrix{$elty}, + X::CuVector{$elty}, + C::CuMatrix{$elty}) cuside = cublasside(mode) m, n = size(C) mA, nA = size(A) @@ -1791,8 +1791,8 @@ for (fname, elty) in ((:cublasDdgmm,:Float64), C end function dgmm(mode::BlasChar, - A::CudaMatrix{$elty}, - X::CudaVector{$elty}) + A::CuMatrix{$elty}, + X::CuVector{$elty}) m,n = size(A) dgmm!( mode, A, X, similar(A, $elty, (m,n) ) ) end diff --git a/src/highlevel.jl b/src/highlevel.jl index 8071e02..4a0c99a 100644 --- a/src/highlevel.jl +++ b/src/highlevel.jl @@ -5,7 +5,7 @@ import Base: scale!, norm, vecdot import Base: A_mul_B!, At_mul_B, Ac_mul_B, A_mul_Bc, At_mul_Bt, Ac_mul_Bc, At_mul_Bt, At_mul_B!, Ac_mul_B!, A_mul_Bc!, At_mul_Bt!, Ac_mul_Bc!, At_mul_Bt! -cublas_size(t::Char, M::CudaVecOrMat) = (size(M, t=='N' ? 1:2), size(M, t=='N' ? 2:1)) +cublas_size(t::Char, M::CuVecOrMat) = (size(M, t=='N' ? 1:2), size(M, t=='N' ? 2:1)) ########### # @@ -16,12 +16,12 @@ cublas_size(t::Char, M::CudaVecOrMat) = (size(M, t=='N' ? 
1:2), size(M, t=='N' ? ####### # SCAL ####### -scale!{T<:CublasFloat}(x::CudaArray{T}, k::Number) = CUBLAS.scal!(length(x), k, x, 1) +scale!{T<:CublasFloat}(x::CuArray{T}, k::Number) = CUBLAS.scal!(length(x), k, x, 1) ####### # DOT ####### -function dot{T <: CublasFloat, TI<:Integer}(x::CudaVector{T}, rx::Union{UnitRange{TI},Range{TI}}, y::CudaVector{T}, ry::Union{UnitRange{TI},Range{TI}}) +function dot{T <: CublasFloat, TI<:Integer}(x::CuVector{T}, rx::Union{UnitRange{TI},Range{TI}}, y::CuVector{T}, ry::Union{UnitRange{TI},Range{TI}}) if length(rx) != length(ry) throw(DimensionMismatch("length of rx, $(length(rx)), does not equal length of ry, $(length(ry))")) end @@ -34,17 +34,17 @@ function dot{T <: CublasFloat, TI<:Integer}(x::CudaVector{T}, rx::Union{UnitRang dot(length(rx), pointer(x)+(first(rx)-1)*sizeof(T), step(rx), pointer(y)+(first(ry)-1)*sizeof(T), step(ry)) end -At_mul_B{T<:CublasReal}(x::CudaVector{T}, y::CudaVector{T}) = [CUBLAS.dot(x, y)] -At_mul_B{T<:CublasComplex}(x::CudaVector{T}, y::CudaVector{T}) = [CUBLAS.dotu(x, y)] -Ac_mul_B{T<:CublasComplex}(x::CudaVector{T}, y::CudaVector{T}) = [CUBLAS.dotc(x, y)] +At_mul_B{T<:CublasReal}(x::CuVector{T}, y::CuVector{T}) = [CUBLAS.dot(x, y)] +At_mul_B{T<:CublasComplex}(x::CuVector{T}, y::CuVector{T}) = [CUBLAS.dotu(x, y)] +Ac_mul_B{T<:CublasComplex}(x::CuVector{T}, y::CuVector{T}) = [CUBLAS.dotc(x, y)] -vecdot{T<:CublasReal}(x::CudaVector{T}, y::CudaVector{T}) = dot(x, y) -vecdot{T<:CublasComplex}(x::CudaVector{T}, y::CudaVector{T}) = dotc(x, y) +vecdot{T<:CublasReal}(x::CuVector{T}, y::CuVector{T}) = dot(x, y) +vecdot{T<:CublasComplex}(x::CuVector{T}, y::CuVector{T}) = dotc(x, y) ####### # NRM2 ####### -norm(x::CudaArray) = nrm2(x) +norm(x::CuArray) = nrm2(x) ############ @@ -57,7 +57,7 @@ norm(x::CudaArray) = nrm2(x) ######### # GEMV ########## -function gemv_wrapper!{T<:CublasFloat}(y::CudaVector{T}, tA::Char, A::CudaMatrix{T}, x::CudaVector{T}, +function gemv_wrapper!{T<:CublasFloat}(y::CuVector{T}, tA::Char, A::CuMatrix{T}, x::CuVector{T}, alpha = one(T), beta = zero(T)) mA, nA = cublas_size(tA, A) if nA != length(x) @@ -75,20 +75,20 @@ function gemv_wrapper!{T<:CublasFloat}(y::CudaVector{T}, tA::Char, A::CudaMatrix gemv!(tA, alpha, A, x, beta, y) end -A_mul_B!{T<:CublasFloat}(y::CudaVector{T}, A::CudaMatrix{T}, x::CudaVector{T}) = gemv_wrapper!(y, 'N', A, x) -At_mul_B!{T<:CublasFloat}(y::CudaVector{T}, A::CudaMatrix{T}, x::CudaVector{T}) = gemv_wrapper!(y, 'T', A, x) -Ac_mul_B!{T<:CublasFloat}(y::CudaVector{T}, A::CudaMatrix{T}, x::CudaVector{T}) = gemv_wrapper!(y, 'T', A, x) -Ac_mul_B!{T<:CublasComplex}(y::CudaVector{T}, A::CudaMatrix{T}, x::CudaVector{T}) = gemv_wrapper!(y, 'C', A, x) +A_mul_B!{T<:CublasFloat}(y::CuVector{T}, A::CuMatrix{T}, x::CuVector{T}) = gemv_wrapper!(y, 'N', A, x) +At_mul_B!{T<:CublasFloat}(y::CuVector{T}, A::CuMatrix{T}, x::CuVector{T}) = gemv_wrapper!(y, 'T', A, x) +Ac_mul_B!{T<:CublasFloat}(y::CuVector{T}, A::CuMatrix{T}, x::CuVector{T}) = gemv_wrapper!(y, 'T', A, x) +Ac_mul_B!{T<:CublasComplex}(y::CuVector{T}, A::CuMatrix{T}, x::CuVector{T}) = gemv_wrapper!(y, 'C', A, x) -function (*){T<:CublasFloat}(A::CudaMatrix{T}, x::CudaVector{T}) +function (*){T<:CublasFloat}(A::CuMatrix{T}, x::CuVector{T}) A_mul_B!(similar(x, T, size(A,1)), A, x) end -function At_mul_B{T<:CublasFloat}(A::CudaMatrix{T}, x::CudaVector{T}) +function At_mul_B{T<:CublasFloat}(A::CuMatrix{T}, x::CuVector{T}) At_mul_B!(similar(x, T, size(A,2)), A, x) end -function Ac_mul_B{T<:CublasFloat}(A::CudaMatrix{T}, x::CudaVector{T}) 
+function Ac_mul_B{T<:CublasFloat}(A::CuMatrix{T}, x::CuVector{T}) Ac_mul_B!(similar(x, T, size(A,2)), A, x) end @@ -102,9 +102,9 @@ end ######## # GEMM ######## -function gemm_wrapper!{T <: CublasFloat}(C::CudaVecOrMat{T}, tA::Char, tB::Char, - A::CudaVecOrMat{T}, - B::CudaVecOrMat{T}, +function gemm_wrapper!{T <: CublasFloat}(C::CuVecOrMat{T}, tA::Char, tB::Char, + A::CuVecOrMat{T}, + B::CuVecOrMat{T}, alpha = one(T), beta = zero(T)) mA, nA = cublas_size(tA, A) @@ -129,49 +129,49 @@ function gemm_wrapper!{T <: CublasFloat}(C::CudaVecOrMat{T}, tA::Char, tB::Char, end # Mutating -A_mul_B!{T <: CublasFloat}(C::CudaMatrix{T}, A::CudaMatrix{T}, B::CudaMatrix{T}) = gemm_wrapper!(C, 'N', 'N', A, B) -At_mul_B!(C::CudaMatrix, A::CudaMatrix, B::CudaMatrix) = gemm_wrapper!(C, 'T', 'N', A, B) -At_mul_Bt!(C::CudaMatrix, A::CudaMatrix, B::CudaMatrix) = gemm_wrapper!(C, 'T', 'T', A, B) -Ac_mul_B!{T<:CublasReal}(C::CudaMatrix{T}, A::CudaMatrix{T}, B::CudaMatrix{T}) = At_mul_B!(C, A, B) -Ac_mul_B!(C::CudaMatrix, A::CudaMatrix, B::CudaMatrix) = gemm_wrapper!(C, 'C', 'N', A, B) +A_mul_B!{T <: CublasFloat}(C::CuMatrix{T}, A::CuMatrix{T}, B::CuMatrix{T}) = gemm_wrapper!(C, 'N', 'N', A, B) +At_mul_B!(C::CuMatrix, A::CuMatrix, B::CuMatrix) = gemm_wrapper!(C, 'T', 'N', A, B) +At_mul_Bt!(C::CuMatrix, A::CuMatrix, B::CuMatrix) = gemm_wrapper!(C, 'T', 'T', A, B) +Ac_mul_B!{T<:CublasReal}(C::CuMatrix{T}, A::CuMatrix{T}, B::CuMatrix{T}) = At_mul_B!(C, A, B) +Ac_mul_B!(C::CuMatrix, A::CuMatrix, B::CuMatrix) = gemm_wrapper!(C, 'C', 'N', A, B) -function A_mul_B!{T}(C::CudaMatrix{T}, A::CudaVecOrMat{T}, B::CudaVecOrMat{T}) +function A_mul_B!{T}(C::CuMatrix{T}, A::CuVecOrMat{T}, B::CuVecOrMat{T}) gemm_wrapper!(C, 'N', 'N', A, B) end # Non mutating # A_mul_Bx -function (*){T <: CublasFloat}(A::CudaMatrix{T}, B::CudaMatrix{T}) +function (*){T <: CublasFloat}(A::CuMatrix{T}, B::CuMatrix{T}) A_mul_B!(similar(B, T,(size(A,1), size(B,2))), A, B) end -function A_mul_Bt{T}(A::CudaMatrix{T}, B::CudaMatrix{T}) +function A_mul_Bt{T}(A::CuMatrix{T}, B::CuMatrix{T}) A_mul_Bt!(similar(B, T, (size(A,1), size(B,1))), A, B) end -function A_mul_Bc{T}(A::CudaMatrix{T}, B::CudaMatrix{T}) +function A_mul_Bc{T}(A::CuMatrix{T}, B::CuMatrix{T}) A_mul_Bc!(similar(B, T,(size(A,1),size(B,1))),A, B) end # At_mul_Bx -function At_mul_B{T}(A::CudaMatrix{T}, B::CudaMatrix{T}) +function At_mul_B{T}(A::CuMatrix{T}, B::CuMatrix{T}) At_mul_B!(similar(B, T, (size(A,2), size(B,2))), A, B) end -function At_mul_Bt{T}(A::CudaMatrix{T}, B::CudaMatrix{T}) +function At_mul_Bt{T}(A::CuMatrix{T}, B::CuMatrix{T}) At_mul_Bt!(similar(B, T, (size(A,2), size(B,1))), A, B) end # Ac_mul_Bx -function Ac_mul_B{T}(A::CudaMatrix{T}, B::CudaMatrix{T}) +function Ac_mul_B{T}(A::CuMatrix{T}, B::CuMatrix{T}) Ac_mul_B!(similar(B, T, (size(A,2), size(B,2))), A, B) end -function Ac_mul_Bt{T,S}(A::CudaMatrix{T}, B::CudaMatrix{S}) +function Ac_mul_Bt{T,S}(A::CuMatrix{T}, B::CuMatrix{S}) Ac_mul_Bt(similar(B, T, (size(A,2), size(B,1))), A, B) end -function Ac_mul_Bc{T,S}(A::CudaMatrix{T}, B::CudaMatrix{S}) +function Ac_mul_Bc{T,S}(A::CuMatrix{T}, B::CuMatrix{S}) Ac_mul_Bc!(similar(B, T, (size(A,2), size(B,1))), A, B) end diff --git a/test/runtests.jl b/test/runtests.jl index 6837419..0f1196d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,6 @@ import Base.LinAlg.BLAS using CUBLAS -using CUDArt +using CUDAdrv: OwnedPtr, CuArray, CuVector, CuMatrix using Base.Test @testset "CUBLAS" begin @@ -27,10 +27,10 @@ end end @test ndims(A) == 1 n1 = length(A) - d_A = CudaArray(A) - 
d_B = CudaArray(elty, n1) + d_A = CuArray(A) + d_B = CuArray{elty}(n1) CUBLAS.blascopy!(n,d_A,1,d_B,1) - B = to_host(d_B) + B = collect(d_B) @test A == B end end @@ -39,18 +39,18 @@ end function test_scal!{T}(alpha,A::Array{T}) @test ndims(A) == 1 n1 = length(A) - d_A = CudaArray(A) + d_A = CuArray(A) CUBLAS.scal!(n1,alpha,d_A,1) - A1 = to_host(d_A) + A1 = collect(d_A) @test alpha*A ≈ A1 - d_A = CudaArray(A) - d_As = CUBLAS.scale(d_A, alpha) - A1 = to_host(d_As) + d_A = CuArray(A) + d_As = CUBLAS.scale!(copy(d_A), alpha) + A1 = collect(d_As) @test alpha*A ≈ A1 CUBLAS.scale!(d_A, alpha) - A1 = to_host(d_As) + A1 = collect(d_As) @test alpha*A ≈ A1 end test_scal!(2.0f0,Float32[1:m;]) @@ -69,16 +69,16 @@ end @test ndims(B) == 1 @test length(A) == length(B) n1 = length(A) - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) cuda_dot1 = CUBLAS.dot(n1,d_A,1,d_B,1) cuda_dot2 = CUBLAS.dot(d_A,d_B) host_dot = dot(A,B) @test host_dot ≈ cuda_dot1 @test host_dot ≈ cuda_dot2 - #d_A = CudaArray(A) - #d_B = CudaArray(B) + #d_A = CuArray(A) + #d_B = CuArray(B) #cuda_dot3 = CUBLAS.dot(d_A, 3:5, d_B, 5:7) #host_dot3 = dot(A, 3:5, B, 5:7) #@test_approx_eq(cuda_dot3, host_dot3) @@ -93,8 +93,8 @@ end @test ndims(B) == 1 @test length(A) == length(B) n1 = length(A) - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) cuda_dot1 = CUBLAS.dotu(n1,d_A,1,d_B,1) cuda_dot2 = CUBLAS.dotu(d_A,d_B) host_dot = A.'*B @@ -118,8 +118,8 @@ end @test ndims(B) == 1 @test length(A) == length(B) n1 = length(A) - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) cuda_dot1 = CUBLAS.dotc(n1,d_A,1,d_B,1) cuda_dot2 = CUBLAS.dotc(d_A,d_B) host_dot = A'*B @@ -140,7 +140,7 @@ end A = rand(elty, m) @test ndims(A) == 1 n1 = length(A) - d_A = CudaArray(A) + d_A = CuArray(A) cuda_nrm2_1 = CUBLAS.nrm2(n1,d_A,1) cuda_nrm2_2 = CUBLAS.nrm2(d_A) cuda_nrm2_3 = norm(d_A) @@ -156,7 +156,7 @@ end A = rand(elty, m) @test ndims(A) == 1 n1 = length(A) - d_A = CudaArray(A) + d_A = CuArray(A) cuda_asum1 = CUBLAS.asum(n1,d_A,1) cuda_asum2 = CUBLAS.asum(d_A) host_asum = sum(abs.(real(A)) + abs.(imag(A))) @@ -170,10 +170,10 @@ end function test_axpy!_1(alpha,A,B) @test length(A) == length(B) n1 = length(A) - d_A = CudaArray(A) - d_B1 = CudaArray(B) + d_A = CuArray(A) + d_B1 = CuArray(B) CUBLAS.axpy!(n1,alpha,d_A,1,d_B1,1) - B1 = to_host(d_B1) + B1 = collect(d_B1) host_axpy = alpha*A + B @test host_axpy ≈ B1 end @@ -185,10 +185,10 @@ end function test_axpy!_2(alpha,A,B) @test length(A) == length(B) n1 = length(A) - d_A = CudaArray(A) - d_B1 = CudaArray(B) + d_A = CuArray(A) + d_B1 = CuArray(B) CUBLAS.axpy!(alpha,d_A,d_B1) - B1 = to_host(d_B1) + B1 = collect(d_B1) host_axpy = alpha*A + B @test host_axpy ≈ B1 end @@ -200,10 +200,10 @@ end #=function test_axpy!_3(alpha,A,B) @test length(A) == length(B) n1 = length(A) - d_A = CudaArray(A) - d_B1 = CudaArray(B) + d_A = CuArray(A) + d_B1 = CuArray(B) CUBLAS.axpy!(alpha,d_A,1:2:n1,d_B1,1:2:n1) - B1 = to_host(d_B1) + B1 = collect(d_B1) host_axpy = B host_axpy[1:2:n1] = alpha*A[1:2:n1] + B[1:2:n1] @test_approx_eq(host_axpy,B1) @@ -216,11 +216,11 @@ end function test_axpy!_4(alpha,A,B) @test length(A) == length(B) n1 = length(A) - d_A = CudaArray(A) - d_B1 = CudaArray(B) + d_A = CuArray(A) + d_B1 = CuArray(B) r = 1:div(n1,2) CUBLAS.axpy!(alpha,d_A,r,d_B1,r) - B1 = to_host(d_B1) + B1 = collect(d_B1) host_axpy = B host_axpy[r] = alpha*A[r] + B[r] @test_approx_eq(host_axpy,B1) @@ -235,7 +235,7 @@ end @testset for elty in [Float32, Float64, 
Complex64, Complex128] A = rand(elty, m) n1 = length(A) - d_A = CudaArray(A) + d_A = CuArray(A) Aabs = blasabs(A) imin1 = CUBLAS.iamin(n1,d_A,1) imax1 = CUBLAS.iamax(n1,d_A,1) @@ -261,45 +261,45 @@ end # level 1 testset alpha = convert(elty,1) beta = convert(elty,1) A = rand(elty,m,n) - d_A = CudaArray(A) + d_A = CuArray(A) # test y = A*x + y x = rand(elty,n) - d_x = CudaArray(x) + d_x = CuArray(x) y = rand(elty,m) - d_y = CudaArray(y) + d_y = CuArray(y) y = A*x + y CUBLAS.gemv!('N',alpha,d_A,d_x,beta,d_y) - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y A_mul_B!(d_y,d_A,d_x) - h_y = to_host(d_y) + h_y = collect(d_y) @test h_y ≈ A*x # test x = A.'*y + x x = rand(elty,n) - d_x = CudaArray(x) + d_x = CuArray(x) y = rand(elty,m) - d_y = CudaArray(y) + d_y = CuArray(y) x = A.'*y + x CUBLAS.gemv!('T',alpha,d_A,d_y,beta,d_x) - h_x = to_host(d_x) + h_x = collect(d_x) @test x ≈ h_x At_mul_B!(d_x,d_A,d_y) - h_x = to_host(d_x) + h_x = collect(d_x) @test h_x ≈ A.'*y # test x = A'*y + x x = rand(elty,n) - d_x = CudaArray(x) + d_x = CuArray(x) y = rand(elty,m) - d_y = CudaArray(y) + d_y = CuArray(y) x = A'*y + x CUBLAS.gemv!('C',alpha,d_A,d_y,beta,d_x) - h_x = to_host(d_x) + h_x = collect(d_x) @test x ≈ h_x Ac_mul_B!(d_x,d_A,d_y) - h_x = to_host(d_x) + h_x = collect(d_x) @test h_x ≈ A'*y end end @@ -308,45 +308,45 @@ end @testset for elty in [Float32, Float64, Complex64, Complex128] alpha = convert(elty,2) A = rand(elty,m,n) - d_A = CudaArray(A) + d_A = CuArray(A) # test y = alpha*(A*x) x = rand(elty,n) - d_x = CudaArray(x) + d_x = CuArray(x) y1 = alpha*(A*x) y2 = A*x d_y1 = CUBLAS.gemv('N',alpha,d_A,d_x) d_y2 = CUBLAS.gemv('N',d_A,d_x) - h_y1 = to_host(d_y1) - h_y2 = to_host(d_y2) + h_y1 = collect(d_y1) + h_y2 = collect(d_y2) @test y1 ≈ h_y1 @test y2 ≈ h_y2 - @test y2 ≈ to_host(d_A * d_x) + @test y2 ≈ collect(d_A * d_x) # test x = alpha*(A.'*y) y = rand(elty,m) - d_y = CudaArray(y) + d_y = CuArray(y) x1 = alpha*(A.'*y) x2 = A.'*y d_x1 = CUBLAS.gemv('T',alpha,d_A,d_y) d_x2 = CUBLAS.gemv('T',d_A,d_y) - h_x1 = to_host(d_x1) - h_x2 = to_host(d_x2) + h_x1 = collect(d_x1) + h_x2 = collect(d_x2) @test x1 ≈ h_x1 @test x2 ≈ h_x2 - @test x2 ≈ to_host(d_A.' * d_y) + @test x2 ≈ collect(d_A.' 
* d_y) # test x = alpha*(A'*y) y = rand(elty,m) - d_y = CudaArray(y) + d_y = CuArray(y) x1 = alpha*(A'*y) x2 = A'*y d_x1 = CUBLAS.gemv('C',alpha,d_A,d_y) d_x2 = CUBLAS.gemv('C',d_A,d_y) - h_x1 = to_host(d_x1) - h_x2 = to_host(d_x2) + h_x1 = collect(d_x1) + h_x2 = collect(d_x2) @test y1 ≈ h_y1 @test y2 ≈ h_y2 - @test x2 ≈ to_host(d_A' * d_y) + @test x2 ≈ collect(d_A' * d_y) end end @@ -363,33 +363,33 @@ end A = bandex(A,kl,ku) # get packed format Ab = band(A,kl,ku) - d_Ab = CudaArray(Ab) + d_Ab = CuArray(Ab) # test y = alpha*A*x + beta*y x = rand(elty,n) - d_x = CudaArray(x) + d_x = CuArray(x) y = rand(elty,m) - d_y = CudaArray(y) + d_y = CuArray(y) CUBLAS.gbmv!('N',m,kl,ku,alpha,d_Ab,d_x,beta,d_y) BLAS.gbmv!('N',m,kl,ku,alpha,Ab,x,beta,y) - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y # test y = alpha*A.'*x + beta*y x = rand(elty,n) - d_x = CudaArray(x) + d_x = CuArray(x) y = rand(elty,m) - d_y = CudaArray(y) + d_y = CuArray(y) CUBLAS.gbmv!('T',m,kl,ku,alpha,d_Ab,d_y,beta,d_x) BLAS.gbmv!('T',m,kl,ku,alpha,Ab,y,beta,x) - h_x = to_host(d_x) + h_x = collect(d_x) @test x ≈ h_x # test y = alpha*A'*x + beta*y x = rand(elty,n) - d_x = CudaArray(x) + d_x = CuArray(x) y = rand(elty,m) - d_y = CudaArray(y) + d_y = CuArray(y) CUBLAS.gbmv!('C',m,kl,ku,alpha,d_Ab,d_y,beta,d_x) BLAS.gbmv!('C',m,kl,ku,alpha,Ab,y,beta,x) - h_x = to_host(d_x) + h_x = collect(d_x) @test x ≈ h_x end end @@ -406,14 +406,14 @@ end A = bandex(A,kl,ku) # get packed format Ab = band(A,kl,ku) - d_Ab = CudaArray(Ab) + d_Ab = CuArray(Ab) # test y = alpha*A*x x = rand(elty,n) - d_x = CudaArray(x) + d_x = CuArray(x) d_y = CUBLAS.gbmv('N',m,kl,ku,alpha,d_Ab,d_x) y = zeros(elty,m) y = BLAS.gbmv('N',m,kl,ku,alpha,Ab,x) - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -430,15 +430,15 @@ end x = rand(elty,m) y = rand(elty,m) # copy to device - d_A = CudaArray(A) - d_x = CudaArray(x) - d_y = CudaArray(y) + d_A = CuArray(A) + d_x = CuArray(x) + d_y = CuArray(y) # execute on host BLAS.symv!('U',alpha,A,x,beta,y) # execute on device CUBLAS.symv!('U',alpha,d_A,d_x,beta,d_y) # compare results - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -451,14 +451,14 @@ end # generate vectors x = rand(elty,m) # copy to device - d_A = CudaArray(A) - d_x = CudaArray(x) + d_A = CuArray(A) + d_x = CuArray(x) # execute on host y = BLAS.symv('U',A,x) # execute on device d_y = CUBLAS.symv('U',d_A,d_x) # compare results - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -475,15 +475,15 @@ end x = rand(elty,m) y = rand(elty,m) # copy to device - d_A = CudaArray(A) - d_x = CudaArray(x) - d_y = CudaArray(y) + d_A = CuArray(A) + d_x = CuArray(x) + d_y = CuArray(y) # execute on host BLAS.hemv!('U',alpha,A,x,beta,y) # execute on device CUBLAS.hemv!('U',alpha,d_A,d_x,beta,d_y) # compare results - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -496,14 +496,14 @@ end # generate vectors x = rand(elty,m) # copy to device - d_A = CudaArray(A) - d_x = CudaArray(x) + d_A = CuArray(A) + d_x = CuArray(x) # execute on host y = BLAS.hemv('U',A,x) # execute on device d_y = CUBLAS.hemv('U',d_A,d_x) # compare results - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -526,14 +526,14 @@ end x = rand(elty,m) y = rand(elty,m) # move to host - d_AB = CudaArray(AB) - d_x = CudaArray(x) - d_y = CudaArray(y) + d_AB = CuArray(AB) + d_x = CuArray(x) + d_y = CuArray(y) # sbmv! 
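# A sketch of what this call exercises (assuming the `band`/`bandex`
# helpers above pack the band the way standard BLAS 'U' storage expects):
# `d_AB` holds only the main diagonal plus `nbands` super-diagonals of the
# m-by-m symmetric matrix A, packed into an (nbands+1)-by-m array, and
# `sbmv!` computes
#     y <- alpha*A*x + beta*y
# from that packed form alone, which is why the host-side reference below
# rebuilds the result from the full matrix A.
#   e.g. for nbands = 1, row 2 of AB holds the diagonal and row 1 holds
#   the super-diagonal (with a leading unused slot in column 1).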
CUBLAS.sbmv!('U',nbands,alpha,d_AB,d_x,beta,d_y) y = alpha*(A*x) + beta*y # compare - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -556,13 +556,13 @@ end x = rand(elty,m) y = rand(elty,m) # move to host - d_AB = CudaArray(AB) - d_x = CudaArray(x) + d_AB = CuArray(AB) + d_x = CuArray(x) # sbmv! d_y = CUBLAS.sbmv('U',nbands,d_AB,d_x) y = A*x # compare - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -585,14 +585,14 @@ end x = rand(elty,m) y = rand(elty,m) # move to host - d_AB = CudaArray(AB) - d_x = CudaArray(x) - d_y = CudaArray(y) + d_AB = CuArray(AB) + d_x = CuArray(x) + d_y = CuArray(y) # hbmv! CUBLAS.hbmv!('U',nbands,alpha,d_AB,d_x,beta,d_y) y = alpha*(A*x) + beta*y # compare - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -615,13 +615,13 @@ end x = rand(elty,m) y = rand(elty,m) # move to host - d_AB = CudaArray(AB) - d_x = CudaArray(x) + d_AB = CuArray(AB) + d_x = CuArray(x) # hbmv d_y = CUBLAS.hbmv('U',nbands,d_AB,d_x) y = A*x # compare - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -639,13 +639,13 @@ end # construct x and y x = rand(elty,m) # move to host - d_AB = CudaArray(AB) - d_x = CudaArray(x) + d_AB = CuArray(AB) + d_x = CuArray(x) # tbmv! CUBLAS.tbmv!('U','N','N',nbands,d_AB,d_x) x = A*x # compare - h_x = to_host(d_x) + h_x = collect(d_x) @test x ≈ h_x end end @@ -663,13 +663,13 @@ end # construct x x = rand(elty,m) # move to host - d_AB = CudaArray(AB) - d_x = CudaArray(x) + d_AB = CuArray(AB) + d_x = CuArray(x) # tbmv! d_y = CUBLAS.tbmv!('U','N','N',nbands,d_AB,d_x) y = A*x # compare - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -687,13 +687,13 @@ end # generate vector x = rand(elty,m) # move to device - d_AB = CudaArray(AB) - d_x = CudaArray(x) + d_AB = CuArray(AB) + d_x = CuArray(x) #tbsv! CUBLAS.tbsv!('U','N','N',nbands,d_AB,d_x) x = A\x # compare - h_x = to_host(d_x) + h_x = collect(d_x) @test x ≈ h_x end end @@ -711,13 +711,13 @@ end # generate vector x = rand(elty,m) # move to device - d_AB = CudaArray(AB) - d_x = CudaArray(x) + d_AB = CuArray(AB) + d_x = CuArray(x) #tbsv d_y = CUBLAS.tbsv('U','N','N',nbands,d_AB,d_x) y = A\x # compare - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -730,13 +730,13 @@ end # generate vector x = rand(elty,m) # move to device - d_A = CudaArray(A) - d_x = CudaArray(x) + d_A = CuArray(A) + d_x = CuArray(x) # execute trmv! CUBLAS.trmv!('U','N','N',d_A,d_x) x = A*x # compare - h_x = to_host(d_x) + h_x = collect(d_x) @test x ≈ h_x end end @@ -749,13 +749,13 @@ end # generate vector x = rand(elty,m) # move to device - d_A = CudaArray(A) - d_x = CudaArray(x) + d_A = CuArray(A) + d_x = CuArray(x) # execute trmv! d_y = CUBLAS.trmv('U','N','N',d_A,d_x) y = A*x # compare - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -768,13 +768,13 @@ end # generate vector x = rand(elty,m) # move to device - d_A = CudaArray(A) - d_x = CudaArray(x) + d_A = CuArray(A) + d_x = CuArray(x) # execute trsv! CUBLAS.trsv!('U','N','N',d_A,d_x) x = A\x # compare - h_x = to_host(d_x) + h_x = collect(d_x) @test x ≈ h_x end end @@ -787,13 +787,13 @@ end # generate vector x = rand(elty,m) # move to device - d_A = CudaArray(A) - d_x = CudaArray(x) + d_A = CuArray(A) + d_x = CuArray(x) # execute trsv! 
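# Unlike the multiply routines above, `trsv` solves a triangular system:
# with ('U','N','N') it treats `d_A` as upper triangular, untransposed,
# with a non-unit diagonal, returning x such that A*x = b. The
# non-mutating wrapper defined in src/blas.jl calls `trsv!` on a `copy`
# of its input, so `d_x` still holds the original right-hand side
# afterwards, matching the host reference `y = A\x` below.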
d_y = CUBLAS.trsv('U','N','N',d_A,d_x) y = A\x # compare - h_y = to_host(d_y) + h_y = collect(d_y) @test y ≈ h_y end end @@ -806,14 +806,14 @@ end y = rand(elty,n) alpha = convert(elty,2) # move to device - d_A = CudaArray(A) - d_x = CudaArray(x) - d_y = CudaArray(y) + d_A = CuArray(A) + d_x = CuArray(x) + d_y = CuArray(y) # perform rank one update CUBLAS.ger!(alpha,d_x,d_y,d_A) A = (alpha*x)*y' + A # move to host and compare - h_A = to_host(d_A) + h_A = collect(d_A) @test A ≈ h_A end end @@ -826,13 +826,13 @@ end x = rand(elty,m) alpha = convert(elty,2) # move to device - d_A = CudaArray(A) - d_x = CudaArray(x) + d_A = CuArray(A) + d_x = CuArray(x) # perform rank one update CUBLAS.syr!('U',alpha,d_x,d_A) A = (alpha*x)*x.' + A # move to host and compare upper triangles - h_A = to_host(d_A) + h_A = collect(d_A) A = triu(A) h_A = triu(h_A) @test A ≈ h_A @@ -848,13 +848,13 @@ end x = rand(elty,m) alpha = convert(elty,2) # move to device - d_A = CudaArray(A) - d_x = CudaArray(x) + d_A = CuArray(A) + d_x = CuArray(x) # perform rank one update CUBLAS.her!('U',alpha,d_x,d_A) A = (alpha*x)*x' + A # move to host and compare upper triangles - h_A = to_host(d_A) + h_A = collect(d_A) A = triu(A) h_A = triu(h_A) @test A ≈ h_A @@ -871,14 +871,14 @@ end y = rand(elty,m) alpha = convert(elty,2) # move to device - d_A = CudaArray(A) - d_x = CudaArray(x) - d_y = CudaArray(y) + d_A = CuArray(A) + d_x = CuArray(x) + d_y = CuArray(y) # perform rank one update CUBLAS.her2!('U',alpha,d_x,d_y,d_A) A = (alpha*x)*y' + y*(alpha*x)' + A # move to host and compare upper triangles - h_A = to_host(d_A) + h_A = collect(d_A) A = triu(A) h_A = triu(h_A) @test A ≈ h_A @@ -896,15 +896,15 @@ end C1 = rand(elty,m,n) C2 = copy(C1) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) - d_C1 = CudaArray(C1) - d_C2 = CudaArray(C2) + d_A = CuArray(A) + d_B = CuArray(B) + d_C1 = CuArray(C1) + d_C2 = CuArray(C2) # C = (alpha*A)*B + beta*C CUBLAS.gemm!('N','N',alpha,d_A,d_B,beta,d_C1) A_mul_B!(d_C2, d_A, d_B) - h_C1 = to_host(d_C1) - h_C2 = to_host(d_C2) + h_C1 = collect(d_C1) + h_C2 = collect(d_C2) C1 = (alpha*A)*B + beta*C1 C2 = A*B # compare @@ -919,15 +919,15 @@ end A = rand(elty,m,k) B = rand(elty,k,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) # C = (alpha*A)*B + beta*C d_C = CUBLAS.gemm('N','N',d_A,d_B) C = A*B C2 = d_A * d_B # compare - h_C = to_host(d_C) - h_C2 = to_host(C2) + h_C = collect(d_C) + h_C2 = collect(C2) @test C ≈ h_C @test C ≈ h_C2 end @@ -943,19 +943,19 @@ end B = [rand(elty,k,n) for i in 1:10] C = [rand(elty,m,n) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] - d_B = CudaArray{elty, 2}[] - d_C = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] + d_B = CuArray{elty, 2}[] + d_C = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) - push!(d_B,CudaArray(B[i])) - push!(d_C,CudaArray(C[i])) + push!(d_A,CuArray(A[i])) + push!(d_B,CuArray(B[i])) + push!(d_C,CuArray(C[i])) end # C = (alpha*A)*B + beta*C CUBLAS.gemm_batched!('N','N',alpha,d_A,d_B,beta,d_C) for i in 1:length(d_C) C[i] = (alpha*A[i])*B[i] + beta*C[i] - h_C = to_host(d_C[i]) + h_C = collect(d_C[i]) #compare @test C[i] ≈ h_C end @@ -968,17 +968,17 @@ end A = [rand(elty,m,k) for i in 1:10] B = [rand(elty,k,n) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] - d_B = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] + d_B = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A, CudaArray(A[i])) - push!(d_B, CudaArray(B[i])) + push!(d_A, CuArray(A[i])) + push!(d_B, 
CuArray(B[i])) end # C = A*B d_C = CUBLAS.gemm_batched('N','N',d_A,d_B) for i in 1:length(A) C = A[i]*B[i] - h_C = to_host(d_C[i]) + h_C = collect(d_C[i]) @test C ≈ h_C end end @@ -995,14 +995,14 @@ end B = rand(elty,m,n) C = rand(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) - d_C = CudaArray(C) + d_A = CuArray(A) + d_B = CuArray(B) + d_C = CuArray(C) # C = (alpha*A)*B + beta*C CUBLAS.symm!('L','U',alpha,d_A,d_B,beta,d_C) C = (alpha*A)*B + beta*C # compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C end end @@ -1014,13 +1014,13 @@ end A = A + A.' B = rand(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) # C = (alpha*A)*B + beta*C d_C = CUBLAS.symm('L','U',d_A,d_B) C = A*B # compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C end end @@ -1035,14 +1035,14 @@ end alpha = rand(elty) beta = rand(elty) # move to device - d_A = CudaArray(A) - d_C = CudaArray(C) + d_A = CuArray(A) + d_C = CuArray(C) # C = (alpha*A)*A.' + beta*C CUBLAS.syrk!('U','N',alpha,d_A,beta,d_C) C = (alpha*A)*A.' + beta*C C = triu(C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) h_C = triu(C) @test C ≈ h_C end @@ -1053,13 +1053,13 @@ end # generate matrices A = rand(elty,m,k) # move to device - d_A = CudaArray(A) + d_A = CuArray(A) # C = A*A.' d_C = CUBLAS.syrk('U','N',d_A) C = A*A.' C = triu(C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) h_C = triu(C) @test C ≈ h_C end @@ -1075,13 +1075,13 @@ end alpha = rand(elty) beta = rand(elty) # move to device - d_A = CudaArray(A) - d_C = CudaArray(C) + d_A = CuArray(A) + d_C = CuArray(C) CUBLAS.herk!('U','N',alpha,d_A,beta,d_C) C = alpha*(A*A') + beta*C C = triu(C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) h_C = triu(C) @test C ≈ h_C end @@ -1092,13 +1092,13 @@ end # generate matrices A = rand(elty,m,k) # move to device - d_A = CudaArray(A) + d_A = CuArray(A) # C = A*A' d_C = CUBLAS.herk('U','N',d_A) C = A*A' C = triu(C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) h_C = triu(C) @test C ≈ h_C end @@ -1117,16 +1117,16 @@ end C = rand(elty,m,m) C = C + C.' # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) - d_C = CudaArray(C) + d_A = CuArray(A) + d_B = CuArray(B) + d_C = CuArray(C) # compute #C = alpha*(A*B.') + conj(alpha)*(B*A.') + beta*C C = alpha*(A*B.' + B*A.') + beta*C CUBLAS.syr2k!('U','N',alpha,d_A,d_B,beta,d_C) # move back to host and compare C = triu(C) - h_C = to_host(d_C) + h_C = collect(d_C) h_C = triu(h_C) @test C ≈ h_C @@ -1141,15 +1141,15 @@ end A = rand(elty,m,k) B = rand(elty,m,k) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) # compute #C = alpha*(A*B.') + conj(alpha)*(B*A.') + beta*C C = alpha*(A*B.' 
+ B*A.') d_C = CUBLAS.syr2k('U','N',alpha,d_A,d_B) # move back to host and compare C = triu(C) - h_C = to_host(d_C) + h_C = collect(d_C) h_C = triu(h_C) @test C ≈ h_C end @@ -1166,16 +1166,16 @@ end C = rand(elty1,m,m) C = C + C' # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) - d_C = CudaArray(C) + d_A = CuArray(A) + d_B = CuArray(B) + d_C = CuArray(C) # compute #C = alpha*(A*B') + conj(alpha)*(B*A') + beta*C C = alpha*(A*B') + conj(alpha)*(B*A') + beta*C CUBLAS.her2k!('U','N',alpha,d_A,d_B,beta,d_C) # move back to host and compare C = triu(C) - h_C = to_host(d_C) + h_C = collect(d_C) h_C = triu(h_C) @test C ≈ h_C end @@ -1187,14 +1187,14 @@ end A = rand(elty,m,k) B = rand(elty,m,k) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) # compute C = A*B' + B*A' d_C = CUBLAS.her2k('U','N',d_A,d_B) # move back to host and compare C = triu(C) - h_C = to_host(d_C) + h_C = collect(d_C) h_C = triu(h_C) @test C ≈ h_C end @@ -1210,14 +1210,14 @@ end B = rand(elty,m,n) C = zeros(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) - d_C = CudaArray(C) + d_A = CuArray(A) + d_B = CuArray(B) + d_C = CuArray(C) # compute C = alpha*A*B CUBLAS.trmm!('L','U','N','N',alpha,d_A,d_B,d_C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C end end @@ -1231,13 +1231,13 @@ end A = triu(A) B = rand(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) # compute C = alpha*A*B d_C = CUBLAS.trmm('L','U','N','N',alpha,d_A,d_B) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C end end @@ -1251,13 +1251,13 @@ end A = triu(A) B = rand(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) # compute C = alpha*(A\B) CUBLAS.trsm!('L','U','N','N',alpha,d_A,d_B) # move to host and compare - h_C = to_host(d_B) + h_C = collect(d_B) @test C ≈ h_C end end @@ -1271,13 +1271,13 @@ end A = triu(A) B = rand(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) # compute C = alpha*(A\B) d_C = CUBLAS.trsm('L','U','N','N',alpha,d_A,d_B) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C end end @@ -1291,18 +1291,18 @@ end map!((x) -> triu(x), A, A) B = [rand(elty,m,n) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] - d_B = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] + d_B = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) - push!(d_B,CudaArray(B[i])) + push!(d_A,CuArray(A[i])) + push!(d_B,CuArray(B[i])) end # compute CUBLAS.trsm_batched!('L','U','N','N',alpha,d_A,d_B) # move to host and compare for i in 1:length(d_B) C = alpha*(A[i]\B[i]) - h_C = to_host(d_B[i]) + h_C = collect(d_B[i]) #compare @test C ≈ h_C end @@ -1318,18 +1318,18 @@ end map!((x) -> triu(x), A, A) B = [rand(elty,m,n) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] - d_B = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] + d_B = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) - push!(d_B,CudaArray(B[i])) + push!(d_A,CuArray(A[i])) + push!(d_B,CuArray(B[i])) end # compute d_C = CUBLAS.trsm_batched('L','U','N','N',alpha,d_A,d_B) # move to host and compare for i in 1:length(d_C) C = alpha*(A[i]\B[i]) - h_C = to_host(d_C[i]) + h_C = collect(d_C[i]) @test C ≈ h_C end end @@ -1347,14 +1347,14 @@ end B = rand(elty,m,n) C = rand(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = 
CudaArray(B) - d_C = CudaArray(C) + d_A = CuArray(A) + d_B = CuArray(B) + d_C = CuArray(C) # compute C = alpha*(A*B) + beta*C CUBLAS.hemm!('L','L',alpha,d_A,d_B,beta,d_C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C end end @@ -1369,13 +1369,13 @@ end @test ishermitian(A) B = rand(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) # compute C = alpha*(A*B) d_C = CUBLAS.hemm('L','U',alpha,d_A,d_B) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C end end @@ -1390,39 +1390,39 @@ end B = rand(elty,m,n) C = zeros(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) - d_C = CudaArray(C) + d_A = CuArray(A) + d_B = CuArray(B) + d_C = CuArray(C) # compute C = alpha*A + beta*B CUBLAS.geam!('N','N',alpha,d_A,beta,d_B,d_C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C #test in place versions too C = rand(elty,m,n) - d_C = CudaArray(C) + d_C = CuArray(C) C = alpha*C + beta*B CUBLAS.geam!('N','N',alpha,d_C,beta,d_B,d_C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C C = rand(elty,m,n) - d_C = CudaArray(C) + d_C = CuArray(C) C = alpha*A + beta*C CUBLAS.geam!('N','N',alpha,d_A,beta,d_C,d_C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C #test setting C to zero C = rand(elty,m,n) - d_C = CudaArray(C) + d_C = CuArray(C) alpha = zero(elty) beta = zero(elty) CUBLAS.geam!('N','N',alpha,d_A,beta,d_B,d_C) - h_C = to_host(d_C) + h_C = collect(d_C) @test h_C ≈ zeros(elty,m,n) # bounds checking @@ -1441,32 +1441,33 @@ end A = rand(elty,m,n) B = rand(elty,m,n) # move to device - d_A = CudaArray(A) - d_B = CudaArray(B) + d_A = CuArray(A) + d_B = CuArray(B) C = zeros(elty,m,n) # compute C = alpha*A + beta*B d_C = CUBLAS.geam('N','N',alpha,d_A,beta,d_B) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C end end @testset "getrf_batched!" 
begin @testset for elty in [Float32, Float64, Complex64, Complex128] + local k # generate matrices A = [rand(elty,m,m) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) + push!(d_A,CuArray(A[i])) end pivot, info = CUBLAS.getrf_batched!(d_A, false) - h_info = to_host(info) + h_info = collect(info) for As in 1:length(d_A) C = lufact!(copy(A[As]), Val{false}) # lufact(A[As],pivot=false) - h_A = to_host(d_A[As]) + h_A = collect(d_A[As]) #reconstruct L,U dL = eye(elty,m) dU = zeros(elty,(m,m)) @@ -1480,14 +1481,14 @@ end @test isapprox(C[:U], dU, rtol=1e-2) end for i in 1:length(A) - d_A[ i ] = CudaArray(A[i]) + d_A[ i ] = CuArray(A[i]) end pivot, info = CUBLAS.getrf_batched!(d_A, true) - h_info = to_host(info) - h_pivot = to_host(pivot) + h_info = collect(info) + h_pivot = collect(pivot) for As in 1:length(d_A) C = lufact(A[As]) - h_A = to_host(d_A[As]) + h_A = collect(d_A[As]) #reconstruct L,U dL = eye(elty,m) dU = zeros(elty,(m,m)) @@ -1512,18 +1513,19 @@ end @testset "getrf_batched" begin @testset for elty in [Float32, Float64, Complex64, Complex128] + local k # generate matrices A = [rand(elty,m,m) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) + push!(d_A,CuArray(A[i])) end pivot, info, d_B = CUBLAS.getrf_batched(d_A, false) - h_info = to_host(info) + h_info = collect(info) for Bs in 1:length(d_B) C = lufact!(copy(A[Bs]),Val{false}) # lufact(A[Bs],pivot=false) - h_B = to_host(d_B[Bs]) + h_B = collect(d_B[Bs]) #reconstruct L,U dL = eye(elty,m) dU = zeros(elty,(m,m)) @@ -1544,20 +1546,20 @@ end # generate matrices A = [rand(elty,m,m) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) + push!(d_A,CuArray(A[i])) end pivot, info = CUBLAS.getrf_batched!(d_A, true) - h_info = to_host(info) + h_info = collect(info) for Cs in 1:length(h_info) @test h_info[Cs] == 0 end pivot, info, d_C = CUBLAS.getri_batched(d_A, pivot) - h_info = to_host(info) + h_info = collect(info) for Cs in 1:length(d_C) C = inv(A[Cs]) - h_C = to_host(d_C[Cs]) + h_C = collect(d_C[Cs]) @test h_info[Cs] == 0 @test C ≈ h_C end @@ -1569,14 +1571,14 @@ end # generate matrices A = [rand(elty,m,m) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) + push!(d_A,CuArray(A[i])) end info, d_C = CUBLAS.matinv_batched(d_A) for Cs in 1:length(d_C) C = inv(A[Cs]) - h_C = to_host(d_C[Cs]) + h_C = collect(d_C[Cs]) @test C ≈ h_C end end @@ -1587,15 +1589,15 @@ end # generate matrices A = [rand(elty,m,n) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) + push!(d_A,CuArray(A[i])) end tau, d_A = CUBLAS.geqrf_batched!(d_A) for As in 1:length(d_A) C = qrfact(A[As]) - h_A = to_host(d_A[As]) - h_tau = to_host(tau[As]) + h_A = collect(d_A[As]) + h_tau = collect(tau[As]) # build up Q Q = eye(elty,min(m,n)) for i in 1:min(m,n) @@ -1614,15 +1616,15 @@ end # generate matrices A = [rand(elty,m,n) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) + push!(d_A,CuArray(A[i])) end tau, d_B = CUBLAS.geqrf_batched!(d_A) for Bs in 1:length(d_B) C = qrfact(A[Bs]) - h_B = to_host(d_B[Bs]) - h_tau = to_host(tau[Bs]) + h_B = collect(d_B[Bs]) + h_tau 
= collect(tau[Bs]) # build up Q Q = eye(elty,min(m,n)) for i in 1:min(m,n) @@ -1642,16 +1644,16 @@ end A = [rand(elty,n,n) for i in 1:10] C = [rand(elty,n,k) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] - d_C = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] + d_C = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) - push!(d_C,CudaArray(C[i])) + push!(d_A,CuArray(A[i])) + push!(d_C,CuArray(C[i])) end d_A, d_C, info = CUBLAS.gels_batched!('N',d_A, d_C) for Cs in 1:length(d_C) X = A[Cs]\C[Cs] - h_C = to_host(d_C[Cs]) + h_C = collect(d_C[Cs]) @test X≈h_C end end @@ -1663,16 +1665,16 @@ end A = [rand(elty,n,n) for i in 1:10] C = [rand(elty,n,k) for i in 1:10] # move to device - d_A = CudaArray{elty, 2}[] - d_C = CudaArray{elty, 2}[] + d_A = CuArray{elty, 2}[] + d_C = CuArray{elty, 2}[] for i in 1:length(A) - push!(d_A,CudaArray(A[i])) - push!(d_C,CudaArray(C[i])) + push!(d_A,CuArray(A[i])) + push!(d_C,CuArray(C[i])) end d_B, d_D, info = CUBLAS.gels_batched('N',d_A, d_C) for Ds in 1:length(d_D) X = A[Ds]\C[Ds] - h_D = to_host(d_D[Ds]) + h_D = collect(d_D[Ds]) @test X ≈ h_D end end @@ -1685,19 +1687,19 @@ end C = rand(elty,m,n) X = rand(elty,m) # move to device - d_A = CudaArray(A) - d_C = CudaArray(C) - d_X = CudaArray(X) + d_A = CuArray(A) + d_C = CuArray(C) + d_X = CuArray(X) # compute C = diagm(X) * A CUBLAS.dgmm!('L',d_A,d_X,d_C) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C # bounds checking @test_throws DimensionMismatch CUBLAS.dgmm!('R',d_A,d_X,d_C) A = rand(elty,m,m) - d_A = CudaArray(A) + d_A = CuArray(A) @test_throws DimensionMismatch CUBLAS.dgmm!('L',d_A,d_X,d_C) end end @@ -1708,13 +1710,13 @@ end A = rand(elty,m,n) X = rand(elty,m) # move to device - d_A = CudaArray(A) - d_X = CudaArray(X) + d_A = CuArray(A) + d_X = CuArray(X) # compute C = diagm(X) * A d_C = CUBLAS.dgmm('L',d_A,d_X) # move to host and compare - h_C = to_host(d_C) + h_C = collect(d_C) @test C ≈ h_C end end From 42f29c6e782feb194dee56f001bf41030945035f Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 10 Aug 2017 12:03:39 +0100 Subject: [PATCH 6/7] clarify purpose --- src/CUBLAS.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/CUBLAS.jl b/src/CUBLAS.jl index 4ca81ac..59929f5 100644 --- a/src/CUBLAS.jl +++ b/src/CUBLAS.jl @@ -15,8 +15,6 @@ using CUDAdrv: OwnedPtr, CuArray, CuVector, CuMatrix CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}} -const cudaStream_t = Ptr{Void} - const BlasChar = Char #import Base.LinAlg.BlasChar import Base.one import Base.zero @@ -80,6 +78,9 @@ if isempty(libcublas) error("CUBLAS library cannot be found. Please make sure that CUDA is installed") end +# Typedef needed by libcublas +const cudaStream_t = Ptr{Void} + include("libcublas.jl") # setup cublas handle From 05b68d01690409c253b8837839c316bf4d9f8bdd Mon Sep 17 00:00:00 2001 From: Mike J Innes Date: Thu, 10 Aug 2017 12:23:42 +0100 Subject: [PATCH 7/7] this is unnecessary --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 0f1196d..9c315a5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,6 @@ import Base.LinAlg.BLAS using CUBLAS -using CUDAdrv: OwnedPtr, CuArray, CuVector, CuMatrix +using CUDAdrv: CuArray, CuVector, CuMatrix using Base.Test @testset "CUBLAS" begin
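
For reference, a minimal sketch of the usage idiom the updated tests now
follow: device upload goes through CUDAdrv's CuArray constructor and
download back to the host through collect, replacing CUDArt's CudaArray
and to_host. This is an illustration distilled from the test changes
above, not part of the patches themselves; it assumes the CUDAdrv
0.5-era API pinned in REQUIRE, and the gemv call and element type are
taken from the tests.

    # Sketch only (Julia 0.5/0.6 era, CUDAdrv-backed CUBLAS).
    using CUBLAS
    using CUDAdrv: CuArray

    A = rand(Float32, 4, 4)
    x = rand(Float32, 4)
    d_A = CuArray(A)                   # upload: was CudaArray(A) under CUDArt
    d_x = CuArray(x)
    d_y = CUBLAS.gemv('N', d_A, d_x)   # computes y = A*x on the device
    h_y = collect(d_y)                 # download: was to_host(d_y)
    @assert h_y ≈ A*x

Because CuArray already supports collect for device-to-host copies,
to_host disappears entirely rather than being aliased, which is why the
final patch can also drop OwnedPtr from the test imports.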