diff --git a/Project.toml b/Project.toml index 762e51e..b889938 100644 --- a/Project.toml +++ b/Project.toml @@ -1,33 +1,13 @@ name = "ITensorGPU" uuid = "d89171c1-af8f-46b3-badf-d2a472317c15" authors = ["Katharine Hyatt", "Matthew Fishman "] -version = "0.1.7" +version = "0.2.0" [deps] -Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" -Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" ITensors = "9136182c-28ba-11e9-034c-db9fb085ebd5" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -NDTensors = "23ae76d9-e61a-49c4-8f12-3f1a16adf9cf" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SimpleTraits = "699a6c99-e7fa-54fc-8d76-47d257e15c1d" -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67" -TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [compat] -Adapt = "3.5, 4" -CUDA = "4.0" -Combinatorics = "1.0.2" -Functors = "0.2, 0.3, 0.4" -ITensors = "= 0.3.37" -NDTensors = "0.1.50" -SimpleTraits = "0.9.4" -StaticArrays = "1.2.13" -Strided = "1.1.2, 2" -TimerOutputs = "0.5.13" -cuTENSOR = "1.1.0" -julia = "1.6 - 1.9" +CUDA = "4, 5" +ITensors = "0.3, 0.4, 0.5" +julia = "1.6" diff --git a/README.md b/README.md index 40238be..b0578b0 100644 --- a/README.md +++ b/README.md @@ -1,50 +1,3 @@ -# ITensorGPU: Intelligent Tensors with GPU acceleration - -[![codecov](https://codecov.io/gh/ITensor/ITensorGPU.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/ITensor/ITensorGPU.jl) - -[![gitlab-ci](https://gitlab.com/JuliaGPU/ITensorGPU-jl/badges/master/pipeline.svg)](https://gitlab.com/JuliaGPU/ITensorGPU-jl/commits/master) - -This package extends the functionality of [ITensors.jl](https://github.com/ITensor/ITensors.jl) to make use of CUDA-enabled GPUs to accelerate tensor contractions and factorizations. It sits on top of the wonderful [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) package and uses NVIDIA's [cuTENSOR](https://developer.nvidia.com/cutensor) library for high-performance tensor operations. - -## Installing ITensorGPU.jl - -Dependencies: - - [Julia 1.3 or later](https://julialang.org/downloads/) - - [CUDA 10.1 or later](https://developer.nvidia.com/cuda-downloads) -- Currently only NVIDIA GPUs are supported. NVIDIA drivers are required so that Julia can make use of the NVIDIA GPU on your system. - - [cuTENSOR v1.0.0 or later](https://developer.nvidia.com/cutensor) -- A specialized library for perfoming permutation-free tensor contractions on the GPU. `libcutensor.so` needs to be in your `LD_LIBRARY_PATH` so that `CUDA.jl` will be able to find it. - - [ITensors.jl](https://itensor.github.io/ITensors.jl/stable/#Installation-1) - -To properly install CUDA with Julia, it may be helpful to first follow the [CUDA.jl installation instructions](https://juliagpu.github.io/CUDA.jl/stable/installation/overview/) and test that you have that installed properly and that it is able to use `cuTENSOR`. You can run the commands: -```julia -julia> using CUDA.CUTENSOR - -julia> CUTENSOR.has_cutensor() -true - -julia> CUTENSOR.version() -v"1.2.1" -``` -to check that `CUDA.jl` can see the version of `cuTENSOR` you have installed. 
- -Once you have all of the dependencies installed, you can then go ahead and install `ITensorGPU.jl` with the following command: -``` -julia> ] - -pkg> add ITensorGPU -``` - -To check if this has all worked, you can run the package tests using: -```julia -julia> ] - -pkg> test ITensorGPU -``` - -## Examples - -Take a look at the `examples/` directory for examples of running ITensor calculations on the GPU. - -For an application of `ITensorGPU.jl` to more sophisticated tensor network calculations, take a look at [PEPS.jl](https://github.com/ITensor/PEPS.jl). - -For some background on the development and design of this package, you can take a look at [this blog post](https://kshyatt.github.io/post/itensorsgpu/) by Katie Hyatt, original author of the `ITensorGPU.jl` package. - +| :warning: WARNING | +|:---------------------------| +| The ITensorGPU.jl package is deprecated and only provided for backwards compatibility. For an alternative, see the ITensors.jl documentation about [running ITensor on GPUs](https://itensor.github.io/ITensors.jl/dev/RunningOnGPUs.html). | diff --git a/src/ITensorGPU.jl b/src/ITensorGPU.jl index 93be4c9..af8696e 100644 --- a/src/ITensorGPU.jl +++ b/src/ITensorGPU.jl @@ -1,114 +1,29 @@ module ITensorGPU -using NDTensors - -using Adapt -using CUDA -using CUDA.CUBLAS -using CUDA.CUSOLVER -using Functors -using ITensors -using LinearAlgebra -using Random -using SimpleTraits -using StaticArrays -using Strided -using TimerOutputs -using cuTENSOR - -using NDTensors: setdata, setstorage, cpu, IsWrappedArray, parenttype - -import Adapt: adapt_structure -import Base: *, permutedims! -import CUDA: CuArray, CuMatrix, CuVector, cu -import CUDA.Mem: pin -import ITensors: - randn!, - compute_contraction_labels, - eigen, - tensor, - scale!, - unioninds, - array, - matrix, - vector, - polar, - tensors, - truncate!, - leftlim, - rightlim, - permute, - BroadcastStyle, - Indices -import NDTensors: - Atrans, - Btrans, - CombinerTensor, - ContractionProperties, - Combiner, - Ctrans, - Diag, - DiagTensor, - Dense, - DenseTensor, - NonuniformDiag, - NonuniformDiagTensor, - Tensor, - UniformDiag, - UniformDiagTensor, - _contract!!, - _contract!, - _contract_scalar!, - _contract_scalar_noperm!, - can_contract, - compute_contraction_properties!, - contract!!, - contract!, - contract, - contraction_output, - contraction_output_type, - data, - getperm, - ind, - is_trivial_permutation, - outer!, - outer!!, - permutedims!!, - set_eltype, - set_ndims, - similartype, - zero_contraction_output -import cuTENSOR: cutensorContractionPlan_t, cutensorAlgo_t - -#const ContractionPlans = Dict{String, Tuple{cutensorAlgo_t, cutensorContractionPlan_t}}() -const ContractionPlans = Dict{String,cutensorAlgo_t}() - -include("cuarray/set_types.jl") -include("traits.jl") -include("adapt.jl") -include("tensor/cudense.jl") -include("tensor/dense.jl") -include("tensor/culinearalgebra.jl") -include("tensor/cutruncate.jl") -include("tensor/cucombiner.jl") -include("tensor/cudiag.jl") -include("cuitensor.jl") -include("mps/cumps.jl") - -export cu, - cpu, cuITensor, randomCuITensor, cuMPS, randomCuMPS, productCuMPS, randomCuMPO, cuMPO - -## TODO: Is this needed? 
-## const devs = Ref{Vector{CUDAdrv.CuDevice}}()
-## const dev_rows = Ref{Int}(0)
-## const dev_cols = Ref{Int}(0)
-## function __init__()
-##   voltas = filter(dev->occursin("V100", CUDAdrv.name(dev)), collect(CUDAdrv.devices()))
-##   pascals = filter(dev->occursin("P100", CUDAdrv.name(dev)), collect(CUDAdrv.devices()))
-##   devs[] = voltas[1:1]
-##   #devs[] = pascals[1:2]
-##   CUBLASMG.cublasMgDeviceSelect(CUBLASMG.mg_handle(), length(devs[]), devs[])
-##   dev_rows[] = 1
-##   dev_cols[] = 1
-## end
-
-end #module
+using CUDA: CUDA
+using ITensors: cpu, cu
+export cpu, cu
+
+using ITensors: ITensor, randomITensor
+function cuITensor(args...; kwargs...)
+  return cu(ITensor(args...; kwargs...))
+end
+function randomCuITensor(args...; kwargs...)
+  return cu(randomITensor(args...; kwargs...))
+end
+export cuITensor, randomCuITensor
+
+using ITensors.ITensorMPS: MPO, MPS, randomMPO, randomMPS
+function cuMPS(args...; kwargs...)
+  return cu(MPS(args...; kwargs...))
+end
+function productCuMPS(args...; kwargs...)
+  return cu(MPS(args...; kwargs...))
+end
+function randomCuMPS(args...; kwargs...)
+  return cu(randomMPS(args...; kwargs...))
+end
+function cuMPO(args...; kwargs...)
+  return cu(MPO(args...; kwargs...))
+end
+function randomCuMPO(args...; kwargs...)
+  return cu(randomMPO(args...; kwargs...))
+end
+export cuMPO, cuMPS, productCuMPS, randomCuMPO, randomCuMPS
+end
diff --git a/src/adapt.jl b/src/adapt.jl
deleted file mode 100644
index fe40c14..0000000
--- a/src/adapt.jl
+++ /dev/null
@@ -1,33 +0,0 @@
-#
-# Used to adapt `EmptyStorage` types
-#
-
-function NDTensors.cu(eltype::Type{<:Number}, x)
-  return fmap(x -> adapt(CuVector{eltype,default_buffertype()}, x), x)
-end
-NDTensors.cu(x) = fmap(x -> adapt(CuArray, x), x)
-
-function NDTensors.set_eltype_if_unspecified(
-  arraytype::Type{CuVector{T}}, eltype::Type
-) where {T}
-  return arraytype
-end
-function NDTensors.set_eltype_if_unspecified(arraytype::Type{CuVector}, eltype::Type)
-  return CuVector{eltype}
-end
-
-# Overload `CUDA.cu` for convenience
-const ITensorType = Union{
-  TensorStorage,Tensor,ITensor,Array{ITensor},Array{<:Array{ITensor}},MPS,MPO
-}
-CUDA.cu(x::ITensorType) = NDTensors.cu(x)
-CUDA.cu(eltype::Type{<:Number}, x::ITensorType) = NDTensors.cu(eltype, x)
-
-function NDTensors.adapt_storagetype(
-  to::Type{<:CUDA.CuArray}, x::Type{<:NDTensors.EmptyStorage}
-)
-  store = NDTensors.storagetype(x)
-  return NDTensors.emptytype(
-    NDTensors.set_datatype(store, CuVector{eltype(store),default_buffertype()})
-  )
-end
diff --git a/src/cuarray/set_types.jl b/src/cuarray/set_types.jl
deleted file mode 100644
index 21beda4..0000000
--- a/src/cuarray/set_types.jl
+++ /dev/null
@@ -1,17 +0,0 @@
-buffertype(datatype::Type{<:CuArray{<:Any,<:Any,B}}) where {B} = B
-function buffertype(datatype::Type{<:CuArray})
-  println(
-    "CuArray definitions require a CUDA.Mem buffer try $(datatype{default_buffertype()})"
-  )
-  throw(TypeError)
-end
-
-default_buffertype() = CUDA.Mem.DeviceBuffer
-
-function set_eltype(arraytype::Type{<:CuArray}, eltype::Type)
-  return CuArray{eltype,ndims(arraytype),buffertype(arraytype)}
-end
-
-function set_ndims(arraytype::Type{<:CuArray}, ndims)
-  return CuArray{eltype(arraytype),ndims,buffertype(arraytype)}
-end
diff --git a/src/cuitensor.jl b/src/cuitensor.jl
deleted file mode 100644
index 667aa28..0000000
--- a/src/cuitensor.jl
+++ /dev/null
@@ -1,127 +0,0 @@
-import ITensors.NDTensors: NeverAlias, AliasStyle, AllowAlias
-import ITensors: ITensor
-import CUDA: CuArray
-
-function cuITensor(eltype::Type{<:Number}, inds::IndexSet)
-  return itensor(
-
NDTensors.default_storagetype(CuVector{eltype,default_buffertype()})(dim(inds)), inds - ) -end -cuITensor(::Type{T}, inds::Index...) where {T<:Number} = cuITensor(T, IndexSet(inds...)) - -cuITensor(is::IndexSet) = cuITensor(Float64, is) -cuITensor(inds::Index...) = cuITensor(IndexSet(inds...)) - -cuITensor() = ITensor() -function cuITensor(x::S, inds::IndexSet{N}) where {S<:Number,N} - d = NDTensors.Dense{S,CuVector{S,default_buffertype()}}( - fill!(CuVector{S,default_buffertype()}(undef, dim(inds)), x) - ) - return ITensor(d, inds) -end -cuITensor(x::S, inds::Index...) where {S<:Number} = cuITensor(x, IndexSet(inds...)) - -function ITensor( - as::AliasStyle, - eltype::Type{<:Number}, - A::CuArray{<:Number}, - inds::Indices{Index{Int}}; - kwargs..., -) - length(A) ≠ dim(inds) && throw( - DimensionMismatch( - "In ITensor(::CuArray, inds), length of AbstractArray ($(length(A))) must match total dimension of IndexSet ($(dim(inds)))", - ), - ) - data = CuArray{eltype}(as, A) - return itensor(Dense(data), inds) -end - -# Fix ambiguity error -function ITensor( - as::NDTensors.AliasStyle, eltype::Type{<:Number}, A::CuArray{<:Number}, inds::Tuple{} -) - length(A) ≠ dim(inds) && throw( - DimensionMismatch( - "In ITensor(::CuArray, inds), length of AbstractArray ($(length(A))) must match total dimension of IndexSet ($(dim(inds)))", - ), - ) - data = CuArray{eltype}(as, A) - return itensor(Dense(data), inds) -end - -# Helper functions for different view behaviors -CuArray{ElT,N}(::NeverAlias, A::AbstractArray) where {ElT,N} = CuArray{ElT,N}(A) -function CuArray{ElT,N}(::AllowAlias, A::AbstractArray) where {ElT,N} - return convert(CuArray{ElT,N}, A) -end -function CuArray{ElT}(as::AliasStyle, A::AbstractArray{ElTA,N}) where {ElT,N,ElTA} - return CuArray{ElT,N}(as, A) -end - -# TODO: Change to: -# (Array{ElT, N} where {ElT})([...]) = [...] -# once support for `VERSION < v"1.6"` is dropped. -# Previous to Julia v1.6 `where` syntax couldn't be used in a function name -function CuArray{<:Any,N}(as::AliasStyle, A::AbstractArray{ElTA,N}) where {N,ElTA} - return CuArray{ElTA,N}(as, A) -end - -cuITensor(data::Array, inds...) = cu(ITensor(data, inds...)) - -cuITensor(data::CuArray, inds...) = ITensor(data, inds...) - -cuITensor(A::ITensor) = cu(A) - -function randomCuITensor(::Type{S}, inds::Indices) where {S<:Real} - T = cuITensor(S, inds) - randn!(T) - return T -end -function randomCuITensor(::Type{S}, inds::Indices) where {S<:Complex} - Tr = cuITensor(real(S), inds) - Ti = cuITensor(real(S), inds) - randn!(Tr) - randn!(Ti) - return complex(Tr) + im * Ti -end -function randomCuITensor(::Type{S}, inds::Index...) where {S<:Number} - return randomCuITensor(S, IndexSet(inds...)) -end -randomCuITensor(inds::IndexSet) = randomCuITensor(Float64, inds) -randomCuITensor(inds::Index...) = randomCuITensor(Float64, IndexSet(inds...)) - -CuArray(T::ITensor) = CuArray(tensor(T)) - -function CuArray{ElT,N}(T::ITensor, is::Vararg{Index,N}) where {ElT,N} - ndims(T) != N && throw( - DimensionMismatch( - "cannot convert an $(ndims(T)) dimensional ITensor to an $N-dimensional CuArray." - ), - ) - TT = tensor(permute(T, is...; allow_alias=true)) - return CuArray{ElT,N}(TT)::CuArray{ElT,N} -end - -function CuArray{ElT}(T::ITensor, is::Vararg{Index,N}) where {ElT,N} - return CuArray{ElT,N}(T, is...) 
-end - -function CuArray(T::ITensor, is::Vararg{Index,N}) where {N} - return CuArray{eltype(T),N}(T, is...)::CuArray{<:Number,N} -end - -CUDA.CuMatrix(A::ITensor) = CuArray(A) - -function CuVector(A::ITensor) - if ndims(A) != 1 - throw(DimensionMismatch("Vector() expected a 1-index ITensor")) - end - return CuArray(A) -end - -function CuMatrix(T::ITensor, i1::Index, i2::Index) - ndims(T) != 2 && - throw(DimensionMismatch("ITensor must be order 2 to convert to a Matrix")) - return CuArray(T, i1, i2) -end diff --git a/src/mps/cumps.jl b/src/mps/cumps.jl deleted file mode 100644 index f59099f..0000000 --- a/src/mps/cumps.jl +++ /dev/null @@ -1,13 +0,0 @@ -# cu(ψ::Union{MPS,MPO}) = map(cu, ψ) -# cpu(ψ::Union{MPS,MPO}) = map(cpu, ψ) - -cuMPS(ψ::MPS) = cu(ψ) -cuMPS(args...; kwargs...) = cu(MPS(args...; kwargs...)) -randomCuMPS(args...; kwargs...) = cu(randomMPS(args...; kwargs...)) - -# For backwards compatibility -productCuMPS(args...; kwargs...) = cuMPS(args...; kwargs...) - -cuMPO(M::MPO) = cu(M) -cuMPO(args...; kwargs...) = cu(MPO(args...; kwargs...)) -randomCuMPO(args...; kwargs...) = cu(randomMPO(args...; kwargs...)) diff --git a/src/tensor/cucombiner.jl b/src/tensor/cucombiner.jl deleted file mode 100644 index d01109a..0000000 --- a/src/tensor/cucombiner.jl +++ /dev/null @@ -1 +0,0 @@ -Base.promote_rule(::Type{<:Combiner}, StorageT::Type{<:CuDense}) = StorageT diff --git a/src/tensor/cudense.jl b/src/tensor/cudense.jl deleted file mode 100644 index 2db9055..0000000 --- a/src/tensor/cudense.jl +++ /dev/null @@ -1,573 +0,0 @@ -using LinearAlgebra: BlasFloat - -const CuDense{ElT,VecT} = Dense{ElT,VecT} where {VecT<:CuVector} -const CuDenseTensor{ElT,N,StoreT,IndsT} = Tensor{ElT,N,StoreT,IndsT} where {StoreT<:CuDense} - -# function Dense{T,SA}(x::Dense{T,SB}) where {T<:Number,SA<:CuArray,SB<:Array} -# return Dense{T,SA}(CuArray(x)) -# end -# function Dense{T,SA}(x::Dense{T,SB}) where {T<:Number,SA<:Array,SB<:CuArray} -# return Dense{T,SA}(collect(x.data)) -# end -# Dense{T,S}(size::Integer) where {T,S<:CuArray{<:T}} = Dense{T,S}(CUDA.zeros(T, size)) -# function Dense{T,S}(x::T, size::Integer) where {T,S<:CuArray{<:T}} -# arr = CuArray{T}(undef, size) -# fill!(arr, x) -# return Dense{T,S}(arr) -# end - -function Base.complex(::Type{Dense{ElT,VT}}) where {ElT,VT<:CuArray} - return Dense{complex(ElT),CuVector{complex(ElT)}} -end - -CuArray(x::CuDense{ElT}) where {ElT} = CuVector{ElT}(data(x)) -function CuArray{ElT,N}(x::CuDenseTensor{ElT,N}) where {ElT,N} - return CuArray{ElT,N}(reshape(data(store(x)), dims(inds(x))...)) -end -CuArray(x::CuDenseTensor{ElT,N}) where {ElT,N} = CuArray{ElT,N}(x) - -*(D::Dense{T,AT}, x::S) where {T,AT<:CuArray,S<:Number} = Dense(x .* data(D)) - -Base.getindex(D::CuDense{<:Number}) = collect(data(D))[] -Base.getindex(D::CuDenseTensor{<:Number,0}) = store(D)[] -LinearAlgebra.norm(T::CuDenseTensor) = norm(data(store(T))) - -function Base.copyto!(R::CuDenseTensor{<:Number,N}, T::CuDenseTensor{<:Number,N}) where {N} - RA = array(R) - TA = array(T) - RA .= TA - return R -end - -# This is for type promotion for Scalar*Dense -function Base.promote_rule( - ::Type{<:Dense{ElT1,CuVector{ElT1}}}, ::Type{ElT2} -) where {ElT1,ElT2<:Number} - ElR = promote_type(ElT1, ElT2) - VecR = CuVector{ElR} - return Dense{ElR,VecR} -end - -function permutedims!!( - B::Tensor{ElT,N,StoreT,IndsB}, - A::Tensor{ElT,N,StoreT,IndsA}, - perm::NTuple{N,Int}, - f::Function=(r, t) -> permute!(r, t), -) where {N,ElT,IndsB,IndsA,StoreT<:CuDense{ElT}} - Ais = inds(A) - Bis = 
ITensors.NDTensors.permute(inds(A), perm) - B = f(B, A) - return B -end - -import ITensors.NDTensors: GemmBackend, auto_select_backend, _gemm! -function backend_cutensor() - return gemm_backend[] = :CUTENSOR -end -function backend_cublas() - return gemm_backend[] = :CUBLAS -end - -@inline function auto_select_backend( - ::Type{<:CuArray{<:BlasFloat}}, - ::Type{<:CuArray{<:BlasFloat}}, - ::Type{<:CuArray{<:BlasFloat}}, -) - return GemmBackend(:CUBLAS) -end - -@inline function auto_select_backend( - ::Type{<:CuArray{<:BlasFloat}}, ::Type{<:CuArray{<:BlasFloat}}, ::Type{<:AbstractVecOrMat} -) - return GemmBackend(:GenericCUDA) -end - -# CUBLAS matmul -function _gemm!( - ::GemmBackend{:CUBLAS}, - tA, - tB, - alpha, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta, - C::AbstractVecOrMat, -) - return CUBLAS.gemm!(tA, tB, alpha, A, B, beta, C) -end - -# CUDA generic matmul -function _gemm!( - ::GemmBackend{:GenericCUDA}, - tA, - tB, - alpha, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta, - C::CuDenseTensor, -) - C_dat = reshape(data(store(C)), size(C)) - A_ = tA == 'T' ? transpose(A) : A - B_ = tB == 'T' ? transpose(B) : B - C_dat = mul!(C_dat, A_, B_, alpha, beta) - copyto!(data(store(C)), C_dat) - return C -end - -function _contract_scalar!( - R::CuDenseTensor{ElR,NR}, - labelsR, - T₁::CuDenseTensor, - labelsT₁, - T₂::CuDenseTensor, - labelsT₂, - α=one(ElR), - β=zero(ElR), -) where {ElR,NR} - if nnz(T₁) == nnz(T₂) == 1 - new_R = Tensor(Dense(data(store(T₁)) .* data(store(T₂))), inds(R)) - copyto!(store(R), store(new_R)) - elseif nnz(T₁) == 1 - props = ContractionProperties(labelsT₁, labelsT₂, labelsR) - compute_contraction_properties!(props, T₁, T₂, R) - R = _contract!(R, T₁, T₂, props, α, β) - elseif nnz(T₂) == 1 - props = ContractionProperties(labelsT₁, labelsT₂, labelsR) - compute_contraction_properties!(props, T₁, T₂, R) - R = _contract!(R, T₁, T₂, props, α, β) - else - error("In _contract_scalar!, one tensor must be a scalar") - end - return R -end - -function _gemm_contract!( - CT::DenseTensor{El,NC}, - AT::DenseTensor{El,NA}, - BT::DenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - # TODO: directly use Tensor instead of Array - C = array(CT) - A = array(AT) - B = array(BT) - - tA = 'N' - if props.permuteA - pA = NTuple{NA,Int}(props.PA) - Ap = permutedims(A, pA) - AM = reshape(Ap, props.dmid, props.dleft) - tA = 'T' - else - #A doesn't have to be permuted - if Atrans(props) - AM = reshape(A, props.dmid, props.dleft) - tA = 'T' - else - AM = reshape(A, props.dleft, props.dmid) - end - end - - tB = 'N' - if props.permuteB - pB = NTuple{NB,Int}(props.PB) - Bp = permutedims(B, pB) - BM = reshape(Bp, props.dmid, props.dright) - else - if Btrans(props) - BM = reshape(B, props.dright, props.dmid) - tB = 'T' - else - BM = reshape(B, props.dmid, props.dright) - end - end - - #TODO: this logic may be wrong - if props.permuteC - #Need to copy here since we will be permuting - #into C later - CM = reshape(copy(C), props.dleft, props.dright) - else - if Ctrans(props) - CM = reshape(C, props.dleft, props.dright) - (AM, BM) = (BM, AM) - if tA == tB - tA = tB = (tA == 'T' ? 'N' : 'T') - end - else - CM = reshape(C, props.dleft, props.dright) - end - end - - CM = CUBLAS.gemm!(tA, tB, El(α), AM, BM, El(β), CM) - - if props.permuteC - pC = NTuple{NC,Int}(props.PC) - Cr = reshape(CM, props.newCrange...) 
- @strided C .= permutedims(Cr, pC) - end - return C -end - -function _contract!( - CT::CuDenseTensor{El,NC}, - AT::CuDenseTensor{El,NA}, - BT::CuDenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - if ndims(CT) > 12 || ndims(BT) > 12 || ndims(AT) > 12 - return _gemm_contract!(CT, AT, BT, props, α, β) - end - Ainds = inds(AT) - Adims = dims(Ainds) - Binds = inds(BT) - Bdims = dims(Binds) - Cinds = inds(CT) - Cdims = dims(Cinds) - Adata = reshape(data(store(AT)), Adims...) - Bdata = reshape(data(store(BT)), Bdims...) - Cdata = reshape(data(store(CT)), Cdims...) - contracted = commoninds(Ainds, Binds) - A_only = uniqueinds(Ainds, Binds) - B_only = uniqueinds(Binds, Ainds) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(contracted) - push!(ind_dict, i) - end - if length(A_only) > 0 - for (idx, i) in enumerate(A_only) - push!(ind_dict, i) - end - end - if length(B_only) > 0 - for (idx, i) in enumerate(B_only) - push!(ind_dict, i) - end - end - ctainds = zeros(Int, length(Ainds)) - ctbinds = zeros(Int, length(Binds)) - ctcinds = zeros(Int, length(Cinds)) - for (ii, ia) in enumerate(Ainds) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Binds) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - for (ii, ic) in enumerate(Cinds) - ctcinds[ii] = findfirst(x -> x == ic, ind_dict) - end - id_op = cuTENSOR.CUTENSOR_OP_IDENTITY - dict_key = "" - for cc in zip(ctcinds, Cdims) - dict_key *= string(cc[1]) * "," * string(cc[2]) * "," - end - for aa in zip(ctainds, Adims) - dict_key *= string(aa[1]) * "," * string(aa[2]) * "," - end - for bb in zip(ctbinds, Bdims) - dict_key *= string(bb[1]) * "," * string(bb[2]) * "," - end - if haskey(ENV, "CUTENSOR_AUTOTUNE") && tryparse(Int, ENV["CUTENSOR_AUTOTUNE"]) == 1 - if haskey(ContractionPlans, dict_key) - dict_val = ContractionPlans[dict_key] - algo = dict_val - #plan = dict_val[2] - Cdata = cuTENSOR.contraction!( - α, - Adata, - Vector{Char}(ctainds), - id_op, - Bdata, - Vector{Char}(ctbinds), - id_op, - β, - Cdata, - Vector{Char}(ctcinds), - id_op, - id_op; - algo=algo, - ) - else - # loop through all algos - # pick the fastest one - # store that plan! 
- best_time = 1e6 - best_plan = nothing - best_algo = nothing - max_algos = Ref{Int32}(C_NULL) - cuTENSOR.cutensorContractionMaxAlgos(max_algos) - # fix once the other options are documented - #algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(max_algos[] - 1)) - algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(-1)) - for algo in reverse(algos) - try - Cdata, this_time, bytes, gctime, memallocs = @timed cuTENSOR.contraction!( - α, - Adata, - Vector{Char}(ctainds), - id_op, - Bdata, - Vector{Char}(ctbinds), - id_op, - β, - Cdata, - Vector{Char}(ctcinds), - id_op, - id_op; - algo=cuTENSOR.cutensorAlgo_t(algo), - ) - if this_time < best_time - best_time = this_time - #best_plan = this_plan - best_algo = cuTENSOR.cutensorAlgo_t(algo) - end - catch err - @warn "Algorithm $algo not supported" - end - end - ContractionPlans[dict_key] = best_algo - end - else - Cdata = cuTENSOR.contraction!( - α, - Adata, - Vector{Char}(ctainds), - id_op, - Bdata, - Vector{Char}(ctbinds), - id_op, - β, - Cdata, - Vector{Char}(ctcinds), - id_op, - id_op, - ) - end - return parent(Cdata) -end - -function Base.:+(B::CuDenseTensor, A::CuDenseTensor) - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - Ais = inds(A) - Bis = inds(B) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(inds(A)) - push!(ind_dict, i) - end - Adata = data(store(A)) - Bdata = data(store(B)) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)...) - cuTENSOR.elementwiseBinary!( - one(eltype(Adata)), - reshapeAdata, - ctainds, - opA, - one(eltype(Bdata)), - reshapeBdata, - ctbinds, - opC, - C, - ctcinds, - opAC, - ) - copyto!(data(store(B)), vec(C)) - return B -end - -function Base.:+(B::CuDense, Bis::IndexSet, A::CuDense, Ais::IndexSet) - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(A) - Bdata = data(B) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)...) - Cis = Bis - C = cuTENSOR.elementwiseBinary!( - 1, reshapeAdata, ctainds, opA, 1, reshapeBdata, ctbinds, opC, C, ctcinds, opAC - ) - copyto!(data(B), vec(C)) - return C -end - -function Base.:-(B::CuDenseTensor, A::CuDenseTensor) - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - Ais = inds(A) - Bis = inds(B) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(inds(A)) - push!(ind_dict, i) - end - Adata = data(store(A)) - Bdata = data(store(B)) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) 
- ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)) - cuTENSOR.elementwiseBinary!( - -one(eltype(Adata)), - reshapeAdata, - ctainds, - opA, - one(eltype(Bdata)), - reshapeBdata, - ctbinds, - opC, - C, - ctcinds, - opAC, - ) - copyto!(data(store(B)), vec(C)) - return B -end - -function Base.:-(A::CuDense, Ais::IndexSet, B::CuDense, Bis::IndexSet) - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(A) - Bdata = data(B) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)...) - Cis = Bis - C = cuTENSOR.elementwiseBinary!( - one(eltype(Adata)), - reshapeAdata, - ctainds, - opA, - -one(eltype(Bdata)), - reshapeBdata, - ctbinds, - opC, - C, - ctcinds, - opAC, - ) - copyto!(data(B), vec(C)) - return C -end - -function Base.permute!(B::CuDenseTensor, A::CuDenseTensor) - Ais = inds(A) - Bis = inds(B) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(store(A)) - Bdata = data(store(B)) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - if ndims(A) < 40 # use CUTENSOR - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - cuTENSOR.permutation!( - one(eltype(Adata)), - reshapeAdata, - Vector{Char}(ctainds), - reshapeBdata, - Vector{Char}(ctbinds), - ) - else # use GPUArrays - perm = Int[] - for aix in Ais - b_pos = findfirst(bix -> bix == aix, Bis) - push!(perm, b_pos) - end - @assert isperm(perm) - permutedims!(reshapeBdata, reshapeAdata, invperm(perm)) - end - return Tensor(Dense(vec(reshapeBdata)), inds(B)) -end - -function Base.permute!(B::CuDense, Bis::IndexSet, A::CuDense, Ais::IndexSet) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(A) - Bdata = data(B) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) 
- ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - - cuTENSOR.permutation!( - one(eltype(Adata)), - reshapeAdata, - Vector{Char}(ctainds), - reshapeBdata, - Vector{Char}(ctbinds), - ) - return Tensor(Dense(reshapeBdata), Tuple(Bis)) -end - -Base.:/(A::CuDenseTensor, x::Number) = A * inv(x) diff --git a/src/tensor/cudiag.jl b/src/tensor/cudiag.jl deleted file mode 100644 index 0e27beb..0000000 --- a/src/tensor/cudiag.jl +++ /dev/null @@ -1,228 +0,0 @@ -export CuDiag - -const CuDiag{ElT,VecT} = Diag{ElT,VecT} where {VecT<:CuArray{ElT}} -const NonuniformCuDiagTensor{ElT,N,StoreT,IndsT} = - Tensor{ElT,N,StoreT,IndsT} where {StoreT<:CuDiag} - -CuArray(D::NonuniformCuDiagTensor) = CuArray(dense(D)) - -function NDTensors.dense(T::NonuniformCuDiagTensor{ElT}) where {ElT} - R_data = CUDA.zeros(ElT, dim(inds(T))) - diag_inds = diagind(reshape(R_data, dims(inds(T))), 0) - R_data[diag_inds] .= data(store(T)) - return Tensor(Dense(R_data), inds(T)) -end - -function Base.promote_rule( - ::Type{Diag{ElT2,VecT2}}, ::Type{CuDense} -) where {ElT2,VecT2<:Number} - return promote_type(DenseT1, ElT2) -end - -function Base.promote_rule( - ::Type{<:Tensor{ElT1,N1,StoreT1}}, ::Type{<:Tensor{ElT2,N2,StoreT2}} -) where {ElT1,ElT2,N1,N2,StoreT1<:CuDense,StoreT2<:UniformDiag} - ElT3 = promote_type(ElT1, ElT2) - return Tensor{promote_type(ElT1, ElT2),N3,CuDense{ElT3,CuVector{ElT3}}} where {N3} -end -function Base.promote_rule( - ::Type{<:Tensor{ElT1,N1,StoreT1}}, ::Type{<:Tensor{ElT2,N2,StoreT2}} -) where {ElT1,ElT2,N1,N2,StoreT1<:UniformDiag,StoreT2<:CuDense} - ElT3 = promote_type(ElT1, ElT2) - return Tensor{promote_type(ElT1, ElT2),N3,CuDense{ElT3,CuVector{ElT3}}} where {N3} -end - -function Base.promote_rule( - ::Type{<:UniformDiag{ElT1}}, ::Type{<:NonuniformDiag{ElT2,VecT2}} -) where {ElT1,ElT2,VecT2<:CuArray} - ElT3 = promote_type(ElT1, ElT2) - return NonuniformDiag{ElT3,CuVector{ElT3}} -end -function Base.promote_rule( - ::Type{<:NonuniformDiag{ElT2,VecT2}}, ::Type{<:UniformDiag{ElT1}} -) where {ElT1,ElT2,VecT2<:CuArray} - ElT3 = promote_type(ElT1, ElT2) - return NonuniformDiag{ElT3,CuVector{ElT3}} -end -function Base.promote_rule( - ::Type{DenseT1}, ::Type{Diag{ElT2,VecT2}} -) where {DenseT1<:CuDense,ElT2,VecT2<:Number} - return promote_type(DenseT1, ElT2) -end - -function contraction_output_type( - TensorT1::Type{<:NonuniformCuDiagTensor}, TensorT2::Type{<:CuDenseTensor}, IndsR -) - return similartype(promote_type(TensorT1, TensorT2), IndsR) -end -function contraction_output_type( - TensorT1::Type{<:CuDenseTensor}, TensorT2::Type{<:NonuniformCuDiagTensor}, IndsR -) - return contraction_output_type(TensorT2, TensorT1, IndsR) -end - -function contraction_output_type( - TensorT1::Type{<:DiagTensor{<:Number,<:CuDiag}}, - TensorT2::Type{<:DiagTensor{<:Number,<:CuDiag}}, - IndsR::Type, -) - return similartype(promote_type(TensorT1, TensorT2), IndsR) -end - -function contraction_output_type( - TensorT1::Type{<:UniformDiagTensor}, - TensorT2::Type{<:DiagTensor{<:Number,<:CuDiag}}, - IndsR::Type, -) - return similartype(promote_type(TensorT1, TensorT2), IndsR) -end -function contraction_output_type( - TensorT1::Type{<:DiagTensor{<:Number,<:CuDiag}}, - TensorT2::Type{<:UniformDiagTensor}, - IndsR::Type, -) - return contraction_output_type(TensorT2, TensorT1, IndsR) -end - -function zero_contraction_output( - 
T1::UniformDiagTensor{ElT1,N1}, T2::CuDenseTensor{ElT2,N2}, indsR::IndsR -) where {ElT1,N1,ElT2,N2,IndsR} - ElT3 = promote_type(ElT1, ElT2) - dat = CUDA.zeros(ElT3, dim(indsR)) - return Tensor(Dense(dat), indsR) -end -function zero_contraction_output( - T2::CuDenseTensor{ElT2,N2}, T1::UniformDiagTensor{ElT1,N1}, indsR::IndsR -) where {ElT1,N1,ElT2,N2,IndsR} - return zero_contraction_output(T1, T2, indsR) -end - -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT1<:NonuniformDiagTensor,TensorT2<:NonuniformCuDiagTensor} - ElT3 = promote_type(eltype(TensorT1), eltype(TensorT2)) - dat = CUDA.zeros(ElT3, length(data(store(T2)))) - return Tensor(Diag(dat), indsR) -end - -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT1<:UniformDiagTensor,TensorT2<:NonuniformCuDiagTensor} - ElT3 = promote_type(eltype(TensorT1), eltype(TensorT2)) - dat = CUDA.zeros(ElT3, length(data(store(T2)))) - return Tensor(Diag(dat), indsR) -end -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT2<:UniformDiagTensor,TensorT1<:NonuniformCuDiagTensor} - return zero_contraction_output(T2, T1, indsR) -end - -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT1<:DiagTensor,TensorT2<:CuDenseTensor} - ElT3 = promote_type(eltype(TensorT1), eltype(TensorT2)) - dat = CUDA.zeros(ElT3, length(data(store(T2)))) - return Tensor(Dense(dat), indsR) -end -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT2<:DiagTensor,TensorT1<:CuDenseTensor} - return zero_contraction_output(T2, T1, indsR) -end - -function contraction_output( - T1::UniformDiagTensor, T2::DiagTensor{Elt2,<:CuDiag}, indsR -) where {Elt2} - return zero_contraction_output(T1, T2, indsR) -end -function contraction_output( - T1::DiagTensor{Elt1,<:CuDiag}, T2::UniformDiagTensor, indsR -) where {Elt1} - return contraction_output(T2, T1, indsR) -end - -function contract!( - C::CuDenseTensor{<:Number,NC}, - Clabels, - A::UniformDiagTensor{<:Number,NA}, - Alabels, - B::CuDenseTensor{<:Number,NB}, - Blabels, -) where {NC,NA,NB} - Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -function contract!( - C::CuDenseTensor{<:Number,NC}, - Clabels, - A::CuDenseTensor{<:Number,NA}, - Alabels, - B::UniformDiagTensor{<:Number,NB}, - Blabels, -) where {NC,NA,NB} - return contract!(C, Clabels, B, Blabels, A, Alabels) -end - -function contract!( - C::NonuniformCuDiagTensor{EltC,NC,IndsC}, - Clabels, - A::UniformDiagTensor{EltA,NA,IndsA}, - Alabels, - B::NonuniformCuDiagTensor{EltB,NB,IndsB}, - Blabels, -) where {EltC<:Number,EltB<:Number,EltA<:Number,NC,NB,NA,IndsA,IndsB,IndsC} - Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -function contract!( - C::NonuniformCuDiagTensor{EltC,NC,IndsC}, - Clabels, - B::NonuniformCuDiagTensor{EltB,NB,IndsB}, - Blabels, - A::UniformDiagTensor{EltA,NA,IndsA}, - Alabels, -) where {EltC<:Number,EltB<:Number,EltA<:Number,NC,NB,NA,IndsA,IndsB,IndsC} - Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -function contract!( - C::NonuniformCuDiagTensor{EltC,NC,IndsC}, - Clabels, - B::NonuniformCuDiagTensor{EltB,NB,IndsB}, - Blabels, - A::NonuniformCuDiagTensor{EltA,NA,IndsA}, - Alabels, -) where {EltC<:Number,EltB<:Number,EltA<:Number,NC,NB,NA,IndsA,IndsB,IndsC} - 
Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -# Dense * NonuniformCuDiag -function contract!( - C::CuDenseTensor, Clabels, A::NonuniformCuDiagTensor, Alabels, B::CuDenseTensor, Blabels -) - Astore = data(store(A)) - newAstore = CUDA.zeros(eltype(A), dims(inds(A))[1], dims(inds(A))[2]) - adi = diagind(newAstore, 0) - newAstore[adi] .= Astore[:] - newA = Tensor(Dense(vec(newAstore)), inds(A)) - return contract!(C, Clabels, newA, Alabels, B, Blabels) -end - -function contract!( - C::CuDenseTensor, Clabels, A::CuDenseTensor, Alabels, B::NonuniformCuDiagTensor, Blabels -) - return contract!(C, Clabels, B, Blabels, A, Alabels) -end diff --git a/src/tensor/culinearalgebra.jl b/src/tensor/culinearalgebra.jl deleted file mode 100644 index c64a3b0..0000000 --- a/src/tensor/culinearalgebra.jl +++ /dev/null @@ -1,159 +0,0 @@ - -# -# Linear Algebra of order 2 Tensors -# -# Even though CuDenseTensor{_,2} is strided -# and passable to BLAS/LAPACK, it cannot -# be made <: StridedArray - -function Base.:*( - T1::Tensor{ElT1,2,StoreT1,IndsT1}, T2::Tensor{ElT2,2,StoreT2,IndsT2} -) where {ElT1,StoreT1<:CuDense,IndsT1,ElT2,StoreT2<:CuDense,IndsT2} - RM = matrix(T1) * matrix(T2) - indsR = IndsT1(ind(T1, 1), ind(T2, 2)) - pT = promote_type(ElT1, ElT2) - return tensor(Dense(vec(RM)), indsR) -end -#= FIX ME -function LinearAlgebra.exp(T::CuDenseTensor{ElT,2}) where {ElT,IndsT} - expTM = exp(matrix(T)) - return tensor(Dense(vec(expTM)),inds(T)) -end - -function expHermitian(T::CuDenseTensor{ElT,2}) where {ElT,IndsT} - # exp(::Hermitian/Symmetric) returns Hermitian/Symmetric, - # so extract the parent matrix - expTM = parent(exp(Hermitian(matrix(T)))) - return tensor(Dense(vec(expTM)),inds(T)) -end -=# - -# svd of an order-2 tensor -function LinearAlgebra.svd(T::CuDenseTensor{ElT,2,IndsT}; kwargs...) where {ElT,IndsT} - maxdim::Int = get(kwargs, :maxdim, minimum(dims(T))) - mindim::Int = get(kwargs, :mindim, 1) - cutoff::Float64 = get(kwargs, :cutoff, 0.0) - absoluteCutoff::Bool = get(kwargs, :absoluteCutoff, false) - doRelCutoff::Bool = get(kwargs, :doRelCutoff, true) - fastSVD::Bool = get(kwargs, :fastSVD, false) - # Safer to use `Array`, which ensures - # no views/aliases are made, since - # we are using in-place `CUSOLVER.svd!` below. - aT = Array(T) - @timeit "CUSOLVER svd" begin - MU, MS, MV = CUSOLVER.svd!(aT) - end - if !(MV isa CuMatrix) - # Materialize any array wrappers, - # for now, since `Adjoint` wrappers - # seem to cause issues throughout - # CUDA.jl, for example with slicing, - # reshaping and then copying, etc. - # TODO: Fix this in a more robust way. 
- MV = copy(MV) - end - # for consistency with cpu version, - # ITensors.jl/NDTensors/src/linearalgebra.jl/svd - # need conj!(MV) - conj!(MV) - P = MS .^ 2 - truncerr, docut, P = truncate!( - P; - mindim=mindim, - maxdim=maxdim, - cutoff=cutoff, - absoluteCutoff=absoluteCutoff, - doRelCutoff=doRelCutoff, - ) - spec = Spectrum(P, truncerr) - dS = length(P) - if dS < length(MS) - MU = MU[:, 1:dS] - MS = MS[1:dS] - MV = MV[:, 1:dS] - end - - # Make the new indices to go onto U and V - u = eltype(IndsT)(dS) - v = eltype(IndsT)(dS) - Uinds = IndsT((ind(T, 1), u)) - Sinds = IndsT((u, v)) - Vinds = IndsT((ind(T, 2), v)) - U = tensor(Dense(vec(MU)), Uinds) - Sdata = CUDA.zeros(ElT, dS * dS) - dsi = diagind(reshape(Sdata, dS, dS), 0) - Sdata[dsi] = MS - S = tensor(Dense(Sdata), Sinds) - V = tensor(Dense(vec(MV)), Vinds) - return U, S, V, spec -end - -function LinearAlgebra.eigen( - T::Hermitian{ElT,<:CuDenseTensor{ElT,2,IndsT}}; kwargs... -) where {ElT<:Union{Real,Complex},IndsT} - ispossemidef::Bool = get(kwargs, :ispossemidef, false) - maxdim::Int = get(kwargs, :maxdim, minimum(dims(T))) - mindim::Int = get(kwargs, :mindim, 1) - cutoff::Float64 = get(kwargs, :cutoff, 0.0) - absoluteCutoff::Bool = get(kwargs, :absoluteCutoff, false) - doRelCutoff::Bool = get(kwargs, :doRelCutoff, true) - @timeit "CUSOLVER eigen" begin - local DM, UM - if ElT <: Complex - DM, UM = CUSOLVER.heevd!('V', 'U', matrix(parent(T))) - else - DM, UM = CUSOLVER.syevd!('V', 'U', matrix(parent(T))) - end - end - DM_ = reverse(DM) - @timeit "truncate" begin - truncerr, docut, DM = truncate!( - DM_; - maxdim=maxdim, - cutoff=cutoff, - absoluteCutoff=absoluteCutoff, - doRelCutoff=doRelCutoff, - ) - end - spec = Spectrum(DM, truncerr) - dD = length(DM) - dV = reverse(UM; dims=2) - if dD < size(dV, 2) - dV = CuMatrix(dV[:, 1:dD]) - end - # Make the new indices to go onto U and V - l = eltype(IndsT)(dD) - r = eltype(IndsT)(dD) - Vinds = IndsT((dag(ind(T, 2)), dag(r))) - Dinds = IndsT((l, dag(r))) - U = tensor(Dense(vec(dV)), Vinds) - D = tensor(Diag(real.(DM)), Dinds) - return D, U, spec -end - -function LinearAlgebra.qr(T::CuDenseTensor{ElT,2,IndsT}; kwargs...) where {ElT,IndsT} - QM, RM = qr(matrix(T)) - # Make the new indices to go onto Q and R - q, r = inds(T) - q = dim(q) < dim(r) ? 
sim(q) : sim(r) - Qinds = IndsT((ind(T, 1), q)) - Rinds = IndsT((q, ind(T, 2))) - QM = CuMatrix(QM) - Q = tensor(Dense(vec(QM)), Qinds) - R = tensor(Dense(vec(RM)), Rinds) - return Q, R -end - -function polar(T::CuDenseTensor{ElT,2,IndsT}) where {ElT,IndsT} - QM, RM = polar(matrix(T)) - dim = size(QM, 2) - # Make the new indices to go onto Q and R - q = eltype(IndsT)(dim) - # TODO: use push/pushfirst instead of a constructor - # call here - Qinds = IndsT((ind(T, 1), q)) - Rinds = IndsT((q, ind(T, 2))) - Q = tensor(Dense(vec(QM)), Qinds) - R = tensor(Dense(vec(RM)), Rinds) - return Q, R -end diff --git a/src/tensor/cutruncate.jl b/src/tensor/cutruncate.jl deleted file mode 100644 index 108bc1e..0000000 --- a/src/tensor/cutruncate.jl +++ /dev/null @@ -1,87 +0,0 @@ -import LinearAlgebra: BlasReal - -function truncate!(P::CuVector{T}; kwargs...)::Tuple{T,T,CuVector{T}} where {T<:BlasReal} - maxdim::Int = min(get(kwargs, :maxdim, length(P)), length(P)) - mindim::Int = min(get(kwargs, :mindim, 1), maxdim) - cutoff::Float64 = get(kwargs, :cutoff, 0.0) - absoluteCutoff::Bool = get(kwargs, :absoluteCutoff, false) - doRelCutoff::Bool = get(kwargs, :doRelCutoff, true) - origm = length(P) - docut = zero(T) - - maxP = maximum(P) - if maxP == zero(T) - P = CUDA.zeros(T, 1) - return zero(T), zero(T), P - end - if origm == 1 - docut = maxP / 2 - return zero(T), docut, P[1:1] - end - @timeit "setup rP" begin - #Zero out any negative weight - #neg_z_f = (!signbit(x) ? x : 0.0) - rP = map(x -> !signbit(x) ? Float64(x) : 0.0, P) - n = origm - end - @timeit "handle cutoff" begin - if absoluteCutoff - #Test if individual prob. weights fall below cutoff - #rather than using *sum* of discarded weights - sub_arr = rP .- Float64(cutoff) - err_rP = sub_arr ./ abs.(sub_arr) - flags = reinterpret(Float64, (signbit.(err_rP) .<< 1 .& 2) .<< 61) - cut_ind = CUDA.CUBLAS.iamax(length(err_rP), err_rP .* flags) - 1 - if cut_ind > 0 - n = min(maxdim, cut_ind) - n = max(n, mindim) - else - n = maxdim - end - truncerr = T(sum(rP[(n + 1):end])) - else - truncerr = zero(T) - scale = one(T) - @timeit "find scale" begin - if doRelCutoff - scale = sum(P) - scale = scale > zero(T) ? scale : one(T) - end - end - #Truncating until *sum* of discarded probability - #weight reaches cutoff reached (or m==mindim) - csum_rp = Float64.(CUDA.reverse(CUDA.cumsum(CUDA.reverse(rP)))) - sub_arr = csum_rp .- Float64(cutoff * scale) - err_rP = sub_arr ./ abs.(sub_arr) - flags = reinterpret(Float64, (signbit.(err_rP) .<< 1 .& 2) .<< 61) - cut_ind = (CUDA.CUBLAS.iamax(length(err_rP), err_rP .* flags) - 1) - if cut_ind > 0 - n = min(maxdim, cut_ind) - n = max(n, mindim) - else - n = maxdim - end - truncerr = sum(rP[(n + 1):end]) - if scale == zero(T) - truncerr = zero(T) - else - truncerr /= scale - end - end - end - if n < 1 - n = 1 - end - if n < origm - hP = collect(P) - docut = (hP[n] + hP[n + 1]) / 2 - if abs(hP[n] - hP[n + 1]) < 1E-3 * hP[n] - docut += T(1E-3) * hP[n] - end - end - @timeit "setup return" begin - rinds = 1:n - rrP = P[rinds] - end - return truncerr, docut, rrP -end diff --git a/src/tensor/dense.jl b/src/tensor/dense.jl deleted file mode 100644 index 04e1cb8..0000000 --- a/src/tensor/dense.jl +++ /dev/null @@ -1,285 +0,0 @@ -function contract!!( - R::DenseTensor{<:Number,NR}, - labelsR::NTuple{NR}, - T1::DenseTensor{<:Number,N1}, - labelsT1::NTuple{N1}, - T2::DenseTensor{<:Number,N2}, - labelsT2::NTuple{N2}, - α::Number=1, - β::Number=0, -) where {NR,N1,N2} - if N1 == 0 - (α ≠ 1 || β ≠ 0) && - error("contract!! 
not yet implemented for scalar ITensor with non-trivial α and β") - # TODO: replace with an add! function? - # What about doing `R .= T1[] .* PermutedDimsArray(T2,perm)`? - perm = getperm(labelsR, labelsT2) - R = permutedims!!(R, T2, perm, (r, t2) -> T1[] * t2) - elseif N2 == 0 - (α ≠ 1 || β ≠ 0) && - error("contract!! not yet implemented for scalar ITensor with non-trivial α and β") - perm = getperm(labelsR, labelsT1) - R = permutedims!!(R, T1, perm, (r, t1) -> T2[] * t1) - elseif N1 + N2 == NR - (α ≠ 1 || β ≠ 0) && error( - "contract!! not yet implemented for outer product tensor contraction with non-trivial α and β", - ) - # TODO: permute T1 and T2 appropriately first (can be more efficient - # then permuting the result of T1⊗T2) - # TODO: implement the in-place version directly - R = outer!!(R, T1, T2) - labelsRp = (labelsT1..., labelsT2...) - perm = getperm(labelsR, labelsRp) - if !is_trivial_permutation(perm) - R = permutedims!!(R, copy(R), perm) - end - else - #if dim(T1) > 2^13 && dim(T2) > 2^13 - # R = _big_contract!!(R,labelsR,T1,labelsT1,T2,labelsT2, α, β) - #else - if α ≠ 1 || β ≠ 0 - R = _contract!!(R, labelsR, T1, labelsT1, T2, labelsT2, α, β) - else - R = _contract!!(R, labelsR, T1, labelsT1, T2, labelsT2) - end - #end - end - return R -end - -function _big_contract!!( - R::DenseTensor{<:Number,NR}, - labelsR, - T1::DenseTensor{ElT1,N1}, - labelsT1, - T2::DenseTensor{ElT2,N2}, - labelsT2, - α::Number=1, - β::Number=0, -) where {ElT1,ElT2,N1,N2,NR} - props = ContractionProperties(labelsT1, labelsT2, labelsR) - compute_contraction_properties!(props, T1, T2, R) - _big_contract!(R, T1, T2, props, α, β) - #val, t, _ = @timed _blasmg_contract!(R,T1,T2,props,α,β) - return R -end - -function _big_contract!( - CT::DenseTensor{El,NC}, - AT::DenseTensor{El,NA}, - BT::DenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - Ainds = inds(AT) - Adims = dims(Ainds) - Binds = inds(BT) - Bdims = dims(Binds) - Cinds = inds(CT) - Cdims = dims(Cinds) - Adata = reshape(data(store(AT)), Adims) - Bdata = reshape(data(store(BT)), Bdims) - Cdata = reshape(data(store(CT)), Cdims) - contracted = commoninds(Ainds, Binds) - A_only = uniqueinds(Ainds, Binds) - B_only = uniqueinds(Binds, Ainds) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(contracted) - push!(ind_dict, i) - end - if length(A_only) > 0 - for (idx, i) in enumerate(A_only) - push!(ind_dict, i) - end - end - if length(B_only) > 0 - for (idx, i) in enumerate(B_only) - push!(ind_dict, i) - end - end - ctainds = zeros(Int, length(Ainds)) - ctbinds = zeros(Int, length(Binds)) - ctcinds = zeros(Int, length(Cinds)) - for (ii, ia) in enumerate(Ainds) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Binds) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - for (ii, ic) in enumerate(Cinds) - ctcinds[ii] = findfirst(x -> x == ic, ind_dict) - end - id_op = cuTENSOR.CUTENSOR_OP_IDENTITY - dict_key = "" - for cc in zip(ctcinds, Cdims) - dict_key *= string(cc[1]) * "," * string(cc[2]) * "," - end - for aa in zip(ctainds, Adims) - dict_key *= string(aa[1]) * "," * string(aa[2]) * "," - end - for bb in zip(ctbinds, Bdims) - dict_key *= string(bb[1]) * "," * string(bb[2]) * "," - end - #=synchronize() - if haskey(ENV, "CUTENSOR_AUTOTUNE") && tryparse(Int, ENV["CUTENSOR_AUTOTUNE"]) == 1 - if haskey(ContractionPlans, dict_key) - dict_val = ContractionPlans[dict_key] - algo = dict_val - Cdata = cuTENSOR.contraction!(α, Adata, Vector{Char}(ctainds), 
id_op, Bdata, Vector{Char}(ctbinds), id_op, β, Cdata, Vector{Char}(ctcinds), id_op, id_op; algo=algo) - synchronize() - else - # loop through all algos - # pick the fastest one - # store that plan! - best_time = 1e6 - best_plan = nothing - best_algo = nothing - max_algos = Ref{Int32}(C_NULL) - cuTENSOR.cutensorContractionMaxAlgos(max_algos) - # fix once the other options are documented - #algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(max_algos[] - 1)) - algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(-1)) - for algo in reverse(algos) - try - Cdata, this_time, bytes, gctime, memallocs = @timed cuTENSOR.contraction!(α, Adata, Vector{Char}(ctainds), id_op, Bdata, Vector{Char}(ctbinds), id_op, β, Cdata, Vector{Char}(ctcinds), id_op, id_op; algo=cuTENSOR.cutensorAlgo_t(algo)) - synchronize() - if this_time < best_time - best_time = this_time - best_algo = cuTENSOR.cutensorAlgo_t(algo) - end - catch err - @warn "Algorithm $algo not supported" - end - end - ContractionPlans[dict_key] = best_algo - end - else - =# - Cdata .= zero(eltype(Cdata)) - #@show size(Adata) - #@show size(Bdata) - #@show size(Cdata) - @assert !any(isnan.(Adata)) - @assert !any(isnan.(Bdata)) - @assert !any(isnan.(Cdata)) - #@show ctainds - #@show ctbinds - #@show ctcinds - #flush(stdout) - CUDA.Mem.pin(Adata) - CUDA.Mem.pin(Bdata) - CUDA.Mem.pin(Cdata) - synchronize() - #AC = CuArray(Adata) - #BC = CuArray(Bdata) - #CC = CuArray(Cdata) - @assert !any(isnan.(Adata)) - @assert !any(isnan.(Bdata)) - @assert !any(isnan.(Cdata)) - #@assert !any(isnan.(AC)) - #@assert !any(isnan.(BC)) - #@assert !any(isnan.(CC)) - #CC = cuTENSOR.contraction!(α, AC, ctainds, id_op, BC, ctbinds, id_op, β, CC, ctcinds, id_op, id_op) - #synchronize() - #@assert !any(isnan.(AC)) - #@assert !any(isnan.(BC)) - #@assert !any(isnan.(CC)) - Cdata = cuTENSOR.contraction!( - α, Adata, ctainds, id_op, Bdata, ctbinds, id_op, β, Cdata, ctcinds, id_op, id_op - ) - synchronize() - #end - #CCh = collect(CC) - #@assert !any(isnan.(CCh)) - #Cdata .= CCh - @assert !any(isnan.(Adata)) - @assert !any(isnan.(Bdata)) - @assert !any(isnan.(Cdata)) - return parent(Cdata) -end - -function _blasmg_contract!( - CT::DenseTensor{El,NC}, - AT::DenseTensor{El,NA}, - BT::DenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - # TODO: directly use Tensor instead of Array - C = array(CT) - A = array(AT) - B = array(BT) - - tA = 'N' - if props.permuteA - pA = NTuple{NA,Int}(props.PA) - @strided Ap = permutedims(A, pA) - AM = reshape(Ap, props.dmid, props.dleft) - tA = 'T' - else - #A doesn't have to be permuted - if Atrans(props) - AM = reshape(A, props.dmid, props.dleft) - tA = 'T' - else - AM = reshape(A, props.dleft, props.dmid) - end - end - - tB = 'N' - if props.permuteB - pB = NTuple{NB,Int}(props.PB) - @strided Bp = permutedims(B, pB) - BM = reshape(Bp, props.dmid, props.dright) - else - if Btrans(props) - BM = reshape(B, props.dright, props.dmid) - tB = 'T' - else - BM = reshape(B, props.dmid, props.dright) - end - end - - # TODO: this logic may be wrong - if props.permuteC - # Need to copy here since we will be permuting - # into C later - CM = reshape(copy(C), props.dleft, props.dright) - else - if Ctrans(props) - CM = reshape(C, props.dleft, props.dright) - (AM, BM) = (BM, AM) - if tA == tB - tA = tB = (tA == 'T' ? 
'N' : 'T') - end - else - CM = reshape(C, props.dleft, props.dright) - end - end - - if length(AM) > 4096 && length(BM) > 4096 && length(CM) > 4096 - CM = CUBLASMG.mg_gemm!( - tA, - tB, - El(α), - AM, - BM, - El(β), - CM; - devs=devs[], - dev_rows=dev_rows[], - dev_cols=dev_cols[], - ) - else - BLAS.gemm!(tA, tB, El(α), AM, BM, El(β), CM) - end - - if props.permuteC - pC = NTuple{NC,Int}(props.PC) - Cr = reshape(CM, props.newCrange...) - @strided C .= permutedims(Cr, pC) - end - return C -end diff --git a/src/traits.jl b/src/traits.jl deleted file mode 100644 index 6589298..0000000 --- a/src/traits.jl +++ /dev/null @@ -1,51 +0,0 @@ -# Trait type indicating the object is either on CPU -# or on a CUDA device (for example a type that doesn't -# have any data, like a Combiner or uniform Diagonal -# tensor). -struct CPUorCUDA end - -is_cu(::Type{<:Number}) = CPUorCUDA() -is_cu(::Type{NDTensors.NoData}) = CPUorCUDA() -is_cu(::Type{<:Array}) = false -is_cu(::Type{<:CuArray}) = true - -# Handle Array wrappers like `ReshapedArray`. -@traitfn function is_cu(arraytype::Type{T}) where {T; IsWrappedArray{T}} - return is_cu(parenttype(arraytype)) -end - -is_cu(X::Type{<:TensorStorage}) = is_cu(NDTensors.datatype(X)) -is_cu(X::Type{<:Tensor}) = is_cu(NDTensors.storagetype(X)) -is_cu(::Type{ITensor}) = error("Unknown") - -is_cu(x::CuArray) = is_cu(typeof(x)) -is_cu(x::Array) = is_cu(typeof(x)) - -is_cu(x::TensorStorage) = is_cu(typeof(x)) -is_cu(x::Tensor) = is_cu(typeof(x)) -is_cu(x::ITensor) = is_cu(typeof(tensor(x))) -is_cu(x::MPS) = all(is_cu, x) -is_cu(x::MPO) = all(is_cu, x) - -mixed_cu_cpu(::Bool, ::CPUorCUDA) = false -mixed_cu_cpu(::CPUorCUDA, ::Bool) = false -mixed_cu_cpu(::CPUorCUDA, ::CPUorCUDA) = false -mixed_cu_cpu(is_cu1::Bool, is_cu2::Bool) = (is_cu1 ⊻ is_cu2) -mixed_cu_cpu(T1::Type, T2::Type) = mixed_cu_cpu(is_cu(T1), is_cu(T2)) - -@traitdef MixedCuCPU{T1,T2} - -#! format: off -@traitimpl MixedCuCPU{T1,T2} <- mixed_cu_cpu(T1, T2) -#! format: on - -@traitfn function can_contract( - ::Type{T1}, ::Type{T2} -) where {T1<:TensorStorage,T2<:TensorStorage;!MixedCuCPU{T1,T2}} - return true -end -@traitfn function can_contract( - ::Type{T1}, ::Type{T2} -) where {T1<:TensorStorage,T2<:TensorStorage;MixedCuCPU{T1,T2}} - return false -end
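
Note on the rewritten `src/ITensorGPU.jl`: every constructor that survives the deprecation is now a thin wrapper that builds the corresponding ITensors.jl object and moves it to the GPU with `cu`. A minimal usage sketch of those wrappers (not part of the diff; it assumes a working CUDA.jl installation and an ITensors.jl version covered by the new `[compat]` entry, and the index/site choices are arbitrary):

```julia
using ITensors
using ITensorGPU  # now only re-exports cpu/cu plus the wrappers below

i, j = Index(2, "i"), Index(2, "j")
A = randomCuITensor(i, j)   # equivalent to cu(randomITensor(i, j))
B = randomCuITensor(i, j)
C = A + B                   # storage stays on the GPU

sites = siteinds("S=1/2", 10)
psi = randomCuMPS(sites)    # equivalent to cu(randomMPS(sites))
H = cuMPO(sites, "Id")      # equivalent to cu(MPO(sites, "Id"))
```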
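
For new code, the README warning points at ITensors.jl's own GPU support instead; the same workflow without ITensorGPU looks roughly like the sketch below (assuming, per the linked "Running on GPUs" page, that loading CUDA.jl enables `cu`/`cpu` conversion of ITensor objects; names are again arbitrary):

```julia
using ITensors
using CUDA  # loading CUDA.jl provides the GPU storage backend

i, j, k = Index(2, "i"), Index(2, "j"), Index(3, "k")
A = cu(randomITensor(i, j))   # replaces randomCuITensor(i, j)
B = cu(randomITensor(j, k))
C = A * B                     # contraction runs on the GPU

sites = siteinds("S=1/2", 10)
psi = cu(randomMPS(sites))    # replaces randomCuMPS(sites)
psi_cpu = ITensors.cpu(psi)   # move back to host memory when needed
```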