[NDTensors] Fix nonuniform Diag-Dense contractions on GPU (ITensor#1511)
* [NDTensorscuTENSORExt] Better type promotion in cuTENSOR contraction

* [NDTensorsGPUArraysCoreExt] Fix nonuniform Diag-Dense contractions on GPU

* [NDTensors] Fix dot on GPU

* [NDTensors] Fix Diag-Diag contraction on GPU

* [NDTensors] Add GPU-friendly mapreduce

* [NDTensors] Fix mapping by function that doesn't preserve zeros

* [NDTensors] Bump to v0.3.35

* [ITensors] Define sum and prod for ITensor

* [ITensors] Use NDTensors.map_diag in ITensors

* [ITensors] Bump to v0.6.16
mtfishman authored Jun 24, 2024
1 parent 7d5ecf9 commit d3afdb7
Showing 20 changed files with 336 additions and 83 deletions.
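A rough usage sketch of what this commit targets (not part of the diff; assumes ITensors v0.6.16 or later, NDTensors v0.3.35 or later, CUDA.jl, a CUDA-capable device, and that `cu` moves an ITensor's storage to the GPU; index names and sizes are illustrative):

using ITensors, CUDA

i = Index(2, "i"); j = Index(2, "j"); k = Index(3, "k")
D = diagITensor([1.0, 2.0], i, j)  # nonuniform diagonal tensor
A = randomITensor(j, k)            # dense tensor
Dg, Ag = cu(D), cu(A)              # move both to the GPU
C = Dg * Ag                        # nonuniform Diag-Dense contraction fixed here
s = sum(C)                         # `sum(::ITensor)` is added in ITensors v0.6.16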
2 changes: 1 addition & 1 deletion NDTensors/Project.toml
@@ -1,7 +1,7 @@
name = "NDTensors"
uuid = "23ae76d9-e61a-49c4-8f12-3f1a16adf9cf"
authors = ["Matthew Fishman <[email protected]>"]
version = "0.3.34"
version = "0.3.35"

[deps]
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
61 changes: 47 additions & 14 deletions NDTensors/ext/NDTensorsGPUArraysCoreExt/contract.jl
@@ -4,6 +4,42 @@ using NDTensors: NDTensors, DenseTensor, DiagTensor, contract!, dense, inds, Ten
using NDTensors.Expose: Exposed, expose, unexpose
using NDTensors.TypeParameterAccessors: parenttype, set_ndims

function NDTensors.contract!(
output_tensor::Exposed{<:AbstractGPUArray,<:DenseTensor},
labelsoutput_tensor,
tensor1::Exposed{<:AbstractGPUArray,<:DiagTensor},
labelstensor1,
tensor2::Exposed{<:AbstractGPUArray,<:DenseTensor},
labelstensor2,
α::Number=one(Bool),
β::Number=zero(Bool);
convert_to_dense::Bool=true,
)
# Convert tensor1 to dense.
# TODO: Define `Exposed` overload for `dense`.
tensor1 = expose(dense(unexpose(tensor1)))
contract!(
output_tensor, labelsoutput_tensor, tensor1, labelstensor1, tensor2, labelstensor2, α, β
)
return output_tensor
end

function NDTensors.contract!(
output_tensor::Exposed{<:AbstractGPUArray,<:DenseTensor},
labelsoutput_tensor,
tensor1::Exposed{<:AbstractGPUArray,<:DenseTensor},
labelstensor1,
tensor2::Exposed{<:AbstractGPUArray,<:DiagTensor},
labelstensor2,
α::Number=one(Bool),
β::Number=zero(Bool),
)
contract!(
output_tensor, labelsoutput_tensor, tensor2, labelstensor2, tensor1, labelstensor1, α, β
)
return output_tensor
end

## In this function we convert the DiagTensor to a dense tensor and
## Feed it back into contract
function NDTensors.contract!(
@@ -16,20 +16,52 @@ function NDTensors.contract!(
α::Number=one(Bool),
β::Number=zero(Bool),
)
tensor1 = unexpose(tensor1)
## convert tensor1 to a dense
## TODO this allocates on CPU first then moves over to GPU which could be slow
tensor1 = adapt(set_ndims(parenttype(typeof(tensor2)), 1), dense(tensor1))
return contract!(
output_tensor,
labelsoutput_tensor,
expose(tensor1),
labelstensor1,
tensor2,
labelstensor2,
α,
β,
# Convert tensor1 to dense.
# TODO: Define `Exposed` overload for `dense`.
# TODO: This allocates on CPU first then moves over to GPU which could be optimized.
tensor1 = expose(
adapt(set_ndims(parenttype(typeof(tensor2)), 1), dense(unexpose(tensor1)))
)
contract!(
output_tensor, labelsoutput_tensor, tensor1, labelstensor1, tensor2, labelstensor2, α, β
)
return output_tensor
end

function NDTensors.contract!(
@@ -42,7 +74,8 @@ function NDTensors.contract!(
α::Number=one(Bool),
β::Number=zero(Bool),
)
return contract!(
contract!(
output_tensor, labelsoutput_tensor, tensor2, labelstensor2, tensor1, labelstensor1, α, β
)
return output_tensor
end
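The two methods above either densify the `DiagTensor` argument or swap arguments so the `Diag` tensor comes first. A CPU analogy with ordinary arrays (not the extension's actual code path) shows why densifying helps:

using LinearAlgebra
d = Diagonal([1.0, 2.0, 3.0])  # stands in for a nonuniform DiagTensor
A = rand(3, 3)
# Materializing the diagonal as a dense array first mirrors what the GPU
# extension now does: the contraction runs as an ordinary dense matrix
# multiplication instead of a loop over diagonal entries with scalar indexing.
C = Matrix(d) * A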
28 changes: 25 additions & 3 deletions NDTensors/ext/NDTensorscuTENSORExt/contract.jl
@@ -1,7 +1,17 @@
using Base: ReshapedArray
using NDTensors: NDTensors, DenseTensor, array
using NDTensors.Expose: Exposed, unexpose
using NDTensors.Expose: Exposed, expose, unexpose
using cuTENSOR: cuTENSOR, CuArray, CuTensor

# Handle cases that can't be handled by `cuTENSOR.jl`
# right now.
function to_zero_offset_cuarray(a::CuArray)
return iszero(a.offset) ? a : copy(a)
end
function to_zero_offset_cuarray(a::ReshapedArray)
return copy(expose(a))
end

function NDTensors.contract!(
exposedR::Exposed{<:CuArray,<:DenseTensor},
labelsR,
@@ -15,8 +25,20 @@ function NDTensors.contract!(
R, T1, T2 = unexpose.((exposedR, exposedT1, exposedT2))
zoffR = iszero(array(R).offset)
arrayR = zoffR ? array(R) : copy(array(R))
arrayT1 = iszero(array(T1).offset) ? array(T1) : copy(array(T1))
arrayT2 = iszero(array(T2).offset) ? array(T2) : copy(array(T2))
arrayT1 = to_zero_offset_cuarray(array(T1))
arrayT2 = to_zero_offset_cuarray(array(T2))
# Promote to a common type. This is needed because as of
# cuTENSOR.jl v5.4.2, cuTENSOR contraction only performs
# a limited set of type promotions on its inputs, see:
# https://github.com/JuliaGPU/CUDA.jl/blob/v5.4.2/lib/cutensor/src/types.jl#L11-L19
elt = promote_type(eltype.((arrayR, arrayT1, arrayT2))...)
if elt !== eltype(arrayR)
return error(
"In cuTENSOR contraction, input tensors have element types `$(eltype(arrayT1))` and `$(eltype(arrayT2))` while the output has element type `$(eltype(arrayR))`.",
)
end
arrayT1 = convert(CuArray{elt}, arrayT1)
arrayT2 = convert(CuArray{elt}, arrayT2)
cuR = CuTensor(arrayR, collect(labelsR))
cuT1 = CuTensor(arrayT1, collect(labelsT1))
cuT2 = CuTensor(arrayT2, collect(labelsT2))
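A minimal sketch of the promotion check added above, written with ordinary CPU arrays so it runs without a GPU; the extension itself converts to `CuArray{elt}`:

arrayT1 = rand(Float32, 2, 2)
arrayT2 = rand(ComplexF64, 2, 2)
arrayR = zeros(ComplexF64, 2, 2)
elt = promote_type(eltype.((arrayR, arrayT1, arrayT2))...)  # ComplexF64
elt === eltype(arrayR) || error("output cannot hold the promoted element type")
arrayT1 = convert(Array{elt}, arrayT1)  # the extension uses `convert(CuArray{elt}, ...)`
arrayT2 = convert(Array{elt}, arrayT2)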
5 changes: 5 additions & 0 deletions NDTensors/src/abstractarray/generic_array_constructors.jl
@@ -1,6 +1,11 @@
using .TypeParameterAccessors:
unwrap_array_type, specify_default_type_parameters, type_parameter

# Convert to Array, avoiding copying if possible
array(a::AbstractArray) = a
matrix(a::AbstractMatrix) = a
vector(a::AbstractVector) = a

## Warning to use these functions it is necessary to define `TypeParameterAccessors.position(::Type{<:YourArrayType}, ::typeof(ndims)))`
# Implementation, catches if `ndims(arraytype) != length(dims)`.
## TODO convert ndims to `type_parameter(::, typeof(ndims))`
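A small illustration of the new no-op fallbacks (`NDTensors.array`, `matrix`, and `vector` are unexported, so they are shown fully qualified; a sketch, not from the diff):

using NDTensors
v = [1.0, 2.0, 3.0]
NDTensors.array(v) === v   # true: plain arrays pass through without copying
NDTensors.vector(v) === v  # true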
40 changes: 39 additions & 1 deletion NDTensors/src/blocksparse/blocksparsetensor.jl
@@ -367,7 +367,45 @@ function diag(ETensor::Exposed{<:AbstractArray,<:BlockSparseTensor})
return tensordiag
end

## TODO currently this fails on GPU with scalar indexing
function Base.mapreduce(
f, op, t1::BlockSparseTensor, t_tail::BlockSparseTensor...; kwargs...
)
# TODO: Take advantage of block sparsity here.
return mapreduce(f, op, array(t1), array.(t_tail)...; kwargs...)
end

# This is a special case that optimizes for a single tensor
# and takes advantage of block sparsity. Once the more general
# case handles block sparsity, this can be removed.
function Base.mapreduce(f, op, t::BlockSparseTensor; kwargs...)
elt = eltype(t)
if !iszero(f(zero(elt)))
return mapreduce(f, op, array(t); kwargs...)
end
if length(t) > nnz(t)
# Some elements are zero, account for that
# with the initial value.
init_kwargs = (; init=zero(elt))
else
init_kwargs = (;)
end
return mapreduce(f, op, storage(t); kwargs..., init_kwargs...)
end

function blocksparse_isequal(x, y)
return array(x) == array(y)
end
function Base.:(==)(x::BlockSparseTensor, y::BlockSparseTensor)
return blocksparse_isequal(x, y)
end
function Base.:(==)(x::BlockSparseTensor, y::Tensor)
return blocksparse_isequal(x, y)
end
function Base.:(==)(x::Tensor, y::BlockSparseTensor)
return blocksparse_isequal(x, y)
end

## TODO currently this fails on GPU with scalar indexing
function map_diag!(
f::Function,
exposed_t_destination::Exposed{<:AbstractArray,<:BlockSparseTensor},
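The `init` handling above matters because the reduction only visits stored entries; a plain-array sketch (not from the diff) of the pitfall it guards against:

stored = [-2.0, -3.0]                       # stored entries of a sparse tensor
mapreduce(identity, max, stored)            # -2.0: wrong if structural zeros also exist
mapreduce(identity, max, stored; init=0.0)  # 0.0: zeros accounted for via `init`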
15 changes: 15 additions & 0 deletions NDTensors/src/diag/diagtensor.jl
@@ -195,6 +195,21 @@ function permutedims!!(
return RR
end

function Base.mapreduce(f, op, t1::DiagTensor, t_tail::DiagTensor...; kwargs...)
elt = mapreduce(eltype, promote_type, (t1, t_tail...))
if !iszero(f(zero(elt)))
return mapreduce(f, op, array(t1), array.(t_tail)...; kwargs...)
end
if length(t1) > diaglength(t1)
# Some elements are zero, account for that
# with the initial value.
init_kwargs = (; init=zero(elt))
else
init_kwargs = (;)
end
return mapreduce(f, op, diagview(t1), diagview.(t_tail)...; kwargs..., init_kwargs...)
end

function Base.show(io::IO, mime::MIME"text/plain", T::DiagTensor)
summary(io, T)
print_tensor(io, T)
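The check `!iszero(f(zero(elt)))` decides between reducing only the stored diagonal and falling back to the dense array; a quick illustration (not from the diff):

f = x -> x + 1            # f(0) == 1: does not preserve zeros, dense fallback is used
iszero(f(zero(Float64)))  # false
g = x -> 2x               # g(0) == 0: preserves zeros, only `diagview` is reduced
iszero(g(zero(Float64)))  # true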
19 changes: 6 additions & 13 deletions NDTensors/src/diag/tensoralgebra/contract.jl
@@ -89,20 +89,13 @@ function contract!(
labelsT2,
) where {ElR,NR,N1,N2}
if NR == 0 # If all indices of A and B are contracted
# all indices are summed over, just add the product of the diagonal
# elements of A and B
Rdiag = zero(ElR)
for i in 1:diaglength(T1)
Rdiag += getdiagindex(T1, i) * getdiagindex(T2, i)
end
setdiagindex!(R, Rdiag, 1)
# All indices are summed over, just add the product of the diagonal
# elements of A and B.
# `expose` allows dispatching on the data type
# in order to allow scalar indexing on GPU.
expose(R)[] = mapreduce(*, +, diagview(T1), diagview(T2))
else
min_dim = min(diaglength(T1), diaglength(T2))
# not all indices are summed over, set the diagonals of the result
# to the product of the diagonals of A and B
for i in 1:min_dim
setdiagindex!(R, getdiagindex(T1, i) * getdiagindex(T2, i), i)
end
diagview(R) .= diagview(T1) .* diagview(T2)
end
return R
end
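A CPU analogue of the two branches above, using `LinearAlgebra.Diagonal` in place of `DiagTensor` (illustrative only, not from the diff):

using LinearAlgebra
A = Diagonal([1.0, 2.0, 3.0])
B = Diagonal([4.0, 5.0, 6.0])
# Full contraction: sum of products of the diagonals, what `expose(R)[] = ...` stores.
s = mapreduce(*, +, diag(A), diag(B))  # 32.0
# Partial contraction: the result stays diagonal; broadcasting avoids a scalar-index loop.
C = Diagonal(zeros(3))
C.diag .= diag(A) .* diag(B)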
18 changes: 15 additions & 3 deletions NDTensors/src/linearalgebra/linearalgebra.jl
@@ -8,14 +8,26 @@ using .RankFactorization: Spectrum
# be made <: StridedArray
import .Expose: qr_positive, ql, ql_positive

function (
T1::Tensor{ElT1,2,StoreT1} * T2::Tensor{ElT2,2,StoreT2}
) where {ElT1,StoreT1<:Dense,ElT2,StoreT2<:Dense}
# TODO: Generalize this to any `Tensor` type using:
# ```julia
# contract(T1, (1, -1), T2, (-1, 2))
# ```
function Base.:*(T1::Tensor{<:Any,2,<:Dense}, T2::Tensor{<:Any,2,<:Dense})
RM = matrix(T1) * matrix(T2)
indsR = (ind(T1, 1), ind(T2, 2))
return tensor(Dense(vec(RM)), indsR)
end

function LinearAlgebra.dot(x::Tensor, y::Tensor)
size(x) == size(y) || throw(
DimensionMismatch(
"dimensions must match in `dot(x::Tensor, y::Tensor)`: `x` has size `$(size(x))` while `y` has size `$(size(y))`.",
),
)
labels = ntuple(dim -> -dim, ndims(x))
return contract(conj(x), labels, y, labels)[]
end

function LinearAlgebra.exp(T::DenseTensor{ElT,2}) where {ElT<:Union{Real,Complex}}
expTM = exp(matrix(T))
return tensor(Dense(vec(expTM)), inds(T))
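What the new `dot` method computes, shown with plain matrices (the `Tensor` method does the same thing via `contract` over matching negative labels; not from the diff):

using LinearAlgebra
x = rand(ComplexF64, 2, 3)
y = rand(ComplexF64, 2, 3)
dot(x, y) ≈ sum(conj.(x) .* y)  # true: conjugate-linear in the first argument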
77 changes: 67 additions & 10 deletions NDTensors/src/tensor/tensor.jl
@@ -210,13 +210,21 @@ real(T::Tensor) = setstorage(T, real(storage(T)))

imag(T::Tensor) = setstorage(T, imag(storage(T)))

function map(f, x::Tensor{T}) where {T}
if !iszero(f(zero(T)))
error(
"map(f, ::Tensor) currently doesn't support functions that don't preserve zeros, while you passed a function such that f(0) = $(f(zero(T))). This isn't supported right now because it doesn't necessarily preserve the sparsity structure of the input tensor.",
)
function Base.map(f, t1::Tensor, t_tail::Tensor...; kwargs...)
elt = mapreduce(eltype, promote_type, (t1, t_tail...))
if !iszero(f(zero(elt)))
# TODO: Do a better job of preserving the storage type, if possible.
return tensor(Dense(map(f, array(t1), array.(t_tail)...; kwargs...)), inds(t1))
end
return setstorage(x, map(f, storage(x)))
return setstorage(t1, map(f, storage(t1), storage.(t_tail)...; kwargs...))
end

function Base.mapreduce(f, op, t1::Tensor, t_tail::Tensor...; kwargs...)
elt = mapreduce(eltype, promote_type, (t1, t_tail...))
if !iszero(f(zero(elt)))
return mapreduce(f, op, array(t1), array.(t_tail)...; kwargs...)
end
return mapreduce(f, op, storage(t1), storage.(t_tail)...; kwargs...)
end

#
@@ -281,6 +289,9 @@ array(T::Tensor) = array(dense(T))
matrix(T::Tensor{<:Number,2}) = array(T)
vector(T::Tensor{<:Number,1}) = array(T)

array(T::Transpose{<:Any,<:Tensor}) = transpose(array(transpose(T)))
matrix(T::Transpose{<:Any,<:Tensor}) = transpose(array(transpose(T)))

#
# Helper functions for BlockSparse-type storage
#
@@ -352,6 +363,42 @@ end

insertblock!!(T::Tensor, block) = insertblock!(T, block)

function tensor_isequal(x, y)
# TODO: Use a reduction to avoid intermediates.
# This doesn't work right now because `mapreduce`
# on `Tensor`s is limited to functions that preserve
# zeros.
# return mapreduce(==, ==, x, y)

# TODO: Use `x - y` instead of `map(-, x, y)`.
# `x - y` calls `x .- y` and broadcasting isn't
# defined properly for sparse Tensor storage
# like `Diag` and `BlockSparse`.
return iszero(norm(map(-, x, y)))
end

function Base.:(==)(x::Tensor, y::Tensor)
return tensor_isequal(x, y)
end

function Base.:(==)(x::AbstractArray, y::Tensor)
return array(x) == array(y)
end
function Base.:(==)(x::Tensor, y::AbstractArray)
return array(x) == array(y)
end

function Base.isequal(x::Tensor, y::Tensor)
return tensor_isequal(x, y)
end

function Base.isequal(x::AbstractArray, y::Tensor)
return isequal(array(x), array(y))
end
function Base.isequal(x::Tensor, y::AbstractArray)
return isequal(array(x), array(y))
end

"""
getdiagindex
@@ -386,11 +433,21 @@ function setdiagindex!(T::Tensor{<:Number,N}, val, ind::Int) where {N}
return T
end

function map_diag!(f::Function, exposed_t_destination::Exposed, exposed_t_source::Exposed)
diagview(unexpose(exposed_t_destination)) .= f.(diagview(unexpose(exposed_t_source)))
return unexpose(exposed_t_destination)
function map_diag!(f::Function, t_dest::Tensor, t_src::Tensor)
map_diag!(f, expose(t_dest), expose(t_src))
return t_dest
end
function map_diag!(f::Function, exposed_t_dest::Exposed, exposed_t_src::Exposed)
diagview(unexpose(exposed_t_dest)) .= f.(diagview(unexpose(exposed_t_src)))
return unexpose(exposed_t_dest)
end

map_diag(f::Function, t::Tensor) = map_diag(f, expose(t))
function map_diag(f::Function, exposed_t::Exposed)
t_dest = copy(exposed_t)
map_diag!(f, expose(t_dest), exposed_t)
return t_dest
end
map_diag(f::Function, t::Tensor) = map_diag!(f, expose(copy(t)), expose(t))

#
# Some generic contraction functionality
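A plain-array analogue of `map_diag!` as defined above: apply `f` to a view of the diagonal via broadcasting rather than a scalar-index loop (illustrative, not the NDTensors API):

using LinearAlgebra
A = [1.0 2.0; 3.0 4.0]
B = copy(A)
view(B, diagind(B)) .= exp.(view(A, diagind(A)))  # B's diagonal becomes exp.(diag(A))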
13 changes: 6 additions & 7 deletions NDTensors/src/tensorstorage/tensorstorage.jl
@@ -68,13 +68,12 @@ end
Random.randn!(S::TensorStorage) = randn!(Random.default_rng(), S)
Random.randn!(rng::AbstractRNG, S::TensorStorage) = (randn!(rng, data(S)); S)

function map(f, x::TensorStorage{T}) where {T}
if !iszero(f(zero(T)))
error(
"map(f, ::TensorStorage) currently doesn't support functions that don't preserve zeros, while you passed a function such that f(0) = $(f(zero(T))). This isn't supported right now because it doesn't necessarily preserve the sparsity structure of the input tensor.",
)
end
return setdata(x, map(f, data(x)))
function Base.map(f, t1::TensorStorage, t_tail::TensorStorage...; kwargs...)
return setdata(t1, map(f, data(t1), data.(t_tail)...; kwargs...))
end

function Base.mapreduce(f, op, t1::TensorStorage, t_tail::TensorStorage...; kwargs...)
return mapreduce(f, op, data(t1), data.(t_tail)...; kwargs...)
end

Base.fill!(S::TensorStorage, v) = (fill!(data(S), v); S)
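With the zero-preservation check moved up to the `Tensor` level, the storage-level methods become thin wrappers over the underlying data; roughly (not from the diff):

data1 = [1.0, 2.0]; data2 = [3.0, 4.0]
map(+, data1, data2)           # [4.0, 6.0]: what `map` on two storages forwards to
mapreduce(*, +, data1, data2)  # 11.0: elementwise product, then sum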