diff --git a/Project.toml b/Project.toml index 762e51e..b889938 100644 --- a/Project.toml +++ b/Project.toml @@ -1,33 +1,13 @@ name = "ITensorGPU" uuid = "d89171c1-af8f-46b3-badf-d2a472317c15" authors = ["Katharine Hyatt", "Matthew Fishman "] -version = "0.1.7" +version = "0.2.0" [deps] -Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" -Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" ITensors = "9136182c-28ba-11e9-034c-db9fb085ebd5" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -NDTensors = "23ae76d9-e61a-49c4-8f12-3f1a16adf9cf" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SimpleTraits = "699a6c99-e7fa-54fc-8d76-47d257e15c1d" -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67" -TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [compat] -Adapt = "3.5, 4" -CUDA = "4.0" -Combinatorics = "1.0.2" -Functors = "0.2, 0.3, 0.4" -ITensors = "= 0.3.37" -NDTensors = "0.1.50" -SimpleTraits = "0.9.4" -StaticArrays = "1.2.13" -Strided = "1.1.2, 2" -TimerOutputs = "0.5.13" -cuTENSOR = "1.1.0" -julia = "1.6 - 1.9" +CUDA = "4, 5" +ITensors = "0.3, 0.4, 0.5" +julia = "1.6" diff --git a/README.md b/README.md index 40238be..b0578b0 100644 --- a/README.md +++ b/README.md @@ -1,50 +1,3 @@ -# ITensorGPU: Intelligent Tensors with GPU acceleration - -[![codecov](https://codecov.io/gh/ITensor/ITensorGPU.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/ITensor/ITensorGPU.jl) - -[![gitlab-ci](https://gitlab.com/JuliaGPU/ITensorGPU-jl/badges/master/pipeline.svg)](https://gitlab.com/JuliaGPU/ITensorGPU-jl/commits/master) - -This package extends the functionality of [ITensors.jl](https://github.com/ITensor/ITensors.jl) to make use of CUDA-enabled GPUs to accelerate tensor contractions and factorizations. It sits on top of the wonderful [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) package and uses NVIDIA's [cuTENSOR](https://developer.nvidia.com/cutensor) library for high-performance tensor operations. - -## Installing ITensorGPU.jl - -Dependencies: - - [Julia 1.3 or later](https://julialang.org/downloads/) - - [CUDA 10.1 or later](https://developer.nvidia.com/cuda-downloads) -- Currently only NVIDIA GPUs are supported. NVIDIA drivers are required so that Julia can make use of the NVIDIA GPU on your system. - - [cuTENSOR v1.0.0 or later](https://developer.nvidia.com/cutensor) -- A specialized library for perfoming permutation-free tensor contractions on the GPU. `libcutensor.so` needs to be in your `LD_LIBRARY_PATH` so that `CUDA.jl` will be able to find it. - - [ITensors.jl](https://itensor.github.io/ITensors.jl/stable/#Installation-1) - -To properly install CUDA with Julia, it may be helpful to first follow the [CUDA.jl installation instructions](https://juliagpu.github.io/CUDA.jl/stable/installation/overview/) and test that you have that installed properly and that it is able to use `cuTENSOR`. You can run the commands: -```julia -julia> using CUDA.CUTENSOR - -julia> CUTENSOR.has_cutensor() -true - -julia> CUTENSOR.version() -v"1.2.1" -``` -to check that `CUDA.jl` can see the version of `cuTENSOR` you have installed. 
- -Once you have all of the dependencies installed, you can then go ahead and install `ITensorGPU.jl` with the following command: -``` -julia> ] - -pkg> add ITensorGPU -``` - -To check if this has all worked, you can run the package tests using: -```julia -julia> ] - -pkg> test ITensorGPU -``` - -## Examples - -Take a look at the `examples/` directory for examples of running ITensor calculations on the GPU. - -For an application of `ITensorGPU.jl` to more sophisticated tensor network calculations, take a look at [PEPS.jl](https://github.com/ITensor/PEPS.jl). - -For some background on the development and design of this package, you can take a look at [this blog post](https://kshyatt.github.io/post/itensorsgpu/) by Katie Hyatt, original author of the `ITensorGPU.jl` package. - +| :warning: WARNING | +|:---------------------------| +| The ITensorGPU.jl package is deprecated and only provided for backwards compatibility. For an alternative, see the ITensors.jl documentation about [running ITensor on GPUs](https://itensor.github.io/ITensors.jl/dev/RunningOnGPUs.html). | diff --git a/src/ITensorGPU.jl b/src/ITensorGPU.jl index 93be4c9..af8696e 100644 --- a/src/ITensorGPU.jl +++ b/src/ITensorGPU.jl @@ -1,114 +1,29 @@ module ITensorGPU -using NDTensors - -using Adapt -using CUDA -using CUDA.CUBLAS -using CUDA.CUSOLVER -using Functors -using ITensors -using LinearAlgebra -using Random -using SimpleTraits -using StaticArrays -using Strided -using TimerOutputs -using cuTENSOR - -using NDTensors: setdata, setstorage, cpu, IsWrappedArray, parenttype - -import Adapt: adapt_structure -import Base: *, permutedims! -import CUDA: CuArray, CuMatrix, CuVector, cu -import CUDA.Mem: pin -import ITensors: - randn!, - compute_contraction_labels, - eigen, - tensor, - scale!, - unioninds, - array, - matrix, - vector, - polar, - tensors, - truncate!, - leftlim, - rightlim, - permute, - BroadcastStyle, - Indices -import NDTensors: - Atrans, - Btrans, - CombinerTensor, - ContractionProperties, - Combiner, - Ctrans, - Diag, - DiagTensor, - Dense, - DenseTensor, - NonuniformDiag, - NonuniformDiagTensor, - Tensor, - UniformDiag, - UniformDiagTensor, - _contract!!, - _contract!, - _contract_scalar!, - _contract_scalar_noperm!, - can_contract, - compute_contraction_properties!, - contract!!, - contract!, - contract, - contraction_output, - contraction_output_type, - data, - getperm, - ind, - is_trivial_permutation, - outer!, - outer!!, - permutedims!!, - set_eltype, - set_ndims, - similartype, - zero_contraction_output -import cuTENSOR: cutensorContractionPlan_t, cutensorAlgo_t - -#const ContractionPlans = Dict{String, Tuple{cutensorAlgo_t, cutensorContractionPlan_t}}() -const ContractionPlans = Dict{String,cutensorAlgo_t}() - -include("cuarray/set_types.jl") -include("traits.jl") -include("adapt.jl") -include("tensor/cudense.jl") -include("tensor/dense.jl") -include("tensor/culinearalgebra.jl") -include("tensor/cutruncate.jl") -include("tensor/cucombiner.jl") -include("tensor/cudiag.jl") -include("cuitensor.jl") -include("mps/cumps.jl") - -export cu, - cpu, cuITensor, randomCuITensor, cuMPS, randomCuMPS, productCuMPS, randomCuMPO, cuMPO - -## TODO: Is this needed? 
-## const devs = Ref{Vector{CUDAdrv.CuDevice}}()
-## const dev_rows = Ref{Int}(0)
-## const dev_cols = Ref{Int}(0)
-## function __init__()
-##   voltas = filter(dev->occursin("V100", CUDAdrv.name(dev)), collect(CUDAdrv.devices()))
-##   pascals = filter(dev->occursin("P100", CUDAdrv.name(dev)), collect(CUDAdrv.devices()))
-##   devs[] = voltas[1:1]
-##   #devs[] = pascals[1:2]
-##   CUBLASMG.cublasMgDeviceSelect(CUBLASMG.mg_handle(), length(devs[]), devs[])
-##   dev_rows[] = 1
-##   dev_cols[] = 1
-## end
-
-end #module
+using CUDA: CUDA
+using ITensors: cpu, cu
+export cpu, cu
+
+using ITensors: ITensor, randomITensor
+function cuITensor(args...; kwargs...)
+  return cu(ITensor(args...; kwargs...))
+end
+function randomCuITensor(args...; kwargs...)
+  return cu(randomITensor(args...; kwargs...))
+end
+export cuITensor, randomCuITensor
+
+using ITensors.ITensorMPS: MPO, MPS, randomMPO, randomMPS
+function cuMPS(args...; kwargs...)
+  return cu(MPS(args...; kwargs...))
+end
+function productCuMPS(args...; kwargs...)
+  return cu(MPS(args...; kwargs...))
+end
+function randomCuMPS(args...; kwargs...)
+  return cu(randomMPS(args...; kwargs...))
+end
+function cuMPO(args...; kwargs...)
+  return cu(MPO(args...; kwargs...))
+end
+function randomCuMPO(args...; kwargs...)
+  return cu(randomMPO(args...; kwargs...))
+end
+export cuMPO, cuMPS, productCuMPS, randomCuMPO, randomCuMPS
+end
diff --git a/src/adapt.jl b/src/adapt.jl
deleted file mode 100644
index fe40c14..0000000
--- a/src/adapt.jl
+++ /dev/null
@@ -1,33 +0,0 @@
-#
-# Used to adapt `EmptyStorage` types
-#
-
-function NDTensors.cu(eltype::Type{<:Number}, x)
-  return fmap(x -> adapt(CuVector{eltype,default_buffertype()}, x), x)
-end
-NDTensors.cu(x) = fmap(x -> adapt(CuArray, x), x)
-
-function NDTensors.set_eltype_if_unspecified(
-  arraytype::Type{CuVector{T}}, eltype::Type
-) where {T}
-  return arraytype
-end
-function NDTensors.set_eltype_if_unspecified(arraytype::Type{CuVector}, eltype::Type)
-  return CuVector{eltype}
-end
-
-# Overload `CUDA.cu` for convenience
-const ITensorType = Union{
-  TensorStorage,Tensor,ITensor,Array{ITensor},Array{<:Array{ITensor}},MPS,MPO
-}
-CUDA.cu(x::ITensorType) = NDTensors.cu(x)
-CUDA.cu(eltype::Type{<:Number}, x::ITensorType) = NDTensors.cu(eltype, x)
-
-function NDTensors.adapt_storagetype(
-  to::Type{<:CUDA.CuArray}, x::Type{<:NDTensors.EmptyStorage}
-)
-  store = NDTensors.storagetype(x)
-  return NDTensors.emptytype(
-    NDTensors.set_datatype(store, CuVector{eltype(store),default_buffertype()})
-  )
-end
diff --git a/src/cuarray/set_types.jl b/src/cuarray/set_types.jl
deleted file mode 100644
index 21beda4..0000000
--- a/src/cuarray/set_types.jl
+++ /dev/null
@@ -1,17 +0,0 @@
-buffertype(datatype::Type{<:CuArray{<:Any,<:Any,B}}) where {B} = B
-function buffertype(datatype::Type{<:CuArray})
-  println(
-    "CuArray definitions require a CUDA.Mem buffer try $(datatype{default_buffertype()})"
-  )
-  throw(TypeError)
-end
-
-default_buffertype() = CUDA.Mem.DeviceBuffer
-
-function set_eltype(arraytype::Type{<:CuArray}, eltype::Type)
-  return CuArray{eltype,ndims(arraytype),buffertype(arraytype)}
-end
-
-function set_ndims(arraytype::Type{<:CuArray}, ndims)
-  return CuArray{eltype(arraytype),ndims,buffertype(arraytype)}
-end
diff --git a/src/cuitensor.jl b/src/cuitensor.jl
deleted file mode 100644
index 667aa28..0000000
--- a/src/cuitensor.jl
+++ /dev/null
@@ -1,127 +0,0 @@
-import ITensors.NDTensors: NeverAlias, AliasStyle, AllowAlias
-import ITensors: ITensor
-import CUDA: CuArray
-
-function cuITensor(eltype::Type{<:Number}, inds::IndexSet)
-  return itensor(
-
NDTensors.default_storagetype(CuVector{eltype,default_buffertype()})(dim(inds)), inds - ) -end -cuITensor(::Type{T}, inds::Index...) where {T<:Number} = cuITensor(T, IndexSet(inds...)) - -cuITensor(is::IndexSet) = cuITensor(Float64, is) -cuITensor(inds::Index...) = cuITensor(IndexSet(inds...)) - -cuITensor() = ITensor() -function cuITensor(x::S, inds::IndexSet{N}) where {S<:Number,N} - d = NDTensors.Dense{S,CuVector{S,default_buffertype()}}( - fill!(CuVector{S,default_buffertype()}(undef, dim(inds)), x) - ) - return ITensor(d, inds) -end -cuITensor(x::S, inds::Index...) where {S<:Number} = cuITensor(x, IndexSet(inds...)) - -function ITensor( - as::AliasStyle, - eltype::Type{<:Number}, - A::CuArray{<:Number}, - inds::Indices{Index{Int}}; - kwargs..., -) - length(A) ≠ dim(inds) && throw( - DimensionMismatch( - "In ITensor(::CuArray, inds), length of AbstractArray ($(length(A))) must match total dimension of IndexSet ($(dim(inds)))", - ), - ) - data = CuArray{eltype}(as, A) - return itensor(Dense(data), inds) -end - -# Fix ambiguity error -function ITensor( - as::NDTensors.AliasStyle, eltype::Type{<:Number}, A::CuArray{<:Number}, inds::Tuple{} -) - length(A) ≠ dim(inds) && throw( - DimensionMismatch( - "In ITensor(::CuArray, inds), length of AbstractArray ($(length(A))) must match total dimension of IndexSet ($(dim(inds)))", - ), - ) - data = CuArray{eltype}(as, A) - return itensor(Dense(data), inds) -end - -# Helper functions for different view behaviors -CuArray{ElT,N}(::NeverAlias, A::AbstractArray) where {ElT,N} = CuArray{ElT,N}(A) -function CuArray{ElT,N}(::AllowAlias, A::AbstractArray) where {ElT,N} - return convert(CuArray{ElT,N}, A) -end -function CuArray{ElT}(as::AliasStyle, A::AbstractArray{ElTA,N}) where {ElT,N,ElTA} - return CuArray{ElT,N}(as, A) -end - -# TODO: Change to: -# (Array{ElT, N} where {ElT})([...]) = [...] -# once support for `VERSION < v"1.6"` is dropped. -# Previous to Julia v1.6 `where` syntax couldn't be used in a function name -function CuArray{<:Any,N}(as::AliasStyle, A::AbstractArray{ElTA,N}) where {N,ElTA} - return CuArray{ElTA,N}(as, A) -end - -cuITensor(data::Array, inds...) = cu(ITensor(data, inds...)) - -cuITensor(data::CuArray, inds...) = ITensor(data, inds...) - -cuITensor(A::ITensor) = cu(A) - -function randomCuITensor(::Type{S}, inds::Indices) where {S<:Real} - T = cuITensor(S, inds) - randn!(T) - return T -end -function randomCuITensor(::Type{S}, inds::Indices) where {S<:Complex} - Tr = cuITensor(real(S), inds) - Ti = cuITensor(real(S), inds) - randn!(Tr) - randn!(Ti) - return complex(Tr) + im * Ti -end -function randomCuITensor(::Type{S}, inds::Index...) where {S<:Number} - return randomCuITensor(S, IndexSet(inds...)) -end -randomCuITensor(inds::IndexSet) = randomCuITensor(Float64, inds) -randomCuITensor(inds::Index...) = randomCuITensor(Float64, IndexSet(inds...)) - -CuArray(T::ITensor) = CuArray(tensor(T)) - -function CuArray{ElT,N}(T::ITensor, is::Vararg{Index,N}) where {ElT,N} - ndims(T) != N && throw( - DimensionMismatch( - "cannot convert an $(ndims(T)) dimensional ITensor to an $N-dimensional CuArray." - ), - ) - TT = tensor(permute(T, is...; allow_alias=true)) - return CuArray{ElT,N}(TT)::CuArray{ElT,N} -end - -function CuArray{ElT}(T::ITensor, is::Vararg{Index,N}) where {ElT,N} - return CuArray{ElT,N}(T, is...) 
-end - -function CuArray(T::ITensor, is::Vararg{Index,N}) where {N} - return CuArray{eltype(T),N}(T, is...)::CuArray{<:Number,N} -end - -CUDA.CuMatrix(A::ITensor) = CuArray(A) - -function CuVector(A::ITensor) - if ndims(A) != 1 - throw(DimensionMismatch("Vector() expected a 1-index ITensor")) - end - return CuArray(A) -end - -function CuMatrix(T::ITensor, i1::Index, i2::Index) - ndims(T) != 2 && - throw(DimensionMismatch("ITensor must be order 2 to convert to a Matrix")) - return CuArray(T, i1, i2) -end diff --git a/src/mps/cumps.jl b/src/mps/cumps.jl deleted file mode 100644 index f59099f..0000000 --- a/src/mps/cumps.jl +++ /dev/null @@ -1,13 +0,0 @@ -# cu(ψ::Union{MPS,MPO}) = map(cu, ψ) -# cpu(ψ::Union{MPS,MPO}) = map(cpu, ψ) - -cuMPS(ψ::MPS) = cu(ψ) -cuMPS(args...; kwargs...) = cu(MPS(args...; kwargs...)) -randomCuMPS(args...; kwargs...) = cu(randomMPS(args...; kwargs...)) - -# For backwards compatibility -productCuMPS(args...; kwargs...) = cuMPS(args...; kwargs...) - -cuMPO(M::MPO) = cu(M) -cuMPO(args...; kwargs...) = cu(MPO(args...; kwargs...)) -randomCuMPO(args...; kwargs...) = cu(randomMPO(args...; kwargs...)) diff --git a/src/tensor/cucombiner.jl b/src/tensor/cucombiner.jl deleted file mode 100644 index d01109a..0000000 --- a/src/tensor/cucombiner.jl +++ /dev/null @@ -1 +0,0 @@ -Base.promote_rule(::Type{<:Combiner}, StorageT::Type{<:CuDense}) = StorageT diff --git a/src/tensor/cudense.jl b/src/tensor/cudense.jl deleted file mode 100644 index 2db9055..0000000 --- a/src/tensor/cudense.jl +++ /dev/null @@ -1,573 +0,0 @@ -using LinearAlgebra: BlasFloat - -const CuDense{ElT,VecT} = Dense{ElT,VecT} where {VecT<:CuVector} -const CuDenseTensor{ElT,N,StoreT,IndsT} = Tensor{ElT,N,StoreT,IndsT} where {StoreT<:CuDense} - -# function Dense{T,SA}(x::Dense{T,SB}) where {T<:Number,SA<:CuArray,SB<:Array} -# return Dense{T,SA}(CuArray(x)) -# end -# function Dense{T,SA}(x::Dense{T,SB}) where {T<:Number,SA<:Array,SB<:CuArray} -# return Dense{T,SA}(collect(x.data)) -# end -# Dense{T,S}(size::Integer) where {T,S<:CuArray{<:T}} = Dense{T,S}(CUDA.zeros(T, size)) -# function Dense{T,S}(x::T, size::Integer) where {T,S<:CuArray{<:T}} -# arr = CuArray{T}(undef, size) -# fill!(arr, x) -# return Dense{T,S}(arr) -# end - -function Base.complex(::Type{Dense{ElT,VT}}) where {ElT,VT<:CuArray} - return Dense{complex(ElT),CuVector{complex(ElT)}} -end - -CuArray(x::CuDense{ElT}) where {ElT} = CuVector{ElT}(data(x)) -function CuArray{ElT,N}(x::CuDenseTensor{ElT,N}) where {ElT,N} - return CuArray{ElT,N}(reshape(data(store(x)), dims(inds(x))...)) -end -CuArray(x::CuDenseTensor{ElT,N}) where {ElT,N} = CuArray{ElT,N}(x) - -*(D::Dense{T,AT}, x::S) where {T,AT<:CuArray,S<:Number} = Dense(x .* data(D)) - -Base.getindex(D::CuDense{<:Number}) = collect(data(D))[] -Base.getindex(D::CuDenseTensor{<:Number,0}) = store(D)[] -LinearAlgebra.norm(T::CuDenseTensor) = norm(data(store(T))) - -function Base.copyto!(R::CuDenseTensor{<:Number,N}, T::CuDenseTensor{<:Number,N}) where {N} - RA = array(R) - TA = array(T) - RA .= TA - return R -end - -# This is for type promotion for Scalar*Dense -function Base.promote_rule( - ::Type{<:Dense{ElT1,CuVector{ElT1}}}, ::Type{ElT2} -) where {ElT1,ElT2<:Number} - ElR = promote_type(ElT1, ElT2) - VecR = CuVector{ElR} - return Dense{ElR,VecR} -end - -function permutedims!!( - B::Tensor{ElT,N,StoreT,IndsB}, - A::Tensor{ElT,N,StoreT,IndsA}, - perm::NTuple{N,Int}, - f::Function=(r, t) -> permute!(r, t), -) where {N,ElT,IndsB,IndsA,StoreT<:CuDense{ElT}} - Ais = inds(A) - Bis = 
ITensors.NDTensors.permute(inds(A), perm) - B = f(B, A) - return B -end - -import ITensors.NDTensors: GemmBackend, auto_select_backend, _gemm! -function backend_cutensor() - return gemm_backend[] = :CUTENSOR -end -function backend_cublas() - return gemm_backend[] = :CUBLAS -end - -@inline function auto_select_backend( - ::Type{<:CuArray{<:BlasFloat}}, - ::Type{<:CuArray{<:BlasFloat}}, - ::Type{<:CuArray{<:BlasFloat}}, -) - return GemmBackend(:CUBLAS) -end - -@inline function auto_select_backend( - ::Type{<:CuArray{<:BlasFloat}}, ::Type{<:CuArray{<:BlasFloat}}, ::Type{<:AbstractVecOrMat} -) - return GemmBackend(:GenericCUDA) -end - -# CUBLAS matmul -function _gemm!( - ::GemmBackend{:CUBLAS}, - tA, - tB, - alpha, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta, - C::AbstractVecOrMat, -) - return CUBLAS.gemm!(tA, tB, alpha, A, B, beta, C) -end - -# CUDA generic matmul -function _gemm!( - ::GemmBackend{:GenericCUDA}, - tA, - tB, - alpha, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta, - C::CuDenseTensor, -) - C_dat = reshape(data(store(C)), size(C)) - A_ = tA == 'T' ? transpose(A) : A - B_ = tB == 'T' ? transpose(B) : B - C_dat = mul!(C_dat, A_, B_, alpha, beta) - copyto!(data(store(C)), C_dat) - return C -end - -function _contract_scalar!( - R::CuDenseTensor{ElR,NR}, - labelsR, - T₁::CuDenseTensor, - labelsT₁, - T₂::CuDenseTensor, - labelsT₂, - α=one(ElR), - β=zero(ElR), -) where {ElR,NR} - if nnz(T₁) == nnz(T₂) == 1 - new_R = Tensor(Dense(data(store(T₁)) .* data(store(T₂))), inds(R)) - copyto!(store(R), store(new_R)) - elseif nnz(T₁) == 1 - props = ContractionProperties(labelsT₁, labelsT₂, labelsR) - compute_contraction_properties!(props, T₁, T₂, R) - R = _contract!(R, T₁, T₂, props, α, β) - elseif nnz(T₂) == 1 - props = ContractionProperties(labelsT₁, labelsT₂, labelsR) - compute_contraction_properties!(props, T₁, T₂, R) - R = _contract!(R, T₁, T₂, props, α, β) - else - error("In _contract_scalar!, one tensor must be a scalar") - end - return R -end - -function _gemm_contract!( - CT::DenseTensor{El,NC}, - AT::DenseTensor{El,NA}, - BT::DenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - # TODO: directly use Tensor instead of Array - C = array(CT) - A = array(AT) - B = array(BT) - - tA = 'N' - if props.permuteA - pA = NTuple{NA,Int}(props.PA) - Ap = permutedims(A, pA) - AM = reshape(Ap, props.dmid, props.dleft) - tA = 'T' - else - #A doesn't have to be permuted - if Atrans(props) - AM = reshape(A, props.dmid, props.dleft) - tA = 'T' - else - AM = reshape(A, props.dleft, props.dmid) - end - end - - tB = 'N' - if props.permuteB - pB = NTuple{NB,Int}(props.PB) - Bp = permutedims(B, pB) - BM = reshape(Bp, props.dmid, props.dright) - else - if Btrans(props) - BM = reshape(B, props.dright, props.dmid) - tB = 'T' - else - BM = reshape(B, props.dmid, props.dright) - end - end - - #TODO: this logic may be wrong - if props.permuteC - #Need to copy here since we will be permuting - #into C later - CM = reshape(copy(C), props.dleft, props.dright) - else - if Ctrans(props) - CM = reshape(C, props.dleft, props.dright) - (AM, BM) = (BM, AM) - if tA == tB - tA = tB = (tA == 'T' ? 'N' : 'T') - end - else - CM = reshape(C, props.dleft, props.dright) - end - end - - CM = CUBLAS.gemm!(tA, tB, El(α), AM, BM, El(β), CM) - - if props.permuteC - pC = NTuple{NC,Int}(props.PC) - Cr = reshape(CM, props.newCrange...) 
- @strided C .= permutedims(Cr, pC) - end - return C -end - -function _contract!( - CT::CuDenseTensor{El,NC}, - AT::CuDenseTensor{El,NA}, - BT::CuDenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - if ndims(CT) > 12 || ndims(BT) > 12 || ndims(AT) > 12 - return _gemm_contract!(CT, AT, BT, props, α, β) - end - Ainds = inds(AT) - Adims = dims(Ainds) - Binds = inds(BT) - Bdims = dims(Binds) - Cinds = inds(CT) - Cdims = dims(Cinds) - Adata = reshape(data(store(AT)), Adims...) - Bdata = reshape(data(store(BT)), Bdims...) - Cdata = reshape(data(store(CT)), Cdims...) - contracted = commoninds(Ainds, Binds) - A_only = uniqueinds(Ainds, Binds) - B_only = uniqueinds(Binds, Ainds) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(contracted) - push!(ind_dict, i) - end - if length(A_only) > 0 - for (idx, i) in enumerate(A_only) - push!(ind_dict, i) - end - end - if length(B_only) > 0 - for (idx, i) in enumerate(B_only) - push!(ind_dict, i) - end - end - ctainds = zeros(Int, length(Ainds)) - ctbinds = zeros(Int, length(Binds)) - ctcinds = zeros(Int, length(Cinds)) - for (ii, ia) in enumerate(Ainds) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Binds) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - for (ii, ic) in enumerate(Cinds) - ctcinds[ii] = findfirst(x -> x == ic, ind_dict) - end - id_op = cuTENSOR.CUTENSOR_OP_IDENTITY - dict_key = "" - for cc in zip(ctcinds, Cdims) - dict_key *= string(cc[1]) * "," * string(cc[2]) * "," - end - for aa in zip(ctainds, Adims) - dict_key *= string(aa[1]) * "," * string(aa[2]) * "," - end - for bb in zip(ctbinds, Bdims) - dict_key *= string(bb[1]) * "," * string(bb[2]) * "," - end - if haskey(ENV, "CUTENSOR_AUTOTUNE") && tryparse(Int, ENV["CUTENSOR_AUTOTUNE"]) == 1 - if haskey(ContractionPlans, dict_key) - dict_val = ContractionPlans[dict_key] - algo = dict_val - #plan = dict_val[2] - Cdata = cuTENSOR.contraction!( - α, - Adata, - Vector{Char}(ctainds), - id_op, - Bdata, - Vector{Char}(ctbinds), - id_op, - β, - Cdata, - Vector{Char}(ctcinds), - id_op, - id_op; - algo=algo, - ) - else - # loop through all algos - # pick the fastest one - # store that plan! 
- best_time = 1e6 - best_plan = nothing - best_algo = nothing - max_algos = Ref{Int32}(C_NULL) - cuTENSOR.cutensorContractionMaxAlgos(max_algos) - # fix once the other options are documented - #algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(max_algos[] - 1)) - algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(-1)) - for algo in reverse(algos) - try - Cdata, this_time, bytes, gctime, memallocs = @timed cuTENSOR.contraction!( - α, - Adata, - Vector{Char}(ctainds), - id_op, - Bdata, - Vector{Char}(ctbinds), - id_op, - β, - Cdata, - Vector{Char}(ctcinds), - id_op, - id_op; - algo=cuTENSOR.cutensorAlgo_t(algo), - ) - if this_time < best_time - best_time = this_time - #best_plan = this_plan - best_algo = cuTENSOR.cutensorAlgo_t(algo) - end - catch err - @warn "Algorithm $algo not supported" - end - end - ContractionPlans[dict_key] = best_algo - end - else - Cdata = cuTENSOR.contraction!( - α, - Adata, - Vector{Char}(ctainds), - id_op, - Bdata, - Vector{Char}(ctbinds), - id_op, - β, - Cdata, - Vector{Char}(ctcinds), - id_op, - id_op, - ) - end - return parent(Cdata) -end - -function Base.:+(B::CuDenseTensor, A::CuDenseTensor) - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - Ais = inds(A) - Bis = inds(B) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(inds(A)) - push!(ind_dict, i) - end - Adata = data(store(A)) - Bdata = data(store(B)) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)...) - cuTENSOR.elementwiseBinary!( - one(eltype(Adata)), - reshapeAdata, - ctainds, - opA, - one(eltype(Bdata)), - reshapeBdata, - ctbinds, - opC, - C, - ctcinds, - opAC, - ) - copyto!(data(store(B)), vec(C)) - return B -end - -function Base.:+(B::CuDense, Bis::IndexSet, A::CuDense, Ais::IndexSet) - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(A) - Bdata = data(B) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)...) - Cis = Bis - C = cuTENSOR.elementwiseBinary!( - 1, reshapeAdata, ctainds, opA, 1, reshapeBdata, ctbinds, opC, C, ctcinds, opAC - ) - copyto!(data(B), vec(C)) - return C -end - -function Base.:-(B::CuDenseTensor, A::CuDenseTensor) - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - Ais = inds(A) - Bis = inds(B) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(inds(A)) - push!(ind_dict, i) - end - Adata = data(store(A)) - Bdata = data(store(B)) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) 
- ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)) - cuTENSOR.elementwiseBinary!( - -one(eltype(Adata)), - reshapeAdata, - ctainds, - opA, - one(eltype(Bdata)), - reshapeBdata, - ctbinds, - opC, - C, - ctcinds, - opAC, - ) - copyto!(data(store(B)), vec(C)) - return B -end - -function Base.:-(A::CuDense, Ais::IndexSet, B::CuDense, Bis::IndexSet) - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(A) - Bdata = data(B) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)...) - Cis = Bis - C = cuTENSOR.elementwiseBinary!( - one(eltype(Adata)), - reshapeAdata, - ctainds, - opA, - -one(eltype(Bdata)), - reshapeBdata, - ctbinds, - opC, - C, - ctcinds, - opAC, - ) - copyto!(data(B), vec(C)) - return C -end - -function Base.permute!(B::CuDenseTensor, A::CuDenseTensor) - Ais = inds(A) - Bis = inds(B) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(store(A)) - Bdata = data(store(B)) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - if ndims(A) < 40 # use CUTENSOR - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - cuTENSOR.permutation!( - one(eltype(Adata)), - reshapeAdata, - Vector{Char}(ctainds), - reshapeBdata, - Vector{Char}(ctbinds), - ) - else # use GPUArrays - perm = Int[] - for aix in Ais - b_pos = findfirst(bix -> bix == aix, Bis) - push!(perm, b_pos) - end - @assert isperm(perm) - permutedims!(reshapeBdata, reshapeAdata, invperm(perm)) - end - return Tensor(Dense(vec(reshapeBdata)), inds(B)) -end - -function Base.permute!(B::CuDense, Bis::IndexSet, A::CuDense, Ais::IndexSet) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(A) - Bdata = data(B) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) 
- ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - - cuTENSOR.permutation!( - one(eltype(Adata)), - reshapeAdata, - Vector{Char}(ctainds), - reshapeBdata, - Vector{Char}(ctbinds), - ) - return Tensor(Dense(reshapeBdata), Tuple(Bis)) -end - -Base.:/(A::CuDenseTensor, x::Number) = A * inv(x) diff --git a/src/tensor/cudiag.jl b/src/tensor/cudiag.jl deleted file mode 100644 index 0e27beb..0000000 --- a/src/tensor/cudiag.jl +++ /dev/null @@ -1,228 +0,0 @@ -export CuDiag - -const CuDiag{ElT,VecT} = Diag{ElT,VecT} where {VecT<:CuArray{ElT}} -const NonuniformCuDiagTensor{ElT,N,StoreT,IndsT} = - Tensor{ElT,N,StoreT,IndsT} where {StoreT<:CuDiag} - -CuArray(D::NonuniformCuDiagTensor) = CuArray(dense(D)) - -function NDTensors.dense(T::NonuniformCuDiagTensor{ElT}) where {ElT} - R_data = CUDA.zeros(ElT, dim(inds(T))) - diag_inds = diagind(reshape(R_data, dims(inds(T))), 0) - R_data[diag_inds] .= data(store(T)) - return Tensor(Dense(R_data), inds(T)) -end - -function Base.promote_rule( - ::Type{Diag{ElT2,VecT2}}, ::Type{CuDense} -) where {ElT2,VecT2<:Number} - return promote_type(DenseT1, ElT2) -end - -function Base.promote_rule( - ::Type{<:Tensor{ElT1,N1,StoreT1}}, ::Type{<:Tensor{ElT2,N2,StoreT2}} -) where {ElT1,ElT2,N1,N2,StoreT1<:CuDense,StoreT2<:UniformDiag} - ElT3 = promote_type(ElT1, ElT2) - return Tensor{promote_type(ElT1, ElT2),N3,CuDense{ElT3,CuVector{ElT3}}} where {N3} -end -function Base.promote_rule( - ::Type{<:Tensor{ElT1,N1,StoreT1}}, ::Type{<:Tensor{ElT2,N2,StoreT2}} -) where {ElT1,ElT2,N1,N2,StoreT1<:UniformDiag,StoreT2<:CuDense} - ElT3 = promote_type(ElT1, ElT2) - return Tensor{promote_type(ElT1, ElT2),N3,CuDense{ElT3,CuVector{ElT3}}} where {N3} -end - -function Base.promote_rule( - ::Type{<:UniformDiag{ElT1}}, ::Type{<:NonuniformDiag{ElT2,VecT2}} -) where {ElT1,ElT2,VecT2<:CuArray} - ElT3 = promote_type(ElT1, ElT2) - return NonuniformDiag{ElT3,CuVector{ElT3}} -end -function Base.promote_rule( - ::Type{<:NonuniformDiag{ElT2,VecT2}}, ::Type{<:UniformDiag{ElT1}} -) where {ElT1,ElT2,VecT2<:CuArray} - ElT3 = promote_type(ElT1, ElT2) - return NonuniformDiag{ElT3,CuVector{ElT3}} -end -function Base.promote_rule( - ::Type{DenseT1}, ::Type{Diag{ElT2,VecT2}} -) where {DenseT1<:CuDense,ElT2,VecT2<:Number} - return promote_type(DenseT1, ElT2) -end - -function contraction_output_type( - TensorT1::Type{<:NonuniformCuDiagTensor}, TensorT2::Type{<:CuDenseTensor}, IndsR -) - return similartype(promote_type(TensorT1, TensorT2), IndsR) -end -function contraction_output_type( - TensorT1::Type{<:CuDenseTensor}, TensorT2::Type{<:NonuniformCuDiagTensor}, IndsR -) - return contraction_output_type(TensorT2, TensorT1, IndsR) -end - -function contraction_output_type( - TensorT1::Type{<:DiagTensor{<:Number,<:CuDiag}}, - TensorT2::Type{<:DiagTensor{<:Number,<:CuDiag}}, - IndsR::Type, -) - return similartype(promote_type(TensorT1, TensorT2), IndsR) -end - -function contraction_output_type( - TensorT1::Type{<:UniformDiagTensor}, - TensorT2::Type{<:DiagTensor{<:Number,<:CuDiag}}, - IndsR::Type, -) - return similartype(promote_type(TensorT1, TensorT2), IndsR) -end -function contraction_output_type( - TensorT1::Type{<:DiagTensor{<:Number,<:CuDiag}}, - TensorT2::Type{<:UniformDiagTensor}, - IndsR::Type, -) - return contraction_output_type(TensorT2, TensorT1, IndsR) -end - -function zero_contraction_output( - 
T1::UniformDiagTensor{ElT1,N1}, T2::CuDenseTensor{ElT2,N2}, indsR::IndsR -) where {ElT1,N1,ElT2,N2,IndsR} - ElT3 = promote_type(ElT1, ElT2) - dat = CUDA.zeros(ElT3, dim(indsR)) - return Tensor(Dense(dat), indsR) -end -function zero_contraction_output( - T2::CuDenseTensor{ElT2,N2}, T1::UniformDiagTensor{ElT1,N1}, indsR::IndsR -) where {ElT1,N1,ElT2,N2,IndsR} - return zero_contraction_output(T1, T2, indsR) -end - -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT1<:NonuniformDiagTensor,TensorT2<:NonuniformCuDiagTensor} - ElT3 = promote_type(eltype(TensorT1), eltype(TensorT2)) - dat = CUDA.zeros(ElT3, length(data(store(T2)))) - return Tensor(Diag(dat), indsR) -end - -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT1<:UniformDiagTensor,TensorT2<:NonuniformCuDiagTensor} - ElT3 = promote_type(eltype(TensorT1), eltype(TensorT2)) - dat = CUDA.zeros(ElT3, length(data(store(T2)))) - return Tensor(Diag(dat), indsR) -end -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT2<:UniformDiagTensor,TensorT1<:NonuniformCuDiagTensor} - return zero_contraction_output(T2, T1, indsR) -end - -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT1<:DiagTensor,TensorT2<:CuDenseTensor} - ElT3 = promote_type(eltype(TensorT1), eltype(TensorT2)) - dat = CUDA.zeros(ElT3, length(data(store(T2)))) - return Tensor(Dense(dat), indsR) -end -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT2<:DiagTensor,TensorT1<:CuDenseTensor} - return zero_contraction_output(T2, T1, indsR) -end - -function contraction_output( - T1::UniformDiagTensor, T2::DiagTensor{Elt2,<:CuDiag}, indsR -) where {Elt2} - return zero_contraction_output(T1, T2, indsR) -end -function contraction_output( - T1::DiagTensor{Elt1,<:CuDiag}, T2::UniformDiagTensor, indsR -) where {Elt1} - return contraction_output(T2, T1, indsR) -end - -function contract!( - C::CuDenseTensor{<:Number,NC}, - Clabels, - A::UniformDiagTensor{<:Number,NA}, - Alabels, - B::CuDenseTensor{<:Number,NB}, - Blabels, -) where {NC,NA,NB} - Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -function contract!( - C::CuDenseTensor{<:Number,NC}, - Clabels, - A::CuDenseTensor{<:Number,NA}, - Alabels, - B::UniformDiagTensor{<:Number,NB}, - Blabels, -) where {NC,NA,NB} - return contract!(C, Clabels, B, Blabels, A, Alabels) -end - -function contract!( - C::NonuniformCuDiagTensor{EltC,NC,IndsC}, - Clabels, - A::UniformDiagTensor{EltA,NA,IndsA}, - Alabels, - B::NonuniformCuDiagTensor{EltB,NB,IndsB}, - Blabels, -) where {EltC<:Number,EltB<:Number,EltA<:Number,NC,NB,NA,IndsA,IndsB,IndsC} - Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -function contract!( - C::NonuniformCuDiagTensor{EltC,NC,IndsC}, - Clabels, - B::NonuniformCuDiagTensor{EltB,NB,IndsB}, - Blabels, - A::UniformDiagTensor{EltA,NA,IndsA}, - Alabels, -) where {EltC<:Number,EltB<:Number,EltA<:Number,NC,NB,NA,IndsA,IndsB,IndsC} - Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -function contract!( - C::NonuniformCuDiagTensor{EltC,NC,IndsC}, - Clabels, - B::NonuniformCuDiagTensor{EltB,NB,IndsB}, - Blabels, - A::NonuniformCuDiagTensor{EltA,NA,IndsA}, - Alabels, -) where {EltC<:Number,EltB<:Number,EltA<:Number,NC,NB,NA,IndsA,IndsB,IndsC} - 
Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -# Dense * NonuniformCuDiag -function contract!( - C::CuDenseTensor, Clabels, A::NonuniformCuDiagTensor, Alabels, B::CuDenseTensor, Blabels -) - Astore = data(store(A)) - newAstore = CUDA.zeros(eltype(A), dims(inds(A))[1], dims(inds(A))[2]) - adi = diagind(newAstore, 0) - newAstore[adi] .= Astore[:] - newA = Tensor(Dense(vec(newAstore)), inds(A)) - return contract!(C, Clabels, newA, Alabels, B, Blabels) -end - -function contract!( - C::CuDenseTensor, Clabels, A::CuDenseTensor, Alabels, B::NonuniformCuDiagTensor, Blabels -) - return contract!(C, Clabels, B, Blabels, A, Alabels) -end diff --git a/src/tensor/culinearalgebra.jl b/src/tensor/culinearalgebra.jl deleted file mode 100644 index c64a3b0..0000000 --- a/src/tensor/culinearalgebra.jl +++ /dev/null @@ -1,159 +0,0 @@ - -# -# Linear Algebra of order 2 Tensors -# -# Even though CuDenseTensor{_,2} is strided -# and passable to BLAS/LAPACK, it cannot -# be made <: StridedArray - -function Base.:*( - T1::Tensor{ElT1,2,StoreT1,IndsT1}, T2::Tensor{ElT2,2,StoreT2,IndsT2} -) where {ElT1,StoreT1<:CuDense,IndsT1,ElT2,StoreT2<:CuDense,IndsT2} - RM = matrix(T1) * matrix(T2) - indsR = IndsT1(ind(T1, 1), ind(T2, 2)) - pT = promote_type(ElT1, ElT2) - return tensor(Dense(vec(RM)), indsR) -end -#= FIX ME -function LinearAlgebra.exp(T::CuDenseTensor{ElT,2}) where {ElT,IndsT} - expTM = exp(matrix(T)) - return tensor(Dense(vec(expTM)),inds(T)) -end - -function expHermitian(T::CuDenseTensor{ElT,2}) where {ElT,IndsT} - # exp(::Hermitian/Symmetric) returns Hermitian/Symmetric, - # so extract the parent matrix - expTM = parent(exp(Hermitian(matrix(T)))) - return tensor(Dense(vec(expTM)),inds(T)) -end -=# - -# svd of an order-2 tensor -function LinearAlgebra.svd(T::CuDenseTensor{ElT,2,IndsT}; kwargs...) where {ElT,IndsT} - maxdim::Int = get(kwargs, :maxdim, minimum(dims(T))) - mindim::Int = get(kwargs, :mindim, 1) - cutoff::Float64 = get(kwargs, :cutoff, 0.0) - absoluteCutoff::Bool = get(kwargs, :absoluteCutoff, false) - doRelCutoff::Bool = get(kwargs, :doRelCutoff, true) - fastSVD::Bool = get(kwargs, :fastSVD, false) - # Safer to use `Array`, which ensures - # no views/aliases are made, since - # we are using in-place `CUSOLVER.svd!` below. - aT = Array(T) - @timeit "CUSOLVER svd" begin - MU, MS, MV = CUSOLVER.svd!(aT) - end - if !(MV isa CuMatrix) - # Materialize any array wrappers, - # for now, since `Adjoint` wrappers - # seem to cause issues throughout - # CUDA.jl, for example with slicing, - # reshaping and then copying, etc. - # TODO: Fix this in a more robust way. 
- MV = copy(MV) - end - # for consistency with cpu version, - # ITensors.jl/NDTensors/src/linearalgebra.jl/svd - # need conj!(MV) - conj!(MV) - P = MS .^ 2 - truncerr, docut, P = truncate!( - P; - mindim=mindim, - maxdim=maxdim, - cutoff=cutoff, - absoluteCutoff=absoluteCutoff, - doRelCutoff=doRelCutoff, - ) - spec = Spectrum(P, truncerr) - dS = length(P) - if dS < length(MS) - MU = MU[:, 1:dS] - MS = MS[1:dS] - MV = MV[:, 1:dS] - end - - # Make the new indices to go onto U and V - u = eltype(IndsT)(dS) - v = eltype(IndsT)(dS) - Uinds = IndsT((ind(T, 1), u)) - Sinds = IndsT((u, v)) - Vinds = IndsT((ind(T, 2), v)) - U = tensor(Dense(vec(MU)), Uinds) - Sdata = CUDA.zeros(ElT, dS * dS) - dsi = diagind(reshape(Sdata, dS, dS), 0) - Sdata[dsi] = MS - S = tensor(Dense(Sdata), Sinds) - V = tensor(Dense(vec(MV)), Vinds) - return U, S, V, spec -end - -function LinearAlgebra.eigen( - T::Hermitian{ElT,<:CuDenseTensor{ElT,2,IndsT}}; kwargs... -) where {ElT<:Union{Real,Complex},IndsT} - ispossemidef::Bool = get(kwargs, :ispossemidef, false) - maxdim::Int = get(kwargs, :maxdim, minimum(dims(T))) - mindim::Int = get(kwargs, :mindim, 1) - cutoff::Float64 = get(kwargs, :cutoff, 0.0) - absoluteCutoff::Bool = get(kwargs, :absoluteCutoff, false) - doRelCutoff::Bool = get(kwargs, :doRelCutoff, true) - @timeit "CUSOLVER eigen" begin - local DM, UM - if ElT <: Complex - DM, UM = CUSOLVER.heevd!('V', 'U', matrix(parent(T))) - else - DM, UM = CUSOLVER.syevd!('V', 'U', matrix(parent(T))) - end - end - DM_ = reverse(DM) - @timeit "truncate" begin - truncerr, docut, DM = truncate!( - DM_; - maxdim=maxdim, - cutoff=cutoff, - absoluteCutoff=absoluteCutoff, - doRelCutoff=doRelCutoff, - ) - end - spec = Spectrum(DM, truncerr) - dD = length(DM) - dV = reverse(UM; dims=2) - if dD < size(dV, 2) - dV = CuMatrix(dV[:, 1:dD]) - end - # Make the new indices to go onto U and V - l = eltype(IndsT)(dD) - r = eltype(IndsT)(dD) - Vinds = IndsT((dag(ind(T, 2)), dag(r))) - Dinds = IndsT((l, dag(r))) - U = tensor(Dense(vec(dV)), Vinds) - D = tensor(Diag(real.(DM)), Dinds) - return D, U, spec -end - -function LinearAlgebra.qr(T::CuDenseTensor{ElT,2,IndsT}; kwargs...) where {ElT,IndsT} - QM, RM = qr(matrix(T)) - # Make the new indices to go onto Q and R - q, r = inds(T) - q = dim(q) < dim(r) ? 
sim(q) : sim(r) - Qinds = IndsT((ind(T, 1), q)) - Rinds = IndsT((q, ind(T, 2))) - QM = CuMatrix(QM) - Q = tensor(Dense(vec(QM)), Qinds) - R = tensor(Dense(vec(RM)), Rinds) - return Q, R -end - -function polar(T::CuDenseTensor{ElT,2,IndsT}) where {ElT,IndsT} - QM, RM = polar(matrix(T)) - dim = size(QM, 2) - # Make the new indices to go onto Q and R - q = eltype(IndsT)(dim) - # TODO: use push/pushfirst instead of a constructor - # call here - Qinds = IndsT((ind(T, 1), q)) - Rinds = IndsT((q, ind(T, 2))) - Q = tensor(Dense(vec(QM)), Qinds) - R = tensor(Dense(vec(RM)), Rinds) - return Q, R -end diff --git a/src/tensor/cutruncate.jl b/src/tensor/cutruncate.jl deleted file mode 100644 index 108bc1e..0000000 --- a/src/tensor/cutruncate.jl +++ /dev/null @@ -1,87 +0,0 @@ -import LinearAlgebra: BlasReal - -function truncate!(P::CuVector{T}; kwargs...)::Tuple{T,T,CuVector{T}} where {T<:BlasReal} - maxdim::Int = min(get(kwargs, :maxdim, length(P)), length(P)) - mindim::Int = min(get(kwargs, :mindim, 1), maxdim) - cutoff::Float64 = get(kwargs, :cutoff, 0.0) - absoluteCutoff::Bool = get(kwargs, :absoluteCutoff, false) - doRelCutoff::Bool = get(kwargs, :doRelCutoff, true) - origm = length(P) - docut = zero(T) - - maxP = maximum(P) - if maxP == zero(T) - P = CUDA.zeros(T, 1) - return zero(T), zero(T), P - end - if origm == 1 - docut = maxP / 2 - return zero(T), docut, P[1:1] - end - @timeit "setup rP" begin - #Zero out any negative weight - #neg_z_f = (!signbit(x) ? x : 0.0) - rP = map(x -> !signbit(x) ? Float64(x) : 0.0, P) - n = origm - end - @timeit "handle cutoff" begin - if absoluteCutoff - #Test if individual prob. weights fall below cutoff - #rather than using *sum* of discarded weights - sub_arr = rP .- Float64(cutoff) - err_rP = sub_arr ./ abs.(sub_arr) - flags = reinterpret(Float64, (signbit.(err_rP) .<< 1 .& 2) .<< 61) - cut_ind = CUDA.CUBLAS.iamax(length(err_rP), err_rP .* flags) - 1 - if cut_ind > 0 - n = min(maxdim, cut_ind) - n = max(n, mindim) - else - n = maxdim - end - truncerr = T(sum(rP[(n + 1):end])) - else - truncerr = zero(T) - scale = one(T) - @timeit "find scale" begin - if doRelCutoff - scale = sum(P) - scale = scale > zero(T) ? scale : one(T) - end - end - #Truncating until *sum* of discarded probability - #weight reaches cutoff reached (or m==mindim) - csum_rp = Float64.(CUDA.reverse(CUDA.cumsum(CUDA.reverse(rP)))) - sub_arr = csum_rp .- Float64(cutoff * scale) - err_rP = sub_arr ./ abs.(sub_arr) - flags = reinterpret(Float64, (signbit.(err_rP) .<< 1 .& 2) .<< 61) - cut_ind = (CUDA.CUBLAS.iamax(length(err_rP), err_rP .* flags) - 1) - if cut_ind > 0 - n = min(maxdim, cut_ind) - n = max(n, mindim) - else - n = maxdim - end - truncerr = sum(rP[(n + 1):end]) - if scale == zero(T) - truncerr = zero(T) - else - truncerr /= scale - end - end - end - if n < 1 - n = 1 - end - if n < origm - hP = collect(P) - docut = (hP[n] + hP[n + 1]) / 2 - if abs(hP[n] - hP[n + 1]) < 1E-3 * hP[n] - docut += T(1E-3) * hP[n] - end - end - @timeit "setup return" begin - rinds = 1:n - rrP = P[rinds] - end - return truncerr, docut, rrP -end diff --git a/src/tensor/dense.jl b/src/tensor/dense.jl deleted file mode 100644 index 04e1cb8..0000000 --- a/src/tensor/dense.jl +++ /dev/null @@ -1,285 +0,0 @@ -function contract!!( - R::DenseTensor{<:Number,NR}, - labelsR::NTuple{NR}, - T1::DenseTensor{<:Number,N1}, - labelsT1::NTuple{N1}, - T2::DenseTensor{<:Number,N2}, - labelsT2::NTuple{N2}, - α::Number=1, - β::Number=0, -) where {NR,N1,N2} - if N1 == 0 - (α ≠ 1 || β ≠ 0) && - error("contract!! 
not yet implemented for scalar ITensor with non-trivial α and β") - # TODO: replace with an add! function? - # What about doing `R .= T1[] .* PermutedDimsArray(T2,perm)`? - perm = getperm(labelsR, labelsT2) - R = permutedims!!(R, T2, perm, (r, t2) -> T1[] * t2) - elseif N2 == 0 - (α ≠ 1 || β ≠ 0) && - error("contract!! not yet implemented for scalar ITensor with non-trivial α and β") - perm = getperm(labelsR, labelsT1) - R = permutedims!!(R, T1, perm, (r, t1) -> T2[] * t1) - elseif N1 + N2 == NR - (α ≠ 1 || β ≠ 0) && error( - "contract!! not yet implemented for outer product tensor contraction with non-trivial α and β", - ) - # TODO: permute T1 and T2 appropriately first (can be more efficient - # then permuting the result of T1⊗T2) - # TODO: implement the in-place version directly - R = outer!!(R, T1, T2) - labelsRp = (labelsT1..., labelsT2...) - perm = getperm(labelsR, labelsRp) - if !is_trivial_permutation(perm) - R = permutedims!!(R, copy(R), perm) - end - else - #if dim(T1) > 2^13 && dim(T2) > 2^13 - # R = _big_contract!!(R,labelsR,T1,labelsT1,T2,labelsT2, α, β) - #else - if α ≠ 1 || β ≠ 0 - R = _contract!!(R, labelsR, T1, labelsT1, T2, labelsT2, α, β) - else - R = _contract!!(R, labelsR, T1, labelsT1, T2, labelsT2) - end - #end - end - return R -end - -function _big_contract!!( - R::DenseTensor{<:Number,NR}, - labelsR, - T1::DenseTensor{ElT1,N1}, - labelsT1, - T2::DenseTensor{ElT2,N2}, - labelsT2, - α::Number=1, - β::Number=0, -) where {ElT1,ElT2,N1,N2,NR} - props = ContractionProperties(labelsT1, labelsT2, labelsR) - compute_contraction_properties!(props, T1, T2, R) - _big_contract!(R, T1, T2, props, α, β) - #val, t, _ = @timed _blasmg_contract!(R,T1,T2,props,α,β) - return R -end - -function _big_contract!( - CT::DenseTensor{El,NC}, - AT::DenseTensor{El,NA}, - BT::DenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - Ainds = inds(AT) - Adims = dims(Ainds) - Binds = inds(BT) - Bdims = dims(Binds) - Cinds = inds(CT) - Cdims = dims(Cinds) - Adata = reshape(data(store(AT)), Adims) - Bdata = reshape(data(store(BT)), Bdims) - Cdata = reshape(data(store(CT)), Cdims) - contracted = commoninds(Ainds, Binds) - A_only = uniqueinds(Ainds, Binds) - B_only = uniqueinds(Binds, Ainds) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(contracted) - push!(ind_dict, i) - end - if length(A_only) > 0 - for (idx, i) in enumerate(A_only) - push!(ind_dict, i) - end - end - if length(B_only) > 0 - for (idx, i) in enumerate(B_only) - push!(ind_dict, i) - end - end - ctainds = zeros(Int, length(Ainds)) - ctbinds = zeros(Int, length(Binds)) - ctcinds = zeros(Int, length(Cinds)) - for (ii, ia) in enumerate(Ainds) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Binds) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - for (ii, ic) in enumerate(Cinds) - ctcinds[ii] = findfirst(x -> x == ic, ind_dict) - end - id_op = cuTENSOR.CUTENSOR_OP_IDENTITY - dict_key = "" - for cc in zip(ctcinds, Cdims) - dict_key *= string(cc[1]) * "," * string(cc[2]) * "," - end - for aa in zip(ctainds, Adims) - dict_key *= string(aa[1]) * "," * string(aa[2]) * "," - end - for bb in zip(ctbinds, Bdims) - dict_key *= string(bb[1]) * "," * string(bb[2]) * "," - end - #=synchronize() - if haskey(ENV, "CUTENSOR_AUTOTUNE") && tryparse(Int, ENV["CUTENSOR_AUTOTUNE"]) == 1 - if haskey(ContractionPlans, dict_key) - dict_val = ContractionPlans[dict_key] - algo = dict_val - Cdata = cuTENSOR.contraction!(α, Adata, Vector{Char}(ctainds), 
id_op, Bdata, Vector{Char}(ctbinds), id_op, β, Cdata, Vector{Char}(ctcinds), id_op, id_op; algo=algo) - synchronize() - else - # loop through all algos - # pick the fastest one - # store that plan! - best_time = 1e6 - best_plan = nothing - best_algo = nothing - max_algos = Ref{Int32}(C_NULL) - cuTENSOR.cutensorContractionMaxAlgos(max_algos) - # fix once the other options are documented - #algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(max_algos[] - 1)) - algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(-1)) - for algo in reverse(algos) - try - Cdata, this_time, bytes, gctime, memallocs = @timed cuTENSOR.contraction!(α, Adata, Vector{Char}(ctainds), id_op, Bdata, Vector{Char}(ctbinds), id_op, β, Cdata, Vector{Char}(ctcinds), id_op, id_op; algo=cuTENSOR.cutensorAlgo_t(algo)) - synchronize() - if this_time < best_time - best_time = this_time - best_algo = cuTENSOR.cutensorAlgo_t(algo) - end - catch err - @warn "Algorithm $algo not supported" - end - end - ContractionPlans[dict_key] = best_algo - end - else - =# - Cdata .= zero(eltype(Cdata)) - #@show size(Adata) - #@show size(Bdata) - #@show size(Cdata) - @assert !any(isnan.(Adata)) - @assert !any(isnan.(Bdata)) - @assert !any(isnan.(Cdata)) - #@show ctainds - #@show ctbinds - #@show ctcinds - #flush(stdout) - CUDA.Mem.pin(Adata) - CUDA.Mem.pin(Bdata) - CUDA.Mem.pin(Cdata) - synchronize() - #AC = CuArray(Adata) - #BC = CuArray(Bdata) - #CC = CuArray(Cdata) - @assert !any(isnan.(Adata)) - @assert !any(isnan.(Bdata)) - @assert !any(isnan.(Cdata)) - #@assert !any(isnan.(AC)) - #@assert !any(isnan.(BC)) - #@assert !any(isnan.(CC)) - #CC = cuTENSOR.contraction!(α, AC, ctainds, id_op, BC, ctbinds, id_op, β, CC, ctcinds, id_op, id_op) - #synchronize() - #@assert !any(isnan.(AC)) - #@assert !any(isnan.(BC)) - #@assert !any(isnan.(CC)) - Cdata = cuTENSOR.contraction!( - α, Adata, ctainds, id_op, Bdata, ctbinds, id_op, β, Cdata, ctcinds, id_op, id_op - ) - synchronize() - #end - #CCh = collect(CC) - #@assert !any(isnan.(CCh)) - #Cdata .= CCh - @assert !any(isnan.(Adata)) - @assert !any(isnan.(Bdata)) - @assert !any(isnan.(Cdata)) - return parent(Cdata) -end - -function _blasmg_contract!( - CT::DenseTensor{El,NC}, - AT::DenseTensor{El,NA}, - BT::DenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - # TODO: directly use Tensor instead of Array - C = array(CT) - A = array(AT) - B = array(BT) - - tA = 'N' - if props.permuteA - pA = NTuple{NA,Int}(props.PA) - @strided Ap = permutedims(A, pA) - AM = reshape(Ap, props.dmid, props.dleft) - tA = 'T' - else - #A doesn't have to be permuted - if Atrans(props) - AM = reshape(A, props.dmid, props.dleft) - tA = 'T' - else - AM = reshape(A, props.dleft, props.dmid) - end - end - - tB = 'N' - if props.permuteB - pB = NTuple{NB,Int}(props.PB) - @strided Bp = permutedims(B, pB) - BM = reshape(Bp, props.dmid, props.dright) - else - if Btrans(props) - BM = reshape(B, props.dright, props.dmid) - tB = 'T' - else - BM = reshape(B, props.dmid, props.dright) - end - end - - # TODO: this logic may be wrong - if props.permuteC - # Need to copy here since we will be permuting - # into C later - CM = reshape(copy(C), props.dleft, props.dright) - else - if Ctrans(props) - CM = reshape(C, props.dleft, props.dright) - (AM, BM) = (BM, AM) - if tA == tB - tA = tB = (tA == 'T' ? 
'N' : 'T') - end - else - CM = reshape(C, props.dleft, props.dright) - end - end - - if length(AM) > 4096 && length(BM) > 4096 && length(CM) > 4096 - CM = CUBLASMG.mg_gemm!( - tA, - tB, - El(α), - AM, - BM, - El(β), - CM; - devs=devs[], - dev_rows=dev_rows[], - dev_cols=dev_cols[], - ) - else - BLAS.gemm!(tA, tB, El(α), AM, BM, El(β), CM) - end - - if props.permuteC - pC = NTuple{NC,Int}(props.PC) - Cr = reshape(CM, props.newCrange...) - @strided C .= permutedims(Cr, pC) - end - return C -end diff --git a/src/traits.jl b/src/traits.jl deleted file mode 100644 index 6589298..0000000 --- a/src/traits.jl +++ /dev/null @@ -1,51 +0,0 @@ -# Trait type indicating the object is either on CPU -# or on a CUDA device (for example a type that doesn't -# have any data, like a Combiner or uniform Diagonal -# tensor). -struct CPUorCUDA end - -is_cu(::Type{<:Number}) = CPUorCUDA() -is_cu(::Type{NDTensors.NoData}) = CPUorCUDA() -is_cu(::Type{<:Array}) = false -is_cu(::Type{<:CuArray}) = true - -# Handle Array wrappers like `ReshapedArray`. -@traitfn function is_cu(arraytype::Type{T}) where {T; IsWrappedArray{T}} - return is_cu(parenttype(arraytype)) -end - -is_cu(X::Type{<:TensorStorage}) = is_cu(NDTensors.datatype(X)) -is_cu(X::Type{<:Tensor}) = is_cu(NDTensors.storagetype(X)) -is_cu(::Type{ITensor}) = error("Unknown") - -is_cu(x::CuArray) = is_cu(typeof(x)) -is_cu(x::Array) = is_cu(typeof(x)) - -is_cu(x::TensorStorage) = is_cu(typeof(x)) -is_cu(x::Tensor) = is_cu(typeof(x)) -is_cu(x::ITensor) = is_cu(typeof(tensor(x))) -is_cu(x::MPS) = all(is_cu, x) -is_cu(x::MPO) = all(is_cu, x) - -mixed_cu_cpu(::Bool, ::CPUorCUDA) = false -mixed_cu_cpu(::CPUorCUDA, ::Bool) = false -mixed_cu_cpu(::CPUorCUDA, ::CPUorCUDA) = false -mixed_cu_cpu(is_cu1::Bool, is_cu2::Bool) = (is_cu1 ⊻ is_cu2) -mixed_cu_cpu(T1::Type, T2::Type) = mixed_cu_cpu(is_cu(T1), is_cu(T2)) - -@traitdef MixedCuCPU{T1,T2} - -#! format: off -@traitimpl MixedCuCPU{T1,T2} <- mixed_cu_cpu(T1, T2) -#! format: on - -@traitfn function can_contract( - ::Type{T1}, ::Type{T2} -) where {T1<:TensorStorage,T2<:TensorStorage;!MixedCuCPU{T1,T2}} - return true -end -@traitfn function can_contract( - ::Type{T1}, ::Type{T2} -) where {T1<:TensorStorage,T2<:TensorStorage;MixedCuCPU{T1,T2}} - return false -end
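
Note on the rewritten `src/ITensorGPU.jl`: every constructor that survives the deprecation is now a thin wrapper that builds the corresponding ITensors.jl object and moves it to the GPU with `cu`. A minimal usage sketch of those wrappers (not part of the diff; it assumes a working CUDA.jl installation and an ITensors.jl version covered by the new `[compat]` entry, and the index/site choices are arbitrary):

```julia
using ITensors
using ITensorGPU  # now only re-exports cpu/cu plus the wrappers below

i, j = Index(2, "i"), Index(2, "j")
A = randomCuITensor(i, j)   # equivalent to cu(randomITensor(i, j))
B = randomCuITensor(i, j)
C = A + B                   # storage stays on the GPU

sites = siteinds("S=1/2", 10)
psi = randomCuMPS(sites)    # equivalent to cu(randomMPS(sites))
H = cuMPO(sites, "Id")      # equivalent to cu(MPO(sites, "Id"))
```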
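
For new code, the README warning points at ITensors.jl's own GPU support instead; the same workflow without ITensorGPU looks roughly like the sketch below (assuming, per the linked "Running on GPUs" page, that loading CUDA.jl enables `cu`/`cpu` conversion of ITensor objects; names are again arbitrary):

```julia
using ITensors
using CUDA  # loading CUDA.jl provides the GPU storage backend

i, j, k = Index(2, "i"), Index(2, "j"), Index(3, "k")
A = cu(randomITensor(i, j))   # replaces randomCuITensor(i, j)
B = cu(randomITensor(j, k))
C = A * B                     # contraction runs on the GPU

sites = siteinds("S=1/2", 10)
psi = cu(randomMPS(sites))    # replaces randomCuMPS(sites)
psi_cpu = ITensors.cpu(psi)   # move back to host memory when needed
```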