diff --git a/Project.toml b/Project.toml index 762e51e..68e6eab 100644 --- a/Project.toml +++ b/Project.toml @@ -1,33 +1,15 @@ name = "ITensorGPU" uuid = "d89171c1-af8f-46b3-badf-d2a472317c15" authors = ["Katharine Hyatt", "Matthew Fishman "] -version = "0.1.7" +version = "0.2.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" -Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" ITensors = "9136182c-28ba-11e9-034c-db9fb085ebd5" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -NDTensors = "23ae76d9-e61a-49c4-8f12-3f1a16adf9cf" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SimpleTraits = "699a6c99-e7fa-54fc-8d76-47d257e15c1d" -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67" -TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [compat] -Adapt = "3.5, 4" -CUDA = "4.0" -Combinatorics = "1.0.2" -Functors = "0.2, 0.3, 0.4" -ITensors = "= 0.3.37" -NDTensors = "0.1.50" -SimpleTraits = "0.9.4" -StaticArrays = "1.2.13" -Strided = "1.1.2, 2" -TimerOutputs = "0.5.13" -cuTENSOR = "1.1.0" -julia = "1.6 - 1.9" +Adapt = "3, 4" +CUDA = "4, 5" +ITensors = "0.3, 0.4, 0.5" +julia = "1.6" diff --git a/README.md b/README.md index 40238be..964cdc9 100644 --- a/README.md +++ b/README.md @@ -1,50 +1,5 @@ -# ITensorGPU: Intelligent Tensors with GPU acceleration - -[![codecov](https://codecov.io/gh/ITensor/ITensorGPU.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/ITensor/ITensorGPU.jl) - -[![gitlab-ci](https://gitlab.com/JuliaGPU/ITensorGPU-jl/badges/master/pipeline.svg)](https://gitlab.com/JuliaGPU/ITensorGPU-jl/commits/master) - -This package extends the functionality of [ITensors.jl](https://github.com/ITensor/ITensors.jl) to make use of CUDA-enabled GPUs to accelerate tensor contractions and factorizations. It sits on top of the wonderful [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) package and uses NVIDIA's [cuTENSOR](https://developer.nvidia.com/cutensor) library for high-performance tensor operations. - -## Installing ITensorGPU.jl - -Dependencies: - - [Julia 1.3 or later](https://julialang.org/downloads/) - - [CUDA 10.1 or later](https://developer.nvidia.com/cuda-downloads) -- Currently only NVIDIA GPUs are supported. NVIDIA drivers are required so that Julia can make use of the NVIDIA GPU on your system. - - [cuTENSOR v1.0.0 or later](https://developer.nvidia.com/cutensor) -- A specialized library for perfoming permutation-free tensor contractions on the GPU. `libcutensor.so` needs to be in your `LD_LIBRARY_PATH` so that `CUDA.jl` will be able to find it. - - [ITensors.jl](https://itensor.github.io/ITensors.jl/stable/#Installation-1) - -To properly install CUDA with Julia, it may be helpful to first follow the [CUDA.jl installation instructions](https://juliagpu.github.io/CUDA.jl/stable/installation/overview/) and test that you have that installed properly and that it is able to use `cuTENSOR`. You can run the commands: -```julia -julia> using CUDA.CUTENSOR - -julia> CUTENSOR.has_cutensor() -true - -julia> CUTENSOR.version() -v"1.2.1" -``` -to check that `CUDA.jl` can see the version of `cuTENSOR` you have installed. 
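The deleted README check above (`using CUDA.CUTENSOR`) predates CUDA.jl 4/5, which the widened compat bounds in `Project.toml` now allow; there, cuTENSOR support lives in the separate cuTENSOR.jl package rather than inside CUDA.jl. A rough modern equivalent of that sanity check, as a sketch only (function names may differ between versions):

```julia
# Sketch of the setup check under CUDA.jl 4/5, where cuTENSOR is its own package.
using CUDA
using cuTENSOR          # loading succeeds only if the cuTENSOR artifact is available

CUDA.functional()       # true when a usable NVIDIA GPU and driver are present
cuTENSOR.version()      # reported cuTENSOR library version (assumed API)
```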
- -Once you have all of the dependencies installed, you can then go ahead and install `ITensorGPU.jl` with the following command: -``` -julia> ] - -pkg> add ITensorGPU -``` - -To check if this has all worked, you can run the package tests using: -```julia -julia> ] - -pkg> test ITensorGPU -``` - -## Examples - -Take a look at the `examples/` directory for examples of running ITensor calculations on the GPU. - -For an application of `ITensorGPU.jl` to more sophisticated tensor network calculations, take a look at [PEPS.jl](https://github.com/ITensor/PEPS.jl). - -For some background on the development and design of this package, you can take a look at [this blog post](https://kshyatt.github.io/post/itensorsgpu/) by Katie Hyatt, original author of the `ITensorGPU.jl` package. +| :warning: WARNING | +|:---------------------------| +| The ITensorGPU.jl package is deprecated and only provided for backwards compatibility. For an alternative, see the ITensors.jl documentation about [running ITensor on GPUs](https://itensor.github.io/ITensors.jl/dev/RunningOnGPUs.html). | +TEST diff --git a/src/ITensorGPU.jl b/src/ITensorGPU.jl index 93be4c9..6c5a6fe 100644 --- a/src/ITensorGPU.jl +++ b/src/ITensorGPU.jl @@ -1,114 +1,38 @@ module ITensorGPU -using NDTensors - -using Adapt -using CUDA -using CUDA.CUBLAS -using CUDA.CUSOLVER -using Functors -using ITensors -using LinearAlgebra -using Random -using SimpleTraits -using StaticArrays -using Strided -using TimerOutputs -using cuTENSOR - -using NDTensors: setdata, setstorage, cpu, IsWrappedArray, parenttype - -import Adapt: adapt_structure -import Base: *, permutedims! -import CUDA: CuArray, CuMatrix, CuVector, cu -import CUDA.Mem: pin -import ITensors: - randn!, - compute_contraction_labels, - eigen, - tensor, - scale!, - unioninds, - array, - matrix, - vector, - polar, - tensors, - truncate!, - leftlim, - rightlim, - permute, - BroadcastStyle, - Indices -import NDTensors: - Atrans, - Btrans, - CombinerTensor, - ContractionProperties, - Combiner, - Ctrans, - Diag, - DiagTensor, - Dense, - DenseTensor, - NonuniformDiag, - NonuniformDiagTensor, - Tensor, - UniformDiag, - UniformDiagTensor, - _contract!!, - _contract!, - _contract_scalar!, - _contract_scalar_noperm!, - can_contract, - compute_contraction_properties!, - contract!!, - contract!, - contract, - contraction_output, - contraction_output_type, - data, - getperm, - ind, - is_trivial_permutation, - outer!, - outer!!, - permutedims!!, - set_eltype, - set_ndims, - similartype, - zero_contraction_output -import cuTENSOR: cutensorContractionPlan_t, cutensorAlgo_t - -#const ContractionPlans = Dict{String, Tuple{cutensorAlgo_t, cutensorContractionPlan_t}}() -const ContractionPlans = Dict{String,cutensorAlgo_t}() - -include("cuarray/set_types.jl") -include("traits.jl") -include("adapt.jl") -include("tensor/cudense.jl") -include("tensor/dense.jl") -include("tensor/culinearalgebra.jl") -include("tensor/cutruncate.jl") -include("tensor/cucombiner.jl") -include("tensor/cudiag.jl") -include("cuitensor.jl") -include("mps/cumps.jl") - -export cu, - cpu, cuITensor, randomCuITensor, cuMPS, randomCuMPS, productCuMPS, randomCuMPO, cuMPO - -## TODO: Is this needed? 
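The rewritten `src/ITensorGPU.jl` above keeps only thin `adapt`-based wrappers around the standard ITensors.jl constructors, matching the `cu`/`cpu` route the new README warning points to. A minimal usage sketch, not part of the diff; the index and site names are made up and a CUDA-capable system is assumed:

```julia
using CUDA, ITensors, ITensorGPU

i, j = Index(2), Index(3)
A = randomCuITensor(i, j)   # randomITensor(i, j) adapted to CuArray storage
B = cpu(A)                  # moved back to CPU storage

s = siteinds("S=1/2", 8)
ψ = randomCuMPS(s)          # randomMPS(s) with every tensor adapted to the GPU
H = cuMPO(MPO(s, "Id"))     # an existing CPU MPO moved to the GPU
```

With this design the wrappers no longer carry GPU-specific contraction or factorization code of their own (all of that is deleted below); those operations are left to the storage types the adapted ITensors carry.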
-## const devs = Ref{Vector{CUDAdrv.CuDevice}}() -## const dev_rows = Ref{Int}(0) -## const dev_cols = Ref{Int}(0) -## function __init__() -## voltas = filter(dev->occursin("V100", CUDAdrv.name(dev)), collect(CUDAdrv.devices())) -## pascals = filter(dev->occursin("P100", CUDAdrv.name(dev)), collect(CUDAdrv.devices())) -## devs[] = voltas[1:1] -## #devs[] = pascals[1:2] -## CUBLASMG.cublasMgDeviceSelect(CUBLASMG.mg_handle(), length(devs[]), devs[]) -## dev_rows[] = 1 -## dev_cols[] = 1 -## end - -end #module +using Adapt: adapt +using CUDA: CUDA, CuArray, cu +export cu +using ITensors: cpu +export cpu + +using ITensors: ITensor, cpu, cu, randomITensor +function cuITensor(args...; kwargs...) + return adapt(CuArray, ITensor(args...; kwargs...)) +end +function randomCuITensor(args...; kwargs...) + return adapt(CuArray, randomITensor(args...; kwargs...)) +end +export cuITensor, randomCuITensor + +# TODO: Change over to `using ITensorMPS` +# once it is registered. +using ITensors.ITensorMPS: MPO, MPS, randomMPO, randomMPS +function cuMPS(args...; kwargs...) + return adapt(CuArray, MPS(args...; kwargs...)) +end +cuMPS(tn::MPS) = adapt(CuArray, tn) +function productCuMPS(args...; kwargs...) + return adapt(CuArray, MPS(args...; kwargs...)) +end +function randomCuMPS(args...; kwargs...) + return adapt(CuArray, randomMPS(args...; kwargs...)) +end +function cuMPO(args...; kwargs...) + return adapt(CuArray, MPO(args...; kwargs...)) +end +cuMPO(tn::MPO) = adapt(CuArray, tn) +function randomCuMPO(args...; kwargs...) + return adapt(CuArray, randomMPO(args...; kwargs...)) +end +export cuMPO, cuMPS, productCuMPS, randomCuMPO, randomCuMPS +end diff --git a/src/adapt.jl b/src/adapt.jl deleted file mode 100644 index fe40c14..0000000 --- a/src/adapt.jl +++ /dev/null @@ -1,33 +0,0 @@ -# -# Used to adapt `EmptyStorage` types -# - -function NDTensors.cu(eltype::Type{<:Number}, x) - return fmap(x -> adapt(CuVector{eltype,default_buffertype()}, x), x) -end -NDTensors.cu(x) = fmap(x -> adapt(CuArray, x), x) - -function NDTensors.set_eltype_if_unspecified( - arraytype::Type{CuVector{T}}, eltype::Type -) where {T} - return arraytype -end -function NDTensors.set_eltype_if_unspecified(arraytype::Type{CuVector}, eltype::Type) - return CuVector{eltype} -end - -# Overload `CUDA.cu` for convenience -const ITensorType = Union{ - TensorStorage,Tensor,ITensor,Array{ITensor},Array{<:Array{ITensor}},MPS,MPO -} -CUDA.cu(x::ITensorType) = NDTensors.cu(x) -CUDA.cu(eltype::Type{<:Number}, x::ITensorType) = NDTensors.cu(eltype, x) - -function NDTensors.adapt_storagetype( - to::Type{<:CUDA.CuArray}, x::Type{<:NDTensors.EmptyStorage} -) - store = NDTensors.storagetype(x) - return NDTensors.emptytype( - NDTensors.set_datatype(store, CuVector{eltype(store),default_buffertype()}) - ) -end diff --git a/src/cuarray/set_types.jl b/src/cuarray/set_types.jl deleted file mode 100644 index 21beda4..0000000 --- a/src/cuarray/set_types.jl +++ /dev/null @@ -1,17 +0,0 @@ -buffertype(datatype::Type{<:CuArray{<:Any,<:Any,B}}) where {B} = B -function buffertype(datatype::Type{<:CuArray}) - println( - "CuArray definitions require a CUDA.Mem buffer try $(datatype{default_buffertype()})" - ) - throw(TypeError) -end - -default_buffertype() = CUDA.Mem.DeviceBuffer - -function set_eltype(arraytype::Type{<:CuArray}, eltype::Type) - return CuArray{eltype,ndims(arraytype),buffertype(arraytype)} -end - -function set_ndims(arraytype::Type{<:CuArray}, ndims) - return CuArray{eltype(arraytype),ndims,buffertype(arraytype)} -end diff --git a/src/cuitensor.jl 
b/src/cuitensor.jl deleted file mode 100644 index 667aa28..0000000 --- a/src/cuitensor.jl +++ /dev/null @@ -1,127 +0,0 @@ -import ITensors.NDTensors: NeverAlias, AliasStyle, AllowAlias -import ITensors: ITensor -import CUDA: CuArray - -function cuITensor(eltype::Type{<:Number}, inds::IndexSet) - return itensor( - NDTensors.default_storagetype(CuVector{eltype,default_buffertype()})(dim(inds)), inds - ) -end -cuITensor(::Type{T}, inds::Index...) where {T<:Number} = cuITensor(T, IndexSet(inds...)) - -cuITensor(is::IndexSet) = cuITensor(Float64, is) -cuITensor(inds::Index...) = cuITensor(IndexSet(inds...)) - -cuITensor() = ITensor() -function cuITensor(x::S, inds::IndexSet{N}) where {S<:Number,N} - d = NDTensors.Dense{S,CuVector{S,default_buffertype()}}( - fill!(CuVector{S,default_buffertype()}(undef, dim(inds)), x) - ) - return ITensor(d, inds) -end -cuITensor(x::S, inds::Index...) where {S<:Number} = cuITensor(x, IndexSet(inds...)) - -function ITensor( - as::AliasStyle, - eltype::Type{<:Number}, - A::CuArray{<:Number}, - inds::Indices{Index{Int}}; - kwargs..., -) - length(A) ≠ dim(inds) && throw( - DimensionMismatch( - "In ITensor(::CuArray, inds), length of AbstractArray ($(length(A))) must match total dimension of IndexSet ($(dim(inds)))", - ), - ) - data = CuArray{eltype}(as, A) - return itensor(Dense(data), inds) -end - -# Fix ambiguity error -function ITensor( - as::NDTensors.AliasStyle, eltype::Type{<:Number}, A::CuArray{<:Number}, inds::Tuple{} -) - length(A) ≠ dim(inds) && throw( - DimensionMismatch( - "In ITensor(::CuArray, inds), length of AbstractArray ($(length(A))) must match total dimension of IndexSet ($(dim(inds)))", - ), - ) - data = CuArray{eltype}(as, A) - return itensor(Dense(data), inds) -end - -# Helper functions for different view behaviors -CuArray{ElT,N}(::NeverAlias, A::AbstractArray) where {ElT,N} = CuArray{ElT,N}(A) -function CuArray{ElT,N}(::AllowAlias, A::AbstractArray) where {ElT,N} - return convert(CuArray{ElT,N}, A) -end -function CuArray{ElT}(as::AliasStyle, A::AbstractArray{ElTA,N}) where {ElT,N,ElTA} - return CuArray{ElT,N}(as, A) -end - -# TODO: Change to: -# (Array{ElT, N} where {ElT})([...]) = [...] -# once support for `VERSION < v"1.6"` is dropped. -# Previous to Julia v1.6 `where` syntax couldn't be used in a function name -function CuArray{<:Any,N}(as::AliasStyle, A::AbstractArray{ElTA,N}) where {N,ElTA} - return CuArray{ElTA,N}(as, A) -end - -cuITensor(data::Array, inds...) = cu(ITensor(data, inds...)) - -cuITensor(data::CuArray, inds...) = ITensor(data, inds...) - -cuITensor(A::ITensor) = cu(A) - -function randomCuITensor(::Type{S}, inds::Indices) where {S<:Real} - T = cuITensor(S, inds) - randn!(T) - return T -end -function randomCuITensor(::Type{S}, inds::Indices) where {S<:Complex} - Tr = cuITensor(real(S), inds) - Ti = cuITensor(real(S), inds) - randn!(Tr) - randn!(Ti) - return complex(Tr) + im * Ti -end -function randomCuITensor(::Type{S}, inds::Index...) where {S<:Number} - return randomCuITensor(S, IndexSet(inds...)) -end -randomCuITensor(inds::IndexSet) = randomCuITensor(Float64, inds) -randomCuITensor(inds::Index...) = randomCuITensor(Float64, IndexSet(inds...)) - -CuArray(T::ITensor) = CuArray(tensor(T)) - -function CuArray{ElT,N}(T::ITensor, is::Vararg{Index,N}) where {ElT,N} - ndims(T) != N && throw( - DimensionMismatch( - "cannot convert an $(ndims(T)) dimensional ITensor to an $N-dimensional CuArray." 
- ), - ) - TT = tensor(permute(T, is...; allow_alias=true)) - return CuArray{ElT,N}(TT)::CuArray{ElT,N} -end - -function CuArray{ElT}(T::ITensor, is::Vararg{Index,N}) where {ElT,N} - return CuArray{ElT,N}(T, is...) -end - -function CuArray(T::ITensor, is::Vararg{Index,N}) where {N} - return CuArray{eltype(T),N}(T, is...)::CuArray{<:Number,N} -end - -CUDA.CuMatrix(A::ITensor) = CuArray(A) - -function CuVector(A::ITensor) - if ndims(A) != 1 - throw(DimensionMismatch("Vector() expected a 1-index ITensor")) - end - return CuArray(A) -end - -function CuMatrix(T::ITensor, i1::Index, i2::Index) - ndims(T) != 2 && - throw(DimensionMismatch("ITensor must be order 2 to convert to a Matrix")) - return CuArray(T, i1, i2) -end diff --git a/src/mps/cumps.jl b/src/mps/cumps.jl deleted file mode 100644 index f59099f..0000000 --- a/src/mps/cumps.jl +++ /dev/null @@ -1,13 +0,0 @@ -# cu(ψ::Union{MPS,MPO}) = map(cu, ψ) -# cpu(ψ::Union{MPS,MPO}) = map(cpu, ψ) - -cuMPS(ψ::MPS) = cu(ψ) -cuMPS(args...; kwargs...) = cu(MPS(args...; kwargs...)) -randomCuMPS(args...; kwargs...) = cu(randomMPS(args...; kwargs...)) - -# For backwards compatibility -productCuMPS(args...; kwargs...) = cuMPS(args...; kwargs...) - -cuMPO(M::MPO) = cu(M) -cuMPO(args...; kwargs...) = cu(MPO(args...; kwargs...)) -randomCuMPO(args...; kwargs...) = cu(randomMPO(args...; kwargs...)) diff --git a/src/tensor/cucombiner.jl b/src/tensor/cucombiner.jl deleted file mode 100644 index d01109a..0000000 --- a/src/tensor/cucombiner.jl +++ /dev/null @@ -1 +0,0 @@ -Base.promote_rule(::Type{<:Combiner}, StorageT::Type{<:CuDense}) = StorageT diff --git a/src/tensor/cudense.jl b/src/tensor/cudense.jl deleted file mode 100644 index 2db9055..0000000 --- a/src/tensor/cudense.jl +++ /dev/null @@ -1,573 +0,0 @@ -using LinearAlgebra: BlasFloat - -const CuDense{ElT,VecT} = Dense{ElT,VecT} where {VecT<:CuVector} -const CuDenseTensor{ElT,N,StoreT,IndsT} = Tensor{ElT,N,StoreT,IndsT} where {StoreT<:CuDense} - -# function Dense{T,SA}(x::Dense{T,SB}) where {T<:Number,SA<:CuArray,SB<:Array} -# return Dense{T,SA}(CuArray(x)) -# end -# function Dense{T,SA}(x::Dense{T,SB}) where {T<:Number,SA<:Array,SB<:CuArray} -# return Dense{T,SA}(collect(x.data)) -# end -# Dense{T,S}(size::Integer) where {T,S<:CuArray{<:T}} = Dense{T,S}(CUDA.zeros(T, size)) -# function Dense{T,S}(x::T, size::Integer) where {T,S<:CuArray{<:T}} -# arr = CuArray{T}(undef, size) -# fill!(arr, x) -# return Dense{T,S}(arr) -# end - -function Base.complex(::Type{Dense{ElT,VT}}) where {ElT,VT<:CuArray} - return Dense{complex(ElT),CuVector{complex(ElT)}} -end - -CuArray(x::CuDense{ElT}) where {ElT} = CuVector{ElT}(data(x)) -function CuArray{ElT,N}(x::CuDenseTensor{ElT,N}) where {ElT,N} - return CuArray{ElT,N}(reshape(data(store(x)), dims(inds(x))...)) -end -CuArray(x::CuDenseTensor{ElT,N}) where {ElT,N} = CuArray{ElT,N}(x) - -*(D::Dense{T,AT}, x::S) where {T,AT<:CuArray,S<:Number} = Dense(x .* data(D)) - -Base.getindex(D::CuDense{<:Number}) = collect(data(D))[] -Base.getindex(D::CuDenseTensor{<:Number,0}) = store(D)[] -LinearAlgebra.norm(T::CuDenseTensor) = norm(data(store(T))) - -function Base.copyto!(R::CuDenseTensor{<:Number,N}, T::CuDenseTensor{<:Number,N}) where {N} - RA = array(R) - TA = array(T) - RA .= TA - return R -end - -# This is for type promotion for Scalar*Dense -function Base.promote_rule( - ::Type{<:Dense{ElT1,CuVector{ElT1}}}, ::Type{ElT2} -) where {ElT1,ElT2<:Number} - ElR = promote_type(ElT1, ElT2) - VecR = CuVector{ElR} - return Dense{ElR,VecR} -end - -function permutedims!!( - 
B::Tensor{ElT,N,StoreT,IndsB}, - A::Tensor{ElT,N,StoreT,IndsA}, - perm::NTuple{N,Int}, - f::Function=(r, t) -> permute!(r, t), -) where {N,ElT,IndsB,IndsA,StoreT<:CuDense{ElT}} - Ais = inds(A) - Bis = ITensors.NDTensors.permute(inds(A), perm) - B = f(B, A) - return B -end - -import ITensors.NDTensors: GemmBackend, auto_select_backend, _gemm! -function backend_cutensor() - return gemm_backend[] = :CUTENSOR -end -function backend_cublas() - return gemm_backend[] = :CUBLAS -end - -@inline function auto_select_backend( - ::Type{<:CuArray{<:BlasFloat}}, - ::Type{<:CuArray{<:BlasFloat}}, - ::Type{<:CuArray{<:BlasFloat}}, -) - return GemmBackend(:CUBLAS) -end - -@inline function auto_select_backend( - ::Type{<:CuArray{<:BlasFloat}}, ::Type{<:CuArray{<:BlasFloat}}, ::Type{<:AbstractVecOrMat} -) - return GemmBackend(:GenericCUDA) -end - -# CUBLAS matmul -function _gemm!( - ::GemmBackend{:CUBLAS}, - tA, - tB, - alpha, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta, - C::AbstractVecOrMat, -) - return CUBLAS.gemm!(tA, tB, alpha, A, B, beta, C) -end - -# CUDA generic matmul -function _gemm!( - ::GemmBackend{:GenericCUDA}, - tA, - tB, - alpha, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta, - C::CuDenseTensor, -) - C_dat = reshape(data(store(C)), size(C)) - A_ = tA == 'T' ? transpose(A) : A - B_ = tB == 'T' ? transpose(B) : B - C_dat = mul!(C_dat, A_, B_, alpha, beta) - copyto!(data(store(C)), C_dat) - return C -end - -function _contract_scalar!( - R::CuDenseTensor{ElR,NR}, - labelsR, - T₁::CuDenseTensor, - labelsT₁, - T₂::CuDenseTensor, - labelsT₂, - α=one(ElR), - β=zero(ElR), -) where {ElR,NR} - if nnz(T₁) == nnz(T₂) == 1 - new_R = Tensor(Dense(data(store(T₁)) .* data(store(T₂))), inds(R)) - copyto!(store(R), store(new_R)) - elseif nnz(T₁) == 1 - props = ContractionProperties(labelsT₁, labelsT₂, labelsR) - compute_contraction_properties!(props, T₁, T₂, R) - R = _contract!(R, T₁, T₂, props, α, β) - elseif nnz(T₂) == 1 - props = ContractionProperties(labelsT₁, labelsT₂, labelsR) - compute_contraction_properties!(props, T₁, T₂, R) - R = _contract!(R, T₁, T₂, props, α, β) - else - error("In _contract_scalar!, one tensor must be a scalar") - end - return R -end - -function _gemm_contract!( - CT::DenseTensor{El,NC}, - AT::DenseTensor{El,NA}, - BT::DenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - # TODO: directly use Tensor instead of Array - C = array(CT) - A = array(AT) - B = array(BT) - - tA = 'N' - if props.permuteA - pA = NTuple{NA,Int}(props.PA) - Ap = permutedims(A, pA) - AM = reshape(Ap, props.dmid, props.dleft) - tA = 'T' - else - #A doesn't have to be permuted - if Atrans(props) - AM = reshape(A, props.dmid, props.dleft) - tA = 'T' - else - AM = reshape(A, props.dleft, props.dmid) - end - end - - tB = 'N' - if props.permuteB - pB = NTuple{NB,Int}(props.PB) - Bp = permutedims(B, pB) - BM = reshape(Bp, props.dmid, props.dright) - else - if Btrans(props) - BM = reshape(B, props.dright, props.dmid) - tB = 'T' - else - BM = reshape(B, props.dmid, props.dright) - end - end - - #TODO: this logic may be wrong - if props.permuteC - #Need to copy here since we will be permuting - #into C later - CM = reshape(copy(C), props.dleft, props.dright) - else - if Ctrans(props) - CM = reshape(C, props.dleft, props.dright) - (AM, BM) = (BM, AM) - if tA == tB - tA = tB = (tA == 'T' ? 
'N' : 'T') - end - else - CM = reshape(C, props.dleft, props.dright) - end - end - - CM = CUBLAS.gemm!(tA, tB, El(α), AM, BM, El(β), CM) - - if props.permuteC - pC = NTuple{NC,Int}(props.PC) - Cr = reshape(CM, props.newCrange...) - @strided C .= permutedims(Cr, pC) - end - return C -end - -function _contract!( - CT::CuDenseTensor{El,NC}, - AT::CuDenseTensor{El,NA}, - BT::CuDenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - if ndims(CT) > 12 || ndims(BT) > 12 || ndims(AT) > 12 - return _gemm_contract!(CT, AT, BT, props, α, β) - end - Ainds = inds(AT) - Adims = dims(Ainds) - Binds = inds(BT) - Bdims = dims(Binds) - Cinds = inds(CT) - Cdims = dims(Cinds) - Adata = reshape(data(store(AT)), Adims...) - Bdata = reshape(data(store(BT)), Bdims...) - Cdata = reshape(data(store(CT)), Cdims...) - contracted = commoninds(Ainds, Binds) - A_only = uniqueinds(Ainds, Binds) - B_only = uniqueinds(Binds, Ainds) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(contracted) - push!(ind_dict, i) - end - if length(A_only) > 0 - for (idx, i) in enumerate(A_only) - push!(ind_dict, i) - end - end - if length(B_only) > 0 - for (idx, i) in enumerate(B_only) - push!(ind_dict, i) - end - end - ctainds = zeros(Int, length(Ainds)) - ctbinds = zeros(Int, length(Binds)) - ctcinds = zeros(Int, length(Cinds)) - for (ii, ia) in enumerate(Ainds) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Binds) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - for (ii, ic) in enumerate(Cinds) - ctcinds[ii] = findfirst(x -> x == ic, ind_dict) - end - id_op = cuTENSOR.CUTENSOR_OP_IDENTITY - dict_key = "" - for cc in zip(ctcinds, Cdims) - dict_key *= string(cc[1]) * "," * string(cc[2]) * "," - end - for aa in zip(ctainds, Adims) - dict_key *= string(aa[1]) * "," * string(aa[2]) * "," - end - for bb in zip(ctbinds, Bdims) - dict_key *= string(bb[1]) * "," * string(bb[2]) * "," - end - if haskey(ENV, "CUTENSOR_AUTOTUNE") && tryparse(Int, ENV["CUTENSOR_AUTOTUNE"]) == 1 - if haskey(ContractionPlans, dict_key) - dict_val = ContractionPlans[dict_key] - algo = dict_val - #plan = dict_val[2] - Cdata = cuTENSOR.contraction!( - α, - Adata, - Vector{Char}(ctainds), - id_op, - Bdata, - Vector{Char}(ctbinds), - id_op, - β, - Cdata, - Vector{Char}(ctcinds), - id_op, - id_op; - algo=algo, - ) - else - # loop through all algos - # pick the fastest one - # store that plan! 
- best_time = 1e6 - best_plan = nothing - best_algo = nothing - max_algos = Ref{Int32}(C_NULL) - cuTENSOR.cutensorContractionMaxAlgos(max_algos) - # fix once the other options are documented - #algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(max_algos[] - 1)) - algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(-1)) - for algo in reverse(algos) - try - Cdata, this_time, bytes, gctime, memallocs = @timed cuTENSOR.contraction!( - α, - Adata, - Vector{Char}(ctainds), - id_op, - Bdata, - Vector{Char}(ctbinds), - id_op, - β, - Cdata, - Vector{Char}(ctcinds), - id_op, - id_op; - algo=cuTENSOR.cutensorAlgo_t(algo), - ) - if this_time < best_time - best_time = this_time - #best_plan = this_plan - best_algo = cuTENSOR.cutensorAlgo_t(algo) - end - catch err - @warn "Algorithm $algo not supported" - end - end - ContractionPlans[dict_key] = best_algo - end - else - Cdata = cuTENSOR.contraction!( - α, - Adata, - Vector{Char}(ctainds), - id_op, - Bdata, - Vector{Char}(ctbinds), - id_op, - β, - Cdata, - Vector{Char}(ctcinds), - id_op, - id_op, - ) - end - return parent(Cdata) -end - -function Base.:+(B::CuDenseTensor, A::CuDenseTensor) - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - Ais = inds(A) - Bis = inds(B) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(inds(A)) - push!(ind_dict, i) - end - Adata = data(store(A)) - Bdata = data(store(B)) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)...) - cuTENSOR.elementwiseBinary!( - one(eltype(Adata)), - reshapeAdata, - ctainds, - opA, - one(eltype(Bdata)), - reshapeBdata, - ctbinds, - opC, - C, - ctcinds, - opAC, - ) - copyto!(data(store(B)), vec(C)) - return B -end - -function Base.:+(B::CuDense, Bis::IndexSet, A::CuDense, Ais::IndexSet) - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(A) - Bdata = data(B) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)...) - Cis = Bis - C = cuTENSOR.elementwiseBinary!( - 1, reshapeAdata, ctainds, opA, 1, reshapeBdata, ctbinds, opC, C, ctcinds, opAC - ) - copyto!(data(B), vec(C)) - return C -end - -function Base.:-(B::CuDenseTensor, A::CuDenseTensor) - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - Ais = inds(A) - Bis = inds(B) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(inds(A)) - push!(ind_dict, i) - end - Adata = data(store(A)) - Bdata = data(store(B)) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) 
- ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)) - cuTENSOR.elementwiseBinary!( - -one(eltype(Adata)), - reshapeAdata, - ctainds, - opA, - one(eltype(Bdata)), - reshapeBdata, - ctbinds, - opC, - C, - ctcinds, - opAC, - ) - copyto!(data(store(B)), vec(C)) - return B -end - -function Base.:-(A::CuDense, Ais::IndexSet, B::CuDense, Bis::IndexSet) - opA = cuTENSOR.CUTENSOR_OP_IDENTITY - opC = cuTENSOR.CUTENSOR_OP_IDENTITY - opAC = cuTENSOR.CUTENSOR_OP_ADD - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(A) - Bdata = data(B) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - ctcinds = copy(ctbinds) - C = CUDA.zeros(eltype(Bdata), dims(Bis)...) - Cis = Bis - C = cuTENSOR.elementwiseBinary!( - one(eltype(Adata)), - reshapeAdata, - ctainds, - opA, - -one(eltype(Bdata)), - reshapeBdata, - ctbinds, - opC, - C, - ctcinds, - opAC, - ) - copyto!(data(B), vec(C)) - return C -end - -function Base.permute!(B::CuDenseTensor, A::CuDenseTensor) - Ais = inds(A) - Bis = inds(B) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(store(A)) - Bdata = data(store(B)) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) - if ndims(A) < 40 # use CUTENSOR - ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - cuTENSOR.permutation!( - one(eltype(Adata)), - reshapeAdata, - Vector{Char}(ctainds), - reshapeBdata, - Vector{Char}(ctbinds), - ) - else # use GPUArrays - perm = Int[] - for aix in Ais - b_pos = findfirst(bix -> bix == aix, Bis) - push!(perm, b_pos) - end - @assert isperm(perm) - permutedims!(reshapeBdata, reshapeAdata, invperm(perm)) - end - return Tensor(Dense(vec(reshapeBdata)), inds(B)) -end - -function Base.permute!(B::CuDense, Bis::IndexSet, A::CuDense, Ais::IndexSet) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(Ais) - push!(ind_dict, i) - end - Adata = data(A) - Bdata = data(B) - reshapeBdata = reshape(Bdata, dims(Bis)...) - reshapeAdata = reshape(Adata, dims(Ais)...) 
- ctainds = zeros(Int, length(Ais)) - ctbinds = zeros(Int, length(Bis)) - for (ii, ia) in enumerate(Ais) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Bis) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - - cuTENSOR.permutation!( - one(eltype(Adata)), - reshapeAdata, - Vector{Char}(ctainds), - reshapeBdata, - Vector{Char}(ctbinds), - ) - return Tensor(Dense(reshapeBdata), Tuple(Bis)) -end - -Base.:/(A::CuDenseTensor, x::Number) = A * inv(x) diff --git a/src/tensor/cudiag.jl b/src/tensor/cudiag.jl deleted file mode 100644 index 0e27beb..0000000 --- a/src/tensor/cudiag.jl +++ /dev/null @@ -1,228 +0,0 @@ -export CuDiag - -const CuDiag{ElT,VecT} = Diag{ElT,VecT} where {VecT<:CuArray{ElT}} -const NonuniformCuDiagTensor{ElT,N,StoreT,IndsT} = - Tensor{ElT,N,StoreT,IndsT} where {StoreT<:CuDiag} - -CuArray(D::NonuniformCuDiagTensor) = CuArray(dense(D)) - -function NDTensors.dense(T::NonuniformCuDiagTensor{ElT}) where {ElT} - R_data = CUDA.zeros(ElT, dim(inds(T))) - diag_inds = diagind(reshape(R_data, dims(inds(T))), 0) - R_data[diag_inds] .= data(store(T)) - return Tensor(Dense(R_data), inds(T)) -end - -function Base.promote_rule( - ::Type{Diag{ElT2,VecT2}}, ::Type{CuDense} -) where {ElT2,VecT2<:Number} - return promote_type(DenseT1, ElT2) -end - -function Base.promote_rule( - ::Type{<:Tensor{ElT1,N1,StoreT1}}, ::Type{<:Tensor{ElT2,N2,StoreT2}} -) where {ElT1,ElT2,N1,N2,StoreT1<:CuDense,StoreT2<:UniformDiag} - ElT3 = promote_type(ElT1, ElT2) - return Tensor{promote_type(ElT1, ElT2),N3,CuDense{ElT3,CuVector{ElT3}}} where {N3} -end -function Base.promote_rule( - ::Type{<:Tensor{ElT1,N1,StoreT1}}, ::Type{<:Tensor{ElT2,N2,StoreT2}} -) where {ElT1,ElT2,N1,N2,StoreT1<:UniformDiag,StoreT2<:CuDense} - ElT3 = promote_type(ElT1, ElT2) - return Tensor{promote_type(ElT1, ElT2),N3,CuDense{ElT3,CuVector{ElT3}}} where {N3} -end - -function Base.promote_rule( - ::Type{<:UniformDiag{ElT1}}, ::Type{<:NonuniformDiag{ElT2,VecT2}} -) where {ElT1,ElT2,VecT2<:CuArray} - ElT3 = promote_type(ElT1, ElT2) - return NonuniformDiag{ElT3,CuVector{ElT3}} -end -function Base.promote_rule( - ::Type{<:NonuniformDiag{ElT2,VecT2}}, ::Type{<:UniformDiag{ElT1}} -) where {ElT1,ElT2,VecT2<:CuArray} - ElT3 = promote_type(ElT1, ElT2) - return NonuniformDiag{ElT3,CuVector{ElT3}} -end -function Base.promote_rule( - ::Type{DenseT1}, ::Type{Diag{ElT2,VecT2}} -) where {DenseT1<:CuDense,ElT2,VecT2<:Number} - return promote_type(DenseT1, ElT2) -end - -function contraction_output_type( - TensorT1::Type{<:NonuniformCuDiagTensor}, TensorT2::Type{<:CuDenseTensor}, IndsR -) - return similartype(promote_type(TensorT1, TensorT2), IndsR) -end -function contraction_output_type( - TensorT1::Type{<:CuDenseTensor}, TensorT2::Type{<:NonuniformCuDiagTensor}, IndsR -) - return contraction_output_type(TensorT2, TensorT1, IndsR) -end - -function contraction_output_type( - TensorT1::Type{<:DiagTensor{<:Number,<:CuDiag}}, - TensorT2::Type{<:DiagTensor{<:Number,<:CuDiag}}, - IndsR::Type, -) - return similartype(promote_type(TensorT1, TensorT2), IndsR) -end - -function contraction_output_type( - TensorT1::Type{<:UniformDiagTensor}, - TensorT2::Type{<:DiagTensor{<:Number,<:CuDiag}}, - IndsR::Type, -) - return similartype(promote_type(TensorT1, TensorT2), IndsR) -end -function contraction_output_type( - TensorT1::Type{<:DiagTensor{<:Number,<:CuDiag}}, - TensorT2::Type{<:UniformDiagTensor}, - IndsR::Type, -) - return contraction_output_type(TensorT2, TensorT1, IndsR) -end - -function zero_contraction_output( - 
T1::UniformDiagTensor{ElT1,N1}, T2::CuDenseTensor{ElT2,N2}, indsR::IndsR -) where {ElT1,N1,ElT2,N2,IndsR} - ElT3 = promote_type(ElT1, ElT2) - dat = CUDA.zeros(ElT3, dim(indsR)) - return Tensor(Dense(dat), indsR) -end -function zero_contraction_output( - T2::CuDenseTensor{ElT2,N2}, T1::UniformDiagTensor{ElT1,N1}, indsR::IndsR -) where {ElT1,N1,ElT2,N2,IndsR} - return zero_contraction_output(T1, T2, indsR) -end - -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT1<:NonuniformDiagTensor,TensorT2<:NonuniformCuDiagTensor} - ElT3 = promote_type(eltype(TensorT1), eltype(TensorT2)) - dat = CUDA.zeros(ElT3, length(data(store(T2)))) - return Tensor(Diag(dat), indsR) -end - -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT1<:UniformDiagTensor,TensorT2<:NonuniformCuDiagTensor} - ElT3 = promote_type(eltype(TensorT1), eltype(TensorT2)) - dat = CUDA.zeros(ElT3, length(data(store(T2)))) - return Tensor(Diag(dat), indsR) -end -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT2<:UniformDiagTensor,TensorT1<:NonuniformCuDiagTensor} - return zero_contraction_output(T2, T1, indsR) -end - -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT1<:DiagTensor,TensorT2<:CuDenseTensor} - ElT3 = promote_type(eltype(TensorT1), eltype(TensorT2)) - dat = CUDA.zeros(ElT3, length(data(store(T2)))) - return Tensor(Dense(dat), indsR) -end -function zero_contraction_output( - T1::TensorT1, T2::TensorT2, indsR -) where {TensorT2<:DiagTensor,TensorT1<:CuDenseTensor} - return zero_contraction_output(T2, T1, indsR) -end - -function contraction_output( - T1::UniformDiagTensor, T2::DiagTensor{Elt2,<:CuDiag}, indsR -) where {Elt2} - return zero_contraction_output(T1, T2, indsR) -end -function contraction_output( - T1::DiagTensor{Elt1,<:CuDiag}, T2::UniformDiagTensor, indsR -) where {Elt1} - return contraction_output(T2, T1, indsR) -end - -function contract!( - C::CuDenseTensor{<:Number,NC}, - Clabels, - A::UniformDiagTensor{<:Number,NA}, - Alabels, - B::CuDenseTensor{<:Number,NB}, - Blabels, -) where {NC,NA,NB} - Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -function contract!( - C::CuDenseTensor{<:Number,NC}, - Clabels, - A::CuDenseTensor{<:Number,NA}, - Alabels, - B::UniformDiagTensor{<:Number,NB}, - Blabels, -) where {NC,NA,NB} - return contract!(C, Clabels, B, Blabels, A, Alabels) -end - -function contract!( - C::NonuniformCuDiagTensor{EltC,NC,IndsC}, - Clabels, - A::UniformDiagTensor{EltA,NA,IndsA}, - Alabels, - B::NonuniformCuDiagTensor{EltB,NB,IndsB}, - Blabels, -) where {EltC<:Number,EltB<:Number,EltA<:Number,NC,NB,NA,IndsA,IndsB,IndsC} - Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -function contract!( - C::NonuniformCuDiagTensor{EltC,NC,IndsC}, - Clabels, - B::NonuniformCuDiagTensor{EltB,NB,IndsB}, - Blabels, - A::UniformDiagTensor{EltA,NA,IndsA}, - Alabels, -) where {EltC<:Number,EltB<:Number,EltA<:Number,NC,NB,NA,IndsA,IndsB,IndsC} - Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -function contract!( - C::NonuniformCuDiagTensor{EltC,NC,IndsC}, - Clabels, - B::NonuniformCuDiagTensor{EltB,NB,IndsB}, - Blabels, - A::NonuniformCuDiagTensor{EltA,NA,IndsA}, - Alabels, -) where {EltC<:Number,EltB<:Number,EltA<:Number,NC,NB,NA,IndsA,IndsB,IndsC} - 
Bstore = data(store(B)) - Astore = data(store(A)) - Cstore = data(store(C)) - return copyto!(Cstore, Astore .* Bstore) -end - -# Dense * NonuniformCuDiag -function contract!( - C::CuDenseTensor, Clabels, A::NonuniformCuDiagTensor, Alabels, B::CuDenseTensor, Blabels -) - Astore = data(store(A)) - newAstore = CUDA.zeros(eltype(A), dims(inds(A))[1], dims(inds(A))[2]) - adi = diagind(newAstore, 0) - newAstore[adi] .= Astore[:] - newA = Tensor(Dense(vec(newAstore)), inds(A)) - return contract!(C, Clabels, newA, Alabels, B, Blabels) -end - -function contract!( - C::CuDenseTensor, Clabels, A::CuDenseTensor, Alabels, B::NonuniformCuDiagTensor, Blabels -) - return contract!(C, Clabels, B, Blabels, A, Alabels) -end diff --git a/src/tensor/culinearalgebra.jl b/src/tensor/culinearalgebra.jl deleted file mode 100644 index c64a3b0..0000000 --- a/src/tensor/culinearalgebra.jl +++ /dev/null @@ -1,159 +0,0 @@ - -# -# Linear Algebra of order 2 Tensors -# -# Even though CuDenseTensor{_,2} is strided -# and passable to BLAS/LAPACK, it cannot -# be made <: StridedArray - -function Base.:*( - T1::Tensor{ElT1,2,StoreT1,IndsT1}, T2::Tensor{ElT2,2,StoreT2,IndsT2} -) where {ElT1,StoreT1<:CuDense,IndsT1,ElT2,StoreT2<:CuDense,IndsT2} - RM = matrix(T1) * matrix(T2) - indsR = IndsT1(ind(T1, 1), ind(T2, 2)) - pT = promote_type(ElT1, ElT2) - return tensor(Dense(vec(RM)), indsR) -end -#= FIX ME -function LinearAlgebra.exp(T::CuDenseTensor{ElT,2}) where {ElT,IndsT} - expTM = exp(matrix(T)) - return tensor(Dense(vec(expTM)),inds(T)) -end - -function expHermitian(T::CuDenseTensor{ElT,2}) where {ElT,IndsT} - # exp(::Hermitian/Symmetric) returns Hermitian/Symmetric, - # so extract the parent matrix - expTM = parent(exp(Hermitian(matrix(T)))) - return tensor(Dense(vec(expTM)),inds(T)) -end -=# - -# svd of an order-2 tensor -function LinearAlgebra.svd(T::CuDenseTensor{ElT,2,IndsT}; kwargs...) where {ElT,IndsT} - maxdim::Int = get(kwargs, :maxdim, minimum(dims(T))) - mindim::Int = get(kwargs, :mindim, 1) - cutoff::Float64 = get(kwargs, :cutoff, 0.0) - absoluteCutoff::Bool = get(kwargs, :absoluteCutoff, false) - doRelCutoff::Bool = get(kwargs, :doRelCutoff, true) - fastSVD::Bool = get(kwargs, :fastSVD, false) - # Safer to use `Array`, which ensures - # no views/aliases are made, since - # we are using in-place `CUSOLVER.svd!` below. - aT = Array(T) - @timeit "CUSOLVER svd" begin - MU, MS, MV = CUSOLVER.svd!(aT) - end - if !(MV isa CuMatrix) - # Materialize any array wrappers, - # for now, since `Adjoint` wrappers - # seem to cause issues throughout - # CUDA.jl, for example with slicing, - # reshaping and then copying, etc. - # TODO: Fix this in a more robust way. 
- MV = copy(MV) - end - # for consistency with cpu version, - # ITensors.jl/NDTensors/src/linearalgebra.jl/svd - # need conj!(MV) - conj!(MV) - P = MS .^ 2 - truncerr, docut, P = truncate!( - P; - mindim=mindim, - maxdim=maxdim, - cutoff=cutoff, - absoluteCutoff=absoluteCutoff, - doRelCutoff=doRelCutoff, - ) - spec = Spectrum(P, truncerr) - dS = length(P) - if dS < length(MS) - MU = MU[:, 1:dS] - MS = MS[1:dS] - MV = MV[:, 1:dS] - end - - # Make the new indices to go onto U and V - u = eltype(IndsT)(dS) - v = eltype(IndsT)(dS) - Uinds = IndsT((ind(T, 1), u)) - Sinds = IndsT((u, v)) - Vinds = IndsT((ind(T, 2), v)) - U = tensor(Dense(vec(MU)), Uinds) - Sdata = CUDA.zeros(ElT, dS * dS) - dsi = diagind(reshape(Sdata, dS, dS), 0) - Sdata[dsi] = MS - S = tensor(Dense(Sdata), Sinds) - V = tensor(Dense(vec(MV)), Vinds) - return U, S, V, spec -end - -function LinearAlgebra.eigen( - T::Hermitian{ElT,<:CuDenseTensor{ElT,2,IndsT}}; kwargs... -) where {ElT<:Union{Real,Complex},IndsT} - ispossemidef::Bool = get(kwargs, :ispossemidef, false) - maxdim::Int = get(kwargs, :maxdim, minimum(dims(T))) - mindim::Int = get(kwargs, :mindim, 1) - cutoff::Float64 = get(kwargs, :cutoff, 0.0) - absoluteCutoff::Bool = get(kwargs, :absoluteCutoff, false) - doRelCutoff::Bool = get(kwargs, :doRelCutoff, true) - @timeit "CUSOLVER eigen" begin - local DM, UM - if ElT <: Complex - DM, UM = CUSOLVER.heevd!('V', 'U', matrix(parent(T))) - else - DM, UM = CUSOLVER.syevd!('V', 'U', matrix(parent(T))) - end - end - DM_ = reverse(DM) - @timeit "truncate" begin - truncerr, docut, DM = truncate!( - DM_; - maxdim=maxdim, - cutoff=cutoff, - absoluteCutoff=absoluteCutoff, - doRelCutoff=doRelCutoff, - ) - end - spec = Spectrum(DM, truncerr) - dD = length(DM) - dV = reverse(UM; dims=2) - if dD < size(dV, 2) - dV = CuMatrix(dV[:, 1:dD]) - end - # Make the new indices to go onto U and V - l = eltype(IndsT)(dD) - r = eltype(IndsT)(dD) - Vinds = IndsT((dag(ind(T, 2)), dag(r))) - Dinds = IndsT((l, dag(r))) - U = tensor(Dense(vec(dV)), Vinds) - D = tensor(Diag(real.(DM)), Dinds) - return D, U, spec -end - -function LinearAlgebra.qr(T::CuDenseTensor{ElT,2,IndsT}; kwargs...) where {ElT,IndsT} - QM, RM = qr(matrix(T)) - # Make the new indices to go onto Q and R - q, r = inds(T) - q = dim(q) < dim(r) ? 
sim(q) : sim(r) - Qinds = IndsT((ind(T, 1), q)) - Rinds = IndsT((q, ind(T, 2))) - QM = CuMatrix(QM) - Q = tensor(Dense(vec(QM)), Qinds) - R = tensor(Dense(vec(RM)), Rinds) - return Q, R -end - -function polar(T::CuDenseTensor{ElT,2,IndsT}) where {ElT,IndsT} - QM, RM = polar(matrix(T)) - dim = size(QM, 2) - # Make the new indices to go onto Q and R - q = eltype(IndsT)(dim) - # TODO: use push/pushfirst instead of a constructor - # call here - Qinds = IndsT((ind(T, 1), q)) - Rinds = IndsT((q, ind(T, 2))) - Q = tensor(Dense(vec(QM)), Qinds) - R = tensor(Dense(vec(RM)), Rinds) - return Q, R -end diff --git a/src/tensor/cutruncate.jl b/src/tensor/cutruncate.jl deleted file mode 100644 index 108bc1e..0000000 --- a/src/tensor/cutruncate.jl +++ /dev/null @@ -1,87 +0,0 @@ -import LinearAlgebra: BlasReal - -function truncate!(P::CuVector{T}; kwargs...)::Tuple{T,T,CuVector{T}} where {T<:BlasReal} - maxdim::Int = min(get(kwargs, :maxdim, length(P)), length(P)) - mindim::Int = min(get(kwargs, :mindim, 1), maxdim) - cutoff::Float64 = get(kwargs, :cutoff, 0.0) - absoluteCutoff::Bool = get(kwargs, :absoluteCutoff, false) - doRelCutoff::Bool = get(kwargs, :doRelCutoff, true) - origm = length(P) - docut = zero(T) - - maxP = maximum(P) - if maxP == zero(T) - P = CUDA.zeros(T, 1) - return zero(T), zero(T), P - end - if origm == 1 - docut = maxP / 2 - return zero(T), docut, P[1:1] - end - @timeit "setup rP" begin - #Zero out any negative weight - #neg_z_f = (!signbit(x) ? x : 0.0) - rP = map(x -> !signbit(x) ? Float64(x) : 0.0, P) - n = origm - end - @timeit "handle cutoff" begin - if absoluteCutoff - #Test if individual prob. weights fall below cutoff - #rather than using *sum* of discarded weights - sub_arr = rP .- Float64(cutoff) - err_rP = sub_arr ./ abs.(sub_arr) - flags = reinterpret(Float64, (signbit.(err_rP) .<< 1 .& 2) .<< 61) - cut_ind = CUDA.CUBLAS.iamax(length(err_rP), err_rP .* flags) - 1 - if cut_ind > 0 - n = min(maxdim, cut_ind) - n = max(n, mindim) - else - n = maxdim - end - truncerr = T(sum(rP[(n + 1):end])) - else - truncerr = zero(T) - scale = one(T) - @timeit "find scale" begin - if doRelCutoff - scale = sum(P) - scale = scale > zero(T) ? scale : one(T) - end - end - #Truncating until *sum* of discarded probability - #weight reaches cutoff reached (or m==mindim) - csum_rp = Float64.(CUDA.reverse(CUDA.cumsum(CUDA.reverse(rP)))) - sub_arr = csum_rp .- Float64(cutoff * scale) - err_rP = sub_arr ./ abs.(sub_arr) - flags = reinterpret(Float64, (signbit.(err_rP) .<< 1 .& 2) .<< 61) - cut_ind = (CUDA.CUBLAS.iamax(length(err_rP), err_rP .* flags) - 1) - if cut_ind > 0 - n = min(maxdim, cut_ind) - n = max(n, mindim) - else - n = maxdim - end - truncerr = sum(rP[(n + 1):end]) - if scale == zero(T) - truncerr = zero(T) - else - truncerr /= scale - end - end - end - if n < 1 - n = 1 - end - if n < origm - hP = collect(P) - docut = (hP[n] + hP[n + 1]) / 2 - if abs(hP[n] - hP[n + 1]) < 1E-3 * hP[n] - docut += T(1E-3) * hP[n] - end - end - @timeit "setup return" begin - rinds = 1:n - rrP = P[rinds] - end - return truncerr, docut, rrP -end diff --git a/src/tensor/dense.jl b/src/tensor/dense.jl deleted file mode 100644 index 04e1cb8..0000000 --- a/src/tensor/dense.jl +++ /dev/null @@ -1,285 +0,0 @@ -function contract!!( - R::DenseTensor{<:Number,NR}, - labelsR::NTuple{NR}, - T1::DenseTensor{<:Number,N1}, - labelsT1::NTuple{N1}, - T2::DenseTensor{<:Number,N2}, - labelsT2::NTuple{N2}, - α::Number=1, - β::Number=0, -) where {NR,N1,N2} - if N1 == 0 - (α ≠ 1 || β ≠ 0) && - error("contract!! 
not yet implemented for scalar ITensor with non-trivial α and β") - # TODO: replace with an add! function? - # What about doing `R .= T1[] .* PermutedDimsArray(T2,perm)`? - perm = getperm(labelsR, labelsT2) - R = permutedims!!(R, T2, perm, (r, t2) -> T1[] * t2) - elseif N2 == 0 - (α ≠ 1 || β ≠ 0) && - error("contract!! not yet implemented for scalar ITensor with non-trivial α and β") - perm = getperm(labelsR, labelsT1) - R = permutedims!!(R, T1, perm, (r, t1) -> T2[] * t1) - elseif N1 + N2 == NR - (α ≠ 1 || β ≠ 0) && error( - "contract!! not yet implemented for outer product tensor contraction with non-trivial α and β", - ) - # TODO: permute T1 and T2 appropriately first (can be more efficient - # then permuting the result of T1⊗T2) - # TODO: implement the in-place version directly - R = outer!!(R, T1, T2) - labelsRp = (labelsT1..., labelsT2...) - perm = getperm(labelsR, labelsRp) - if !is_trivial_permutation(perm) - R = permutedims!!(R, copy(R), perm) - end - else - #if dim(T1) > 2^13 && dim(T2) > 2^13 - # R = _big_contract!!(R,labelsR,T1,labelsT1,T2,labelsT2, α, β) - #else - if α ≠ 1 || β ≠ 0 - R = _contract!!(R, labelsR, T1, labelsT1, T2, labelsT2, α, β) - else - R = _contract!!(R, labelsR, T1, labelsT1, T2, labelsT2) - end - #end - end - return R -end - -function _big_contract!!( - R::DenseTensor{<:Number,NR}, - labelsR, - T1::DenseTensor{ElT1,N1}, - labelsT1, - T2::DenseTensor{ElT2,N2}, - labelsT2, - α::Number=1, - β::Number=0, -) where {ElT1,ElT2,N1,N2,NR} - props = ContractionProperties(labelsT1, labelsT2, labelsR) - compute_contraction_properties!(props, T1, T2, R) - _big_contract!(R, T1, T2, props, α, β) - #val, t, _ = @timed _blasmg_contract!(R,T1,T2,props,α,β) - return R -end - -function _big_contract!( - CT::DenseTensor{El,NC}, - AT::DenseTensor{El,NA}, - BT::DenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - Ainds = inds(AT) - Adims = dims(Ainds) - Binds = inds(BT) - Bdims = dims(Binds) - Cinds = inds(CT) - Cdims = dims(Cinds) - Adata = reshape(data(store(AT)), Adims) - Bdata = reshape(data(store(BT)), Bdims) - Cdata = reshape(data(store(CT)), Cdims) - contracted = commoninds(Ainds, Binds) - A_only = uniqueinds(Ainds, Binds) - B_only = uniqueinds(Binds, Ainds) - ind_dict = Vector{Index}() - for (idx, i) in enumerate(contracted) - push!(ind_dict, i) - end - if length(A_only) > 0 - for (idx, i) in enumerate(A_only) - push!(ind_dict, i) - end - end - if length(B_only) > 0 - for (idx, i) in enumerate(B_only) - push!(ind_dict, i) - end - end - ctainds = zeros(Int, length(Ainds)) - ctbinds = zeros(Int, length(Binds)) - ctcinds = zeros(Int, length(Cinds)) - for (ii, ia) in enumerate(Ainds) - ctainds[ii] = findfirst(x -> x == ia, ind_dict) - end - for (ii, ib) in enumerate(Binds) - ctbinds[ii] = findfirst(x -> x == ib, ind_dict) - end - for (ii, ic) in enumerate(Cinds) - ctcinds[ii] = findfirst(x -> x == ic, ind_dict) - end - id_op = cuTENSOR.CUTENSOR_OP_IDENTITY - dict_key = "" - for cc in zip(ctcinds, Cdims) - dict_key *= string(cc[1]) * "," * string(cc[2]) * "," - end - for aa in zip(ctainds, Adims) - dict_key *= string(aa[1]) * "," * string(aa[2]) * "," - end - for bb in zip(ctbinds, Bdims) - dict_key *= string(bb[1]) * "," * string(bb[2]) * "," - end - #=synchronize() - if haskey(ENV, "CUTENSOR_AUTOTUNE") && tryparse(Int, ENV["CUTENSOR_AUTOTUNE"]) == 1 - if haskey(ContractionPlans, dict_key) - dict_val = ContractionPlans[dict_key] - algo = dict_val - Cdata = cuTENSOR.contraction!(α, Adata, Vector{Char}(ctainds), 
id_op, Bdata, Vector{Char}(ctbinds), id_op, β, Cdata, Vector{Char}(ctcinds), id_op, id_op; algo=algo) - synchronize() - else - # loop through all algos - # pick the fastest one - # store that plan! - best_time = 1e6 - best_plan = nothing - best_algo = nothing - max_algos = Ref{Int32}(C_NULL) - cuTENSOR.cutensorContractionMaxAlgos(max_algos) - # fix once the other options are documented - #algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(max_algos[] - 1)) - algos = collect(Cint(cuTENSOR.CUTENSOR_ALGO_GETT):Cint(-1)) - for algo in reverse(algos) - try - Cdata, this_time, bytes, gctime, memallocs = @timed cuTENSOR.contraction!(α, Adata, Vector{Char}(ctainds), id_op, Bdata, Vector{Char}(ctbinds), id_op, β, Cdata, Vector{Char}(ctcinds), id_op, id_op; algo=cuTENSOR.cutensorAlgo_t(algo)) - synchronize() - if this_time < best_time - best_time = this_time - best_algo = cuTENSOR.cutensorAlgo_t(algo) - end - catch err - @warn "Algorithm $algo not supported" - end - end - ContractionPlans[dict_key] = best_algo - end - else - =# - Cdata .= zero(eltype(Cdata)) - #@show size(Adata) - #@show size(Bdata) - #@show size(Cdata) - @assert !any(isnan.(Adata)) - @assert !any(isnan.(Bdata)) - @assert !any(isnan.(Cdata)) - #@show ctainds - #@show ctbinds - #@show ctcinds - #flush(stdout) - CUDA.Mem.pin(Adata) - CUDA.Mem.pin(Bdata) - CUDA.Mem.pin(Cdata) - synchronize() - #AC = CuArray(Adata) - #BC = CuArray(Bdata) - #CC = CuArray(Cdata) - @assert !any(isnan.(Adata)) - @assert !any(isnan.(Bdata)) - @assert !any(isnan.(Cdata)) - #@assert !any(isnan.(AC)) - #@assert !any(isnan.(BC)) - #@assert !any(isnan.(CC)) - #CC = cuTENSOR.contraction!(α, AC, ctainds, id_op, BC, ctbinds, id_op, β, CC, ctcinds, id_op, id_op) - #synchronize() - #@assert !any(isnan.(AC)) - #@assert !any(isnan.(BC)) - #@assert !any(isnan.(CC)) - Cdata = cuTENSOR.contraction!( - α, Adata, ctainds, id_op, Bdata, ctbinds, id_op, β, Cdata, ctcinds, id_op, id_op - ) - synchronize() - #end - #CCh = collect(CC) - #@assert !any(isnan.(CCh)) - #Cdata .= CCh - @assert !any(isnan.(Adata)) - @assert !any(isnan.(Bdata)) - @assert !any(isnan.(Cdata)) - return parent(Cdata) -end - -function _blasmg_contract!( - CT::DenseTensor{El,NC}, - AT::DenseTensor{El,NA}, - BT::DenseTensor{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - # TODO: directly use Tensor instead of Array - C = array(CT) - A = array(AT) - B = array(BT) - - tA = 'N' - if props.permuteA - pA = NTuple{NA,Int}(props.PA) - @strided Ap = permutedims(A, pA) - AM = reshape(Ap, props.dmid, props.dleft) - tA = 'T' - else - #A doesn't have to be permuted - if Atrans(props) - AM = reshape(A, props.dmid, props.dleft) - tA = 'T' - else - AM = reshape(A, props.dleft, props.dmid) - end - end - - tB = 'N' - if props.permuteB - pB = NTuple{NB,Int}(props.PB) - @strided Bp = permutedims(B, pB) - BM = reshape(Bp, props.dmid, props.dright) - else - if Btrans(props) - BM = reshape(B, props.dright, props.dmid) - tB = 'T' - else - BM = reshape(B, props.dmid, props.dright) - end - end - - # TODO: this logic may be wrong - if props.permuteC - # Need to copy here since we will be permuting - # into C later - CM = reshape(copy(C), props.dleft, props.dright) - else - if Ctrans(props) - CM = reshape(C, props.dleft, props.dright) - (AM, BM) = (BM, AM) - if tA == tB - tA = tB = (tA == 'T' ? 
'N' : 'T') - end - else - CM = reshape(C, props.dleft, props.dright) - end - end - - if length(AM) > 4096 && length(BM) > 4096 && length(CM) > 4096 - CM = CUBLASMG.mg_gemm!( - tA, - tB, - El(α), - AM, - BM, - El(β), - CM; - devs=devs[], - dev_rows=dev_rows[], - dev_cols=dev_cols[], - ) - else - BLAS.gemm!(tA, tB, El(α), AM, BM, El(β), CM) - end - - if props.permuteC - pC = NTuple{NC,Int}(props.PC) - Cr = reshape(CM, props.newCrange...) - @strided C .= permutedims(Cr, pC) - end - return C -end diff --git a/src/traits.jl b/src/traits.jl deleted file mode 100644 index 6589298..0000000 --- a/src/traits.jl +++ /dev/null @@ -1,51 +0,0 @@ -# Trait type indicating the object is either on CPU -# or on a CUDA device (for example a type that doesn't -# have any data, like a Combiner or uniform Diagonal -# tensor). -struct CPUorCUDA end - -is_cu(::Type{<:Number}) = CPUorCUDA() -is_cu(::Type{NDTensors.NoData}) = CPUorCUDA() -is_cu(::Type{<:Array}) = false -is_cu(::Type{<:CuArray}) = true - -# Handle Array wrappers like `ReshapedArray`. -@traitfn function is_cu(arraytype::Type{T}) where {T; IsWrappedArray{T}} - return is_cu(parenttype(arraytype)) -end - -is_cu(X::Type{<:TensorStorage}) = is_cu(NDTensors.datatype(X)) -is_cu(X::Type{<:Tensor}) = is_cu(NDTensors.storagetype(X)) -is_cu(::Type{ITensor}) = error("Unknown") - -is_cu(x::CuArray) = is_cu(typeof(x)) -is_cu(x::Array) = is_cu(typeof(x)) - -is_cu(x::TensorStorage) = is_cu(typeof(x)) -is_cu(x::Tensor) = is_cu(typeof(x)) -is_cu(x::ITensor) = is_cu(typeof(tensor(x))) -is_cu(x::MPS) = all(is_cu, x) -is_cu(x::MPO) = all(is_cu, x) - -mixed_cu_cpu(::Bool, ::CPUorCUDA) = false -mixed_cu_cpu(::CPUorCUDA, ::Bool) = false -mixed_cu_cpu(::CPUorCUDA, ::CPUorCUDA) = false -mixed_cu_cpu(is_cu1::Bool, is_cu2::Bool) = (is_cu1 ⊻ is_cu2) -mixed_cu_cpu(T1::Type, T2::Type) = mixed_cu_cpu(is_cu(T1), is_cu(T2)) - -@traitdef MixedCuCPU{T1,T2} - -#! format: off -@traitimpl MixedCuCPU{T1,T2} <- mixed_cu_cpu(T1, T2) -#! 
format: on - -@traitfn function can_contract( - ::Type{T1}, ::Type{T2} -) where {T1<:TensorStorage,T2<:TensorStorage;!MixedCuCPU{T1,T2}} - return true -end -@traitfn function can_contract( - ::Type{T1}, ::Type{T2} -) where {T1<:TensorStorage,T2<:TensorStorage;MixedCuCPU{T1,T2}} - return false -end diff --git a/test/runtests.jl b/test/runtests.jl index 751fc3a..6ad072e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,31 +4,45 @@ end using ITensorGPU, Test, CUDA -if !CUDA.has_cuda() - println("System does not have CUDA, skipping test.") -else - println("Running ITensorGPU tests with a runtime CUDA version: $(CUDA.runtime_version())") - - CUDA.allowscalar(false) - @testset "ITensorGPU.jl" begin - #@testset "$filename" for filename in ("test_cucontract.jl",) - # println("Running $filename with autotune") - # cmd = `$(Base.julia_cmd()) -e 'using Pkg; Pkg.activate(".."); Pkg.instantiate(); include("test_cucontract.jl")'` - # run(pipeline(setenv(cmd, "CUTENSOR_AUTOTUNE" => 1); stdout=stdout, stderr=stderr)) - #end - @testset "$filename" for filename in ( - "test_dmrg.jl", - "test_cuitensor.jl", - "test_cudiag.jl", - "test_cudense.jl", - "test_cucontract.jl", - "test_cumpo.jl", - "test_cumps.jl", - "test_cutruncate.jl", - #"test_pastaq.jl", +@testset "ITensorGPU.jl" begin + @testset "Test exports" begin + @test issetequal( + names(ITensorGPU), + [ + :ITensorGPU, + :cpu, + :cu, + :cuITensor, + :cuMPO, + :cuMPS, + :productCuMPS, + :randomCuITensor, + :randomCuMPO, + :randomCuMPS, + ], + ) + end + if !CUDA.has_cuda() + println("System does not have CUDA, skipping test.") + else + println( + "Running ITensorGPU tests with a runtime CUDA version: $(CUDA.runtime_version())" ) - println("Running $filename") - include(filename) + + CUDA.allowscalar(false) + @testset "ITensorGPU.jl" begin + @testset "$filename" for filename in ( + "test_dmrg.jl", + "test_cuitensor.jl", + "test_cudiag.jl", + "test_cudense.jl", + "test_cucontract.jl", + "test_cumpo.jl", + "test_cumps.jl", + ) + println("Running $filename") + include(filename) + end end end end diff --git a/test/test_cucontract.jl b/test/test_cucontract.jl index ad12e0e..8315ad2 100644 --- a/test/test_cucontract.jl +++ b/test/test_cucontract.jl @@ -198,8 +198,8 @@ using ITensors, a_only_inds = [Index(2) for ii in 1:7] b_only_inds = [Index(2) for ii in 1:7] shared_inds = [Index(2) for ii in 1:7] - A = randomITensor(IndexSet(vcat(a_only_inds, shared_inds))) - B = randomITensor(IndexSet(vcat(b_only_inds, shared_inds))) + A = randomITensor(vcat(a_only_inds, shared_inds)) + B = randomITensor(vcat(b_only_inds, shared_inds)) cA = cuITensor(A) cB = cuITensor(B) inds_a = vcat(a_only_inds, shared_inds) diff --git a/test/test_cudense.jl b/test/test_cudense.jl index ef01eaf..34d4952 100644 --- a/test/test_cudense.jl +++ b/test/test_cudense.jl @@ -1,9 +1,9 @@ -using ITensors, - ITensorGPU, - LinearAlgebra, # For tr() - Combinatorics, # For permutations() - CUDA, - Test +using CUDA +using Combinatorics: permutations +using ITensors +using ITensorGPU +using LinearAlgebra: tr +using Test: @test, @testset # gpu tests! 
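In the rewritten `test/runtests.jl` above, the GPU test files only run when `CUDA.has_cuda()` is true, and `CUDA.allowscalar(false)` is set first so that accidental element-by-element access of GPU arrays fails loudly instead of silently falling back to very slow scalar indexing. A small illustration of that setting, as a sketch (not part of the test suite):

```julia
using CUDA

CUDA.allowscalar(false)
x = CUDA.rand(Float32, 10)
sum(x)     # fine: executes as a single GPU reduction
# x[1]     # would now throw a scalar-indexing error instead of running element by element
```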
@testset "cuITensor, Dense{$SType} storage" for SType in (Float64, ComplexF64) @@ -14,19 +14,6 @@ using ITensors, l = Index(ml, "l") a = Index(ma, "a") indices = [i, j, k, l, a] - @testset "Test add CuDense" begin - A = [SType(1.0) for ii in 1:dim(i), jj in 1:dim(j)] - dA = ITensorGPU.CuDense{SType,CuVector{SType,ITensorGPU.default_buffertype()}}( - SType(1.0), dim(i) * dim(j) - ) - B = [SType(2.0) for ii in 1:dim(i), jj in 1:dim(j)] - dB = ITensorGPU.CuDense{SType,CuVector{SType,ITensorGPU.default_buffertype()}}( - SType(2.0), dim(i) * dim(j) - ) - dC = +(dA, IndexSet(i, j), dB, IndexSet(j, i)) - hC = collect(dC) - @test collect(A + B) ≈ hC - end @testset "Test2 add CuDense" begin for i1 in indices, i2 in indices i1 == i2 && continue @@ -44,20 +31,6 @@ using ITensors, @test B ≈ cpu(cuB) #check does operation `+=`` modify cuB end end - - @testset "Test subtract CuDense" begin - A = [SType(1.0) for ii in 1:dim(i), jj in 1:dim(j)] - dA = ITensorGPU.CuDense{SType,CuVector{SType,ITensorGPU.default_buffertype()}}( - SType(1.0), dim(i) * dim(j) - ) - B = [SType(2.0) for ii in 1:dim(i), jj in 1:dim(j)] - dB = ITensorGPU.CuDense{SType,CuVector{SType,ITensorGPU.default_buffertype()}}( - SType(2.0), dim(i) * dim(j) - ) - dC = -(dA, IndexSet(i, j), dB, IndexSet(i, j)) - hC = collect(dC) - @test A - B ≈ hC - end @testset "Test2 subtract CuDense" begin for i1 in indices, i2 in indices i1 == i2 && continue @@ -76,43 +49,4 @@ using ITensors, #end end end - @testset "Test permute CuDense" begin - A = [SType(ii * jj) for ii in 1:dim(i), jj in 1:dim(j)] - dA = ITensorGPU.CuDense{SType,CuVector{SType,ITensorGPU.default_buffertype()}}( - NDTensors.Dense(vec(A)) - ) - B = [SType(0.0) for ii in 1:dim(j), jj in 1:dim(j)] - dB = ITensorGPU.CuDense{SType,CuVector{SType,ITensorGPU.default_buffertype()}}( - SType(0.0), dim(i) * dim(j) - ) - dC = permute!(dB, IndexSet(j, i), dA, IndexSet(i, j)) - hC = cpu(dC) - @test transpose(A) == hC - end - @testset "Test move CuDense on/off GPU" begin - A = [SType(1.0) for ii in 1:dim(i), jj in 1:dim(j)] - dA = ITensorGPU.CuDense{SType,CuVector{SType,ITensorGPU.default_buffertype()}}( - NDTensors.Dense(vec(A)) - ) - dB = convert(NDTensors.Dense{SType,Vector{SType}}, dA) - @test NDTensors.data(dB) == vec(A) - end - @testset "Test basic CuDense features" begin - @test NDTensors.Dense{SType,CuVector{SType,ITensorGPU.default_buffertype()}}(10) isa - ITensorGPU.CuDense{SType} - @test complex(NDTensors.Dense{SType,CuVector{SType}}) == - NDTensors.Dense{complex(SType),CuVector{complex(SType)}} - end - if SType == Float64 - @testset "Test CuDense complex" begin - A = CUDA.rand(SType, dim(i) * dim(j)) - dA = ITensorGPU.CuDense{SType,CuVector{SType,ITensorGPU.default_buffertype()}}(A) - dC = complex(dA) - @test typeof(dC) !== typeof(dA) - cdC = CuArray(dC) - hC = collect(cdC) - ccA = complex.(A) - @test hC == collect(ccA) - end - end end # End Dense storage test diff --git a/test/test_cudiag.jl b/test/test_cudiag.jl index c618d44..70dda0c 100644 --- a/test/test_cudiag.jl +++ b/test/test_cudiag.jl @@ -1,10 +1,10 @@ -using ITensors, - ITensors.NDTensors, - ITensorGPU, - LinearAlgebra, # For tr() - Combinatorics, # For permutations() - CUDA, - Test +using ITensors +using ITensors.NDTensors: NDTensors +using ITensorGPU +using LinearAlgebra: tr +using Combinatorics: permutations +using CUDA +using Test: @test, @test_broken, @testset @testset "cuITensor $T Contractions" for T in (Float64, ComplexF64) mi, mj, mk, ml, ma = 2, 3, 4, 5, 6, 7 @@ -22,31 +22,34 @@ using ITensors, Ajl = 
cuITensor(randomITensor(T, j, l)) Akl = cuITensor(randomITensor(T, k, l)) Dv = rand(T, mi) - D = itensor(ITensors.tensor(NDTensors.Diag(CuVector(Dv)), IndexSet(i, i'))) + D = itensor(ITensors.tensor(NDTensors.Diag(CuVector(Dv)), (i, i'))) Ev = rand(T, mi) - E = itensor(ITensors.tensor(NDTensors.Diag(CuVector(Ev)), IndexSet(i, i''))) + E = itensor(ITensors.tensor(NDTensors.Diag(CuVector(Ev)), (i, i''))) @testset "Test contract cuITensors (Matrix*Diag -> Matrix)" begin C = Aij * D - @test collect(CuArray(C)) ≈ collect(CuMatrix(Aij, j, i)) * diagm(0 => Dv) + @test_broken collect(CuArray(C)) ≈ collect(CuMatrix(Aij, j, i)) * diagm(0 => Dv) end @testset "Test contract cuDiagITensors (Diag*Diag -> Diag)" begin - C = E * D - cC = CuArray(C) - @test collect(cC) ≈ diagm(0 => Ev) * diagm(0 => Dv) + @test_broken E * D + ## C = E * D + ## cC = CuArray(C) + ## @test collect(cC) ≈ diagm(0 => Ev) * diagm(0 => Dv) end @testset "Test contract cuDiagITensors (UniformDiag*Diag -> Diag)" begin - scal = itensor(ITensors.tensor(NDTensors.Diag(2.0), IndexSet(i, i''))) - C = scal * D - @test collect(CuArray(C)) ≈ 2.0 .* diagm(0 => Dv) - C = D * scal - @test collect(CuArray(C)) ≈ 2.0 .* diagm(0 => Dv) + scal = itensor(ITensors.tensor(NDTensors.Diag(2.0), (i, i''))) + @test_broken scal * D + ## C = scal * D + ## @test collect(CuArray(C)) ≈ 2.0 .* diagm(0 => Dv) + ## C = D * scal + ## @test collect(CuArray(C)) ≈ 2.0 .* diagm(0 => Dv) end @testset "Test contract cuITensors (Matrix*UniformDiag -> Matrix)" begin - scal = itensor(ITensors.tensor(NDTensors.Diag(T(2.0)), IndexSet(i, i'))) - C = scal * Aij - @test cpu(C) ≈ 2.0 * cpu(replaceind(Aij, i, i')) atol = 1e-4 - C = Aij * scal - @test_broken cpu(C) ≈ 2.0 * cpu(replaceind(permute(Aij, j, i), i, i')) atol = 1e-4 + scal = itensor(ITensors.tensor(NDTensors.Diag(T(2.0)), (i, i'))) + @test_broken scal * Aij + ## C = scal * Aij + ## @test cpu(C) ≈ 2.0 * cpu(replaceind(Aij, i, i')) atol = 1e-4 + ## C = Aij * scal + ## @test_broken cpu(C) ≈ 2.0 * cpu(replaceind(permute(Aij, j, i), i, i')) atol = 1e-4 end end # End contraction testset end @@ -65,37 +68,39 @@ end Aji = cuITensor(randomITensor(T1, j, i)) Bij = cuITensor(randomITensor(T1, i, j)) Dv = rand(T2, mi) - D = itensor(ITensors.tensor(NDTensors.Diag(CuVector(Dv)), IndexSet(i, i'))) + D = itensor(ITensors.tensor(NDTensors.Diag(CuVector(Dv)), (i, i'))) Ev = rand(T2, mi) - E = itensor(ITensors.tensor(NDTensors.Diag(CuVector(Ev)), IndexSet(i, i''))) + E = itensor(ITensors.tensor(NDTensors.Diag(CuVector(Ev)), (i, i''))) @testset "Test contract cuITensors (Matrix*Diag -> Matrix)" begin C = Aij * D - cC = CuArray(C) - @test collect(cC) ≈ collect(CuMatrix(Aij, j, i)) * diagm(0 => Dv) + @test_broken CuArray(C) + ## cC = CuArray(C) + ## @test collect(cC) ≈ collect(CuMatrix(Aij, j, i)) * diagm(0 => Dv) end @testset "Test contract cuDiagITensors (Diag*Diag -> Diag)" begin - C = E * D - cC = CuArray(C) - @test collect(cC) ≈ diagm(0 => Ev) * diagm(0 => Dv) - end - @testset "Test contract cuDiagITensors (UniformDiag*Diag -> Diag)" begin - scal = itensor(ITensors.tensor(NDTensors.Diag(T2(2.0)), IndexSet(i, i''))) - C = scal * D - cC = CuArray(C) - @test collect(cC) ≈ 2.0 .* diagm(0 => Dv) - C = D * scal - cC = CuArray(C) - @test collect(cC) ≈ 2.0 .* diagm(0 => Dv) - end - @testset "Test contract cuITensors (Matrix*UniformDiag -> Matrix)" begin - scal = itensor(ITensors.tensor(NDTensors.Diag(T2(2.0)), IndexSet(i, i'))) - C = scal * Aij - cC = CuArray(C) - @test collect(cC) ≈ array(2.0 * cpu(replaceind(Aij, i, i'))) atol = 1e-4 - C = Aij 
* scal - cC = CuArray(C) - @test_broken collect(cC) ≈ array(2.0 * cpu(replaceind(permute(Aij, j, i), i, i'))) atol = - 1e-4 + @test_broken E * D + ## C = E * D + ## cC = CuArray(C) + ## @test collect(cC) ≈ diagm(0 => Ev) * diagm(0 => Dv) end + ## @testset "Test contract cuDiagITensors (UniformDiag*Diag -> Diag)" begin + ## scal = itensor(ITensors.tensor(NDTensors.Diag(T2(2.0)), (i, i''))) + ## C = scal * D + ## cC = CuArray(C) + ## @test collect(cC) ≈ 2.0 .* diagm(0 => Dv) + ## C = D * scal + ## cC = CuArray(C) + ## @test collect(cC) ≈ 2.0 .* diagm(0 => Dv) + ## end + ## @testset "Test contract cuITensors (Matrix*UniformDiag -> Matrix)" begin + ## scal = itensor(ITensors.tensor(NDTensors.Diag(T2(2.0)), (i, i'))) + ## C = scal * Aij + ## cC = CuArray(C) + ## @test collect(cC) ≈ array(2.0 * cpu(replaceind(Aij, i, i'))) atol = 1e-4 + ## C = Aij * scal + ## cC = CuArray(C) + ## @test_broken collect(cC) ≈ array(2.0 * cpu(replaceind(permute(Aij, j, i), i, i'))) atol = + ## 1e-4 + ## end end # End contraction testset end diff --git a/test/test_cuitensor.jl b/test/test_cuitensor.jl index 886ad6d..030561e 100644 --- a/test/test_cuitensor.jl +++ b/test/test_cuitensor.jl @@ -16,15 +16,13 @@ using ITensors, a = Index(ma, "a") @testset "Constructor" begin A = cuITensor(one(SType), i, j, k) - @test collect(CuArray(A, i, j, k)) == ones(SType, dim(i), dim(j), dim(k)) - A = randomCuITensor(IndexSet(i, j, k)) - @test inds(A) == IndexSet(i, j, k) - @test ITensorGPU.storage(A) isa ITensorGPU.CuDense + @test_broken collect(CuArray(A, i, j, k)) == ones(SType, dim(i), dim(j), dim(k)) + A = randomCuITensor((i, j, k)) + @test inds(A) == (i, j, k) Aarr = rand(SType, dim(i) * dim(j) * dim(k)) @test cpu(ITensor(Aarr, i, j, k)) == cpu(cuITensor(Aarr, i, j, k)) @test cuITensor(SType, i, j, k) isa ITensor - @test storage(cuITensor(SType, i, j, k)) isa ITensorGPU.CuDense{SType} - @test vec(collect(CuArray(ITensor(Aarr, i, j, k), i, j, k))) == Aarr + @test_broken vec(collect(CuArray(ITensor(Aarr, i, j, k), i, j, k))) == Aarr end @testset "Test permute(cuITensor,Index...)" begin CA = randomCuITensor(SType, i, k, j) @@ -40,7 +38,7 @@ using ITensors, end @testset "Test permute(cuITensor,Index...) 
for large tensors" begin inds = [Index(2) for ii in 1:14] - A = randomITensor(SType, IndexSet(inds)) + A = randomITensor(SType, (inds)) CA = cuITensor(A) for shuffle_count in 1:20 perm_inds = shuffle(inds) @@ -60,17 +58,17 @@ using ITensors, @testset "Test CuVector(cuITensor)" begin v = CuVector(ones(SType, dim(a))) A = cuITensor(v, a) - @test v == CuVector(A) + @test_broken v == CuVector(A) end @testset "Test CuMatrix(cuITensor)" begin v = CuMatrix(ones(SType, dim(a), dim(l))) A = cuITensor(vec(v), a, l) - @test v == CuMatrix(A, a, l) + @test_broken v == CuMatrix(A, a, l) A = cuITensor(vec(v), a, l) - @test v == CuMatrix(A) + @test_broken v == CuMatrix(A) A = cuITensor(vec(v), a, l) - @test v == CuArray(A, a, l) - @test v == CuArray(A) + @test_broken v == CuArray(A, a, l) + @test_broken v == CuArray(A) end @testset "Test norm(cuITensor)" begin A = randomCuITensor(SType, i, j, k) @@ -80,7 +78,7 @@ using ITensors, @testset "Test complex(cuITensor)" begin A = randomCuITensor(SType, i, j, k) cA = complex(A) - @test complex.(CuArray(A)) == CuArray(cA) + @test_broken complex.(CuArray(A)) == CuArray(cA) end #@testset "Test exp(cuITensor)" begin # A = randomCuITensor(SType,i,i') @@ -92,7 +90,7 @@ using ITensors, A = cpu(dA) B = cpu(dB) C = cpu(dA + dB) - @test CuArray(permute(C, i, j, k)) == + @test_broken CuArray(permute(C, i, j, k)) == CuArray(permute(A, i, j, k)) + CuArray(permute(B, i, j, k)) for ii in 1:dim(i), jj in 1:dim(j), kk in 1:dim(k) @test C[i => ii, j => jj, k => kk] == @@ -124,7 +122,7 @@ using ITensors, ii = Index(4) jj = Index(4) S = Diagonal(s) - T = cuITensor(vec(CuArray(U*S*V')),IndexSet(ii,jj)) + T = cuITensor(vec(CuArray(U*S*V')),(ii,jj)) (U,S,V) = svd(T,ii;maxm=2) @test norm(U*S*V-T)≈sqrt(s[3]^2+s[4]^2) end=# diff --git a/test/test_cumps.jl b/test/test_cumps.jl index 1ebe28b..3d62d08 100644 --- a/test/test_cumps.jl +++ b/test/test_cumps.jl @@ -96,25 +96,6 @@ using ITensors, ITensorGPU, Test @test_throws DimensionMismatch inner(phi, badpsi) end - @testset "inner same MPS" begin - psi = randomMPS(sites) - psidag = dag(deepcopy(psi)) - ITensors.prime_linkinds!(psidag) - psipsi = psidag[1] * psi[1] - for j in 2:N - psipsi = psipsi * psidag[j] * psi[j] - end - @test psipsi[] ≈ inner(psi, psi) - psi = randomCuMPS(sites) - psidag = dag(deepcopy(psi)) - ITensors.prime_linkinds!(psidag) - psipsi = psidag[1] * psi[1] - for j in 2:N - psipsi = psipsi * psidag[j] * psi[j] - end - @test psipsi[] ≈ inner(psi, psi) - end - @testset "add MPS" begin psi = randomMPS(sites) phi = deepcopy(psi) @@ -133,16 +114,16 @@ using ITensors, ITensorGPU, Test psi = randomCuMPS(sites) orthogonalize!(psi, N - 1) - @test ITensors.leftlim(psi) == N - 2 - @test ITensors.rightlim(psi) == N + @test_broken ITensors.leftlim(psi) == N - 2 + @test_broken ITensors.rightlim(psi) == N orthogonalize!(psi, 2) - @test ITensors.leftlim(psi) == 1 - @test ITensors.rightlim(psi) == 3 + @test_broken ITensors.leftlim(psi) == 1 + @test_broken ITensors.rightlim(psi) == 3 psi = randomCuMPS(sites) psi.rlim = N + 1 # do this to test qr from rightmost tensor orthogonalize!(psi, div(N, 2)) - @test ITensors.leftlim(psi) == div(N, 2) - 1 - @test ITensors.rightlim(psi) == div(N, 2) + 1 + @test_broken ITensors.leftlim(psi) == div(N, 2) - 1 + @test_broken ITensors.rightlim(psi) == div(N, 2) + 1 #@test_throws ErrorException linkind(MPS(N, fill(cuITensor(), N), 0, N + 1), 1) @@ -154,24 +135,24 @@ using ITensors, ITensorGPU, Test replacebond!(psi, 1, phi) @test tags(linkind(psi, 1)) == bondindtags - # check that replaceBond! 
updates llim_ and rlim_ properly - orthogonalize!(psi, 5) - phi = psi[5] * psi[6] - replacebond!(psi, 5, phi; ortho="left") - @test ITensors.leftlim(psi) == 5 - @test ITensors.rightlim(psi) == 7 - - phi = psi[5] * psi[6] - replacebond!(psi, 5, phi; ortho="right") - @test ITensors.leftlim(psi) == 4 - @test ITensors.rightlim(psi) == 6 - - psi.llim = 3 - psi.rlim = 7 - phi = psi[5] * psi[6] - replacebond!(psi, 5, phi; ortho="left") - @test ITensors.leftlim(psi) == 3 - @test ITensors.rightlim(psi) == 7 + ## # check that replaceBond! updates llim_ and rlim_ properly + ## orthogonalize!(psi, 5) + ## phi = psi[5] * psi[6] + ## replacebond!(psi, 5, phi; ortho="left") + ## @test ITensors.leftlim(psi) == 5 + ## @test ITensors.rightlim(psi) == 7 + + ## phi = psi[5] * psi[6] + ## replacebond!(psi, 5, phi; ortho="right") + ## @test ITensors.leftlim(psi) == 4 + ## @test ITensors.rightlim(psi) == 6 + + ## psi.llim = 3 + ## psi.rlim = 7 + ## phi = psi[5] * psi[6] + ## replacebond!(psi, 5, phi; ortho="left") + ## @test ITensors.leftlim(psi) == 3 + ## @test ITensors.rightlim(psi) == 7 end end @@ -196,8 +177,8 @@ end M = basicRandomCuMPS(N) orthogonalize!(M, c) - @test ITensors.leftlim(M) == c - 1 - @test ITensors.rightlim(M) == c + 1 + @test_broken ITensors.leftlim(M) == c - 1 + @test_broken ITensors.rightlim(M) == c + 1 # Test for left-orthogonality L = M[1] * prime(M[1], "Link") @@ -226,7 +207,7 @@ end M = basicRandomCuMPS(N; dim=10) M0 = copy(M) truncate!(M; maxdim=5) - @test ITensors.rightlim(M) == 2 + @test_broken ITensors.rightlim(M) == 2 # Test for right-orthogonality R = M[N] * prime(M[N], "Link") r = linkind(M, N - 1) @@ -239,37 +220,3 @@ end @test inner(M0, M) > 0.1 end end - -#=@testset "Other MPS methods" begin - - @testset "sample! method" begin - N = 10 - sites = [Index(3,"Site,n=$n") for n=1:N] - psi = makeRandomCuMPS(sites,chi=3) - nrm2 = inner(psi,psi) - psi[1] *= (1.0/sqrt(nrm2)) - - s = sample!(psi) - - @test length(s) == N - for n=1:N - @test 1 <= s[n] <= 3 - end - - # Throws becase not orthogonalized to site 1: - orthogonalize!(psi,3) - @test_throws ErrorException sample(psi) - - # Throws becase not normalized - orthogonalize!(psi,1) - psi[1] *= (5.0/norm(psi[1])) - @test_throws ErrorException sample(psi) - - # Works when ortho & normalized: - orthogonalize!(psi,1) - psi[1] *= (1.0/norm(psi[1])) - s = sample(psi) - @test length(s) == N - end - -end=# diff --git a/test/test_cutruncate.jl b/test/test_cutruncate.jl deleted file mode 100644 index 1afcfb9..0000000 --- a/test/test_cutruncate.jl +++ /dev/null @@ -1,62 +0,0 @@ -using ITensors, - ITensorGPU, - LinearAlgebra, # For tr() - CUDA, - Test - -# gpu tests! 
-@testset "cutrunctate" begin - @testset for T in (Float32, Float64) - @test ITensorGPU.truncate!(CUDA.zeros(T, 10)) == (zero(T), zero(T), CUDA.zeros(T, 1)) - trunc = ITensorGPU.truncate!( - CuArray(T[1.0, 0.5, 0.4, 0.1, 0.05]); absoluteCutoff=true, cutoff=T(0.2) - ) - @test trunc[1] ≈ T(0.15) - @test trunc[2] ≈ T(0.25) - @test Array(trunc[3]) == T[1.0, 0.5, 0.4] - - trunc = ITensorGPU.truncate!( - CuArray(T[1.0, 0.5, 0.4, 0.1, 0.05]); absoluteCutoff=true, maxdim=3 - ) - @test trunc[1] ≈ T(0.15) - @test trunc[2] ≈ T(0.25) - @test Array(trunc[3]) == T[1.0, 0.5, 0.4] - - trunc = ITensorGPU.truncate!( - CuArray(T[1.0, 0.5, 0.4, 0.1, 0.05]); absoluteCutoff=true, maxdim=3, cutoff=T(0.07) - ) - @test trunc[1] ≈ T(0.15) - @test trunc[2] ≈ T(0.25) - @test Array(trunc[3]) == T[1.0, 0.5, 0.4] - - trunc = ITensorGPU.truncate!( - CuArray(T[0.4, 0.26, 0.19, 0.1, 0.05]); relativeCutoff=true, cutoff=T(0.2) - ) - @test trunc[1] ≈ T(0.15) - @test trunc[2] ≈ T(0.145) - @test Array(trunc[3]) == T[0.4, 0.26, 0.19] - - trunc = ITensorGPU.truncate!( - CuArray(T[0.4, 0.26, 0.19, 0.1, 0.05]); relativeCutoff=true, maxdim=3 - ) - @test trunc[1] ≈ T(0.15) - @test trunc[2] ≈ T(0.145) - @test Array(trunc[3]) == T[0.4, 0.26, 0.19] - - trunc = ITensorGPU.truncate!( - CuArray(T[0.4, 0.26, 0.19, 0.1, 0.05]); relativeCutoff=true, maxdim=3, cutoff=T(0.07) - ) - @test trunc[1] ≈ T(0.15) - @test trunc[2] ≈ T(0.145) - @test Array(trunc[3]) == T[0.4, 0.26, 0.19] - - trunc = ITensorGPU.truncate!( - CuArray(convert(Vector{T}, [0.4, 0.26, 0.19, 0.1, 0.05] / 2)); - relativeCutoff=true, - cutoff=T(0.2), - ) - @test trunc[1] ≈ T(0.15) - @test trunc[2] ≈ T(0.145 / 2) - @test Array(trunc[3]) == convert(Vector{T}, [0.4, 0.26, 0.19] / 2) - end -end # End truncate test diff --git a/test/test_pastaq.jl b/test/test_pastaq.jl deleted file mode 100644 index 5c74085..0000000 --- a/test/test_pastaq.jl +++ /dev/null @@ -1,73 +0,0 @@ -using Test -using ITensors -using ITensorGPU -using PastaQ -using Zygote -using OptimKit - -function ising_model(n; J=1.0, h) - H = OpSum() - for j in 1:n - if j < n - H -= J, "Z", j, "Z", j + 1 - end - H -= h, "X", j - end - return H -end - -Ry(θ) = [("Ry", j, (θ=θ[j],)) for j in 1:length(θ)] -CNOT(n) = [("CNOT", j, j + 1) for j in 1:(n - 1)] -function U(θ) - nlayers = length(θ) - Uθ = Tuple[] - for l in 1:(nlayers - 1) - Uθ = [Uθ; [Ry(θ[l]); CNOT(length(θ[l]))]] - end - Uθ = [Uθ; Ry(θ[nlayers])] - return Uθ -end - -function f(θ, ψ; kwargs...) - i = siteinds(ψ) - ψθ = runcircuit(i, U(θ); kwargs...) - return 1 - abs(inner(ψ, ψθ)) -end - -function f_∇f(f, θ, ψ; kwargs...) - fθ, (∇fθ,) = withgradient(θ -> f(θ, ψ; kwargs...), θ) - return fθ, ∇fθ -end - -@testset "PastaQ runcircuit on $device with element type $eltype" for device in (cpu, cu), - eltype in (Float32, Float64) - - n = 4 - h = 0.5 - cutoff = 1e-5 - gradtol = 1e-4 - maxdim = 10 - maxiter = 30 - nlayers = 4 - i = siteinds("Qubit", n) - ψ = device(eltype, MPS(i, j -> isodd(j) ? "0" : "1")) - H = device(eltype, MPO(ising_model(n; h), i)) - _, ψ = dmrg(H, ψ; nsweeps=10, cutoff, maxdim, outputlevel=0) - θ = [zeros(eltype, n) for l in 1:nlayers] - (θ,) = optimize( - θ -> f_∇f(f, θ, ψ; cutoff, maxdim, device, eltype), - θ, - LBFGS(; verbosity=0, maxiter, gradtol), - ) - ψθ = runcircuit(i, U(θ); cutoff, maxdim, device, eltype) - energy_reference = inner(ψ', H, ψ) - energy_opt = inner(ψθ', H, ψθ) - is_device(x, device) = device == cu ? 
ITensorGPU.is_cu(x) : !ITensorGPU.is_cu(x) - @test is_device(H, device) - @test is_device(ψ, device) - @test is_device(ψθ, device) - @test ITensors.scalartype(H) <: eltype - @test ITensors.scalartype(ψ) <: eltype - @test ITensors.scalartype(ψθ) <: eltype - @test inner(ψ', H, ψ) ≈ inner(ψθ', H, ψθ) atol = 1e-3 -end