diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 86ffdd7..25d50ae 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -19,7 +19,7 @@ jobs: matrix: version: - '1.6' - - '1.7' + - '1' os: - ubuntu-latest - macOS-latest @@ -35,6 +35,8 @@ jobs: - uses: julia-actions/cache@v1 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 + env: + JULIA_NUM_THREADS: 2 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v2 with: diff --git a/Project.toml b/Project.toml index 1750ad0..f32c8f8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,20 @@ name = "ITensorParallel" uuid = "0fccfcd2-f061-4985-9740-339d3f86bfce" authors = ["Matthew Fishman and contributors"] -version = "0.0.1" +version = "0.1.0" [deps] +Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Folds = "41a02a25-b8f0-4f67-bc48-60067656b558" ITensors = "9136182c-28ba-11e9-034c-db9fb085ebd5" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" [compat] -ITensors = "0.2, 0.3" -MPI = "0.19, 0.20" +Accessors = "0.1.26" +Folds = "0.2.8" +ITensors = "0.3.27" +MPI = "0.20" julia = "1.6" [extras] diff --git a/README.md b/README.md index 247e10f..530d103 100644 --- a/README.md +++ b/README.md @@ -8,102 +8,12 @@ # Overview -This package is for experimenting with adding more shared and distributed memory parallelism, for example implementing the techniques laid out in the paper https://arxiv.org/abs/2103.09976. +This package adds more shared and distributed memory parallelism to [ITensors.jl](https://github.com/ITensor/ITensors.jl), for example implementing the techniques for nested parallelization in DMRG laid out in the paper [arXiv:2103.09976](https://arxiv.org/abs/2103.09976). So far, it focuses on parallelizing over optimizing or evolving sums of tensor networks. We plan to add real-space parallel DMRG, TDVP, and TEBD based on [arXiv:1301.3494](https://arxiv.org/abs/1301.3494). -We will explore multithreaded and distributed parallelism over real-space parallel DMRG, TDVP, and TEBD based on https://arxiv.org/abs/1301.3494, as well as multithreaded and distributed parallelism over sums of Hamiltonian terms in DMRG and TDVP. +For multithreading, we are using Julia's standard Threads.jl library, as well as convenient abstractions on top of that for parallelizing over maps and reductions provided by [Folds.jl](https://github.com/juliafolds/folds.jl) and [FLoops.jl](https://github.com/JuliaFolds/FLoops.jl). See [here](https://juliafolds.github.io/data-parallelism/tutorials/quick-introduction/) for a nice overview of parallelization in Julia. -For multithreading, we are using Julia's standard Threads.jl library, and possibly convenient abstractions on top of that provided by [Folds.jl](https://github.com/juliafolds/folds.jl) and [FLoops.jl](https://github.com/JuliaFolds/FLoops.jl). See [here](https://juliafolds.github.io/data-parallelism/tutorials/quick-introduction/) for a nice overview of the different options. +For distributed computing, we make use of Julia's standard [Distributed.jl](https://docs.julialang.org/en/v1/manual/distributed-computing/) library along with its interface through [Folds.jl](https://github.com/juliafolds/folds.jl) and [FLoops.jl](https://github.com/JuliaFolds/FLoops.jl), as well as [MPI.jl](https://juliaparallel.github.io/MPI.jl/latest/). 
Take a look at Julia'd documentation on [distributed computing](https://docs.julialang.org/en/v1/manual/distributed-computing/) for more information and background. -For distributed computing, we will explore Julia's standard [Distributed.jl](https://docs.julialang.org/en/v1/manual/distributed-computing/) library, as well as [MPI.jl](https://juliaparallel.github.io/MPI.jl/latest/). - -To run Distributed.jl-based computations on clusters, we will explore using Julia's cluster manager tools like [ClusterManagers.jl](https://github.com/JuliaParallel/ClusterManagers.jl), [SlurmClusterManager.jl](https://github.com/kleinhenz/SlurmClusterManager.jl), and [MPIClusterManagers.jl](https://github.com/JuliaParallel/MPIClusterManagers.jl). - -# Running on clusters - - -## Option 1: `Distributed.jl` -Here are detailed instructions for running a minimal "hello world" example parallelized with Julia's Distributed.jl standard library, distributed over nodes of a cluster. - -1. Start by [downloading the latest version of Julia](https://julialang.org/downloads/) or loading a pre-installed version of Julia, for example with `module load julia`. You can follow more detailed instruction for installing your own version of Julia on a cluster [here](https://itensor.github.io/ITensors.jl/stable/getting_started/Installing.html). -2. Start Julia by executing the command `julia` at the command line. This should bring up the interactive Julia REPL. From there, you should install either [ClusterManagers.jl](https://github.com/JuliaParallel/ClusterManagers.jl) or [SlurmClusterManager.jl](https://github.com/kleinhenz/SlurmClusterManager.jl) if your cluster uses Slurm as the cluster management and job scheduling system (ClusterManagers.jl supports Slurm, but SlurmClusterManager.jl has a more specialized implementation). This would look something like this: -```julia -$ julia - _ - _ _ _(_)_ | Documentation: https://docs.julialang.org - (_) | (_) (_) | - _ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help. - | | | | | | |/ _` | | - | | |_| | | | (_| | | Version 1.7.2 (2022-02-06) - _/ |\__'_|_|_|\__'_| | Official https://julialang.org/ release -|__/ | - -julia> using Pkg - -julia> Pkg.add("SlurmClusterManager") - Updating registry at `~/.julia/registries/General` - Updating git-repo `https://github.com/JuliaRegistries/General.git` - Resolving package versions... - Installed SlurmClusterManager ─ v0.1.2 - Updating `~/.julia/environments/v1.7/Project.toml` - [c82cd089] + SlurmClusterManager v0.1.2 - Updating `~/.julia/environments/v1.7/Manifest.toml` - [c82cd089] + SlurmClusterManager v0.1.2 -Precompiling project... - 1 dependency successfully precompiled in 2 seconds (456 already precompiled, 2 skipped during auto due to previous errors) -``` -Now the cluster manager (either `ClusterManagers.jl` or `SlurmClusterManager.jl`, whichever you installed) will be available system-wide for you to use and aid you in running your Distributed.jl-based parallelized Julia code across nodes of your cluster. - -3. Create a file somewhere within your home directory on the cluster called `hello_world.jl` with the contents: -```julia -#!/usr/bin/env julia - -using Distributed, SlurmClusterManager -addprocs(SlurmManager()) -@everywhere println("hello from $(myid()):$(gethostname())") -``` -4. Submit your script to the work queue of your cluster, for example with `sbatch` if the cluster you are on uses Slurm: -``` -$ sbatch -N 2 --ntasks-per-node=64 hello_world.jl -``` -This will execute the code on two nodes using 64 workers per node. 
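As a minimal sketch of the Folds.jl pattern adopted in `src/foldssum.jl` further down in this diff, here is the same sum-of-terms reduction applied to plain matrices instead of MPO terms; switching the executor argument switches between threaded and distributed execution. All names in this sketch (`As`, `v`, `Av_threaded`, `Av_distributed`) are illustrative and not part of the package.

```julia
# Illustrative sketch only, not part of this diff.
using Distributed
addprocs(2)
@everywhere using Folds
using LinearAlgebra

n = 4
As = [randn(n, n) for _ in 1:3]  # terms of the sum
v = randn(n)

# Threaded reduction over the terms of the sum.
Av_threaded = Folds.sum(A -> A * v, As, ThreadedEx())

# Distributed reduction over the terms of the sum,
# executed on the worker processes.
Av_distributed = Folds.sum(A -> A * v, As, DistributedEx())

# Both should agree with summing the terms first.
@show norm(Av_threaded - sum(As) * v)
@show norm(Av_distributed - sum(As) * v)
```

The `FoldsSum` type added below uses exactly this `Folds.sum(f, terms, executor)` call, with `ProjMPO` terms in place of the matrices.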
- -We will add similar instructions on running the same "hello world" example using MPI.jl, and additionally running linear algebra and ITensor operations in parallel with both Distributed.jl and MPI.jl. - - -## Option 2: `MPI.jl` - -1. Start by [downloading the latest version of Julia](https://julialang.org/downloads/) or loading a pre-installed version of Julia, for example with `module load julia`. -2. Load an MPI installation, e.g. `module load openmpi`. -3. Install `MPI.jl` (e.g. from the command line: `julia --project -e 'using Pkg; Pkg.add("MPI")'`). -4. Make sure `MPI.jl` is pointing to the correct `MPI` installation by running the following: - ``` - julia --project -e 'ENV["JULIA_MPI_BINARY"]="system"; using Pkg; Pkg.build("MPI"; verbose=true)' - ``` - Make sure the version of MPI in the output of this command matches the one you wanted to load. -5. Run a test job: - ```julia - # 01-hello.jl - using MPI - MPI.Init() - - comm = MPI.COMM_WORLD - println("Hello world, I am $(MPI.Comm_rank(comm)) of $(MPI.Comm_size(comm))") - MPI.Barrier(comm) - ``` - using `mpirun -np 4 julia 01-hello.jl` -6. The correct output should look something like: - ```julia - Hello world, I am 2 of 4 - Hello world, I am 0 of 4 - Hello world, I am 3 of 4 - Hello world, I am 1 of 4 - ``` - -If you see the following warning, go back to step 4 and make sure, `MPI.jl` is pointing to the correct MPI installation: - -``` -┌ Warning: You appear to have run julia under a different `mpiexec` than the one used by MPI.jl. -│ See the documentation for details. -└ @ MPI ~/.julia/packages/MPI/08SPr/src/environment.jl:38 -``` +To run Distributed.jl-based computations on clusters, we recommend using Julia's cluster manager tools like [ClusterManagers.jl](https://github.com/JuliaParallel/ClusterManagers.jl), [SlurmClusterManager.jl](https://github.com/kleinhenz/SlurmClusterManager.jl), and [MPIClusterManagers.jl](https://github.com/JuliaParallel/MPIClusterManagers.jl). +See the [examples folder](https://github.com/ITensor/ITensorParallel.jl/tree/main/examples) for examples of running DMRG parallelized over sums of Hamiltonians, using Threads.jl, Distributed.jl, and MPI.jl. diff --git a/examples/01_parallel_mpo_sum_2d_hubbard_conserve_momentum.jl b/examples/01_parallel_mpo_sum_2d_hubbard_conserve_momentum.jl index 109bd99..9a32178 100644 --- a/examples/01_parallel_mpo_sum_2d_hubbard_conserve_momentum.jl +++ b/examples/01_parallel_mpo_sum_2d_hubbard_conserve_momentum.jl @@ -17,15 +17,23 @@ ITensors.Strided.disable_threads() """ Run with: ```julia -# No blocksparse multithreading -main(; Nx=8, Ny=4, maxdim=1000, Sum=ThreadedSum); -main(; Nx=8, Ny=4, maxdim=1000, Sum=DistributedSum); -main(; Nx=8, Ny=4, maxdim=1000, Sum=SequentialSum); - -# Blocksparse multithreading -main(; Nx=8, Ny=4, maxdim=1000, Sum=ThreadedSum, threaded_blocksparse=true); -main(; Nx=8, Ny=4, maxdim=1000, Sum=DistributedSum, threaded_blocksparse=true); -main(; Nx=8, Ny=4, maxdim=1000, Sum=SequentialSum, threaded_blocksparse=true); +# Sequential sum over MPOs. +# Uses the default `Sum=SequentialSum`. +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000); +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, threaded_blocksparse=true); + +# Threaded sum over MPOs. +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, Sum=ThreadedSum); +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, Sum=ThreadedSum, threaded_blocksparse=true); + +# Distributed sum over MPOs, where terms of the MPO +# sum and their environments are stored, updated, +# and applied remotely on a worker process. 
+main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, Sum=DistributedSum); +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, Sum=DistributedSum, threaded_blocksparse=true); + +# Using write-to-disk. +main(; Nx=8, Ny=4, maxdim=1000, Sum=DistributedSum, disk=true, threaded_blocksparse=true); ``` """ function main(; @@ -33,28 +41,24 @@ function main(; Ny::Int, U::Float64=4.0, t::Float64=1.0, + nsweeps=10, maxdim::Int=3000, conserve_ky=true, seed=1234, npartitions=2Ny, - Sum, + Sum=SequentialSum, threaded_blocksparse=false, + disk=false, + random_init=false, in_partition=ITensorParallel.default_in_partition, ) - Random.seed!(seed) @show Threads.nthreads() - # TODO: Use `ITensors.enable_threaded_blocksparse(threaded_blocksparse)` - if threaded_blocksparse - ITensors.enable_threaded_blocksparse() - else - ITensors.disable_threaded_blocksparse() - end + ITensors.enable_threaded_blocksparse(threaded_blocksparse) @show ITensors.using_threaded_blocksparse() N = Nx * Ny - nsweeps = 10 max_maxdim = maxdim maxdim = min.([100, 200, 400, 800, 2000, 3000, max_maxdim], max_maxdim) cutoff = 1e-6 @@ -68,42 +72,44 @@ function main(; ℋ = hubbard(; Nx=Nx, Ny=Ny, t=t, U=U, ky=true) ℋs = partition(ℋ, npartitions; in_partition) - H = [MPO(ℋ, sites) for ℋ in ℋs] + Hs = [MPO(ℋ, sites) for ℋ in ℋs] - @show maxlinkdim.(H) + @show maxlinkdim.(Hs) # Number of structural nonzero elements in a bulk # Hamiltonian MPO tensor - @show nnz(H[1][end ÷ 2]) - @show nnzblocks(H[1][end ÷ 2]) - - # Create start state - state = Vector{String}(undef, N) - for i in 1:N - x = (i - 1) ÷ Ny - y = (i - 1) % Ny - if x % 2 == 0 - if y % 2 == 0 - state[i] = "Up" - else - state[i] = "Dn" - end - else - if y % 2 == 0 - state[i] = "Dn" - else - state[i] = "Up" - end - end + @show nnz(Hs[1][end ÷ 2]) + @show nnzblocks(Hs[1][end ÷ 2]) + + # Create starting state with checkerboard + # pattern + state = map(CartesianIndices((Ny, Nx))) do I + return iseven(I[1]) ⊻ iseven(I[2]) ? "↓" : "↑" + end + display(state) + + if random_init + # Only available in ITensors 0.3.27 + # Helps make results reproducible when comparing + # sequential vs. threaded. + itensor_rng = Xoshiro() + Random.seed!(itensor_rng, seed) + psi0 = randomMPS(itensor_rng, sites, state; linkdims=10) + else + psi0 = MPS(sites, state) end - psi0 = randomMPS(sites, state; linkdims=10) + mpo_sum = Sum(Hs) + if disk + # Write-to-disk + mpo_sum = ITensors.disk(mpo_sum) + end - energy, psi = @time dmrg(Sum(H), psi0; nsweeps, maxdim, cutoff, noise) + energy, psi = @time dmrg(mpo_sum, psi0; nsweeps, maxdim, cutoff, noise) @show Nx, Ny @show t, U @show flux(psi) @show maxlinkdim(psi) @show energy - return energy, H, psi + return energy, psi end diff --git a/examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl b/examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl index 56409f0..f86efc1 100644 --- a/examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl +++ b/examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl @@ -12,10 +12,11 @@ ITensors.Strided.disable_threads() """ Run at the command line with 4 processes: ```julia -mpiexecjl -n 2 julia 02_mpi_run.jl --Nx 8 --Ny 4 --maxdim 20 +mpiexecjl -n 2 julia -t2 02_mpi_run.jl --Nx 8 --Ny 4 --nsweeps 10 --maxdim 1000 -# Currently is broken! 
-mpiexecjl -n 2 julia -t2 02_mpi_run.jl --Nx 8 --Ny 4 --maxdim 20 --threaded_blocksparse true +mpiexecjl -n 2 julia -t2 02_mpi_run.jl --Nx 8 --Ny 4 --nsweeps 10 --maxdim 1000 --threaded_blocksparse true + +mpiexecjl -n 2 julia -t2 02_mpi_run.jl --Nx 8 --Ny 4 --nsweeps 10 --maxdim 1000 --disk true --threaded_blocksparse true ``` """ function main(; @@ -23,31 +24,22 @@ function main(; Ny::Int, U::Float64=4.0, t::Float64=1.0, + nsweeps=10, maxdim::Int=3000, conserve_ky=true, seed=1234, threaded_blocksparse=false, + disk=false, + random_init=false, in_partition=ITensorParallel.default_in_partition, ) - # It is very important to set both of these RNG seeds! - # This ensures the same state and the same ITensor indices - # are made in each process - Random.seed!(seed) - Random.seed!(index_id_rng(), seed) - @show Threads.nthreads() - # TODO: Use `ITensors.enable_threaded_blocksparse(threaded_blocksparse)` - if threaded_blocksparse - ITensors.enable_threaded_blocksparse() - else - ITensors.disable_threaded_blocksparse() - end + ITensors.enable_threaded_blocksparse(threaded_blocksparse) @show ITensors.using_threaded_blocksparse() N = Nx * Ny - nsweeps = 10 max_maxdim = maxdim maxdim = min.([100, 200, 400, 800, 2000, 3000, max_maxdim], max_maxdim) cutoff = 1e-6 @@ -60,34 +52,41 @@ function main(; # Create a lazy representation of the Hamiltonian ℋ = hubbard(; Nx, Ny, t, U, ky=true) - # Create start state - state = Vector{String}(undef, N) - for i in 1:N - x = (i - 1) ÷ Ny - y = (i - 1) % Ny - if x % 2 == 0 - if y % 2 == 0 - state[i] = "Up" - else - state[i] = "Dn" - end - else - if y % 2 == 0 - state[i] = "Dn" - else - state[i] = "Up" - end - end + # Create starting state with checkerboard + # pattern + state = map(CartesianIndices((Ny, Nx))) do I + return iseven(I[1]) ⊻ iseven(I[2]) ? "↓" : "↑" end - sites = siteinds("ElecK", N; conserve_qns=true, conserve_ky, modulus_ky=Ny) - psi0 = randomMPS(sites, state; linkdims=10) + display(state) MPI.Init() + + sites = siteinds("ElecK", N; conserve_qns=true, conserve_ky, modulus_ky=Ny) + sites = MPI.bcast(sites, 0, MPI.COMM_WORLD) + + if random_init + # Only available in ITensors 0.3.27 + # Helps make results reproducible when comparing + # sequential vs. threaded. 
+ itensor_rng = Xoshiro() + Random.seed!(itensor_rng, seed) + psi0 = randomMPS(itensor_rng, sites, state; linkdims=10) + else + psi0 = MPS(sites, state) + end + psi0 = MPI.bcast(psi0, 0, MPI.COMM_WORLD) + nprocs = MPI.Comm_size(MPI.COMM_WORLD) ℋs = partition(ℋ, nprocs; in_partition) which_proc = MPI.Comm_rank(MPI.COMM_WORLD) + 1 - PH = MPISum(ProjMPO(MPO(ℋs[which_proc], sites))) - energy, psi = @time dmrg(PH, psi0; nsweeps, maxdim, cutoff, noise) + mpo_sum_term = MPISumTerm(MPO(ℋs[which_proc], sites), MPI.COMM_WORLD) + + if disk + # Write-to-disk + mpo_sum_term = ITensors.disk(mpo_sum_term) + end + + energy, psi = @time dmrg(mpo_sum_term, psi0; nsweeps, maxdim, cutoff, noise) @show Nx, Ny @show t, U diff --git a/examples/02_mpi_run.jl b/examples/02_mpi_run.jl index 024f8af..55eb31e 100644 --- a/examples/02_mpi_run.jl +++ b/examples/02_mpi_run.jl @@ -2,7 +2,7 @@ include("02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl") # Run with: -# mpiexecjl -n 2 julia 02_mpi_run.jl --Nx 8 --Ny 4 --maxdim 1000 +# mpiexecjl -n 2 julia 02_mpi_run.jl --Nx 8 --Ny 4 --nsweeps 10 --maxdim 1000 using ArgParse function parse_commandline() @@ -16,10 +16,18 @@ function parse_commandline() help = "Cylinder width" arg_type = Int required = true + "--nsweeps" + help = "Number of sweeps" + arg_type = Int + required = true "--maxdim" help = "Maximum bond dimension" arg_type = Int required = true + "--disk" + help = "Write-to-disk" + arg_type = Bool + default = false "--threaded_blocksparse" help = "Use threaded block sparse operations" arg_type = Bool @@ -31,6 +39,8 @@ args = parse_commandline() main(; Nx=args["Nx"], Ny=args["Ny"], + nsweeps=args["nsweeps"], maxdim=args["maxdim"], + disk=args["disk"], threaded_blocksparse=args["threaded_blocksparse"], ); diff --git a/examples/broken/03_distributed_mpo_sum_2d_heisenberg.jl b/examples/broken/03_distributed_mpo_sum_2d_heisenberg.jl deleted file mode 100644 index aecea28..0000000 --- a/examples/broken/03_distributed_mpo_sum_2d_heisenberg.jl +++ /dev/null @@ -1,69 +0,0 @@ -using Distributed - -rmprocs(setdiff(procs(), 1)) -addprocs(4) -@show nprocs() - -using Random -@everywhere using ITensors -@everywhere using ITensorParallel - -npartitions = 4 - -Sum = DistributedSum -# Sum = ThreadedSum -# Sum = SequentialSum - -@show Sum -@show npartitions -@show Threads.nthreads() - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() - -# Turn block sparse multithreading on or off -ITensors.disable_threaded_blocksparse() - -Nx, Ny = 8, 4 -N = Nx * Ny - -sites = siteinds("S=1/2", N; conserve_qns=true) - -lattice = square_lattice(Nx, Ny; yperiodic=false) - -function heisenberg_hamiltonian(lattice) - ℋ = OpSum() - for b in lattice - ℋ += 0.5, "S+", b.s1, "S-", b.s2 - ℋ += 0.5, "S-", b.s1, "S+", b.s2 - ℋ += "Sz", b.s1, "Sz", b.s2 - end - return ℋ -end -ℋ = heisenberg_hamiltonian(lattice) - -# Define how the Hamiltonian will be partitioned -# into a sum of sub-Hamiltonians. -# function in_partition(sites::Tuple{Int,Int}, p, nparts) -# i, j = sites -# return p == mod1(i, nparts) -# end - -ℋs = partition(ℋ, npartitions) -H = [MPO(ℋ, sites) for ℋ in ℋs] -@show maxlinkdim.(H) -PH = Sum(H) - -state = [isodd(n) ? 
"Up" : "Dn" for n in 1:N] -Random.seed!(1234) -psi0 = randomMPS(sites, state; linkdims=10) - -nsweeps = 10 -maxdim = [20, 60, 100, 100, 200, 400, 800] -cutoff = 1e-8 -@show nsweeps -@show maxdim -@show cutoff - -energy, psi = dmrg(PH, psi0; nsweeps, maxdim, cutoff) -@show energy diff --git a/examples/deprecated/01_threaded_mpo_sum_2d_heisenberg.jl b/examples/deprecated/01_threaded_mpo_sum_2d_heisenberg.jl deleted file mode 100644 index be99237..0000000 --- a/examples/deprecated/01_threaded_mpo_sum_2d_heisenberg.jl +++ /dev/null @@ -1,61 +0,0 @@ -using ITensors -using ITensorParallel -using Random - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() - -# Turn block sparse multithreading on or off -ITensors.disable_threaded_blocksparse() - -npartitions = 2 - -# Threaded or sequential sum of MPOs -Sum = ThreadedSum -# Sum = SequentialSum - -@show Threads.nthreads() -@show npartitions - -Nx, Ny = 10, 5 -N = Nx * Ny - -sites = siteinds("S=1/2", N; conserve_qns=true) - -lattice = square_lattice(Nx, Ny; yperiodic=false) - -function heisenberg_hamiltonian(lattice) - ℋ = OpSum() - for b in lattice - ℋ += 0.5, "S+", b.s1, "S-", b.s2 - ℋ += 0.5, "S-", b.s1, "S+", b.s2 - ℋ += "Sz", b.s1, "Sz", b.s2 - end - return ℋ -end -ℋ = heisenberg_hamiltonian(lattice) - -# Define how the Hamiltonian will be partitioned -# into a sum of sub-Hamiltonians. -function in_partition(sites::Tuple{Int,Int}, p, nparts) - i, j = sites - return p == mod1(i, nparts) -end - -ℋs = partition(ℋ, npartitions; in_partition) -H = [MPO(ℋ, sites) for ℋ in ℋs] -PH = Sum(H) - -state = [isodd(n) ? "Up" : "Dn" for n in 1:N] -Random.seed!(1234) -psi0 = randomMPS(sites, state; linkdims=10) - -nsweeps = 10 -maxdim = [20, 60, 100, 100, 200, 400, 800] -cutoff = 1e-6 -@show nsweeps -@show maxdim -@show cutoff - -energy, psi = dmrg(PH, psi0; nsweeps, maxdim, cutoff) -@show energy diff --git a/examples/deprecated/02_mpi_mpo_sum_2d_heisenberg.jl b/examples/deprecated/02_mpi_mpo_sum_2d_heisenberg.jl deleted file mode 100644 index 4a01733..0000000 --- a/examples/deprecated/02_mpi_mpo_sum_2d_heisenberg.jl +++ /dev/null @@ -1,65 +0,0 @@ -using MPI -using ITensors -using ITensorParallel -using Random - -MPI.Init() - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() -ITensors.disable_threaded_blocksparse() - -seed = 1234 -Random.seed!(seed) - -mpi_projmpo = true - -Nx, Ny = 4, 1 -N = Nx * Ny - -sites = siteinds("S=1/2", N; conserve_qns=true) - -lattice = square_lattice(Nx, Ny; yperiodic=false) - -function heisenberg_hamiltonian(lattice) - ℋ = OpSum() - for b in lattice - ℋ += 0.5, "S+", b.s1, "S-", b.s2 - ℋ += 0.5, "S-", b.s1, "S+", b.s2 - ℋ += "Sz", b.s1, "Sz", b.s2 - end - return ℋ -end -ℋ = heisenberg_hamiltonian(lattice) - -# Define how the Hamiltonian will be partitioned -# into a sum of sub-Hamiltonians. -function in_partition(sites::Tuple{Int,Int}, p, nparts) - i, j = sites - return p == mod1(i, nparts) -end - -nprocs = MPI.Comm_size(MPI.COMM_WORLD) -ℋs = partition(ℋ, nprocs; in_partition=in_partition) - -PH = if mpi_projmpo - n = MPI.Comm_rank(MPI.COMM_WORLD) + 1 - MPISum(ProjMPO(MPO(ℋs[n], sites))) -else - ProjMPOSum(H) -end - -state = [isodd(n) ? 
"Up" : "Dn" for n in 1:N] -psi0 = randomMPS(sites, state; linkdims=10) - -nsweeps = 10 -maxdim = [20, 60, 100, 100, 200, 400, 800] -cutoff = 1e-10 -@show nsweeps -@show maxdim -@show cutoff - -energy, psi = dmrg(PH, psi0; nsweeps, maxdim, cutoff) -@show energy - -MPI.Finalize() diff --git a/examples/deprecated/04_mpi_threaded_mpo_sum_2d_heisenberg.jl b/examples/deprecated/04_mpi_threaded_mpo_sum_2d_heisenberg.jl deleted file mode 100644 index 5856176..0000000 --- a/examples/deprecated/04_mpi_threaded_mpo_sum_2d_heisenberg.jl +++ /dev/null @@ -1,63 +0,0 @@ -using MPI -using ITensors -using ITensorParallel -using Random - -MPI.Init() - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() -ITensors.disable_threaded_blocksparse() - -seed = 1234 -Random.seed!(seed) - -Nx, Ny = 8, 1 -N = Nx * Ny - -sites = siteinds("S=1/2", N; conserve_qns=true) - -lattice = square_lattice(Nx, Ny; yperiodic=false) - -function heisenberg_hamiltonian(lattice) - opsum = OpSum() - for b in lattice - opsum += 0.5, "S+", b.s1, "S-", b.s2 - opsum += 0.5, "S-", b.s1, "S+", b.s2 - opsum += "Sz", b.s1, "Sz", b.s2 - end - return opsum -end -opsum = heisenberg_hamiltonian(lattice) - -# Define how the Hamiltonian will be partitioned -# into a sum of sub-Hamiltonians. -function in_partition(sites::Tuple{Int,Int}, p, nparts) - i, j = sites - return p == mod1(i, nparts) -end - -# Number of partitions -nparts = 4 - -nprocs = MPI.Comm_size(MPI.COMM_WORLD) -nparts_per_proc = nparts ÷ nprocs -opsums = collect( - Iterators.partition(partition(opsum, nparts; in_partition=in_partition), nparts_per_proc) -) - -n = MPI.Comm_rank(MPI.COMM_WORLD) + 1 -PH = MPISum(ThreadedSum([MPO(opsum, sites) for opsum in opsums[n]])) - -state = [isodd(n) ? "Up" : "Dn" for n in 1:N] -psi0 = randomMPS(sites, state, 10) - -sweeps = Sweeps(10) -setmaxdim!(sweeps, 20, 60, 100, 100, 200, 400, 800) -setcutoff!(sweeps, 1E-10) -@show sweeps - -energy, psi = dmrg(PH, psi0, sweeps) -@show energy - -MPI.Finalize() diff --git a/examples/distributed_itensormap_tutorial/01_simple_spawnat_matvec.jl b/examples/distributed_itensormap_tutorial/01_simple_spawnat_matvec.jl deleted file mode 100644 index 325a6f0..0000000 --- a/examples/distributed_itensormap_tutorial/01_simple_spawnat_matvec.jl +++ /dev/null @@ -1,49 +0,0 @@ -using Distributed - -# Apply a sum of matrices to a vector in -# parallel using multi-processing. -# The vector exists on every process, but -# each matrix in the sum only exists on -# a single process. - -# Set the number of processors Julia will use -# @everywhere denotes that it will be defined -# on every worker. -@everywhere nprocs = 2 -addprocs(nprocs) - -# Define the vector dimension. -# @everywhere denotes that it will be defined -# on every worker. -@everywhere n = 4 - -# Define the vector we will apply the set -# of matrices to. -# @everywhere denotes that it will be defined -# on every worker. -@everywhere v = randn(n) - -# Define the matrices in the matrix sum -# that will be applied in parallel to the -# vector `v`. -# The first matrix is defined on worker 1, -# the second one on worker 2. -A1 = @spawnat workers()[1] randn(n, n) -A2 = @spawnat workers()[2] randn(n, n) - -# Define the individual matrix-vector multiplication -# operations on the appropriate worker. -# Alternatively, you can use: -# -# A1v = @spawnat :any fetch(A1) * v -# -# for Julia to choose the worker automatically. 
-A1v = @spawnat workers()[1] fetch(A1) * v -A2v = @spawnat workers()[2] fetch(A2) * v - -# Actually perform the matrix-vector multiplications, -# fetching the results to the main worker for -# local usage. -Av = fetch(A1v) + fetch(A2v) - -@show norm((fetch(A1) + fetch(A2)) * v - Av) diff --git a/examples/distributed_itensormap_tutorial/02_matvec_distributed_loop.jl b/examples/distributed_itensormap_tutorial/02_matvec_distributed_loop.jl deleted file mode 100644 index 2c08ef5..0000000 --- a/examples/distributed_itensormap_tutorial/02_matvec_distributed_loop.jl +++ /dev/null @@ -1,38 +0,0 @@ -using Distributed -using LinearAlgebra - -BLAS.set_num_threads(1) - -# Apply a sum of distributed matrices and reduce -# after applying to a vector. - -# Number of matrices in our sum. -nmats = 2 - -# Specify the number of processors Julia will use. -# For now we just say it is the same as the number of -# matrices in our sum. -addprocs(nmats) - -# Size of out matrices. -@everywhere n = 4 - -# Set of matrices in our sum, each created on a -# separate worker. -As = [@spawnat(workers()[nmat], randn(n, n)) for nmat in 1:nmats] - -# Vector we will apply to our matrices, distributed to -# every worker. -@everywhere v = randn(n) - -# Distributed matrix-vector multiplication for each distributed -# matrix in our sum, then reduced over with `+`. -Av = @distributed (+) for n in 1:nmats - # Need to call `fetch` to instantiate the matrix - # on the worker where it was defined above. - fetch(As[n]) * v -end - -# Test by fetching and summing locally -A = sum(fetch.(As)) -@show norm(Av - A * v) diff --git a/examples/distributed_itensormap_tutorial/03_distributedsum_matrixmap.jl b/examples/distributed_itensormap_tutorial/03_distributedsum_matrixmap.jl deleted file mode 100644 index ee44bbb..0000000 --- a/examples/distributed_itensormap_tutorial/03_distributedsum_matrixmap.jl +++ /dev/null @@ -1,38 +0,0 @@ -using Distributed - -# Number of matrices in the sum -nmats = 2 -addprocs(nmats) - -@everywhere begin - using LinearAlgebra - using KrylovKit - - BLAS.set_num_threads(1) - - include(joinpath("src", "DistributedSums.jl")) - include(joinpath("src", "MatrixMaps.jl")) - - # Matrix dimension - n = 100 - - # Starting vector that we will apply - # the matrix map to - v0 = randn(n) -end - -# Make a distributed sum of MatrixMaps, which is -# just a Matrix with special multiplication -# notation that can work with KrylovKit -A = DistributedSum(_ -> MatrixMap(randn(n, n)), nmats) - -Av0 = A(v0) - -@show norm(Av0 - sum(fetch.(A))(v0)) - -# Solve eigenvector equation -λ⃗, v⃗ = eigsolve(A, v0, 1, :LM) -λ = λ⃗[1] -v = v⃗[1] - -@show norm(A(v) - λ * v) diff --git a/examples/distributed_itensormap_tutorial/04_distributedsum_itensornetworkmap.jl b/examples/distributed_itensormap_tutorial/04_distributedsum_itensornetworkmap.jl deleted file mode 100644 index 531e272..0000000 --- a/examples/distributed_itensormap_tutorial/04_distributedsum_itensornetworkmap.jl +++ /dev/null @@ -1,62 +0,0 @@ -using Distributed - -# Number of ITensorNetworkMaps in the sum -nmats = 2 - -addprocs(nmats) - -# Set up for the calculation. Define everything -# in the `@everywhere` block that you need -# on all of the processes to perform the -# calculation. 
-@everywhere begin - using ITensors - using ITensors.ITensorNetworkMaps - using LinearAlgebra - using KrylovKit - - BLAS.set_num_threads(1) - ITensors.Strided.disable_threads() - ITensors.disable_threaded_blocksparse() - - include(joinpath("src", "DistributedSums.jl")) - - # Function that defines the tensor network - # that will map the input vector (i.e. - # a transfer matrix or effective Hamiltonian). - function itensor_map(i, j) - A1 = randomITensor(i', dag(i), dag(l)) - A2 = randomITensor(j', dag(j), l) - return ITensorNetworkMap([A1, A2]) - end - - d = 2 - χ = 3 - i = Index(d, "i") - j = Index(d, "j") - l = Index(χ, "l") - - v0 = randomITensor(i, j) -end - -# Defines an object representing a sum of -# linear maps (generalized matrices) built -# from contractions of ITensors that are -# distributed across `nmats` workers -# (each linear map lives on a worker). -A = DistributedSum(_ -> itensor_map(i, j), nmats) - -# Apply the linear map, which internally -# is performed in parallel where the linear -# map is created. -Av0 = A(v0) - -@show norm(Av0 - sum(fetch.(A))(v0)) - -# Solve an eigenvector equation in -# parallel. -λ⃗, v⃗ = eigsolve(A, v0, 1, :LM) -λ = λ⃗[1] -v = v⃗[1] - -@show norm(A(v) - λ * v) diff --git a/examples/distributed_itensormap_tutorial/README.md b/examples/distributed_itensormap_tutorial/README.md deleted file mode 100644 index af43340..0000000 --- a/examples/distributed_itensormap_tutorial/README.md +++ /dev/null @@ -1,8 +0,0 @@ -A short set of examples acting as a tutorial for performing -a distributed sum of matrix-vector multiplications for -use with a Krylov solver. - -Good references are: - -- https://docs.julialang.org/en/v1/manual/distributed-computing/ -- https://docs.julialang.org/en/v1/stdlib/Distributed/ diff --git a/examples/distributed_itensormap_tutorial/src/DistributedSums.jl b/examples/distributed_itensormap_tutorial/src/DistributedSums.jl deleted file mode 100644 index 26e5b7c..0000000 --- a/examples/distributed_itensormap_tutorial/src/DistributedSums.jl +++ /dev/null @@ -1,26 +0,0 @@ -using Distributed - -struct DistributedSum{T} - data::Vector{T} -end - -# Construct from a distributed sum from a -# function that returns the elements of the sum. -function DistributedSum(f::Function, n::Integer) - # Assign the matrices to the first `n` workers. - return DistributedSum([@spawnat(workers()[ni], f(ni)) for ni in 1:n]) -end - -# Functions needed for @distributed loop and reduction -Base.length(A::DistributedSum) = length(A.data) -Base.firstindex(A::DistributedSum) = firstindex(A.data) -Base.lastindex(A::DistributedSum) = lastindex(A.data) -Base.getindex(A::DistributedSum, args...) = getindex(A.data, args...) -Base.iterate(A::DistributedSum, args...) = iterate(A.data, args...) 
- -# Apply the sum -function (A::DistributedSum)(v) - return @distributed (+) for An in A - fetch(An)(v) - end -end diff --git a/examples/distributed_itensormap_tutorial/src/MatrixMaps.jl b/examples/distributed_itensormap_tutorial/src/MatrixMaps.jl deleted file mode 100644 index 5e808c1..0000000 --- a/examples/distributed_itensormap_tutorial/src/MatrixMaps.jl +++ /dev/null @@ -1,6 +0,0 @@ -# Wrapper of Matrix that is callable as `A(v)` -struct MatrixMap{T} - data::Matrix{T} -end -(A::MatrixMap)(v) = A.data * v -Base.:+(A::MatrixMap, B::MatrixMap) = MatrixMap(A.data + B.data) diff --git a/examples/hubbard_conserve_momentum/electronk.jl b/examples/hubbard_conserve_momentum/electronk.jl deleted file mode 100644 index 953caf5..0000000 --- a/examples/hubbard_conserve_momentum/electronk.jl +++ /dev/null @@ -1,67 +0,0 @@ -import ITensors: space, state, op, has_fermion_string - -function space( - ::SiteType"ElectronK", - n::Int; - conserve_qns=false, - conserve_sz=conserve_qns, - conserve_nf=conserve_qns, - conserve_nfparity=conserve_qns, - conserve_ky=false, - qnname_sz="Sz", - qnname_nf="Nf", - qnname_nfparity="NfParity", - qnname_ky="Ky", - modulus_ky=nothing, - # Deprecated - conserve_parity=nothing, -) - if !isnothing(conserve_parity) - conserve_nfparity = conserve_parity - end - if conserve_ky && conserve_sz && conserve_nf - mod = (n - 1) % modulus_ky - mod2 = (2 * mod) % modulus_ky - return [ - QN((qnname_nf, 0, -1), (qnname_sz, 0), (qnname_ky, 0, modulus_ky)) => 1 - QN((qnname_nf, 1, -1), (qnname_sz, 1), (qnname_ky, mod, modulus_ky)) => 1 - QN((qnname_nf, 1, -1), (qnname_sz, -1), (qnname_ky, mod, modulus_ky)) => 1 - QN((qnname_nf, 2, -1), (qnname_sz, 0), (qnname_ky, mod2, modulus_ky)) => 1 - ] - elseif conserve_ky - error("Cannot conserve ky without conserving sz and nf") - elseif conserve_sz && conserve_nf - return [ - QN((qnname_nf, 0, -1), (qnname_sz, 0)) => 1 - QN((qnname_nf, 1, -1), (qnname_sz, +1)) => 1 - QN((qnname_nf, 1, -1), (qnname_sz, -1)) => 1 - QN((qnname_nf, 2, -1), (qnname_sz, 0)) => 1 - ] - elseif conserve_nf - return [ - QN(qnname_nf, 0, -1) => 1 - QN(qnname_nf, 1, -1) => 2 - QN(qnname_nf, 2, -1) => 1 - ] - elseif conserve_sz - return [ - QN((qnname_sz, 0), (qnname_nfparity, 0, -2)) => 1 - QN((qnname_sz, +1), (qnname_nfparity, 1, -2)) => 1 - QN((qnname_sz, -1), (qnname_nfparity, 1, -2)) => 1 - QN((qnname_sz, 0), (qnname_nfparity, 0, -2)) => 1 - ] - elseif conserve_nfparity - return [ - QN(qnname_nfparity, 0, -2) => 1 - QN(qnname_nfparity, 1, -2) => 2 - QN(qnname_nfparity, 0, -2) => 1 - ] - end - return 4 -end - -state(statename::StateName, ::SiteType"ElectronK") = state(statename, SiteType("Electron")) -op(opname::OpName, ::SiteType"ElectronK") = op(opname, SiteType("Electron")) -function has_fermion_string(opname::OpName, ::SiteType"ElectronK") - return has_fermion_string(opname, SiteType("Electron")) -end diff --git a/examples/hubbard_conserve_momentum/hubbard_ky.jl b/examples/hubbard_conserve_momentum/hubbard_ky.jl deleted file mode 100644 index 387f000..0000000 --- a/examples/hubbard_conserve_momentum/hubbard_ky.jl +++ /dev/null @@ -1,79 +0,0 @@ -using ITensors - -function hubbard_ky_hopping(; Nx::Int, Ny::Int, t=1.0) - opsums = OpSum[] - - opsum = OpSum() - for x in 0:(Nx - 1) - for ky in 0:(Ny - 1) - i = x * Ny + ky + 1 - ϵ = -2 * t * cos((2 * π / Ny) * ky) - if abs(ϵ) > 1e-12 - opsum += ϵ, "n↑", i - opsum += ϵ, "n↓", i - end - end - end - push!(opsums, opsum) - - opsum = OpSum() - for x in 0:(Nx - 2) - for ky in 0:(Ny - 1) - i = x * Ny + ky + 1 - j = (x + 1) * Ny + 
ky + 1 - opsum += -t, "c†↑", i, "c↑", j - opsum += -t, "c†↑", j, "c↑", i - opsum += -t, "c†↓", i, "c↓", j - opsum += -t, "c†↓", j, "c↓", i - end - end - push!(opsums, opsum) - - return opsums -end - -# function hubbard_ky_interactions(; Nx::Int, Ny::Int, U) -# opsums = OpSum[] -# for py in 1:Ny -# for qy in 1:Ny -# opsum = OpSum() -# for x in 1:Nx -# for ky in 1:Ny -# i = (x - 1) * Ny + mod1(ky + qy + Ny - 1, Ny) -# j = (x - 1) * Ny + mod1(py - qy + Ny + 1, Ny) -# k = (x - 1) * Ny + py -# l = (x - 1) * Ny + ky -# opsum += U / Ny, "c†↓", i, "c†↑", j, "c↑", k, "c↓", l -# end -# end -# push!(opsums, opsum) -# end -# end -# return opsums -# end - -function hubbard_ky_interactions(; Nx::Int, Ny::Int, U) - opsums = OpSum[] - for py in 0:(Ny - 1) - for qy in 0:(Ny - 1) - opsum = OpSum() - for x in 0:(Nx - 1) - for ky in 0:(Ny - 1) - s1 = x * Ny + (ky + qy + Ny) % Ny + 1 - s2 = x * Ny + (py - qy + Ny) % Ny + 1 - s3 = x * Ny + py + 1 - s4 = x * Ny + ky + 1 - opsum += (U / Ny), "Cdagdn", s1, "Cdagup", s2, "Cup", s3, "Cdn", s4 - end - end - push!(opsums, opsum) - end - end - return opsums -end - -function hubbard_ky(; Nx::Int, Ny::Int, t=1.0, U) - opsum_hopping = hubbard_ky_hopping(; Nx, Ny, t) - opsum_interactions = hubbard_ky_interactions(; Nx, Ny, U) - return [opsum_hopping; opsum_interactions] -end diff --git a/examples/hubbard_conserve_momentum/mpi_dmrg.jl b/examples/hubbard_conserve_momentum/mpi_dmrg.jl deleted file mode 100644 index 132bbb6..0000000 --- a/examples/hubbard_conserve_momentum/mpi_dmrg.jl +++ /dev/null @@ -1,81 +0,0 @@ -using ITensors -using ITensorParallel -using MPI -using Random - -MPI.Init() - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() -ITensors.disable_threaded_blocksparse() - -include("electronk.jl") -include("hubbard_ky.jl") -include("split_vec.jl") - -U = 10.0 -Nx = 2 -Ny = 2 -N = Nx * Ny - -which_proc = MPI.Comm_rank(MPI.COMM_WORLD) + 1 - -@show which_proc, U, Nx, Ny -sites = siteinds("ElectronK", N; conserve_qns=true, conserve_ky=true, modulus_ky=Ny) - -opsums = hubbard_ky(; Nx, Ny, U) - -@show which_proc, length.(opsums) - -# TODO: randomly shuffle the MPOs -# TODO: only construct the MPOs that are needed by the current process -Hs = [splitblocks(linkinds, MPO(opsum, sites)) for opsum in opsums] - -@show which_proc, length(Hs) - -nprocs = MPI.Comm_size(MPI.COMM_WORLD) - -@show which_proc, nprocs - -Hss = split_vec(Hs, nprocs) - -@show which_proc, length.(Hss) -@show which_proc, [maxlinkdim.(Hs) for Hs in Hss] - -Hn = Hss[which_proc] - -@show which_proc, length(Hss[which_proc]) - -ITensors.checkflux.(Hss[which_proc]) -@show flux.(Hss[which_proc]) - -PH = MPISum(ProjMPOSum(Hss[which_proc])) - -init_state = Vector{String}(undef, N) -for i in 1:N - x = (i - 1) ÷ Ny - y = (i - 1) % Ny - if x % 2 == 0 - if y % 2 == 0 - init_state[i] = "Up" - else - init_state[i] = "Dn" - end - else - if y % 2 == 0 - init_state[i] = "Dn" - else - init_state[i] = "Up" - end - end -end - -Random.seed!(1234) - -psi0 = randomMPS(sites, init_state; linkdims=20) -ITensors.checkflux(psi0) -@show flux(psi0) -dmrg_kwargs = (nsweeps=10, maxdim=[10, 20, 50, 100], cutoff=1e-5, noise=1e-4) -energy, psi = @time dmrg(PH, psi0; dmrg_kwargs...) 
- -MPI.Finalize() diff --git a/examples/hubbard_conserve_momentum/split_vec.jl b/examples/hubbard_conserve_momentum/split_vec.jl deleted file mode 100644 index 8e68241..0000000 --- a/examples/hubbard_conserve_momentum/split_vec.jl +++ /dev/null @@ -1,18 +0,0 @@ -# References: -# https://github.com/JuliaML/MLUtils.jl: `MLUtils.chunk` -# https://discourse.julialang.org/t/split-vector-into-n-potentially-unequal-length-subvectors/73548 -# https://github.com/m3g/ChunkSplitters.jl -# https://github.com/JuliaFolds/SplittablesBase.jl -function split_vec(x::Vector, npartitions::Int) - partition_lengths = zeros(Int, npartitions) - which_part = 1 - for j in 1:length(x) - partition_lengths[which_part] += 1 - which_part = mod1(which_part + 1, npartitions) - end - partition_stops = [0; accumulate(+, partition_lengths)] - partition_ranges = [ - (partition_stops[j] + 1):partition_stops[j + 1] for j in 1:(length(partition_stops) - 1) - ] - return [x[partition_range] for partition_range in partition_ranges] -end diff --git a/examples/mpi_itensormap_tutorial/01_simple_matvec.jl b/examples/mpi_itensormap_tutorial/01_simple_matvec.jl deleted file mode 100644 index 6291b8c..0000000 --- a/examples/mpi_itensormap_tutorial/01_simple_matvec.jl +++ /dev/null @@ -1,27 +0,0 @@ -using MPI - -MPI.Init() - -comm = MPI.COMM_WORLD -N = 4 -root = 0 - -if MPI.Comm_rank(comm) == root - v = randn(N) -else - v = Vector{Float64}(undef, N) -end - -MPI.Bcast!(v, root, comm) - -M = randn(N, N) - -Mv = M * v - -@show Mv - -Mv_total = MPI.Reduce(Mv, +, root, comm) - -@show MPI.Comm_rank(comm), Mv_total - -MPI.Finalize() diff --git a/examples/mpi_itensormap_tutorial/03_mpisum_matrixmap.jl b/examples/mpi_itensormap_tutorial/03_mpisum_matrixmap.jl deleted file mode 100644 index 5076e34..0000000 --- a/examples/mpi_itensormap_tutorial/03_mpisum_matrixmap.jl +++ /dev/null @@ -1,15 +0,0 @@ -using MPI - -MPI.Init() - -include(joinpath("src", "MPISum.jl")) -include(joinpath("src", "MatrixMaps.jl")) - -n = 4 -v = ones(n) -A = MPISum(MatrixMap(randn(n, n))) -Av = A(v) - -@show MPI.Comm_rank(A.comm), Av - -MPI.Finalize() diff --git a/examples/mpi_itensormap_tutorial/src/MPISum.jl b/examples/mpi_itensormap_tutorial/src/MPISum.jl deleted file mode 100644 index d950678..0000000 --- a/examples/mpi_itensormap_tutorial/src/MPISum.jl +++ /dev/null @@ -1,11 +0,0 @@ -struct MPISum{T} - data::T - comm::MPI.Comm -end - -MPISum(data::T, comm=MPI.COMM_WORLD) where {T} = MPISum{T}(data, comm) - -# Apply the sum -function (A::MPISum)(v) - return MPI.Allreduce(A.data(v), +, A.comm) -end diff --git a/examples/mpi_itensormap_tutorial/src/MatrixMaps.jl b/examples/mpi_itensormap_tutorial/src/MatrixMaps.jl deleted file mode 100644 index 5e808c1..0000000 --- a/examples/mpi_itensormap_tutorial/src/MatrixMaps.jl +++ /dev/null @@ -1,6 +0,0 @@ -# Wrapper of Matrix that is callable as `A(v)` -struct MatrixMap{T} - data::Matrix{T} -end -(A::MatrixMap)(v) = A.data * v -Base.:+(A::MatrixMap, B::MatrixMap) = MatrixMap(A.data + B.data) diff --git a/src/ITensorParallel.jl b/src/ITensorParallel.jl index b2a7a94..d372f67 100644 --- a/src/ITensorParallel.jl +++ b/src/ITensorParallel.jl @@ -1,19 +1,36 @@ module ITensorParallel +using Accessors using Distributed using Folds using MPI using ITensors using ITensors.NDTensors +using ITensors: AbstractSum + import Base: eltype, length, size -import ITensors: product, position!, noiseterm, lproj, rproj, nsite, replaceind!, linkind +import ITensors: + disk, + linkind, + lproj, + noiseterm, + nsite, + orthogonalize!, + position!, + 
product, + replaceind!, + replacebond!, + rproj, + set_terms, + terms -include("mpi_extensions.jl") include("partition.jl") -include("parallelsum.jl") -include("mpisum.jl") +include("foldssum.jl") +include("distributedsum.jl") +include("mpi_extensions.jl") +include("mpisumterm.jl") -export DistributedSum, SequentialSum, ThreadedSum, MPISum, partition +export DistributedSum, SequentialSum, ThreadedSum, MPISumTerm, distribute, partition end diff --git a/src/broken/distributedsum.jl b/src/broken/distributedsum.jl deleted file mode 100644 index 7fc71fc..0000000 --- a/src/broken/distributedsum.jl +++ /dev/null @@ -1,62 +0,0 @@ -# WARNING: This is currently broken! -struct DistributedSum{T} - terms::Vector{T} -end -terms(sum::DistributedSum) = sum.terms - -function DistributedSum(f::Function, n::Integer) - # Assign the terms of the sum to the first `n` workers. - return DistributedSum([@spawnat(workers()[ni], f(ni)) for ni in 1:n]) -end - -nsite(sum::DistributedSum) = nsite(fetch(terms(sum)[1])) - -Base.length(sum::DistributedSum) = length(fetch(terms(sum)[1])) - -Base.getindex(sum::DistributedSum, n::Int) = getindex(terms(sum), n) -Base.setindex!(sum::DistributedSum, x, n::Int) = setindex!(terms(sum), x, n) - -function product(sum::DistributedSum, v::ITensor)::ITensor - # TODO: Use `Folds.sum`: - # Folds.sum(term -> fetch(term)(v), terms(sum), DistributedEx()) - return @distributed (+) for M in terms(sum) - fetch(M)(v) - end -end - -function Base.eltype(sum::DistributedSum) - elT = eltype(terms(sum)[1]) - for n in 2:length(terms(sum)) - elT = promote_type(elT, eltype(terms(sum)[n])) - end - return elT -end - -(sum::DistributedSum)(v::ITensor) = product(sum, v) - -Base.size(sum::DistributedSum) = size(terms(sum)[1]) - -function position!(sum::DistributedSum, psi::MPS, pos::Int) - # TODO: Try `Folds.map`, i.e.: - # new_terms = Folds.map(terms(sum), DistributedEx()) do term - # return @spawnat(worker(term), position!(fetch(term), psi, pos)) - # end - # return set_terms(sum, new_terms) - @distributed for n in 1:length(terms(sum)) - M_fetched = fetch(sum[n]) - M_fetched = position!(M_fetched, psi, pos) - sum[n] = @spawnat(workers()[n], M_fetched) - end - return sum -end - -## XXX: Implement this. -## function noiseterm(sum::DistributedSum, -## phi::ITensor, -## dir::String) -## nts = fill(ITensor(), Threads.nthreads()) -## Threads.@threads for M in terms(sum) -## nts[Threads.threadid()] += noiseterm(M, phi, dir) -## end -## return sum(nts) -## end diff --git a/src/distributedsum.jl b/src/distributedsum.jl new file mode 100644 index 0000000..bd8e3c3 --- /dev/null +++ b/src/distributedsum.jl @@ -0,0 +1,32 @@ +# Functionality for distributed terms of a sum +# onto seperate processes +function distribute(terms::Vector) + return map(term -> @spawnat(:any, term), terms) +end + +distribute(terms::Vector{Future}) = terms + +function distribute(terms::Vector{MPO}) + return distribute(ProjMPO.(terms)) +end + +function DistributedSum(terms::Vector; executor_kwargs...) + return FoldsSum(distribute(terms), SequentialEx(; executor_kwargs...)) +end + +function position!(term::Future, v::MPS, pos::Int) + return @spawnat term.where position!(fetch(term), v, pos) +end + +function product(term::Future, v::ITensor) + return @fetchfrom term.where fetch(term)(v) +end +(term::Future)(v::ITensor) = product(term, v) + +function noiseterm(term::Future, v::ITensor, dir::String) + return @fetchfrom term.where noiseterm(fetch(term), v, dir) +end + +function disk(term::Future; disk_kwargs...) 
+ return @spawnat term.where disk(fetch(term); disk_kwargs...) +end diff --git a/src/foldssum.jl b/src/foldssum.jl new file mode 100644 index 0000000..d47d7a3 --- /dev/null +++ b/src/foldssum.jl @@ -0,0 +1,78 @@ +struct FoldsSum{T,Ex} <: AbstractSum + terms::Vector{T} + executor::Ex +end + +## Accessors +terms(sum::FoldsSum) = sum.terms +executor(sum::FoldsSum) = sum.executor +set_terms(sum::FoldsSum, terms) = (@set sum.terms = terms) +set_executor(sum::FoldsSum, executor) = (@set sum.executor = executor) + +## Constructors +function FoldsSum{T,Ex}(terms::Vector; executor_kwargs...) where {T,Ex} + return FoldsSum(terms, Ex(; executor_kwargs...)) +end + +function FoldsSum{<:Any,Ex}(terms::Vector; executor_kwargs...) where {Ex} + return FoldsSum{eltype(terms),Ex}(terms; executor_kwargs...) +end + +# Default to `ThreadedEx` +function FoldsSum{T}(terms::Vector; executor_kwargs...) where {T} + return FoldsSum{T,ThreadedEx}(terms; executor_kwargs...) +end + +# Default to `ThreadedEx` +function FoldsSum(terms::Vector; executor_kwargs...) + return FoldsSum{eltype(terms)}(terms; executor_kwargs...) +end + +# Conversion from `MPO` +function FoldsSum{T,Ex}(mpos::Vector{MPO}; executor_kwargs...) where {T,Ex} + return FoldsSum{T,Ex}(T.(mpos); executor_kwargs...) +end +function FoldsSum{T,Ex}(mpos::MPO...; executor_kwargs...) where {T,Ex} + return FoldsSum{T,Ex}([mpos...]; executor_kwargs...) +end + +# Conversion from `MPO` +function FoldsSum{T}(mpos::Vector{MPO}; executor_kwargs...) where {T} + return FoldsSum{T}(T.(mpos); executor_kwargs...) +end +function FoldsSum{T}(mpos::MPO...; executor_kwargs...) where {T} + return FoldsSum{T}([mpos...]; executor_kwargs...) +end + +# Conversion from `MPO`, default to `ProjMPO` +function FoldsSum{<:Any,Ex}(mpos::Vector{MPO}; executor_kwargs...) where {Ex} + return FoldsSum{ProjMPO,Ex}(mpos; executor_kwargs...) +end +function FoldsSum{<:Any,Ex}(mpos::MPO...; executor_kwargs...) where {Ex} + return FoldsSum{<:Any,T}([mpos...]; executor_kwargs...) +end + +# Conversion from `MPO`, default to `ProjMPO` +function FoldsSum(mpos::Vector{MPO}; executor_kwargs...) + return FoldsSum{ProjMPO}(mpos; executor_kwargs...) +end +function FoldsSum(mpos::MPO...; executor_kwargs...) + return FoldsSum([mpos...]; executor_kwargs...) 
+end + +## Necessary operations +function product(sum::FoldsSum, v::ITensor) + return Folds.sum(term -> term(v), terms(sum), executor(sum)) +end + +function position!(sum::FoldsSum, v::MPS, pos::Int) + new_terms = Folds.map(term -> position!(term, v, pos), terms(sum), executor(sum)) + return set_terms(sum, new_terms) +end + +function noiseterm(sum::FoldsSum, v::ITensor, dir::String) + return Folds.sum(term -> noiseterm(term, v, dir), terms(sum), executor(sum)) +end + +const ThreadedSum{T} = FoldsSum{T,ThreadedEx} +const SequentialSum{T} = FoldsSum{T,SequentialEx} diff --git a/src/mpisum.jl b/src/mpisum.jl deleted file mode 100644 index ec98653..0000000 --- a/src/mpisum.jl +++ /dev/null @@ -1,110 +0,0 @@ -struct MPISum{T} - term::T - comm::MPI.Comm -end -term(sum::MPISum) = sum.term -comm(sum::MPISum) = sum.comm - -MPISum(term::T, comm=MPI.COMM_WORLD) where {T} = MPISum{T}(term, comm) - -MPISum(mpo::MPO, comm=MPI.COMM_WORLD) = MPISum(ProjMPO(mpo), comm) - -nsite(sum::MPISum) = nsite(term(sum)) - -length(sum::MPISum) = length(term(sum)) - -function product(sum::MPISum, v::ITensor) - return allreduce(term(sum)(v), +, comm(sum)) -end - -function eltype(sum::MPISum) - return eltype(term(sum)) -end - -(sum::MPISum)(v::ITensor) = product(sum, v) - -size(sum::MPISum) = size(term(sum)) - -function position!(sum::MPISum, psi::MPS, pos::Int) - makeL!(sum, psi, pos - 1) - makeR!(sum, psi, pos + nsite(sum)) - return sum -end - -function makeL!(sum::MPISum, psi::MPS, k::Int) - _makeL!(sum, psi, k) - return sum -end - -function makeR!(sum::MPISum, psi::MPS, k::Int) - _makeR!(sum, psi, k) - return sum -end - -function _makeL!(sum::MPISum, psi::MPS, k::Int) - # Save the last `L` that is made to help with caching - # for DiskProjMPO - sum_term = term(sum) - ll = sum_term.lpos - if ll ≥ k - # Special case when nothing has to be done. - # Still need to change the position if lproj is - # being moved backward. - sum_term.lpos = k - return nothing - end - # Make sure ll is at least 0 for the generic logic below - ll = max(ll, 0) - L = lproj(sum_term) - while ll < k - # sync the linkindex across processes - mylink = linkind(psi, ll + 1) - otherlink = MPI.bcast(mylink, 0, comm(sum)) - replaceind!(psi[ll + 1], mylink, otherlink) - replaceind!(psi[ll + 2], mylink, otherlink) - - L = L * psi[ll + 1] * sum_term.H[ll + 1] * dag(prime(psi[ll + 1])) - sum_term.LR[ll + 1] = L - ll += 1 - end - # Needed when moving lproj backward. - sum_term.lpos = k - return L -end - -function _makeR!(sum::MPISum, psi::MPS, k::Int) - # Save the last `R` that is made to help with caching - # for DiskProjMPO - sum_term = term(sum) - rl = sum_term.rpos - if rl ≤ k - # Special case when nothing has to be done. - # Still need to change the position if rproj is - # being moved backward. - sum_term.rpos = k - return nothing - end - N = length(sum_term.H) - # Make sure rl is no bigger than `N + 1` for the generic logic below - rl = min(rl, N + 1) - R = rproj(sum_term) - while rl > k - #sync linkindex across processes - mylink = linkind(psi, rl - 2) - otherlink = MPI.bcast(mylink, 0, comm(sum)) - replaceind!(psi[rl - 2], mylink, otherlink) - replaceind!(psi[rl - 1], mylink, otherlink) - - R = R * psi[rl - 1] * sum_term.H[rl - 1] * dag(prime(psi[rl - 1])) - sum_term.LR[rl - 1] = R - rl -= 1 - end - sum_term.rpos = k - return R -end - -function noiseterm(sum::MPISum, phi::ITensor, dir::String) - ##ToDo: I think the logic here is wrong. - ##The noiseterm should be calculated on the reduced P*v and then broadcasted? 
- return allreduce(noiseterm(term(sum), phi, dir), +, comm(sum)) -end diff --git a/src/mpisumterm.jl b/src/mpisumterm.jl new file mode 100644 index 0000000..59bb884 --- /dev/null +++ b/src/mpisumterm.jl @@ -0,0 +1,70 @@ +struct MPISumTerm{T} + term::T + comm::MPI.Comm +end +term(sumterm::MPISumTerm) = sumterm.term +comm(sumterm::MPISumTerm) = sumterm.comm + +set_term(sumterm::MPISumTerm, term) = (@set sumterm.term = term) +set_comm(sumterm::MPISumTerm, comm) = (@set sumterm.comm = comm) + +MPISumTerm(mpo::MPO, comm::MPI.Comm) = MPISumTerm(ProjMPO(mpo), comm) + +nsite(sumterm::MPISumTerm) = nsite(term(sumterm)) + +length(sumterm::MPISumTerm) = length(term(sumterm)) + +disk(sumterm::MPISumTerm) = set_term(sumterm, disk(term(sumterm))) + +function product(sumterm::MPISumTerm, v::ITensor) + return allreduce(term(sumterm)(v), +, comm(sumterm)) +end + +function eltype(sumterm::MPISumTerm) + return eltype(term(sumterm)) +end + +(sumterm::MPISumTerm)(v::ITensor) = product(sumterm, v) + +size(sumterm::MPISumTerm) = size(term(sumterm)) + +# Replace the bond and then broadcast one of the results +# to each process. This ensures that the link indices +# are consistent across processes and that the tensors +# match numerically. Floating point precision errors +# have been seen to build up enough to make their +# block sparse structure inconsistent, especially +# when multithreading block sparse operations are involved. +function replacebond!(sumterm::MPISumTerm, M::MPS, b::Int, v::ITensor; kwargs...) + spec = nothing + M_ortho_lims = nothing + if MPI.Comm_rank(comm(sumterm)) == 0 + spec = replacebond!(M, b, v; kwargs...) + M_ortho_lims = ortho_lims(M) + end + M_b1 = MPI.bcast(M[b], 0, comm(sumterm)) + M_b2 = MPI.bcast(M[b + 1], 0, comm(sumterm)) + M_ortho_lims = MPI.bcast(M_ortho_lims, 0, comm(sumterm)) + spec = MPI.bcast(spec, 0, comm(sumterm)) + M[b] = M_b1 + M[b + 1] = M_b2 + set_ortho_lims!(M, M_ortho_lims) + return spec +end + +function orthogonalize!(sumterm::MPISumTerm, v::MPS, pos::Int; kwargs...) + if MPI.Comm_rank(comm(sumterm)) == 0 + v = orthogonalize!(v, pos; kwargs...) + end + return MPI.bcast(v, 0, comm(sumterm)) +end + +function position!(sumterm::MPISumTerm, v::MPS, pos::Int) + return set_term(sumterm, position!(term(sumterm), v, pos)) +end + +function noiseterm(sumterm::MPISumTerm, v::ITensor, dir::String) + # TODO: Check if this logic is correct. + # Should the noiseterm instead be calculated on the reduced `sumterm(v)` and then broadcasted? + return allreduce(noiseterm(term(sumterm), v, dir), +, comm(sumterm)) +end diff --git a/src/parallelsum.jl b/src/parallelsum.jl deleted file mode 100644 index b0fd74a..0000000 --- a/src/parallelsum.jl +++ /dev/null @@ -1,118 +0,0 @@ -# TODO: Make `AbstractSum` subtype -struct ParallelSum{T,Ex} - terms::Vector{T} - executor::Ex -end - -terms(sum::ParallelSum) = sum.terms -executor(sum::ParallelSum) = sum.executor -set_terms(sum::ParallelSum, terms) = ParallelSum(terms, executor(sum)) -set_executor(sum::ParallelSum, executor) = ParallelSum(terms(sum), executor) - -function ParallelSum{T,Ex}(terms::Vector; executor_kwargs...) where {T,Ex} - return ParallelSum(terms, Ex(; executor_kwargs...)) -end - -function ParallelSum{<:Any,Ex}(terms::Vector; executor_kwargs...) where {Ex} - return ParallelSum{eltype(terms),Ex}(terms; executor_kwargs...) -end - -# Default to `ThreadedEx` -function ParallelSum{T}(terms::Vector; executor_kwargs...) where {T} - return ParallelSum{T,ThreadedEx}(terms; executor_kwargs...) 
-end - -# Default to `ThreadedEx` -function ParallelSum(terms::Vector; executor_kwargs...) - return ParallelSum{eltype(terms)}(terms; executor_kwargs...) -end - -# Conversion from `MPO` -function ParallelSum{T,Ex}(mpos::Vector{MPO}; executor_kwargs...) where {T,Ex} - return ParallelSum{T,Ex}(T.(mpos); executor_kwargs...) -end -function ParallelSum{T,Ex}(mpos::MPO...; executor_kwargs...) where {T,Ex} - return ParallelSum{T,Ex}([mpos...]; executor_kwargs...) -end - -# Conversion from `MPO` -function ParallelSum{T}(mpos::Vector{MPO}; executor_kwargs...) where {T} - return ParallelSum{T}(T.(mpos); executor_kwargs...) -end -function ParallelSum{T}(mpos::MPO...; executor_kwargs...) where {T} - return ParallelSum{T}([mpos...]; executor_kwargs...) -end - -# Conversion from `MPO`, default to `ProjMPO` -function ParallelSum{<:Any,Ex}(mpos::Vector{MPO}; executor_kwargs...) where {Ex} - return ParallelSum{ProjMPO,Ex}(mpos; executor_kwargs...) -end -function ParallelSum{<:Any,Ex}(mpos::MPO...; executor_kwargs...) where {Ex} - return ParallelSum{<:Any,T}([mpos...]; executor_kwargs...) -end - -# Conversion from `MPO`, default to `ProjMPO` -function ParallelSum(mpos::Vector{MPO}; executor_kwargs...) - return ParallelSum{ProjMPO}(mpos; executor_kwargs...) -end -function ParallelSum(mpos::MPO...; executor_kwargs...) - return ParallelSum([mpos...]; executor_kwargs...) -end - -# TODO: Replace with `AbstractSum` once we merge: -# https://github.com/ITensor/ITensors.jl/pull/1046 -nsite(sum::ParallelSum) = nsite(terms(sum)[1]) -length(sum::ParallelSum) = length(terms(sum)[1]) -function eltype(sum::ParallelSum) - elT = eltype(terms(sum)[1]) - for n in 2:length(terms(sum)) - elT = promote_type(elT, eltype(terms(sum)[n])) - end - return elT -end -(sum::ParallelSum)(v::ITensor) = product(sum, v) -size(sum::ParallelSum) = size(terms(sum)[1]) - -function product(sum::ParallelSum, v::ITensor) - return Folds.sum(term -> term(v), terms(sum), executor(sum)) -end - -function position!(sum::ParallelSum, psi::MPS, pos::Int) - new_terms = Folds.map(term -> position!(term, psi, pos), terms(sum), executor(sum)) - return set_terms(sum, new_terms) -end - -# TODO: Remove once we merge: -# https://github.com/ITensor/ITensors.jl/pull/1047 -function position!(sum::ParallelSum{<:Any,<:DistributedEx}, psi::MPS, pos::Int) - threaded_sum = position!(set_executor(sum, ThreadedEx()), psi, pos) - return set_executor(threaded_sum, executor(sum)) -end - -function noiseterm(sum::ParallelSum, phi::ITensor, dir::String) - return Folds.sum(term -> noiseterm(term, phi, dir), terms(sum), executor(sum)) -end - -function disk(sum::ParallelSum; disk_kwargs...) - return set_terms(sum, [disk(term; disk_kwargs...) 
for term in terms(sum)]) -end - -const ThreadedSum{T} = ParallelSum{T,ThreadedEx} -const DistributedSum{T} = ParallelSum{T,DistributedEx} -const SequentialSum{T} = ParallelSum{T,SequentialEx} - -# Functionality for sums where terms are distributed remotely -# function product(sum::DistributedSum{<:Future}, v::ITensor) -# return Folds.sum(term -> fetch(term)(v), terms(sum), executor(sum)) -# end -# -# function position!(sum::DistributedSum{<:Future}, psi::MPS, pos::Int) -# new_terms = Folds.map(terms(sum), executor(sum)) do term -# return @spawnat(term.where, position!(fetch(term), psi, pos)) -# end -# return set_terms(sum, new_terms) -# end -# -# function noiseterm(sum::ParallelSum{<:Future}, phi::ITensor, dir::String) -# return Folds.sum(term -> noiseterm(fetch(term), phi, dir), terms(sum), executor(sum)) -# end diff --git a/src/references/parallel_projmpo.jl b/src/references/parallel_projmpo.jl deleted file mode 100644 index e6fa893..0000000 --- a/src/references/parallel_projmpo.jl +++ /dev/null @@ -1,24 +0,0 @@ -import ITensors: product, position!, noiseterm - -function product(P::ProjMPOSum, v::ITensor)::ITensor - Pvs = fill(ITensor(), Threads.nthreads()) - Threads.@threads for M in P.pm - Pvs[Threads.threadid()] += product(M, v) - end - return sum(Pvs) -end - -function position!(P::ProjMPOSum, psi::MPS, pos::Int) - Threads.@threads for M in P.pm - position!(M, psi, pos) - end - return P -end - -function noiseterm(P::ProjMPOSum, phi::ITensor, dir::String) - nts = fill(ITensor(), Threads.nthreads()) - Threads.@threads for M in P.pm - nts[Threads.threadid()] += noiseterm(M, phi, dir) - end - return sum(nts) -end diff --git a/test/runtests.jl b/test/runtests.jl index 782d251..53791ef 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,34 +3,8 @@ using MPI using Test @testset "ITensorParallel.jl" begin - examples_dir = joinpath(pkgdir(ITensorParallel), "examples") - - example_files = filter( - f -> startswith(f, "01_") && endswith(f, ".jl"), readdir(examples_dir) - ) - @testset "Threaded/Distributed example $example_file" for example_file in example_files - include(joinpath(examples_dir, example_file)) - Nx = 8 - Ny = 4 - maxdim = 20 - Sums = (ThreadedSum, DistributedSum, SequentialSum) - @testset "Sum type $Sum" for Sum in Sums - println("Running parallel test with $(Sum)") - main(; Nx, Ny, maxdim, Sum) - end - end - - example_files = ["02_mpi_run.jl"] - @testset "MPI example $example_file" for example_file in example_files - println("Running MPI parallel test") - nprocs = 2 - Nx = 8 - Ny = 4 - maxdim = 20 - mpiexec() do exe # MPI wrapper - run( - `$exe -n $(nprocs) $(Base.julia_cmd()) $(joinpath(examples_dir, example_file)) --Nx $(Nx) --Ny $(Ny) --maxdim $(maxdim)`, - ) - end + test_files = filter(file -> startswith("test_")(file) && endswith(".jl")(file), readdir()) + @testset "Test file $(test_file)" for test_file in test_files + include(test_file) end end diff --git a/test/test_mpi_example.jl b/test/test_mpi_example.jl new file mode 100644 index 0000000..6f74747 --- /dev/null +++ b/test/test_mpi_example.jl @@ -0,0 +1,27 @@ +using MPI +using ITensorParallel +using Test + +@testset "Test $(@__FILE__)" begin + examples_dir = joinpath(pkgdir(ITensorParallel), "examples") + example_files = ["02_mpi_run.jl"] + @testset "MPI example $example_file, threaded block sparse $threaded_blocksparse, write-to-disk $disk" for example_file in + example_files, + threaded_blocksparse in (false, true), + disk in (false, true) + + println( + "\nRunning MPI parallel test with threaded block sparse 
$threaded_blocksparse, write-to-disk $disk", + ) + nprocs = 2 + Nx = 8 + Ny = 4 + nsweeps = 2 + maxdim = 20 + mpiexec() do exe # MPI wrapper + run( + `$exe -n $(nprocs) $(Base.julia_cmd()) --threads $(Threads.nthreads()) $(joinpath(examples_dir, example_file)) --Nx $(Nx) --Ny $(Ny) --nsweeps $(nsweeps) --maxdim $(maxdim) --disk $(disk) --threaded_blocksparse $(threaded_blocksparse)`, + ) + end + end +end diff --git a/test/test_sequential_threaded_distributed_example.jl b/test/test_sequential_threaded_distributed_example.jl new file mode 100644 index 0000000..e6d556f --- /dev/null +++ b/test/test_sequential_threaded_distributed_example.jl @@ -0,0 +1,28 @@ +using ITensorParallel +using Test + +@testset "Test $(@__FILE__)" begin + examples_dir = joinpath(pkgdir(ITensorParallel), "examples") + + example_files = filter( + f -> startswith(f, "01_") && endswith(f, ".jl"), readdir(examples_dir) + ) + @testset "Threaded/Distributed example $example_file" for example_file in example_files + include(joinpath(examples_dir, example_file)) + Nx = 8 + Ny = 4 + nsweeps = 2 + maxdim = 20 + Sums = (SequentialSum, ThreadedSum, DistributedSum) + @testset "Sum type $Sum, threaded block sparse $threaded_blocksparse, write-to-disk $disk" for Sum in + Sums, + threaded_blocksparse in (false, true), + disk in (false, true) + + println( + "\nRunning parallel test with $(Sum), threaded block sparse $threaded_blocksparse, write-to-disk $disk", + ) + main(; Nx, Ny, nsweeps, maxdim, Sum, disk, threaded_blocksparse) + end + end +end
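As a closing sketch of how the pieces added in this diff fit together (`partition`, `MPISumTerm` from `src/mpisumterm.jl`, and the `dmrg` call exercised in `examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl`), here is a minimal MPI driver reduced to a 1D Heisenberg chain. The file name and all variable names are illustrative, the default partitioning of `partition` is assumed, and this is a sketch rather than a tested example from the repository.

```julia
# Illustrative sketch only, not a tested example from this repository.
# Run with, e.g.: mpiexecjl -n 2 julia mpi_heisenberg_sketch.jl
using MPI
using ITensors
using ITensorParallel

MPI.Init()

N = 16
sites = siteinds("S=1/2", N; conserve_qns=true)
# Broadcast so every process uses identical site indices.
sites = MPI.bcast(sites, 0, MPI.COMM_WORLD)

function heisenberg(N)
  os = OpSum()
  for j in 1:(N - 1)
    os += 0.5, "S+", j, "S-", j + 1
    os += 0.5, "S-", j, "S+", j + 1
    os += "Sz", j, "Sz", j + 1
  end
  return os
end

# One sub-Hamiltonian per MPI process; each process builds
# and projects only its own term of the sum.
nprocs = MPI.Comm_size(MPI.COMM_WORLD)
terms = partition(heisenberg(N), nprocs)
which_proc = MPI.Comm_rank(MPI.COMM_WORLD) + 1
mpo_sum_term = MPISumTerm(MPO(terms[which_proc], sites), MPI.COMM_WORLD)

# Broadcast the starting state so it is identical on every rank.
state = [isodd(n) ? "Up" : "Dn" for n in 1:N]
psi0 = MPI.bcast(MPS(sites, state), 0, MPI.COMM_WORLD)

energy, psi = dmrg(mpo_sum_term, psi0; nsweeps=5, maxdim=[10, 20, 50, 100], cutoff=1e-8)
MPI.Comm_rank(MPI.COMM_WORLD) == 0 && @show energy

MPI.Finalize()
```

Compared with example 02, this drops the momentum-conserving Hubbard setup and the `disk`/`threaded_blocksparse` options, but keeps the rank-local `MPISumTerm` construction and the broadcasts that keep the site indices and starting state consistent across ranks.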