diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 86ffdd7..25d50ae 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -19,7 +19,7 @@ jobs: matrix: version: - '1.6' - - '1.7' + - '1' os: - ubuntu-latest - macOS-latest @@ -35,6 +35,8 @@ jobs: - uses: julia-actions/cache@v1 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 + env: + JULIA_NUM_THREADS: 2 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v2 with: diff --git a/Project.toml b/Project.toml index 1750ad0..f32c8f8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,20 @@ name = "ITensorParallel" uuid = "0fccfcd2-f061-4985-9740-339d3f86bfce" authors = ["Matthew Fishman and contributors"] -version = "0.0.1" +version = "0.1.0" [deps] +Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Folds = "41a02a25-b8f0-4f67-bc48-60067656b558" ITensors = "9136182c-28ba-11e9-034c-db9fb085ebd5" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" [compat] -ITensors = "0.2, 0.3" -MPI = "0.19, 0.20" +Accessors = "0.1.26" +Folds = "0.2.8" +ITensors = "0.3.27" +MPI = "0.20" julia = "1.6" [extras] diff --git a/README.md b/README.md index 247e10f..530d103 100644 --- a/README.md +++ b/README.md @@ -8,102 +8,12 @@ # Overview -This package is for experimenting with adding more shared and distributed memory parallelism, for example implementing the techniques laid out in the paper https://arxiv.org/abs/2103.09976. +This package adds more shared and distributed memory parallelism to [ITensors.jl](https://github.com/ITensor/ITensors.jl), for example implementing the techniques for nested parallelization in DMRG laid out in the paper [arXiv:2103.09976](https://arxiv.org/abs/2103.09976). So far, it focuses on parallelizing over optimizing or evolving sums of tensor networks. We plan to add real-space parallel DMRG, TDVP, and TEBD based on [arXiv:1301.3494](https://arxiv.org/abs/1301.3494). -We will explore multithreaded and distributed parallelism over real-space parallel DMRG, TDVP, and TEBD based on https://arxiv.org/abs/1301.3494, as well as multithreaded and distributed parallelism over sums of Hamiltonian terms in DMRG and TDVP. +For multithreading, we are using Julia's standard Threads.jl library, as well as convenient abstractions on top of that for parallelizing over maps and reductions provided by [Folds.jl](https://github.com/juliafolds/folds.jl) and [FLoops.jl](https://github.com/JuliaFolds/FLoops.jl). See [here](https://juliafolds.github.io/data-parallelism/tutorials/quick-introduction/) for a nice overview of parallelization in Julia. -For multithreading, we are using Julia's standard Threads.jl library, and possibly convenient abstractions on top of that provided by [Folds.jl](https://github.com/juliafolds/folds.jl) and [FLoops.jl](https://github.com/JuliaFolds/FLoops.jl). See [here](https://juliafolds.github.io/data-parallelism/tutorials/quick-introduction/) for a nice overview of the different options. +For distributed computing, we make use of Julia's standard [Distributed.jl](https://docs.julialang.org/en/v1/manual/distributed-computing/) library along with its interface through [Folds.jl](https://github.com/juliafolds/folds.jl) and [FLoops.jl](https://github.com/JuliaFolds/FLoops.jl), as well as [MPI.jl](https://juliaparallel.github.io/MPI.jl/latest/). 
Take a look at Julia'd documentation on [distributed computing](https://docs.julialang.org/en/v1/manual/distributed-computing/) for more information and background. -For distributed computing, we will explore Julia's standard [Distributed.jl](https://docs.julialang.org/en/v1/manual/distributed-computing/) library, as well as [MPI.jl](https://juliaparallel.github.io/MPI.jl/latest/). - -To run Distributed.jl-based computations on clusters, we will explore using Julia's cluster manager tools like [ClusterManagers.jl](https://github.com/JuliaParallel/ClusterManagers.jl), [SlurmClusterManager.jl](https://github.com/kleinhenz/SlurmClusterManager.jl), and [MPIClusterManagers.jl](https://github.com/JuliaParallel/MPIClusterManagers.jl). - -# Running on clusters - - -## Option 1: `Distributed.jl` -Here are detailed instructions for running a minimal "hello world" example parallelized with Julia's Distributed.jl standard library, distributed over nodes of a cluster. - -1. Start by [downloading the latest version of Julia](https://julialang.org/downloads/) or loading a pre-installed version of Julia, for example with `module load julia`. You can follow more detailed instruction for installing your own version of Julia on a cluster [here](https://itensor.github.io/ITensors.jl/stable/getting_started/Installing.html). -2. Start Julia by executing the command `julia` at the command line. This should bring up the interactive Julia REPL. From there, you should install either [ClusterManagers.jl](https://github.com/JuliaParallel/ClusterManagers.jl) or [SlurmClusterManager.jl](https://github.com/kleinhenz/SlurmClusterManager.jl) if your cluster uses Slurm as the cluster management and job scheduling system (ClusterManagers.jl supports Slurm, but SlurmClusterManager.jl has a more specialized implementation). This would look something like this: -```julia -$ julia - _ - _ _ _(_)_ | Documentation: https://docs.julialang.org - (_) | (_) (_) | - _ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help. - | | | | | | |/ _` | | - | | |_| | | | (_| | | Version 1.7.2 (2022-02-06) - _/ |\__'_|_|_|\__'_| | Official https://julialang.org/ release -|__/ | - -julia> using Pkg - -julia> Pkg.add("SlurmClusterManager") - Updating registry at `~/.julia/registries/General` - Updating git-repo `https://github.com/JuliaRegistries/General.git` - Resolving package versions... - Installed SlurmClusterManager ─ v0.1.2 - Updating `~/.julia/environments/v1.7/Project.toml` - [c82cd089] + SlurmClusterManager v0.1.2 - Updating `~/.julia/environments/v1.7/Manifest.toml` - [c82cd089] + SlurmClusterManager v0.1.2 -Precompiling project... - 1 dependency successfully precompiled in 2 seconds (456 already precompiled, 2 skipped during auto due to previous errors) -``` -Now the cluster manager (either `ClusterManagers.jl` or `SlurmClusterManager.jl`, whichever you installed) will be available system-wide for you to use and aid you in running your Distributed.jl-based parallelized Julia code across nodes of your cluster. - -3. Create a file somewhere within your home directory on the cluster called `hello_world.jl` with the contents: -```julia -#!/usr/bin/env julia - -using Distributed, SlurmClusterManager -addprocs(SlurmManager()) -@everywhere println("hello from $(myid()):$(gethostname())") -``` -4. Submit your script to the work queue of your cluster, for example with `sbatch` if the cluster you are on uses Slurm: -``` -$ sbatch -N 2 --ntasks-per-node=64 hello_world.jl -``` -This will execute the code on two nodes using 64 workers per node. 
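As a minimal sketch of the Folds.jl pattern adopted in `src/foldssum.jl` further down in this diff, here is the same sum-of-terms reduction applied to plain matrices instead of MPO terms; switching the executor argument switches between threaded and distributed execution. All names in this sketch (`As`, `v`, `Av_threaded`, `Av_distributed`) are illustrative and not part of the package.

```julia
# Illustrative sketch only, not part of this diff.
using Distributed
addprocs(2)
@everywhere using Folds
using LinearAlgebra

n = 4
As = [randn(n, n) for _ in 1:3]  # terms of the sum
v = randn(n)

# Threaded reduction over the terms of the sum.
Av_threaded = Folds.sum(A -> A * v, As, ThreadedEx())

# Distributed reduction over the terms of the sum,
# executed on the worker processes.
Av_distributed = Folds.sum(A -> A * v, As, DistributedEx())

# Both should agree with summing the terms first.
@show norm(Av_threaded - sum(As) * v)
@show norm(Av_distributed - sum(As) * v)
```

The `FoldsSum` type added below uses exactly this `Folds.sum(f, terms, executor)` call, with `ProjMPO` terms in place of the matrices.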
- -We will add similar instructions on running the same "hello world" example using MPI.jl, and additionally running linear algebra and ITensor operations in parallel with both Distributed.jl and MPI.jl. - - -## Option 2: `MPI.jl` - -1. Start by [downloading the latest version of Julia](https://julialang.org/downloads/) or loading a pre-installed version of Julia, for example with `module load julia`. -2. Load an MPI installation, e.g. `module load openmpi`. -3. Install `MPI.jl` (e.g. from the command line: `julia --project -e 'using Pkg; Pkg.add("MPI")'`). -4. Make sure `MPI.jl` is pointing to the correct `MPI` installation by running the following: - ``` - julia --project -e 'ENV["JULIA_MPI_BINARY"]="system"; using Pkg; Pkg.build("MPI"; verbose=true)' - ``` - Make sure the version of MPI in the output of this command matches the one you wanted to load. -5. Run a test job: - ```julia - # 01-hello.jl - using MPI - MPI.Init() - - comm = MPI.COMM_WORLD - println("Hello world, I am $(MPI.Comm_rank(comm)) of $(MPI.Comm_size(comm))") - MPI.Barrier(comm) - ``` - using `mpirun -np 4 julia 01-hello.jl` -6. The correct output should look something like: - ```julia - Hello world, I am 2 of 4 - Hello world, I am 0 of 4 - Hello world, I am 3 of 4 - Hello world, I am 1 of 4 - ``` - -If you see the following warning, go back to step 4 and make sure, `MPI.jl` is pointing to the correct MPI installation: - -``` -┌ Warning: You appear to have run julia under a different `mpiexec` than the one used by MPI.jl. -│ See the documentation for details. -└ @ MPI ~/.julia/packages/MPI/08SPr/src/environment.jl:38 -``` +To run Distributed.jl-based computations on clusters, we recommend using Julia's cluster manager tools like [ClusterManagers.jl](https://github.com/JuliaParallel/ClusterManagers.jl), [SlurmClusterManager.jl](https://github.com/kleinhenz/SlurmClusterManager.jl), and [MPIClusterManagers.jl](https://github.com/JuliaParallel/MPIClusterManagers.jl). +See the [examples folder](https://github.com/ITensor/ITensorParallel.jl/tree/main/examples) for examples of running DMRG parallelized over sums of Hamiltonians, using Threads.jl, Distributed.jl, and MPI.jl. diff --git a/examples/01_parallel_mpo_sum_2d_hubbard_conserve_momentum.jl b/examples/01_parallel_mpo_sum_2d_hubbard_conserve_momentum.jl index 109bd99..9a32178 100644 --- a/examples/01_parallel_mpo_sum_2d_hubbard_conserve_momentum.jl +++ b/examples/01_parallel_mpo_sum_2d_hubbard_conserve_momentum.jl @@ -17,15 +17,23 @@ ITensors.Strided.disable_threads() """ Run with: ```julia -# No blocksparse multithreading -main(; Nx=8, Ny=4, maxdim=1000, Sum=ThreadedSum); -main(; Nx=8, Ny=4, maxdim=1000, Sum=DistributedSum); -main(; Nx=8, Ny=4, maxdim=1000, Sum=SequentialSum); - -# Blocksparse multithreading -main(; Nx=8, Ny=4, maxdim=1000, Sum=ThreadedSum, threaded_blocksparse=true); -main(; Nx=8, Ny=4, maxdim=1000, Sum=DistributedSum, threaded_blocksparse=true); -main(; Nx=8, Ny=4, maxdim=1000, Sum=SequentialSum, threaded_blocksparse=true); +# Sequential sum over MPOs. +# Uses the default `Sum=SequentialSum`. +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000); +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, threaded_blocksparse=true); + +# Threaded sum over MPOs. +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, Sum=ThreadedSum); +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, Sum=ThreadedSum, threaded_blocksparse=true); + +# Distributed sum over MPOs, where terms of the MPO +# sum and their environments are stored, updated, +# and applied remotely on a worker process. 
+main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, Sum=DistributedSum); +main(; Nx=8, Ny=4, nsweeps=10, maxdim=1000, Sum=DistributedSum, threaded_blocksparse=true); + +# Using write-to-disk. +main(; Nx=8, Ny=4, maxdim=1000, Sum=DistributedSum, disk=true, threaded_blocksparse=true); ``` """ function main(; @@ -33,28 +41,24 @@ function main(; Ny::Int, U::Float64=4.0, t::Float64=1.0, + nsweeps=10, maxdim::Int=3000, conserve_ky=true, seed=1234, npartitions=2Ny, - Sum, + Sum=SequentialSum, threaded_blocksparse=false, + disk=false, + random_init=false, in_partition=ITensorParallel.default_in_partition, ) - Random.seed!(seed) @show Threads.nthreads() - # TODO: Use `ITensors.enable_threaded_blocksparse(threaded_blocksparse)` - if threaded_blocksparse - ITensors.enable_threaded_blocksparse() - else - ITensors.disable_threaded_blocksparse() - end + ITensors.enable_threaded_blocksparse(threaded_blocksparse) @show ITensors.using_threaded_blocksparse() N = Nx * Ny - nsweeps = 10 max_maxdim = maxdim maxdim = min.([100, 200, 400, 800, 2000, 3000, max_maxdim], max_maxdim) cutoff = 1e-6 @@ -68,42 +72,44 @@ function main(; ℋ = hubbard(; Nx=Nx, Ny=Ny, t=t, U=U, ky=true) ℋs = partition(ℋ, npartitions; in_partition) - H = [MPO(ℋ, sites) for ℋ in ℋs] + Hs = [MPO(ℋ, sites) for ℋ in ℋs] - @show maxlinkdim.(H) + @show maxlinkdim.(Hs) # Number of structural nonzero elements in a bulk # Hamiltonian MPO tensor - @show nnz(H[1][end ÷ 2]) - @show nnzblocks(H[1][end ÷ 2]) - - # Create start state - state = Vector{String}(undef, N) - for i in 1:N - x = (i - 1) ÷ Ny - y = (i - 1) % Ny - if x % 2 == 0 - if y % 2 == 0 - state[i] = "Up" - else - state[i] = "Dn" - end - else - if y % 2 == 0 - state[i] = "Dn" - else - state[i] = "Up" - end - end + @show nnz(Hs[1][end ÷ 2]) + @show nnzblocks(Hs[1][end ÷ 2]) + + # Create starting state with checkerboard + # pattern + state = map(CartesianIndices((Ny, Nx))) do I + return iseven(I[1]) ⊻ iseven(I[2]) ? "↓" : "↑" + end + display(state) + + if random_init + # Only available in ITensors 0.3.27 + # Helps make results reproducible when comparing + # sequential vs. threaded. + itensor_rng = Xoshiro() + Random.seed!(itensor_rng, seed) + psi0 = randomMPS(itensor_rng, sites, state; linkdims=10) + else + psi0 = MPS(sites, state) end - psi0 = randomMPS(sites, state; linkdims=10) + mpo_sum = Sum(Hs) + if disk + # Write-to-disk + mpo_sum = ITensors.disk(mpo_sum) + end - energy, psi = @time dmrg(Sum(H), psi0; nsweeps, maxdim, cutoff, noise) + energy, psi = @time dmrg(mpo_sum, psi0; nsweeps, maxdim, cutoff, noise) @show Nx, Ny @show t, U @show flux(psi) @show maxlinkdim(psi) @show energy - return energy, H, psi + return energy, psi end diff --git a/examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl b/examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl index 56409f0..f86efc1 100644 --- a/examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl +++ b/examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl @@ -12,10 +12,11 @@ ITensors.Strided.disable_threads() """ Run at the command line with 4 processes: ```julia -mpiexecjl -n 2 julia 02_mpi_run.jl --Nx 8 --Ny 4 --maxdim 20 +mpiexecjl -n 2 julia -t2 02_mpi_run.jl --Nx 8 --Ny 4 --nsweeps 10 --maxdim 1000 -# Currently is broken! 
-mpiexecjl -n 2 julia -t2 02_mpi_run.jl --Nx 8 --Ny 4 --maxdim 20 --threaded_blocksparse true +mpiexecjl -n 2 julia -t2 02_mpi_run.jl --Nx 8 --Ny 4 --nsweeps 10 --maxdim 1000 --threaded_blocksparse true + +mpiexecjl -n 2 julia -t2 02_mpi_run.jl --Nx 8 --Ny 4 --nsweeps 10 --maxdim 1000 --disk true --threaded_blocksparse true ``` """ function main(; @@ -23,31 +24,22 @@ function main(; Ny::Int, U::Float64=4.0, t::Float64=1.0, + nsweeps=10, maxdim::Int=3000, conserve_ky=true, seed=1234, threaded_blocksparse=false, + disk=false, + random_init=false, in_partition=ITensorParallel.default_in_partition, ) - # It is very important to set both of these RNG seeds! - # This ensures the same state and the same ITensor indices - # are made in each process - Random.seed!(seed) - Random.seed!(index_id_rng(), seed) - @show Threads.nthreads() - # TODO: Use `ITensors.enable_threaded_blocksparse(threaded_blocksparse)` - if threaded_blocksparse - ITensors.enable_threaded_blocksparse() - else - ITensors.disable_threaded_blocksparse() - end + ITensors.enable_threaded_blocksparse(threaded_blocksparse) @show ITensors.using_threaded_blocksparse() N = Nx * Ny - nsweeps = 10 max_maxdim = maxdim maxdim = min.([100, 200, 400, 800, 2000, 3000, max_maxdim], max_maxdim) cutoff = 1e-6 @@ -60,34 +52,41 @@ function main(; # Create a lazy representation of the Hamiltonian ℋ = hubbard(; Nx, Ny, t, U, ky=true) - # Create start state - state = Vector{String}(undef, N) - for i in 1:N - x = (i - 1) ÷ Ny - y = (i - 1) % Ny - if x % 2 == 0 - if y % 2 == 0 - state[i] = "Up" - else - state[i] = "Dn" - end - else - if y % 2 == 0 - state[i] = "Dn" - else - state[i] = "Up" - end - end + # Create starting state with checkerboard + # pattern + state = map(CartesianIndices((Ny, Nx))) do I + return iseven(I[1]) ⊻ iseven(I[2]) ? "↓" : "↑" end - sites = siteinds("ElecK", N; conserve_qns=true, conserve_ky, modulus_ky=Ny) - psi0 = randomMPS(sites, state; linkdims=10) + display(state) MPI.Init() + + sites = siteinds("ElecK", N; conserve_qns=true, conserve_ky, modulus_ky=Ny) + sites = MPI.bcast(sites, 0, MPI.COMM_WORLD) + + if random_init + # Only available in ITensors 0.3.27 + # Helps make results reproducible when comparing + # sequential vs. threaded. 
+ itensor_rng = Xoshiro() + Random.seed!(itensor_rng, seed) + psi0 = randomMPS(itensor_rng, sites, state; linkdims=10) + else + psi0 = MPS(sites, state) + end + psi0 = MPI.bcast(psi0, 0, MPI.COMM_WORLD) + nprocs = MPI.Comm_size(MPI.COMM_WORLD) ℋs = partition(ℋ, nprocs; in_partition) which_proc = MPI.Comm_rank(MPI.COMM_WORLD) + 1 - PH = MPISum(ProjMPO(MPO(ℋs[which_proc], sites))) - energy, psi = @time dmrg(PH, psi0; nsweeps, maxdim, cutoff, noise) + mpo_sum_term = MPISumTerm(MPO(ℋs[which_proc], sites), MPI.COMM_WORLD) + + if disk + # Write-to-disk + mpo_sum_term = ITensors.disk(mpo_sum_term) + end + + energy, psi = @time dmrg(mpo_sum_term, psi0; nsweeps, maxdim, cutoff, noise) @show Nx, Ny @show t, U diff --git a/examples/02_mpi_run.jl b/examples/02_mpi_run.jl index 024f8af..55eb31e 100644 --- a/examples/02_mpi_run.jl +++ b/examples/02_mpi_run.jl @@ -2,7 +2,7 @@ include("02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl") # Run with: -# mpiexecjl -n 2 julia 02_mpi_run.jl --Nx 8 --Ny 4 --maxdim 1000 +# mpiexecjl -n 2 julia 02_mpi_run.jl --Nx 8 --Ny 4 --nsweeps 10 --maxdim 1000 using ArgParse function parse_commandline() @@ -16,10 +16,18 @@ function parse_commandline() help = "Cylinder width" arg_type = Int required = true + "--nsweeps" + help = "Number of sweeps" + arg_type = Int + required = true "--maxdim" help = "Maximum bond dimension" arg_type = Int required = true + "--disk" + help = "Write-to-disk" + arg_type = Bool + default = false "--threaded_blocksparse" help = "Use threaded block sparse operations" arg_type = Bool @@ -31,6 +39,8 @@ args = parse_commandline() main(; Nx=args["Nx"], Ny=args["Ny"], + nsweeps=args["nsweeps"], maxdim=args["maxdim"], + disk=args["disk"], threaded_blocksparse=args["threaded_blocksparse"], ); diff --git a/examples/broken/03_distributed_mpo_sum_2d_heisenberg.jl b/examples/broken/03_distributed_mpo_sum_2d_heisenberg.jl deleted file mode 100644 index aecea28..0000000 --- a/examples/broken/03_distributed_mpo_sum_2d_heisenberg.jl +++ /dev/null @@ -1,69 +0,0 @@ -using Distributed - -rmprocs(setdiff(procs(), 1)) -addprocs(4) -@show nprocs() - -using Random -@everywhere using ITensors -@everywhere using ITensorParallel - -npartitions = 4 - -Sum = DistributedSum -# Sum = ThreadedSum -# Sum = SequentialSum - -@show Sum -@show npartitions -@show Threads.nthreads() - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() - -# Turn block sparse multithreading on or off -ITensors.disable_threaded_blocksparse() - -Nx, Ny = 8, 4 -N = Nx * Ny - -sites = siteinds("S=1/2", N; conserve_qns=true) - -lattice = square_lattice(Nx, Ny; yperiodic=false) - -function heisenberg_hamiltonian(lattice) - ℋ = OpSum() - for b in lattice - ℋ += 0.5, "S+", b.s1, "S-", b.s2 - ℋ += 0.5, "S-", b.s1, "S+", b.s2 - ℋ += "Sz", b.s1, "Sz", b.s2 - end - return ℋ -end -ℋ = heisenberg_hamiltonian(lattice) - -# Define how the Hamiltonian will be partitioned -# into a sum of sub-Hamiltonians. -# function in_partition(sites::Tuple{Int,Int}, p, nparts) -# i, j = sites -# return p == mod1(i, nparts) -# end - -ℋs = partition(ℋ, npartitions) -H = [MPO(ℋ, sites) for ℋ in ℋs] -@show maxlinkdim.(H) -PH = Sum(H) - -state = [isodd(n) ? 
"Up" : "Dn" for n in 1:N] -Random.seed!(1234) -psi0 = randomMPS(sites, state; linkdims=10) - -nsweeps = 10 -maxdim = [20, 60, 100, 100, 200, 400, 800] -cutoff = 1e-8 -@show nsweeps -@show maxdim -@show cutoff - -energy, psi = dmrg(PH, psi0; nsweeps, maxdim, cutoff) -@show energy diff --git a/examples/deprecated/01_threaded_mpo_sum_2d_heisenberg.jl b/examples/deprecated/01_threaded_mpo_sum_2d_heisenberg.jl deleted file mode 100644 index be99237..0000000 --- a/examples/deprecated/01_threaded_mpo_sum_2d_heisenberg.jl +++ /dev/null @@ -1,61 +0,0 @@ -using ITensors -using ITensorParallel -using Random - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() - -# Turn block sparse multithreading on or off -ITensors.disable_threaded_blocksparse() - -npartitions = 2 - -# Threaded or sequential sum of MPOs -Sum = ThreadedSum -# Sum = SequentialSum - -@show Threads.nthreads() -@show npartitions - -Nx, Ny = 10, 5 -N = Nx * Ny - -sites = siteinds("S=1/2", N; conserve_qns=true) - -lattice = square_lattice(Nx, Ny; yperiodic=false) - -function heisenberg_hamiltonian(lattice) - ℋ = OpSum() - for b in lattice - ℋ += 0.5, "S+", b.s1, "S-", b.s2 - ℋ += 0.5, "S-", b.s1, "S+", b.s2 - ℋ += "Sz", b.s1, "Sz", b.s2 - end - return ℋ -end -ℋ = heisenberg_hamiltonian(lattice) - -# Define how the Hamiltonian will be partitioned -# into a sum of sub-Hamiltonians. -function in_partition(sites::Tuple{Int,Int}, p, nparts) - i, j = sites - return p == mod1(i, nparts) -end - -ℋs = partition(ℋ, npartitions; in_partition) -H = [MPO(ℋ, sites) for ℋ in ℋs] -PH = Sum(H) - -state = [isodd(n) ? "Up" : "Dn" for n in 1:N] -Random.seed!(1234) -psi0 = randomMPS(sites, state; linkdims=10) - -nsweeps = 10 -maxdim = [20, 60, 100, 100, 200, 400, 800] -cutoff = 1e-6 -@show nsweeps -@show maxdim -@show cutoff - -energy, psi = dmrg(PH, psi0; nsweeps, maxdim, cutoff) -@show energy diff --git a/examples/deprecated/02_mpi_mpo_sum_2d_heisenberg.jl b/examples/deprecated/02_mpi_mpo_sum_2d_heisenberg.jl deleted file mode 100644 index 4a01733..0000000 --- a/examples/deprecated/02_mpi_mpo_sum_2d_heisenberg.jl +++ /dev/null @@ -1,65 +0,0 @@ -using MPI -using ITensors -using ITensorParallel -using Random - -MPI.Init() - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() -ITensors.disable_threaded_blocksparse() - -seed = 1234 -Random.seed!(seed) - -mpi_projmpo = true - -Nx, Ny = 4, 1 -N = Nx * Ny - -sites = siteinds("S=1/2", N; conserve_qns=true) - -lattice = square_lattice(Nx, Ny; yperiodic=false) - -function heisenberg_hamiltonian(lattice) - ℋ = OpSum() - for b in lattice - ℋ += 0.5, "S+", b.s1, "S-", b.s2 - ℋ += 0.5, "S-", b.s1, "S+", b.s2 - ℋ += "Sz", b.s1, "Sz", b.s2 - end - return ℋ -end -ℋ = heisenberg_hamiltonian(lattice) - -# Define how the Hamiltonian will be partitioned -# into a sum of sub-Hamiltonians. -function in_partition(sites::Tuple{Int,Int}, p, nparts) - i, j = sites - return p == mod1(i, nparts) -end - -nprocs = MPI.Comm_size(MPI.COMM_WORLD) -ℋs = partition(ℋ, nprocs; in_partition=in_partition) - -PH = if mpi_projmpo - n = MPI.Comm_rank(MPI.COMM_WORLD) + 1 - MPISum(ProjMPO(MPO(ℋs[n], sites))) -else - ProjMPOSum(H) -end - -state = [isodd(n) ? 
"Up" : "Dn" for n in 1:N] -psi0 = randomMPS(sites, state; linkdims=10) - -nsweeps = 10 -maxdim = [20, 60, 100, 100, 200, 400, 800] -cutoff = 1e-10 -@show nsweeps -@show maxdim -@show cutoff - -energy, psi = dmrg(PH, psi0; nsweeps, maxdim, cutoff) -@show energy - -MPI.Finalize() diff --git a/examples/deprecated/04_mpi_threaded_mpo_sum_2d_heisenberg.jl b/examples/deprecated/04_mpi_threaded_mpo_sum_2d_heisenberg.jl deleted file mode 100644 index 5856176..0000000 --- a/examples/deprecated/04_mpi_threaded_mpo_sum_2d_heisenberg.jl +++ /dev/null @@ -1,63 +0,0 @@ -using MPI -using ITensors -using ITensorParallel -using Random - -MPI.Init() - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() -ITensors.disable_threaded_blocksparse() - -seed = 1234 -Random.seed!(seed) - -Nx, Ny = 8, 1 -N = Nx * Ny - -sites = siteinds("S=1/2", N; conserve_qns=true) - -lattice = square_lattice(Nx, Ny; yperiodic=false) - -function heisenberg_hamiltonian(lattice) - opsum = OpSum() - for b in lattice - opsum += 0.5, "S+", b.s1, "S-", b.s2 - opsum += 0.5, "S-", b.s1, "S+", b.s2 - opsum += "Sz", b.s1, "Sz", b.s2 - end - return opsum -end -opsum = heisenberg_hamiltonian(lattice) - -# Define how the Hamiltonian will be partitioned -# into a sum of sub-Hamiltonians. -function in_partition(sites::Tuple{Int,Int}, p, nparts) - i, j = sites - return p == mod1(i, nparts) -end - -# Number of partitions -nparts = 4 - -nprocs = MPI.Comm_size(MPI.COMM_WORLD) -nparts_per_proc = nparts ÷ nprocs -opsums = collect( - Iterators.partition(partition(opsum, nparts; in_partition=in_partition), nparts_per_proc) -) - -n = MPI.Comm_rank(MPI.COMM_WORLD) + 1 -PH = MPISum(ThreadedSum([MPO(opsum, sites) for opsum in opsums[n]])) - -state = [isodd(n) ? "Up" : "Dn" for n in 1:N] -psi0 = randomMPS(sites, state, 10) - -sweeps = Sweeps(10) -setmaxdim!(sweeps, 20, 60, 100, 100, 200, 400, 800) -setcutoff!(sweeps, 1E-10) -@show sweeps - -energy, psi = dmrg(PH, psi0, sweeps) -@show energy - -MPI.Finalize() diff --git a/examples/distributed_itensormap_tutorial/01_simple_spawnat_matvec.jl b/examples/distributed_itensormap_tutorial/01_simple_spawnat_matvec.jl deleted file mode 100644 index 325a6f0..0000000 --- a/examples/distributed_itensormap_tutorial/01_simple_spawnat_matvec.jl +++ /dev/null @@ -1,49 +0,0 @@ -using Distributed - -# Apply a sum of matrices to a vector in -# parallel using multi-processing. -# The vector exists on every process, but -# each matrix in the sum only exists on -# a single process. - -# Set the number of processors Julia will use -# @everywhere denotes that it will be defined -# on every worker. -@everywhere nprocs = 2 -addprocs(nprocs) - -# Define the vector dimension. -# @everywhere denotes that it will be defined -# on every worker. -@everywhere n = 4 - -# Define the vector we will apply the set -# of matrices to. -# @everywhere denotes that it will be defined -# on every worker. -@everywhere v = randn(n) - -# Define the matrices in the matrix sum -# that will be applied in parallel to the -# vector `v`. -# The first matrix is defined on worker 1, -# the second one on worker 2. -A1 = @spawnat workers()[1] randn(n, n) -A2 = @spawnat workers()[2] randn(n, n) - -# Define the individual matrix-vector multiplication -# operations on the appropriate worker. -# Alternatively, you can use: -# -# A1v = @spawnat :any fetch(A1) * v -# -# for Julia to choose the worker automatically. 
-A1v = @spawnat workers()[1] fetch(A1) * v -A2v = @spawnat workers()[2] fetch(A2) * v - -# Actually perform the matrix-vector multiplications, -# fetching the results to the main worker for -# local usage. -Av = fetch(A1v) + fetch(A2v) - -@show norm((fetch(A1) + fetch(A2)) * v - Av) diff --git a/examples/distributed_itensormap_tutorial/02_matvec_distributed_loop.jl b/examples/distributed_itensormap_tutorial/02_matvec_distributed_loop.jl deleted file mode 100644 index 2c08ef5..0000000 --- a/examples/distributed_itensormap_tutorial/02_matvec_distributed_loop.jl +++ /dev/null @@ -1,38 +0,0 @@ -using Distributed -using LinearAlgebra - -BLAS.set_num_threads(1) - -# Apply a sum of distributed matrices and reduce -# after applying to a vector. - -# Number of matrices in our sum. -nmats = 2 - -# Specify the number of processors Julia will use. -# For now we just say it is the same as the number of -# matrices in our sum. -addprocs(nmats) - -# Size of out matrices. -@everywhere n = 4 - -# Set of matrices in our sum, each created on a -# separate worker. -As = [@spawnat(workers()[nmat], randn(n, n)) for nmat in 1:nmats] - -# Vector we will apply to our matrices, distributed to -# every worker. -@everywhere v = randn(n) - -# Distributed matrix-vector multiplication for each distributed -# matrix in our sum, then reduced over with `+`. -Av = @distributed (+) for n in 1:nmats - # Need to call `fetch` to instantiate the matrix - # on the worker where it was defined above. - fetch(As[n]) * v -end - -# Test by fetching and summing locally -A = sum(fetch.(As)) -@show norm(Av - A * v) diff --git a/examples/distributed_itensormap_tutorial/03_distributedsum_matrixmap.jl b/examples/distributed_itensormap_tutorial/03_distributedsum_matrixmap.jl deleted file mode 100644 index ee44bbb..0000000 --- a/examples/distributed_itensormap_tutorial/03_distributedsum_matrixmap.jl +++ /dev/null @@ -1,38 +0,0 @@ -using Distributed - -# Number of matrices in the sum -nmats = 2 -addprocs(nmats) - -@everywhere begin - using LinearAlgebra - using KrylovKit - - BLAS.set_num_threads(1) - - include(joinpath("src", "DistributedSums.jl")) - include(joinpath("src", "MatrixMaps.jl")) - - # Matrix dimension - n = 100 - - # Starting vector that we will apply - # the matrix map to - v0 = randn(n) -end - -# Make a distributed sum of MatrixMaps, which is -# just a Matrix with special multiplication -# notation that can work with KrylovKit -A = DistributedSum(_ -> MatrixMap(randn(n, n)), nmats) - -Av0 = A(v0) - -@show norm(Av0 - sum(fetch.(A))(v0)) - -# Solve eigenvector equation -λ⃗, v⃗ = eigsolve(A, v0, 1, :LM) -λ = λ⃗[1] -v = v⃗[1] - -@show norm(A(v) - λ * v) diff --git a/examples/distributed_itensormap_tutorial/04_distributedsum_itensornetworkmap.jl b/examples/distributed_itensormap_tutorial/04_distributedsum_itensornetworkmap.jl deleted file mode 100644 index 531e272..0000000 --- a/examples/distributed_itensormap_tutorial/04_distributedsum_itensornetworkmap.jl +++ /dev/null @@ -1,62 +0,0 @@ -using Distributed - -# Number of ITensorNetworkMaps in the sum -nmats = 2 - -addprocs(nmats) - -# Set up for the calculation. Define everything -# in the `@everywhere` block that you need -# on all of the processes to perform the -# calculation. 
-@everywhere begin - using ITensors - using ITensors.ITensorNetworkMaps - using LinearAlgebra - using KrylovKit - - BLAS.set_num_threads(1) - ITensors.Strided.disable_threads() - ITensors.disable_threaded_blocksparse() - - include(joinpath("src", "DistributedSums.jl")) - - # Function that defines the tensor network - # that will map the input vector (i.e. - # a transfer matrix or effective Hamiltonian). - function itensor_map(i, j) - A1 = randomITensor(i', dag(i), dag(l)) - A2 = randomITensor(j', dag(j), l) - return ITensorNetworkMap([A1, A2]) - end - - d = 2 - χ = 3 - i = Index(d, "i") - j = Index(d, "j") - l = Index(χ, "l") - - v0 = randomITensor(i, j) -end - -# Defines an object representing a sum of -# linear maps (generalized matrices) built -# from contractions of ITensors that are -# distributed across `nmats` workers -# (each linear map lives on a worker). -A = DistributedSum(_ -> itensor_map(i, j), nmats) - -# Apply the linear map, which internally -# is performed in parallel where the linear -# map is created. -Av0 = A(v0) - -@show norm(Av0 - sum(fetch.(A))(v0)) - -# Solve an eigenvector equation in -# parallel. -λ⃗, v⃗ = eigsolve(A, v0, 1, :LM) -λ = λ⃗[1] -v = v⃗[1] - -@show norm(A(v) - λ * v) diff --git a/examples/distributed_itensormap_tutorial/README.md b/examples/distributed_itensormap_tutorial/README.md deleted file mode 100644 index af43340..0000000 --- a/examples/distributed_itensormap_tutorial/README.md +++ /dev/null @@ -1,8 +0,0 @@ -A short set of examples acting as a tutorial for performing -a distributed sum of matrix-vector multiplications for -use with a Krylov solver. - -Good references are: - -- https://docs.julialang.org/en/v1/manual/distributed-computing/ -- https://docs.julialang.org/en/v1/stdlib/Distributed/ diff --git a/examples/distributed_itensormap_tutorial/src/DistributedSums.jl b/examples/distributed_itensormap_tutorial/src/DistributedSums.jl deleted file mode 100644 index 26e5b7c..0000000 --- a/examples/distributed_itensormap_tutorial/src/DistributedSums.jl +++ /dev/null @@ -1,26 +0,0 @@ -using Distributed - -struct DistributedSum{T} - data::Vector{T} -end - -# Construct from a distributed sum from a -# function that returns the elements of the sum. -function DistributedSum(f::Function, n::Integer) - # Assign the matrices to the first `n` workers. - return DistributedSum([@spawnat(workers()[ni], f(ni)) for ni in 1:n]) -end - -# Functions needed for @distributed loop and reduction -Base.length(A::DistributedSum) = length(A.data) -Base.firstindex(A::DistributedSum) = firstindex(A.data) -Base.lastindex(A::DistributedSum) = lastindex(A.data) -Base.getindex(A::DistributedSum, args...) = getindex(A.data, args...) -Base.iterate(A::DistributedSum, args...) = iterate(A.data, args...) 
- -# Apply the sum -function (A::DistributedSum)(v) - return @distributed (+) for An in A - fetch(An)(v) - end -end diff --git a/examples/distributed_itensormap_tutorial/src/MatrixMaps.jl b/examples/distributed_itensormap_tutorial/src/MatrixMaps.jl deleted file mode 100644 index 5e808c1..0000000 --- a/examples/distributed_itensormap_tutorial/src/MatrixMaps.jl +++ /dev/null @@ -1,6 +0,0 @@ -# Wrapper of Matrix that is callable as `A(v)` -struct MatrixMap{T} - data::Matrix{T} -end -(A::MatrixMap)(v) = A.data * v -Base.:+(A::MatrixMap, B::MatrixMap) = MatrixMap(A.data + B.data) diff --git a/examples/hubbard_conserve_momentum/electronk.jl b/examples/hubbard_conserve_momentum/electronk.jl deleted file mode 100644 index 953caf5..0000000 --- a/examples/hubbard_conserve_momentum/electronk.jl +++ /dev/null @@ -1,67 +0,0 @@ -import ITensors: space, state, op, has_fermion_string - -function space( - ::SiteType"ElectronK", - n::Int; - conserve_qns=false, - conserve_sz=conserve_qns, - conserve_nf=conserve_qns, - conserve_nfparity=conserve_qns, - conserve_ky=false, - qnname_sz="Sz", - qnname_nf="Nf", - qnname_nfparity="NfParity", - qnname_ky="Ky", - modulus_ky=nothing, - # Deprecated - conserve_parity=nothing, -) - if !isnothing(conserve_parity) - conserve_nfparity = conserve_parity - end - if conserve_ky && conserve_sz && conserve_nf - mod = (n - 1) % modulus_ky - mod2 = (2 * mod) % modulus_ky - return [ - QN((qnname_nf, 0, -1), (qnname_sz, 0), (qnname_ky, 0, modulus_ky)) => 1 - QN((qnname_nf, 1, -1), (qnname_sz, 1), (qnname_ky, mod, modulus_ky)) => 1 - QN((qnname_nf, 1, -1), (qnname_sz, -1), (qnname_ky, mod, modulus_ky)) => 1 - QN((qnname_nf, 2, -1), (qnname_sz, 0), (qnname_ky, mod2, modulus_ky)) => 1 - ] - elseif conserve_ky - error("Cannot conserve ky without conserving sz and nf") - elseif conserve_sz && conserve_nf - return [ - QN((qnname_nf, 0, -1), (qnname_sz, 0)) => 1 - QN((qnname_nf, 1, -1), (qnname_sz, +1)) => 1 - QN((qnname_nf, 1, -1), (qnname_sz, -1)) => 1 - QN((qnname_nf, 2, -1), (qnname_sz, 0)) => 1 - ] - elseif conserve_nf - return [ - QN(qnname_nf, 0, -1) => 1 - QN(qnname_nf, 1, -1) => 2 - QN(qnname_nf, 2, -1) => 1 - ] - elseif conserve_sz - return [ - QN((qnname_sz, 0), (qnname_nfparity, 0, -2)) => 1 - QN((qnname_sz, +1), (qnname_nfparity, 1, -2)) => 1 - QN((qnname_sz, -1), (qnname_nfparity, 1, -2)) => 1 - QN((qnname_sz, 0), (qnname_nfparity, 0, -2)) => 1 - ] - elseif conserve_nfparity - return [ - QN(qnname_nfparity, 0, -2) => 1 - QN(qnname_nfparity, 1, -2) => 2 - QN(qnname_nfparity, 0, -2) => 1 - ] - end - return 4 -end - -state(statename::StateName, ::SiteType"ElectronK") = state(statename, SiteType("Electron")) -op(opname::OpName, ::SiteType"ElectronK") = op(opname, SiteType("Electron")) -function has_fermion_string(opname::OpName, ::SiteType"ElectronK") - return has_fermion_string(opname, SiteType("Electron")) -end diff --git a/examples/hubbard_conserve_momentum/hubbard_ky.jl b/examples/hubbard_conserve_momentum/hubbard_ky.jl deleted file mode 100644 index 387f000..0000000 --- a/examples/hubbard_conserve_momentum/hubbard_ky.jl +++ /dev/null @@ -1,79 +0,0 @@ -using ITensors - -function hubbard_ky_hopping(; Nx::Int, Ny::Int, t=1.0) - opsums = OpSum[] - - opsum = OpSum() - for x in 0:(Nx - 1) - for ky in 0:(Ny - 1) - i = x * Ny + ky + 1 - ϵ = -2 * t * cos((2 * π / Ny) * ky) - if abs(ϵ) > 1e-12 - opsum += ϵ, "n↑", i - opsum += ϵ, "n↓", i - end - end - end - push!(opsums, opsum) - - opsum = OpSum() - for x in 0:(Nx - 2) - for ky in 0:(Ny - 1) - i = x * Ny + ky + 1 - j = (x + 1) * Ny + 
ky + 1 - opsum += -t, "c†↑", i, "c↑", j - opsum += -t, "c†↑", j, "c↑", i - opsum += -t, "c†↓", i, "c↓", j - opsum += -t, "c†↓", j, "c↓", i - end - end - push!(opsums, opsum) - - return opsums -end - -# function hubbard_ky_interactions(; Nx::Int, Ny::Int, U) -# opsums = OpSum[] -# for py in 1:Ny -# for qy in 1:Ny -# opsum = OpSum() -# for x in 1:Nx -# for ky in 1:Ny -# i = (x - 1) * Ny + mod1(ky + qy + Ny - 1, Ny) -# j = (x - 1) * Ny + mod1(py - qy + Ny + 1, Ny) -# k = (x - 1) * Ny + py -# l = (x - 1) * Ny + ky -# opsum += U / Ny, "c†↓", i, "c†↑", j, "c↑", k, "c↓", l -# end -# end -# push!(opsums, opsum) -# end -# end -# return opsums -# end - -function hubbard_ky_interactions(; Nx::Int, Ny::Int, U) - opsums = OpSum[] - for py in 0:(Ny - 1) - for qy in 0:(Ny - 1) - opsum = OpSum() - for x in 0:(Nx - 1) - for ky in 0:(Ny - 1) - s1 = x * Ny + (ky + qy + Ny) % Ny + 1 - s2 = x * Ny + (py - qy + Ny) % Ny + 1 - s3 = x * Ny + py + 1 - s4 = x * Ny + ky + 1 - opsum += (U / Ny), "Cdagdn", s1, "Cdagup", s2, "Cup", s3, "Cdn", s4 - end - end - push!(opsums, opsum) - end - end - return opsums -end - -function hubbard_ky(; Nx::Int, Ny::Int, t=1.0, U) - opsum_hopping = hubbard_ky_hopping(; Nx, Ny, t) - opsum_interactions = hubbard_ky_interactions(; Nx, Ny, U) - return [opsum_hopping; opsum_interactions] -end diff --git a/examples/hubbard_conserve_momentum/mpi_dmrg.jl b/examples/hubbard_conserve_momentum/mpi_dmrg.jl deleted file mode 100644 index 132bbb6..0000000 --- a/examples/hubbard_conserve_momentum/mpi_dmrg.jl +++ /dev/null @@ -1,81 +0,0 @@ -using ITensors -using ITensorParallel -using MPI -using Random - -MPI.Init() - -ITensors.BLAS.set_num_threads(1) -ITensors.Strided.disable_threads() -ITensors.disable_threaded_blocksparse() - -include("electronk.jl") -include("hubbard_ky.jl") -include("split_vec.jl") - -U = 10.0 -Nx = 2 -Ny = 2 -N = Nx * Ny - -which_proc = MPI.Comm_rank(MPI.COMM_WORLD) + 1 - -@show which_proc, U, Nx, Ny -sites = siteinds("ElectronK", N; conserve_qns=true, conserve_ky=true, modulus_ky=Ny) - -opsums = hubbard_ky(; Nx, Ny, U) - -@show which_proc, length.(opsums) - -# TODO: randomly shuffle the MPOs -# TODO: only construct the MPOs that are needed by the current process -Hs = [splitblocks(linkinds, MPO(opsum, sites)) for opsum in opsums] - -@show which_proc, length(Hs) - -nprocs = MPI.Comm_size(MPI.COMM_WORLD) - -@show which_proc, nprocs - -Hss = split_vec(Hs, nprocs) - -@show which_proc, length.(Hss) -@show which_proc, [maxlinkdim.(Hs) for Hs in Hss] - -Hn = Hss[which_proc] - -@show which_proc, length(Hss[which_proc]) - -ITensors.checkflux.(Hss[which_proc]) -@show flux.(Hss[which_proc]) - -PH = MPISum(ProjMPOSum(Hss[which_proc])) - -init_state = Vector{String}(undef, N) -for i in 1:N - x = (i - 1) ÷ Ny - y = (i - 1) % Ny - if x % 2 == 0 - if y % 2 == 0 - init_state[i] = "Up" - else - init_state[i] = "Dn" - end - else - if y % 2 == 0 - init_state[i] = "Dn" - else - init_state[i] = "Up" - end - end -end - -Random.seed!(1234) - -psi0 = randomMPS(sites, init_state; linkdims=20) -ITensors.checkflux(psi0) -@show flux(psi0) -dmrg_kwargs = (nsweeps=10, maxdim=[10, 20, 50, 100], cutoff=1e-5, noise=1e-4) -energy, psi = @time dmrg(PH, psi0; dmrg_kwargs...) 
- -MPI.Finalize() diff --git a/examples/hubbard_conserve_momentum/split_vec.jl b/examples/hubbard_conserve_momentum/split_vec.jl deleted file mode 100644 index 8e68241..0000000 --- a/examples/hubbard_conserve_momentum/split_vec.jl +++ /dev/null @@ -1,18 +0,0 @@ -# References: -# https://github.com/JuliaML/MLUtils.jl: `MLUtils.chunk` -# https://discourse.julialang.org/t/split-vector-into-n-potentially-unequal-length-subvectors/73548 -# https://github.com/m3g/ChunkSplitters.jl -# https://github.com/JuliaFolds/SplittablesBase.jl -function split_vec(x::Vector, npartitions::Int) - partition_lengths = zeros(Int, npartitions) - which_part = 1 - for j in 1:length(x) - partition_lengths[which_part] += 1 - which_part = mod1(which_part + 1, npartitions) - end - partition_stops = [0; accumulate(+, partition_lengths)] - partition_ranges = [ - (partition_stops[j] + 1):partition_stops[j + 1] for j in 1:(length(partition_stops) - 1) - ] - return [x[partition_range] for partition_range in partition_ranges] -end diff --git a/examples/mpi_itensormap_tutorial/01_simple_matvec.jl b/examples/mpi_itensormap_tutorial/01_simple_matvec.jl deleted file mode 100644 index 6291b8c..0000000 --- a/examples/mpi_itensormap_tutorial/01_simple_matvec.jl +++ /dev/null @@ -1,27 +0,0 @@ -using MPI - -MPI.Init() - -comm = MPI.COMM_WORLD -N = 4 -root = 0 - -if MPI.Comm_rank(comm) == root - v = randn(N) -else - v = Vector{Float64}(undef, N) -end - -MPI.Bcast!(v, root, comm) - -M = randn(N, N) - -Mv = M * v - -@show Mv - -Mv_total = MPI.Reduce(Mv, +, root, comm) - -@show MPI.Comm_rank(comm), Mv_total - -MPI.Finalize() diff --git a/examples/mpi_itensormap_tutorial/03_mpisum_matrixmap.jl b/examples/mpi_itensormap_tutorial/03_mpisum_matrixmap.jl deleted file mode 100644 index 5076e34..0000000 --- a/examples/mpi_itensormap_tutorial/03_mpisum_matrixmap.jl +++ /dev/null @@ -1,15 +0,0 @@ -using MPI - -MPI.Init() - -include(joinpath("src", "MPISum.jl")) -include(joinpath("src", "MatrixMaps.jl")) - -n = 4 -v = ones(n) -A = MPISum(MatrixMap(randn(n, n))) -Av = A(v) - -@show MPI.Comm_rank(A.comm), Av - -MPI.Finalize() diff --git a/examples/mpi_itensormap_tutorial/src/MPISum.jl b/examples/mpi_itensormap_tutorial/src/MPISum.jl deleted file mode 100644 index d950678..0000000 --- a/examples/mpi_itensormap_tutorial/src/MPISum.jl +++ /dev/null @@ -1,11 +0,0 @@ -struct MPISum{T} - data::T - comm::MPI.Comm -end - -MPISum(data::T, comm=MPI.COMM_WORLD) where {T} = MPISum{T}(data, comm) - -# Apply the sum -function (A::MPISum)(v) - return MPI.Allreduce(A.data(v), +, A.comm) -end diff --git a/examples/mpi_itensormap_tutorial/src/MatrixMaps.jl b/examples/mpi_itensormap_tutorial/src/MatrixMaps.jl deleted file mode 100644 index 5e808c1..0000000 --- a/examples/mpi_itensormap_tutorial/src/MatrixMaps.jl +++ /dev/null @@ -1,6 +0,0 @@ -# Wrapper of Matrix that is callable as `A(v)` -struct MatrixMap{T} - data::Matrix{T} -end -(A::MatrixMap)(v) = A.data * v -Base.:+(A::MatrixMap, B::MatrixMap) = MatrixMap(A.data + B.data) diff --git a/src/ITensorParallel.jl b/src/ITensorParallel.jl index b2a7a94..d372f67 100644 --- a/src/ITensorParallel.jl +++ b/src/ITensorParallel.jl @@ -1,19 +1,36 @@ module ITensorParallel +using Accessors using Distributed using Folds using MPI using ITensors using ITensors.NDTensors +using ITensors: AbstractSum + import Base: eltype, length, size -import ITensors: product, position!, noiseterm, lproj, rproj, nsite, replaceind!, linkind +import ITensors: + disk, + linkind, + lproj, + noiseterm, + nsite, + orthogonalize!, + position!, + 
product, + replaceind!, + replacebond!, + rproj, + set_terms, + terms -include("mpi_extensions.jl") include("partition.jl") -include("parallelsum.jl") -include("mpisum.jl") +include("foldssum.jl") +include("distributedsum.jl") +include("mpi_extensions.jl") +include("mpisumterm.jl") -export DistributedSum, SequentialSum, ThreadedSum, MPISum, partition +export DistributedSum, SequentialSum, ThreadedSum, MPISumTerm, distribute, partition end diff --git a/src/broken/distributedsum.jl b/src/broken/distributedsum.jl deleted file mode 100644 index 7fc71fc..0000000 --- a/src/broken/distributedsum.jl +++ /dev/null @@ -1,62 +0,0 @@ -# WARNING: This is currently broken! -struct DistributedSum{T} - terms::Vector{T} -end -terms(sum::DistributedSum) = sum.terms - -function DistributedSum(f::Function, n::Integer) - # Assign the terms of the sum to the first `n` workers. - return DistributedSum([@spawnat(workers()[ni], f(ni)) for ni in 1:n]) -end - -nsite(sum::DistributedSum) = nsite(fetch(terms(sum)[1])) - -Base.length(sum::DistributedSum) = length(fetch(terms(sum)[1])) - -Base.getindex(sum::DistributedSum, n::Int) = getindex(terms(sum), n) -Base.setindex!(sum::DistributedSum, x, n::Int) = setindex!(terms(sum), x, n) - -function product(sum::DistributedSum, v::ITensor)::ITensor - # TODO: Use `Folds.sum`: - # Folds.sum(term -> fetch(term)(v), terms(sum), DistributedEx()) - return @distributed (+) for M in terms(sum) - fetch(M)(v) - end -end - -function Base.eltype(sum::DistributedSum) - elT = eltype(terms(sum)[1]) - for n in 2:length(terms(sum)) - elT = promote_type(elT, eltype(terms(sum)[n])) - end - return elT -end - -(sum::DistributedSum)(v::ITensor) = product(sum, v) - -Base.size(sum::DistributedSum) = size(terms(sum)[1]) - -function position!(sum::DistributedSum, psi::MPS, pos::Int) - # TODO: Try `Folds.map`, i.e.: - # new_terms = Folds.map(terms(sum), DistributedEx()) do term - # return @spawnat(worker(term), position!(fetch(term), psi, pos)) - # end - # return set_terms(sum, new_terms) - @distributed for n in 1:length(terms(sum)) - M_fetched = fetch(sum[n]) - M_fetched = position!(M_fetched, psi, pos) - sum[n] = @spawnat(workers()[n], M_fetched) - end - return sum -end - -## XXX: Implement this. -## function noiseterm(sum::DistributedSum, -## phi::ITensor, -## dir::String) -## nts = fill(ITensor(), Threads.nthreads()) -## Threads.@threads for M in terms(sum) -## nts[Threads.threadid()] += noiseterm(M, phi, dir) -## end -## return sum(nts) -## end diff --git a/src/distributedsum.jl b/src/distributedsum.jl new file mode 100644 index 0000000..bd8e3c3 --- /dev/null +++ b/src/distributedsum.jl @@ -0,0 +1,32 @@ +# Functionality for distributed terms of a sum +# onto seperate processes +function distribute(terms::Vector) + return map(term -> @spawnat(:any, term), terms) +end + +distribute(terms::Vector{Future}) = terms + +function distribute(terms::Vector{MPO}) + return distribute(ProjMPO.(terms)) +end + +function DistributedSum(terms::Vector; executor_kwargs...) + return FoldsSum(distribute(terms), SequentialEx(; executor_kwargs...)) +end + +function position!(term::Future, v::MPS, pos::Int) + return @spawnat term.where position!(fetch(term), v, pos) +end + +function product(term::Future, v::ITensor) + return @fetchfrom term.where fetch(term)(v) +end +(term::Future)(v::ITensor) = product(term, v) + +function noiseterm(term::Future, v::ITensor, dir::String) + return @fetchfrom term.where noiseterm(fetch(term), v, dir) +end + +function disk(term::Future; disk_kwargs...) 
+ return @spawnat term.where disk(fetch(term); disk_kwargs...) +end diff --git a/src/foldssum.jl b/src/foldssum.jl new file mode 100644 index 0000000..d47d7a3 --- /dev/null +++ b/src/foldssum.jl @@ -0,0 +1,78 @@ +struct FoldsSum{T,Ex} <: AbstractSum + terms::Vector{T} + executor::Ex +end + +## Accessors +terms(sum::FoldsSum) = sum.terms +executor(sum::FoldsSum) = sum.executor +set_terms(sum::FoldsSum, terms) = (@set sum.terms = terms) +set_executor(sum::FoldsSum, executor) = (@set sum.executor = executor) + +## Constructors +function FoldsSum{T,Ex}(terms::Vector; executor_kwargs...) where {T,Ex} + return FoldsSum(terms, Ex(; executor_kwargs...)) +end + +function FoldsSum{<:Any,Ex}(terms::Vector; executor_kwargs...) where {Ex} + return FoldsSum{eltype(terms),Ex}(terms; executor_kwargs...) +end + +# Default to `ThreadedEx` +function FoldsSum{T}(terms::Vector; executor_kwargs...) where {T} + return FoldsSum{T,ThreadedEx}(terms; executor_kwargs...) +end + +# Default to `ThreadedEx` +function FoldsSum(terms::Vector; executor_kwargs...) + return FoldsSum{eltype(terms)}(terms; executor_kwargs...) +end + +# Conversion from `MPO` +function FoldsSum{T,Ex}(mpos::Vector{MPO}; executor_kwargs...) where {T,Ex} + return FoldsSum{T,Ex}(T.(mpos); executor_kwargs...) +end +function FoldsSum{T,Ex}(mpos::MPO...; executor_kwargs...) where {T,Ex} + return FoldsSum{T,Ex}([mpos...]; executor_kwargs...) +end + +# Conversion from `MPO` +function FoldsSum{T}(mpos::Vector{MPO}; executor_kwargs...) where {T} + return FoldsSum{T}(T.(mpos); executor_kwargs...) +end +function FoldsSum{T}(mpos::MPO...; executor_kwargs...) where {T} + return FoldsSum{T}([mpos...]; executor_kwargs...) +end + +# Conversion from `MPO`, default to `ProjMPO` +function FoldsSum{<:Any,Ex}(mpos::Vector{MPO}; executor_kwargs...) where {Ex} + return FoldsSum{ProjMPO,Ex}(mpos; executor_kwargs...) +end +function FoldsSum{<:Any,Ex}(mpos::MPO...; executor_kwargs...) where {Ex} + return FoldsSum{<:Any,T}([mpos...]; executor_kwargs...) +end + +# Conversion from `MPO`, default to `ProjMPO` +function FoldsSum(mpos::Vector{MPO}; executor_kwargs...) + return FoldsSum{ProjMPO}(mpos; executor_kwargs...) +end +function FoldsSum(mpos::MPO...; executor_kwargs...) + return FoldsSum([mpos...]; executor_kwargs...) 
+end + +## Necessary operations +function product(sum::FoldsSum, v::ITensor) + return Folds.sum(term -> term(v), terms(sum), executor(sum)) +end + +function position!(sum::FoldsSum, v::MPS, pos::Int) + new_terms = Folds.map(term -> position!(term, v, pos), terms(sum), executor(sum)) + return set_terms(sum, new_terms) +end + +function noiseterm(sum::FoldsSum, v::ITensor, dir::String) + return Folds.sum(term -> noiseterm(term, v, dir), terms(sum), executor(sum)) +end + +const ThreadedSum{T} = FoldsSum{T,ThreadedEx} +const SequentialSum{T} = FoldsSum{T,SequentialEx} diff --git a/src/mpisum.jl b/src/mpisum.jl deleted file mode 100644 index ec98653..0000000 --- a/src/mpisum.jl +++ /dev/null @@ -1,110 +0,0 @@ -struct MPISum{T} - term::T - comm::MPI.Comm -end -term(sum::MPISum) = sum.term -comm(sum::MPISum) = sum.comm - -MPISum(term::T, comm=MPI.COMM_WORLD) where {T} = MPISum{T}(term, comm) - -MPISum(mpo::MPO, comm=MPI.COMM_WORLD) = MPISum(ProjMPO(mpo), comm) - -nsite(sum::MPISum) = nsite(term(sum)) - -length(sum::MPISum) = length(term(sum)) - -function product(sum::MPISum, v::ITensor) - return allreduce(term(sum)(v), +, comm(sum)) -end - -function eltype(sum::MPISum) - return eltype(term(sum)) -end - -(sum::MPISum)(v::ITensor) = product(sum, v) - -size(sum::MPISum) = size(term(sum)) - -function position!(sum::MPISum, psi::MPS, pos::Int) - makeL!(sum, psi, pos - 1) - makeR!(sum, psi, pos + nsite(sum)) - return sum -end - -function makeL!(sum::MPISum, psi::MPS, k::Int) - _makeL!(sum, psi, k) - return sum -end - -function makeR!(sum::MPISum, psi::MPS, k::Int) - _makeR!(sum, psi, k) - return sum -end - -function _makeL!(sum::MPISum, psi::MPS, k::Int) - # Save the last `L` that is made to help with caching - # for DiskProjMPO - sum_term = term(sum) - ll = sum_term.lpos - if ll ≥ k - # Special case when nothing has to be done. - # Still need to change the position if lproj is - # being moved backward. - sum_term.lpos = k - return nothing - end - # Make sure ll is at least 0 for the generic logic below - ll = max(ll, 0) - L = lproj(sum_term) - while ll < k - # sync the linkindex across processes - mylink = linkind(psi, ll + 1) - otherlink = MPI.bcast(mylink, 0, comm(sum)) - replaceind!(psi[ll + 1], mylink, otherlink) - replaceind!(psi[ll + 2], mylink, otherlink) - - L = L * psi[ll + 1] * sum_term.H[ll + 1] * dag(prime(psi[ll + 1])) - sum_term.LR[ll + 1] = L - ll += 1 - end - # Needed when moving lproj backward. - sum_term.lpos = k - return L -end - -function _makeR!(sum::MPISum, psi::MPS, k::Int) - # Save the last `R` that is made to help with caching - # for DiskProjMPO - sum_term = term(sum) - rl = sum_term.rpos - if rl ≤ k - # Special case when nothing has to be done. - # Still need to change the position if rproj is - # being moved backward. - sum_term.rpos = k - return nothing - end - N = length(sum_term.H) - # Make sure rl is no bigger than `N + 1` for the generic logic below - rl = min(rl, N + 1) - R = rproj(sum_term) - while rl > k - #sync linkindex across processes - mylink = linkind(psi, rl - 2) - otherlink = MPI.bcast(mylink, 0, comm(sum)) - replaceind!(psi[rl - 2], mylink, otherlink) - replaceind!(psi[rl - 1], mylink, otherlink) - - R = R * psi[rl - 1] * sum_term.H[rl - 1] * dag(prime(psi[rl - 1])) - sum_term.LR[rl - 1] = R - rl -= 1 - end - sum_term.rpos = k - return R -end - -function noiseterm(sum::MPISum, phi::ITensor, dir::String) - ##ToDo: I think the logic here is wrong. - ##The noiseterm should be calculated on the reduced P*v and then broadcasted? 
- return allreduce(noiseterm(term(sum), phi, dir), +, comm(sum)) -end diff --git a/src/mpisumterm.jl b/src/mpisumterm.jl new file mode 100644 index 0000000..59bb884 --- /dev/null +++ b/src/mpisumterm.jl @@ -0,0 +1,70 @@ +struct MPISumTerm{T} + term::T + comm::MPI.Comm +end +term(sumterm::MPISumTerm) = sumterm.term +comm(sumterm::MPISumTerm) = sumterm.comm + +set_term(sumterm::MPISumTerm, term) = (@set sumterm.term = term) +set_comm(sumterm::MPISumTerm, comm) = (@set sumterm.comm = comm) + +MPISumTerm(mpo::MPO, comm::MPI.Comm) = MPISumTerm(ProjMPO(mpo), comm) + +nsite(sumterm::MPISumTerm) = nsite(term(sumterm)) + +length(sumterm::MPISumTerm) = length(term(sumterm)) + +disk(sumterm::MPISumTerm) = set_term(sumterm, disk(term(sumterm))) + +function product(sumterm::MPISumTerm, v::ITensor) + return allreduce(term(sumterm)(v), +, comm(sumterm)) +end + +function eltype(sumterm::MPISumTerm) + return eltype(term(sumterm)) +end + +(sumterm::MPISumTerm)(v::ITensor) = product(sumterm, v) + +size(sumterm::MPISumTerm) = size(term(sumterm)) + +# Replace the bond and then broadcast one of the results +# to each process. This ensures that the link indices +# are consistent across processes and that the tensors +# match numerically. Floating point precision errors +# have been seen to build up enough to make their +# block sparse structure inconsistent, especially +# when multithreading block sparse operations are involved. +function replacebond!(sumterm::MPISumTerm, M::MPS, b::Int, v::ITensor; kwargs...) + spec = nothing + M_ortho_lims = nothing + if MPI.Comm_rank(comm(sumterm)) == 0 + spec = replacebond!(M, b, v; kwargs...) + M_ortho_lims = ortho_lims(M) + end + M_b1 = MPI.bcast(M[b], 0, comm(sumterm)) + M_b2 = MPI.bcast(M[b + 1], 0, comm(sumterm)) + M_ortho_lims = MPI.bcast(M_ortho_lims, 0, comm(sumterm)) + spec = MPI.bcast(spec, 0, comm(sumterm)) + M[b] = M_b1 + M[b + 1] = M_b2 + set_ortho_lims!(M, M_ortho_lims) + return spec +end + +function orthogonalize!(sumterm::MPISumTerm, v::MPS, pos::Int; kwargs...) + if MPI.Comm_rank(comm(sumterm)) == 0 + v = orthogonalize!(v, pos; kwargs...) + end + return MPI.bcast(v, 0, comm(sumterm)) +end + +function position!(sumterm::MPISumTerm, v::MPS, pos::Int) + return set_term(sumterm, position!(term(sumterm), v, pos)) +end + +function noiseterm(sumterm::MPISumTerm, v::ITensor, dir::String) + # TODO: Check if this logic is correct. + # Should the noiseterm instead be calculated on the reduced `sumterm(v)` and then broadcasted? + return allreduce(noiseterm(term(sumterm), v, dir), +, comm(sumterm)) +end diff --git a/src/parallelsum.jl b/src/parallelsum.jl deleted file mode 100644 index b0fd74a..0000000 --- a/src/parallelsum.jl +++ /dev/null @@ -1,118 +0,0 @@ -# TODO: Make `AbstractSum` subtype -struct ParallelSum{T,Ex} - terms::Vector{T} - executor::Ex -end - -terms(sum::ParallelSum) = sum.terms -executor(sum::ParallelSum) = sum.executor -set_terms(sum::ParallelSum, terms) = ParallelSum(terms, executor(sum)) -set_executor(sum::ParallelSum, executor) = ParallelSum(terms(sum), executor) - -function ParallelSum{T,Ex}(terms::Vector; executor_kwargs...) where {T,Ex} - return ParallelSum(terms, Ex(; executor_kwargs...)) -end - -function ParallelSum{<:Any,Ex}(terms::Vector; executor_kwargs...) where {Ex} - return ParallelSum{eltype(terms),Ex}(terms; executor_kwargs...) -end - -# Default to `ThreadedEx` -function ParallelSum{T}(terms::Vector; executor_kwargs...) where {T} - return ParallelSum{T,ThreadedEx}(terms; executor_kwargs...) 
-end - -# Default to `ThreadedEx` -function ParallelSum(terms::Vector; executor_kwargs...) - return ParallelSum{eltype(terms)}(terms; executor_kwargs...) -end - -# Conversion from `MPO` -function ParallelSum{T,Ex}(mpos::Vector{MPO}; executor_kwargs...) where {T,Ex} - return ParallelSum{T,Ex}(T.(mpos); executor_kwargs...) -end -function ParallelSum{T,Ex}(mpos::MPO...; executor_kwargs...) where {T,Ex} - return ParallelSum{T,Ex}([mpos...]; executor_kwargs...) -end - -# Conversion from `MPO` -function ParallelSum{T}(mpos::Vector{MPO}; executor_kwargs...) where {T} - return ParallelSum{T}(T.(mpos); executor_kwargs...) -end -function ParallelSum{T}(mpos::MPO...; executor_kwargs...) where {T} - return ParallelSum{T}([mpos...]; executor_kwargs...) -end - -# Conversion from `MPO`, default to `ProjMPO` -function ParallelSum{<:Any,Ex}(mpos::Vector{MPO}; executor_kwargs...) where {Ex} - return ParallelSum{ProjMPO,Ex}(mpos; executor_kwargs...) -end -function ParallelSum{<:Any,Ex}(mpos::MPO...; executor_kwargs...) where {Ex} - return ParallelSum{<:Any,T}([mpos...]; executor_kwargs...) -end - -# Conversion from `MPO`, default to `ProjMPO` -function ParallelSum(mpos::Vector{MPO}; executor_kwargs...) - return ParallelSum{ProjMPO}(mpos; executor_kwargs...) -end -function ParallelSum(mpos::MPO...; executor_kwargs...) - return ParallelSum([mpos...]; executor_kwargs...) -end - -# TODO: Replace with `AbstractSum` once we merge: -# https://github.com/ITensor/ITensors.jl/pull/1046 -nsite(sum::ParallelSum) = nsite(terms(sum)[1]) -length(sum::ParallelSum) = length(terms(sum)[1]) -function eltype(sum::ParallelSum) - elT = eltype(terms(sum)[1]) - for n in 2:length(terms(sum)) - elT = promote_type(elT, eltype(terms(sum)[n])) - end - return elT -end -(sum::ParallelSum)(v::ITensor) = product(sum, v) -size(sum::ParallelSum) = size(terms(sum)[1]) - -function product(sum::ParallelSum, v::ITensor) - return Folds.sum(term -> term(v), terms(sum), executor(sum)) -end - -function position!(sum::ParallelSum, psi::MPS, pos::Int) - new_terms = Folds.map(term -> position!(term, psi, pos), terms(sum), executor(sum)) - return set_terms(sum, new_terms) -end - -# TODO: Remove once we merge: -# https://github.com/ITensor/ITensors.jl/pull/1047 -function position!(sum::ParallelSum{<:Any,<:DistributedEx}, psi::MPS, pos::Int) - threaded_sum = position!(set_executor(sum, ThreadedEx()), psi, pos) - return set_executor(threaded_sum, executor(sum)) -end - -function noiseterm(sum::ParallelSum, phi::ITensor, dir::String) - return Folds.sum(term -> noiseterm(term, phi, dir), terms(sum), executor(sum)) -end - -function disk(sum::ParallelSum; disk_kwargs...) - return set_terms(sum, [disk(term; disk_kwargs...) 
for term in terms(sum)]) -end - -const ThreadedSum{T} = ParallelSum{T,ThreadedEx} -const DistributedSum{T} = ParallelSum{T,DistributedEx} -const SequentialSum{T} = ParallelSum{T,SequentialEx} - -# Functionality for sums where terms are distributed remotely -# function product(sum::DistributedSum{<:Future}, v::ITensor) -# return Folds.sum(term -> fetch(term)(v), terms(sum), executor(sum)) -# end -# -# function position!(sum::DistributedSum{<:Future}, psi::MPS, pos::Int) -# new_terms = Folds.map(terms(sum), executor(sum)) do term -# return @spawnat(term.where, position!(fetch(term), psi, pos)) -# end -# return set_terms(sum, new_terms) -# end -# -# function noiseterm(sum::ParallelSum{<:Future}, phi::ITensor, dir::String) -# return Folds.sum(term -> noiseterm(fetch(term), phi, dir), terms(sum), executor(sum)) -# end diff --git a/src/references/parallel_projmpo.jl b/src/references/parallel_projmpo.jl deleted file mode 100644 index e6fa893..0000000 --- a/src/references/parallel_projmpo.jl +++ /dev/null @@ -1,24 +0,0 @@ -import ITensors: product, position!, noiseterm - -function product(P::ProjMPOSum, v::ITensor)::ITensor - Pvs = fill(ITensor(), Threads.nthreads()) - Threads.@threads for M in P.pm - Pvs[Threads.threadid()] += product(M, v) - end - return sum(Pvs) -end - -function position!(P::ProjMPOSum, psi::MPS, pos::Int) - Threads.@threads for M in P.pm - position!(M, psi, pos) - end - return P -end - -function noiseterm(P::ProjMPOSum, phi::ITensor, dir::String) - nts = fill(ITensor(), Threads.nthreads()) - Threads.@threads for M in P.pm - nts[Threads.threadid()] += noiseterm(M, phi, dir) - end - return sum(nts) -end diff --git a/test/runtests.jl b/test/runtests.jl index 782d251..53791ef 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,34 +3,8 @@ using MPI using Test @testset "ITensorParallel.jl" begin - examples_dir = joinpath(pkgdir(ITensorParallel), "examples") - - example_files = filter( - f -> startswith(f, "01_") && endswith(f, ".jl"), readdir(examples_dir) - ) - @testset "Threaded/Distributed example $example_file" for example_file in example_files - include(joinpath(examples_dir, example_file)) - Nx = 8 - Ny = 4 - maxdim = 20 - Sums = (ThreadedSum, DistributedSum, SequentialSum) - @testset "Sum type $Sum" for Sum in Sums - println("Running parallel test with $(Sum)") - main(; Nx, Ny, maxdim, Sum) - end - end - - example_files = ["02_mpi_run.jl"] - @testset "MPI example $example_file" for example_file in example_files - println("Running MPI parallel test") - nprocs = 2 - Nx = 8 - Ny = 4 - maxdim = 20 - mpiexec() do exe # MPI wrapper - run( - `$exe -n $(nprocs) $(Base.julia_cmd()) $(joinpath(examples_dir, example_file)) --Nx $(Nx) --Ny $(Ny) --maxdim $(maxdim)`, - ) - end + test_files = filter(file -> startswith("test_")(file) && endswith(".jl")(file), readdir()) + @testset "Test file $(test_file)" for test_file in test_files + include(test_file) end end diff --git a/test/test_mpi_example.jl b/test/test_mpi_example.jl new file mode 100644 index 0000000..6f74747 --- /dev/null +++ b/test/test_mpi_example.jl @@ -0,0 +1,27 @@ +using MPI +using ITensorParallel +using Test + +@testset "Test $(@__FILE__)" begin + examples_dir = joinpath(pkgdir(ITensorParallel), "examples") + example_files = ["02_mpi_run.jl"] + @testset "MPI example $example_file, threaded block sparse $threaded_blocksparse, write-to-disk $disk" for example_file in + example_files, + threaded_blocksparse in (false, true), + disk in (false, true) + + println( + "\nRunning MPI parallel test with threaded block sparse 
$threaded_blocksparse, write-to-disk $disk", + ) + nprocs = 2 + Nx = 8 + Ny = 4 + nsweeps = 2 + maxdim = 20 + mpiexec() do exe # MPI wrapper + run( + `$exe -n $(nprocs) $(Base.julia_cmd()) --threads $(Threads.nthreads()) $(joinpath(examples_dir, example_file)) --Nx $(Nx) --Ny $(Ny) --nsweeps $(nsweeps) --maxdim $(maxdim) --disk $(disk) --threaded_blocksparse $(threaded_blocksparse)`, + ) + end + end +end diff --git a/test/test_sequential_threaded_distributed_example.jl b/test/test_sequential_threaded_distributed_example.jl new file mode 100644 index 0000000..e6d556f --- /dev/null +++ b/test/test_sequential_threaded_distributed_example.jl @@ -0,0 +1,28 @@ +using ITensorParallel +using Test + +@testset "Test $(@__FILE__)" begin + examples_dir = joinpath(pkgdir(ITensorParallel), "examples") + + example_files = filter( + f -> startswith(f, "01_") && endswith(f, ".jl"), readdir(examples_dir) + ) + @testset "Threaded/Distributed example $example_file" for example_file in example_files + include(joinpath(examples_dir, example_file)) + Nx = 8 + Ny = 4 + nsweeps = 2 + maxdim = 20 + Sums = (SequentialSum, ThreadedSum, DistributedSum) + @testset "Sum type $Sum, threaded block sparse $threaded_blocksparse, write-to-disk $disk" for Sum in + Sums, + threaded_blocksparse in (false, true), + disk in (false, true) + + println( + "\nRunning parallel test with $(Sum), threaded block sparse $threaded_blocksparse, write-to-disk $disk", + ) + main(; Nx, Ny, nsweeps, maxdim, Sum, disk, threaded_blocksparse) + end + end +end
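As a closing sketch of how the pieces added in this diff fit together (`partition`, `MPISumTerm` from `src/mpisumterm.jl`, and the `dmrg` call exercised in `examples/02_mpi_mpo_sum_2d_hubbard_conserve_momentum.jl`), here is a minimal MPI driver reduced to a 1D Heisenberg chain. The file name and all variable names are illustrative, the default partitioning of `partition` is assumed, and this is a sketch rather than a tested example from the repository.

```julia
# Illustrative sketch only, not a tested example from this repository.
# Run with, e.g.: mpiexecjl -n 2 julia mpi_heisenberg_sketch.jl
using MPI
using ITensors
using ITensorParallel

MPI.Init()

N = 16
sites = siteinds("S=1/2", N; conserve_qns=true)
# Broadcast so every process uses identical site indices.
sites = MPI.bcast(sites, 0, MPI.COMM_WORLD)

function heisenberg(N)
  os = OpSum()
  for j in 1:(N - 1)
    os += 0.5, "S+", j, "S-", j + 1
    os += 0.5, "S-", j, "S+", j + 1
    os += "Sz", j, "Sz", j + 1
  end
  return os
end

# One sub-Hamiltonian per MPI process; each process builds
# and projects only its own term of the sum.
nprocs = MPI.Comm_size(MPI.COMM_WORLD)
terms = partition(heisenberg(N), nprocs)
which_proc = MPI.Comm_rank(MPI.COMM_WORLD) + 1
mpo_sum_term = MPISumTerm(MPO(terms[which_proc], sites), MPI.COMM_WORLD)

# Broadcast the starting state so it is identical on every rank.
state = [isodd(n) ? "Up" : "Dn" for n in 1:N]
psi0 = MPI.bcast(MPS(sites, state), 0, MPI.COMM_WORLD)

energy, psi = dmrg(mpo_sum_term, psi0; nsweeps=5, maxdim=[10, 20, 50, 100], cutoff=1e-8)
MPI.Comm_rank(MPI.COMM_WORLD) == 0 && @show energy

MPI.Finalize()
```

Compared with example 02, this drops the momentum-conserving Hubbard setup and the `disk`/`threaded_blocksparse` options, but keeps the rank-local `MPISumTerm` construction and the broadcasts that keep the site indices and starting state consistent across ranks.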