Apple silicon workaround (#196)
* avoid reductions with custom operators for MPI

* avoid MultiScalar reduction only on non-intel

* move changes to mapreduce

* small tweak

* fix mpi errors

* clean up

* use MPI logical operator directly

* Remove MultiScalar from sort_into_targets!
fciqmc_col! may now return stats as a vector

* Revert "Remove MultiScalar from sort_into_targets!"

This reverts commit b54868f.

---------

Co-authored-by: Joachim Brand <[email protected]>
joachimbrand authored Mar 9, 2023
1 parent 8819209 commit 053c5ec
Showing 7 changed files with 52 additions and 19 deletions.
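
The essence of the workaround, as a hedged sketch (the toy reduction below is illustrative and not taken from the commit): MPI.jl wraps any Julia reduction function that has no predefined MPI counterpart in a custom MPI operator, and that code path is reported to fail on non-Intel hardware such as Apple silicon (see JuliaParallel/MPI.jl#404, referenced in the diff below). The changes therefore route all MPI reductions through operators MPI provides natively:

    using MPI
    MPI.Init()
    comm = MPI.COMM_WORLD

    # Problematic on Apple silicon: an anonymous function has no predefined MPI
    # counterpart, so MPI.jl would have to build a custom operator for it.
    # total = MPI.Allreduce(1, (a, b) -> a + b, comm)

    # Workaround used throughout this commit: stick to reductions that map onto
    # built-in MPI operators, e.g. `+` (MPI.SUM) or the predefined MPI.LAND.
    total = MPI.Allreduce(1, +, comm)
    all_ok = MPI.Allreduce(true, MPI.LAND, comm)
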
1 change: 1 addition & 0 deletions src/RMPI/RMPI.jl
@@ -31,6 +31,7 @@ const mpi_registry = Dict{Int,Any}()
abstract type DistributeStrategy end

include("mpidata.jl")
+include("multiscalar.jl")
include("helpers.jl")
include("noexchange.jl")
include("pointtopoint.jl")
2 changes: 1 addition & 1 deletion src/RMPI/helpers.jl
@@ -87,7 +87,7 @@ end

function sort_into_targets!(dtarget::MPIData, w::AbstractDVec, stats)
# single threaded MPI version
-mpi_combine_walkers!(dtarget,w) # combine walkers from different MPI ranks
+mpi_combine_walkers!(dtarget, w) # combine walkers from different MPI ranks
res_stats = MPI.Allreduce(Rimu.MultiScalar(stats), +, dtarget.comm)
return dtarget, w, res_stats
end
21 changes: 20 additions & 1 deletion src/RMPI/mpidata.jl
@@ -92,7 +92,26 @@ end

function Base.mapreduce(f, op, it::MPIDataIterator; kwargs...)
res = mapreduce(f, op, it.iter; kwargs...)
-return MPI.Allreduce(res, op, it.data.comm)
+T = typeof(res)
+if T <: Bool # MPI.jl does not support Bool reductions
+res = convert(UInt8, res)
+end
+return T(MPI.Allreduce(res, op, it.data.comm))
end

+# Special case for `sum`, which uses a custom (type-widening) reduction operator `add_sum`.
+# Replacing it by `+` is necessary for non-Intel architectures due to a limitation of
+# MPI.jl. On Intel processors, it might be more performant.
+# see https://github.com/JuliaParallel/MPI.jl/issues/404
+function Base.mapreduce(f, op::typeof(Base.add_sum), it::MPIDataIterator; kwargs...)
+res = mapreduce(f, op, it.iter; kwargs...)
+return MPI.Allreduce(res, +, it.data.comm)
+end
+
+# Special case for `prod`, which uses a custom (type-widening) reduction operator `mul_prod`
+function Base.mapreduce(f, op::typeof(Base.mul_prod), it::MPIDataIterator; kwargs...)
+res = mapreduce(f, op, it.iter; kwargs...)
+return MPI.Allreduce(res, *, it.data.comm)
+end

Base.IteratorSize(::MPIDataIterator) = Base.SizeUnknown()
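
A hedged sketch of the Bool round-trip added above (the local value and the predefined MPI.LOR stand in for whatever `mapreduce` would actually produce and pass through; they are not part of the package): Bool itself cannot be reduced by MPI.jl, so the local result is widened to UInt8 before the Allreduce and converted back afterwards. The `add_sum`/`mul_prod` methods exist because those internal operators would otherwise reach MPI.jl as custom operators, whereas plain `+` and `*` map onto built-in MPI ops.

    using MPI
    MPI.Init()
    comm = MPI.COMM_WORLD

    # Local result of, e.g., mapreduce(f, |, local_iterator) on this rank:
    res = true
    T = typeof(res)
    buf = T <: Bool ? convert(UInt8, res) : res   # widen Bool to UInt8
    flag = T(MPI.Allreduce(buf, MPI.LOR, comm))   # reduce, then convert back to Bool

    # `sum` path: Base.add_sum is swapped for `+`, which MPI.jl maps to MPI.SUM.
    total = MPI.Allreduce(3, +, comm)
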
10 changes: 10 additions & 0 deletions src/RMPI/multiscalar.jl
@@ -0,0 +1,10 @@
+# Make MPI reduction of a `MultiScalar` work on non-Intel processors.
+# The `MultiScalar` is converted into a vector before sending through MPI.Allreduce.
+# Testing shows that this is about the same speed or even a bit faster on Intel processors
+# than reducing the MultiScalar directly via a custom reduction operator.
+# Defining the method in RMPI is strictly type piracy as MultiScalar belongs to Rimu and
+# not to RMPI. Might clean this up later.
+function MPI.Allreduce(ms::Rimu.MultiScalar{T}, op, comm::MPI.Comm) where {T<:Tuple}
+result_vector = MPI.Allreduce([ms...], op, comm)
+return Rimu.MultiScalar(T(result_vector))
+end
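
A hedged usage sketch of the new method (it assumes an initialized MPI session, that `Rimu` and its `RMPI` submodule are loaded, and made-up statistics values): the splat `[ms...]` turns the wrapped tuple into a plain vector, `MPI.Allreduce` sums it elementwise with the built-in `+`, and the tuple type `T` converts the result back into a `MultiScalar`.

    using MPI, Rimu
    using Rimu.RMPI
    MPI.Init()
    comm = MPI.COMM_WORLD

    stats = Rimu.MultiScalar((1, 2, 5))      # e.g. per-rank step statistics
    totals = MPI.Allreduce(stats, +, comm)   # elementwise sum across all ranks
    # roughly: Rimu.MultiScalar(Tuple(MPI.Allreduce([stats...], +, comm)))
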
1 change: 1 addition & 0 deletions src/helpers.jl
@@ -34,6 +34,7 @@ end
MultiScalar(args...) = MultiScalar(args)
MultiScalar(v::SVector) = MultiScalar(Tuple(v))
MultiScalar(m::MultiScalar) = m
+MultiScalar{T}(m::MultiScalar{T}) where T<:Tuple = m
MultiScalar(arg) = MultiScalar((arg,))

Base.getindex(m::MultiScalar, i) = m.tuple[i]
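
A small illustrative sketch of the constructor added here (values made up; assumes `MultiScalar` is accessed as `Rimu.MultiScalar`): the fully typed constructor becomes a no-op when handed a `MultiScalar` that already has the requested tuple type.

    ms = Rimu.MultiScalar(1, 2.0)                      # MultiScalar{Tuple{Int64, Float64}}
    Rimu.MultiScalar(ms) === ms                        # existing untyped no-op constructor
    Rimu.MultiScalar{Tuple{Int64, Float64}}(ms) === ms # new: typed constructor is a no-op too
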
22 changes: 11 additions & 11 deletions test/RMPI.jl
@@ -6,17 +6,17 @@ using Test

@testset "DistributeStrategies" begin
# `DistributeStrategy`s
-ham = HubbardReal1D(BoseFS((1,2,3)))
+ham = HubbardReal1D(BoseFS((1, 2, 3)))
for setup in [RMPI.mpi_no_exchange, RMPI.mpi_all_to_all, RMPI.mpi_point_to_point]
-dv = DVec(starting_address(ham)=>10; style=IsDynamicSemistochastic())
+dv = DVec(starting_address(ham) => 10; style=IsDynamicSemistochastic())
v = MPIData(dv; setup)
-df, state = lomc!(ham,v)
+df, state = lomc!(ham, v)
@test size(df) == (100, 12)
end
# need to do mpi_one_sided separately
-dv = DVec(starting_address(ham)=>10; style=IsDynamicSemistochastic())
-v = RMPI.mpi_one_sided(dv; capacity = 1000)
-df, state = lomc!(ham,v)
+dv = DVec(starting_address(ham) => 10; style=IsDynamicSemistochastic())
+v = RMPI.mpi_one_sided(dv; capacity=1000)
+df, state = lomc!(ham, v)
@test size(df) == (100, 12)
end

@@ -29,13 +29,13 @@ end
counts = zeros(Int, k)
displs = zeros(Int, k)

-RMPI.sort_and_count!(counts, displs, vals, ordfun.(vals), (0, k-1))
+RMPI.sort_and_count!(counts, displs, vals, ordfun.(vals), (0, k - 1))
@test issorted(vals, by=ordfun)
@test sum(counts) == l

-for i in 0:(k - 1)
-c = counts[i + 1]
-d = displs[i + 1]
+for i in 0:(k-1)
+c = counts[i+1]
+d = displs[i+1]
r = (1:c) .+ d
ords = ordfun.(vals)
@test all(ords[r] .== i)
@@ -79,7 +79,7 @@ end
@testset "dot" begin
@test dot(dv1, dv2) == 0
@test dot(dv1, dv1) == dot(localpart(dv1), dv1)
-rand_ham = MatrixHamiltonian(rand(ComplexF64, 4,4))
+rand_ham = MatrixHamiltonian(rand(ComplexF64, 4, 4))
ldv1 = localpart(dv1)
@test norm(dot(dv1, rand_ham, dv1)) ≈ norm(dot(ldv1, rand_ham, ldv1))
end
14 changes: 8 additions & 6 deletions test/mpi_runtests.jl
@@ -71,7 +71,7 @@ end
end
@testset "Single component $type" begin
for i in 1:N_REPEATS
-add = BoseFS((0,0,10,0,0))
+add = BoseFS((0, 0, 10, 0, 0))
H = HubbardMom1D(add)
Random.seed!(7350 * i)
v, dv = setup_dv(
@@ -98,7 +98,7 @@ end
@test sum(values(v)) ≈ sum(values(dv))
f((k, v)) = (k == add) + v > 0
@test mapreduce(f, |, pairs(v); init=true) ==
mapreduce(f, |, pairs(dv); init=true)
mapreduce(f, |, pairs(dv); init=true)
end

@testset "Operations" begin
@@ -127,7 +127,7 @@ end
end
@testset "Two-component $type" begin
for i in 1:N_REPEATS
-add = BoseFS2C((0,0,10,0,0), (0,0,2,0,0))
+add = BoseFS2C((0, 0, 10, 0, 0), (0, 0, 2, 0, 0))
H = BoseHubbardMom1D2C(add)
Random.seed!(7350 * i)
v, dv = setup_dv(
@@ -225,7 +225,7 @@ end
(RMPI.mpi_one_sided, (; capacity=1000)),
)
@testset "Regular with $setup and post-steps" begin
-H = HubbardReal1D(BoseFS((1,1,1,1,1,1,1)); u=6.0)
+H = HubbardReal1D(BoseFS((1, 1, 1, 1, 1, 1, 1)); u=6.0)
dv = MPIData(
DVec(starting_address(H) => 3; style=IsDynamicSemistochastic());
setup,
@@ -253,7 +253,7 @@ end
@test all(0 .≤ df.loneliness .≤ 1)
end
@testset "Initiator with $setup" begin
-H = HubbardMom1D(BoseFS((0,0,0,7,0,0,0)); u=6.0)
+H = HubbardMom1D(BoseFS((0, 0, 0, 7, 0, 0, 0)); u=6.0)
dv = MPIData(
InitiatorDVec(starting_address(H) => 3);
setup,
@@ -295,7 +295,9 @@ end

# Make sure all ranks came this far.
@testset "Finish" begin
-@test MPI.Allreduce(true, &, mpi_comm())
+# MPI.jl currently doesn't properly map logical operators (MPI v0.20.8)
+@test MPI.Allreduce(true, MPI.LAND, mpi_comm())
+# @test MPI.Allreduce(true, &, mpi_comm())
end
end
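
For context, a hedged sketch of the distinction behind the comment in the test above (the explanation of the failure mode is an assumption based on the MPI.jl issue linked earlier, not something stated in this commit): MPI.LAND is a predefined MPI operator and works on every architecture, whereas at MPI.jl v0.20.8 the Julia function `&` is not mapped onto it for Bool and ends up on the custom-operator path that breaks on Apple silicon.

    # Used in the test above: predefined MPI operator, safe everywhere.
    ok = MPI.Allreduce(true, MPI.LAND, mpi_comm())

    # Avoided: presumably falls back to a custom MPI operator for Bool with
    # MPI.jl v0.20.8, which does not work on Apple silicon.
    # ok = MPI.Allreduce(true, &, mpi_comm())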
