Use @batch reduction for bounds check #126

Merged 4 commits on Mar 27, 2024
docs/src/performance.md (0 additions, 11 deletions)

@@ -282,14 +282,3 @@ requires. It can thus be seen as a proxy for "energy used" and, as an extension,
timing result, you need to set the analysis interval such that the
`AnalysisCallback` is invoked at least once during the course of the simulation and
discard the first PID value.

-## Performance issues with multi-threaded reductions
-[False sharing](https://en.wikipedia.org/wiki/False_sharing) is a known performance issue
-for systems with distributed caches. It also occurred in the implementation of the
-thread-parallel bounds checking routine for the subcell IDP limiting
-in [PR #1736](https://github.com/trixi-framework/Trixi.jl/pull/1736).
-After some [testing and discussion](https://github.com/trixi-framework/Trixi.jl/pull/1736#discussion_r1423881895),
-it turned out that initializing a vector of length `n * Threads.nthreads()` and only using every
-n-th entry instead of a vector of length `Threads.nthreads()` fixes the problem.
-Since no current processor has cache lines larger than 128 bytes, we use `n = 128 B / sizeof(uEltype)`.
-With this padding, the bounds checking routine of the IDP limiting scales as expected.
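
For reference, here is a minimal standalone sketch of the padding scheme the deleted section describes. All names (`stride`, `acc`) and the workload are hypothetical, not Trixi code, and `@threads :static` is assumed so that `threadid()` is stable within an iteration:

using Base.Threads

# Toy illustration of the padding trick: one `max` accumulator per thread,
# spaced `stride` entries apart so that no two accumulators land on the same
# cache line. Assumes cache lines of at most 128 bytes.
stride = div(128, sizeof(Float64))        # = n
acc = zeros(Float64, stride * nthreads())

@threads :static for i in 1:1_000_000
    idx = stride * threadid()             # each thread owns exactly one slot
    acc[idx] = max(acc[idx], abs(sin(i))) # no cache line is shared between threads
end

result = maximum(acc[stride * i] for i in 1:nthreads()) # combine per-thread maxima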
src/callbacks_stage/subcell_bounds_check_2d.jl (26 additions, 51 deletions)

@@ -11,103 +11,86 @@
    (; variable_bounds) = limiter.cache.subcell_limiter_coefficients
    (; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache

-    # Note: Accessing the threaded memory vector `idp_bounds_delta_local` with
-    # `deviation = idp_bounds_delta_local[key][Threads.threadid()]` causes critical performance
-    # issues due to false sharing.
-    # Initializing a vector with n times the length and using every n-th entry fixes this
-    # problem and allows proper scaling:
-    # `deviation = idp_bounds_delta_local[key][n * Threads.threadid()]`
-    # Since no processor has cache lines larger than 128 B, we use `n = 128 B / sizeof(uEltype)`.
-    stride_size = div(128, sizeof(eltype(u))) # = n
-
    if local_minmax
        for v in limiter.local_minmax_variables_cons
            v_string = string(v)
            key_min = Symbol(v_string, "_min")
            key_max = Symbol(v_string, "_max")
-            deviation_min_threaded = idp_bounds_delta_local[key_min]
-            deviation_max_threaded = idp_bounds_delta_local[key_max]
-            @threaded for element in eachelement(solver, cache)
-                deviation_min = deviation_min_threaded[stride_size * Threads.threadid()]
-                deviation_max = deviation_max_threaded[stride_size * Threads.threadid()]
+            deviation_min = idp_bounds_delta_local[key_min]
+            deviation_max = idp_bounds_delta_local[key_max]
+            @batch reduction=((max, deviation_min), (max, deviation_max)) for element in eachelement(solver,
+                                                                                                     cache)
                for j in eachnode(solver), i in eachnode(solver)
                    var = u[v, i, j, element]
                    deviation_min = max(deviation_min,
                                        variable_bounds[key_min][i, j, element] - var)
                    deviation_max = max(deviation_max,
                                        var - variable_bounds[key_max][i, j, element])
                end
-                deviation_min_threaded[stride_size * Threads.threadid()] = deviation_min
-                deviation_max_threaded[stride_size * Threads.threadid()] = deviation_max
            end
+            idp_bounds_delta_local[key_min] = deviation_min
+            idp_bounds_delta_local[key_max] = deviation_max
        end
    end
    if spec_entropy
        key = :spec_entropy_min
-        deviation_threaded = idp_bounds_delta_local[key]
-        @threaded for element in eachelement(solver, cache)
-            deviation = deviation_threaded[stride_size * Threads.threadid()]
+        deviation = idp_bounds_delta_local[key]
+        @batch reduction=(max, deviation) for element in eachelement(solver, cache)
            for j in eachnode(solver), i in eachnode(solver)
                s = entropy_spec(get_node_vars(u, equations, solver, i, j, element),
                                 equations)
                deviation = max(deviation, variable_bounds[key][i, j, element] - s)
            end
-            deviation_threaded[stride_size * Threads.threadid()] = deviation
        end
+        idp_bounds_delta_local[key] = deviation
    end
    if math_entropy
        key = :math_entropy_max
-        deviation_threaded = idp_bounds_delta_local[key]
-        @threaded for element in eachelement(solver, cache)
-            deviation = deviation_threaded[stride_size * Threads.threadid()]
+        deviation = idp_bounds_delta_local[key]
+        @batch reduction=(max, deviation) for element in eachelement(solver, cache)
            for j in eachnode(solver), i in eachnode(solver)
                s = entropy_math(get_node_vars(u, equations, solver, i, j, element),
                                 equations)
                deviation = max(deviation, s - variable_bounds[key][i, j, element])
            end
-            deviation_threaded[stride_size * Threads.threadid()] = deviation
        end
+        idp_bounds_delta_local[key] = deviation
    end
    if positivity
        for v in limiter.positivity_variables_cons
            if v in limiter.local_minmax_variables_cons
                continue
            end
            key = Symbol(string(v), "_min")
-            deviation_threaded = idp_bounds_delta_local[key]
-            @threaded for element in eachelement(solver, cache)
-                deviation = deviation_threaded[stride_size * Threads.threadid()]
+            deviation = idp_bounds_delta_local[key]
+            @batch reduction=(max, deviation) for element in eachelement(solver, cache)
                for j in eachnode(solver), i in eachnode(solver)
                    var = u[v, i, j, element]
                    deviation = max(deviation,
                                    variable_bounds[key][i, j, element] - var)
                end
-                deviation_threaded[stride_size * Threads.threadid()] = deviation
            end
+            idp_bounds_delta_local[key] = deviation
        end
        for variable in limiter.positivity_variables_nonlinear
            key = Symbol(string(variable), "_min")
-            deviation_threaded = idp_bounds_delta_local[key]
-            @threaded for element in eachelement(solver, cache)
-                deviation = deviation_threaded[stride_size * Threads.threadid()]
+            deviation = idp_bounds_delta_local[key]
+            @batch reduction=(max, deviation) for element in eachelement(solver, cache)
                for j in eachnode(solver), i in eachnode(solver)
                    var = variable(get_node_vars(u, equations, solver, i, j, element),
                                   equations)
                    deviation = max(deviation,
                                    variable_bounds[key][i, j, element] - var)
                end
-                deviation_threaded[stride_size * Threads.threadid()] = deviation
            end
+            idp_bounds_delta_local[key] = deviation
        end
    end

    for (key, _) in idp_bounds_delta_local
-        # Calculate maximum deviations of all threads
-        idp_bounds_delta_local[key][stride_size] = maximum(idp_bounds_delta_local[key][stride_size * i]
-                                                           for i in 1:Threads.nthreads())
        # Update global maximum deviations
        idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key],
-                                           idp_bounds_delta_local[key][stride_size])
+                                           idp_bounds_delta_local[key])
    end

    return nothing
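
As background for the pattern above: with Polyester.jl, `@batch reduction=(op, var)` gives every task a private copy of `var` and combines the copies with `op` once the loop finishes, so no shared memory is written during iteration. A minimal sketch under that assumption (the function name and data are made up for illustration):

using Polyester

# Hypothetical example mirroring the `max` reductions in the diff above.
function threaded_maxabs(x::AbstractVector{Float64})
    m = typemin(Float64) # start value of the reduction variable
    @batch reduction=(max, m) for i in eachindex(x)
        m = max(m, abs(x[i])) # updates this task's private copy only
    end
    return m # private copies have been combined with `max`
end

threaded_maxabs(randn(10^6)) # ≈ maximum(abs, x), computed in parallel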
@@ -118,46 +101,38 @@ end
    (; local_minmax, positivity, spec_entropy, math_entropy) = limiter
    (; idp_bounds_delta_local) = limiter.cache

-    stride_size = div(128, sizeof(eltype(u))) # = n
-
    # Print errors to output file
    open("$output_directory/deviations.txt", "a") do f
        print(f, iter, ", ", time)
        if local_minmax
            for v in limiter.local_minmax_variables_cons
                v_string = string(v)
-                print(f, ", ",
-                      idp_bounds_delta_local[Symbol(v_string, "_min")][stride_size],
-                      ", ",
-                      idp_bounds_delta_local[Symbol(v_string, "_max")][stride_size])
+                print(f, ", ", idp_bounds_delta_local[Symbol(v_string, "_min")],
+                      ", ", idp_bounds_delta_local[Symbol(v_string, "_max")])
            end
        end
        if spec_entropy
-            print(f, ", ", idp_bounds_delta_local[:spec_entropy_min][stride_size])
+            print(f, ", ", idp_bounds_delta_local[:spec_entropy_min])
        end
        if math_entropy
-            print(f, ", ", idp_bounds_delta_local[:math_entropy_max][stride_size])
+            print(f, ", ", idp_bounds_delta_local[:math_entropy_max])
        end
        if positivity
            for v in limiter.positivity_variables_cons
                if v in limiter.local_minmax_variables_cons
                    continue
                end
-                print(f, ", ",
-                      idp_bounds_delta_local[Symbol(string(v), "_min")][stride_size])
+                print(f, ", ", idp_bounds_delta_local[Symbol(string(v), "_min")])
            end
            for variable in limiter.positivity_variables_nonlinear
-                print(f, ", ",
-                      idp_bounds_delta_local[Symbol(string(variable), "_min")][stride_size])
+                print(f, ", ", idp_bounds_delta_local[Symbol(string(variable), "_min")])
            end
        end
        println(f)
    end
    # Reset local maximum deviations
    for (key, _) in idp_bounds_delta_local
-        for i in 1:Threads.nthreads()
-            idp_bounds_delta_local[key][stride_size * i] = zero(eltype(idp_bounds_delta_local[key][stride_size]))
-        end
+        idp_bounds_delta_local[key] = zero(eltype(idp_bounds_delta_local[key]))
    end

    return nothing
src/solvers/dgsem_tree/subcell_limiters_2d.jl (2 additions, 8 deletions)

@@ -26,18 +26,12 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat

    # Memory for bounds checking routine with `BoundsCheckCallback`.
    # Local variable contains the maximum deviation since the last export.
-    # Using a threaded vector to parallelize the bounds check.
-    idp_bounds_delta_local = Dict{Symbol, Vector{real(basis)}}()
+    idp_bounds_delta_local = Dict{Symbol, real(basis)}()
    # Global variable contains the total maximum deviation.
    idp_bounds_delta_global = Dict{Symbol, real(basis)}()
-    # Note: False sharing causes critical performance issues on multiple threads when using a vector
-    # of length `Threads.nthreads()`. Initializing a vector of length `n * Threads.nthreads()`
-    # and then only using every n-th entry fixes the problem and allows proper scaling.
-    # Since no processor has cache lines larger than 128 B, we use `n = 128 B / sizeof(uEltype)`.
-    stride_size = div(128, sizeof(eltype(basis.nodes))) # = n
    for key in bound_keys
-        idp_bounds_delta_local[key] = [zero(real(basis))
-                                       for _ in 1:(stride_size * Threads.nthreads())]
+        idp_bounds_delta_local[key] = zero(real(basis))
        idp_bounds_delta_global[key] = zero(real(basis))
    end
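
With the per-thread vectors gone, the cache holds one scalar per bound key. A rough standalone sketch of the resulting data flow (the keys and values below are made up for illustration, not the package API):

# Two dictionaries, as in `create_cache` above: local deviations since the
# last export, and global deviations over the whole run.
bound_keys = (:rho_min, :rho_max)
idp_bounds_delta_local = Dict{Symbol, Float64}(key => 0.0 for key in bound_keys)
idp_bounds_delta_global = Dict{Symbol, Float64}(key => 0.0 for key in bound_keys)

# After each bounds check, fold the local maxima into the global ones ...
for (key, _) in idp_bounds_delta_local
    idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key],
                                       idp_bounds_delta_local[key])
end

# ... and after each export, reset the local deviations.
for (key, _) in idp_bounds_delta_local
    idp_bounds_delta_local[key] = 0.0
end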
