Skip to content

Commit

Permalink
(Thread-)Parallelize bounds check routine for subcell IDP limiting (#…
Browse files Browse the repository at this point in the history
…1736)

* Implement parallel bounds check for IDP limiting

* Add missing warning as "experimental" from last PR

* Updating `idp_bounds_delta_threaded` for all bounds at once

* Revise parallel memory structure

* Using maximum instead of reduce

* Expand vector length to fix False Sharing problem

* Generalize stride size in vector

* Add suggested comment
  • Loading branch information
bennibolm authored Jan 30, 2024
1 parent fcf2652 commit f4e6e49
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 37 deletions.
10 changes: 6 additions & 4 deletions src/callbacks_stage/subcell_bounds_check.jl
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ end
@inline function finalize_callback(callback::BoundsCheckCallback, semi,
limiter::SubcellLimiterIDP)
(; local_minmax, positivity) = limiter
(; idp_bounds_delta) = limiter.cache
(; idp_bounds_delta_global) = limiter.cache
variables = varnames(cons2cons, semi.equations)

println(""^100)
Expand All @@ -128,8 +128,10 @@ end
for v in limiter.local_minmax_variables_cons
v_string = string(v)
println("$(variables[v]):")
println("-lower bound: ", idp_bounds_delta[Symbol(v_string, "_min")][2])
println("-upper bound: ", idp_bounds_delta[Symbol(v_string, "_max")][2])
println("- lower bound: ",
idp_bounds_delta_global[Symbol(v_string, "_min")])
println("- upper bound: ",
idp_bounds_delta_global[Symbol(v_string, "_max")])
end
end
if positivity
Expand All @@ -138,7 +140,7 @@ end
continue
end
println(string(variables[v]) * ":\n- positivity: ",
idp_bounds_delta[Symbol(string(v), "_min")][2])
idp_bounds_delta_global[Symbol(string(v), "_min")])
end
end
println(""^100 * "\n")
Expand Down
81 changes: 54 additions & 27 deletions src/callbacks_stage/subcell_bounds_check_2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,37 @@
time, iter, output_directory, save_errors)
(; local_minmax, positivity) = solver.volume_integral.limiter
(; variable_bounds) = limiter.cache.subcell_limiter_coefficients
(; idp_bounds_delta) = limiter.cache
(; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache

# Note: Accessing the threaded memory vector `idp_bounds_delta_local` with
# `deviation = idp_bounds_delta_local[key][Threads.threadid()]` causes critical performance
# issues due to False Sharing.
# Initializing a vector with n times the length and using every n-th entry fixes this
# problem and allows proper scaling:
# `deviation = idp_bounds_delta_local[key][n * Threads.threadid()]`
# Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`
stride_size = div(128, sizeof(eltype(u))) # = n

if local_minmax
for v in limiter.local_minmax_variables_cons
v_string = string(v)
key_min = Symbol(v_string, "_min")
key_max = Symbol(v_string, "_max")
deviation_min = idp_bounds_delta[key_min]
deviation_max = idp_bounds_delta[key_max]
for element in eachelement(solver, cache), j in eachnode(solver),
i in eachnode(solver)

var = u[v, i, j, element]
deviation_min[1] = max(deviation_min[1],
variable_bounds[key_min][i, j, element] - var)
deviation_max[1] = max(deviation_max[1],
var - variable_bounds[key_max][i, j, element])
deviation_min_threaded = idp_bounds_delta_local[key_min]
deviation_max_threaded = idp_bounds_delta_local[key_max]
@threaded for element in eachelement(solver, cache)
deviation_min = deviation_min_threaded[stride_size * Threads.threadid()]
deviation_max = deviation_max_threaded[stride_size * Threads.threadid()]
for j in eachnode(solver), i in eachnode(solver)
var = u[v, i, j, element]
deviation_min = max(deviation_min,
variable_bounds[key_min][i, j, element] - var)
deviation_max = max(deviation_max,
var - variable_bounds[key_max][i, j, element])
end
deviation_min_threaded[stride_size * Threads.threadid()] = deviation_min
deviation_max_threaded[stride_size * Threads.threadid()] = deviation_max
end
deviation_min[2] = max(deviation_min[2], deviation_min[1])
deviation_max[2] = max(deviation_max[2], deviation_max[1])
end
end
if positivity
Expand All @@ -38,41 +49,57 @@
continue
end
key = Symbol(string(v), "_min")
deviation = idp_bounds_delta[key]
for element in eachelement(solver, cache), j in eachnode(solver),
i in eachnode(solver)

var = u[v, i, j, element]
deviation[1] = max(deviation[1],
variable_bounds[key][i, j, element] - var)
deviation_threaded = idp_bounds_delta_local[key]
@threaded for element in eachelement(solver, cache)
deviation = deviation_threaded[stride_size * Threads.threadid()]
for j in eachnode(solver), i in eachnode(solver)
var = u[v, i, j, element]
deviation = max(deviation,
variable_bounds[key][i, j, element] - var)
end
deviation_threaded[stride_size * Threads.threadid()] = deviation
end
deviation[2] = max(deviation[2], deviation[1])
end
end

for (key, _) in idp_bounds_delta_local
# Calculate maximum deviations of all threads
idp_bounds_delta_local[key][stride_size] = maximum(idp_bounds_delta_local[key][stride_size * i]
for i in 1:Threads.nthreads())
# Update global maximum deviations
idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key],
idp_bounds_delta_local[key][stride_size])
end

if save_errors
# Print to output file
open("$output_directory/deviations.txt", "a") do f
print(f, iter, ", ", time)
if local_minmax
for v in limiter.local_minmax_variables_cons
v_string = string(v)
print(f, ", ", idp_bounds_delta[Symbol(v_string, "_min")][1], ", ",
idp_bounds_delta[Symbol(v_string, "_max")][1])
print(f, ", ",
idp_bounds_delta_local[Symbol(v_string, "_min")][stride_size],
", ",
idp_bounds_delta_local[Symbol(v_string, "_max")][stride_size])
end
end
if positivity
for v in limiter.positivity_variables_cons
if v in limiter.local_minmax_variables_cons
continue
end
print(f, ", ", idp_bounds_delta[Symbol(string(v), "_min")][1])
print(f, ", ",
idp_bounds_delta_local[Symbol(string(v), "_min")][stride_size])
end
end
println(f)
end
# Reset first entries of idp_bounds_delta
for (key, _) in idp_bounds_delta
idp_bounds_delta[key][1] = zero(eltype(idp_bounds_delta[key][1]))
# Reset local maximum deviations
for (key, _) in idp_bounds_delta_local
for i in 1:Threads.nthreads()
idp_bounds_delta_local[key][stride_size * i] = zero(eltype(idp_bounds_delta_local[key][stride_size]))
end
end
end

Expand Down
3 changes: 3 additions & 0 deletions src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,9 @@ end
For subcell limiting, the calculation of local bounds for non-periodic domains require the boundary
outer state. This function returns the boundary value at time `t` and for node with spatial
indices `indices`.
!!! warning "Experimental implementation"
This is an experimental feature and may change in future releases.
"""
@inline function get_boundary_outer_state(boundary_condition::BoundaryConditionDirichlet,
cache, t, equations, dg, indices...)
Expand Down
23 changes: 17 additions & 6 deletions src/solvers/dgsem_tree/subcell_limiters_2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,32 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat
bound_keys)

# Memory for bounds checking routine with `BoundsCheckCallback`.
# The first entry of each vector contains the maximum deviation since the last export.
# The second one contains the total maximum deviation.
idp_bounds_delta = Dict{Symbol, Vector{real(basis)}}()
# Local variable contains the maximum deviation since the last export.
# Using a threaded vector to parallelize bounds check.
idp_bounds_delta_local = Dict{Symbol, Vector{real(basis)}}()
# Global variable contains the total maximum deviation.
idp_bounds_delta_global = Dict{Symbol, real(basis)}()
# Note: False sharing causes critical performance issues on multiple threads when using a vector
# of length `Threads.nthreads()`. Initializing a vector of length `n * Threads.nthreads()`
# and then only using every n-th entry, fixes the problem and allows proper scaling.
# Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`
stride_size = div(128, sizeof(eltype(basis.nodes))) # = n
for key in bound_keys
idp_bounds_delta[key] = zeros(real(basis), 2)
idp_bounds_delta_local[key] = [zero(real(basis))
for _ in 1:(stride_size * Threads.nthreads())]
idp_bounds_delta_global[key] = zero(real(basis))
end

return (; subcell_limiter_coefficients, idp_bounds_delta)
return (; subcell_limiter_coefficients, idp_bounds_delta_local,
idp_bounds_delta_global)
end

function (limiter::SubcellLimiterIDP)(u::AbstractArray{<:Any, 4}, semi, dg::DGSEM, t,
dt;
kwargs...)
@unpack alpha = limiter.cache.subcell_limiter_coefficients
alpha .= zero(eltype(alpha))
# TODO: Do not abuse `reset_du!` but maybe implement a generic `set_zero!`
@trixi_timeit timer() "reset alpha" reset_du!(alpha, dg, semi.cache)

if limiter.local_minmax
@trixi_timeit timer() "local min/max limiting" idp_local_minmax!(alpha, limiter,
Expand Down

0 comments on commit f4e6e49

Please sign in to comment.