From 5d9584753c9613c4f6ad045e69e31d12fc0caf30 Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Sat, 17 Jun 2023 17:04:49 +0200 Subject: [PATCH 1/5] WIP: threaded version of modified_equation --- src/BSeries.jl | 107 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 104 insertions(+), 3 deletions(-) diff --git a/src/BSeries.jl b/src/BSeries.jl index fb4f7e2e..faf25ebd 100644 --- a/src/BSeries.jl +++ b/src/BSeries.jl @@ -1014,11 +1014,68 @@ Section 3.2 of Foundations of Computational Mathematics [DOI: 10.1007/s10208-010-9065-1](https://doi.org/10.1007/s10208-010-9065-1) """ -function modified_equation(series_integrator) - _modified_equation(series_integrator, evaluation_type(series_integrator)) +function modified_equation(series_integrator, thread = Threads.nthreads() > 1) + if thread + _modified_equation_thread(series_integrator, + evaluation_type(series_integrator)) + else + _modified_equation_serial(series_integrator, + evaluation_type(series_integrator)) + end +end + +function _modified_equation_serial(series_integrator, ::EagerEvaluation) + V = valtype(series_integrator) + + # B-series of the exact solution + # We could just use the lazy version + # series_ex = ExactSolution{V}() + # However, we need to access elements of `series_ex` more than once in the + # substitution below. Thus, it's cheaper to compute every entry only once and + # re-use it later. + series_ex = ExactSolution(series_integrator) + + # Prepare B-series of the modified equation + series_keys = keys(series_integrator) + series = empty(series_integrator) + for t in series_keys + series[t] = zero(V) + end + + iter = iterate(series_keys) + if iter !== nothing + t, t_state = iter + if isempty(t) + iter = iterate(series_keys, t_state) + if iter !== nothing + t, t_state = iter + end + end + + series[t] = series_integrator[t] + iter = iterate(series_keys, t_state) + end + + # Recursively solve + # substitute(series, series_ex, t) == series_integrator[t] + # This works because + # substitute(series, series_ex, t) = series[t] + lower order terms + # Since the `keys` are ordered, we don't need to use nested loops of the form + # for o in 2:order + # for _t in RootedTreeIterator(o) + # t = copy(_t) + # which are slightly less efficient due to additional computations and + # allocations. + while iter !== nothing + t, t_state = iter + series[t] += series_integrator[t] - substitute(series, series_ex, t) + iter = iterate(series_keys, t_state) + end + + return series end -function _modified_equation(series_integrator, ::EagerEvaluation) +function _modified_equation_thread(series_integrator, ::EagerEvaluation) V = valtype(series_integrator) # B-series of the exact solution @@ -1054,6 +1111,13 @@ function _modified_equation(series_integrator, ::EagerEvaluation) # substitute(series, series_ex, t) == series_integrator[t] # This works because # substitute(series, series_ex, t) = series[t] + lower order terms + + # Here, we use the serial version up to a specified `cutoff_order`, i.e., + # for low-order trees, since it avoids the parallel overhead. We only use + # the parallel (threaded) version for trees of an order of at least + # `cutoff_order`. + cutoff_order = 5 + # Since the `keys` are ordered, we don't need to use nested loops of the form # for o in 2:order # for _t in RootedTreeIterator(o) @@ -1062,10 +1126,47 @@ function _modified_equation(series_integrator, ::EagerEvaluation) # allocations. while iter !== nothing t, t_state = iter + order(t) >= cutoff_order && break series[t] += series_integrator[t] - substitute(series, series_ex, t) iter = iterate(series_keys, t_state) end + # The algorithm has a data dependency: It is assumed that the coefficients + # of the new `series` are already computed for all trees with a lower order + # than the current tree. Thus, we can use threaded parallelism only over a + # set of trees of the same order. + # for o in cutoff_order:order(series_integrator) + # # We need to collect the trees we wil iterate over in a vector for + # # threaded parallelism. + # # TODO: This should be the iterator type specified by the keys + # # of the series_integrator + # trees = map(copy, RootedTreeIterator(o)) + # Threads.@threads for t in trees + # series[t] += series_integrator[t] - substitute(series, series_ex, t) + # end + # end + + idx_stop = findfirst(==(cutoff_order) ∘ order, series_integrator.coef.keys) + if idx_stop === nothing + return series + else + idx_stop = idx_stop - 1 + end + for o in cutoff_order:order(series_integrator) + idx_start = findnext(==(o) ∘ order, series_integrator.coef.keys, idx_stop) + idx_stop = findnext(==(o + 1) ∘ order, series_integrator.coef.keys, idx_start) + if idx_stop === nothing + idx_stop = lastindex(series_integrator.coef.keys) + end + # We need to collect the trees we wil iterate over in a vector for + # threaded parallelism. + # TODO: This uses internal implementation details... + trees = view(series_integrator.coef.keys, idx_start:idx_stop) + Threads.@threads for t in trees + series[t] += series_integrator[t] - substitute(series, series_ex, t) + end + end + return series end From 09c23dedb19c53919d54a6cb9311f0e115e27089 Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Sat, 17 Jun 2023 17:22:18 +0200 Subject: [PATCH 2/5] move shared code to another function --- src/BSeries.jl | 47 ++++++++++++++--------------------------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/src/BSeries.jl b/src/BSeries.jl index faf25ebd..702dff66 100644 --- a/src/BSeries.jl +++ b/src/BSeries.jl @@ -1025,41 +1025,13 @@ function modified_equation(series_integrator, thread = Threads.nthreads() > 1) end function _modified_equation_serial(series_integrator, ::EagerEvaluation) - V = valtype(series_integrator) - - # B-series of the exact solution - # We could just use the lazy version - # series_ex = ExactSolution{V}() - # However, we need to access elements of `series_ex` more than once in the - # substitution below. Thus, it's cheaper to compute every entry only once and - # re-use it later. - series_ex = ExactSolution(series_integrator) - - # Prepare B-series of the modified equation - series_keys = keys(series_integrator) - series = empty(series_integrator) - for t in series_keys - series[t] = zero(V) - end - - iter = iterate(series_keys) - if iter !== nothing - t, t_state = iter - if isempty(t) - iter = iterate(series_keys, t_state) - if iter !== nothing - t, t_state = iter - end - end - - series[t] = series_integrator[t] - iter = iterate(series_keys, t_state) - end + series, series_keys, series_ex, iter = _modified_equation_shared(series_integrator) # Recursively solve # substitute(series, series_ex, t) == series_integrator[t] # This works because # substitute(series, series_ex, t) = series[t] + lower order terms + # Since the `keys` are ordered, we don't need to use nested loops of the form # for o in 2:order # for _t in RootedTreeIterator(o) @@ -1075,7 +1047,7 @@ function _modified_equation_serial(series_integrator, ::EagerEvaluation) return series end -function _modified_equation_thread(series_integrator, ::EagerEvaluation) +@inline function _modified_equation_shared(series_integrator) V = valtype(series_integrator) # B-series of the exact solution @@ -1107,6 +1079,12 @@ function _modified_equation_thread(series_integrator, ::EagerEvaluation) iter = iterate(series_keys, t_state) end + return series, series_keys, series_ex, iter +end + +function _modified_equation_thread(series_integrator, ::EagerEvaluation) + series, series_keys, series_ex, iter = _modified_equation_shared(series_integrator) + # Recursively solve # substitute(series, series_ex, t) == series_integrator[t] # This works because @@ -1161,8 +1139,11 @@ function _modified_equation_thread(series_integrator, ::EagerEvaluation) # We need to collect the trees we wil iterate over in a vector for # threaded parallelism. # TODO: This uses internal implementation details... - trees = view(series_integrator.coef.keys, idx_start:idx_stop) - Threads.@threads for t in trees + # trees = view(series_integrator.coef.keys, idx_start:idx_stop) + # Threads.@threads for t in trees + indices = idx_start:idx_stop + Threads.@threads for i in indices + t = @inbounds series_integrator.coef.keys[i] series[t] += series_integrator[t] - substitute(series, series_ex, t) end end From 6ea3fb25b32b27356d3623c333c2f119109a5acf Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Sat, 17 Jun 2023 17:24:11 +0200 Subject: [PATCH 3/5] update comments and TODO notes --- src/BSeries.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/BSeries.jl b/src/BSeries.jl index 702dff66..52ec463d 100644 --- a/src/BSeries.jl +++ b/src/BSeries.jl @@ -1136,9 +1136,11 @@ function _modified_equation_thread(series_integrator, ::EagerEvaluation) if idx_stop === nothing idx_stop = lastindex(series_integrator.coef.keys) end - # We need to collect the trees we wil iterate over in a vector for - # threaded parallelism. # TODO: This uses internal implementation details... + # We iterate over the indices instead of the trees in the threaded + # loop since that is slightly more efficient at the time of writing + # since it results in less allocations. + # TODO: remove old version # trees = view(series_integrator.coef.keys, idx_start:idx_stop) # Threads.@threads for t in trees indices = idx_start:idx_stop From 8a559d5a45a9664168d3bd1e694db45a7105ac2d Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Sun, 18 Jun 2023 10:10:07 +0200 Subject: [PATCH 4/5] update comments --- src/BSeries.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/BSeries.jl b/src/BSeries.jl index 52ec463d..6373d8e4 100644 --- a/src/BSeries.jl +++ b/src/BSeries.jl @@ -989,10 +989,11 @@ function _evaluate(f, u, dt, series, ::EagerEvaluation, reduce_order_by) end """ - modified_equation(series_integrator) + modified_equation(series_integrator, thread::Bool = Threads.nthreads() > 1) Compute the B-series of the modified equation of the time integration method -with B-series `series_integrator`. +with B-series `series_integrator` using multiple threads if Julia is started +with multiple threads and `thread` is set to `true`. Given an ordinary differential equation (ODE) ``u'(t) = f(u(t))`` and a Runge-Kutta method, the idea is to interpret the numerical solution with @@ -1014,7 +1015,7 @@ Section 3.2 of Foundations of Computational Mathematics [DOI: 10.1007/s10208-010-9065-1](https://doi.org/10.1007/s10208-010-9065-1) """ -function modified_equation(series_integrator, thread = Threads.nthreads() > 1) +function modified_equation(series_integrator, thread::Bool = Threads.nthreads() > 1) if thread _modified_equation_thread(series_integrator, evaluation_type(series_integrator)) @@ -1025,6 +1026,7 @@ function modified_equation(series_integrator, thread = Threads.nthreads() > 1) end function _modified_equation_serial(series_integrator, ::EagerEvaluation) + # Setup shared between the serial and threaded versions series, series_keys, series_ex, iter = _modified_equation_shared(series_integrator) # Recursively solve @@ -1083,6 +1085,7 @@ end end function _modified_equation_thread(series_integrator, ::EagerEvaluation) + # Setup shared between the serial and threaded versions series, series_keys, series_ex, iter = _modified_equation_shared(series_integrator) # Recursively solve @@ -1131,18 +1134,15 @@ function _modified_equation_thread(series_integrator, ::EagerEvaluation) idx_stop = idx_stop - 1 end for o in cutoff_order:order(series_integrator) + # TODO: This uses internal implementation details... idx_start = findnext(==(o) ∘ order, series_integrator.coef.keys, idx_stop) idx_stop = findnext(==(o + 1) ∘ order, series_integrator.coef.keys, idx_start) if idx_stop === nothing idx_stop = lastindex(series_integrator.coef.keys) end - # TODO: This uses internal implementation details... # We iterate over the indices instead of the trees in the threaded # loop since that is slightly more efficient at the time of writing - # since it results in less allocations. - # TODO: remove old version - # trees = view(series_integrator.coef.keys, idx_start:idx_stop) - # Threads.@threads for t in trees + # due to less allocations. indices = idx_start:idx_stop Threads.@threads for i in indices t = @inbounds series_integrator.coef.keys[i] From 5d7fba3a37fa3827fe308dd461b05138ec8c8ab7 Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Mon, 19 Jun 2023 13:58:00 +0200 Subject: [PATCH 5/5] fix typo --- src/BSeries.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BSeries.jl b/src/BSeries.jl index 52ec463d..69c12409 100644 --- a/src/BSeries.jl +++ b/src/BSeries.jl @@ -1114,7 +1114,7 @@ function _modified_equation_thread(series_integrator, ::EagerEvaluation) # than the current tree. Thus, we can use threaded parallelism only over a # set of trees of the same order. # for o in cutoff_order:order(series_integrator) - # # We need to collect the trees we wil iterate over in a vector for + # # We need to collect the trees we will iterate over in a vector for # # threaded parallelism. # # TODO: This should be the iterator type specified by the keys # # of the series_integrator