Parallelization of compute coefficients functions on GPU #63

Merged (11 commits) on Dec 6, 2024
2 changes: 0 additions & 2 deletions Project.toml
@@ -8,7 +8,6 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
StrideArrays = "d1fa6d79-ef01-42a6-86c9-f7c551f8593b"
Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"
TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"

@@ -17,7 +16,6 @@ BenchmarkTools = "1"
CUDA = "5"
SciMLBase = "2"
StaticArrays = "1"
StrideArrays = "0.1"
Trixi = "0.8, 0.9"
TrixiBase = "0.1"
julia = "1.10"
5 changes: 2 additions & 3 deletions src/TrixiCUDA.jl
@@ -7,7 +7,7 @@ using CUDA
using CUDA: @cuda, CuArray, HostKernel,
threadIdx, blockIdx, blockDim, similar, launch_configuration

-using Trixi: AbstractEquations, AbstractContainer,
+using Trixi: AbstractEquations, AbstractContainer, AbstractSemidiscretization, AbstractMesh,
ElementContainer1D, ElementContainer2D, ElementContainer3D,
InterfaceContainer1D, InterfaceContainer2D, InterfaceContainer3D,
BoundaryContainer1D, BoundaryContainer2D, BoundaryContainer3D,
@@ -18,6 +18,7 @@ using Trixi: AbstractEquations, AbstractContainer,
BoundaryConditionPeriodic, BoundaryConditionDirichlet,
VolumeIntegralWeakForm, VolumeIntegralFluxDifferencing, VolumeIntegralShockCapturingHG,
LobattoLegendreMortarL2,
+allocate_coefficients, mesh_equations_solver_cache,
flux, ntuple, nvariables, nnodes, nelements, nmortars,
local_leaf_cells, init_elements, init_interfaces, init_boundaries, init_mortars,
wrap_array, compute_coefficients, have_nonconservative_terms,
@@ -30,8 +31,6 @@ import Trixi: get_node_vars, get_node_coords, get_surface_node_vars,

using SciMLBase: ODEProblem, FullSpecialize

-using StrideArrays: PtrArray

using StaticArrays: SVector

# Include other source files
19 changes: 19 additions & 0 deletions src/semidiscretization/semidiscretization.jl
@@ -1 +1,20 @@
include("semidiscretization_hyperbolic.jl")

# Similar to `compute_coefficients` in Trixi.jl but calls the GPU kernels
function compute_coefficients_gpu(func, t, semi::AbstractSemidiscretization)
u_ode = allocate_coefficients(mesh_equations_solver_cache(semi)...)

# Call `compute_coefficients_gpu` defined below
u_ode = compute_coefficients_gpu(u_ode, func, t, semi)
return u_ode
end

# Wrap the given coefficient vector, move it to the GPU, and dispatch to the dimension-specific kernels
function compute_coefficients_gpu(u_ode, func, t, semi::AbstractSemidiscretization)
u = wrap_array(u_ode, semi)
u = CuArray(u)

# Call `compute_coefficients_gpu` defined in `src/solvers/dg.jl`
u_computed = compute_coefficients_gpu(u, func, t, mesh_equations_solver_cache(semi)...)
return u_computed
end
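The chain above mirrors Trixi.jl's `compute_coefficients`: `allocate_coefficients` returns a flat coefficient vector, `wrap_array` reshapes it (roughly) to `(nvariables, nnodes..., nelements)`, and `CuArray` moves it to the device before the kernels fill it. A minimal sketch of that array handling with assumed example sizes; the names below are illustrative, not part of the package:

```julia
using CUDA

nvars, nnodes, nelements = 3, 4, 16                 # assumed sizes for a 1D DGSEM run
u_ode = zeros(Float64, nvars * nnodes * nelements)  # what allocate_coefficients returns
u = reshape(u_ode, nvars, nnodes, nelements)        # roughly what wrap_array does for a TreeMesh{1}
u_gpu = CuArray(u)                                  # device copy that the kernels write into
@assert size(u_gpu) == (nvars, nnodes, nelements)
```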
11 changes: 10 additions & 1 deletion src/semidiscretization/semidiscretization_hyperbolic.jl
@@ -1,5 +1,6 @@
# This file is part of the package `Semidiscretizations.jl`.
# Everything specific to hyperbolic semidiscretizations for PDE solvers.

# Similar to `SemidiscretizationHyperbolic` in Trixi.jl but for GPU cache
function SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver;
source_terms = nothing,
boundary_conditions = boundary_condition_periodic,
@@ -16,6 +17,7 @@ function SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, sol

check_periodicity_mesh_boundary_conditions(mesh, _boundary_conditions)

# Return the standard `SemidiscretizationHyperbolic` type; the GPU-specific arrays live in the cache
SemidiscretizationHyperbolic{typeof(mesh), typeof(equations),
typeof(initial_condition),
typeof(_boundary_conditions), typeof(source_terms),
@@ -25,3 +27,10 @@ function SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, sol
source_terms, solver,
cache)
end

# Similar to `compute_coefficients` in Trixi.jl but calls the GPU kernels
function compute_coefficients_gpu(t, semi::SemidiscretizationHyperbolic)

# Call `compute_coefficients_gpu` defined in `src/semidiscretization/semidiscretization.jl`
compute_coefficients_gpu(semi.initial_condition, t, semi)
end
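A hedged end-to-end sketch of how these entry points might be used, assuming `SemidiscretizationHyperbolicGPU` is exported; the mesh, equations, and solver setup below is standard Trixi.jl API, and the GPU/CPU comparison is illustrative rather than a test from this PR:

```julia
using Trixi, TrixiCUDA, CUDA

advection_velocity = 1.0
equations = LinearScalarAdvectionEquation1D(advection_velocity)
mesh = TreeMesh(-1.0, 1.0, initial_refinement_level = 4, n_cells_max = 30_000)
solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)

# GPU semidiscretization (GPU cache) and a plain CPU one for comparison
semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations,
                                           initial_condition_convergence_test, solver)
semi_cpu = SemidiscretizationHyperbolic(mesh, equations,
                                        initial_condition_convergence_test, solver)

u_gpu = TrixiCUDA.compute_coefficients_gpu(0.0, semi_gpu)  # CuArray, shape (nvars, nnodes, nelements)
u_ode = Trixi.compute_coefficients(initial_condition_convergence_test, 0.0, semi_cpu)
@assert Array(u_gpu) ≈ Trixi.wrap_array(u_ode, semi_cpu)
```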
44 changes: 29 additions & 15 deletions src/solvers/cache.jl
@@ -121,12 +121,16 @@ end

function create_cache_gpu(mesh::TreeMesh{2}, equations,
mortar_l2::LobattoLegendreMortarL2, uEltype, cache)
-fstar_upper = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
-nmortars(cache.mortars))
-fstar_lower = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
-nmortars(cache.mortars))
-
-(; fstar_upper, fstar_lower)
+fstar_primary_upper = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nmortars(cache.mortars))
+fstar_primary_lower = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nmortars(cache.mortars))
+fstar_secondary_upper = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nmortars(cache.mortars))
+fstar_secondary_lower = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nmortars(cache.mortars))
+
+(; fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower)
end

# Create cache specialized for 3D tree mesh
@@ -194,15 +198,25 @@ end

function create_cache_gpu(mesh::TreeMesh{3}, equations,
mortar_l2::LobattoLegendreMortarL2, uEltype, cache)
-fstar_upper_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
-nnodes(mortar_l2), nmortars(cache.mortars))
-fstar_upper_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
-nnodes(mortar_l2), nmortars(cache.mortars))
-fstar_lower_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
-nnodes(mortar_l2), nmortars(cache.mortars))
-fstar_lower_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
-nnodes(mortar_l2), nmortars(cache.mortars))
+fstar_primary_upper_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nnodes(mortar_l2), nmortars(cache.mortars))
+fstar_primary_upper_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nnodes(mortar_l2), nmortars(cache.mortars))
+fstar_primary_lower_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nnodes(mortar_l2), nmortars(cache.mortars))
+fstar_primary_lower_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nnodes(mortar_l2), nmortars(cache.mortars))
+fstar_secondary_upper_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nnodes(mortar_l2), nmortars(cache.mortars))
+fstar_secondary_upper_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nnodes(mortar_l2), nmortars(cache.mortars))
+fstar_secondary_lower_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nnodes(mortar_l2), nmortars(cache.mortars))
+fstar_secondary_lower_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+nnodes(mortar_l2), nmortars(cache.mortars))

# Temporary arrays can also be created here
-(; fstar_upper_left, fstar_upper_right, fstar_lower_left, fstar_lower_right)
+(; fstar_primary_upper_left, fstar_primary_upper_right, fstar_primary_lower_left,
+fstar_primary_lower_right, fstar_secondary_upper_left, fstar_secondary_upper_right,
+fstar_secondary_lower_left, fstar_secondary_lower_right)
end
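The split into primary and secondary buffers appears to match the corresponding change in Trixi.jl's mortar handling, where the surface flux is stored separately for the two sides of a mortar. These buffers are returned as a `NamedTuple` and, following the cache construction pattern Trixi.jl uses, are presumably splatted into the main GPU cache so the mortar kernels can access them as `cache.fstar_primary_upper` and so on; the merge itself happens elsewhere in the package. A runnable illustration of that splatting idiom with dummy values:

```julia
# NamedTuple-splatting merge, as used for DG caches (dummy values, not the real cache)
cache = (; elements = :elements_placeholder, mortars = :mortars_placeholder)
mortar_cache = (; fstar_primary_upper = zeros(2, 3), fstar_primary_lower = zeros(2, 3))
cache = (; cache..., mortar_cache...)
@assert haskey(cache, :fstar_primary_upper)
```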
17 changes: 4 additions & 13 deletions src/solvers/common.jl
@@ -1,20 +1,11 @@
# Some common functions that are shared between the solvers.

-# Copy data from CPU to GPU
-function copy_to_gpu!(du::PtrArray, u::PtrArray)
-du = CUDA.zeros(Float64, size(du)...)
-u = CuArray{Float64}(u)
+function reset_du!(du::CuArray{Float64})
+du_zero = zero(du)
+du .= du_zero # no scalar indexing
-
-return (du, u)
end

-# Copy data from GPU to CPU
-function copy_to_cpu!(du::CuArray, u::CuArray)
-# TODO: Direct CuArray to PtrArray
-du = PtrArray(Array{Float64}(du))
-u = PtrArray(Array{Float64}(u))
-
-return (du, u)
+return nothing
end

# Set diagonal entries of a matrix to zeros
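`reset_du!` replaces the old `copy_to_gpu!`/`copy_to_cpu!` round trip: the residual now lives on the device for the whole right-hand-side evaluation and is simply zeroed in place by broadcasting. A small usage sketch; the function is internal to the package and is called unqualified here only for illustration:

```julia
using CUDA

du = CUDA.rand(Float64, 3, 4, 16)   # e.g. (nvariables, nnodes, nelements) for a 1D run
reset_du!(du)                        # broadcast assignment, no scalar indexing, no host copies
@assert all(iszero, Array(du))
```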
1 change: 1 addition & 0 deletions src/solvers/containers_1d.jl
@@ -27,6 +27,7 @@ mutable struct ElementContainerGPU1D{RealT <: Real, uEltype <: Real} <: Abstract
end
end

Base.eltype(elements::ElementContainerGPU1D) = eltype(elements.surface_flux_values)
@inline nelements(elements::ElementContainerGPU1D) = length(elements.cell_ids)

# Copy arrays from DG elements to GPU
1 change: 1 addition & 0 deletions src/solvers/containers_2d.jl
@@ -27,6 +27,7 @@ mutable struct ElementContainerGPU2D{RealT <: Real, uEltype <: Real} <: Abstract
end
end

Base.eltype(elements::ElementContainerGPU2D) = eltype(elements.surface_flux_values)
@inline nelements(elements::ElementContainerGPU2D) = length(elements.cell_ids)

# Copy arrays from DG elements to GPU
1 change: 1 addition & 0 deletions src/solvers/containers_3d.jl
@@ -27,6 +27,7 @@ mutable struct ElementContainerGPU3D{RealT <: Real, uEltype <: Real} <: Abstract
end
end

Base.eltype(elements::ElementContainerGPU3D) = eltype(elements.surface_flux_values)
@inline nelements(elements::ElementContainerGPU3D) = length(elements.cell_ids)

# Copy arrays from DG elements to GPU
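The three one-line `Base.eltype` methods added above likely exist so that generic Trixi.jl code that queries `eltype(cache.elements)`, for example `allocate_coefficients` when it allocates the flat coefficient vector, also works with the GPU element containers. The pattern is simply forwarding `eltype` to a stored array; a self-contained illustration with a hypothetical container type, not the package's:

```julia
# Forwarding eltype to a stored array, as the GPU containers do
struct DummyElements{T}
    surface_flux_values::Array{T, 3}
end
Base.eltype(elements::DummyElements) = eltype(elements.surface_flux_values)

elements = DummyElements(zeros(Float32, 2, 3, 4))
@assert eltype(elements) == Float32   # generic code can now allocate buffers of matching type
```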
122 changes: 122 additions & 0 deletions src/solvers/dg.jl
@@ -0,0 +1,122 @@
# Kernel for computing the coefficients for 1D problems
function compute_coefficients_kernel!(u, node_coordinates, func::Any, t,
equations::AbstractEquations{1})
j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

if (j <= size(u, 2) && k <= size(u, 3))
x_node = get_node_coords(node_coordinates, equations, j, k)

if j == 1 # bad
x_node = SVector(nextfloat(x_node[1]))
elseif j == size(u, 2) # bad
x_node = SVector(prevfloat(x_node[1]))
end

u_node = func(x_node, t, equations)

@inbounds begin
for ii in axes(u, 1)
u[ii, j, k] = u_node[ii]
end
end
end

return nothing
end
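The `nextfloat`/`prevfloat` branches (marked `# bad` above) appear to mirror Trixi.jl's CPU `compute_coefficients!`, which nudges the first and last node of each element one ulp into the interior so that discontinuous initial conditions are evaluated on the correct side of an element boundary. For reference:

```julia
# One-ulp nudges applied to the boundary nodes of each element
x = 0.5
nextfloat(x)                          # 0.5000000000000001, just inside from the left
prevfloat(x)                          # 0.49999999999999994, just inside from the right
@assert prevfloat(x) < x < nextfloat(x)
```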

# Kernel for computing the coefficients for 2D problems
function compute_coefficients_kernel!(u, node_coordinates, func::Any, t,
equations::AbstractEquations{2})
j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

if (j <= size(u, 2)^2 && k <= size(u, 4))
j1 = div(j - 1, size(u, 2)) + 1
j2 = rem(j - 1, size(u, 2)) + 1

x_node = get_node_coords(node_coordinates, equations, j1, j2, k)

u_node = func(x_node, t, equations)

@inbounds begin
for ii in axes(u, 1)
u[ii, j1, j2, k] = u_node[ii]
end
end
end

return nothing
end
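Each thread handles one node pair `(j1, j2)` of one element `k`; the flat index `j` is decomposed with `div`/`rem`. A quick check of that mapping, with an assumed example node count:

```julia
n = 4                                 # size(u, 2), nodes per direction
for j in 1:n^2
    j1 = div(j - 1, n) + 1
    j2 = rem(j - 1, n) + 1
    @assert j == (j1 - 1) * n + j2    # round-trips back to the flat index
end
```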

# Kernel for computing the coefficients for 3D problems
function compute_coefficients_kernel!(u, node_coordinates, func::Any, t,
equations::AbstractEquations{3})
j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

if (j <= size(u, 2)^3 && k <= size(u, 5))
u2 = size(u, 2)

j1 = div(j - 1, u2^2) + 1
j2 = div(rem(j - 1, u2^2), u2) + 1
j3 = rem(rem(j - 1, u2^2), u2) + 1

x_node = get_node_coords(node_coordinates, equations, j1, j2, j3, k)

u_node = func(x_node, t, equations)

@inbounds begin
for ii in axes(u, 1)
u[ii, j1, j2, j3, k] = u_node[ii]
end
end
end

return nothing
end
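The 3D kernel decomposes the flat index analogously into `(j1, j2, j3)`; the same kind of check:

```julia
n = 4                                 # size(u, 2)
for j in 1:n^3
    j1 = div(j - 1, n^2) + 1
    j2 = div(rem(j - 1, n^2), n) + 1
    j3 = rem(rem(j - 1, n^2), n) + 1
    @assert j == (j1 - 1) * n^2 + (j2 - 1) * n + j3
end
```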

# Call kernels to compute the coefficients for 1D problems
function compute_coefficients_gpu(u, func, t, mesh::AbstractMesh{1}, equations, dg::DGSEM, cache)
node_coordinates = cache.elements.node_coordinates

compute_coefficients_kernel = @cuda launch=false compute_coefficients_kernel!(u,
node_coordinates,
func, t,
equations)
compute_coefficients_kernel(u, node_coordinates, func, t, equations;
kernel_configurator_2d(compute_coefficients_kernel, size(u, 2),
size(u, 3))...)

return u
end

# Call kernels to compute the coefficients for 2D problems
function compute_coefficients_gpu(u, func, t, mesh::AbstractMesh{2}, equations, dg::DGSEM, cache)
node_coordinates = cache.elements.node_coordinates

compute_coefficients_kernel = @cuda launch=false compute_coefficients_kernel!(u,
node_coordinates,
func, t,
equations)
compute_coefficients_kernel(u, node_coordinates, func, t, equations;
kernel_configurator_2d(compute_coefficients_kernel, size(u, 2)^2,
size(u, 4))...)

return u
end

# Call kernels to compute the coefficients for 3D problems
function compute_coefficients_gpu(u, func, t, mesh::AbstractMesh{3}, equations, dg::DGSEM, cache)
node_coordinates = cache.elements.node_coordinates

compute_coefficients_kernel = @cuda launch=false compute_coefficients_kernel!(u,
node_coordinates,
func, t,
equations)
compute_coefficients_kernel(u, node_coordinates, func, t, equations;
kernel_configurator_2d(compute_coefficients_kernel, size(u, 2)^3,
size(u, 5))...)

return u
end
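Each launcher compiles the kernel once with `@cuda launch=false` and then launches it with thread/block counts supplied by `kernel_configurator_2d`, which is defined elsewhere in the package. A hedged sketch of what such a configurator could look like, based on CUDA.jl's occupancy API; the name `kernel_configurator_2d_sketch` and the splitting heuristic are assumptions, not the package's verified implementation:

```julia
using CUDA

# Pick (threads, blocks) for a kernel indexed over two flat dimensions x and y
function kernel_configurator_2d_sketch(kernel::CUDA.HostKernel, x::Int, y::Int)
    config = launch_configuration(kernel.fun)          # occupancy-based suggestion

    threads_x = min(x, max(1, floor(Int, sqrt(config.threads))))
    threads_y = min(y, max(1, config.threads ÷ threads_x))
    blocks = (cld(x, threads_x), cld(y, threads_y))

    return (threads = (threads_x, threads_y), blocks = blocks)
end
```

The splatted return value then fills the `threads`/`blocks` keyword arguments of the compiled kernel, as in the calls above.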
7 changes: 2 additions & 5 deletions src/solvers/dg_1d.jl
@@ -1075,9 +1075,9 @@ end
# Put everything together into a single function.

# See also `rhs!` function in Trixi.jl
-function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{1}, equations, boundary_conditions,
+function rhs_gpu!(du, u, t, mesh::TreeMesh{1}, equations, boundary_conditions,
source_terms::Source, dg::DGSEM, cache) where {Source}
-du, u = copy_to_gpu!(du_cpu, u_cpu)
+reset_du!(du)

cuda_volume_integral!(du, u, mesh, have_nonconservative_terms(equations), equations,
dg.volume_integral, dg, cache)
@@ -1097,8 +1097,5 @@ function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{1}, equations, boundary_condi

cuda_sources!(du, u, t, source_terms, equations, cache)

-du_computed, _ = copy_to_cpu!(du, u)
-du_cpu .= du_computed
-
return nothing
end
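With `du` and `u` now expected to be device arrays, the host/device copies at the start and end of `rhs_gpu!` disappear and the solution stays on the GPU across the whole right-hand-side evaluation. A hedged sketch of how a caller might drive it; this wrapper is illustrative and is not the package's actual ODE interface:

```julia
using Trixi: mesh_equations_solver_cache

# du and u are assumed to be the wrapped CuArrays of a GPU semidiscretization
function rhs_wrapper!(du, u, t, semi)
    mesh, equations, solver, cache = mesh_equations_solver_cache(semi)
    rhs_gpu!(du, u, t, mesh, equations, semi.boundary_conditions,
             semi.source_terms, solver, cache)
    return nothing
end
```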