Skip to content

Commit

Permalink
Parallelization of compute coefficients functions on GPU (#63)
Browse files Browse the repository at this point in the history
* Start

* Compute coefficients

* Complete 1D

* Remove StrideArrays

* Fix noncons mortar flux 2D

* Complete 2D

* Fix noncons mortar flux 3D

* Complete 3D
  • Loading branch information
huiyuxie authored Dec 6, 2024
1 parent 5466caa commit ded2864
Show file tree
Hide file tree
Showing 68 changed files with 4,552 additions and 6,033 deletions.
2 changes: 0 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
StrideArrays = "d1fa6d79-ef01-42a6-86c9-f7c551f8593b"
Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"
TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"

Expand All @@ -17,7 +16,6 @@ BenchmarkTools = "1"
CUDA = "5"
SciMLBase = "2"
StaticArrays = "1"
StrideArrays = "0.1"
Trixi = "0.8, 0.9"
TrixiBase = "0.1"
julia = "1.10"
Expand Down
5 changes: 2 additions & 3 deletions src/TrixiCUDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ using CUDA
using CUDA: @cuda, CuArray, HostKernel,
threadIdx, blockIdx, blockDim, similar, launch_configuration

using Trixi: AbstractEquations, AbstractContainer,
using Trixi: AbstractEquations, AbstractContainer, AbstractSemidiscretization, AbstractMesh,
ElementContainer1D, ElementContainer2D, ElementContainer3D,
InterfaceContainer1D, InterfaceContainer2D, InterfaceContainer3D,
BoundaryContainer1D, BoundaryContainer2D, BoundaryContainer3D,
Expand All @@ -18,6 +18,7 @@ using Trixi: AbstractEquations, AbstractContainer,
BoundaryConditionPeriodic, BoundaryConditionDirichlet,
VolumeIntegralWeakForm, VolumeIntegralFluxDifferencing, VolumeIntegralShockCapturingHG,
LobattoLegendreMortarL2,
allocate_coefficients, mesh_equations_solver_cache,
flux, ntuple, nvariables, nnodes, nelements, nmortars,
local_leaf_cells, init_elements, init_interfaces, init_boundaries, init_mortars,
wrap_array, compute_coefficients, have_nonconservative_terms,
Expand All @@ -30,8 +31,6 @@ import Trixi: get_node_vars, get_node_coords, get_surface_node_vars,

using SciMLBase: ODEProblem, FullSpecialize

using StrideArrays: PtrArray

using StaticArrays: SVector

# Include other source files
Expand Down
19 changes: 19 additions & 0 deletions src/semidiscretization/semidiscretization.jl
Original file line number Diff line number Diff line change
@@ -1 +1,20 @@
include("semidiscretization_hyperbolic.jl")

# GPU counterpart of Trixi.jl's `compute_coefficients`: allocates the
# coefficient vector for `semi` and fills it on the device.
function compute_coefficients_gpu(func, t, semi::AbstractSemidiscretization)
    mesh, equations, solver, cache = mesh_equations_solver_cache(semi)
    u_ode = allocate_coefficients(mesh, equations, solver, cache)

    # Delegate to the array-wrapping method defined below
    return compute_coefficients_gpu(u_ode, func, t, semi)
end

# Dimension-agnostic driver: wraps the ODE vector in the solver's native array
# shape, moves it to the GPU, and dispatches on the mesh dimension (the 1D/2D/3D
# split happens in `src/solvers/dg.jl`, not here).
function compute_coefficients_gpu(u_ode, func, t, semi::AbstractSemidiscretization)
    # View `u_ode` with the (variable, node..., element) layout
    u = wrap_array(u_ode, semi)
    # Transfer to device memory
    u = CuArray(u)

    # Call `compute_coefficients_gpu` defined in `src/solvers/dg.jl`
    u_computed = compute_coefficients_gpu(u, func, t, mesh_equations_solver_cache(semi)...)
    return u_computed
end
11 changes: 10 additions & 1 deletion src/semidiscretization/semidiscretization_hyperbolic.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# This file is part of the package `Semidiscretizations.jl`.
# Everything specific about semidiscretization hyperbolic for PDE solvers.

# Similar to `SemidiscretizationHyperbolic` in Trixi.jl but for GPU cache
function SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver;
source_terms = nothing,
boundary_conditions = boundary_condition_periodic,
Expand All @@ -16,6 +17,7 @@ function SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, sol

check_periodicity_mesh_boundary_conditions(mesh, _boundary_conditions)

# Return the CPU type
SemidiscretizationHyperbolic{typeof(mesh), typeof(equations),
typeof(initial_condition),
typeof(_boundary_conditions), typeof(source_terms),
Expand All @@ -25,3 +27,10 @@ function SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, sol
source_terms, solver,
cache)
end

# GPU analogue of Trixi.jl's `compute_coefficients` for hyperbolic
# semidiscretizations: evaluates the stored initial condition on the device.
function compute_coefficients_gpu(t, semi::SemidiscretizationHyperbolic)
    ic = semi.initial_condition

    # Forward to the generic entry point in
    # `src/semidiscretization/semidiscretization.jl`
    return compute_coefficients_gpu(ic, t, semi)
end
44 changes: 29 additions & 15 deletions src/solvers/cache.jl
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,16 @@ end

# Create the GPU mortar cache for a 2D tree mesh.
#
# Allocates zero-initialized device buffers for the primary and secondary
# mortar fluxes on the upper and lower mortar halves; each buffer has shape
# (variable, node, mortar). Note: the scraped diff had the pre-commit
# `fstar_upper`/`fstar_lower` allocations and their stale return tuple merged
# into this body; only the committed primary/secondary version is kept.
function create_cache_gpu(mesh::TreeMesh{2}, equations,
                          mortar_l2::LobattoLegendreMortarL2, uEltype, cache)
    fstar_primary_upper = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                     nmortars(cache.mortars))
    fstar_primary_lower = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                     nmortars(cache.mortars))
    fstar_secondary_upper = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                       nmortars(cache.mortars))
    fstar_secondary_lower = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                       nmortars(cache.mortars))

    (; fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower)
end

# Create cache specialized for 3D tree mesh
Expand Down Expand Up @@ -194,15 +198,25 @@ end

# Create the GPU mortar cache for a 3D tree mesh.
#
# Allocates zero-initialized device buffers for the primary and secondary
# mortar fluxes on the four mortar quadrants (upper/lower x left/right); each
# buffer has shape (variable, node, node, mortar). Note: the scraped diff had
# the pre-commit four-buffer version and its stale return tuple merged into
# this body; only the committed primary/secondary version is kept.
function create_cache_gpu(mesh::TreeMesh{3}, equations,
                          mortar_l2::LobattoLegendreMortarL2, uEltype, cache)
    fstar_primary_upper_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                          nnodes(mortar_l2), nmortars(cache.mortars))
    fstar_primary_upper_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                           nnodes(mortar_l2), nmortars(cache.mortars))
    fstar_primary_lower_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                          nnodes(mortar_l2), nmortars(cache.mortars))
    fstar_primary_lower_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                           nnodes(mortar_l2), nmortars(cache.mortars))
    fstar_secondary_upper_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                            nnodes(mortar_l2), nmortars(cache.mortars))
    fstar_secondary_upper_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                             nnodes(mortar_l2), nmortars(cache.mortars))
    fstar_secondary_lower_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                            nnodes(mortar_l2), nmortars(cache.mortars))
    fstar_secondary_lower_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
                                             nnodes(mortar_l2), nmortars(cache.mortars))

    # Temporary arrays can also be created here
    (; fstar_primary_upper_left, fstar_primary_upper_right, fstar_primary_lower_left,
     fstar_primary_lower_right, fstar_secondary_upper_left, fstar_secondary_upper_right,
     fstar_secondary_lower_left, fstar_secondary_lower_right)
end
17 changes: 4 additions & 13 deletions src/solvers/common.jl
Original file line number Diff line number Diff line change
@@ -1,20 +1,11 @@
# Some common functions that are shared between the solvers.

# Zero out the GPU residual array `du` in place.
#
# Replaces the removed `copy_to_gpu!`/`copy_to_cpu!` pair (the scraped diff had
# their removed bodies merged into this one); broadcasting a zero array avoids
# scalar indexing on the device.
function reset_du!(du::CuArray{Float64})
    du_zero = zero(du)
    du .= du_zero # no scalar indexing

    return nothing
end

# Set diagonal entries of a matrix to zeros
Expand Down
1 change: 1 addition & 0 deletions src/solvers/containers_1d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ mutable struct ElementContainerGPU1D{RealT <: Real, uEltype <: Real} <: Abstract
end
end

# Element type of the container's data, taken from the surface flux storage
Base.eltype(elements::ElementContainerGPU1D) = eltype(elements.surface_flux_values)
# Number of elements, inferred from the per-element cell-id list
@inline nelements(elements::ElementContainerGPU1D) = length(elements.cell_ids)

# Copy arrays from DG elements to GPU
Expand Down
1 change: 1 addition & 0 deletions src/solvers/containers_2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ mutable struct ElementContainerGPU2D{RealT <: Real, uEltype <: Real} <: Abstract
end
end

# Element type of the container's data, taken from the surface flux storage
Base.eltype(elements::ElementContainerGPU2D) = eltype(elements.surface_flux_values)
# Number of elements, inferred from the per-element cell-id list
@inline nelements(elements::ElementContainerGPU2D) = length(elements.cell_ids)

# Copy arrays from DG elements to GPU
Expand Down
1 change: 1 addition & 0 deletions src/solvers/containers_3d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ mutable struct ElementContainerGPU3D{RealT <: Real, uEltype <: Real} <: Abstract
end
end

# Element type of the container's data, taken from the surface flux storage
Base.eltype(elements::ElementContainerGPU3D) = eltype(elements.surface_flux_values)
# Number of elements, inferred from the per-element cell-id list
@inline nelements(elements::ElementContainerGPU3D) = length(elements.cell_ids)

# Copy arrays from DG elements to GPU
Expand Down
122 changes: 122 additions & 0 deletions src/solvers/dg.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Kernel for computing the coefficients for 1D problems.
# `u` is indexed as (variable, node, element); each GPU thread evaluates the
# initial-condition function `func` at one (node, element) pair.
function compute_coefficients_kernel!(u, node_coordinates, func::Any, t,
                                      equations::AbstractEquations{1})
    # Global thread indices: j -> node index, k -> element index
    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if (j <= size(u, 2) && k <= size(u, 3))
        x_node = get_node_coords(node_coordinates, equations, j, k)

        # Nudge the first/last node coordinate off the exact element boundary
        # with nextfloat/prevfloat before evaluating `func`. Flagged "bad" by
        # the original author — presumably a workaround for evaluating
        # discontinuous initial conditions exactly on interfaces; TODO confirm
        # and remove if possible.
        if j == 1 # bad
            x_node = SVector(nextfloat(x_node[1]))
        elseif j == size(u, 2) # bad
            x_node = SVector(prevfloat(x_node[1]))
        end

        u_node = func(x_node, t, equations)

        # Write all solution variables for this node
        @inbounds begin
            for ii in axes(u, 1)
                u[ii, j, k] = u_node[ii]
            end
        end
    end

    return nothing
end

# Kernel for computing the coefficients for 2D problems.
# `u` is indexed as (variable, node_x, node_y, element); the x-thread index `j`
# linearizes the (node_x, node_y) pair, the y-thread index `k` is the element.
function compute_coefficients_kernel!(u, node_coordinates, func::Any, t,
                                      equations::AbstractEquations{2})
    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if (j <= size(u, 2)^2 && k <= size(u, 4))
        # Decompose the linear node index into the two node directions
        j1 = div(j - 1, size(u, 2)) + 1
        j2 = rem(j - 1, size(u, 2)) + 1

        x_node = get_node_coords(node_coordinates, equations, j1, j2, k)

        u_node = func(x_node, t, equations)

        # Write all solution variables for this node
        @inbounds begin
            for ii in axes(u, 1)
                u[ii, j1, j2, k] = u_node[ii]
            end
        end
    end

    return nothing
end

# Kernel for computing the coefficients for 3D problems.
# `u` is indexed as (variable, node_x, node_y, node_z, element); the x-thread
# index `j` linearizes the (node_x, node_y, node_z) triple, the y-thread index
# `k` is the element.
function compute_coefficients_kernel!(u, node_coordinates, func::Any, t,
                                      equations::AbstractEquations{3})
    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if (j <= size(u, 2)^3 && k <= size(u, 5))
        u2 = size(u, 2) # nodes per direction

        # Decompose the linear node index into the three node directions
        j1 = div(j - 1, u2^2) + 1
        j2 = div(rem(j - 1, u2^2), u2) + 1
        j3 = rem(rem(j - 1, u2^2), u2) + 1

        x_node = get_node_coords(node_coordinates, equations, j1, j2, j3, k)

        u_node = func(x_node, t, equations)

        # Write all solution variables for this node
        @inbounds begin
            for ii in axes(u, 1)
                u[ii, j1, j2, j3, k] = u_node[ii]
            end
        end
    end

    return nothing
end

# Launch the 1D coefficient kernel over (node, element) index pairs.
function compute_coefficients_gpu(u, func, t, mesh::AbstractMesh{1}, equations, dg::DGSEM, cache)
    coords = cache.elements.node_coordinates

    # Compile first without launching, then launch with an occupancy-derived
    # 2D thread/block configuration
    kernel = @cuda launch=false compute_coefficients_kernel!(u, coords, func, t, equations)
    config = kernel_configurator_2d(kernel, size(u, 2), size(u, 3))
    kernel(u, coords, func, t, equations; config...)

    return u
end

# Launch the 2D coefficient kernel over (linearized node pair, element) indices.
function compute_coefficients_gpu(u, func, t, mesh::AbstractMesh{2}, equations, dg::DGSEM, cache)
    coords = cache.elements.node_coordinates

    # Compile first without launching, then launch with an occupancy-derived
    # 2D thread/block configuration
    kernel = @cuda launch=false compute_coefficients_kernel!(u, coords, func, t, equations)
    config = kernel_configurator_2d(kernel, size(u, 2)^2, size(u, 4))
    kernel(u, coords, func, t, equations; config...)

    return u
end

# Launch the 3D coefficient kernel over (linearized node triple, element)
# indices. (The original header comment mislabeled this method as 2D.)
function compute_coefficients_gpu(u, func, t, mesh::AbstractMesh{3}, equations, dg::DGSEM, cache)
    coords = cache.elements.node_coordinates

    # Compile first without launching, then launch with an occupancy-derived
    # 2D thread/block configuration
    kernel = @cuda launch=false compute_coefficients_kernel!(u, coords, func, t, equations)
    config = kernel_configurator_2d(kernel, size(u, 2)^3, size(u, 5))
    kernel(u, coords, func, t, equations; config...)

    return u
end
7 changes: 2 additions & 5 deletions src/solvers/dg_1d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1075,9 +1075,9 @@ end
# Put everything together into a single function.

# See also `rhs!` function in Trixi.jl
function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{1}, equations, boundary_conditions,
function rhs_gpu!(du, u, t, mesh::TreeMesh{1}, equations, boundary_conditions,
source_terms::Source, dg::DGSEM, cache) where {Source}
du, u = copy_to_gpu!(du_cpu, u_cpu)
reset_du!(du)

cuda_volume_integral!(du, u, mesh, have_nonconservative_terms(equations), equations,
dg.volume_integral, dg, cache)
Expand All @@ -1097,8 +1097,5 @@ function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{1}, equations, boundary_condi

cuda_sources!(du, u, t, source_terms, equations, cache)

du_computed, _ = copy_to_cpu!(du, u)
du_cpu .= du_computed

return nothing
end
Loading

0 comments on commit ded2864

Please sign in to comment.