diff --git a/src/TrixiGPU.jl b/src/TrixiGPU.jl index 3893dc11..f2c06dc2 100644 --- a/src/TrixiGPU.jl +++ b/src/TrixiGPU.jl @@ -1,6 +1,6 @@ module TrixiGPU -# Include other packages that are used in TrixiGPU.jl (# FIXME: Remember to reorder) +# Include other packages that are used in TrixiGPU.jl # using Reexport: @reexport using CUDA: @cuda, CuArray, HostKernel, @@ -11,8 +11,8 @@ using Trixi: AbstractEquations, TreeMesh, DGSEM, VolumeIntegralWeakForm, VolumeIntegralFluxDifferencing, flux, ntuple, nvariables, True, False, - wrap_array, compute_coefficients, - have_nonconservative_terms, + wrap_array, compute_coefficients, have_nonconservative_terms, + boundary_condition_periodic, set_log_type, set_sqrt_type import Trixi: get_node_vars, get_node_coords, get_surface_node_vars diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 699dc7fd..98c67936 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -1,5 +1,5 @@ include("configurators.jl") -include("methods.jl") +include("helpers.jl") # Some default settings for the package function load_default_settings() diff --git a/src/auxiliary/configurators.jl b/src/auxiliary/configurators.jl index d36f23ed..a668f1cf 100644 --- a/src/auxiliary/configurators.jl +++ b/src/auxiliary/configurators.jl @@ -1,5 +1,5 @@ -# Kernel configurators are used for determining the number of -# threads and blocks to be used in the kernel, which optimizes the use of GPU resources. +# Kernel configurators are used for determining the number of threads and +# blocks to be used in the kernel, which optimizes the use of GPU resources. # Kernel configurator for 1D CUDA array function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1}) diff --git a/src/auxiliary/methods.jl b/src/auxiliary/helpers.jl similarity index 72% rename from src/auxiliary/methods.jl rename to src/auxiliary/helpers.jl index 40af3479..04278958 100644 --- a/src/auxiliary/methods.jl +++ b/src/auxiliary/helpers.jl @@ -1,4 +1,4 @@ -# Extend common helper methods from Trixi.jl +# Some helper functions and function extensions from Trixi.jl. # Ref: `get_node_vars(u, equations, solver::DG, indices...)` in Trixi.jl @inline function get_node_vars(u, equations, indices...) @@ -21,3 +21,9 @@ end return u_ll, u_rr end + +# Callable function to replace the `boundary_condition_periodic` from Trixi.jl +@inline function boundary_condition_periodic_callable(u_inner, orientation, + direction, x, t, surface_flux, equations) + return nothing +end diff --git a/src/solvers/common.jl b/src/solvers/common.jl index 25b54e02..72fe92ee 100644 --- a/src/solvers/common.jl +++ b/src/solvers/common.jl @@ -1,4 +1,12 @@ -# Here are some common functions that are shared between the solvers. +# Some common functions that are shared between the solvers. + +# Replace the `boundary_condition_periodic` from Trixi.jl with a callable one +function replace_boundary_conditions(boundary_conditions::NamedTuple) + keys_ = keys(boundary_conditions) + values_ = (func == boundary_condition_periodic ? boundary_condition_periodic_callable : func + for func in values(boundary_conditions)) + return NamedTuple{keys_}(values_) +end # Copy data from host to device function copy_to_device!(du::PtrArray, u::PtrArray) @@ -16,3 +24,19 @@ function copy_to_host!(du::CuArray, u::CuArray) return (du, u) end + +# Kernel for getting last and first indices +function last_first_indices_kernel!(lasts, firsts, n_boundaries_per_direction) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if (i <= length(n_boundaries_per_direction)) + @inbounds begin + for ii in 1:i + lasts[i] += n_boundaries_per_direction[ii] + end + firsts[i] = lasts[i] - n_boundaries_per_direction[i] + 1 + end + end + + return nothing +end diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl index 3c874942..9fbc5255 100644 --- a/src/solvers/dg_1d.jl +++ b/src/solvers/dg_1d.jl @@ -1,4 +1,4 @@ -# Everything related to a DG semidiscretization in 1D +# Everything related to a DG semidiscretization in 1D. # Functions that end with `_kernel` are CUDA kernels that are going to be launched by # the @cuda macro with parameters from the kernel configurator. They are purely run on @@ -84,7 +84,7 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr) end # Kernel for prolonging two interfaces -function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, equations::AbstractEquations{1}) +function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids) j = (blockIdx().x - 1) * blockDim().x + threadIdx().x k = (blockIdx().y - 1) * blockDim().y + threadIdx().y @@ -122,8 +122,7 @@ function surface_flux_kernel!(surface_flux_arr, interfaces_u, equations::Abstrac end # Kernel for setting interface fluxes -function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ids, - equations::AbstractEquations{1}) +function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ids) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x k = (blockIdx().y - 1) * blockDim().y + threadIdx().y @@ -140,6 +139,65 @@ function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ return nothing end +# Kernel for prolonging two boundaries +function prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(boundaries_u, 2) && k <= size(boundaries_u, 3)) + element = neighbor_ids[k] + side = neighbor_sides[k] + + @inbounds begin + boundaries_u[1, j, k] = u[j, size(u, 2), element] * isequal(side, 1) # set to 0 instead of NaN + boundaries_u[2, j, k] = u[j, 1, element] * (1 - isequal(side, 1)) # set to 0 instead of NaN + end + end + + return nothing +end + +# Kernel for calculating boundary fluxes +function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions::NamedTuple, equations::AbstractEquations{1}, + surface_flux::Any) + k = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if (k <= length(boundary_arr)) + boundary = boundary_arr[k] + direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + + neighbor = neighbor_ids[boundary] + side = neighbor_sides[boundary] + orientation = orientations[boundary] + + u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, boundary) + u_inner = isequal(side, 1) * u_ll + (1 - isequal(side, 1)) * u_rr + x = get_node_coords(node_coordinates, equations, boundary) + + # TODO: Optimize flow controls as they are not recommended on GPUs + if direction == 1 + boundary_flux_node = boundary_conditions[1](u_inner, orientation, + direction, x, t, surface_flux, equations) + else + boundary_flux_node = boundary_conditions[2](u_inner, orientation, + direction, x, t, surface_flux, equations) + end + + @inbounds begin + for ii in axes(surface_flux_values, 1) + surface_flux_values[ii, direction, neighbor] = boundary_flux_node === nothing ? # bad + surface_flux_values[ii, direction, + neighbor] : + boundary_flux_node[ii] + end + end + end + + return nothing +end + # Kernel for calculating surface integrals function surface_integral_kernel!(du, factor_arr, surface_flux_values, equations::AbstractEquations{1}) @@ -226,12 +284,12 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr, u, equations, volume_flux) volume_flux_kernel(volume_flux_arr, u, equations, volume_flux; - configurator_2d(volume_flux_kernel, size_arr)...,) + configurator_2d(volume_flux_kernel, size_arr)...) volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, volume_flux_arr) volume_integral_kernel(du, derivative_split, volume_flux_arr; - configurator_3d(volume_integral_kernel, du)...,) + configurator_3d(volume_integral_kernel, du)...) return nothing end @@ -244,10 +302,9 @@ function cuda_prolong2interfaces!(u, mesh::TreeMesh{1}, equations, cache) size_arr = CuArray{Float64}(undef, size(interfaces_u, 2), size(interfaces_u, 3)) prolong_interfaces_kernel = @cuda launch=false prolong_interfaces_kernel!(interfaces_u, u, - neighbor_ids, - equations) - prolong_interfaces_kernel(interfaces_u, u, neighbor_ids, equations; - configurator_2d(prolong_interfaces_kernel, size_arr)...,) + neighbor_ids) + prolong_interfaces_kernel(interfaces_u, u, neighbor_ids; + configurator_2d(prolong_interfaces_kernel, size_arr)...) cache.interfaces.u = interfaces_u # copy back to host automatically @@ -269,15 +326,15 @@ function cuda_interface_flux!(mesh::TreeMesh{1}, nonconservative_terms::False, e surface_flux_kernel = @cuda launch=false surface_flux_kernel!(surface_flux_arr, interfaces_u, equations, surface_flux) surface_flux_kernel(surface_flux_arr, interfaces_u, equations, surface_flux; - configurator_1d(surface_flux_kernel, size_arr)...,) + configurator_1d(surface_flux_kernel, size_arr)...) size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(interfaces_u, 3)) interface_flux_kernel = @cuda launch=false interface_flux_kernel!(surface_flux_values, surface_flux_arr, - neighbor_ids, equations) - interface_flux_kernel(surface_flux_values, surface_flux_arr, neighbor_ids, equations; - configurator_2d(interface_flux_kernel, size_arr)...,) + neighbor_ids) + interface_flux_kernel(surface_flux_values, surface_flux_arr, neighbor_ids; + configurator_2d(interface_flux_kernel, size_arr)...) cache.elements.surface_flux_values = surface_flux_values # copy back to host automatically @@ -286,16 +343,82 @@ end # Dummy function for asserting boundaries function cuda_prolong2boundaries!(u, mesh::TreeMesh{1}, - boundary_condition::BoundaryConditionPeriodic, cache) + boundary_condition::BoundaryConditionPeriodic, equations, cache) @assert iszero(length(cache.boundaries.orientations)) end +# Pack kernels for prolonging solution to boundaries +function cuda_prolong2boundaries!(u, mesh::TreeMesh{1}, boundary_conditions::NamedTuple, equations, + cache) + neighbor_ids = CuArray{Int64}(cache.boundaries.neighbor_ids) + neighbor_sides = CuArray{Int64}(cache.boundaries.neighbor_sides) + boundaries_u = CuArray{Float64}(cache.boundaries.u) + + size_arr = CuArray{Float64}(undef, size(boundaries_u, 2), size(boundaries_u, 3)) + + prolong_boundaries_kernel = @cuda launch=false prolong_boundaries_kernel!(boundaries_u, u, + neighbor_ids, + neighbor_sides) + prolong_boundaries_kernel(boundaries_u, u, neighbor_ids, neighbor_sides; + configurator_2d(prolong_boundaries_kernel, size_arr)...) + + cache.boundaries.u = boundaries_u # copy back to host automatically + + return nothing +end + # Dummy function for asserting boundary fluxes function cuda_boundary_flux!(t, mesh::TreeMesh{1}, boundary_condition::BoundaryConditionPeriodic, equations, dg::DGSEM, cache) @assert iszero(length(cache.boundaries.orientations)) end +# Pack kernels for calculating boundary fluxes +function cuda_boundary_flux!(t, mesh::TreeMesh{1}, boundary_conditions::NamedTuple, equations, + dg::DGSEM, cache) + surface_flux = dg.surface_integral.surface_flux + + n_boundaries_per_direction = CuArray{Int64}(cache.boundaries.n_boundaries_per_direction) + neighbor_ids = CuArray{Int64}(cache.boundaries.neighbor_ids) + neighbor_sides = CuArray{Int64}(cache.boundaries.neighbor_sides) + orientations = CuArray{Int64}(cache.boundaries.orientations) + boundaries_u = CuArray{Float64}(cache.boundaries.u) + node_coordinates = CuArray{Float64}(cache.boundaries.node_coordinates) + surface_flux_values = CuArray{Float64}(cache.elements.surface_flux_values) + + lasts = zero(n_boundaries_per_direction) + firsts = zero(n_boundaries_per_direction) + + last_first_indices_kernel = @cuda launch=false last_first_indices_kernel!(lasts, firsts, + n_boundaries_per_direction) + last_first_indices_kernel(lasts, firsts, n_boundaries_per_direction; + configurator_1d(last_first_indices_kernel, lasts)...) + + lasts, firsts = Array(lasts), Array(firsts) + boundary_arr = CuArray{Int64}(firsts[1]:lasts[2]) + indices_arr = CuArray{Int64}([firsts[1], firsts[2]]) + boundary_conditions_callable = replace_boundary_conditions(boundary_conditions) + + size_arr = CuArray{Float64}(undef, length(boundary_arr)) + + boundary_flux_kernel = @cuda launch=false boundary_flux_kernel!(surface_flux_values, + boundaries_u, node_coordinates, + t, boundary_arr, indices_arr, + neighbor_ids, neighbor_sides, + orientations, + boundary_conditions_callable, + equations, + surface_flux) + boundary_flux_kernel(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions_callable, equations, surface_flux; + configurator_1d(boundary_flux_kernel, size_arr)...) + + cache.elements.surface_flux_values = surface_flux_values # copy back to host automatically + + return nothing +end + # Pack kernels for calculating surface integrals function cuda_surface_integral!(du, mesh::TreeMesh{1}, equations, dg::DGSEM, cache) # FIXME: Check `surface_integral` @@ -311,7 +434,7 @@ function cuda_surface_integral!(du, mesh::TreeMesh{1}, equations, dg::DGSEM, cac surface_flux_values, equations) surface_integral_kernel(du, factor_arr, surface_flux_values, equations; - configurator_3d(surface_integral_kernel, size_arr)...,) + configurator_3d(surface_integral_kernel, size_arr)...) return nothing end @@ -333,14 +456,14 @@ end # Pack kernels for calculating source terms function cuda_sources!(du, u, t, source_terms, equations::AbstractEquations{1}, cache) - node_coordinates = CuArray{Float32}(cache.elements.node_coordinates) + node_coordinates = CuArray{Float64}(cache.elements.node_coordinates) - size_arr = CuArray{Float32}(undef, size(du, 2), size(du, 3)) + size_arr = CuArray{Float64}(undef, size(du, 2), size(du, 3)) source_terms_kernel = @cuda launch=false source_terms_kernel!(du, u, node_coordinates, t, equations, source_terms) source_terms_kernel(du, u, node_coordinates, t, equations, source_terms; - configurator_2d(source_terms_kernel, size_arr)...,) + configurator_2d(source_terms_kernel, size_arr)...) return nothing end @@ -359,7 +482,7 @@ function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{1}, equations, initial_condit cuda_interface_flux!(mesh, have_nonconservative_terms(equations), equations, dg, cache) - cuda_prolong2boundaries!(u, mesh, boundary_conditions, cache) + cuda_prolong2boundaries!(u, mesh, boundary_conditions, equations, cache) cuda_boundary_flux!(t, mesh, boundary_conditions, equations, dg, cache) diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl index 3d284c52..90ad0f28 100644 --- a/src/solvers/dg_2d.jl +++ b/src/solvers/dg_2d.jl @@ -1,4 +1,4 @@ -# Everything related to a DG semidiscretization in 2D +# Everything related to a DG semidiscretization in 2D. # Functions that end with `_kernel` are CUDA kernels that are going to be launched by # the @cuda macro with parameters from the kernel configurator. They are purely run on @@ -178,6 +178,87 @@ function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ return nothing end +# Kernel for prolonging two boundaries +function prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, + equations::AbstractEquations{2}) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(boundaries_u, 2) * size(boundaries_u, 3) && k <= size(boundaries_u, 4)) + j1 = div(j - 1, size(boundaries_u, 3)) + 1 + j2 = rem(j - 1, size(boundaries_u, 3)) + 1 + + element = neighbor_ids[k] + side = neighbor_sides[k] + orientation = orientations[k] + + u2 = size(u, 2) + + @inbounds begin + boundaries_u[1, j1, j2, k] = u[j1, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * u2, + element] * isequal(side, 1) # Set to 0 instead of NaN + boundaries_u[2, j1, j2, k] = u[j1, + isequal(orientation, 1) * 1 + isequal(orientation, 2) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * 1, + element] * (1 - isequal(side, 1)) # Set to 0 instead of NaN + end + end + + return nothing +end + +# Kernel for calculating boundary fluxes +function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions::NamedTuple, equations::AbstractEquations{2}, + surface_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(surface_flux_values, 2) && k <= length(boundary_arr)) + boundary = boundary_arr[k] + direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + + (indices_arr[3] <= boundary) + (indices_arr[4] <= boundary) + + neighbor = neighbor_ids[boundary] + side = neighbor_sides[boundary] + orientation = orientations[boundary] + + u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, j, boundary) + u_inner = isequal(side, 1) * u_ll + (1 - isequal(side, 1)) * u_rr + x = get_node_coords(node_coordinates, equations, j, boundary) + + # TODO: Optimize flow controls as they are not recommended on GPUs + if direction == 1 + boundary_flux_node = boundary_conditions[1](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 2 + boundary_flux_node = boundary_conditions[2](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 3 + boundary_flux_node = boundary_conditions[3](u_inner, orientation, + direction, x, t, surface_flux, equations) + else + boundary_flux_node = boundary_conditions[4](u_inner, orientation, + direction, x, t, surface_flux, equations) + end + + @inbounds begin + for ii in axes(surface_flux_values, 1) + surface_flux_values[ii, j, direction, neighbor] = boundary_flux_node === nothing ? # bad + surface_flux_values[ii, j, + direction, + neighbor] : + boundary_flux_node[ii] + end + end + end + + return nothing +end + # Kernel for calculating surface integrals function surface_integral_kernel!(du, factor_arr, surface_flux_values, equations::AbstractEquations{2}) @@ -256,14 +337,14 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms, size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 4)) flux_kernel = @cuda launch=false flux_kernel!(flux_arr1, flux_arr2, u, equations, flux) - flux_kernel(flux_arr1, flux_arr2, u, equations; configurator_2d(flux_kernel, size_arr)...) + flux_kernel(flux_arr1, flux_arr2, u, equations, flux; configurator_2d(flux_kernel, size_arr)...) size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^2, size(du, 4)) weak_form_kernel = @cuda launch=false weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2) weak_form_kernel(du, derivative_dhat, flux_arr1, flux_arr2; - configurator_3d(weak_form_kernel, size_arr)...,) + configurator_3d(weak_form_kernel, size_arr)...) return nothing end @@ -283,7 +364,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, u, equations, volume_flux) volume_flux_kernel(volume_flux_arr1, volume_flux_arr2, u, equations, volume_flux; - configurator_2d(volume_flux_kernel, size_arr)...,) + configurator_2d(volume_flux_kernel, size_arr)...) size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^2, size(du, 4)) @@ -291,7 +372,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: volume_flux_arr1, volume_flux_arr2) volume_integral_kernel(du, derivative_split, volume_flux_arr1, volume_flux_arr2; - configurator_3d(volume_integral_kernel, size_arr)...,) + configurator_3d(volume_integral_kernel, size_arr)...) return nothing end @@ -310,7 +391,7 @@ function cuda_prolong2interfaces!(u, mesh::TreeMesh{2}, equations, cache) orientations, equations) prolong_interfaces_kernel(interfaces_u, u, neighbor_ids, orientations, equations; - configurator_2d(prolong_interfaces_kernel, size_arr)...,) + configurator_2d(prolong_interfaces_kernel, size_arr)...) cache.interfaces.u = interfaces_u # copy back to host automatically @@ -334,7 +415,7 @@ function cuda_interface_flux!(mesh::TreeMesh{2}, nonconservative_terms::False, e orientations, equations, surface_flux) surface_flux_kernel(surface_flux_arr, interfaces_u, orientations, equations, surface_flux; - configurator_2d(surface_flux_kernel, size_arr)...,) + configurator_2d(surface_flux_kernel, size_arr)...) size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(interfaces_u, 3), size(interfaces_u, 4)) @@ -345,7 +426,7 @@ function cuda_interface_flux!(mesh::TreeMesh{2}, nonconservative_terms::False, e equations) interface_flux_kernel(surface_flux_values, surface_flux_arr, neighbor_ids, orientations, equations; - configurator_3d(interface_flux_kernel, size_arr)...,) + configurator_3d(interface_flux_kernel, size_arr)...) cache.elements.surface_flux_values = surface_flux_values # copy back to host automatically @@ -354,16 +435,87 @@ end # Dummy function for asserting boundaries function cuda_prolong2boundaries!(u, mesh::TreeMesh{2}, - boundary_condition::BoundaryConditionPeriodic, cache) + boundary_condition::BoundaryConditionPeriodic, equations, cache) @assert iszero(length(cache.boundaries.orientations)) end +# Pack kernels for prolonging solution to boundaries +function cuda_prolong2boundaries!(u, mesh::TreeMesh{2}, boundary_conditions::NamedTuple, equations, + cache) + neighbor_ids = CuArray{Int64}(cache.boundaries.neighbor_ids) + neighbor_sides = CuArray{Int64}(cache.boundaries.neighbor_sides) + orientations = CuArray{Int64}(cache.boundaries.orientations) + boundaries_u = CuArray{Float64}(cache.boundaries.u) + + size_arr = CuArray{Float64}(undef, size(boundaries_u, 2) * size(boundaries_u, 3), + size(boundaries_u, 4)) + + prolong_boundaries_kernel = @cuda launch=false prolong_boundaries_kernel!(boundaries_u, u, + neighbor_ids, + neighbor_sides, + orientations, + equations) + prolong_boundaries_kernel(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, + equations; + configurator_2d(prolong_boundaries_kernel, size_arr)...) + + cache.boundaries.u = boundaries_u # copy back to host automatically + + return nothing +end + # Dummy function for asserting boundary fluxes function cuda_boundary_flux!(t, mesh::TreeMesh{2}, boundary_condition::BoundaryConditionPeriodic, equations, dg::DGSEM, cache) @assert iszero(length(cache.boundaries.orientations)) end +# Pack kernels for calculating boundary fluxes +function cuda_boundary_flux!(t, mesh::TreeMesh{2}, boundary_conditions::NamedTuple, equations, + dg::DGSEM, cache) + surface_flux = dg.surface_integral.surface_flux + + n_boundaries_per_direction = CuArray{Int64}(cache.boundaries.n_boundaries_per_direction) + neighbor_ids = CuArray{Int64}(cache.boundaries.neighbor_ids) + neighbor_sides = CuArray{Int64}(cache.boundaries.neighbor_sides) + orientations = CuArray{Int64}(cache.boundaries.orientations) + boundaries_u = CuArray{Float64}(cache.boundaries.u) + node_coordinates = CuArray{Float64}(cache.boundaries.node_coordinates) + surface_flux_values = CuArray{Float64}(cache.elements.surface_flux_values) + + lasts = zero(n_boundaries_per_direction) + firsts = zero(n_boundaries_per_direction) + + last_first_indices_kernel = @cuda launch=false last_first_indices_kernel!(lasts, firsts, + n_boundaries_per_direction) + last_first_indices_kernel(lasts, firsts, n_boundaries_per_direction; + configurator_1d(last_first_indices_kernel, lasts)...) + + lasts, firsts = Array(lasts), Array(firsts) + boundary_arr = CuArray{Int64}(firsts[1]:lasts[4]) + indices_arr = CuArray{Int64}([firsts[1], firsts[2], firsts[3], firsts[4]]) + boundary_conditions_callable = replace_boundary_conditions(boundary_conditions) + + size_arr = CuArray{Float64}(undef, size(surface_flux_values, 2), length(boundary_arr)) + + boundary_flux_kernel = @cuda launch=false boundary_flux_kernel!(surface_flux_values, + boundaries_u, node_coordinates, + t, boundary_arr, indices_arr, + neighbor_ids, neighbor_sides, + orientations, + boundary_conditions_callable, + equations, + surface_flux) + boundary_flux_kernel(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions_callable, equations, surface_flux; + configurator_2d(boundary_flux_kernel, size_arr)...) + + cache.elements.surface_flux_values = surface_flux_values # copy back to host automatically + + return nothing +end + # Pack kernels for calculating surface integrals function cuda_surface_integral!(du, mesh::TreeMesh{2}, equations, dg::DGSEM, cache) # FIXME: `surface_integral::SurfaceIntegralWeakForm` @@ -379,7 +531,7 @@ function cuda_surface_integral!(du, mesh::TreeMesh{2}, equations, dg::DGSEM, cac surface_flux_values, equations) surface_integral_kernel(du, factor_arr, surface_flux_values, equations; - configurator_3d(surface_integral_kernel, size_arr)...,) + configurator_3d(surface_integral_kernel, size_arr)...) return nothing end @@ -403,14 +555,14 @@ end # Pack kernels for calculating source terms function cuda_sources!(du, u, t, source_terms, equations::AbstractEquations{2}, cache) - node_coordinates = CuArray{Float32}(cache.elements.node_coordinates) + node_coordinates = CuArray{Float64}(cache.elements.node_coordinates) - size_arr = CuArray{Float32}(undef, size(u, 2)^2, size(u, 4)) + size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 4)) source_terms_kernel = @cuda launch=false source_terms_kernel!(du, u, node_coordinates, t, equations, source_terms) source_terms_kernel(du, u, node_coordinates, t, equations, source_terms; - configurator_2d(source_terms_kernel, size_arr)...,) + configurator_2d(source_terms_kernel, size_arr)...) return nothing end @@ -429,7 +581,7 @@ function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{2}, equations, initial_condit cuda_interface_flux!(mesh, have_nonconservative_terms(equations), equations, dg, cache) - cuda_prolong2boundaries!(u, mesh, boundary_conditions, cache) + cuda_prolong2boundaries!(u, mesh, boundary_conditions, equations, cache) cuda_boundary_flux!(t, mesh, boundary_conditions, equations, dg, cache) diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 0b5b7a09..a2d309dc 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -1,6 +1,4 @@ -# Everything related to a DG semidiscretization in 3D - -# TODO: Check format standards +# Everything related to a DG semidiscretization in 3D. # Functions that end with `_kernel` are CUDA kernels that are going to be launched by # the @cuda macro with parameters from the kernel configurator. They are purely run on @@ -205,6 +203,102 @@ function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ return nothing end +# Kernel for prolonging two boundaries +function prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, + equations::AbstractEquations{3}) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(boundaries_u, 2) * size(boundaries_u, 3)^2 && k <= size(boundaries_u, 5)) + j1 = div(j - 1, size(boundaries_u, 3)^2) + 1 + j2 = div(rem(j - 1, size(boundaries_u, 3)^2), size(boundaries_u, 3)) + 1 + j3 = rem(rem(j - 1, size(boundaries_u, 3)^2), size(boundaries_u, 3)) + 1 + + element = neighbor_ids[k] + side = neighbor_sides[k] + orientation = orientations[k] + + u2 = size(u, 2) + + @inbounds begin + boundaries_u[1, j1, j2, j3, k] = u[j1, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j3, + isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3) * u2, + element] * isequal(side, 1) # Set to 0 instead of NaN + boundaries_u[2, j1, j2, j3, k] = u[j1, + isequal(orientation, 1) * 1 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * 1 + isequal(orientation, 3) * j3, + isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3) * 1, + element] * (1 - isequal(side, 1)) # Set to 0 instead of NaN + end + end + + return nothing +end + +# Kernel for calculating boundary fluxes +function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions::NamedTuple, equations::AbstractEquations{3}, + surface_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(surface_flux_values, 2)^2 && k <= length(boundary_arr)) + j1 = div(j - 1, size(surface_flux_values, 2)) + 1 + j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + + boundary = boundary_arr[k] + direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + + (indices_arr[3] <= boundary) + (indices_arr[4] <= boundary) + + (indices_arr[5] <= boundary) + (indices_arr[6] <= boundary) + + neighbor = neighbor_ids[boundary] + side = neighbor_sides[boundary] + orientation = orientations[boundary] + + u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, j1, j2, boundary) + u_inner = isequal(side, 1) * u_ll + (1 - isequal(side, 1)) * u_rr + x = get_node_coords(node_coordinates, equations, j1, j2, boundary) + + # TODO: Optimize flow controls as they are not recommended on GPUs + if direction == 1 + boundary_flux_node = boundary_conditions[1](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 2 + boundary_flux_node = boundary_conditions[2](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 3 + boundary_flux_node = boundary_conditions[3](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 4 + boundary_flux_node = boundary_conditions[4](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 5 + boundary_flux_node = boundary_conditions[5](u_inner, orientation, + direction, x, t, surface_flux, equations) + else + boundary_flux_node = boundary_conditions[6](u_inner, orientation, + direction, x, t, surface_flux, equations) + end + + @inbounds begin + for ii in axes(surface_flux_values, 1) + surface_flux_values[ii, j1, j2, direction, neighbor] = boundary_flux_node === + nothing ? # bad + surface_flux_values[ii, j1, + j2, + direction, + neighbor] : + boundary_flux_node[ii] + end + end + end + + return nothing +end + # Kernel for calculating surface integrals function surface_integral_kernel!(du, factor_arr, surface_flux_values, equations::AbstractEquations{3}) @@ -293,15 +387,15 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms, flux_kernel = @cuda launch=false flux_kernel!(flux_arr1, flux_arr2, flux_arr3, u, equations, flux) - flux_kernel(flux_arr1, flux_arr2, flux_arr3, u, equations; - configurator_2d(flux_kernel, size_arr)...,) + flux_kernel(flux_arr1, flux_arr2, flux_arr3, u, equations, flux; + configurator_2d(flux_kernel, size_arr)...) size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) weak_form_kernel = @cuda launch=false weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3) weak_form_kernel(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3; - configurator_3d(weak_form_kernel, size_arr)...,) + configurator_3d(weak_form_kernel, size_arr)...) return nothing end @@ -311,24 +405,24 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: volume_integral::VolumeIntegralFluxDifferencing, dg::DGSEM) volume_flux = volume_integral.volume_flux - derivative_split = CuArray{Float32}(dg.basis.derivative_split) - volume_flux_arr1 = CuArray{Float32}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), + derivative_split = CuArray{Float64}(dg.basis.derivative_split) + volume_flux_arr1 = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), size(u, 2), size(u, 5)) - volume_flux_arr2 = CuArray{Float32}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), + volume_flux_arr2 = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), size(u, 2), size(u, 5)) - volume_flux_arr3 = CuArray{Float32}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), + volume_flux_arr3 = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), size(u, 2), size(u, 5)) - size_arr = CuArray{Float32}(undef, size(u, 2)^4, size(u, 5)) + size_arr = CuArray{Float64}(undef, size(u, 2)^4, size(u, 5)) volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, u, equations, volume_flux) volume_flux_kernel(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, u, equations, volume_flux; - configurator_2d(volume_flux_kernel, size_arr)...,) + configurator_2d(volume_flux_kernel, size_arr)...) - size_arr = CuArray{Float32}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) + size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, volume_flux_arr1, @@ -336,7 +430,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: volume_flux_arr3) volume_integral_kernel(du, derivative_split, volume_flux_arr1, volume_flux_arr2, volume_flux_arr3; - configurator_3d(volume_integral_kernel, size_arr)...,) + configurator_3d(volume_integral_kernel, size_arr)...) return nothing end @@ -355,7 +449,7 @@ function cuda_prolong2interfaces!(u, mesh::TreeMesh{3}, equations, cache) orientations, equations) prolong_interfaces_kernel(interfaces_u, u, neighbor_ids, orientations, equations; - configurator_2d(prolong_interfaces_kernel, size_arr)...,) + configurator_2d(prolong_interfaces_kernel, size_arr)...) cache.interfaces.u = interfaces_u # copy back to host automatically @@ -380,7 +474,7 @@ function cuda_interface_flux!(mesh::TreeMesh{3}, nonconservative_terms::False, e orientations, equations, surface_flux) surface_flux_kernel(surface_flux_arr, interfaces_u, orientations, equations, surface_flux; - configurator_3d(surface_flux_kernel, size_arr)...,) + configurator_3d(surface_flux_kernel, size_arr)...) size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(interfaces_u, 3)^2, size(interfaces_u, 5)) @@ -391,7 +485,7 @@ function cuda_interface_flux!(mesh::TreeMesh{3}, nonconservative_terms::False, e equations) interface_flux_kernel(surface_flux_values, surface_flux_arr, neighbor_ids, orientations, equations; - configurator_3d(interface_flux_kernel, size_arr)...,) + configurator_3d(interface_flux_kernel, size_arr)...) cache.elements.surface_flux_values = surface_flux_values # copy back to host automatically @@ -400,16 +494,86 @@ end # Dummy function for asserting boundaries function cuda_prolong2boundaries!(u, mesh::TreeMesh{3}, - boundary_condition::BoundaryConditionPeriodic, cache) + boundary_condition::BoundaryConditionPeriodic, equations, cache) @assert iszero(length(cache.boundaries.orientations)) end +# Pack kernels for prolonging solution to boundaries +function cuda_prolong2boundaries!(u, mesh::TreeMesh{3}, boundary_conditions::NamedTuple, equations, + cache) + neighbor_ids = CuArray{Int64}(cache.boundaries.neighbor_ids) + neighbor_sides = CuArray{Int64}(cache.boundaries.neighbor_sides) + orientations = CuArray{Int64}(cache.boundaries.orientations) + boundaries_u = CuArray{Float64}(cache.boundaries.u) + + size_arr = CuArray{Float64}(undef, size(boundaries_u, 2) * size(boundaries_u, 3)^2, + size(boundaries_u, 5)) + + prolong_boundaries_kernel = @cuda launch=false prolong_boundaries_kernel!(boundaries_u, u, + neighbor_ids, + neighbor_sides, + orientations, + equations) + prolong_boundaries_kernel(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, + equations; + configurator_2d(prolong_boundaries_kernel, size_arr)...) + + cache.boundaries.u = boundaries_u # copy back to host automatically + + return nothing +end + # Dummy function for asserting boundary fluxes function cuda_boundary_flux!(t, mesh::TreeMesh{3}, boundary_condition::BoundaryConditionPeriodic, equations, dg::DGSEM, cache) @assert iszero(length(cache.boundaries.orientations)) end +# Pack kernels for calculating boundary fluxes +function cuda_boundary_flux!(t, mesh::TreeMesh{3}, boundary_conditions::NamedTuple, equations, + dg::DGSEM, cache) + surface_flux = dg.surface_integral.surface_flux + + n_boundaries_per_direction = CuArray{Int64}(cache.boundaries.n_boundaries_per_direction) + neighbor_ids = CuArray{Int64}(cache.boundaries.neighbor_ids) + neighbor_sides = CuArray{Int64}(cache.boundaries.neighbor_sides) + orientations = CuArray{Int64}(cache.boundaries.orientations) + boundaries_u = CuArray{Float64}(cache.boundaries.u) + node_coordinates = CuArray{Float64}(cache.boundaries.node_coordinates) + surface_flux_values = CuArray{Float64}(cache.elements.surface_flux_values) + + lasts = zero(n_boundaries_per_direction) + firsts = zero(n_boundaries_per_direction) + + last_first_indices_kernel = @cuda launch=false last_first_indices_kernel!(lasts, firsts, + n_boundaries_per_direction) + last_first_indices_kernel(lasts, firsts, n_boundaries_per_direction; + configurator_1d(last_first_indices_kernel, lasts)...) + + lasts, firsts = Array(lasts), Array(firsts) + boundary_arr = CuArray{Int64}(firsts[1]:lasts[6]) + indices_arr = CuArray{Int64}([firsts[1], firsts[2], firsts[3], firsts[4], firsts[5], firsts[6]]) + boundary_conditions_callable = replace_boundary_conditions(boundary_conditions) + + size_arr = CuArray{Float64}(undef, size(surface_flux_values, 2)^2, length(boundary_arr)) + + boundary_flux_kernel = @cuda launch=false boundary_flux_kernel!(surface_flux_values, + boundaries_u, node_coordinates, + t, boundary_arr, indices_arr, + neighbor_ids, neighbor_sides, + orientations, + boundary_conditions_callable, + equations, surface_flux) + boundary_flux_kernel(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions_callable, equations, surface_flux; + configurator_2d(boundary_flux_kernel, size_arr)...) + + cache.elements.surface_flux_values = surface_flux_values # copy back to host automatically + + return nothing +end + # Pack kernels for calculating surface integrals function cuda_surface_integral!(du, mesh::TreeMesh{3}, equations, dg::DGSEM, cache) # FIXME: Check `surface_integral` @@ -425,7 +589,7 @@ function cuda_surface_integral!(du, mesh::TreeMesh{3}, equations, dg::DGSEM, cac surface_flux_values, equations) surface_integral_kernel(du, factor_arr, surface_flux_values, equations; - configurator_3d(surface_integral_kernel, size_arr)...,) + configurator_3d(surface_integral_kernel, size_arr)...) return nothing end @@ -449,14 +613,14 @@ end # Pack kernels for calculating source terms function cuda_sources!(du, u, t, source_terms, equations::AbstractEquations{3}, cache) - node_coordinates = CuArray{Float32}(cache.elements.node_coordinates) + node_coordinates = CuArray{Float64}(cache.elements.node_coordinates) - size_arr = CuArray{Float32}(undef, size(u, 2)^3, size(u, 5)) + size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 5)) source_terms_kernel = @cuda launch=false source_terms_kernel!(du, u, node_coordinates, t, equations, source_terms) source_terms_kernel(du, u, node_coordinates, t, equations, source_terms; - configurator_2d(source_terms_kernel, size_arr)...,) + configurator_2d(source_terms_kernel, size_arr)...) return nothing end @@ -475,7 +639,7 @@ function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{3}, equations, initial_condit cuda_interface_flux!(mesh, have_nonconservative_terms(equations), equations, dg, cache) - cuda_prolong2boundaries!(u, mesh, boundary_conditions, cache) + cuda_prolong2boundaries!(u, mesh, boundary_conditions, equations, cache) cuda_boundary_flux!(t, mesh, boundary_conditions, equations, dg, cache) diff --git a/test/test_script.jl b/test/test_script.jl index efa1584c..284fe46e 100644 --- a/test/test_script.jl +++ b/test/test_script.jl @@ -2,29 +2,37 @@ using Trixi, TrixiGPU using OrdinaryDiffEq using CUDA -equations = CompressibleEulerEquations3D(1.4) - -initial_condition = initial_condition_weak_blast_wave - -volume_flux = flux_ranocha -solver = DGSEM(; - polydeg = 3, - surface_flux = flux_ranocha, - volume_integral = VolumeIntegralFluxDifferencing(volume_flux),) - -coordinates_min = (-2.0, -2.0, -2.0) -coordinates_max = (2.0, 2.0, 2.0) -mesh = TreeMesh(coordinates_min, coordinates_max; initial_refinement_level = 3, - n_cells_max = 100_000) - -semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) - -@unpack mesh, -equations, initial_condition, boundary_conditions, source_terms, solver, -cache = semi +equations = HyperbolicDiffusionEquations3D() + +initial_condition = initial_condition_poisson_nonperiodic +boundary_conditions = (x_neg = boundary_condition_poisson_nonperiodic, + x_pos = boundary_condition_poisson_nonperiodic, + y_neg = boundary_condition_periodic, + y_pos = boundary_condition_periodic, + z_neg = boundary_condition_periodic, + z_pos = boundary_condition_periodic) + +solver = DGSEM(polydeg = 4, surface_flux = flux_lax_friedrichs) + +coordinates_min = (0.0, 0.0, 0.0) +coordinates_max = (1.0, 1.0, 1.0) +mesh = TreeMesh(coordinates_min, + coordinates_max, + initial_refinement_level = 2, + n_cells_max = 30_000, + periodicity = (false, true, true)) + +semi = SemidiscretizationHyperbolic(mesh, + equations, + initial_condition, + solver, + source_terms = source_terms_poisson_nonperiodic, + boundary_conditions = boundary_conditions) + +(; mesh, equations, initial_condition, boundary_conditions, source_terms, solver, cache) = semi t = 0.0 -tspan = (0.0, 0.4) +tspan = (0.0, 5.0) ode = semidiscretize(semi, tspan) u_ode = copy(ode.u0) @@ -33,17 +41,13 @@ u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) du, u = TrixiGPU.copy_to_device!(du, u) -TrixiGPU.cuda_volume_integral!(du, - u, - mesh, - Trixi.have_nonconservative_terms(equations), - equations, - solver.volume_integral, - solver) +TrixiGPU.cuda_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), equations, + solver.volume_integral, solver) TrixiGPU.cuda_prolong2interfaces!(u, mesh, equations, cache) TrixiGPU.cuda_interface_flux!(mesh, Trixi.have_nonconservative_terms(equations), equations, solver, cache) -TrixiGPU.cuda_prolong2boundaries!(u, mesh, boundary_conditions, cache) +TrixiGPU.cuda_prolong2boundaries!(u, mesh, boundary_conditions, equations, cache) + TrixiGPU.cuda_boundary_flux!(t, mesh, boundary_conditions, equations, solver, cache) TrixiGPU.cuda_surface_integral!(du, mesh, equations, solver, cache) TrixiGPU.cuda_jacobian!(du, mesh, equations, cache)