From 4f7d797f5ec14f5c81b23c1cc7510237a9de1f81 Mon Sep 17 00:00:00 2001 From: mpudig Date: Fri, 26 Jul 2024 13:33:48 -0400 Subject: [PATCH] First attempt at writing kernel and workspace following Greg suggestions on issue #112. --- src/multilayerqg.jl | 92 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 83 insertions(+), 9 deletions(-) diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl index 4e35d679..5a10f890 100644 --- a/src/multilayerqg.jl +++ b/src/multilayerqg.jl @@ -18,11 +18,13 @@ using LinearAlgebra, StaticArrays, Reexport, - DocStringExtensions + DocStringExtensions, + KernelAbstractions @reexport using FourierFlows using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft +using KernelAbstractions.Extras.LoopInfo: @unroll nothingfunction(args...) = nothing @@ -533,16 +535,52 @@ Compute the inverse Fourier transform of `varh` and store it in `var`. """ invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh) +""" + @kernel function pvfromstreamfunction_kernel!(qh, ψh, params) + +Kernel for GPU acceleration of PV from streamfunction calculation, i.e., obtaining the Fourier +transform of the PV from the streamfunction `ψh` in each layer using `qh = params.S * ψh`. +""" +@kernel function pvfromstreamfunction_kernel!(qh, ψh, params) + i, j = @index(Global, NTuple) + nlayers, S = params.nlayers, params.S + + @unroll for k = 1:nlayers + + @inbounds qh[i, j, k] = 0 + + @unroll for m = 1:nlayers + @inbounds qh[i, j, k] += S[i, j][k, m] * ψh[i, j, m] + end + + end +end + """ pvfromstreamfunction!(qh, ψh, params, grid) Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using -`qh = params.S * ψh`. +`qh = params.S * ψh`. We use a work layout over which the above-defined kernel is launched. """ function pvfromstreamfunction!(qh, ψh, params, grid) - for j=1:grid.nl, i=1:grid.nkr - CUDA.@allowscalar @views qh[i, j, :] .= params.S[i, j] * ψh[i, j, :] - end + # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs + # different behavior when either nkl or nl are less than 16 + workgroup = 16, 16 + + # The worksize determines how many times the kernel is run + worksize = grid.nkr, grid.nl + + # This (and its usage below) will ensure the kernel is not run _before_ the data in ψh is available + barrier = Event(grid.device) + + # Creates a loop over the specified worksize, using workgroup to organize the computation + loop_kernel! = pvfromstreamfunction_kernel!(grid.device, workgroup, worksize) + + # Launch the kernel + event = loop_kernel!(qh, ψh, params, dependencies = barrier) + + # This will ensure that no other operations occur until the kernel has finished + wait(event) return nothing end @@ -587,16 +625,52 @@ function pvfromstreamfunction!(qh, ψh, params::TwoLayerParams, grid) return nothing end +""" + @kernel function streamfunctionfrompv_kernel!(ψh, qh, params) + +Kernel for GPU acceleration of streamfunction from PV calculation, i.e., invert the PV to obtain +the Fourier transform of the streamfunction `ψh` in each layer from `qh` using `ψh = params.S⁻¹ qh`. +""" +@kernel function streamfunctionfrompv_kernel!(ψh, qh, params) + i, j = @index(Global, NTuple) + nlayers, S = params.nlayers, params.S + + @unroll for k = 1:nlayers + + @inbounds ψh[i, j, k] = 0 + + @unroll for m = 1:nlayers + @inbounds ψh[i, j, k] += S⁻¹[i, j][k, m] * qh[i, j, m] + end + + end +end + """ streamfunctionfrompv!(ψh, qh, params, grid) Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from -`qh` using `ψh = params.S⁻¹ qh`. +`qh` using `ψh = params.S⁻¹ qh`. We use a work layout over which the above-defined kernel is launched. """ function streamfunctionfrompv!(ψh, qh, params, grid) - for j=1:grid.nl, i=1:grid.nkr - CUDA.@allowscalar @views ψh[i, j, :] .= params.S⁻¹[i, j] * qh[i, j, :] - end + # Larger workgroups are generally more efficient. For more generality, we could put an if statement that incurs + # different behavior when either nkl or nl are less than 16 + workgroup = 16, 16 + + # The worksize determines how many times the kernel is run + worksize = grid.nkr, grid.nl + + # This (and its usage below) will ensure the kernel is not run _before_ the data in qh is available + barrier = Event(grid.device) + + # Creates a loop over the specified worksize, using workgroup to organize the computation + loop_kernel! = streamfunctionfrompv_kernel!(grid.device, workgroup, worksize) + + # Launch the kernel + event = loop_kernel!(ψh, qh, params, dependencies = barrier) + + # This will ensure that no other operations occur until the kernel has finished + wait(event) return nothing end