From ce08ca3dfb433c28b106f38daf3f7f3bbce650f1 Mon Sep 17 00:00:00 2001 From: "White, Trey" Date: Tue, 4 Feb 2025 12:44:15 -0500 Subject: [PATCH 1/3] Merge branch trey/caar/frontier_gnu from trey-ornl/scream. --- .../machineFiles/frontier-bfb-serial.cmake | 56 ++ .../cmake/machineFiles/frontier-bfb.cmake | 9 +- .../homme/cmake/machineFiles/pm-cpu-bfb.cmake | 48 ++ .../cxx/HyperviscosityFunctorImpl.hpp | 2 +- .../homme/src/share/cxx/SphereOperators.hpp | 526 +++++++++++++++ .../src/share/cxx/utilities/ViewUtils.hpp | 36 + .../homme/src/theta-l_kokkos/CMakeLists.txt | 1 + .../theta-l_kokkos/cxx/CaarFunctorImpl.cpp | 636 ++++++++++++++++++ .../theta-l_kokkos/cxx/CaarFunctorImpl.hpp | 56 +- .../theta-l_kokkos/cxx/EquationOfState.hpp | 6 + .../src/theta-l_kokkos/cxx/LimiterFunctor.hpp | 2 +- 11 files changed, 1353 insertions(+), 25 deletions(-) create mode 100644 components/homme/cmake/machineFiles/frontier-bfb-serial.cmake create mode 100644 components/homme/cmake/machineFiles/pm-cpu-bfb.cmake create mode 100644 components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp diff --git a/components/homme/cmake/machineFiles/frontier-bfb-serial.cmake b/components/homme/cmake/machineFiles/frontier-bfb-serial.cmake new file mode 100644 index 000000000000..e2735c80b163 --- /dev/null +++ b/components/homme/cmake/machineFiles/frontier-bfb-serial.cmake @@ -0,0 +1,56 @@ +SET (HOMME_MACHINE "frontier-bfb-serial" CACHE STRING "") + +#SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") + +SET (NETCDF_DIR $ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX} CACHE FILEPATH "") +SET (HDF5_DIR $ENV{CRAY_HDF5_PARALLEL_PREFIX} CACHE FILEPATH "") +SET (CPRNC_DIR /ccs/proj/cli115/software/frontier/cprnc CACHE FILEPATH "") + +SET(BUILD_HOMME_WITHOUT_PIOLIBRARY TRUE CACHE BOOL "") + +SET(HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "") + +SET(WITH_PNETCDF FALSE CACHE FILEPATH "") + +SET(USE_QUEUING FALSE CACHE BOOL "") + +SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") +SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") + +SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") + +SET(USE_TRILINOS OFF CACHE BOOL "") + +#SET(HIP_BUILD TRUE CACHE BOOL "") + +SET(Kokkos_ENABLE_SERIAL ON CACHE BOOL "") +#SET(Kokkos_ENABLE_DEBUG OFF CACHE BOOL "") +#SET(Kokkos_ARCH_VEGA90A ON CACHE BOOL "") +SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") +#SET(Kokkos_ENABLE_HIP ON CACHE BOOL "") +SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") + +SET(CMAKE_C_COMPILER "cc" CACHE STRING "") +SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "") +#SET(CMAKE_Fortran_FLAGS "--gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "CC" CACHE STRING "") + +SET(Extrae_LIBRARY "-I$ENV{CRAY_MPICH_DIR}/include -L$ENV{CRAY_MPICH_DIR}/lib -lmpi -lmpifort" CACHE STRING "") + +#SET(ADD_Fortran_FLAGS "--gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround -h flex_mp=intolerant -h thread0 -G2 ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_Fortran_FLAGS "-g -O0 ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_C_FLAGS "-g -O0 ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_CXX_FLAGS "-g -std=c++17 -O0 ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_LINKER_FLAGS "-g -O0 ${Extrae_LIBRARY}" CACHE STRING "") + + +set (ENABLE_OPENMP OFF CACHE BOOL "") +set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") +set (ENABLE_HORIZ_OPENMP OFF CACHE BOOL "") + +set (HOMME_TESTING_PROFILE "short" CACHE STRING "") +SET (BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") + +set (USE_NUM_PROCS 1 CACHE STRING "") + +#SET (USE_MPI_OPTIONS "-c7 --gpu-bind=closest --gpus-per-task=1" CACHE FILEPATH "") diff --git a/components/homme/cmake/machineFiles/frontier-bfb.cmake b/components/homme/cmake/machineFiles/frontier-bfb.cmake index 15f97742dd7b..46bf1a0b424d 100644 --- a/components/homme/cmake/machineFiles/frontier-bfb.cmake +++ b/components/homme/cmake/machineFiles/frontier-bfb.cmake @@ -32,14 +32,15 @@ SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") SET(CMAKE_C_COMPILER "cc" CACHE STRING "") SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "") -SET(CMAKE_Fortran_FLAGS "--gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround" CACHE STRING "") +#SET(CMAKE_Fortran_FLAGS "--gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround" CACHE STRING "") SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "") -SET(Extrae_LIBRARY "-I$ENV{CRAY_MPICH_DIR}/include -L$ENV{CRAY_MPICH_DIR}/lib -lmpi $ENV{PE_MPICH_GTL_DIR_amd_gfx90a} $ENV{PE_MPICH_GTL_LIBS_amd_gfx90a}" CACHE STRING "") +SET(Extrae_LIBRARY "-I$ENV{CRAY_MPICH_DIR}/include -L$ENV{CRAY_MPICH_DIR}/lib -lmpi $ENV{PE_MPICH_GTL_DIR_amd_gfx90a} $ENV{PE_MPICH_GTL_LIBS_amd_gfx90a} -lmpifort" CACHE STRING "") -SET(ADD_Fortran_FLAGS "--gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround -h flex_mp=intolerant -h thread0 -G2 ${Extrae_LIBRARY}" CACHE STRING "") +#SET(ADD_Fortran_FLAGS "--gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround -h flex_mp=intolerant -h thread0 -G2 ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_Fortran_FLAGS "-h flex_mp=intolerant -h thread0 -G2 ${Extrae_LIBRARY}" CACHE STRING "") SET(ADD_C_FLAGS "-g -O -ffp-model=strict ${Extrae_LIBRARY}" CACHE STRING "") -SET(ADD_CXX_FLAGS "-g -std=c++14 -O -ffp-model=strict -munsafe-fp-atomics --offload-arch=gfx90a -fno-gpu-rdc -Wno-unused-command-line-argument -Wno-unsupported-floating-point-opt -Wno-#pragma-messages ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_CXX_FLAGS "-g -std=c++17 -O -ffp-model=strict -munsafe-fp-atomics --offload-arch=gfx90a -fno-gpu-rdc -Wno-unused-command-line-argument -Wno-unsupported-floating-point-opt -Wno-#pragma-messages ${Extrae_LIBRARY}" CACHE STRING "") SET(ADD_LINKER_FLAGS "-g -O ${Extrae_LIBRARY}" CACHE STRING "") diff --git a/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake b/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake new file mode 100644 index 000000000000..131320b4bce7 --- /dev/null +++ b/components/homme/cmake/machineFiles/pm-cpu-bfb.cmake @@ -0,0 +1,48 @@ +# CMake initial cache file +# +# This machine file works with PrgEnv-intel +# +# +# Perlmutter generic MPI enabled compiler wrappers: +SET (CMAKE_Fortran_COMPILER ftn CACHE FILEPATH "") +SET (CMAKE_C_COMPILER cc CACHE FILEPATH "") +SET (CMAKE_CXX_COMPILER CC CACHE FILEPATH "") + + +# Set kokkos arch, to get correct avx flags +SET (Kokkos_ARCH_ZEN2 ON CACHE BOOL "") + +SET (WITH_PNETCDF FALSE CACHE FILEPATH "") + +EXECUTE_PROCESS(COMMAND nf-config --prefix + RESULT_VARIABLE NFCONFIG_RESULT + OUTPUT_VARIABLE NFCONFIG_OUTPUT + ERROR_VARIABLE NFCONFIG_ERROR + OUTPUT_STRIP_TRAILING_WHITESPACE +) +SET (NetCDF_Fortran_PATH "${NFCONFIG_OUTPUT}" CACHE STRING "") + +EXECUTE_PROCESS(COMMAND nc-config --prefix + RESULT_VARIABLE NCCONFIG_RESULT + OUTPUT_VARIABLE NCCONFIG_OUTPUT + ERROR_VARIABLE NCCONFIG_ERROR + OUTPUT_STRIP_TRAILING_WHITESPACE +) +SET (NetCDF_C_PATH "${NCCONFIG_OUTPUT}" CACHE STRING "") + +SET (USE_QUEUING FALSE CACHE BOOL "") +# for standalone HOMME builds: +SET(CPRNC_DIR /global/cfs/cdirs/e3sm/tools/cprnc CACHE FILEPATH "") + +SET (HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "") + #turn on preqxx target and thus strict fpmodel for F vs CXX comparison + SET (ADD_Fortran_FLAGS "-traceback -fp-model strict -qopenmp -O1" CACHE STRING "") + SET (ADD_C_FLAGS "-traceback -fp-model strict -qopenmp -O1" CACHE STRING "") + SET (ADD_CXX_FLAGS "-traceback -fp-model strict -qopenmp -O1" CACHE STRING "") + SET (BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") + SET (HOMMEXX_BFB_TESTING TRUE CACHE BOOL "") + SET (HOMME_TESTING_PROFILE "short" CACHE STRING "") + SET (BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") + +SET(USE_MPIEXEC "srun" CACHE STRING "") +SET(USE_MPI_OPTIONS "-K --cpu_bind=cores" CACHE STRING "") diff --git a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp index 688007e27f24..4243154ca033 100644 --- a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp +++ b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp @@ -297,7 +297,7 @@ class HyperviscosityFunctorImpl SphereOperators m_sphere_ops; // Policies -#ifndef NDEBUG +#if defined(KOKKOS__ENABLE_CUDA) && !defined(NDEBUG) template using TeamPolicyType = Kokkos::TeamPolicy,Tag>; #else diff --git a/components/homme/src/share/cxx/SphereOperators.hpp b/components/homme/src/share/cxx/SphereOperators.hpp index c227d97ea708..57f711fd96f1 100644 --- a/components/homme/src/share/cxx/SphereOperators.hpp +++ b/components/homme/src/share/cxx/SphereOperators.hpp @@ -1176,6 +1176,532 @@ class SphereOperators Real m_scale_factor_inv, m_laplacian_rigid_factor; }; +static constexpr int NPNP = NP * NP; + +#ifdef KOKKOS_ENABLE_CUDA +using TeamPolicy = Kokkos::TeamPolicy >; +#else +using TeamPolicy = Kokkos::TeamPolicy; +#endif + +#ifndef WARP_SIZE + +#if defined(KOKKOS_ENABLE_HIP) +#define WARP_SIZE warpSize +#elif defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) +#define WARP_SIZE 32 +#else +#define WARP_SIZE 1 +#endif + +#endif + +#if (WARP_SIZE == 1) + +#define SPHERE_BLOCK_START3(B,X,Y,Z) \ + Scalar X; Scalar sbo##X[NP][NP][NUM_LEV]; \ + Scalar Y; Scalar sbo##Y[NP][NP][NUM_LEV]; \ + Scalar Z; Scalar sbo##Z[NP][NP][NUM_LEV]; \ + for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \ + B.update(ix, iy); \ + for (int iz = 0; iz < NUM_LEV; iz++) { \ + B.z = iz; + +#define SPHERE_BLOCK_START0(B) \ + for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \ + B.update(ix,iy); \ + for (int iz = 0; iz < NUM_LEV; iz++) { \ + B.z = iz; + +#define SPHERE_BLOCK_MIDDLE3(B,X,Y,Z) \ + sbo##X[B.x][B.y][B.z] = X; \ + sbo##Y[B.x][B.y][B.z] = Y; \ + sbo##Z[B.x][B.y][B.z] = Z; \ + }\ + } \ + for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \ + B.update(ix,iy); \ + for (int iz = 0; iz < NUM_LEV; iz++) { \ + B.z = iz; \ + X = sbo##X[B.x][B.y][B.z]; \ + Y = sbo##Y[B.x][B.y][B.z]; \ + Z = sbo##Z[B.x][B.y][B.z]; + +#define SPHERE_BLOCK_MIDDLE0(B) \ + } \ + } \ + for (int ix = 0; ix < NP; ix++) for(int iy = 0; iy < NP; iy++) { \ + B.update(ix,iy); \ + for (int iz = 0; iz < NUM_LEV; iz++) { \ + B.z = iz; + +#define SPHERE_BLOCK_END() } } + +#else + + static_assert(VECTOR_SIZE == 1, "VECTOR_SIZE != 1"); + +#define SPHERE_BLOCK_START3(B,X,Y,Z) \ + if (B.z >= NUM_LEV) return; \ + Scalar X, Y, Z; +#define SPHERE_BLOCK_START0(B) if (B.z >= NUM_LEV) return; + +#define SPHERE_BLOCK_MIDDLE3(B,X,Y,Z) B.t.team_barrier(); +#define SPHERE_BLOCK_MIDDLE0(B) B.t.team_barrier(); + +#define SPHERE_BLOCK_END() + +static constexpr int SPHERE_BLOCK_LEV = WARP_SIZE; +static constexpr int SPHERE_BLOCK = NPNP * SPHERE_BLOCK_LEV; +static constexpr int SPHERE_BLOCKS_PER_COL = (NUM_PHYSICAL_LEV - 1) / SPHERE_BLOCK_LEV + 1; + +#endif + +namespace SphereOuter { +#if WARP_SIZE == 1 + using Team = TeamPolicy::member_type; + template + void parallel_for(const int num_elems, F f) { + Kokkos::parallel_for( + __PRETTY_FUNCTION__, + TeamPolicy(num_elems, 1), + f); + } +#else + using Team = int; + template + void parallel_for(const int num_elems, F f) { + f(num_elems); + } +#endif +} + +struct SphereGlobal { + + const ExecViewManaged d; + const ExecViewManaged dinv; + const ExecViewManaged dvv; + const ExecViewManaged metdet; + const Real scale_factor_inv; + + SphereGlobal(const SphereOperators &op): + d(op.m_d), + dinv(op.m_dinv), + dvv(op.dvv), + metdet(op.m_metdet), + scale_factor_inv(op.m_scale_factor_inv) + {} +}; + +struct SphereBlockOps; + +#if (WARP_SIZE == 1) + +struct SphereBlockScratch { + const SphereBlockOps &b; + Scalar v[NP][NP][NUM_LEV]; + KOKKOS_INLINE_FUNCTION SphereBlockScratch(const SphereBlockOps &); + KOKKOS_INLINE_FUNCTION const Scalar &sv(int x, int y) const; + KOKKOS_INLINE_FUNCTION Scalar &sv(int x, int y); +}; + +#else + +using SphereBlockScratchView = Kokkos::View< + Scalar[SPHERE_BLOCK_LEV][NP][NP], + ExecSpace::scratch_memory_space, + Kokkos::MemoryTraits + >; + +using SphereBlockScratchSubview = Kokkos::Subview< + SphereBlockScratchView, int, + std::remove_const_t, + std::remove_const_t + >; + +struct SphereBlockScratch { + SphereBlockScratchView v; + SphereBlockScratchSubview sv; + KOKKOS_INLINE_FUNCTION SphereBlockScratch(const SphereBlockOps &b); +}; + +#endif + +struct SphereBlockOps { + using Team = TeamPolicy::member_type; + + const SphereGlobal &g; + const Team &t; + Real scale_factor_inv; + Real metdet; + Real rrdmd; + Real dinv[2][2]; + Real dvvx[NP]; + Real dvvy[NP]; + int e,x,y,z; + + KOKKOS_INLINE_FUNCTION SphereBlockOps(const SphereGlobal &sg, const Team &team): + g(sg), + t(team), + scale_factor_inv(g.scale_factor_inv) + { +#if (WARP_SIZE == 1) + e = t.league_rank(); + x = y = z = 0; +#else + const int lr = t.league_rank(); + e = lr / SPHERE_BLOCKS_PER_COL; + const int iw = lr % SPHERE_BLOCKS_PER_COL; + const int tr = t.team_rank(); + const int ixy = tr / SPHERE_BLOCK_LEV; + const int dz = tr % SPHERE_BLOCK_LEV; + z = dz + iw * SPHERE_BLOCK_LEV; + update(ixy / NP, ixy % NP); +#endif + } + + KOKKOS_INLINE_FUNCTION void update(const int ix, const int iy) + { + rrdmd = 0; + x = ix; + y = iy; + metdet = g.metdet(e,x,y); + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = g.dinv(e,i,j,x,y); + for (int j = 0; j < NP; j++) { + dvvx[j] = g.dvv(x,j); + dvvy[j] = g.dvv(y,j); + } + } + + KOKKOS_INLINE_FUNCTION Scalar div(const SphereBlockScratch &t0, const SphereBlockScratch &t1) + { + // Separately compute du and dv to match Fortran bfb + Scalar du = 0; + Scalar dv = 0; + for (int j = 0; j < NP; j++) { + du += dvvy[j] * t0.sv(x,j); + dv += dvvx[j] * t1.sv(j,y); + } + if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv; + return (du + dv) * rrdmd; + } + + KOKKOS_INLINE_FUNCTION void divInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Scalar v0, const Scalar v1) const + { + t0.sv(x,y) = (dinv[0][0] * v0 + dinv[1][0] * v1) * metdet; + t1.sv(x,y) = (dinv[0][1] * v0 + dinv[1][1] * v1) * metdet; + } + + KOKKOS_INLINE_FUNCTION void grad(Scalar &g0, Scalar &g1, const SphereBlockScratch &t) const + { + Scalar s0 = 0; + Scalar s1 = 0; + for (int j = 0; j < NP; j++) { + s0 += dvvy[j] * t.sv(x,j); + s1 += dvvx[j] * t.sv(j,y); + } + s0 *= scale_factor_inv; + s1 *= scale_factor_inv; + g0 = dinv[0][0] * s0 + dinv[0][1] * s1; + g1 = dinv[1][0] * s0 + dinv[1][1] * s1; + } + + KOKKOS_INLINE_FUNCTION void gradInit(SphereBlockScratch &t0, const Scalar v0) + { + t0.sv(x,y) = v0; + } + + KOKKOS_INLINE_FUNCTION Scalar vort(const SphereBlockScratch &t0, const SphereBlockScratch &t1) + { + Scalar du = 0; + Scalar dv = 0; + for (int j = 0; j < NP; j++) { + du += dvvx[j] * t0.sv(j,y); + dv += dvvy[j] * t1.sv(x,j); + } + if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv; + return (dv - du) * rrdmd; + } + + KOKKOS_INLINE_FUNCTION void vortInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Scalar v0, const Scalar v1) const + { + t0.sv(x,y) = g.d(e,0,0,x,y) * v0 + g.d(e,0,1,x,y) * v1; + t1.sv(x,y) = g.d(e,1,0,x,y) * v0 + g.d(e,1,1,x,y) * v1; + } + + KOKKOS_INLINE_FUNCTION Scalar zabove(const Scalar *const here, const Scalar *const above) const + { + if (VECTOR_SIZE == 1) return above[0]; + Scalar out; + for (int i = 0; i < VECTOR_END; i++) out[i] = here[0][i+1]; + if (z < NUM_LEV_P-1) out[VECTOR_END] = above[0][0]; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar zabove(const Scalar top, const Scalar *const here, const Scalar *const above) const + { + if (VECTOR_SIZE == 1) return (z < NUM_LEV-1) ? above[0] : top; + Scalar out; + for (int i = 0; i < VECTOR_END; i++) out[i] = here[0][i+1]; + if (z < NUM_LEV-1) out[VECTOR_END] = above[0][0]; + else out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top[NUM_PHYSICAL_LEV%VECTOR_SIZE]; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar zbelow(const Scalar bottom, const Scalar *const below, const Scalar *const here) const + { + if (VECTOR_SIZE == 1) return (z > 0) ? below[0] : bottom; + Scalar out; + for (int i = 1; i < VECTOR_SIZE; i++) out[i] = here[0][i-1]; + if (z > 0) out[0] = below[0][VECTOR_END]; + else out[0] = bottom[0]; + return out; + } + +#if (WARP_SIZE == 1) + template + static void parallel_for(const Team &team, int /*num_scratch*/, F f) + { + f(team); + } +#else + template + static void parallel_for(const int num_elems, const int num_scratch, F f) + { + Kokkos::parallel_for( + __PRETTY_FUNCTION__, + TeamPolicy(num_elems * SPHERE_BLOCKS_PER_COL, SPHERE_BLOCK). + set_scratch_size(0, Kokkos::PerTeam(num_scratch * SphereBlockScratchView::shmem_size())), + f); + } +#endif + +}; + +#if (WARP_SIZE == 1) + +KOKKOS_INLINE_FUNCTION SphereBlockScratch::SphereBlockScratch(const SphereBlockOps &b): + b(b) +{} + +KOKKOS_INLINE_FUNCTION const Scalar &SphereBlockScratch::sv(const int x, const int y) const +{ return v[x][y][b.z]; } + +KOKKOS_INLINE_FUNCTION Scalar &SphereBlockScratch::sv(const int x, const int y) +{ return v[x][y][b.z]; } + +#else + +KOKKOS_INLINE_FUNCTION SphereBlockScratch::SphereBlockScratch(const SphereBlockOps &b): + v(b.t.team_scratch(0)), + sv(Kokkos::subview(v, b.z % SPHERE_BLOCK_LEV, Kokkos::ALL, Kokkos::ALL)) +{} + +#endif + +struct SphereCol { + int e,x,y,z; + +#if (WARP_SIZE == 1) + struct Team { int e,x,y,z; }; + KOKKOS_INLINE_FUNCTION SphereCol(const Team &team): + e(team.e), + x(team.x), + y(team.y), + z(team.z) + {} + + template + static void parallel_for(const SphereOuter::Team &outer, const int num_lev, F f) + { + const int e = outer.league_rank(); + for (int x = 0; x < NP; x++) for (int y = 0; y < NP; y++) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(outer, num_lev), + [=] (const int z) { f(Team{e,x,y,z}); }); + } + } +#else + using Team = TeamPolicy::member_type; + + KOKKOS_INLINE_FUNCTION SphereCol(const Team &team) + { + const int lr = team.league_rank(); + e = lr / NPNP; + const int xy = lr % NPNP; + x = xy / NP; + y = xy % NP; + z = team.team_rank(); + } + + template + static void parallel_for(const int num_elems, const int num_lev, F f) + { + Kokkos::parallel_for( + __PRETTY_FUNCTION__, + TeamPolicy(num_elems * NPNP, num_lev), + f); + } +#endif + + KOKKOS_INLINE_FUNCTION Scalar zplus(const Scalar *const here, const Scalar *const above) const + { + Scalar out; + for (int i = 0; i < VECTOR_END; i++) out[i] = here[0][i+1]; + out[VECTOR_END] = above[0][0]; + return out; + } +}; + +struct SphereColOps: SphereCol { + Real scale_factor_inv; + Real dinv[2][2]; + Real dvvx[NP]; + Real dvvy[NP]; + + KOKKOS_INLINE_FUNCTION SphereColOps(const SphereGlobal &sg, const Team &team): + SphereCol(team) + { + scale_factor_inv = sg.scale_factor_inv; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = sg.dinv(e,i,j,x,y); + for (int j = 0; j < NP; j++) { + dvvx[j] = sg.dvv(x,j); + dvvy[j] = sg.dvv(y,j); + } + } + + template + KOKKOS_INLINE_FUNCTION void grad(OutView &out, const InView &in, const int n) const + { + Scalar s0 = 0; + Scalar s1 = 0; + for (int j = 0; j < NP; j++) { + s0 += dvvy[j] * in(e,n,x,j,z); + s1 += dvvx[j] * in(e,n,j,y,z); + } + s0 *= scale_factor_inv; + s1 *= scale_factor_inv; + out(e,0,x,y,z) = dinv[0][0] * s0 + dinv[0][1] * s1; + out(e,1,x,y,z) = dinv[1][0] * s0 + dinv[1][1] * s1; + } + + KOKKOS_INLINE_FUNCTION Scalar zabove(const Scalar top, const Scalar *const in) const + { + Scalar out = *in; + if (z == NUM_LEV_P-1) out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top[NUM_PHYSICAL_LEV%VECTOR_SIZE]; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar zbelow(const Real bottom, const Scalar *const below, const Scalar *const here) const + { + Scalar out; + out[0] = (z == 0) ? bottom : below[0][VECTOR_END]; + for (int i = 1; i < VECTOR_SIZE; i++) out[i] = here[0][i-1]; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar ztop(const Real top, const Real in) const + { + Scalar out = in; + if (z == NUM_LEV_P-1) out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top; + return out; + } + + KOKKOS_INLINE_FUNCTION Scalar ztrim(const Scalar bottom, const Scalar top, const Scalar in) const + { + Scalar out = in; + if (z == 0) out[0] = bottom[0]; + if (z == NUM_LEV_P-1) out[NUM_PHYSICAL_LEV%VECTOR_SIZE] = top[NUM_PHYSICAL_LEV%VECTOR_SIZE]; + return out; + } +}; + +struct SphereScanOps { + TeamPolicy::member_type t; + int e,x,y; + +#if (WARP_SIZE == 1) + struct Team { + TeamPolicy::member_type t; + int e,x,y; + }; + + KOKKOS_INLINE_FUNCTION SphereScanOps(const Team &team): + t(team.t), + e(team.e), + x(team.x), + y(team.y) + {} + + template + static void parallel_for(const SphereOuter::Team &outer, F f) + { + const int e = outer.league_rank(); + for (int x = 0; x < NP; x++) for (int y = 0; y < NP; y++) { + f(Team{outer,e,x,y}); + } + } +#else + using Team = TeamPolicy::member_type; + + KOKKOS_INLINE_FUNCTION SphereScanOps(const Team &team): + t(team) + { + const int lr = t.league_rank(); + const int tr = t.team_rank(); + e = lr; + x = tr / NP; + y = tr % NP; + } + + template + static void parallel_for(const int num_elems, F f) + { + Kokkos::parallel_for( + __PRETTY_FUNCTION__, + TeamPolicy(num_elems, NPNP, WARP_SIZE), + f); + } +#endif + + template + KOKKOS_INLINE_FUNCTION void nacs(OutView &out, const int n, const InView &in, const EndView &end) const + { + Dispatch<>::parallel_scan( + t, NUM_PHYSICAL_LEV, + [&](const int a, Real &sum, const bool last) { + if (a == 0) out(e,n,x,y,NUM_PHYSICAL_LEV) = sum = end(e,x,y); + const int z = NUM_PHYSICAL_LEV-a-1; + sum += in(e,x,y,z); + if (last) out(e,n,x,y,z) = sum; + }); + } + + template + KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const Real zero) const + { + Dispatch<>::parallel_scan( + t, NUM_PHYSICAL_LEV, + [&](const int z, Real &sum, const bool last) { + if (z == 0) out(e,x,y,0) = sum = zero; + sum += in(e,x,y,z); + if (last) out(e,x,y,z+1) = sum; + }); + } + + template + KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const int n, const Real zero) const + { + Dispatch<>::parallel_scan( + t, NUM_PHYSICAL_LEV, + [&](const int z, Real &sum, const bool last) { + if (z == 0) out(e,x,y,0) = sum = zero; + sum += in(e,n,x,y,z); + if (last) out(e,x,y,z+1) = sum; + }); + } +}; + } // namespace Homme #endif // HOMMEXX_SPHERE_OPERATORS_HPP diff --git a/components/homme/src/share/cxx/utilities/ViewUtils.hpp b/components/homme/src/share/cxx/utilities/ViewUtils.hpp index 1b3efcd91020..b749e99e096a 100644 --- a/components/homme/src/share/cxx/utilities/ViewUtils.hpp +++ b/components/homme/src/share/cxx/utilities/ViewUtils.hpp @@ -67,6 +67,18 @@ viewAsReal(ViewType v_in) { return ReturnView(reinterpret_cast(v_in.data())); } +template +KOKKOS_INLINE_FUNCTION +typename +std::enable_if::type,Scalar>::value, + Unmanaged*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>> + >::type +viewAsReal(ViewType v_in) { + using ReturnST = RealType; + using ReturnView = Unmanaged*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>>; + return ReturnView(reinterpret_cast(v_in.data())); +} + template KOKKOS_INLINE_FUNCTION typename @@ -79,6 +91,30 @@ viewAsReal(ViewType v_in) { return ReturnView(reinterpret_cast(v_in.data())); } +template +KOKKOS_INLINE_FUNCTION +typename +std::enable_if::type,Scalar>::value, + Unmanaged*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>> + >::type +viewAsReal(ViewType v_in) { + using ReturnST = RealType; + using ReturnView = Unmanaged*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>>; + return ReturnView(reinterpret_cast(v_in.data())); +} + +template +KOKKOS_INLINE_FUNCTION +typename +std::enable_if::type,Scalar>::value, + Unmanaged*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>> + >::type +viewAsReal(ViewType v_in) { + using ReturnST = RealType; + using ReturnView = Unmanaged*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>>; + return ReturnView(reinterpret_cast(v_in.data())); +} + // Structure to define the type of a view that has a const data type, // given the type of an input view template diff --git a/components/homme/src/theta-l_kokkos/CMakeLists.txt b/components/homme/src/theta-l_kokkos/CMakeLists.txt index 191e3821cdd7..7b1b953a9b35 100644 --- a/components/homme/src/theta-l_kokkos/CMakeLists.txt +++ b/components/homme/src/theta-l_kokkos/CMakeLists.txt @@ -142,6 +142,7 @@ MACRO(THETAL_KOKKOS_SETUP) ) SET(THETAL_DEPS_CXX + ${TARGET_DIR}/cxx/CaarFunctorImpl.cpp ${TARGET_DIR}/cxx/CamForcing.cpp ${TARGET_DIR}/cxx/Diagnostics.cpp ${TARGET_DIR}/cxx/ElementsForcing.cpp diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp new file mode 100644 index 000000000000..a36c8b09c5ff --- /dev/null +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp @@ -0,0 +1,636 @@ +#include "CaarFunctorImpl.hpp" + +namespace Homme { + +template +void CaarFunctorImpl::epoch1_blockOps(const SphereOuter::Team &team) +{ + auto buffers_dp_tens = m_buffers.dp_tens; + auto buffers_exner = m_buffers.exner; + auto buffers_phi = m_buffers.phi; + auto buffers_pnh = m_buffers.pnh; + auto buffers_theta_tens = m_buffers.theta_tens; + auto buffers_vdp = m_buffers.vdp; + + const Real data_eta_ave_w = m_data.eta_ave_w; + const int data_n0 = m_data.n0; + + auto derived_vn0 = m_derived.m_vn0; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = m_state.m_dp3d; + auto state_phinh_i = m_state.m_phinh_i; + auto state_v = m_state.m_v; + auto state_vtheta_dp= m_state.m_vtheta_dp; + + SphereBlockOps::parallel_for( + team, 4, + KOKKOS_LAMBDA(const SphereBlockOps::Team &team) { + SphereBlockOps b(sg, team); + SphereBlockScratch ttmp0(b), ttmp1(b), ttmp2(b), ttmp3(b); + + SPHERE_BLOCK_START3(b, v0, v1, vtheta); + + if (!HYDROSTATIC) { + + const Scalar dphi = b.zabove(&state_phinh_i(b.e,data_n0,b.x,b.y,b.z), &state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1)) - state_phinh_i(b.e,data_n0,b.x,b.y,b.z); + const Scalar vtheta_dp = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z); + for (int i = 0; i < VECTOR_SIZE; i++) { + if ((vtheta_dp[i] < 0) || (dphi[i] > 0)) { + if (b.z * VECTOR_SIZE + i < NUM_PHYSICAL_LEV) abort(); + } + } + + EquationOfState::compute_pnh_and_exner(vtheta_dp, dphi, buffers_pnh(b.e,b.x,b.y,b.z), buffers_exner(b.e,b.x,b.y,b.z)); + + buffers_phi(b.e,b.x,b.y,b.z) = 0.5 * (state_phinh_i(b.e,data_n0,b.x,b.y,b.z) + b.zabove(&state_phinh_i(b.e,data_n0,b.x,b.y,b.z), &state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1))); + } + + v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z); + v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z); + b.divInit(ttmp0, ttmp1, v0, v1); + buffers_vdp(b.e,0,b.x,b.y,b.z) = v0; + buffers_vdp(b.e,1,b.x,b.y,b.z) = v1; + derived_vn0(b.e,0,b.x,b.y,b.z) += data_eta_ave_w * v0; + derived_vn0(b.e,1,b.x,b.y,b.z) += data_eta_ave_w * v1; + + vtheta = 0; + + if (CONSERVATIVE) { + const Scalar sv0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z); + const Scalar sv1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z); + b.divInit(ttmp2, ttmp3, sv0, sv1); + } else { + vtheta = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z); + b.gradInit(ttmp2, vtheta); + } + + SPHERE_BLOCK_MIDDLE3(b, v0, v1, vtheta); + + const Scalar dvdp = b.div(ttmp0, ttmp1); + buffers_dp_tens(b.e,b.x,b.y,b.z) = dvdp; + + if (CONSERVATIVE) { + buffers_theta_tens(b.e,b.x,b.y,b.z) = b.div(ttmp2, ttmp3); + } else { + Scalar grad0, grad1; + b.grad(grad0, grad1, ttmp2); + Scalar theta_tens = dvdp * vtheta; + theta_tens += grad0 * v0; + theta_tens += grad1 * v1; + buffers_theta_tens(b.e,b.x,b.y,b.z) = theta_tens; + } + + SPHERE_BLOCK_END(); + }); +} + +template +void CaarFunctorImpl::epoch2_scanOps(const SphereOuter::Team &team) +{ + auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens); + auto buffers_dp_i = viewAsReal(m_buffers.dp_i); + auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn); + auto buffers_w_tens = viewAsReal(m_buffers.w_tens); + + const int data_n0 = m_data.n0; + auto &hvcoord_hybrid_bi = m_hvcoord.hybrid_bi; + const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0; + auto state_dp3d = viewAsReal(m_state.m_dp3d); + + SphereScanOps::parallel_for( + team, + KOKKOS_LAMBDA(const SphereScanOps::Team &team) { + SphereScanOps s(team); + + s.scan(buffers_dp_i, state_dp3d, data_n0, pi_i00); + s.scan(buffers_w_tens, buffers_dp_tens, 0); + + if (RSPLIT_ZERO) { + + s.scan(buffers_eta_dot_dpdn, buffers_dp_tens, 0); + + const Real last = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(s.t, 1, NUM_PHYSICAL_LEV), + [&](const int z) { + Real eta_dot_dpdn = -buffers_eta_dot_dpdn(s.e,s.x,s.y,z); + eta_dot_dpdn += hvcoord_hybrid_bi(z) * last; + buffers_eta_dot_dpdn(s.e,s.x,s.y,z) = eta_dot_dpdn; + }); + + Kokkos::single( + Kokkos::PerThread(s.t), + [&]() { + buffers_eta_dot_dpdn(s.e,s.x,s.y,0) = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV) = 0; + }); + } + }); +} + +template +void CaarFunctorImpl::epoch3_blockOps(const SphereOuter::Team &team) +{ + auto buffers_dp_i = m_buffers.dp_i; + auto buffers_dp_tens = m_buffers.dp_tens; + auto buffers_eta_dot_dpdn = m_buffers.eta_dot_dpdn; + auto buffers_exner = m_buffers.exner; + auto buffers_omega_p = m_buffers.omega_p; + auto buffers_pnh = m_buffers.pnh; + auto buffers_temp = m_buffers.temp; + auto buffers_v_tens = m_buffers.v_tens; + auto buffers_w_tens = m_buffers.w_tens; + + const Real data_eta_ave_w = m_data.eta_ave_w; + const int data_n0 = m_data.n0; + + auto derived_eta_dot_dpdn = m_derived.m_eta_dot_dpdn; + auto derived_omega_p = m_derived.m_omega_p; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = m_state.m_dp3d; + auto state_v = m_state.m_v; + auto state_vtheta_dp= m_state.m_vtheta_dp; + auto state_w_i = m_state.m_w_i; + + SphereBlockOps::parallel_for( + team, 1, + KOKKOS_LAMBDA(const SphereBlockOps::Team &team) { + SphereBlockOps b(sg, team); + SphereBlockScratch ttmp0(b); + + SPHERE_BLOCK_START0(b); + + const Scalar pi = 0.5 * (buffers_dp_i(b.e,b.x,b.y,b.z) + b.zabove(&buffers_dp_i(b.e,b.x,b.y,b.z), &buffers_dp_i(b.e,b.x,b.y,b.z+1))); + b.gradInit(ttmp0, pi); + + if (HYDROSTATIC) { + Scalar exner = pi; + EquationOfState::pressure_to_exner(exner); + buffers_exner(b.e,b.x,b.y,b.z) = exner; + buffers_pnh(b.e,b.x,b.y,b.z) = EquationOfState::compute_dphi(state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z), exner, pi); + } + + SPHERE_BLOCK_MIDDLE0(b); + + Scalar grad0, grad1; + b.grad(grad0, grad1, ttmp0); + + Scalar omega = -0.5 * (buffers_w_tens(b.e,b.x,b.y,b.z) + b.zabove(&buffers_w_tens(b.e,b.x,b.y,b.z), &buffers_w_tens(b.e,b.x,b.y,b.z+1))); + const Scalar uz = state_v(b.e,data_n0,0,b.x,b.y,b.z); + const Scalar vz = state_v(b.e,data_n0,1,b.x,b.y,b.z); + omega += uz * grad0 + vz * grad1; + buffers_omega_p(b.e,b.x,b.y,b.z) = omega; + + derived_omega_p(b.e,b.x,b.y,b.z) += data_eta_ave_w * omega; + + if (RSPLIT_ZERO) { + + const Scalar dp = state_dp3d(b.e,data_n0,b.x,b.y,b.z); + const Scalar etap = b.zabove(&buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z), &buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z+1)); + const Scalar etaz = buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z); + + buffers_dp_tens(b.e,b.x,b.y,b.z) += etap - etaz; + + derived_eta_dot_dpdn(b.e,b.x,b.y,b.z) += data_eta_ave_w * etaz; + + if (!HYDROSTATIC) { + const Scalar dw = b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1)) - state_w_i(b.e,data_n0,b.x,b.y,b.z); + const Scalar eta = 0.5 * (etaz + etap); + buffers_temp(b.e,b.x,b.y,b.z) = dw * eta; + } + + const Scalar facp = 0.5 * etap / dp; + Scalar u = facp * (b.zabove(uz, &state_v(b.e,data_n0,0,b.x,b.y,b.z), &state_v(b.e,data_n0,0,b.x,b.y,b.z+1)) - uz); + Scalar v = facp * (b.zabove(vz, &state_v(b.e,data_n0,1,b.x,b.y,b.z), &state_v(b.e,data_n0,1,b.x,b.y,b.z+1)) - vz); + + const Scalar facm = 0.5 * etaz / dp; + u += facm * (uz - b.zbelow(uz, &state_v(b.e,data_n0,0,b.x,b.y,b.z-1), &state_v(b.e,data_n0,0,b.x,b.y,b.z))); + v += facm * (vz - b.zbelow(uz, &state_v(b.e,data_n0,1,b.x,b.y,b.z-1), &state_v(b.e,data_n0,1,b.x,b.y,b.z))); + buffers_v_tens(b.e,0,b.x,b.y,b.z) = u; + buffers_v_tens(b.e,1,b.x,b.y,b.z) = v; + } + SPHERE_BLOCK_END(); + }); +} + +void CaarFunctorImpl::epoch4_scanOps(const SphereOuter::Team &team) +{ + auto buffers_phi = viewAsReal(m_buffers.phi); + auto buffers_pnh = viewAsReal(m_buffers.pnh); + + const int data_n0 = m_data.n0; + + auto &geometry_phis = m_geometry.m_phis; + + auto state_phinh_i = viewAsReal(m_state.m_phinh_i); + + SphereScanOps::parallel_for( + team, + KOKKOS_LAMBDA(const SphereScanOps::Team &team) { + SphereScanOps s(team); + s.nacs(state_phinh_i, data_n0, buffers_pnh, geometry_phis); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(s.t, NUM_PHYSICAL_LEV), + [&](const int z) { + buffers_phi(s.e,s.x,s.y,z) = 0.5 * (state_phinh_i(s.e,data_n0,s.x,s.y,z) + state_phinh_i(s.e,data_n0,s.x,s.y,z+1)); + }); + }); +} + +template +void CaarFunctorImpl::epoch5_colOps(const SphereOuter::Team &team) +{ + auto buffers_dp_i = m_buffers.dp_i; + auto buffers_dpnh_dp_i = m_buffers.dpnh_dp_i; + auto buffers_eta_dot_dpdn = m_buffers.eta_dot_dpdn; + auto buffers_exner = m_buffers.exner; + auto buffers_grad_phinh_i = m_buffers.grad_phinh_i; + auto buffers_grad_w_i = m_buffers.grad_w_i; + auto buffers_phi = m_buffers.phi; + auto buffers_phi_tens = m_buffers.phi_tens; + auto buffers_pnh = m_buffers.pnh; + auto buffers_temp = m_buffers.temp; + auto buffers_v_i = m_buffers.v_i; + auto buffers_vtheta_i = m_buffers.vtheta_i; + auto buffers_w_tens = m_buffers.w_tens; + + const int data_n0 = m_data.n0; + const Real dscale = m_data.scale1 - m_data.scale2; + + auto &geometry_gradphis = m_geometry.m_gradphis; + + const Real gscale1 = m_data.scale1 * PhysicalConstants::g; + const Real gscale2 = m_data.scale2 * PhysicalConstants::g; + + auto hvcoord_hybrid_bi_packed = m_hvcoord.hybrid_bi_packed; + + const Real ndata_scale1 = -m_data.scale1; + const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = m_state.m_dp3d; + auto state_phinh_i = m_state.m_phinh_i; + auto state_v = m_state.m_v; + auto state_w_i = m_state.m_w_i; + + SphereColOps::parallel_for( + team, NUM_LEV_P, + KOKKOS_LAMBDA(const SphereColOps::Team &team) { + SphereColOps c(sg, team); + + c.grad(buffers_grad_phinh_i, state_phinh_i, data_n0); + if (!HYDROSTATIC) c.grad(buffers_grad_w_i, state_w_i, data_n0); + + const Scalar dm = c.zbelow(0, &state_dp3d(c.e,data_n0,c.x,c.y,c.z-1), &state_dp3d(c.e,data_n0,c.x,c.y,c.z));; + const Scalar dz = c.zabove(0, &state_dp3d(c.e,data_n0,c.x,c.y,c.z)); + const Scalar dp_i = c.ztrim(dz, dm, 0.5 * (dz + dm)); + buffers_dp_i(c.e,c.x,c.y,c.z) = dp_i; + + if (!HYDROSTATIC) { + + const Scalar v0m = c.zbelow(0, &state_v(c.e,data_n0,0,c.x,c.y,c.z-1), &state_v(c.e,data_n0,0,c.x,c.y,c.z));; + const Scalar v0z = c.zabove(0, &state_v(c.e,data_n0,0,c.x,c.y,c.z)); + const Scalar v_i0 = c.ztrim(v0z, v0m, (dz * v0z + dm * v0m) / (dm + dz)); + buffers_v_i(c.e,0,c.x,c.y,c.z) = v_i0; + + const Scalar v1m = c.zbelow(0, &state_v(c.e,data_n0,1,c.x,c.y,c.z-1), &state_v(c.e,data_n0,1,c.x,c.y,c.z));; + const Scalar v1z = c.zabove(0, &state_v(c.e,data_n0,1,c.x,c.y,c.z)); + const Scalar v_i1 = c.ztrim(v1z, v1m, (dz * v1z + dm * v1m) / (dm + dz)); + buffers_v_i(c.e,1,c.x,c.y,c.z) = v_i1; + + const Scalar pm = c.zbelow(pi_i00, &buffers_pnh(c.e,c.x,c.y,c.z-1), &buffers_pnh(c.e,c.x,c.y,c.z));; + const Scalar pz = c.zabove(pm + 0.5 * dm, &buffers_pnh(c.e,c.x,c.y,c.z)); + buffers_dpnh_dp_i(c.e,c.x,c.y,c.z) = 2.0 * (pz - pm) / (dm + dz); + } + + if (RSPLIT_ZERO) { + + const Scalar phim = c.zbelow(0, &buffers_phi(c.e,c.x,c.y,c.z-1), &buffers_phi(c.e,c.x,c.y,c.z));; + const Scalar phiz = c.zabove(0, &buffers_phi(c.e,c.x,c.y,c.z)); + + if (!HYDROSTATIC) { + const Scalar phi_vadv = c.ztrim(0, 0, (phiz - phim) * buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z) / dp_i); + buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_vadv; + } + + Scalar vtheta_i = phiz - phim; + + const Scalar exnerm = c.zbelow(0, &buffers_exner(c.e,c.x,c.y,c.z-1), &buffers_exner(c.e,c.x,c.y,c.z));; + const Scalar exnerz = c.zabove(0, &buffers_exner(c.e,c.x,c.y,c.z)); + const Scalar dexner = c.ztrim(1, 1, exnerz - exnerm); + vtheta_i /= dexner; + + vtheta_i /= -PhysicalConstants::cp; // separate for BFB equality with original Fortran + + if (!HYDROSTATIC) vtheta_i *= buffers_dpnh_dp_i(c.e,c.x,c.y,c.z); + buffers_vtheta_i(c.e,c.x,c.y,c.z) = vtheta_i; + } + + if (!HYDROSTATIC) { + + Scalar w_tens = 0; + if (RSPLIT_ZERO) { + const Scalar tempm = c.zbelow(0, &buffers_temp(c.e,c.x,c.y,c.z-1), &buffers_temp(c.e,c.x,c.y,c.z));; + const Scalar tempz = c.zabove(0, &buffers_temp(c.e,c.x,c.y,c.z)); + const Scalar dw = c.ztrim(tempz, tempm, 0.5 * (tempz + tempm)); + w_tens = dw / dp_i; + } + w_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_w_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_w_i(c.e,1,c.x,c.y,c.z); + w_tens *= ndata_scale1; + const Scalar gscale = c.ztop(gscale1, gscale2); + w_tens += (buffers_dpnh_dp_i(c.e,c.x,c.y,c.z)-Scalar(1)) * gscale; + buffers_w_tens(c.e,c.x,c.y,c.z) = w_tens; + + Scalar phi_tens = (RSPLIT_ZERO) ? buffers_phi_tens(c.e,c.x,c.y,c.z) : 0; + phi_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,1,c.x,c.y,c.z); + phi_tens *= ndata_scale1; + phi_tens += state_w_i(c.e,data_n0,c.x,c.y,c.z) * gscale; + + if (dscale) phi_tens += dscale * (buffers_v_i(c.e,0,c.x,c.y,c.z) * geometry_gradphis(c.e,0,c.x,c.y) + buffers_v_i(c.e,1,c.x,c.y,c.z) * geometry_gradphis(c.e,1,c.x,c.y)) * hvcoord_hybrid_bi_packed(c.z); + + buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_tens; + } + }); +} + +template +void CaarFunctorImpl::epoch6_blockOps(const SphereOuter::Team &team) +{ + auto buffers_dpnh_dp_i = m_buffers.dpnh_dp_i; + auto buffers_exner = m_buffers.exner; + auto buffers_grad_phinh_i = m_buffers.grad_phinh_i; + auto buffers_grad_w_i = m_buffers.grad_w_i; + auto buffers_v_tens = m_buffers.v_tens; + + const int data_n0 = m_data.n0; + + auto &geometry_fcor = m_geometry.m_fcor; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = m_state.m_dp3d; + auto state_v = m_state.m_v; + auto state_vtheta_dp= m_state.m_vtheta_dp; + auto state_w_i = m_state.m_w_i; + + SphereBlockOps::parallel_for( + team, 6, + KOKKOS_LAMBDA(const SphereBlockOps::Team &team) { + SphereBlockOps b(sg, team); + SphereBlockScratch ttmp0(b), ttmp1(b), ttmp2(b), ttmp3(b), ttmp4(b), ttmp5(b); + + SPHERE_BLOCK_START3(b, exneriz, v0, v1); + + if (!HYDROSTATIC) { + const Scalar wz = b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1)); + const Scalar w2 = 0.25 * (state_w_i(b.e,data_n0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + wz * wz); + b.gradInit(ttmp0, w2); + } + + exneriz = buffers_exner(b.e,b.x,b.y,b.z); + b.gradInit(ttmp1, exneriz); + + const Scalar log_exneriz = (PGRAD_CORRECTION) ? log(exneriz) : 0; + b.gradInit(ttmp2, log_exneriz); + + v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z); + v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z); + + b.vortInit(ttmp3, ttmp4, v0, v1); + b.gradInit(ttmp5, 0.5 * (v0 * v0 + v1 * v1)); + + SPHERE_BLOCK_MIDDLE3(b, exneriz, v0, v1); + + Scalar grad_v0, grad_v1; + b.grad(grad_v0, grad_v1, ttmp5); + + Scalar u_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,0,b.x,b.y,b.z) : 0; + Scalar v_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,1,b.x,b.y,b.z) : 0; + u_tens += grad_v0; + v_tens += grad_v1; + + const Scalar cp_vtheta = PhysicalConstants::cp * (state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z)); + + Scalar grad_exner0, grad_exner1; + b.grad(grad_exner0, grad_exner1, ttmp1); + + u_tens += cp_vtheta * grad_exner0; + v_tens += cp_vtheta * grad_exner1; + + Scalar mgrad_x, mgrad_y; + if (HYDROSTATIC) { + + mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1))); + mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1))); + + } else { + + mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1)) * b.zabove(&buffers_dpnh_dp_i(b.e,b.x,b.y,b.z), &buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1))); + mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + b.zabove(&buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z), &buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1)) * b.zabove(&buffers_dpnh_dp_i(b.e,b.x,b.y,b.z), &buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1))); + + } + + if (PGRAD_CORRECTION) { + + Scalar grad_lexner0, grad_lexner1; + b.grad(grad_lexner0, grad_lexner1, ttmp2); + + namespace PC = PhysicalConstants; + constexpr Real cpt0 = PC::cp * (PC::Tref - PC::Tref_lapse_rate * PC::Tref * PC::cp / PC::g); + mgrad_x += cpt0 * (grad_lexner0 - grad_exner0 / exneriz); + mgrad_y += cpt0 * (grad_lexner1 - grad_exner1 / exneriz); + } + + Scalar wvor_x = 0; + Scalar wvor_y = 0; + if (!HYDROSTATIC) { + b.grad(wvor_x, wvor_y, ttmp0); + wvor_x -= 0.5 * (buffers_grad_w_i(b.e,0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + b.zabove(&buffers_grad_w_i(b.e,0,b.x,b.y,b.z), &buffers_grad_w_i(b.e,0,b.x,b.y,b.z+1)) * b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1))); + wvor_y -= 0.5 * (buffers_grad_w_i(b.e,1,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + b.zabove(&buffers_grad_w_i(b.e,1,b.x,b.y,b.z), &buffers_grad_w_i(b.e,1,b.x,b.y,b.z+1)) * b.zabove(&state_w_i(b.e,data_n0,b.x,b.y,b.z), &state_w_i(b.e,data_n0,b.x,b.y,b.z+1))); + } + + u_tens += mgrad_x + wvor_x; + v_tens += mgrad_y + wvor_y; + + const Scalar vort = b.vort(ttmp3, ttmp4) + geometry_fcor(b.e,b.x,b.y); + u_tens -= v1 * vort; + v_tens += v0 * vort; + + buffers_v_tens(b.e,0,b.x,b.y,b.z) = u_tens; + buffers_v_tens(b.e,1,b.x,b.y,b.z) = v_tens; + + SPHERE_BLOCK_END(); + }); +} + +template +void CaarFunctorImpl::epoch7_col(const SphereOuter::Team &team) +{ + auto buffers_dp_tens = m_buffers.dp_tens; + auto buffers_eta_dot_dpdn = m_buffers.eta_dot_dpdn; + auto buffers_phi_tens = m_buffers.phi_tens; + auto buffers_theta_tens = m_buffers.theta_tens; + auto buffers_v_tens = m_buffers.v_tens; + auto buffers_vtheta_i = m_buffers.vtheta_i; + auto buffers_w_tens = m_buffers.w_tens; + + const Real data_dt = m_data.dt; + const int data_nm1 = m_data.nm1; + const int data_np1 = m_data.np1; + const Real data_scale3 = m_data.scale3; + + auto &geometry_spheremp = m_geometry.m_spheremp; + + const Real scale1_dt = m_data.scale1 * m_data.dt; + + auto state_dp3d = m_state.m_dp3d; + auto state_phinh_i = m_state.m_phinh_i; + auto state_v = m_state.m_v; + auto state_vtheta_dp = m_state.m_vtheta_dp; + auto state_w_i = m_state.m_w_i; + + SphereCol::parallel_for( + team, NUM_LEV, + KOKKOS_LAMBDA(const SphereCol::Team &team) { + const SphereCol c(team); + + const Real spheremp = geometry_spheremp(c.e,c.x,c.y); + const Real scale1_dt_spheremp = scale1_dt * spheremp; + const Real scale3_spheremp = data_scale3 * spheremp; + + Scalar dp_tens = buffers_dp_tens(c.e,c.x,c.y,c.z); + dp_tens *= scale1_dt_spheremp; + Scalar dp_np1 = scale3_spheremp * state_dp3d(c.e,data_nm1,c.x,c.y,c.z); + dp_np1 -= dp_tens; + state_dp3d(c.e,data_np1,c.x,c.y,c.z) = dp_np1; + + Scalar theta_tens = buffers_theta_tens(c.e,c.x,c.y,c.z); + if (RSPLIT_ZERO) { + const Scalar etap = c.zplus(&buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z), &buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z+1)); + const Scalar etaz = buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z); + const Scalar thetap = etap * c.zplus(&buffers_vtheta_i(c.e,c.x,c.y,c.z), &buffers_vtheta_i(c.e,c.x,c.y,c.z+1)); + const Scalar thetaz = etaz * buffers_vtheta_i(c.e,c.x,c.y,c.z); + theta_tens += thetap - thetaz; + } + theta_tens *= -scale1_dt_spheremp; + Scalar vtheta_np1 = state_vtheta_dp(c.e,data_nm1,c.x,c.y,c.z); + vtheta_np1 *= scale3_spheremp; + vtheta_np1 += theta_tens; + state_vtheta_dp(c.e,data_np1,c.x,c.y,c.z) = vtheta_np1; + + Scalar u_tens = buffers_v_tens(c.e,0,c.x,c.y,c.z); + u_tens *= -scale1_dt_spheremp; + Scalar u_np1 = state_v(c.e,data_nm1,0,c.x,c.y,c.z); + u_np1 *= scale3_spheremp; + u_np1 += u_tens; + state_v(c.e,data_np1,0,c.x,c.y,c.z) = u_np1; + + Scalar v_tens = buffers_v_tens(c.e,1,c.x,c.y,c.z); + v_tens *= -scale1_dt_spheremp; + Scalar v_np1 = state_v(c.e,data_nm1,1,c.x,c.y,c.z); + v_np1 *= scale3_spheremp; + v_np1 += v_tens; + state_v(c.e,data_np1,1,c.x,c.y,c.z) = v_np1; + + if (!HYDROSTATIC) { + + const Real dt_spheremp = data_dt * spheremp; + + Scalar phi_tens = buffers_phi_tens(c.e,c.x,c.y,c.z); + phi_tens *= dt_spheremp; + Scalar phi_np1 = state_phinh_i(c.e,data_nm1,c.x,c.y,c.z); + phi_np1 *= scale3_spheremp; + phi_np1 += phi_tens; + state_phinh_i(c.e,data_np1,c.x,c.y,c.z) = phi_np1; + + Scalar w_tens = buffers_w_tens(c.e,c.x,c.y,c.z); + w_tens *= dt_spheremp; + Scalar w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,c.z); + w_np1 *= scale3_spheremp; + w_np1 += w_tens; + state_w_i(c.e,data_np1,c.x,c.y,c.z) = w_np1; + + if (c.z == NUM_LEV-1) { + constexpr int z = NUM_LEV_P-1; + constexpr int dz = NUM_PHYSICAL_LEV%VECTOR_SIZE; + Real w_tens = buffers_w_tens(c.e,c.x,c.y,z)[dz]; + w_tens *= dt_spheremp; + buffers_w_tens(c.e,c.x,c.y,z)[dz] = w_tens; + Real w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,z)[dz]; + w_np1 *= scale3_spheremp; + w_np1 += w_tens; + state_w_i(c.e,data_np1,c.x,c.y,z)[dz] = w_np1; + } + } + }); +} + +void CaarFunctorImpl::caar_compute() +{ + SphereOuter::parallel_for( + m_num_elems, + KOKKOS_LAMBDA(const SphereOuter::Team &team) { + + if (m_theta_hydrostatic_mode) { + if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps(team); + else epoch1_blockOps(team); + } else { + if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps(team); + else epoch1_blockOps(team); + } + + if (m_rsplit == 0) epoch2_scanOps(team); + else epoch2_scanOps(team); + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) epoch3_blockOps(team); + else epoch3_blockOps(team); + } else { + if (m_rsplit == 0) epoch3_blockOps(team); + else epoch3_blockOps(team); + } + + if (m_theta_hydrostatic_mode) epoch4_scanOps(team); + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) epoch5_colOps(team); + else epoch5_colOps(team); + } else { + if (m_rsplit == 0) epoch5_colOps(team); + else epoch5_colOps(team); + } + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) { + if (m_pgrad_correction) epoch6_blockOps(team); + else epoch6_blockOps(team); + } else { + if (m_pgrad_correction) epoch6_blockOps(team); + else epoch6_blockOps(team); + } + } else { + if (m_rsplit == 0) { + if (m_pgrad_correction) epoch6_blockOps(team); + else epoch6_blockOps(team); + } else { + if (m_pgrad_correction) epoch6_blockOps(team); + else epoch6_blockOps(team); + } + } + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) epoch7_col(team); + else epoch7_col(team); + } else { + if (m_rsplit == 0) epoch7_col(team); + else epoch7_col(team); + } + }); +} + +} diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp index 38f9dc8573d8..1a9d64ddf677 100644 --- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp @@ -30,6 +30,7 @@ #include "profiling.hpp" #include "ErrorDefs.hpp" +#include #include namespace Homme { @@ -107,7 +108,7 @@ struct CaarFunctorImpl { struct TagPostExchange {}; // Policies -#ifndef NDEBUG +#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG) template using TeamPolicyType = Kokkos::TeamPolicy,Tag>; #else @@ -123,6 +124,16 @@ struct CaarFunctorImpl { Kokkos::Array, NUM_TIME_LEVELS> m_bes; + template void epoch1_blockOps(const SphereOuter::Team &); + template void epoch2_scanOps(const SphereOuter::Team &); + template void epoch3_blockOps(const SphereOuter::Team &); + void epoch4_scanOps(const SphereOuter::Team &); + template void epoch5_colOps(const SphereOuter::Team &); + template void epoch6_blockOps(const SphereOuter::Team &); + template void epoch7_col(const SphereOuter::Team &); + void caar_compute(); + + public: CaarFunctorImpl(const Elements &elements, const Tracers &/* tracers */, const ReferenceElement &ref_FE, const HybridVCoord &hvcoord, const SphereOperators &sphere_ops, const SimulationParams& params) @@ -180,7 +191,7 @@ struct CaarFunctorImpl { int requested_buffer_size () const { // Ask the buffers manager to allocate enough buffers to satisfy Caar's needs - const int nslots = m_tu.get_num_ws_slots(); + const int nslots = std::max(m_tu.get_num_ws_slots(), m_num_elems); int num_scalar_mid_buf = Buffers::num_3d_scalar_mid_buf; int num_scalar_int_buf = Buffers::num_3d_scalar_int_buf; @@ -192,7 +203,7 @@ struct CaarFunctorImpl { if (m_theta_hydrostatic_mode) { // pi=pnh, and no wtens/phitens num_scalar_mid_buf -= 1; - num_scalar_int_buf -= 3; + num_scalar_int_buf -= 2; // No grad_w_i/v_i num_vector_int_buf -= 2; @@ -200,10 +211,6 @@ struct CaarFunctorImpl { if (m_rsplit>0) { // No theta_i/eta_dot_dpdn num_scalar_int_buf -=2; - if (m_theta_hydrostatic_mode) { - // No dp_i - num_scalar_int_buf -= 1; - } } return num_scalar_mid_buf *NP*NP*NUM_LEV *VECTOR_SIZE*nslots @@ -216,7 +223,7 @@ struct CaarFunctorImpl { Errors::runtime_check(fbm.allocated_size()>=requested_buffer_size(), "Error! Buffers size not sufficient.\n"); Scalar* mem = reinterpret_cast(fbm.get_memory()); - const int nslots = m_tu.get_num_ws_slots(); + const int nslots = std::max(m_tu.get_num_ws_slots(), m_num_elems); // Midpoints scalars m_buffers.pnh = decltype(m_buffers.pnh )(mem,nslots); @@ -260,10 +267,8 @@ struct CaarFunctorImpl { mem += m_buffers.v_tens.size(); // Interface scalars - if (!m_theta_hydrostatic_mode || m_rsplit==0) { - m_buffers.dp_i = decltype(m_buffers.dp_i)(mem,nslots); - mem += m_buffers.dp_i.size(); - } + m_buffers.dp_i = decltype(m_buffers.dp_i)(mem,nslots); + mem += m_buffers.dp_i.size(); if (!m_theta_hydrostatic_mode) { m_buffers.dpnh_dp_i = decltype(m_buffers.dpnh_dp_i)(mem,nslots); @@ -280,9 +285,9 @@ struct CaarFunctorImpl { if (!m_theta_hydrostatic_mode) { m_buffers.phi_tens = decltype(m_buffers.phi_tens )(mem,nslots); mem += m_buffers.phi_tens.size(); - m_buffers.w_tens = decltype(m_buffers.w_tens )(mem,nslots); - mem += m_buffers.w_tens.size(); } + m_buffers.w_tens = decltype(m_buffers.w_tens )(mem,nslots); + mem += m_buffers.w_tens.size(); // Interface vectors if (!m_theta_hydrostatic_mode) { @@ -346,6 +351,15 @@ struct CaarFunctorImpl { profiling_resume(); GPTLstart("caar compute"); + +#if 0 + + caar_compute(); + Kokkos::fence(); + GPTLstop("caar compute"); + +#else + int nerr; Kokkos::parallel_reduce("caar loop pre-boundary exchange", m_policy_pre, *this, nerr); Kokkos::fence(); @@ -353,6 +367,8 @@ struct CaarFunctorImpl { if (nerr > 0) check_print_abort_on_bad_elems("CaarFunctorImpl::run TagPreExchange", data.n0); +#endif + GPTLstart("caar_bexchV"); m_bes[data.np1]->exchange(m_geometry.m_rspheremp); Kokkos::fence(); @@ -544,8 +560,8 @@ struct CaarFunctorImpl { auto dp = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp); auto div_vdp = Homme::subview(m_buffers.div_vdp,kv.team_idx,igp,jgp); auto pi = Homme::subview(m_buffers.pi,kv.team_idx,igp,jgp); - auto omega_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp); - auto pi_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,1,igp,jgp); + auto omega_i = Homme::subview(m_buffers.w_tens,kv.team_idx,igp,jgp); + auto pi_i = Homme::subview(m_buffers.dp_i,kv.team_idx,igp,jgp); Kokkos::single(Kokkos::PerThread(kv.team),[&]() { pi_i(0)[0] = m_hvcoord.ps0*m_hvcoord.hybrid_ai0; @@ -566,11 +582,13 @@ struct CaarFunctorImpl { kv.team_barrier(); ColumnOps::column_scan_mid_to_int(kv,div_vdp,omega_i); + kv.team_barrier(); // Average omega_i to midpoints, and change sign, since later // omega=v*grad(pi)-average(omega_i) auto omega = Homme::subview(m_buffers.omega_p,kv.team_idx,igp,jgp); ColumnOps::compute_midpoint_values(kv,omega_i,omega,-1.0); }); + kv.team_barrier(); // Compute grad(pi) @@ -642,6 +660,7 @@ struct CaarFunctorImpl { // Compute interface horiz velocity auto u_i = Homme::subview(m_buffers.v_i,kv.team_idx,0,igp,jgp); auto v_i = Homme::subview(m_buffers.v_i,kv.team_idx,1,igp,jgp); + ColumnOps::compute_interface_values(kv.team,dp,dp_i,u,u_i); ColumnOps::compute_interface_values(kv.team,dp,dp_i,v,v_i); @@ -1138,7 +1157,6 @@ struct CaarFunctorImpl { ColumnOps::compute_midpoint_values(kv, w_sq, Homme::subview(m_buffers.temp,kv.team_idx,igp,jgp),0.5); }); kv.team_barrier(); - // Compute grad(average(w^2/2)). Store in wvor. m_sphere_ops.gradient_sphere(kv, Homme::subview(m_buffers.temp,kv.team_idx), wvor); @@ -1179,8 +1197,8 @@ struct CaarFunctorImpl { if (!m_theta_hydrostatic_mode) { // Compute wvor = grad(average(w^2/2)) - average(w*grad(w)) // Note: vtens is already storing grad(avg(w^2/2)) - auto gradw_x = Homme::subview(m_buffers.v_i,kv.team_idx,0,igp,jgp); - auto gradw_y = Homme::subview(m_buffers.v_i,kv.team_idx,1,igp,jgp); + auto gradw_x = Homme::subview(m_buffers.grad_w_i,kv.team_idx,0,igp,jgp); + auto gradw_y = Homme::subview(m_buffers.grad_w_i,kv.team_idx,1,igp,jgp); auto w_i = Homme::subview(m_state.m_w_i,kv.ie,m_data.n0,igp,jgp); const auto w_gradw_x = [&gradw_x,&w_i] (const int ilev) -> Scalar { diff --git a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp index dd97720f1be2..037041d10315 100644 --- a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp @@ -248,6 +248,12 @@ class EquationOfState { ColumnOps::column_scan_mid_to_int(kv,integrand_provider,phi_i); } + KOKKOS_INLINE_FUNCTION static + Scalar compute_dphi (const Scalar vtheta_dp, const Scalar exner, const Scalar p) { + return PhysicalConstants::Rgas*vtheta_dp*exner/p; + } + + public: bool m_theta_hydrostatic_mode; diff --git a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp index 7914c0a60e3a..01073c317e76 100644 --- a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp @@ -49,7 +49,7 @@ struct LimiterFunctor { struct TagDp3dLimiter {}; // Policies -#ifndef NDEBUG +#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG) template using TeamPolicyType = Kokkos::TeamPolicy,Tag>; #else From f804118e37acf77a927e41d1c8e88d6f69d8ff2a Mon Sep 17 00:00:00 2001 From: "White, Trey" Date: Tue, 4 Feb 2025 17:39:49 -0500 Subject: [PATCH 2/3] Turn on tuned code in `CaarFunctorImpl.hpp`. --- components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp index 1a9d64ddf677..e0b24202f4e8 100644 --- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp @@ -352,7 +352,7 @@ struct CaarFunctorImpl { GPTLstart("caar compute"); -#if 0 +#if 1 caar_compute(); Kokkos::fence(); From f7a82a6d246ee22c102600d84c60d48bfb75ce0c Mon Sep 17 00:00:00 2001 From: "White, Trey" Date: Mon, 24 Feb 2025 16:12:34 -0500 Subject: [PATCH 3/3] Fix typo KOKKOS__ENABLE_CUDA in HyperviscosityFunctorImpl.hpp. --- .../homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp index 4243154ca033..923d38d4a770 100644 --- a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp +++ b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp @@ -297,7 +297,7 @@ class HyperviscosityFunctorImpl SphereOperators m_sphere_ops; // Policies -#if defined(KOKKOS__ENABLE_CUDA) && !defined(NDEBUG) +#if defined(KOKKOS_ENABLE_CUDA) && !defined(NDEBUG) template using TeamPolicyType = Kokkos::TeamPolicy,Tag>; #else