From 0d7fca15289be7b9e2f3de74211ebd2b35497c1e Mon Sep 17 00:00:00 2001
From: Johan Ericsson
Date: Wed, 23 Oct 2024 10:34:58 +0200
Subject: [PATCH 1/3] New CMake target and new cloudsc_loki src files for field offload loki targets

---
Review notes (this section is ignored by git am):
 * Removed the OpenMP reduction clause in cloudsc_driver_field_loki_mod.F90
   that named POWER_TOTAL/POWER_COUNT, which are not declared in that routine.
 * Corrected stale or misspelled comments (CLAW -> Loki, "dicovery").
 * Whitespace was normalised; hunk line counts are unchanged, but the blob ids
   on the "index" lines were not regenerated.

 bundle.yml                                    |   2 +-
 src/cloudsc_loki/CMakeLists.txt               |  70 +++++++++
 .../cloudsc_driver_field_loki_mod.F90         | 140 ++++++++++++++++++
 .../cloudsc_loki_field_offload.config         | 101 +++++++++++++
 src/cloudsc_loki/dwarf_cloudsc.F90            |  18 +++
 5 files changed, 330 insertions(+), 1 deletion(-)
 create mode 100644 src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90
 create mode 100644 src/cloudsc_loki/cloudsc_loki_field_offload.config

diff --git a/bundle.yml b/bundle.yml
index d1d2b106..d8dbf529 100644
--- a/bundle.yml
+++ b/bundle.yml
@@ -36,7 +36,7 @@ projects :
 
     - loki :
         git : https://github.com/ecmwf-ifs/loki
-        version : v0.2.7
+        version : main
         require : ecbuild
         cmake : >
           LOKI_ENABLE_TESTS=OFF
diff --git a/src/cloudsc_loki/CMakeLists.txt b/src/cloudsc_loki/CMakeLists.txt
index 785773c4..a25cc7d0 100644
--- a/src/cloudsc_loki/CMakeLists.txt
+++ b/src/cloudsc_loki/CMakeLists.txt
@@ -623,6 +623,76 @@ if( HAVE_CLOUDSC_LOKI )
     )
 
 
+    ####################################################
+    ##  "Single Column Coalesced Field" (SCC-FIELD)   ##
+    ##   * Removes horizontal vector loops            ##
+    ##   * Invokes compute kernel as `!$acc vector`   ##
+    ##   * Uses Field API for device data offloading  ##
+    ####################################################
+    if ( HAVE_FIELD_API AND field_api_HAVE_ACC )
+      loki_transform(
+        COMMAND convert
+        OUTPUT
+          loki-scc-field/cloudsc.scc_field.F90
+          loki-scc-field/cloudsc_driver_field_loki_mod.scc_field.F90
+        BUILDDIR ${CMAKE_CURRENT_BINARY_DIR}/loki-scc-field
+        DEPENDS
+          cloudsc.F90
+          cloudsc_driver_field_loki_mod.F90
+          ${_OMNI_DEPENDENCIES}
+        MODE scc-field
+        CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/cloudsc_loki_field_offload.config
+        CPP
+        DEFINITIONS
+          CLOUDSC_GPU_TIMING
+          ${CLOUDSC_DEFINE_STMT_FUNC}
+        FRONTEND ${LOKI_FRONTEND}
+        HEADERS
+          ${COMMON_MODULE}/yomcst.F90
+          ${COMMON_MODULE}/yomphyder.F90
+          ${COMMON_MODULE}/yoethf.F90
+          ${COMMON_MODULE}/yoecldp.F90
+          ${COMMON_MODULE}/cloudsc_field_state_mod.F90
+          ${COMMON_MODULE}/cloudsc_flux_type_mod.F90
+          ${COMMON_MODULE}/cloudsc_aux_type_mod.F90
+          ${COMMON_MODULE}/cloudsc_state_type_mod.F90
+        SOURCES
+          ${CMAKE_CURRENT_SOURCE_DIR}
+          ${COMMON_MODULE}
+        INCLUDES
+          ${COMMON_INCLUDE}
+        XMOD
+          ${_TARGET_XMOD_DIR}
+          ${XMOD_DIR}
+      )
+
+      ecbuild_add_executable( TARGET dwarf-cloudsc-loki-scc-field
+        SOURCES
+          dwarf_cloudsc.F90
+          loki-scc-field/cloudsc.scc_field.F90
+          loki-scc-field/cloudsc_driver_field_loki_mod.scc_field.F90
+        LIBS
+          cloudsc-common-lib
+        DEFINITIONS ${CLOUDSC_DEFINITIONS} CLOUDSC_GPU_SCC_FIELD
+      )
+      # Set specific module directory to avoid aliasing of .mod files
+      set_target_properties( dwarf-cloudsc-loki-scc-field
+        PROPERTIES Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/loki-scc-field
+      )
+
+
+      ecbuild_add_test(
+        TARGET dwarf-cloudsc-loki-scc-field
+        COMMAND bin/dwarf-cloudsc-loki-scc-field
+        ARGS 1 1280 128
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
+        OMP 1
+        ENVIRONMENT "NVCOMPILER_ACC_CUDA_HEAPSIZE=128M"
+      )
+
+    endif()
+
+
     ####################################################
     ##  SCC CUF (CUDA Fortran)                        ##
     ##   * SCC with CUDA Fortran (CUF)                ##
diff --git a/src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90 b/src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90
new file mode 100644
index 00000000..ecfc680a
--- /dev/null
+++ b/src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90
@@ -0,0 +1,140 @@
+! (C) Copyright 1988- ECMWF.
+!
+! This software is licensed under the terms of the Apache Licence Version 2.0
+! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+!
+! In applying this licence, ECMWF does not waive the privileges and immunities
+! granted to it by virtue of its status as an intergovernmental organisation
+! nor does it submit to any jurisdiction.
+
+MODULE CLOUDSC_DRIVER_FIELD_LOKI_MOD
+  USE PARKIND1, ONLY: JPIM, JPIB, JPRB, JPRD
+  USE YOMPHYDER, ONLY: STATE_TYPE
+  USE YOECLDP, ONLY : NCLV, YRECLDP
+  USE CLOUDSC_MPI_MOD, ONLY: NUMPROC, IRANK
+  USE TIMER_MOD, ONLY : PERFORMANCE_TIMER, GET_THREAD_NUM
+  USE EC_PMON_MOD, ONLY: EC_PMON
+  USE CLOUDSC_FIELD_STATE_MOD, ONLY: CLOUDSC_AUX_TYPE, CLOUDSC_FLUX_TYPE, CLOUDSC_STATE_TYPE
+
+  USE CLOUDSC_MOD, ONLY : CLOUDSC
+
+  IMPLICIT NONE
+
+CONTAINS
+
+  SUBROUTINE CLOUDSC_DRIVER_FIELD( &
+     & NUMOMP, NPROMA, NLEV, NGPTOT, NGPTOTG, KFLDX, PTSPHY, PAUX, FLUX, &
+     & TENDENCY_TMP, TENDENCY_LOC)
+    ! Driver routine that invokes the CLOUDSC kernel block by block on field views (source for the Loki scc-field transformation)
+
+    USE YOECLDP  , ONLY : TECLDP
+    USE YOMCST   , ONLY : TOMCST
+    USE YOETHF   , ONLY : TOETHF
+
+    INTEGER(KIND=JPIM)        ,INTENT(IN)    :: NUMOMP, NPROMA, NLEV, NGPTOT, NGPTOTG
+    INTEGER(KIND=JPIM)        ,INTENT(IN)    :: KFLDX
+    REAL(KIND=JPRB)           ,INTENT(IN)    :: PTSPHY       ! PHYSICS TIMESTEP
+    TYPE(CLOUDSC_AUX_TYPE)    ,INTENT(IN)    :: PAUX
+    TYPE(CLOUDSC_FLUX_TYPE)   ,INTENT(IN)    :: FLUX
+    TYPE(CLOUDSC_STATE_TYPE)  ,INTENT(IN)    :: TENDENCY_TMP
+    TYPE(CLOUDSC_STATE_TYPE)  ,INTENT(INOUT) :: TENDENCY_LOC
+
+    INTEGER(KIND=JPIM) :: JKGLO,IBL,ICEND, NGPBLKS
+
+    TYPE(TECLDP) :: LOCAL_YRECLDP
+
+    TYPE(PERFORMANCE_TIMER) :: TIMER
+    INTEGER(KIND=JPIM) :: TID ! thread id from 0 .. NUMOMP - 1
+
+    NGPBLKS = (NGPTOT / NPROMA) + MIN(MOD(NGPTOT,NPROMA), 1)
+1003 format(5x,'NUMPROC=',i0,', NUMOMP=',i0,', NGPTOTG=',i0,', NPROMA=',i0,', NGPBLKS=',i0)
+    if (irank == 0) then
+      write(0,1003) NUMPROC,NUMOMP,NGPTOTG,NPROMA,NGPBLKS
+    end if
+
+    ! Global timer for the parallel region
+    CALL TIMER%START(NUMOMP)
+
+    ! Workaround for PGI / OpenACC oddities:
+    ! Create a local copy of the parameter struct to ensure they get
+    ! moved to the device in the ``!$loki data`` region below
+    LOCAL_YRECLDP = YRECLDP
+
+    !$loki data
+
+    !$omp parallel default(shared) private(JKGLO,IBL,ICEND,TID) &
+    !$omp& num_threads(NUMOMP) firstprivate(PAUX, FLUX, TENDENCY_TMP, TENDENCY_LOC)
+
+    ! Local timer for each thread
+    TID = GET_THREAD_NUM()
+    CALL TIMER%THREAD_START(TID)
+
+    !$omp do schedule(runtime)
+    DO JKGLO=1,NGPTOT,NPROMA
+      IBL=(JKGLO-1)/NPROMA+1
+      ICEND=MIN(NPROMA,NGPTOT-JKGLO+1)
+
+      CALL PAUX%UPDATE_VIEW(IBL)
+      CALL FLUX%UPDATE_VIEW(IBL)
+      CALL TENDENCY_LOC%UPDATE_VIEW(IBL)
+      CALL TENDENCY_TMP%UPDATE_VIEW(IBL)
+
+      !-- These were uninitialized : meaningful only when we compare error differences
+      PAUX%PCOVPTOT = 0.0_JPRB
+      TENDENCY_LOC%CLD(:,:,NCLV) = 0.0_JPRB
+
+
+
+      CALL CLOUDSC( 1, ICEND, NPROMA, NLEV, & ! These could also be accessed through FIELD_STATE
+        & PTSPHY,&
+        & PAUX%PT, PAUX%PQ, &
+        & TENDENCY_TMP%T, TENDENCY_TMP%Q, TENDENCY_TMP%A, TENDENCY_TMP%CLD, &
+        & TENDENCY_LOC%T, TENDENCY_LOC%Q, TENDENCY_LOC%A, TENDENCY_LOC%CLD, &
+        & PAUX%PVFA, PAUX%PVFL, PAUX%PVFI, PAUX%PDYNA, PAUX%PDYNL, PAUX%PDYNI, &
+        & PAUX%PHRSW, PAUX%PHRLW,&
+        & PAUX%PVERVEL, PAUX%PAP, PAUX%PAPH,&
+        & PAUX%PLSM, PAUX%LDCUM, PAUX%KTYPE, &
+        & PAUX%PLU, PAUX%PLUDE, PAUX%PSNDE, PAUX%PMFU, PAUX%PMFD,&
+        !---prognostic fields
+        & PAUX%PA,&
+        & PAUX%PCLV, &
+        & PAUX%PSUPSAT,&
+!              -- arrays for aerosol-cloud interactions
+!              !! & PQAER, KAER, &
+        & PAUX%PLCRIT_AER,PAUX%PICRIT_AER,&
+        & PAUX%PRE_ICE,&
+        & PAUX%PCCN, PAUX%PNICE,&
+        !---diagnostic output
+        & PAUX%PCOVPTOT, PAUX%PRAINFRAC_TOPRFZ,&
+        !---resulting fluxes
+        & FLUX%PFSQLF, FLUX%PFSQIF , FLUX%PFCQNNG, FLUX%PFCQLNG,&
+        & FLUX%PFSQRF, FLUX%PFSQSF , FLUX%PFCQRNG, FLUX%PFCQSNG,&
+        & FLUX%PFSQLTUR, FLUX%PFSQITUR , &
+        & FLUX%PFPLSL, FLUX%PFPLSN, FLUX%PFHPSL, FLUX%PFHPSN, &
+        & LOCAL_YRECLDP)
+
+#ifndef CLOUDSC_GPU_TIMING
+      ! Log number of columns processed by this thread
+      CALL TIMER%THREAD_LOG(TID, IGPC=ICEND)
+#endif
+    ENDDO
+
+    !-- The "nowait" is here to get correct local timings (tloc) per thread
+    !   i.e. we should not wait for slowest thread to finish before measuring tloc
+    !$omp end do nowait
+
+    CALL TIMER%THREAD_END(TID)
+
+    !$omp end parallel
+
+    !$loki end data
+
+    CALL TIMER%END()
+
+    CALL TIMER%PRINT_PERFORMANCE(NPROMA, NGPBLKS, NGPTOT)
+
+
+  END SUBROUTINE CLOUDSC_DRIVER_FIELD
+
+END MODULE CLOUDSC_DRIVER_FIELD_LOKI_MOD
+
diff --git a/src/cloudsc_loki/cloudsc_loki_field_offload.config b/src/cloudsc_loki/cloudsc_loki_field_offload.config
new file mode 100644
index 00000000..b262f5f2
--- /dev/null
+++ b/src/cloudsc_loki/cloudsc_loki_field_offload.config
@@ -0,0 +1,101 @@
+[default]
+# Specifies the behaviour of auto-expanded routines
+role = 'kernel'
+expand = true          # Automatically expand subroutine calls
+strict = true          # Throw exceptions during discovery
+enable_imports = true  # Chase dependencies incurred via imports
+
+# disable - not parsed and not transformed
+# block - contained as nodes but never parsed/transformed
+# ignore - are parsed but not transformed
+
+disable = ['abor1', 'timer_mod', 'abort', 'file_io_mod', 'foe*', 'fokoop', 'get_environment_variable', '*%update_view', 'cloudsc_mpi_reduce_min', 'cloudsc_mpi_reduce_max','cloudsc_mpi_reduce_sum', 'EC_PMON', 'expand_l1', 'expand_i1', 'expand_r1', 'expand_r2', 'expand_r3', 'load_and_expand_l1', 'load_and_expand_i1', 'load_and_expand_r1', 'load_and_expand_r2', 'load_and_expand_r3', 'VALIDATE_L1', 'VALIDATE_I1', 'VALIDATE_R1', 'VALIDATE_R2', 'VALIDATE_R3', 'get_offsets', 'ERROR_PRINT', '*get_device_data_rdonly', '*get_device_data_rdwr', '*sync_host_rdwr']
+
+ignore = ['parkind1', 'yomphyder', 'yoecldp', 'fc*_mod']
+
+
+# Define entry point for call-tree transformation
+[routines.cloudsc_driver_field]
+  role = 'driver'
+  expand = true
+
+
+# Define indices and bounds for array dimensions
+[dimensions.horizontal]
+  size = 'KLON'
+  index = 'JL'
+  bounds = ['KIDIA', 'KFDIA']
+  aliases = ['NPROMA', 'KDIM%KLON']
+
+[dimensions.vertical]
+  size = 'KLEV'
+  index = 'JK'
+
+[dimensions.block_dim]
+  size = 'NGPBLKS'
+  index = 'IBL'
+
+
+# Overwrite frontend for header modules that cannot be parsed via OMNI
+[frontend_args]
+
+[frontend_args."yomphyder.F90"]
+frontend = 'FP'
+
+[frontend_args."yomcst.F90"]
+frontend = 'FP'
+
+[frontend_args."yoethf.F90"]
+frontend = 'FP'
+
+[frontend_args."yoecldp.F90"]
+frontend = 'FP'
+
+
+[transformations.Idem]
+  classname = 'IdemTransformation'
+  module = 'loki.transformations'
+
+
+# Loki-SCC
+# -----------------------------------------
+[transformations.SCCVector]
+  classname = 'SCCVectorPipeline'
+  module = 'loki.transformations.single_column'
+[transformations.SCCVector.options]
+  horizontal = '%dimensions.horizontal%'
+  block_dim = '%dimensions.block_dim%'
+  directive = 'openacc'
+
+# Housekeeping and other transformations
+# -----------------------------------------
+[transformations.FieldOffload]
+  classname = 'FieldOffloadTransformation'
+  module = "loki.transformations"
+  options = { field_group_types = ['CLOUDSC_STATE_TYPE', 'CLOUDSC_AUX_TYPE', 'CLOUDSC_FLUX_TYPE'] }
+
+[transformations.DataOffload]
+  classname = 'DataOffloadTransformation'
+  module = 'loki.transformations'
+  options = { remove_openmp = true, claw_data_offload = false, assume_deviceptr = false, present_on_device = true }
+
+[transformations.ModuleWrap]
+  classname = 'ModuleWrapTransformation'
+  module = 'loki.transformations.build_system'
+  options = { module_suffix = '_MOD' }
+
+[transformations.Dependency]
+  classname = 'DependencyTransformation'
+  module = 'loki.transformations.build_system'
+  options = { suffix = '_LOKI', module_suffix = '_MOD' }
+
+
+# Full transformation pipelines
+# -----------------------------------------
+[pipelines]
+
+[pipelines.idem]
+  transformations = ['Idem', 'ModuleWrap', 'Dependency']
+
+[pipelines.scc-field]
+  transformations = ['FieldOffload', 'DataOffload', 'SCCVector', 'ModuleWrap', 'Dependency']
diff --git a/src/cloudsc_loki/dwarf_cloudsc.F90 b/src/cloudsc_loki/dwarf_cloudsc.F90
index 67857a78..55711d75 100644
--- a/src/cloudsc_loki/dwarf_cloudsc.F90
+++ b/src/cloudsc_loki/dwarf_cloudsc.F90
@@ -14,9 +14,14 @@ PROGRAM DWARF_CLOUDSC
 USE CLOUDSC_GLOBAL_STATE_MOD, ONLY: CLOUDSC_GLOBAL_STATE
 #if CLOUDSC_GPU_SCC_CUF
 USE CUF_CLOUDSC_DRIVER_LOKI_MOD, ONLY: CUF_CLOUDSC_DRIVER
+#elif defined(CLOUDSC_GPU_SCC_FIELD)
+USE CLOUDSC_FIELD_STATE_MOD, ONLY: CLOUDSC_FIELD_STATE
+USE CLOUDSC_DRIVER_FIELD_LOKI_MOD, ONLY: CLOUDSC_DRIVER_FIELD
 #else
 USE CLOUDSC_DRIVER_LOKI_MOD, ONLY: CLOUDSC_DRIVER
 #endif
+
+
 USE EC_PMON_MOD, ONLY: EC_PMON
 
 #ifdef _OPENMP
@@ -33,7 +38,11 @@ PROGRAM DWARF_CLOUDSC
 INTEGER(KIND=JPIM) :: NPROMA = 16384 ! NPROMA blocking factor (currently active)
 INTEGER(KIND=JPIM) :: NGPTOT ! Local number of grid points
 
+#ifdef CLOUDSC_GPU_SCC_FIELD
+TYPE(CLOUDSC_FIELD_STATE) :: GLOBAL_STATE
+#else
 TYPE(CLOUDSC_GLOBAL_STATE) :: GLOBAL_STATE
+#endif
 
 INTEGER(KIND=JPIB) :: ENERGY, POWER
 CHARACTER(LEN=1) :: CLEC_PMON
@@ -107,6 +116,12 @@ PROGRAM DWARF_CLOUDSC
      & GLOBAL_STATE%PFSQLTUR, GLOBAL_STATE%PFSQITUR, &
      & GLOBAL_STATE%PFPLSL, GLOBAL_STATE%PFPLSN, GLOBAL_STATE%PFHPSL, GLOBAL_STATE%PFHPSN &
      & )
+#elif defined(CLOUDSC_GPU_SCC_FIELD)
+CALL CLOUDSC_DRIVER_FIELD( &
+     & NUMOMP, NPROMA, GLOBAL_STATE%KLEV, NGPTOT, NGPTOTG, &
+     & GLOBAL_STATE%KFLDX, GLOBAL_STATE%PTSPHY, &
+     & GLOBAL_STATE%AUX, GLOBAL_STATE%FLUX, &
+     & GLOBAL_STATE%TENDENCY_TMP, GLOBAL_STATE%TENDENCY_LOC)
 #else
 CALL CLOUDSC_DRIVER(NUMOMP, NPROMA, GLOBAL_STATE%KLEV, NGPTOT, NGPTOTG, GLOBAL_STATE%NBLOCKS, &
      & GLOBAL_STATE%KFLDX, GLOBAL_STATE%PTSPHY, &
@@ -133,6 +148,9 @@ PROGRAM DWARF_CLOUDSC
 
 ! Validate the output against serialized reference data
 CALL GLOBAL_STATE%VALIDATE(NPROMA, NGPTOT, NGPTOTG)
+#ifdef CLOUDSC_GPU_SCC_FIELD
+CALL GLOBAL_STATE%FINALIZE()
+#endif
 
 ! Tear down MPI environment
 CALL CLOUDSC_MPI_END()

From 3786554431d212a78dd02e8f484625cc6d84962d Mon Sep 17 00:00:00 2001
From: Johan Ericsson
Date: Tue, 26 Nov 2024 08:27:51 +0100
Subject: [PATCH 2/3] bumped loki version to v0.2.9

---
 bundle.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bundle.yml b/bundle.yml
index d8dbf529..09bcbdc9 100644
--- a/bundle.yml
+++ b/bundle.yml
@@ -36,7 +36,7 @@ projects :
 
     - loki :
         git : https://github.com/ecmwf-ifs/loki
-        version : main
+        version : v0.2.9
         require : ecbuild
         cmake : >
           LOKI_ENABLE_TESTS=OFF

From b2815a3d62479c2811c95f198be73965d9a463f3 Mon Sep 17 00:00:00 2001
From: Johan Ericsson <33478726+wertysas@users.noreply.github.com>
Date: Thu, 19 Dec 2024 10:06:16 +0100
Subject: [PATCH 3/3] added missing GPU timer logging to Loki field driver

Co-authored-by: Balthasar Reuter <6384870+reuterbal@users.noreply.github.com>
---
 src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90 | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90 b/src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90
index ecfc680a..44f4cdc4 100644
--- a/src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90
+++ b/src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90
@@ -131,6 +131,12 @@ SUBROUTINE CLOUDSC_DRIVER_FIELD( &
 
     CALL TIMER%END()
 
+#ifdef CLOUDSC_GPU_TIMING
+    ! On GPUs, adding block-level column totals is cumbersome and
+    ! error prone, and of little value due to the large number of
+    ! processing "thread teams". Instead we register the total here.
+    CALL TIMER % THREAD_LOG(TID=TID, IGPC=NGPTOT)
+#endif
     CALL TIMER%PRINT_PERFORMANCE(NPROMA, NGPBLKS, NGPTOT)
 
 