Skip to content

Commit

Permalink
Merge pull request #100 from ecmwf-ifs/je-field-api-offload-v2
Browse files Browse the repository at this point in the history
New Field API Loki variant with state types in driver
  • Loading branch information
reuterbal authored Dec 19, 2024
2 parents 82fdf4b + b2815a3 commit 751a953
Show file tree
Hide file tree
Showing 5 changed files with 336 additions and 1 deletion.
2 changes: 1 addition & 1 deletion bundle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ projects :
- loki :
git : https://github.com/ecmwf-ifs/loki
version : v0.2.7
version : v0.2.9
require : ecbuild
cmake : >
LOKI_ENABLE_TESTS=OFF
Expand Down
70 changes: 70 additions & 0 deletions src/cloudsc_loki/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,76 @@ if( HAVE_CLOUDSC_LOKI )
)


####################################################
## "Single Column Coalesced Field" (SCC-FIELD) ##
## * Removes horizontal vector loops ##
## * Invokes compute kernel as `!$acc vector` ##
## * Uses Field API for device data offloading ##
####################################################
# The SCC-FIELD variant is only configured when both the Field API feature
# (HAVE_FIELD_API) and its OpenACC support flag (field_api_HAVE_ACC) are set.
if ( HAVE_FIELD_API AND field_api_HAVE_ACC )
# Source-to-source step: runs Loki's `convert` command in `scc-field` mode on
# the kernel (cloudsc.F90) and the Field API driver, using the pipeline
# configured in cloudsc_loki_field_offload.config. The two generated files are
# written below BUILDDIR and are listed in OUTPUT so the executable can
# consume them as sources.
loki_transform(
COMMAND convert
OUTPUT
loki-scc-field/cloudsc.scc_field.F90
loki-scc-field/cloudsc_driver_field_loki_mod.scc_field.F90
BUILDDIR ${CMAKE_CURRENT_BINARY_DIR}/loki-scc-field
DEPENDS
cloudsc.F90
cloudsc_driver_field_loki_mod.F90
${_OMNI_DEPENDENCIES}
MODE scc-field
CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/cloudsc_loki_field_offload.config
CPP
DEFINITIONS
CLOUDSC_GPU_TIMING
${CLOUDSC_DEFINE_STMT_FUNC}
FRONTEND ${LOKI_FRONTEND}
HEADERS
${COMMON_MODULE}/yomcst.F90
${COMMON_MODULE}/yomphyder.F90
${COMMON_MODULE}/yoethf.F90
${COMMON_MODULE}/yoecldp.F90
${COMMON_MODULE}/cloudsc_field_state_mod.F90
${COMMON_MODULE}/cloudsc_flux_type_mod.F90
${COMMON_MODULE}/cloudsc_aux_type_mod.F90
${COMMON_MODULE}/cloudsc_state_type_mod.F90
SOURCES
${CMAKE_CURRENT_SOURCE_DIR}
${COMMON_MODULE}
INCLUDES
${COMMON_INCLUDE}
XMOD
${_TARGET_XMOD_DIR}
${XMOD_DIR}
)

# Build the dwarf executable from the hand-written main program plus the two
# Loki-generated sources. CLOUDSC_GPU_SCC_FIELD is the preprocessor symbol that
# dwarf_cloudsc.F90 tests to select the Field API driver and state type.
ecbuild_add_executable( TARGET dwarf-cloudsc-loki-scc-field
SOURCES
dwarf_cloudsc.F90
loki-scc-field/cloudsc.scc_field.F90
loki-scc-field/cloudsc_driver_field_loki_mod.scc_field.F90
LIBS
cloudsc-common-lib
DEFINITIONS ${CLOUDSC_DEFINITIONS} CLOUDSC_GPU_SCC_FIELD
)
# Set specific module directory to avoid aliasing of .mod files
set_target_properties( dwarf-cloudsc-loki-scc-field
PROPERTIES Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/loki-scc-field
)


# Register a test that runs the executable from three directories above this
# binary dir (the `bin/` path in COMMAND is relative to WORKING_DIRECTORY).
# NOTE(review): ARGS are presumably NUMOMP, NGPTOT and NPROMA - confirm against
# the command-line parsing in dwarf_cloudsc.F90.
# NOTE(review): NVCOMPILER_ACC_CUDA_HEAPSIZE looks like an NVHPC runtime
# setting for the device heap size - confirm against the compiler documentation.
ecbuild_add_test(
TARGET dwarf-cloudsc-loki-scc-field
COMMAND bin/dwarf-cloudsc-loki-scc-field
ARGS 1 1280 128
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
OMP 1
ENVIRONMENT "NVCOMPILER_ACC_CUDA_HEAPSIZE=128M"
)

endif()


####################################################
## SCC CUF (CUDA Fortran) ##
## * SCC with CUDA Fortran (CUF) ##
Expand Down
146 changes: 146 additions & 0 deletions src/cloudsc_loki/cloudsc_driver_field_loki_mod.F90
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
! (C) Copyright 1988- ECMWF.
!
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
!
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.

MODULE CLOUDSC_DRIVER_FIELD_LOKI_MOD
  ! Driver module for the Field API variant of the CLOUDSC dwarf.
  !
  ! Provides CLOUDSC_DRIVER_FIELD, which iterates over NPROMA-sized blocks,
  ! updates the per-block views of the field-group types and calls the
  ! CLOUDSC kernel on them. The ``!$loki`` directives mark regions for the
  ! Loki source-to-source transformation.
  USE PARKIND1, ONLY: JPIM, JPIB, JPRB, JPRD
  USE YOMPHYDER, ONLY: STATE_TYPE
  USE YOECLDP, ONLY : NCLV, YRECLDP
  USE CLOUDSC_MPI_MOD, ONLY: NUMPROC, IRANK
  USE TIMER_MOD, ONLY : PERFORMANCE_TIMER, GET_THREAD_NUM
  USE EC_PMON_MOD, ONLY: EC_PMON
  USE CLOUDSC_FIELD_STATE_MOD, ONLY: CLOUDSC_AUX_TYPE, CLOUDSC_FLUX_TYPE, CLOUDSC_STATE_TYPE

  USE CLOUDSC_MOD, ONLY : CLOUDSC

  IMPLICIT NONE

CONTAINS

  SUBROUTINE CLOUDSC_DRIVER_FIELD( &
    & NUMOMP, NPROMA, NLEV, NGPTOT, NGPTOTG, KFLDX, PTSPHY, PAUX, FLUX, &
    & TENDENCY_TMP, TENDENCY_LOC)
    ! Driver routine that invokes the CLOUDSC kernel block by block on
    ! views of Field API based field groups.
    !
    ! Arguments:
    !   NUMOMP       - number of OpenMP threads (timer setup and num_threads)
    !   NPROMA       - block size; stride of the block loop
    !   NLEV         - number of vertical levels, forwarded to CLOUDSC
    !   NGPTOT       - local number of grid points; upper bound of the block loop
    !   NGPTOTG      - global number of grid points; only used in the log line
    !   KFLDX        - not referenced in this routine
    !   PTSPHY       - physics timestep, forwarded to CLOUDSC
    !   PAUX         - auxiliary field group; views are passed to CLOUDSC
    !   FLUX         - flux field group; views are passed to CLOUDSC
    !   TENDENCY_TMP - tendency field group passed to CLOUDSC
    !   TENDENCY_LOC - tendency field group passed to CLOUDSC (INTENT(INOUT))

    USE YOECLDP  , ONLY : TECLDP
    USE YOMCST   , ONLY : TOMCST
    USE YOETHF   , ONLY : TOETHF

    INTEGER(KIND=JPIM)       ,INTENT(IN)    :: NUMOMP, NPROMA, NLEV, NGPTOT, NGPTOTG
    INTEGER(KIND=JPIM)       ,INTENT(IN)    :: KFLDX
    REAL(KIND=JPRB)          ,INTENT(IN)    :: PTSPHY ! PHYSICS TIMESTEP
    TYPE(CLOUDSC_AUX_TYPE)   ,INTENT(IN)    :: PAUX
    TYPE(CLOUDSC_FLUX_TYPE)  ,INTENT(IN)    :: FLUX
    TYPE(CLOUDSC_STATE_TYPE) ,INTENT(IN)    :: TENDENCY_TMP
    TYPE(CLOUDSC_STATE_TYPE) ,INTENT(INOUT) :: TENDENCY_LOC

    INTEGER(KIND=JPIM) :: JKGLO,IBL,ICEND, NGPBLKS

    TYPE(TECLDP) :: LOCAL_YRECLDP

    TYPE(PERFORMANCE_TIMER) :: TIMER
    INTEGER(KIND=JPIM) :: TID ! thread id from 0 .. NUMOMP - 1

    ! Number of blocks: NGPTOT / NPROMA rounded up (one extra block for a remainder)
    NGPBLKS = (NGPTOT / NPROMA) + MIN(MOD(NGPTOT,NPROMA), 1)
1003 format(5x,'NUMPROC=',i0,', NUMOMP=',i0,', NGPTOTG=',i0,', NPROMA=',i0,', NGPBLKS=',i0)
    if (irank == 0) then
      write(0,1003) NUMPROC,NUMOMP,NGPTOTG,NPROMA,NGPBLKS
    end if

    ! Global timer for the parallel region
    CALL TIMER%START(NUMOMP)

    ! Workaround for PGI / OpenACC oddities:
    ! Create a local copy of the parameter struct to ensure it gets
    ! moved to the device in the data region below
    LOCAL_YRECLDP = YRECLDP

    !$loki data

    !$omp parallel default(shared) private(JKGLO,IBL,ICEND,TID) &
    !$omp& num_threads(NUMOMP) firstprivate(PAUX, FLUX, TENDENCY_TMP, TENDENCY_LOC)

    ! Local timer for each thread
    TID = GET_THREAD_NUM()
    CALL TIMER%THREAD_START(TID)

    ! No reduction clause here: this routine declares and accumulates no
    ! power/energy counters, so there is nothing to reduce over.
    !$omp do schedule(runtime)
    DO JKGLO=1,NGPTOT,NPROMA
      ! Block index and number of active columns in this (possibly partial) block
      IBL=(JKGLO-1)/NPROMA+1
      ICEND=MIN(NPROMA,NGPTOT-JKGLO+1)

      ! Point the field-group views at the current block
      CALL PAUX%UPDATE_VIEW(IBL)
      CALL FLUX%UPDATE_VIEW(IBL)
      CALL TENDENCY_LOC%UPDATE_VIEW(IBL)
      CALL TENDENCY_TMP%UPDATE_VIEW(IBL)

      !-- These were uninitialized : meaningful only when we compare error differences
      PAUX%PCOVPTOT = 0.0_JPRB
      TENDENCY_LOC%CLD(:,:,NCLV) = 0.0_JPRB



      CALL CLOUDSC( 1, ICEND, NPROMA, NLEV, & ! These could also be accessed through FIELD_STATE
        & PTSPHY,&
        & PAUX%PT, PAUX%PQ, &
        & TENDENCY_TMP%T, TENDENCY_TMP%Q, TENDENCY_TMP%A, TENDENCY_TMP%CLD, &
        & TENDENCY_LOC%T, TENDENCY_LOC%Q, TENDENCY_LOC%A, TENDENCY_LOC%CLD, &
        & PAUX%PVFA, PAUX%PVFL, PAUX%PVFI, PAUX%PDYNA, PAUX%PDYNL, PAUX%PDYNI, &
        & PAUX%PHRSW, PAUX%PHRLW,&
        & PAUX%PVERVEL, PAUX%PAP, PAUX%PAPH,&
        & PAUX%PLSM, PAUX%LDCUM, PAUX%KTYPE, &
        & PAUX%PLU, PAUX%PLUDE, PAUX%PSNDE, PAUX%PMFU, PAUX%PMFD,&
        !---prognostic fields
        & PAUX%PA,&
        & PAUX%PCLV, &
        & PAUX%PSUPSAT,&
        !              -- arrays for aerosol-cloud interactions
        !              !! & PQAER, KAER, &
        & PAUX%PLCRIT_AER,PAUX%PICRIT_AER,&
        & PAUX%PRE_ICE,&
        & PAUX%PCCN, PAUX%PNICE,&
        !---diagnostic output
        & PAUX%PCOVPTOT, PAUX%PRAINFRAC_TOPRFZ,&
        !---resulting fluxes
        & FLUX%PFSQLF, FLUX%PFSQIF , FLUX%PFCQNNG, FLUX%PFCQLNG,&
        & FLUX%PFSQRF, FLUX%PFSQSF , FLUX%PFCQRNG, FLUX%PFCQSNG,&
        & FLUX%PFSQLTUR, FLUX%PFSQITUR , &
        & FLUX%PFPLSL, FLUX%PFPLSN, FLUX%PFHPSL, FLUX%PFHPSN, &
        & LOCAL_YRECLDP)

#ifndef CLOUDSC_GPU_TIMING
      ! Log number of columns processed by this thread
      CALL TIMER%THREAD_LOG(TID, IGPC=ICEND)
#endif
    ENDDO

    !-- The "nowait" is here to get correct local timings (tloc) per thread
    !   i.e. we should not wait for slowest thread to finish before measuring tloc
    !$omp end do nowait

    CALL TIMER%THREAD_END(TID)

    !$omp end parallel

    !$loki end data

    CALL TIMER%END()

#ifdef CLOUDSC_GPU_TIMING
    ! On GPUs, adding block-level column totals is cumbersome and
    ! error prone, and of little value due to the large number of
    ! processing "thread teams". Instead we register the total here.
    CALL TIMER % THREAD_LOG(TID=TID, IGPC=NGPTOT)
#endif
    CALL TIMER%PRINT_PERFORMANCE(NPROMA, NGPBLKS, NGPTOT)


  END SUBROUTINE CLOUDSC_DRIVER_FIELD

END MODULE CLOUDSC_DRIVER_FIELD_LOKI_MOD

101 changes: 101 additions & 0 deletions src/cloudsc_loki/cloudsc_loki_field_offload.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
[default]
# Specifies the behaviour of auto-expanded routines
role = 'kernel'
expand = true # Automatically expand subroutine calls
strict = true # Throw exceptions during discovery
enable_imports = true # Chase dependencies incurred via imports

# disable - not parsed and not transformed
# block - contained as nodes but never parsed/transformed
# ignore - are parsed but not transformed

disable = ['abor1', 'timer_mod', 'abort', 'file_io_mod', 'foe*', 'fokoop', 'get_environment_variable', '*%update_view', 'cloudsc_mpi_reduce_min', 'cloudsc_mpi_reduce_max','cloudsc_mpi_reduce_sum', 'EC_PMON', 'expand_l1', 'expand_i1', 'expand_r1', 'expand_r2', 'expand_r3', 'load_and_expand_l1', 'load_and_expand_i1', 'load_and_expand_r1', 'load_and_expand_r2', 'load_and_expand_r3', 'VALIDATE_L1', 'VALIDATE_I1', 'VALIDATE_R1', 'VALIDATE_R2', 'VALIDATE_R3', 'get_offsets', 'ERROR_PRINT', '*get_device_data_rdonly', '*get_device_data_rdwr', '*sync_host_rdwr']

ignore = ['parkind1', 'yomphyder', 'yoecldp', 'fc*_mod']


# Define entry point for call-tree transformation
[routines.cloudsc_driver_field]
role = 'driver'
expand = true


# Define indices and bounds for array dimensions
[dimensions.horizontal]
size = 'KLON'
index = 'JL'
bounds = ['KIDIA', 'KFDIA']
aliases = ['NPROMA', 'KDIM%KLON']

[dimensions.vertical]
size = 'KLEV'
index = 'JK'

[dimensions.block_dim]
size = 'NGPBLKS'
index = 'IBL'


# Overwrite frontend for header modules that cannot be parsed via OMNI
[frontend_args]

[frontend_args."yomphyder.F90"]
frontend = 'FP'

[frontend_args."yomcst.F90"]
frontend = 'FP'

[frontend_args."yoethf.F90"]
frontend = 'FP'

[frontend_args."yoecldp.F90"]
frontend = 'FP'


[transformations.Idem]
classname = 'IdemTransformation'
module = 'loki.transformations'


# Loki-SCC
# -----------------------------------------
[transformations.SCCVector]
classname = 'SCCVectorPipeline'
module = 'loki.transformations.single_column'
[transformations.SCCVector.options]
horizontal = '%dimensions.horizontal%'
block_dim = '%dimensions.block_dim%'
directive = 'openacc'

# Housekeeping and other transformations
# -----------------------------------------
[transformations.FieldOffload]
classname = 'FieldOffloadTransformation'
module = "loki.transformations"
options = { field_group_types = ['CLOUDSC_STATE_TYPE', 'CLOUDSC_AUX_TYPE', 'CLOUDSC_FLUX_TYPE'] }

[transformations.DataOffload]
classname = 'DataOffloadTransformation'
module = 'loki.transformations'
options = { remove_openmp = true, claw_data_offload = false, assume_deviceptr = false, present_on_device = true }

[transformations.ModuleWrap]
classname = 'ModuleWrapTransformation'
module = 'loki.transformations.build_system'
options = { module_suffix = '_MOD' }

[transformations.Dependency]
classname = 'DependencyTransformation'
module = 'loki.transformations.build_system'
options = { suffix = '_LOKI', module_suffix = '_MOD' }


# Full transformation pipelines
# -----------------------------------------
[pipelines]

[pipelines.idem]
transformations = ['Idem', 'ModuleWrap', 'Dependency']

[pipelines.scc-field]
transformations = ['FieldOffload', 'DataOffload', 'SCCVector', 'ModuleWrap', 'Dependency']
18 changes: 18 additions & 0 deletions src/cloudsc_loki/dwarf_cloudsc.F90
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@ PROGRAM DWARF_CLOUDSC
USE CLOUDSC_GLOBAL_STATE_MOD, ONLY: CLOUDSC_GLOBAL_STATE
#if CLOUDSC_GPU_SCC_CUF
USE CUF_CLOUDSC_DRIVER_LOKI_MOD, ONLY: CUF_CLOUDSC_DRIVER
#elif defined(CLOUDSC_GPU_SCC_FIELD)
USE CLOUDSC_FIELD_STATE_MOD, ONLY: CLOUDSC_FIELD_STATE
USE CLOUDSC_DRIVER_FIELD_LOKI_MOD, ONLY: CLOUDSC_DRIVER_FIELD
#else
USE CLOUDSC_DRIVER_LOKI_MOD, ONLY: CLOUDSC_DRIVER
#endif


USE EC_PMON_MOD, ONLY: EC_PMON

#ifdef _OPENMP
Expand All @@ -33,7 +38,11 @@ PROGRAM DWARF_CLOUDSC
INTEGER(KIND=JPIM) :: NPROMA = 16384 ! NPROMA blocking factor (currently active)
INTEGER(KIND=JPIM) :: NGPTOT ! Local number of grid points

#ifdef CLOUDSC_GPU_SCC_FIELD
TYPE(CLOUDSC_FIELD_STATE) :: GLOBAL_STATE
#else
TYPE(CLOUDSC_GLOBAL_STATE) :: GLOBAL_STATE
#endif

INTEGER(KIND=JPIB) :: ENERGY, POWER
CHARACTER(LEN=1) :: CLEC_PMON
Expand Down Expand Up @@ -107,6 +116,12 @@ PROGRAM DWARF_CLOUDSC
& GLOBAL_STATE%PFSQLTUR, GLOBAL_STATE%PFSQITUR, &
& GLOBAL_STATE%PFPLSL, GLOBAL_STATE%PFPLSN, GLOBAL_STATE%PFHPSL, GLOBAL_STATE%PFHPSN &
& )
#elif defined(CLOUDSC_GPU_SCC_FIELD)
CALL CLOUDSC_DRIVER_FIELD( &
& NUMOMP, NPROMA, GLOBAL_STATE%KLEV, NGPTOT, NGPTOTG, &
& GLOBAL_STATE%KFLDX, GLOBAL_STATE%PTSPHY, &
& GLOBAL_STATE%AUX, GLOBAL_STATE%FLUX, &
& GLOBAL_STATE%TENDENCY_TMP, GLOBAL_STATE%TENDENCY_LOC)
#else
CALL CLOUDSC_DRIVER(NUMOMP, NPROMA, GLOBAL_STATE%KLEV, NGPTOT, NGPTOTG, GLOBAL_STATE%NBLOCKS, &
& GLOBAL_STATE%KFLDX, GLOBAL_STATE%PTSPHY, &
Expand All @@ -133,6 +148,9 @@ PROGRAM DWARF_CLOUDSC

! Validate the output against serialized reference data
CALL GLOBAL_STATE%VALIDATE(NPROMA, NGPTOT, NGPTOTG)
#ifdef CLOUDSC_GPU_SCC_FIELD
CALL GLOBAL_STATE%FINALIZE()
#endif

! Tear down MPI environment
CALL CLOUDSC_MPI_END()
Expand Down

0 comments on commit 751a953

Please sign in to comment.