From 8fc9a8d76e5848d46a2bf25ea6d6fd91cd1846d4 Mon Sep 17 00:00:00 2001
From: Natalie Perlin <68030316+natalie-perlin@users.noreply.github.com>
Date: Thu, 29 Feb 2024 13:02:30 -0500
Subject: [PATCH] [develop] Update for Gaea-c5 (#1047)

* Enable SRW to run on Gaea-c5, use spack-stack v1.5.0, and use the
  SRW-built conda environment.

* Update the code to rename the "gaea-c5" platform to "gaea". The name used
  by Jenkins still needs to remain "gaeac5" at the moment. The libstdc++.so.6
  library conflict was resolved by preloading a specific library at runtime,
  as specified in ./modulefiles/wflow_gaea.lua and
  ./modulefiles/tasks/gaea/python_srw.lua:
  setenv("LD_PRELOAD", "/opt/cray/pe/gcc/12.2.0/snos/lib64/libstdc++.so.6")

---------

Co-authored-by: Natalie Perlin
Co-authored-by: michael.lueken
---
 .cicd/Jenkinsfile | 13 ++---
 .cicd/scripts/srw_ftest.sh | 7 +--
 .cicd/scripts/wrapper_srw_ftest.sh | 2 +-
 devbuild.sh | 7 +--
 .../CustomizingTheWorkflow/ConfigWorkflow.rst | 2 +-
 etc/lmod-setup.csh | 2 +-
 etc/lmod-setup.sh | 2 +-
 ...gaea-c5_intel.lua => build_gaea_intel.lua} | 4 +-
 .../tasks/gaea-c5/plot_allvars.local.lua | 6 --
 modulefiles/tasks/gaea-c5/python_srw.lua | 8 ---
 modulefiles/tasks/gaea/plot_allvars.local.lua | 4 ++
 modulefiles/tasks/gaea/python_srw.lua | 7 +++
 .../tasks/{gaea-c5 => gaea}/run_vx.local.lua | 0
 .../{wflow_gaea-c5.lua => wflow_gaea.lua} | 7 +--
 ...mprehensive.gaea-c5 => comprehensive.gaea} | 0
 .../{coverage.gaea-c5 => coverage.gaea} | 0
 tests/WE2E/setup_WE2E_tests.sh | 2 +-
 tests/build.sh | 2 +-
 ush/load_modules_wflow.sh | 7 +--
 ush/machine/gaea-c5.yaml | 55 -------------------
 ush/machine/gaea.yaml | 55 +++++++++++++++++++
 ush/valid_param_vals.yaml | 2 +-
 ush/wrappers/job_cards/sbatch/get_ics.sbatch | 2 +-
 ush/wrappers/job_cards/sbatch/get_lbcs.sbatch | 2 +-
 .../job_cards/sbatch/make_grid.sbatch | 2 +-
 ush/wrappers/job_cards/sbatch/make_ics.sbatch | 2 +-
 .../job_cards/sbatch/make_lbcs.sbatch | 2 +-
 .../job_cards/sbatch/make_orog.sbatch | 2 +-
 .../job_cards/sbatch/make_sfc_climo.sbatch | 2 +-
 ush/wrappers/job_cards/sbatch/run_fcst.sbatch | 2 +-
 ush/wrappers/job_cards/sbatch/run_post.sbatch | 2 +-
 31 files changed, 95 insertions(+), 117 deletions(-)
 rename modulefiles/{build_gaea-c5_intel.lua => build_gaea_intel.lua} (91%)
 delete mode 100644 modulefiles/tasks/gaea-c5/plot_allvars.local.lua
 delete mode 100644 modulefiles/tasks/gaea-c5/python_srw.lua
 create mode 100644 modulefiles/tasks/gaea/plot_allvars.local.lua
 create mode 100644 modulefiles/tasks/gaea/python_srw.lua
 rename modulefiles/tasks/{gaea-c5 => gaea}/run_vx.local.lua (100%)
 rename modulefiles/{wflow_gaea-c5.lua => wflow_gaea.lua} (68%)
 rename tests/WE2E/machine_suites/{comprehensive.gaea-c5 => comprehensive.gaea} (100%)
 rename tests/WE2E/machine_suites/{coverage.gaea-c5 => coverage.gaea} (100%)
 delete mode 100644 ush/machine/gaea-c5.yaml
 create mode 100644 ush/machine/gaea.yaml
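Note: the LD_PRELOAD workaround described in the commit message can be sanity-checked outside the workflow. The lines below are an illustrative sketch only and are not part of the diffs that follow; the library path is the one set in wflow_gaea.lua and python_srw.lua, and python3 simply stands in for any dynamically linked process.

    export LD_PRELOAD=/opt/cray/pe/gcc/12.2.0/snos/lib64/libstdc++.so.6
    # The preloaded Cray GCC 12.2.0 libstdc++ should now appear in the memory
    # map of any process started from this shell.
    python3 -c 'print([l.strip() for l in open("/proc/self/maps") if "libstdc++" in l])'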
diff --git a/.cicd/Jenkinsfile b/.cicd/Jenkinsfile
index 86af5dded4..8cc95c6b00 100644
--- a/.cicd/Jenkinsfile
+++ b/.cicd/Jenkinsfile
@@ -10,11 +10,10 @@ pipeline {
     parameters {
         // Allow job runner to filter based on platform
         // Use the line below to enable all PW clusters
-        // choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaeac5', 'hera', 'jet', 'orion', 'hercules', 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'], description: 'Specify the platform(s) to use')
+        // choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet', 'orion', 'hercules', 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'], description: 'Specify the platform(s) to use')
         // Use the line below to enable the PW AWS cluster
-        // choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaeac5', 'hera', 'jet', 'orion', 'hercules', 'pclusternoaav2use1'], description: 'Specify the platform(s) to use')
-        // choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'derecho', 'gaeac5', 'hera', 'jet', 'orion', 'hercules'], description: 'Specify the platform(s) to use')
-        choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'derecho', 'hera', 'jet', 'orion', 'hercules'], description: 'Specify the platform(s) to use')
+        // choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet', 'orion', 'hercules', 'pclusternoaav2use1'], description: 'Specify the platform(s) to use')
+        choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'derecho', 'gaea', 'hera', 'jet', 'orion', 'hercules'], description: 'Specify the platform(s) to use')
         // Allow job runner to filter based on compiler
         choice(name: 'SRW_COMPILER_FILTER', choices: ['all', 'gnu', 'intel'], description: 'Specify the compiler(s) to use to build')
         booleanParam name: 'SRW_WE2E_COMPREHENSIVE_TESTS', defaultValue: false, description: 'Whether to execute the comprehensive end-to-end tests'
@@ -87,8 +86,7 @@ pipeline {
             axes {
                 axis {
                     name 'SRW_PLATFORM'
-                    // values 'derecho', 'gaeac5', 'hera', 'jet', 'orion', 'hercules' //, 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'
-                    values 'derecho', 'hera', 'jet', 'orion', 'hercules' //, 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'
+                    values 'derecho', 'gaea', 'hera', 'jet', 'orion', 'hercules' //, 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'
                 }

                 axis {
@@ -102,8 +100,7 @@ pipeline {
             exclude {
                 axis {
                     name 'SRW_PLATFORM'
-                    // values 'derecho', 'gaeac5', 'jet', 'orion', 'hercules' //, 'pclusternoaav2use1' , 'azclusternoaav2eus1', 'gclusternoaav2usc1'
-                    values 'derecho', 'jet', 'orion', 'hercules' //, 'pclusternoaav2use1' , 'azclusternoaav2eus1', 'gclusternoaav2usc1'
+                    values 'derecho', 'gaea', 'jet', 'orion', 'hercules' //, 'pclusternoaav2use1' , 'azclusternoaav2eus1', 'gclusternoaav2usc1'
                 }

                 axis {
diff --git a/.cicd/scripts/srw_ftest.sh b/.cicd/scripts/srw_ftest.sh
index 95d5e2f936..5479e8b46d 100755
--- a/.cicd/scripts/srw_ftest.sh
+++ b/.cicd/scripts/srw_ftest.sh
@@ -85,12 +85,7 @@ module load build_${platform,,}_${SRW_COMPILER}
 module load wflow_${platform,,}

 [[ ${FORGIVE_CONDA} == true ]] && set +e +u  # Some platforms have incomplete python3 or conda support, but wouldn't necessarily block workflow tests
-# Gaea-C5 special case missing jinja2
-if [ "${platform}" == "gaea-c5" ]; then
-    conda activate workflow_tools
-else
-    conda activate srw_app
-fi
+conda activate srw_app
 set -e -u

 # Adjust for strict limitation of stack size
diff --git a/.cicd/scripts/wrapper_srw_ftest.sh b/.cicd/scripts/wrapper_srw_ftest.sh
index e4afaf9e98..fabdbb63ef 100755
--- a/.cicd/scripts/wrapper_srw_ftest.sh
+++ b/.cicd/scripts/wrapper_srw_ftest.sh
@@ -23,7 +23,7 @@ else
 fi

 # Customize wrapper scripts
-if [[ "${SRW_PLATFORM}" == gaea-c5 ]]; then
+if [[ "${SRW_PLATFORM}" == gaea ]]; then
     sed -i '15i #SBATCH --clusters=c5' ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/${workflow_cmd}_srw_ftest.sh
     sed -i 's|qos=batch|qos=normal|g' ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/${workflow_cmd}_srw_ftest.sh
 fi
diff --git a/devbuild.sh b/devbuild.sh
index 9136b86e7a..05cc76312c 100755
--- a/devbuild.sh
+++ b/devbuild.sh
@@ -212,11 +212,6 @@ printf "PLATFORM(MACHINE)=${PLATFORM}\n" >&2
 if [ "${PLATFORM}" = "wcoss2" ]; then
   BUILD_CONDA="off"
 fi
-# Conda is not used on Gaea-c5 F2 filesystem
-# it needs to be reevaluated when moved to F2 filesystem
-if [ "${PLATFORM}" = "gaea-c5" ]; then
-  BUILD_CONDA="off"
-fi

 # build conda and conda environments, if requested.
 if [ "${BUILD_CONDA}" = "on" ] ; then
@@ -288,7 +283,7 @@ set -eu
 # automatically determine compiler
 if [ -z "${COMPILER}" ] ; then
   case ${PLATFORM} in
-    jet|hera|gaea-c5) COMPILER=intel ;;
+    jet|hera|gaea) COMPILER=intel ;;
     orion) COMPILER=intel ;;
     wcoss2) COMPILER=intel ;;
     cheyenne) COMPILER=intel ;;
diff --git a/doc/UsersGuide/source/CustomizingTheWorkflow/ConfigWorkflow.rst b/doc/UsersGuide/source/CustomizingTheWorkflow/ConfigWorkflow.rst
index b0b0301973..0c8ed8e951 100644
--- a/doc/UsersGuide/source/CustomizingTheWorkflow/ConfigWorkflow.rst
+++ b/doc/UsersGuide/source/CustomizingTheWorkflow/ConfigWorkflow.rst
@@ -30,7 +30,7 @@ If non-default parameters are selected for the variables in this section, they s
    Setting ``RUN_ENVIR`` to "community" is recommended in most cases for users who are not running in NCO's production environment. Valid values: ``"nco"`` | ``"community"``

 ``MACHINE``: (Default: "BIG_COMPUTER")
-   The machine (a.k.a. platform or system) on which the workflow will run. Currently supported platforms are listed on the :srw-wiki:`SRW App Wiki page `. When running the SRW App on any ParallelWorks/NOAA Cloud system, use "NOAACLOUD" regardless of the underlying system (AWS, GCP, or Azure). Valid values: ``"HERA"`` | ``"ORION"`` | ``"HERCULES"`` | ``"JET"`` | ``"CHEYENNE"`` | ``"DERECHO"`` | ``"GAEA"`` | ``"GAEA-C5"`` | ``"NOAACLOUD"`` | ``"STAMPEDE"`` | ``"ODIN"`` | ``"MACOS"`` | ``"LINUX"`` | ``"SINGULARITY"`` | ``"WCOSS2"`` (Check ``ufs-srweather-app/ush/valid_param_vals.yaml`` for the most up-to-date list of supported platforms.)
+   The machine (a.k.a. platform or system) on which the workflow will run. Currently supported platforms are listed on the :srw-wiki:`SRW App Wiki page `. When running the SRW App on any ParallelWorks/NOAA Cloud system, use "NOAACLOUD" regardless of the underlying system (AWS, GCP, or Azure). Valid values: ``"HERA"`` | ``"ORION"`` | ``"HERCULES"`` | ``"JET"`` | ``"CHEYENNE"`` | ``"DERECHO"`` | ``"GAEA"`` | ``"NOAACLOUD"`` | ``"STAMPEDE"`` | ``"ODIN"`` | ``"MACOS"`` | ``"LINUX"`` | ``"SINGULARITY"`` | ``"WCOSS2"`` (Check ``ufs-srweather-app/ush/valid_param_vals.yaml`` for the most up-to-date list of supported platforms.)

 .. hint::
    Users who are NOT on a named, supported Level 1 or 2 platform will need to set the ``MACHINE`` variable to ``LINUX`` or ``MACOS``. To combine use of a Linux or MacOS platform with the Rocoto workflow manager, users will also need to set ``WORKFLOW_MANAGER: "rocoto"`` in the ``platform:`` section of ``config.yaml``. This combination will assume a Slurm batch manager when generating the XML.
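Note: with the compiler case statement above updated, a Gaea build can be requested by platform name alone. The lines below are an illustrative sketch, not part of the patch; the flag spellings are assumed from devbuild.sh's existing usage, so run ./devbuild.sh -h to confirm on your checkout.

    ./devbuild.sh --platform=gaea                    # COMPILER is auto-set to intel by the case statement
    ./devbuild.sh --platform=gaea --compiler=intel   # equivalent, with the compiler given explicitly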
diff --git a/etc/lmod-setup.csh b/etc/lmod-setup.csh
index 92a4394893..af79ad8a70 100644
--- a/etc/lmod-setup.csh
+++ b/etc/lmod-setup.csh
@@ -37,7 +37,7 @@ else if ( "$L_MACHINE" == singularity ) then
    module purge

-else if ( "$L_MACHINE" == gaea-c5 ) then
+else if ( "$L_MACHINE" == gaea ) then
    module reset

 else if ( "$L_MACHINE" == derecho ) then
diff --git a/etc/lmod-setup.sh b/etc/lmod-setup.sh
index 7328dea76f..b030d2a9f5 100644
--- a/etc/lmod-setup.sh
+++ b/etc/lmod-setup.sh
@@ -44,7 +44,7 @@ elif [ "$L_MACHINE" = singularity ]; then
    module purge

-elif [ "$L_MACHINE" = gaea-c5 ]; then
+elif [ "$L_MACHINE" = gaea ]; then
    module reset

 elif [ "$L_MACHINE" = derecho ]; then
diff --git a/modulefiles/build_gaea-c5_intel.lua b/modulefiles/build_gaea_intel.lua
similarity index 91%
rename from modulefiles/build_gaea-c5_intel.lua
rename to modulefiles/build_gaea_intel.lua
index ecf21dcc8d..9c21f685da 100644
--- a/modulefiles/build_gaea-c5_intel.lua
+++ b/modulefiles/build_gaea_intel.lua
@@ -5,7 +5,7 @@ the NOAA RDHPC machine Gaea C5 using Intel-2023.1.0

 whatis([===[Loads libraries needed for building the UFS SRW App on Gaea C5 ]===])

-prepend_path("MODULEPATH","/ncrc/proj/epic/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core")
+prepend_path("MODULEPATH","/ncrc/proj/epic/spack-stack/spack-stack-1.5.0/envs/unified-env/install/modulefiles/Core")

 stack_intel_ver=os.getenv("stack_intel_ver") or "2023.1.0"
 load(pathJoin("stack-intel", stack_intel_ver))
@@ -32,4 +32,4 @@ setenv("CXX","CC")
 setenv("CMAKE_C_COMPILER","cc")
 setenv("CMAKE_Fortran_COMPILER","ftn")
 setenv("CMAKE_CXX_COMPILER","CC")
-setenv("CMAKE_Platform","gaea-c5.intel")
+setenv("CMAKE_Platform","gaea.intel")
diff --git a/modulefiles/tasks/gaea-c5/plot_allvars.local.lua b/modulefiles/tasks/gaea-c5/plot_allvars.local.lua
deleted file mode 100644
index 624b869bdb..0000000000
--- a/modulefiles/tasks/gaea-c5/plot_allvars.local.lua
+++ /dev/null
@@ -1,6 +0,0 @@
-unload("miniconda3")
-unload("python")
-prepend_path("MODULEPATH","/ncrc/proj/epic/miniconda3/modulefiles")
-load(pathJoin("miniconda3", os.getenv("miniconda3_ver") or "4.12.0"))
-
-setenv("SRW_ENV", "regional_workflow")
diff --git a/modulefiles/tasks/gaea-c5/python_srw.lua b/modulefiles/tasks/gaea-c5/python_srw.lua
deleted file mode 100644
index b6107cc465..0000000000
--- a/modulefiles/tasks/gaea-c5/python_srw.lua
+++ /dev/null
@@ -1,8 +0,0 @@
-unload("miniconda3")
-unload("python")
-prepend_path("MODULEPATH","/ncrc/proj/epic/miniconda3/modulefiles")
-load(pathJoin("miniconda3", os.getenv("miniconda3_ver") or "4.12.0"))
-
-setenv("SRW_ENV", "workflow_tools")
-
-load("darshan-runtime/3.4.0")
diff --git a/modulefiles/tasks/gaea/plot_allvars.local.lua b/modulefiles/tasks/gaea/plot_allvars.local.lua
new file mode 100644
index 0000000000..104da06f5c
--- /dev/null
+++ b/modulefiles/tasks/gaea/plot_allvars.local.lua
@@ -0,0 +1,4 @@
+unload("python")
+load("conda")
+
+setenv("SRW_ENV", "srw_graphics")
diff --git a/modulefiles/tasks/gaea/python_srw.lua b/modulefiles/tasks/gaea/python_srw.lua
new file mode 100644
index 0000000000..5058b3f615
--- /dev/null
+++ b/modulefiles/tasks/gaea/python_srw.lua
@@ -0,0 +1,7 @@
+load("darshan-runtime/3.4.0")
+unload("python")
+load("conda")
+
+setenv("SRW_ENV", "srw_app")
+setenv("LD_PRELOAD", "/opt/cray/pe/gcc/12.2.0/snos/lib64/libstdc++.so.6")
+
diff --git a/modulefiles/tasks/gaea-c5/run_vx.local.lua b/modulefiles/tasks/gaea/run_vx.local.lua
similarity index 100%
rename from modulefiles/tasks/gaea-c5/run_vx.local.lua
rename to modulefiles/tasks/gaea/run_vx.local.lua
diff --git a/modulefiles/wflow_gaea-c5.lua b/modulefiles/wflow_gaea.lua
similarity index 68%
rename from modulefiles/wflow_gaea-c5.lua
rename to modulefiles/wflow_gaea.lua
index 3073aa0522..6c24672c30 100644
--- a/modulefiles/wflow_gaea-c5.lua
+++ b/modulefiles/wflow_gaea.lua
@@ -6,16 +6,15 @@ the NOAA RDHPC machine Gaea C5
 whatis([===[Loads libraries needed for running the UFS SRW App on gaea ]===])

 unload("python")
-load("set_pythonpath")
-prepend_path("MODULEPATH","/ncrc/proj/epic/miniconda3/modulefiles/")
-load(pathJoin("miniconda3", os.getenv("miniconda3_ver") or "4.12.0"))
 prepend_path("MODULEPATH","/ncrc/proj/epic/rocoto/modulefiles/")
 load("rocoto")
+load("conda")

 pushenv("MKLROOT", "/opt/intel/oneapi/mkl/2023.1.0/")
+setenv("LD_PRELOAD", "/opt/cray/pe/gcc/12.2.0/snos/lib64/libstdc++.so.6")

 if mode() == "load" then
    LmodMsgRaw([===[Please do the following to activate conda:
-       > conda activate workflow_tools
+       > conda activate srw_app
 ]===])
 end
diff --git a/tests/WE2E/machine_suites/comprehensive.gaea-c5 b/tests/WE2E/machine_suites/comprehensive.gaea
similarity index 100%
rename from tests/WE2E/machine_suites/comprehensive.gaea-c5
rename to tests/WE2E/machine_suites/comprehensive.gaea
diff --git a/tests/WE2E/machine_suites/coverage.gaea-c5 b/tests/WE2E/machine_suites/coverage.gaea
similarity index 100%
rename from tests/WE2E/machine_suites/coverage.gaea-c5
rename to tests/WE2E/machine_suites/coverage.gaea
diff --git a/tests/WE2E/setup_WE2E_tests.sh b/tests/WE2E/setup_WE2E_tests.sh
index 0644102c06..309c755966 100755
--- a/tests/WE2E/setup_WE2E_tests.sh
+++ b/tests/WE2E/setup_WE2E_tests.sh
@@ -45,7 +45,7 @@ function usage {

 }

-machines=( hera jet cheyenne derecho orion wcoss2 gaea-c5 odin singularity macos noaacloud )
+machines=( hera jet cheyenne derecho orion wcoss2 gaea odin singularity macos noaacloud )

 if [ "$1" = "-h" ] ; then usage ; fi
 [[ $# -le 2 ]] && usage
diff --git a/tests/build.sh b/tests/build.sh
index caf0e2b0ae..f230354a61 100755
--- a/tests/build.sh
+++ b/tests/build.sh
@@ -21,7 +21,7 @@ function usage() {
   exit 1
 }

-machines=( hera jet cheyenne derecho orion hercules wcoss2 gaea-c5 odin singularity macos noaacloud )
+machines=( hera jet cheyenne derecho orion hercules wcoss2 gaea odin singularity macos noaacloud )

 [[ $# -gt 4 ]] && usage
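Note: the renamed workflow modulefile is loaded the same way as on other platforms. The lines below are an illustrative sketch of the interactive sequence implied by the wflow_gaea.lua message above, run from the top of an ufs-srweather-app clone; they are not part of the patch.

    source etc/lmod-setup.sh gaea   # runs "module reset" for gaea, per the case added above
    module use modulefiles
    module load wflow_gaea          # sets LD_PRELOAD and prints the conda reminder
    conda activate srw_app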
diff --git a/ush/load_modules_wflow.sh b/ush/load_modules_wflow.sh
index cf33a43f3f..d770d7c2d9 100755
--- a/ush/load_modules_wflow.sh
+++ b/ush/load_modules_wflow.sh
@@ -62,12 +62,7 @@ task failed:

 $has_mu && set +u
 if [ ! -z $(command -v conda) ]; then
-# Gaea-C5 special case missing jinja2
-  if [ "${machine}" == "gaea-c5" ]; then
-    conda activate workflow_tools
-  else
-    conda activate srw_app
-  fi
+  conda activate srw_app
 fi
 $has_mu && set -u

diff --git a/ush/machine/gaea-c5.yaml b/ush/machine/gaea-c5.yaml
deleted file mode 100644
index 1f6f115495..0000000000
--- a/ush/machine/gaea-c5.yaml
+++ /dev/null
@@ -1,55 +0,0 @@
-platform:
-  WORKFLOW_MANAGER: rocoto
-  NCORES_PER_NODE: 128
-  SCHED: slurm
-  TEST_CCPA_OBS_DIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/obs_data/ccpa/proc
-  TEST_MRMS_OBS_DIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/obs_data/mrms/proc
-  TEST_NDAS_OBS_DIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/obs_data/ndas/proc
-  TEST_NOHRSC_OBS_DIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/obs_data/nohrsc/proc
-  DOMAIN_PREGEN_BASEDIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/FV3LAM_pregen
-  QUEUE_DEFAULT: normal
-  QUEUE_FCST: normal
-  QUEUE_HPSS: normal
-  REMOVE_MEMORY: True
-  PARTITION_HPSS: eslogin_c5
-  RUN_CMD_FCST: srun --export=ALL -n ${PE_MEMBER01}
-  RUN_CMD_POST: srun --export=ALL -n $nprocs
-  RUN_CMD_PRDGEN: srun --export=ALL -n $nprocs
-  RUN_CMD_SERIAL: time
-  RUN_CMD_UTILS: srun --export=ALL -n $nprocs
-  SCHED_NATIVE_CMD: --clusters=c5 --partition=batch --export=NONE
-  SCHED_NATIVE_CMD_HPSS: --clusters=es --partition=eslogin_c5 --export=NONE
-  PRE_TASK_CMDS: '{ ulimit -s unlimited; ulimit -a; }'
-  TEST_EXTRN_MDL_SOURCE_BASEDIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data
-  TEST_PREGEN_BASEDIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/FV3LAM_pregen
-  TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/dummy_FV3GFS_sys_dir
-  TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/dummy_FV3GFS_sys_dir
-  TEST_VX_FCST_INPUT_BASEDIR: '{{ "/lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/output_data/fcst_" }}{{ "ens" if (global.NUM_ENS_MEMBERS > 0) else "det" }}{{ "/{{workflow.PREDEF_GRID_NAME}}" }}{% raw %}{% endraw %}'
-  FIXaer: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/fix/fix_aer
-  FIXgsi: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/fix/fix_gsi
-  FIXgsm: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/fix/fix_am
-  FIXlut: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/fix/fix_lut
-  FIXorg: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/fix/fix_orog
-  FIXsfc: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/fix/fix_sfc_climo
-  FIXshp: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/NaturalEarth
-  EXTRN_MDL_DATA_STORES: aws
-data:
-  ics_lbcs:
-    FV3GFS:
-      nemsio: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/FV3GFS/nemsio/${yyyymmdd}${hh}
-      grib2: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/FV3GFS/grib2/${yyyymmdd}${hh}
-      netcdf: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/FV3GFS/netcdf/${yyyymmdd}${hh}
-    RAP: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/RAP/${yyyymmdd}${hh}
-    HRRR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/HRRR/${yyyymmdd}${hh}
-    RAP: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/RAP/${yyyymmdd}${hh}
-    GSMGFS: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/GSMGFS/${yyyymmdd}${hh}
-rocoto:
-  tasks:
-    metatask_run_ensemble:
-      task_run_fcst_mem#mem#:
-        cores: '{{ task_run_fcst.PE_MEMBER01 // 1 }}'
-        native: '--cpus-per-task {{ task_run_fcst.OMP_NUM_THREADS_RUN_FCST|int }} --exclusive {{ platform.SCHED_NATIVE_CMD }}'
-        nodes:
-        nnodes:
-        nodesize:
-        ppn:
diff --git a/ush/machine/gaea.yaml b/ush/machine/gaea.yaml
new file mode 100644
index 0000000000..1ec2ded2ef
--- /dev/null
+++ b/ush/machine/gaea.yaml
@@ -0,0 +1,55 @@
+platform:
+  WORKFLOW_MANAGER: rocoto
+  NCORES_PER_NODE: 128
+  SCHED: slurm
+  TEST_CCPA_OBS_DIR: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/obs_data/ccpa/proc
+  TEST_MRMS_OBS_DIR: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/obs_data/mrms/proc
+  TEST_NDAS_OBS_DIR: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/obs_data/ndas/proc
+  TEST_NOHRSC_OBS_DIR: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/obs_data/nohrsc/proc
+  DOMAIN_PREGEN_BASEDIR: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/FV3LAM_pregen
+  QUEUE_DEFAULT: normal
+  QUEUE_FCST: normal
+  QUEUE_HPSS: normal
+  REMOVE_MEMORY: True
+  PARTITION_HPSS: eslogin_c5
+  RUN_CMD_FCST: srun --export=ALL -n ${PE_MEMBER01}
+  RUN_CMD_POST: srun --export=ALL -n $nprocs
+  RUN_CMD_PRDGEN: srun --export=ALL -n $nprocs
+  RUN_CMD_SERIAL: time
+  RUN_CMD_UTILS: srun --export=ALL -n $nprocs
+  SCHED_NATIVE_CMD: --clusters=c5 --partition=batch --export=NONE
+  SCHED_NATIVE_CMD_HPSS: --clusters=es --partition=eslogin_c5 --export=NONE
+  PRE_TASK_CMDS: '{ ulimit -s unlimited; ulimit -a; }'
+  TEST_EXTRN_MDL_SOURCE_BASEDIR: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/input_model_data
+  TEST_PREGEN_BASEDIR: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/FV3LAM_pregen
+  TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/dummy_FV3GFS_sys_dir
+  TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/dummy_FV3GFS_sys_dir
+  TEST_VX_FCST_INPUT_BASEDIR: '{{ "/gpfs/f5/epic/world-shared/UFS_SRW_data/develop/output_data/fcst_" }}{{ "ens" if (global.NUM_ENS_MEMBERS > 0) else "det" }}{{ "/{{workflow.PREDEF_GRID_NAME}}" }}{% raw %}{% endraw %}'
+  FIXaer: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/fix/fix_aer
+  FIXgsi: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/fix/fix_gsi
+  FIXgsm: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/fix/fix_am
+  FIXlut: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/fix/fix_lut
+  FIXorg: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/fix/fix_orog
+  FIXsfc: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/fix/fix_sfc_climo
+  FIXshp: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/NaturalEarth
+  EXTRN_MDL_DATA_STORES: aws
+data:
+  ics_lbcs:
+    FV3GFS:
+      nemsio: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/input_model_data/FV3GFS/nemsio/${yyyymmdd}${hh}
+      grib2: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/input_model_data/FV3GFS/grib2/${yyyymmdd}${hh}
+      netcdf: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/input_model_data/FV3GFS/netcdf/${yyyymmdd}${hh}
+    RAP: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/input_model_data/RAP/${yyyymmdd}${hh}
+    HRRR: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/input_model_data/HRRR/${yyyymmdd}${hh}
+    RAP: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/input_model_data/RAP/${yyyymmdd}${hh}
+    GSMGFS: /gpfs/f5/epic/world-shared/UFS_SRW_data/develop/input_model_data/GSMGFS/${yyyymmdd}${hh}
+rocoto:
+  tasks:
+    metatask_run_ensemble:
+      task_run_fcst_mem#mem#:
+        cores: '{{ task_run_fcst.PE_MEMBER01 // 1 }}'
+        native: '--cpus-per-task {{ task_run_fcst.OMP_NUM_THREADS_RUN_FCST|int }} --exclusive {{ platform.SCHED_NATIVE_CMD }}'
+        nodes:
+        nnodes:
+        nodesize:
+        ppn:
diff --git a/ush/valid_param_vals.yaml b/ush/valid_param_vals.yaml
index f432c0bd76..3530b51ae9 100644
--- a/ush/valid_param_vals.yaml
+++ b/ush/valid_param_vals.yaml
@@ -4,7 +4,7 @@ valid_vals_RUN_ENVIR: ["nco", "community"]
 valid_vals_VERBOSE: [True, False]
 valid_vals_DEBUG: [True, False]
-valid_vals_MACHINE: ["HERA", "WCOSS2", "ORION", "HERCULES", "JET", "ODIN", "CHEYENNE", "DERECHO", "STAMPEDE", "LINUX", "MACOS", "NOAACLOUD", "SINGULARITY", "GAEA-C5"]
+valid_vals_MACHINE: ["HERA", "WCOSS2", "ORION", "HERCULES", "JET", "ODIN", "CHEYENNE", "DERECHO", "STAMPEDE", "LINUX", "MACOS", "NOAACLOUD", "SINGULARITY", "GAEA"]
 valid_vals_SCHED: ["slurm", "pbspro", "lsf", "lsfcray", "none"]
 valid_vals_FCST_MODEL: ["ufs-weather-model"]
 valid_vals_WORKFLOW_MANAGER: ["rocoto", "ecflow", "none"]
diff --git a/ush/wrappers/job_cards/sbatch/get_ics.sbatch b/ush/wrappers/job_cards/sbatch/get_ics.sbatch
index 5aca1c2e7f..17b6210eae 100644
--- a/ush/wrappers/job_cards/sbatch/get_ics.sbatch
+++ b/ush/wrappers/job_cards/sbatch/get_ics.sbatch
@@ -25,6 +25,6 @@ export ICS_OR_LBCS='ICS'

 $USHdir/load_modules_run_task.sh "get_extrn_ics" $JOBSdir/JREGIONAL_GET_EXTRN_MDL_FILES

-# Gaea-c5 differences:
+# Gaea differences:
 ##SBATCH --qos=normal
 ##SBATCH --clusters=es --partition=eslogin_c5 --export=NONE
diff --git a/ush/wrappers/job_cards/sbatch/get_lbcs.sbatch b/ush/wrappers/job_cards/sbatch/get_lbcs.sbatch
index fc747ece40..46a4aad45e 100644
--- a/ush/wrappers/job_cards/sbatch/get_lbcs.sbatch
+++ b/ush/wrappers/job_cards/sbatch/get_lbcs.sbatch
@@ -25,6 +25,6 @@ export ICS_OR_LBCS='LBCS'

 $USHdir/load_modules_run_task.sh "get_extrn_lbcs" $JOBSdir/JREGIONAL_GET_EXTRN_MDL_FILES

-# Gaea-c5 differences:
+# Gaea differences:
 ##SBATCH --qos=normal
 ##SBATCH --clusters=es --partition=eslogin_c5 --export=NONE
diff --git a/ush/wrappers/job_cards/sbatch/make_grid.sbatch b/ush/wrappers/job_cards/sbatch/make_grid.sbatch
index b8866af36f..4b7dbd218c 100644
--- a/ush/wrappers/job_cards/sbatch/make_grid.sbatch
+++ b/ush/wrappers/job_cards/sbatch/make_grid.sbatch
@@ -17,7 +17,7 @@ export JOBSdir=`grep JOBSdir $GLOBAL_VAR_DEFNS_FP | cut -d\' -f2`

 $USHdir/load_modules_run_task.sh "make_grid" $JOBSdir/JREGIONAL_MAKE_GRID

-# Gaea-c5 differences:
+# Gaea differences:
 ##SBATCH --qos=normal
 ##SBATCH --clusters=c5 --partition=batch --export=NONE
 #export nprocs='24'
diff --git a/ush/wrappers/job_cards/sbatch/make_ics.sbatch b/ush/wrappers/job_cards/sbatch/make_ics.sbatch
index 512eefeae5..729240bdbf 100644
--- a/ush/wrappers/job_cards/sbatch/make_ics.sbatch
+++ b/ush/wrappers/job_cards/sbatch/make_ics.sbatch
@@ -23,7 +23,7 @@ export NWGES_DIR=$PWD'/../../../nco_dirs/nwges/20190615'

 $USHdir/load_modules_run_task.sh "make_ics" $JOBSdir/JREGIONAL_MAKE_ICS

-# Gaea-c5 differences:
+# Gaea differences:
 ##SBATCH --qos=normal
 ##SBATCH --clusters=c5 --partition=batch --export=NONE
 #export nprocs='48'
diff --git a/ush/wrappers/job_cards/sbatch/make_lbcs.sbatch b/ush/wrappers/job_cards/sbatch/make_lbcs.sbatch
index ab1d1312c8..d4db098b28 100644
--- a/ush/wrappers/job_cards/sbatch/make_lbcs.sbatch
+++ b/ush/wrappers/job_cards/sbatch/make_lbcs.sbatch
@@ -25,7 +25,7 @@ export bcgrpnum='1'

 $USHdir/load_modules_run_task.sh "make_lbcs" $JOBSdir/JREGIONAL_MAKE_LBCS

-# Gaea-c5 differences:
+# Gaea differences:
 ##SBATCH --qos=normal
 ##SBATCH --clusters=c5 --partition=batch --export=NONE
 #export nprocs='48'
diff --git a/ush/wrappers/job_cards/sbatch/make_orog.sbatch b/ush/wrappers/job_cards/sbatch/make_orog.sbatch
index 3b440cbd42..b0c8d21e54 100644
--- a/ush/wrappers/job_cards/sbatch/make_orog.sbatch
+++ b/ush/wrappers/job_cards/sbatch/make_orog.sbatch
@@ -17,7 +17,7 @@ export JOBSdir=`grep JOBSdir $GLOBAL_VAR_DEFNS_FP | cut -d\' -f2`

 $USHdir/load_modules_run_task.sh "make_orog" $JOBSdir/JREGIONAL_MAKE_OROG

-# Gaea-c5 differences:
+# Gaea differences:
 ##SBATCH --qos=normal
 ##SBATCH --clusters=c5 --partition=batch --export=NONE
 #export nprocs='24'
diff --git a/ush/wrappers/job_cards/sbatch/make_sfc_climo.sbatch b/ush/wrappers/job_cards/sbatch/make_sfc_climo.sbatch
index b791288922..52769cb033 100644
--- a/ush/wrappers/job_cards/sbatch/make_sfc_climo.sbatch
+++ b/ush/wrappers/job_cards/sbatch/make_sfc_climo.sbatch
@@ -17,7 +17,7 @@ export JOBSdir=`grep JOBSdir $GLOBAL_VAR_DEFNS_FP | cut -d\' -f2`

 $USHdir/load_modules_run_task.sh "make_sfc_climo" $JOBSdir/JREGIONAL_MAKE_SFC_CLIMO

-# Gaea-c5 differences:
+# Gaea differences:
 ##SBATCH --qos=normal
 ##SBATCH --clusters=c5 --partition=batch --export=NONE
 #export nprocs='48'
diff --git a/ush/wrappers/job_cards/sbatch/run_fcst.sbatch b/ush/wrappers/job_cards/sbatch/run_fcst.sbatch
index 75abd6fc03..056fd70a28 100644
--- a/ush/wrappers/job_cards/sbatch/run_fcst.sbatch
+++ b/ush/wrappers/job_cards/sbatch/run_fcst.sbatch
@@ -21,7 +21,7 @@ export SLASH_ENSMEM_SUBDIR='/'

 $USHdir/load_modules_run_task.sh "run_fcst" $JOBSdir/JREGIONAL_RUN_FCST

-# Gaea-c5 differences:
+# Gaea differences:
 ##SBATCH --qos=normal
 ##SBATCH --clusters=c5 --partition=batch --export=NONE
 #export nprocs='48'
diff --git a/ush/wrappers/job_cards/sbatch/run_post.sbatch b/ush/wrappers/job_cards/sbatch/run_post.sbatch
index c2a24a7f5e..6af04693f3 100644
--- a/ush/wrappers/job_cards/sbatch/run_post.sbatch
+++ b/ush/wrappers/job_cards/sbatch/run_post.sbatch
@@ -26,7 +26,7 @@ for (( i=0; i<=$((num_fcst_hrs)); i++ )); do
   $USHdir/load_modules_run_task.sh "run_post" $JOBSdir/JREGIONAL_RUN_POST
 done

-# Gaea-c5 differences:
+# Gaea differences:
 ##SBATCH --qos=normal
 ##SBATCH --clusters=c5 --partition=batch --export=NONE
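Note: the commented "Gaea differences" blocks above are meant to be swapped in when the wrapper job cards are submitted on Gaea. An illustrative sketch for one of the compute-node jobs (e.g., run_fcst.sbatch), using only the directives given in those comments; this is not part of the patch itself.

    # Change the doubly commented lines
    ##SBATCH --qos=normal
    ##SBATCH --clusters=c5 --partition=batch --export=NONE
    # into active Slurm directives
    #SBATCH --qos=normal
    #SBATCH --clusters=c5 --partition=batch --export=NONE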