From 8ed01d3376194945b1cbf4c45b8f998b5e4844c0 Mon Sep 17 00:00:00 2001 From: Lauren Chilutti Date: Tue, 18 Jun 2024 13:23:10 +0000 Subject: [PATCH 1/2] runscripts for CI being stored in .github/.parallelworks. These runscripts need to be stored on parallelworks SHiELD_physics_CI cluster at /contrib/fv3/SHiELD_physics_CI --- .github/.parallelworks/checkout.sh | 92 ++++++++++++++++++++++++++++++ .github/.parallelworks/compile.sh | 84 +++++++++++++++++++++++++++ .github/.parallelworks/run_test.sh | 89 +++++++++++++++++++++++++++++ 3 files changed, 265 insertions(+) create mode 100755 .github/.parallelworks/checkout.sh create mode 100755 .github/.parallelworks/compile.sh create mode 100755 .github/.parallelworks/run_test.sh diff --git a/.github/.parallelworks/checkout.sh b/.github/.parallelworks/checkout.sh new file mode 100755 index 00000000..9749ab8c --- /dev/null +++ b/.github/.parallelworks/checkout.sh @@ -0,0 +1,92 @@ +#!/bin/bash -xe + +############################################################################## +## User set up variables +## Root directory for CI +dirRoot=/contrib/fv3 +## Intel version to be used +intelVersion=2023.2.0 +############################################################################## +## HPC-ME container +container=/contrib/containers/noaa-intel-prototype_2023.09.25.sif +container_env_script=/contrib/containers/load_spack_noaa-intel.sh +############################################################################## + +#Parse Arguments +branch=main +commit="" +while [[ $# -gt 0 ]]; do + case $1 in + -b|--branch) + branch="$2" + shift # past argument + shift # past value + ;; + -h|--hash) + commit="$2" + shift # past argument + shift # past value + ;; + *) + echo "unknown argument" + exit 1 + ;; + esac +done + +echo "branch is $branch" +echo "commit is $commit" + + +## Set up the directories +testDir=${dirRoot}/${intelVersion}/SHiELD_physics/${branch}/${commit} +logDir=${testDir}/log +export MODULESHOME=/usr/share/lmod/lmod +#Define 
External Libs path +export EXTERNAL_LIBS=${dirRoot}/${intelVersion}/SHiELD_physics/externallibs +mkdir -p ${EXTERNAL_LIBS} +## create directories +rm -rf ${testDir} +mkdir -p ${logDir} +# salloc commands to start up +#2 tests layout 8,8 (16 nodes) +#2 tests layout 4,8 (8 nodes) +#9 tests layout 4,4 (18 nodes) +#5 tests layout 4,1 (5 nodes) +#17 tests layout 2,2 (17 nodes) +#salloc --partition=p2 -N 64 -J ${branch} sleep 20m & + +## clone code +cd ${testDir} +git clone --recursive https://github.com/NOAA-GFDL/SHiELD_build.git + +##checkout components +cd ${testDir}/SHiELD_build && ./CHECKOUT_code + +## Check out the PR +cd ${testDir}/SHiELD_SRC/SHiELD_physics && git fetch origin ${branch}:toMerge && git merge toMerge + +##Check if we already have FMS compiled and recompile if version doesn't match what is in SHiELD_build checkout script +grep -m 1 "fms_release" ${testDir}/SHiELD_build/CHECKOUT_code > ${logDir}/release.txt +source ${logDir}/release.txt +echo ${fms_release} +echo `cat ${EXTERNAL_LIBS}/FMSversion` +if [[ ${fms_release} != `cat ${EXTERNAL_LIBS}/FMSversion` ]] + then + #remove libFMS if it exists + if [ -d $EXTERNAL_LIBS/libFMS ] + then + rm -rf $EXTERNAL_LIBS/libFMS + fi + if [ -e $EXTERNAL_LIBS/FMSversion ] + then + rm $EXTERNAL_LIBS/FMSversion + fi + echo $fms_release > $EXTERNAL_LIBS/FMSversion + echo $container > $EXTERNAL_LIBS/FMScontainerversion + echo $container_env_script >> $EXTERNAL_LIBS/FMScontainerversion + # Build FMS + cd ${testDir}/SHiELD_build/Build + set -o pipefail + singularity exec -B /contrib ${container} ${container_env_script} "./BUILDlibfms intel" + fi diff --git a/.github/.parallelworks/compile.sh b/.github/.parallelworks/compile.sh new file mode 100755 index 00000000..39017366 --- /dev/null +++ b/.github/.parallelworks/compile.sh @@ -0,0 +1,84 @@ +#!/bin/bash -xe + +############################################################################## +## User set up variables +## Root directory for CI +dirRoot=/contrib/fv3 +## Intel 
version to be used +intelVersion=2023.2.0 +############################################################################## +## HPC-ME container +container=/contrib/containers/noaa-intel-prototype_2023.09.25.sif +container_env_script=/contrib/containers/load_spack_noaa-intel.sh +############################################################################## + +#Parse Arguments +branch=main +commit="" +while [[ $# -gt 0 ]]; do + case $1 in + -b|--branch) + branch="$2" + shift # past argument + shift # past value + ;; + -h|--hash) + commit="$2" + shift # past argument + shift # past value + ;; + -c|--config) + config="$2" + shift # past argument + shift # past value + ;; + --hydro) + hydro="$2" + shift # past argument + shift # past value + ;; + --bit) + bit="$2" + shift # past argument + shift # past value + ;; + -m|--mode) + mode="$2" + shift # past argument + shift # past value + ;; + *) + echo "unknown argument" + exit 1 + ;; + esac +done + +if [ -z $mode ] || [ -z $bit ] || [ -z $hydro ] || [ -z $config ] + then + echo "must specify config, hydro, bit, and mode options for compile" + exit 1 +fi + +echo "branch is $branch" +echo "commit is $commit" +echo "mode is $mode" +echo "bit is $bit" +echo "hydro is $hydro" +echo "config is $config" + +if [ $hydro = "sw" ] && [ $config = "shield" ] + then + echo "this combination should not be tested" + else + ## Set up the directories + testDir=${dirRoot}/${intelVersion}/SHiELD_physics/${branch}/${commit} + logDir=${testDir}/log + # Set up build + cd ${testDir}/SHiELD_build/Build + #Define External Libs path + export EXTERNAL_LIBS=${dirRoot}/${intelVersion}/SHiELD_physics/externallibs + # Build SHiELD + set -o pipefail + singularity exec -B /contrib ${container} ${container_env_script} "./COMPILE ${config} ${hydro} ${bit} ${mode} intel clean" +fi diff --git a/.github/.parallelworks/run_test.sh b/.github/.parallelworks/run_test.sh new file mode 100755 index 00000000..355f9ee2 --- /dev/null +++ 
b/.github/.parallelworks/run_test.sh @@ -0,0 +1,89 @@ +#!/bin/bash -xe +ulimit -s unlimited +############################################################################## +## User set up variables +## Root directory for CI +dirRoot=/contrib/fv3 +## Intel version to be used +intelVersion=2023.2.0 +############################################################################## +## HPC-ME container +container=/contrib/containers/noaa-intel-prototype_2023.09.25.sif +container_env_script=/contrib/containers/load_spack_noaa-intel.sh +############################################################################## + +#Parse Arguments +branch=main +commit="" +while [[ $# -gt 0 ]]; do + case $1 in + -b|--branch) + branch="$2" + shift # past argument + shift # past value + ;; + -h|--hash) + commit="$2" + shift # past argument + shift # past value + ;; + -t|--test) + testname="$2" + shift # past argument + shift # past value + ;; + *) + echo "unknown argument" + exit 1 + ;; + esac +done + +if [ -z $testname ] + then + echo "must specify a test name with -t" + exit 1 +fi + +echo "branch is $branch" +echo "commit is $commit" +echo "test is $testname" + +## Set up the directories +MODULESHOME=/usr/share/lmod/lmod +testDir=${dirRoot}/${intelVersion}/SHiELD_physics/${branch}/${commit} +logDir=${testDir}/log +baselineDir=${dirRoot}/baselines/intel/${intelVersion} + +## Run the CI Test +# Define the builddir testscriptdir and rundir +# Set the BUILDDIR for the test script to use +export BUILDDIR="${testDir}/SHiELD_build" +testscriptDir=${BUILDDIR}/RTS/CI +runDir=${BUILDDIR}/CI/BATCH-CI + +# Run CI test scripts +cd ${testscriptDir} +set -o pipefail +# Execute the test piping output to log file +./${testname} " --partition=p2 --mpi=pmi2 --job-name=${commit}_${testname} singularity exec -B /contrib ${container} ${container_env_script}" |& tee ${logDir}/run_${testname}.log + +## Compare Restarts to Baseline +#The following tests are not expected to have run-to-run reproducibility: 
+#d96_2k.solo.bubble +#d96_2k.solo.bubble.n0 +#d96_2k.solo.bubble.nhK +if [[ ${testname} == "d96_2k.solo.bubble" || ${testname} == "d96_2k.solo.bubble.n0" || ${testname} == "d96_2k.solo.bubble.nhK" ]] + then + echo "${testname} is not expected to reproduce so answers were not compared" + else + source $MODULESHOME/init/sh + export MODULEPATH=/mnt/shared/manual_modules:/usr/share/modulefiles/Linux:/usr/share/modulefiles/Core:/usr/share/lmod/lmod/modulefiles/Core:/apps/modules/modulefiles:/apps/modules/modulefamilies/intel + module load intel/2022.1.2 + module load netcdf + module load nccmp + for resFile in `ls ${baselineDir}/${testname}` + do + nccmp -d ${baselineDir}/${testname}/${resFile} ${runDir}/${testname}/RESTART/${resFile} + done +fi From 4b5fd232478a712d1df056c35af7648c1c7be7ca Mon Sep 17 00:00:00 2001 From: Lauren Chilutti <60401591+laurenchilutti@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:35:50 -0400 Subject: [PATCH 2/2] Update Intel_Parallelworks_CI.yaml to now build full shield and not solo shield. Still to do: add a full shield test run. Right now it only checks out code and builds shield --- .github/workflows/Intel_Parallelworks_CI.yaml | 170 ++++++------------ 1 file changed, 53 insertions(+), 117 deletions(-) diff --git a/.github/workflows/Intel_Parallelworks_CI.yaml b/.github/workflows/Intel_Parallelworks_CI.yaml index f814b114..4c08d8bf 100644 --- a/.github/workflows/Intel_Parallelworks_CI.yaml +++ b/.github/workflows/Intel_Parallelworks_CI.yaml @@ -1,4 +1,4 @@ -name: Compile SHiELD SOLO and run tests +name: Compile SHiELD and run tests # This GitHub Action Workflow is running on the cloud shieldphysicsciintel cluster # The tests are run inside of a container with the following software/libraries: @@ -30,131 +30,67 @@ jobs: # so this salloc will prompt 46 nodes to startup and stay active for 20 min # this is enough nodes for the first 17 tests to run in parallel, and we # have 17 runners configured. 
- - run: salloc --partition=p2 -N 46 -J $GITHUB_SHA sleep 20m & - - run: /contrib/fv3/SHiELD_physics_CI/checkout.sh $GITHUB_REF $GITHUB_SHA +# - run: salloc --partition=p2 -N 46 -J $GITHUB_SHA sleep 20m & + - run: /contrib/fv3/SHiELD_physics_CI/checkout.sh -b $GITHUB_REF -h $GITHUB_SHA build: if: github.repository == 'NOAA-GFDL/SHiELD_physics' runs-on: [shieldphysicsciintel] - name: SOLO SHiELD build + name: SHiELD build needs: [checkout] strategy: fail-fast: true - max-parallel: 3 - matrix: - runpath: [/contrib/fv3/SHiELD_physics_CI/] - runscript: [swcompile.sh, nhcompile.sh, hydrocompile.sh] - steps: - - env: - RUNPATH: ${{ matrix.runpath }} - RUNSCRIPT: ${{ matrix.runscript }} - run: $RUNPATH/$RUNSCRIPT $GITHUB_REF $GITHUB_SHA - - test: - if: github.repository == 'NOAA-GFDL/SHiELD_physics' - runs-on: [shieldphysicsciintel] - name: SOLO SHiELD test suite - needs: [checkout, build] - strategy: - fail-fast: false max-parallel: 17 matrix: - runpath: [/contrib/fv3/SHiELD_physics_CI/] - runscript: - # These are placed in order of largest to smallest jobs - #layout 8,8 needs 8 nodes on dvcimultiintel cluster - - C512r20.solo.superC.sh - - C768.sw.BTwave.sh - #layout 4,8 needs 4 nodes on dvcimultiintel cluster - - C256r20.solo.superC.sh - - C384.sw.BLvortex.sh - #layout 4,4 needs 2 nodes on dvcimultiintel cluster - - C128r20.solo.superC.sh - - C128r3.solo.TC.d1.sh - - C128r3.solo.TC.h6.sh - - C128r3.solo.TC.sh - - C128r3.solo.TC.tr8.sh - - C192.sw.BLvortex.sh - - C192.sw.BTwave.sh - - C192.sw.modon.sh - - C384.sw.BTwave.sh - #layout 4,1 and 2,2 need 1 node on dvcimultiintel cluster - - C96.solo.BCdry.hyd.sh - - C96.solo.BCdry.sh - - C96.solo.BCmoist.hyd.d3.sh - - C96.solo.BCmoist.hyd.sh - - C96.solo.BCmoist.nhK.sh - - C96.solo.BCmoist.sh - - C96.solo.mtn_rest.hyd.diff2.sh - - C96.solo.mtn_rest.hyd.sh - - C96.solo.mtn_rest.nonmono.diff2.sh - - C96.solo.mtn_rest.sh - - C96.sw.BLvortex.sh - - C96.sw.BTwave.sh - - C96.sw.modon.sh - - C96.sw.RHwave.sh - - 
d96_1k.solo.mtn_rest_shear.olddamp.sh - - d96_1k.solo.mtn_rest_shear.sh - - d96_1k.solo.mtn_schar.mono.sh - - d96_1k.solo.mtn_schar.sh - - d96_2k.solo.bubble.n0.sh - - d96_2k.solo.bubble.nhK.sh - - d96_2k.solo.bubble.sh - - d96_500m.solo.mtn_schar.sh + runscript: [/contrib/fv3/SHiELD_physics_CI/compile.sh] + config: [shield] + hydro: [nh] + bit: [32bit] + mode: [repro] steps: - # This will end the slurm job started in the checkout job - - run: scancel -n $GITHUB_SHA - env: - RUNPATH: ${{ matrix.runpath }} RUNSCRIPT: ${{ matrix.runscript }} - run: $RUNPATH/$RUNSCRIPT $GITHUB_REF $GITHUB_SHA - shutdown: - if: always() && github.repository == 'NOAA-GFDL/SHiELD_physics' - runs-on: [shieldphysicsciintel] - name: Shutdown Processes - needs: [checkout, build, test] - strategy: - fail-fast: false - max-parallel: 17 - matrix: - test: - - C512r20.solo.superC - - C768.sw.BTwave - - C256r20.solo.superC - - C384.sw.BLvortex - - C128r20.solo.superC - - C128r3.solo.TC.d1 - - C128r3.solo.TC.h6 - - C128r3.solo.TC - - C128r3.solo.TC.tr8 - - C192.sw.BLvortex - - C192.sw.BTwave - - C192.sw.modon - - C384.sw.BTwave - - C96.solo.BCdry.hyd - - C96.solo.BCdry - - C96.solo.BCmoist.hyd.d3 - - C96.solo.BCmoist.hyd - - C96.solo.BCmoist.nhK - - C96.solo.BCmoist - - C96.solo.mtn_rest.hyd.diff2 - - C96.solo.mtn_rest.hyd - - C96.solo.mtn_rest.nonmono.diff2 - - C96.solo.mtn_rest - - C96.sw.BLvortex - - C96.sw.BTwave - - C96.sw.modon - - C96.sw.RHwave - - d96_1k.solo.mtn_rest_shear.olddamp - - d96_1k.solo.mtn_rest_shear - - d96_1k.solo.mtn_schar.mono - - d96_1k.solo.mtn_schar - - d96_2k.solo.bubble.n0 - - d96_2k.solo.bubble.nhK - - d96_2k.solo.bubble - - d96_500m.solo.mtn_schar - steps: - - run: scancel -n $GITHUB_SHA - - env: - JOB: ${{ github.sha }}_${{ matrix.test }} - run: scancel -n $JOB + CONFIG: ${{ matrix.config }} + HYDRO: ${{ matrix.hydro }} + BIT: ${{ matrix.bit }} + MODE: ${{ matrix.mode }} + run: $RUNSCRIPT -b $GITHUB_REF -h $GITHUB_SHA -c $CONFIG --hydro $HYDRO --bit $BIT -m $MODE + 
+# test: +# if: github.repository == 'NOAA-GFDL/SHiELD_physics' +# runs-on: [shieldphysicsciintel] +# name: SHiELD test suite +# needs: [checkout, build] +# strategy: +# fail-fast: false +# max-parallel: 17 +# matrix: +# runscript: [/contrib/fv3/SHiELD_physics_CI/run_test.sh] +# argument: +# - test1 +# - test2 +# steps: +# # This will end the slurm job started in the checkout job +# - run: scancel -n $GITHUB_SHA +# - env: +# RUNSCRIPT: ${{ matrix.runscript }} +# ARG1: ${{ matrix.argument }} +# run: $RUNSCRIPT -t $ARG1 -b $GITHUB_REF -h $GITHUB_SHA +# shutdown: +# if: always() && github.repository == 'NOAA-GFDL/SHiELD_physics' +# runs-on: [shieldphysicsciintel] +# name: Shutdown Processes +# needs: [checkout, build, test] +# needs: [checkout, build] +# strategy: +# fail-fast: false +# max-parallel: 17 +# matrix: +# test: +# - test2 +# - test2 +# steps: +# - run: scancel -n $GITHUB_SHA +# - env: +# JOB: ${{ github.sha }}_${{ matrix.test }} +# run: scancel -n $JOB