#!/bin/bash
#============================
# Run forecast for a variety of
# node types. Need to set:
# run dir
# number of MPI tasks per node
# number of nodes in request
# Slurm partition
#
# There are always 6 OpenMP
# threads being used, so the
# smallest nodes must have
# at least 6 CPU with 1 MPI
# task/rank per node.
#
# For larger nodes, work in multiples
# of 6. So for a c2-standard-60, there
# are 30 real CPU (hyperthreading is
# turned off) -> up to 5 MPI tasks/ranks
# can be activated on one node (30 CPU).
#
# But we always need a total of
# 32 MPI ranks, so even if we
# pack 5 ranks per node, we cannot
# spread 32 ranks evenly across
# 7 nodes; instead we use 8 nodes
# with 4 ranks per node.
#============================
# Key inputs
export RUNDIR=$1
export NUM_NODES=$2
export NUM_MPI_TASK_PER_NODE=$3
export PARTITION=$4
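# Example invocation (hypothetical values, for illustration only):
#   ./run_general.sh /home/${USER}/wrf_conus12km 8 4 compute
# i.e. 8 nodes x 4 MPI ranks/node = 32 ranks, with 6 OpenMP threads per rank.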
echo Starting general launcher for WRF...
echo RUNDIR is $RUNDIR
echo NUM_NODES is $NUM_NODES
echo NUM_MPI_TASK_PER_NODE is $NUM_MPI_TASK_PER_NODE
echo PARTITION is $PARTITION
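# Optional sanity check (a sketch, not part of the original workflow): the
# header above assumes a total of 32 MPI ranks, so warn if the layout differs.
total_ranks=$(( NUM_NODES * NUM_MPI_TASK_PER_NODE ))
if [ "${total_ranks}" -ne 32 ]; then
  echo "WARNING: ${NUM_NODES} nodes x ${NUM_MPI_TASK_PER_NODE} ranks/node = ${total_ranks} ranks, not 32."
fi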
# Source .bashrc because this sets up spack.
source ~/.bashrc
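# Optional guard (an assumption, not in the original script): confirm spack is
# actually available after sourcing ~/.bashrc, since everything below needs it.
if ! command -v spack >/dev/null 2>&1; then
  echo "ERROR: spack not found after sourcing ~/.bashrc"
  exit 1
fi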
# Move data to home with local.setup.sh
# (Not done automatically here in case
# cluster is persistent.)
# Create launch script
# Modifications wrt version posted by Smith et al. (2020):
# 1. Adjust --nodes and --ntasks-per-node to match number
# of CPU = vCPU/2 on instance. For example, 2 x 16 = 4 x 8.
# The number of CPU available should be >= --ntasks-per-node x OMP_NUM_THREADS
# This particular model needs 2 x 16 x 6 = 4 x 8 x 6 = 192 threads.
# In the two cases above, 16 x 6 corresponds to 2 instances with 96 CPUs each,
# and 8 x 6 corresponds to 4 instances with 48 CPUs each.
# 2. We do not need to module load libfabric-aws or set any EFA env vars
# since this is on GCE. GCE gvnic env vars are autoconfigured.
# 3. Change the output logging from %j to %J.%t
# Go to wherever the WRF run directory is.
cd ${RUNDIR}
# This env var needs to match the version
# of IntelMPI. For the general_install,
# IntelMPI is 2022, see https://cloud.google.com/architecture/best-practices-for-using-mpi-on-compute-engine#use_intel_mpi
#export I_MPI_FABRICS="ofi_rxm;tcp"
# A newer version of oneAPI (2021.12.2) still gives error messages;
# experiment with shm:ofi and setting I_MPI_OFI_PROVIDER to tcp?
# See:
# https://www.intel.com/content/www/us/en/docs/mpi-library/developer-reference-linux/2021-8/communication-fabrics-control.html
# https://www.intel.com/content/www/us/en/docs/mpi-library/developer-reference-linux/2021-8/ofi-capable-network-fabrics-control.html
# This variable should already be set by default
# on Parallel Works cloud clusters based on your CSP.
#export I_MPI_FABRICS="shm:tcp"
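# One possible experiment mentioned above (commented out, unverified here):
# keep shared memory on-node and force the OFI provider to TCP between nodes.
#export I_MPI_FABRICS="shm:ofi"
#export I_MPI_OFI_PROVIDER=tcp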
cat > slurm-wrf-conus12km.sh <<EOF
#!/bin/bash
#SBATCH --job-name=WRF
#SBATCH --output=conus-%J.%t.out
#SBATCH --nodes=$NUM_NODES
#SBATCH --ntasks-per-node=$NUM_MPI_TASK_PER_NODE
#SBATCH --exclusive
#SBATCH --partition=$PARTITION
spack load intel-oneapi-mpi
spack load wrf
wrf_exe=$(spack location -i wrf)/run/wrf.exe
set -x
ulimit -s unlimited
ulimit -a
export OMP_NUM_THREADS=6
# Pass the launcher's current I_MPI_FABRICS value (if set) into the job script.
export I_MPI_FABRICS=$I_MPI_FABRICS
export I_MPI_PIN_DOMAIN=omp
export KMP_AFFINITY=compact
export I_MPI_DEBUG=6
time mpiexec.hydra -np \$SLURM_NTASKS --ppn \$SLURM_NTASKS_PER_NODE \$wrf_exe
# Record WRF's exit status (escaped so it expands when the job runs, not here).
echo \$? > wrf.exit.code
EOF
# Run it!
echo; echo "Running sbatch slurm-wrf-conus12km.sh from ${PWD}"
# Launch job with sbatch and get the job ID from the sbatch output.
slurm_job_id=$(sbatch slurm-wrf-conus12km.sh | awk '{print $4}')
# Monitor running job until it finishes.
while squeue -j $slurm_job_id | grep -q $slurm_job_id; do
squeue -u ${USER}
echo WRF job $slurm_job_id is still running. Wait 10 s and check again...
sleep 10
done
squeue -u ${USER}
echo WRF job $slurm_job_id is complete!
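# Optional: report the exit status recorded by the batch script (a sketch;
# assumes wrf.exit.code was written into ${RUNDIR} by the job above).
if [ -f wrf.exit.code ]; then
  echo "WRF exit code: $(cat wrf.exit.code)"
fi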
# Clean up
#rm -f slurm-wrf-conus12km.sh