Skip to content

Commit

Permalink
doc: update HPC documents (#1613)
Browse files Browse the repository at this point in the history
add python script that can poll memory usage on HPC system
  • Loading branch information
jdhughes-usgs authored Feb 12, 2024
1 parent 07f2a9a commit e4703cf
Show file tree
Hide file tree
Showing 5 changed files with 237 additions and 32 deletions.
30 changes: 0 additions & 30 deletions .hpc/BUILD.md

This file was deleted.

68 changes: 68 additions & 0 deletions .hpc/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@

# Building MODFLOW 6 on HPC systems

_On Denali_

```
sbatch --reservation=dev cray-meson-build.slurm.batch
```

_Hovenweep_

```
sbatch --reservation=dev cray-hovenweep-meson-build.slurm.batch
```


## Create a module file for a new version of MODFLOW 6

On _Denali_ make a copy of an existing module file using
```
rsync /home/software/denali/contrib/impd/modulefiles/modflow/6.5.0.dev0 /home/software/denali/contrib/impd/modulefiles/modflow/6.x.x
```
On _Hovenweep_ make a copy of an existing module file using
```
rsync /home/software/hovenweep/contrib/impd/modulefiles/modflow/6.5.0.dev0 /home/software/hovenweep/contrib/impd/modulefiles/modflow/6.x.x
```

Edit `product_version` in the new module file from `6.5.0.dev0` to `6.x.x` on both systems.


## Profiling memory usage

The `sstat_poll.py` script can be used to profile the memory usage of a job while it is running. It uses the SLURM utility `sstat` to poll a job at a fixed interval (default is 30 sec.). The script cannot be run with Python 2.7 (Python 3.8 or higher is required). On both Denali and Hovenweep, load the cray-python module using

```
module load cray-python
```

After loading cray-python run the script from any location on Denali or Hovenweep using

```
python sstat_poll.py JobID
```

where `JobID` is the SLURM JobID of the job you want to profile. Additional user controls can be specified and can be identified using

```
python sstat_poll.py -h
```

Currently available options include

```
usage: sstat_poll [-h] [--format FORMAT] [--output OUTPUT] [--prefix PREFIX] [--command COMMAND] [--interval INTERVAL] jobid
python script for polling a SLURM job while it is running on a fixed interval. The python uses the SLURM command 'sstat' to return information on the job. By default, the script returns JobID, AveCPU, AveRSS, and MaxRSS but other data can be returned by specifying the format argument (--format=JobID,AveCPU,AveRSS,MaxRSS,...).
positional arguments:
jobid SLURM JobID
options:
-h, --help show this help message and exit
--format FORMAT SLURM sstat format string (default is JobID,AveCPU,AveRSS,MaxRSS)
--output OUTPUT Output file (default is None)
--prefix PREFIX Output file prefix (default is None)
--command COMMAND SLURM function (default is sstat)
--interval INTERVAL polling interval in sec. (default is 30.0 sec.)
```
2 changes: 1 addition & 1 deletion .hpc/cray-hovenweep-meson-build.slurm.batch
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ set -euxo pipefail

# load appropriate modules
module switch PrgEnv-${PE_ENV,,} PrgEnv-intel
module load petsc/3.15.5
module load petsc/3.15.5 meson/1.2.1 ninja/1.11.1
export PKG_CONFIG_PATH=$CRAY_MPICH_DIR/lib/pkgconfig:$PKG_CONFIG_PATH

# list loaded modules
Expand Down
2 changes: 1 addition & 1 deletion .hpc/cray-meson-build.slurm.batch
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ TESTDIR=$MODFLOW6ROOT/.mf6minsim
PREFIX=/home/software/denali/contrib/impd/apps/modflow/$VERSION/$PE_ENV/19.1.0.166

# build MODFLOW 6
CC=cc CXX=CC F77=ftn F90=ftn FC=ftn meson setup $BUILDDIR --prefix=$PREFIX --bindir=bin --libdir=lib -Dcray=true -Ddebug=false
CC=cc CXX=CC F77=ftn F90=ftn FC=ftn meson setup $BUILDDIR --prefix=$PREFIX --bindir=bin --libdir=lib -Dcray=true -Ddebug=false --wipe
meson compile -C $BUILDDIR

# install MODFLOW 6
Expand Down
167 changes: 167 additions & 0 deletions .hpc/sstat_poll.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import argparse
import pathlib as pl
import sys
import time
from subprocess import PIPE, STDOUT, Popen


def _build_command(
cmd: str,
jobid: int,
fmt: str,
with_header: bool = True,
) -> list:
cmd_args = [cmd]
cmd_args.append(f"-j {jobid}")
cmd_args.append(f"--format={fmt}")
if not with_header:
cmd_args.append("--noheader")
return cmd_args


def _run_command(
    cmd: str,
    jobid: int,
    end_msg: str,
    fmt: str,
    with_header: bool = True,
    silent: bool = False,
) -> list:
    """Run one SLURM status query and collect its output as CSV rows.

    Parameters
    ----------
    cmd : str
        Name or path of the SLURM executable to run.
    jobid : int
        SLURM JobID to query.
    end_msg : str
        Text that, when found in an output line, marks the job as no longer
        having running steps.
    fmt : str
        Format string forwarded to the command.
    with_header : bool
        When False, the command is asked not to print its header.
    silent : bool
        When True, output lines are not echoed to stdout.

    Returns
    -------
    list or None
        One string per retained output line: the whitespace-separated fields
        joined with commas, followed by the local time at which the line was
        read. ``None`` is returned when ``end_msg`` or ``".ext+"`` appears in
        the output, or when the command cannot be started or its output
        cannot be decoded.
    """
    cmd_args = _build_command(cmd, jobid, fmt, with_header=with_header)
    result = []
    try:
        # The context manager closes the pipe and waits for the child on
        # every exit path, so repeated polling does not leak file
        # descriptors or leave unreaped processes behind.
        with Popen(cmd_args, stdout=PIPE, stderr=STDOUT, cwd=".") as proc:
            for raw_line in proc.stdout:
                line = raw_line.decode("utf-8").rstrip("\r\n")
                if not line:
                    # A blank line ends the scan, as does end-of-stream.
                    break
                if end_msg in line or ".ext+" in line:
                    return None
                if not silent:
                    print(line)
                if "-----" in line:
                    # Separator row under the header; echoed but not stored.
                    continue
                timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
                result.append(",".join(line.split()) + "," + timestamp)
    except (OSError, ValueError):
        # OSError: the executable is missing or cannot be launched.
        # ValueError: includes UnicodeDecodeError for undecodable output.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
    return result


# Command-line entry point: parse the options, probe the SLURM command once,
# then poll the job at a fixed interval and append every sample to a CSV file
# until the command reports that the job has no running steps.
if __name__ == "__main__":
    description = (
        "python script for polling a SLURM job "
        + "while it is running on a fixed interval. "
        + "The python uses the SLURM command 'sstat' "
        + "to return information on the job. By default, "
        + "the script returns JobID, AveCPU, AveRSS, "
        + "and MaxRSS but other data can be returned "
        + "by specifying the format argument "
        + "(--format=JobID,AveCPU,AveRSS,MaxRSS,...)."
    )
    parser = argparse.ArgumentParser(
        "sstat_poll",
        description=description,
    )
    parser.add_argument("jobid", help="SLURM JobID", type=int)
    parser.add_argument(
        "--format",
        help="SLURM sstat format string (default is JobID,AveCPU,AveRSS,MaxRSS)",
        type=str,
        default="JobID,AveCPU,AveRSS,MaxRSS",
        required=False,
    )
    parser.add_argument(
        "--output",
        help="Output file (default is None)",
        type=str,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--prefix",
        help="Output file prefix (default is None)",
        type=str,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--command",
        help="SLURM function (default is sstat)",
        type=str,
        required=False,
        default="sstat",
    )
    parser.add_argument(
        "--interval",
        help="polling interval in sec. (default is 30.0 sec.) ",
        type=float,
        required=False,
        default=30.0,
    )
    slurm_args = parser.parse_args()

    if sys.version_info < (3, 8):
        sys.exit("Python version must be 3.8 or higher.")

    print(f"SLURM command: {slurm_args.command}")
    print(f"JobID: {slurm_args.jobid}")

    # An explicit --output wins; otherwise the file name is derived from the
    # JobID and command name, optionally preceded by --prefix.
    if slurm_args.output is None:
        output_path = f"{slurm_args.jobid}.{slurm_args.command}.csv"
        if slurm_args.prefix is not None:
            output_path = f"{slurm_args.prefix}.{output_path}"
        output_path = pl.Path(output_path)
    else:
        output_path = pl.Path(slurm_args.output)
    print(f"output path: {output_path}")

    # Message the SLURM command is expected to print once the job has no
    # running steps; _run_command returns None when it sees this text.
    end_msg = (
        f"{slurm_args.command}: error: no steps "
        + f"running for job {slurm_args.jobid}"
    )

    # Probe once before creating the output file. _run_command returns None
    # both when the executable cannot be launched and when the job has no
    # running steps, so the error message has to cover both causes.
    if (
        _run_command(
            slurm_args.command,
            slurm_args.jobid,
            end_msg,
            slurm_args.format,
            silent=True,
        )
        is None
    ):
        raise ValueError(
            f"SLURM command '{slurm_args.command}' could not be run "
            + f"or job {slurm_args.jobid} has no running steps"
        )

    with open(output_path, "w") as f:
        # Only the first successful poll requests the header row.
        with_header = True
        while True:
            result = _run_command(
                slurm_args.command,
                slurm_args.jobid,
                end_msg,
                slurm_args.format,
                with_header=with_header,
            )
            if result is None:
                # Job finished or the command failed: stop polling.
                break
            with_header = False
            f.writelines(f"{line}\n" for line in result)
            # Flush after every poll so the file can be followed while the
            # job runs and no samples are lost if the poller is killed.
            f.flush()
            time.sleep(slurm_args.interval)

0 comments on commit e4703cf

Please sign in to comment.