From e4703cfcd68328129634936a7f06faaf4994ef6a Mon Sep 17 00:00:00 2001 From: jdhughes-usgs Date: Mon, 12 Feb 2024 12:33:33 -0800 Subject: [PATCH] doc: update HPC documents (#1613) add python script that can poll memory usage on HPC system --- .hpc/BUILD.md | 30 ---- .hpc/README.md | 68 ++++++++ .hpc/cray-hovenweep-meson-build.slurm.batch | 2 +- .hpc/cray-meson-build.slurm.batch | 2 +- .hpc/sstat_poll.py | 167 ++++++++++++++++++++ 5 files changed, 237 insertions(+), 32 deletions(-) delete mode 100644 .hpc/BUILD.md create mode 100644 .hpc/README.md create mode 100644 .hpc/sstat_poll.py diff --git a/.hpc/BUILD.md b/.hpc/BUILD.md deleted file mode 100644 index 9f9b0ac8374..00000000000 --- a/.hpc/BUILD.md +++ /dev/null @@ -1,30 +0,0 @@ - -# Building MODFLOW 6 on HPC systems - -_On Denali_ - -``` -sbatch --reservation=dev cray-meson-build.slurm.batch -``` - -_Hovenweep_ - -``` -sbatch cray-hovenweep-meson-build.slurm.batch -``` - - -## Create a module file for a new version of MODFLOW 6 - -On _Denali_ make a copy of an existing module file using -``` -rsync /home/software/denali/contrib/impd/modulefiles/modflow/6.5.0.dev0 /home/software/denali/contrib/impd/modulefiles/modflow/6.x.x -``` -On _Hovenweep_ make a copy of an existing module file using -``` -rsync /home/software/hovenweep/contrib/impd/modulefiles/modflow/6.5.0.dev0 /home/software/denali/contrib/impd/modulefiles/modflow/6.x.x -``` - -Edit `product_version` in the new module file from `6.5.0.dev0` to `6.x.x` on both systems. 
- - diff --git a/.hpc/README.md b/.hpc/README.md new file mode 100644 index 00000000000..391ae7e64e4 --- /dev/null +++ b/.hpc/README.md @@ -0,0 +1,68 @@ + +# Building MODFLOW 6 on HPC systems + +_On Denali_ + +``` +sbatch --reservation=dev cray-meson-build.slurm.batch +``` + +_Hovenweep_ + +``` +sbatch --reservation=dev cray-hovenweep-meson-build.slurm.batch +``` + + +## Create a module file for a new version of MODFLOW 6 + +On _Denali_ make a copy of an existing module file using +``` +rsync /home/software/denali/contrib/impd/modulefiles/modflow/6.5.0.dev0 /home/software/denali/contrib/impd/modulefiles/modflow/6.x.x +``` +On _Hovenweep_ make a copy of an existing module file using +``` +rsync /home/software/hovenweep/contrib/impd/modulefiles/modflow/6.5.0.dev0 /home/software/denali/contrib/impd/modulefiles/modflow/6.x.x +``` + +Edit `product_version` in the new module file from `6.5.0.dev0` to `6.x.x` on both systems. + + +## Profiling memory usage + +The `sstat_poll.py` script can be used to profile the memory usage of a job while it is running. It uses the SLURM utility `sstat` to profile a job at a fixed interval (default is 30 sec.). The script requires Python 3.8 or higher and cannot be run with Python 2.7. On both Denali and Hovenweep, load the `cray-python` module using + +``` +module load cray-python +``` + +After loading cray-python, run the script from any location on Denali or Hovenweep using + +``` +python sstat_poll.py JobID +``` + +where `JobID` is the SLURM JobID of the job you want to profile. Additional user controls can be specified; they can be listed using + +``` +python sstat_poll.py -h +``` + +Currently available options include + +``` +usage: sstat_poll [-h] [--format FORMAT] [--output OUTPUT] [--prefix PREFIX] [--command COMMAND] [--interval INTERVAL] jobid + +python script for polling a SLURM job while it is running on a fixed interval. The python uses the SLURM command 'sstat' to return information on the job. 
By default, the script returns JobID, AveCPU, AveRSS, and MaxRSS but other data can be returned by specifying the format argument (--format=JobID,AveCPU,AveRSS,MaxRSS,...). + +positional arguments: + jobid SLURM JobID + +options: + -h, --help show this help message and exit + --format FORMAT SLURM sstat format string (default is JobID,AveCPU,AveRSS,MaxRSS) + --output OUTPUT Output file (default is None) + --prefix PREFIX Output file prefix (default is None) + --command COMMAND SLURM function (default is sstat) + --interval INTERVAL polling interval in sec. (default is 30.0 sec.) +``` diff --git a/.hpc/cray-hovenweep-meson-build.slurm.batch b/.hpc/cray-hovenweep-meson-build.slurm.batch index a1020f54cf0..56ca5d7645a 100644 --- a/.hpc/cray-hovenweep-meson-build.slurm.batch +++ b/.hpc/cray-hovenweep-meson-build.slurm.batch @@ -12,7 +12,7 @@ set -euxo pipefail # load appropriate modules module switch PrgEnv-${PE_ENV,,} PrgEnv-intel -module load petsc/3.15.5 +module load petsc/3.15.5 meson/1.2.1 ninja/1.11.1 export PKG_CONFIG_PATH=$CRAY_MPICH_DIR/lib/pkgconfig:$PKG_CONFIG_PATH # list loaded modules diff --git a/.hpc/cray-meson-build.slurm.batch b/.hpc/cray-meson-build.slurm.batch index 1f62d43f6f3..d6633b078b5 100644 --- a/.hpc/cray-meson-build.slurm.batch +++ b/.hpc/cray-meson-build.slurm.batch @@ -34,7 +34,7 @@ TESTDIR=$MODFLOW6ROOT/.mf6minsim PREFIX=/home/software/denali/contrib/impd/apps/modflow/$VERSION/$PE_ENV/19.1.0.166 # build MODFLOW 6 -CC=cc CXX=CC F77=ftn F90=ftn FC=ftn meson setup $BUILDDIR --prefix=$PREFIX --bindir=bin --libdir=lib -Dcray=true -Ddebug=false +CC=cc CXX=CC F77=ftn F90=ftn FC=ftn meson setup $BUILDDIR --prefix=$PREFIX --bindir=bin --libdir=lib -Dcray=true -Ddebug=false --wipe meson compile -C $BUILDDIR # install MODFLOW 6 diff --git a/.hpc/sstat_poll.py b/.hpc/sstat_poll.py new file mode 100644 index 00000000000..12e60f101a0 --- /dev/null +++ b/.hpc/sstat_poll.py @@ -0,0 +1,167 @@ +import argparse +import pathlib as pl +import sys +import time 
import shutil
from subprocess import PIPE, STDOUT, Popen
from typing import List, Optional


def _build_command(
    cmd: str,
    jobid: int,
    fmt: str,
    with_header: bool = True,
) -> List[str]:
    """Build the argument vector used to poll a SLURM job.

    Parameters
    ----------
    cmd : str
        Name or path of the executable to run (``sstat`` by default in
        this script).
    jobid : int
        SLURM JobID passed to the ``-j`` option.
    fmt : str
        Comma-separated field list passed as ``--format=<fmt>``.
    with_header : bool, optional
        When False, ``--noheader`` is appended to the arguments.

    Returns
    -------
    list of str
        Arguments suitable for ``subprocess.Popen`` without a shell.
    """
    # The option and its value are separate argv entries so the value
    # does not reach the command with a leading space (as it would with a
    # single "-j <jobid>" element).
    cmd_args = [cmd, "-j", str(jobid), f"--format={fmt}"]
    if not with_header:
        cmd_args.append("--noheader")
    return cmd_args


def _run_command(
    cmd: str,
    jobid: int,
    end_msg: str,
    fmt: str,
    with_header: bool = True,
    silent: bool = False,
) -> Optional[List[str]]:
    """Run one poll of the SLURM command and collect its output as CSV rows.

    Parameters
    ----------
    cmd : str
        Name or path of the executable to run.
    jobid : int
        SLURM JobID to poll.
    end_msg : str
        Text that marks the end of polling when it appears in an output
        line.
    fmt : str
        Format string forwarded to the command.
    with_header : bool, optional
        Forwarded to ``_build_command``; False suppresses the header.
    silent : bool, optional
        When True, output lines are not echoed to stdout.

    Returns
    -------
    list of str or None
        One entry per output line that does not contain ``-----``; the
        whitespace-separated fields are joined with commas and the local
        time (``%Y-%m-%d %H:%M:%S``) is appended as the last field.
        None is returned when the command cannot be started, or when a
        line contains ``end_msg`` or ``.ext+``.

    Notes
    -----
    Every kept line receives the timestamp, including the header line
    when ``with_header`` is True.
    """
    cmd_args = _build_command(cmd, jobid, fmt, with_header=with_header)
    result = []
    try:
        # The context manager closes the pipe and reaps the child even
        # when the loop is left early.
        with Popen(cmd_args, stdout=PIPE, stderr=STDOUT, cwd=".") as proc:
            for raw in proc.stdout:
                line = raw.decode("utf-8", errors="replace").rstrip("\r\n")
                if not line:
                    # a blank line ends the useful output
                    break
                # NOTE(review): ".ext+" is presumably the truncated
                # "<jobid>.extern" step name -- confirm against sstat.
                if end_msg in line or ".ext+" in line:
                    return None
                if not silent:
                    print(line)
                if "-----" not in line:
                    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
                    result.append(",".join(line.split()) + "," + stamp)
    except OSError:
        # Raised when the executable is missing or cannot be executed.
        # Anything else (e.g. KeyboardInterrupt) is left to propagate.
        return None
    return result


def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command line (``sys.argv[1:]`` when ``argv`` is None)."""
    description = (
        "python script for polling a SLURM job "
        "while it is running on a fixed interval. "
        "The python uses the SLURM command 'sstat' "
        "to return information on the job. By default, "
        "the script returns JobID, AveCPU, AveRSS, "
        "and MaxRSS but other data can be returned "
        "by specifying the format argument "
        "(--format=JobID,AveCPU,AveRSS,MaxRSS,...)."
    )
    parser = argparse.ArgumentParser(
        "sstat_poll",
        description=description,
    )
    parser.add_argument("jobid", help="SLURM JobID", type=int)
    parser.add_argument(
        "--format",
        help=(
            "SLURM sstat format string "
            "(default is JobID,AveCPU,AveRSS,MaxRSS)"
        ),
        type=str,
        default="JobID,AveCPU,AveRSS,MaxRSS",
        required=False,
    )
    parser.add_argument(
        "--output",
        help="Output file (default is None)",
        type=str,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--prefix",
        help="Output file prefix (default is None)",
        type=str,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--command",
        help="SLURM function (default is sstat)",
        type=str,
        required=False,
        default="sstat",
    )
    parser.add_argument(
        "--interval",
        help="polling interval in sec. (default is 30.0 sec.) ",
        type=float,
        required=False,
        default=30.0,
    )
    return parser.parse_args(argv)


def _resolve_output_path(
    jobid: int,
    command: str,
    output: Optional[str] = None,
    prefix: Optional[str] = None,
) -> pl.Path:
    """Return the CSV path: ``output`` if given, else a generated name.

    The generated name is ``<jobid>.<command>.csv``, preceded by
    ``<prefix>.`` when a prefix is supplied. The prefix is ignored when
    ``output`` is given.
    """
    if output is not None:
        return pl.Path(output)
    name = f"{jobid}.{command}.csv"
    if prefix is not None:
        name = f"{prefix}.{name}"
    return pl.Path(name)


def main(argv: Optional[List[str]] = None) -> None:
    """Poll a SLURM job at a fixed interval and write the samples to CSV.

    Raises
    ------
    ValueError
        If the SLURM command cannot be found, or if the first poll
        reports that the job has no running steps.
    """
    slurm_args = _parse_args(argv)

    if sys.version_info < (3, 8):
        sys.exit("Python version must be 3.8 or higher.")

    print(f"SLURM command: {slurm_args.command}")
    print(f"JobID: {slurm_args.jobid}")

    output_path = _resolve_output_path(
        slurm_args.jobid,
        slurm_args.command,
        output=slurm_args.output,
        prefix=slurm_args.prefix,
    )
    print(f"output path: {output_path}")

    end_msg = (
        f"{slurm_args.command}: error: no steps "
        + f"running for job {slurm_args.jobid}"
    )

    # test if exe exists
    if shutil.which(slurm_args.command) is None:
        raise ValueError(
            f"SLURM command '{slurm_args.command}' does not exist"
        )

    # Probe once before opening the output file so that an existing CSV
    # is not truncated when there is nothing to poll.
    probe = _run_command(
        slurm_args.command,
        slurm_args.jobid,
        end_msg,
        slurm_args.format,
        silent=True,
    )
    if probe is None:
        raise ValueError(
            f"SLURM command '{slurm_args.command}' reported no running "
            f"steps for job {slurm_args.jobid}"
        )

    # open file
    with open(output_path, "w") as f:
        with_header = True
        while True:
            result = _run_command(
                slurm_args.command,
                slurm_args.jobid,
                end_msg,
                slurm_args.format,
                with_header=with_header,
            )
            if result is None:
                # job finished: stop without sleeping another interval
                break
            # only the first successful poll requests the header
            with_header = False
            f.writelines(f"{line}\n" for line in result)
            # keep the file current in case the poller is interrupted
            f.flush()
            time.sleep(slurm_args.interval)


if __name__ == "__main__":
    main()