nonmem run: support Slurm submission

The new 'bbi nonmem run slurm' command follows 'bbi nonmem run sge' in terms of going through gridSpec, attempting to align the template directives, and testing with the same helpers. The one deliberate deviation in behavior is the output file name. For SGE, standard output and standard error are redirected to the default file name of '{run name}.o{job id}'. There's been a request (gh-312) to clean up these .o* and .po* files (only the former is relevant for Slurm). Cleaning them up is probably a bit aggressive because the .o* file contains the 'bbi run local ...' output, which is useful for troubleshooting. However, I suspect a core pain point with these files is the _changing name_ when overwriting a model for reasons that have to do with how SVN handles deletions. So, don't mimic this behavior and instead use a consistent name to hopefully avoid this issue for Slurm. Closes #303
metrumresearchgroup · Dec 6, 2024 · 7c53082 · 7c53082
1 parent e31e3bb
commit 7c53082
Show file tree

Hide file tree

Showing 11 changed files with 225 additions and 21 deletions.
diff --git a/cmd/nonmem.go b/cmd/nonmem.go
@@ -157,7 +157,7 @@ type NonMemModel struct {
 }
 
 var nonmemExamples string = fmt.Sprintf("%s\n\n%s\n\n%s\n",
-	fmt.Sprintf(runExamples, "(local|sge)"),
+	fmt.Sprintf(runExamples, "(local|sge|slurm)"),
 	summaryExamples,
 	covcorExamples)
 

diff --git a/cmd/run.go b/cmd/run.go
@@ -56,7 +56,7 @@ const postProcessingScriptTemplate string = `#!/bin/bash
 `
 
 func run(_ *cobra.Command, _ []string) {
-	println(fmt.Sprintf(runExamples, "(local|sge)"))
+	println(fmt.Sprintf(runExamples, "(local|sge|slurm)"))
 }
 
 func NewRunCmd() *cobra.Command {
@@ -65,7 +65,7 @@ func NewRunCmd() *cobra.Command {
 		Short: "Run models locally or on the grid",
 		Long: `This is the entry point to subcommands for running NONMEM models. Each
 subcommand represents a different "mode" of execution (e.g., local).`,
-		Example: fmt.Sprintf(runExamples, "(local|sge)"),
+		Example: fmt.Sprintf(runExamples, "(local|sge|slurm)"),
 		Run:     run,
 	}
 
@@ -132,6 +132,7 @@ subcommand represents a different "mode" of execution (e.g., local).`,
 
 	cmd.AddCommand(NewLocalCmd())
 	cmd.AddCommand(NewSgeCmd())
+	cmd.AddCommand(NewSlurmCmd())
 
 	return cmd
 }

diff --git a/cmd/slurm.go b/cmd/slurm.go
@@ -0,0 +1,49 @@
+package cmd
+
+import (
+	"fmt"
+
+	"github.com/spf13/cobra"
+	"github.com/spf13/viper"
+)
+
+const slurmTemplate string = `#!/bin/bash
+#SBATCH --job-name={{.JobName | shquote}}
+#SBATCH --output=slurm.out
+#SBATCH --export=ALL
+{{- if .Config.Parallel}}
+#SBATCH --ntasks={{.Config.Threads}}{{end}}
+#SBATCH --chdir={{.WorkingDirectory | shquote}}
+
+{{range .Command}}{{. | shquote}} {{end}}
+`
+
+func NewSlurmCmd() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:     "slurm [flags] <model> [<model>...]",
+		Short:   "Run models via Slurm",
+		Example: fmt.Sprintf(runExamples, "slurm"),
+		Run:     slurm,
+	}
+
+	cmd.PersistentFlags().String("bbi_binary", "",
+		"bbi executable to use in the Slurm submission script (default: current process's executable)")
+	errpanic(viper.BindPFlag("bbi_binary", cmd.PersistentFlags().Lookup("bbi_binary")))
+
+	const gridNamePrefixIdentifier string = "grid_name_prefix"
+	cmd.PersistentFlags().String(gridNamePrefixIdentifier, "",
+		"prefix to add to the name of submitted jobs")
+	errpanic(viper.BindPFlag(gridNamePrefixIdentifier, cmd.PersistentFlags().Lookup(gridNamePrefixIdentifier)))
+
+	return cmd
+}
+
+func slurm(_ *cobra.Command, args []string) {
+	gs := &gridSpec{
+		Name:          "Slurm",
+		Template:      slurmTemplate,
+		SubmitCommand: "sbatch",
+		IgnoreError:   func(_ error, _ string) bool { return false },
+	}
+	gs.run(args)
+}
diff --git a/docs/commands/bbi_nonmem.md b/docs/commands/bbi_nonmem.md
@@ -10,11 +10,11 @@ bbi nonmem [flags]
 
 ```
   # Execute model run001
-  bbi nonmem run (local|sge) run001.mod
+  bbi nonmem run (local|sge|slurm) run001.mod
   #  Run models run001.mod, run002.mod, and run003.mod
-  bbi nonmem run (local|sge) 'run[001:003].mod'
+  bbi nonmem run (local|sge|slurm) 'run[001:003].mod'
   # Run all models in the current directory
-  bbi nonmem run (local|sge) .
+  bbi nonmem run (local|sge|slurm) .
 
   # Summarize run001
   bbi nonmem summary run001/run001.lst

diff --git a/docs/commands/bbi_nonmem_run.md b/docs/commands/bbi_nonmem_run.md
@@ -15,11 +15,11 @@ bbi nonmem run [flags]
 
 ```
   # Execute model run001
-  bbi nonmem run (local|sge) run001.mod
+  bbi nonmem run (local|sge|slurm) run001.mod
   #  Run models run001.mod, run002.mod, and run003.mod
-  bbi nonmem run (local|sge) 'run[001:003].mod'
+  bbi nonmem run (local|sge|slurm) 'run[001:003].mod'
   # Run all models in the current directory
-  bbi nonmem run (local|sge) .
+  bbi nonmem run (local|sge|slurm) .
 ```
 
 ### Options
@@ -69,4 +69,5 @@ bbi nonmem run [flags]
 * [bbi nonmem](bbi_nonmem.md)	 - Entry point for NONMEM-related subcommands
 * [bbi nonmem run local](bbi_nonmem_run_local.md)	 - Run models locally
 * [bbi nonmem run sge](bbi_nonmem_run_sge.md)	 - Run models on the Sun Grid Engine
+* [bbi nonmem run slurm](bbi_nonmem_run_slurm.md)	 - Run models via Slurm
 
diff --git a/docs/commands/bbi_nonmem_run_slurm.md b/docs/commands/bbi_nonmem_run_slurm.md
@@ -0,0 +1,67 @@
+## bbi nonmem run slurm
+
+Run models via Slurm
+
+```
+bbi nonmem run slurm [flags] <model> [<model>...]
+```
+
+### Examples
+
+```
+  # Execute model run001
+  bbi nonmem run slurm run001.mod
+  #  Run models run001.mod, run002.mod, and run003.mod
+  bbi nonmem run slurm 'run[001:003].mod'
+  # Run all models in the current directory
+  bbi nonmem run slurm .
+```
+
+### Options
+
+```
+      --bbi_binary string         bbi executable to use in the Slurm submission script (default: current process's executable)
+      --grid_name_prefix string   prefix to add to the name of submitted jobs
+  -h, --help                      help for slurm
+```
+
+### Options inherited from parent commands
+
+```
+      --additional_post_work_envs strings   additional values (as ENV KEY=VALUE) to provide for the post execution environment
+      --background                          RAW NMFE OPTION - tell NONMEM not to scan stdin for control characters
+      --clean_lvl int                       clean level used for output (default 1)
+      --config string                       path to another bbi.yaml to load
+      --copy_lvl int                        copy level used for output
+  -d, --debug                               debug mode
+      --delay int                           Selects a random number of seconds between 1 and this value to stagger / jitter job execution. Assists in dealing with large volumes of work dealing with the same data set. May avoid NMTRAN issues about not being able read / close files
+      --git                                 whether git is used
+      --json                                show JSON output, if possible
+      --licfile string                      RAW NMFE OPTION - NONMEM license file to use
+      --log_file string                     file into which to store the output / logging details from bbi
+      --maxlim int                          RAW NMFE OPTION - set the maximum values for the buffers used by NONMEM (if 0, don't pass -maxlim to nmfe) (default 2)
+      --mpi_exec_path string                fully qualified path to mpiexec to use for NONMEM parallel operations (default "/usr/local/mpich3/bin/mpiexec")
+      --nm_version string                   version of NONMEM from the configuration list to use
+      --nmqual                              whether to execute with nmqual (autolog.pl)
+      --nobuild                             RAW NMFE OPTION - do not build a new NONMEM executable
+  -o, --output string                       output file
+      --output_dir string                   Go template for the output directory to use for storing details of each executed model (default "{{ .Name }}")
+      --overwrite                           whether to remove existing output directories
+      --parafile string                     location of a user-provided parafile to use for parallel execution
+      --parallel                            whether to run NONMEM in parallel mode
+      --parallel_timeout int                amount of time to wait for parallel operations in NONMEM before timing out (default 2147483647)
+      --post_work_executable string         script or binary to run when job execution completes or fails
+      --prcompile                           RAW NMFE OPTION - forces PREDPP compilation
+      --prdefault                           RAW NMFE OPTION - do not recompile any routines other than FSUBS
+  -p, --preview                             preview action, but don't actually run command
+      --prsame                              RAW NMFE OPTION - tell NONMEM to skip the PREDPP compilation step
+      --save_config                         whether to save the existing configuration to the output directory (default true)
+      --threads int                         number of threads to execute with locally or nodes to execute on in parallel (default 4)
+      --tprdefault                          RAW NMFE OPTION - test if is okay to do -prdefault
+  -v, --verbose                             verbose output
+```
+
+### SEE ALSO
+
+* [bbi nonmem run](bbi_nonmem_run.md)	 - Run models locally or on the grid
+
diff --git a/docs/validation/matrix.yaml b/docs/validation/matrix.yaml
@@ -68,6 +68,14 @@
     - integration/nonmem/bbi_sge_test.go
     - parsers/nmparser/add_path_level_test.go
 
+- entrypoint: bbi nonmem run slurm
+  code: cmd/slurm.go
+  doc: docs/commands/bbi_nonmem_run_slurm.md
+  tests:
+    - cmd/nonmem_test.go
+    - cmd/grid_test.go
+    - integration/nonmem/bbi_slurm_test.go
+
 - entrypoint: bbi nonmem scaffold
   code: cmd/scaffold.go
   doc: docs/commands/bbi_nonmem_scaffold.md

diff --git a/integration/nonmem/bbi_sge_test.go b/integration/nonmem/bbi_sge_test.go
@@ -37,14 +37,16 @@ func TestBbiCompletesParallelSGEExecution(tt *testing.T) {
 	checkParallelGridExecution(tt, "sge", tests, WaitForSGEToTerminate)
 }
 
-func WaitForSGEToTerminate(gridNameIdentifier string) {
+func WaitForSGEToTerminate(gridNameIdentifier string) error {
 	log.Info(fmt.Sprintf("Provided value for location job by name was : %s", gridNameIdentifier))
 	for CountOfPendingJobs(gridNameIdentifier) > 0 {
 		log.Infof("Located %d pending jobs. Waiting for 30 seconds to check again", CountOfPendingJobs(gridNameIdentifier))
 		time.Sleep(30 * time.Second)
 	}
 
 	log.Info("Looks like all queued and running jobs have terminated")
+
+	return nil
 }
 
 func CountOfPendingJobs(gridNameIdentifier string) int {

diff --git a/integration/nonmem/bbi_slurm_test.go b/integration/nonmem/bbi_slurm_test.go
@@ -0,0 +1,68 @@
+package nonmem
+
+import (
+	"bytes"
+	"os/exec"
+	"testing"
+	"time"
+
+	log "github.com/sirupsen/logrus"
+)
+
+func TestBbiCompletesSlurmExecution(tt *testing.T) {
+	if !FeatureEnabled("SLURM") {
+		tt.Skip("Slurm is not enabled")
+	}
+
+	tests := []string{
+		"acop",
+		"ctl_test",
+		"leading-path-with space",
+	}
+
+	checkGridExecution(tt, "slurm", tests, WaitForSlurmToTerminate)
+}
+
+func TestBbiCompletesParallelSlurmExecution(tt *testing.T) {
+	if !FeatureEnabled("SLURM") {
+		tt.Skip("Slurm is not enabled")
+	}
+
+	tests := []string{
+		"acop",
+		"ctl_test",
+		"leading-path-with space",
+	}
+
+	checkParallelGridExecution(tt, "slurm", tests, WaitForSlurmToTerminate)
+}
+
+func WaitForSlurmToTerminate(name string) error {
+	log.Infof("waiting for Slurm job %s", name)
+	secs := 30 * time.Second
+	for {
+		time.Sleep(secs)
+		contains, err := squeueContains(name)
+		if err != nil {
+			return err
+		}
+
+		if !contains {
+			break
+		}
+
+		log.Infof("%s is still in squeue output; checking again in %s", name, secs)
+	}
+
+	return nil
+}
+
+func squeueContains(name string) (bool, error) {
+	cmd := exec.Command("squeue", "--noheader", "--format=%i", "--name="+name)
+	out, err := cmd.Output()
+	if err != nil {
+		return false, err
+	}
+
+	return len(bytes.TrimSpace(out)) > 0, nil
+}
diff --git a/integration/nonmem/grid.go b/integration/nonmem/grid.go
@@ -11,7 +11,7 @@ import (
 )
 
 func checkGridExecution(
-	tt *testing.T, command string, scenarios []string, waitFn func(string),
+	tt *testing.T, command string, scenarios []string, waitFn func(string) error,
 ) {
 
 	tt.Helper()
@@ -43,7 +43,8 @@ func checkGridExecution(
 					_, err := model.Execute(scenario, nonMemArguments...)
 					t.R.NoError(err)
 
-					waitFn(getGridNameIdentifier(model))
+					err = waitFn(getGridNameIdentifier(model))
+					t.R.NoError(err)
 
 					testingDetails := NonMemTestingDetails{
 						OutputDir: filepath.Join(scenario.Workpath, model.identifier),
@@ -61,7 +62,7 @@ func checkGridExecution(
 }
 
 func checkParallelGridExecution(
-	tt *testing.T, command string, scenarios []string, waitFn func(string),
+	tt *testing.T, command string, scenarios []string, waitFn func(string) error,
 ) {
 
 	tt.Helper()
@@ -98,7 +99,8 @@ func checkParallelGridExecution(
 					_, err := m.Execute(scenario, nonMemArguments...)
 					t.R.NoError(err)
 
-					waitFn(getGridNameIdentifier(m))
+					err = waitFn(getGridNameIdentifier(m))
+					t.R.NoError(err)
 
 					testingDetails := NonMemTestingDetails{
 						OutputDir: filepath.Join(scenario.Workpath, m.identifier),

diff --git a/scripts/run-integration-tests b/scripts/run-integration-tests
@@ -38,13 +38,19 @@ export NMVERSION_NMQUAL=nm74gf
 export NONMEMROOT=/opt/NONMEM
 export POST_EXECUTION=true
 export ROOT_EXECUTION_DIR="$tdir"
-export SGE=true
-export SGE_ARCH=lx-amd64
-export SGE_CELL=default
-export SGE_CLUSTER_NAME=p6444
-export SGE_EXECD_PORT=6445
-export SGE_QMASTER_PORT=6444
-export SGE_ROOT=/opt/sge
+
+if command -v sbatch >/dev/null
+then
+    export SLURM=true
+else
+    export SGE=true
+    export SGE_ARCH=lx-amd64
+    export SGE_CELL=default
+    export SGE_CLUSTER_NAME=p6444
+    export SGE_EXECD_PORT=6445
+    export SGE_QMASTER_PORT=6444
+    export SGE_ROOT=/opt/sge
+fi
 
 bin=$tdir/bin
 mkdir "$bin"