Skip to content

Commit

Permalink
nonmem run: support Slurm submission
Browse files Browse the repository at this point in the history
The new 'bbi nonmem run slurm' command follows 'bbi nonmem run sge' in
terms of going through gridSpec, attempting to align the template
directives, and testing with the same helpers.

The one deliberate deviation in behavior is the output file name.  For
SGE, standard output and standard error are redirected to the default
file name of '{run name}.o{job id}'.

There's been a request (gh-312) to clean up these .o* and .po* files
(only the former is relevant for Slurm).  Cleaning them up is probably
a bit aggressive because the .o* file contains the 'bbi run local ...'
output, which is useful for troubleshooting.  However, I suspect a
core pain point with these files is the _changing name_ when
overwriting a model for reasons that have to do with how SVN handles
deletions.  So, don't mimic this behavior and instead use a consistent
name to hopefully avoid this issue for Slurm.

Closes #303
  • Loading branch information
kyleam committed Dec 6, 2024
1 parent e31e3bb commit 7c53082
Show file tree
Hide file tree
Showing 11 changed files with 225 additions and 21 deletions.
2 changes: 1 addition & 1 deletion cmd/nonmem.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ type NonMemModel struct {
}

var nonmemExamples string = fmt.Sprintf("%s\n\n%s\n\n%s\n",
fmt.Sprintf(runExamples, "(local|sge)"),
fmt.Sprintf(runExamples, "(local|sge|slurm)"),
summaryExamples,
covcorExamples)

Expand Down
5 changes: 3 additions & 2 deletions cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ const postProcessingScriptTemplate string = `#!/bin/bash
`

func run(_ *cobra.Command, _ []string) {
println(fmt.Sprintf(runExamples, "(local|sge)"))
println(fmt.Sprintf(runExamples, "(local|sge|slurm)"))
}

func NewRunCmd() *cobra.Command {
Expand All @@ -65,7 +65,7 @@ func NewRunCmd() *cobra.Command {
Short: "Run models locally or on the grid",
Long: `This is the entry point to subcommands for running NONMEM models. Each
subcommand represents a different "mode" of execution (e.g., local).`,
Example: fmt.Sprintf(runExamples, "(local|sge)"),
Example: fmt.Sprintf(runExamples, "(local|sge|slurm)"),
Run: run,
}

Expand Down Expand Up @@ -132,6 +132,7 @@ subcommand represents a different "mode" of execution (e.g., local).`,

cmd.AddCommand(NewLocalCmd())
cmd.AddCommand(NewSgeCmd())
cmd.AddCommand(NewSlurmCmd())

return cmd
}
Expand Down
49 changes: 49 additions & 0 deletions cmd/slurm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package cmd

import (
"fmt"

"github.com/spf13/cobra"
"github.com/spf13/viper"
)

// slurmTemplate is the Go template for the script submitted to Slurm.
//
// The job name and working directory are passed through shquote. The
// --ntasks directive is emitted only when .Config.Parallel is set, using
// .Config.Threads as its value. The --output file is the fixed name
// "slurm.out" rather than a name that embeds the job ID. The final line
// writes each element of .Command through shquote, followed by a space.
const slurmTemplate string = `#!/bin/bash
#SBATCH --job-name={{.JobName | shquote}}
#SBATCH --output=slurm.out
#SBATCH --export=ALL
{{- if .Config.Parallel}}
#SBATCH --ntasks={{.Config.Threads}}{{end}}
#SBATCH --chdir={{.WorkingDirectory | shquote}}
{{range .Command}}{{. | shquote}} {{end}}
`

// NewSlurmCmd builds the "slurm" subcommand, which runs models via Slurm.
//
// Two persistent flags are registered, bbi_binary and grid_name_prefix, and
// each is bound to the viper key of the same name. The result of every
// binding is handed to errpanic.
func NewSlurmCmd() *cobra.Command {
	const (
		bbiBinaryIdentifier      = "bbi_binary"
		gridNamePrefixIdentifier = "grid_name_prefix"
	)

	slurmCmd := &cobra.Command{
		Use:     "slurm [flags] <model> [<model>...]",
		Short:   "Run models via Slurm",
		Example: fmt.Sprintf(runExamples, "slurm"),
		Run:     slurm,
	}

	// Both flags live on the command's persistent flag set.
	flags := slurmCmd.PersistentFlags()

	flags.String(bbiBinaryIdentifier, "",
		"bbi executable to use in the Slurm submission script (default: current process's executable)")
	errpanic(viper.BindPFlag(bbiBinaryIdentifier, flags.Lookup(bbiBinaryIdentifier)))

	flags.String(gridNamePrefixIdentifier, "",
		"prefix to add to the name of submitted jobs")
	errpanic(viper.BindPFlag(gridNamePrefixIdentifier, flags.Lookup(gridNamePrefixIdentifier)))

	return slurmCmd
}

// slurm is the Run handler for the "slurm" subcommand. It fills in a
// gridSpec that names the scheduler "Slurm", uses slurmTemplate as the
// script template and "sbatch" as the submit command, and then passes the
// positional arguments to gridSpec.run.
func slurm(_ *cobra.Command, args []string) {
	// The IgnoreError callback for Slurm always returns false.
	neverIgnore := func(_ error, _ string) bool { return false }

	spec := &gridSpec{
		Name:          "Slurm",
		Template:      slurmTemplate,
		SubmitCommand: "sbatch",
		IgnoreError:   neverIgnore,
	}
	spec.run(args)
}
6 changes: 3 additions & 3 deletions docs/commands/bbi_nonmem.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ bbi nonmem [flags]

```
# Execute model run001
bbi nonmem run (local|sge) run001.mod
bbi nonmem run (local|sge|slurm) run001.mod
# Run models run001.mod, run002.mod, and run003.mod
bbi nonmem run (local|sge) 'run[001:003].mod'
bbi nonmem run (local|sge|slurm) 'run[001:003].mod'
# Run all models in the current directory
bbi nonmem run (local|sge) .
bbi nonmem run (local|sge|slurm) .
# Summarize run001
bbi nonmem summary run001/run001.lst
Expand Down
7 changes: 4 additions & 3 deletions docs/commands/bbi_nonmem_run.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ bbi nonmem run [flags]

```
# Execute model run001
bbi nonmem run (local|sge) run001.mod
bbi nonmem run (local|sge|slurm) run001.mod
# Run models run001.mod, run002.mod, and run003.mod
bbi nonmem run (local|sge) 'run[001:003].mod'
bbi nonmem run (local|sge|slurm) 'run[001:003].mod'
# Run all models in the current directory
bbi nonmem run (local|sge) .
bbi nonmem run (local|sge|slurm) .
```

### Options
Expand Down Expand Up @@ -69,4 +69,5 @@ bbi nonmem run [flags]
* [bbi nonmem](bbi_nonmem.md) - Entry point for NONMEM-related subcommands
* [bbi nonmem run local](bbi_nonmem_run_local.md) - Run models locally
* [bbi nonmem run sge](bbi_nonmem_run_sge.md) - Run models on the Sun Grid Engine
* [bbi nonmem run slurm](bbi_nonmem_run_slurm.md) - Run models via Slurm

67 changes: 67 additions & 0 deletions docs/commands/bbi_nonmem_run_slurm.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
## bbi nonmem run slurm

Run models via Slurm

```
bbi nonmem run slurm [flags] <model> [<model>...]
```

### Examples

```
# Execute model run001
bbi nonmem run slurm run001.mod
# Run models run001.mod, run002.mod, and run003.mod
bbi nonmem run slurm 'run[001:003].mod'
# Run all models in the current directory
bbi nonmem run slurm .
```

### Options

```
--bbi_binary string bbi executable to use in the Slurm submission script (default: current process's executable)
--grid_name_prefix string prefix to add to the name of submitted jobs
-h, --help help for slurm
```

### Options inherited from parent commands

```
--additional_post_work_envs strings additional values (as ENV KEY=VALUE) to provide for the post execution environment
--background RAW NMFE OPTION - tell NONMEM not to scan stdin for control characters
--clean_lvl int clean level used for output (default 1)
--config string path to another bbi.yaml to load
--copy_lvl int copy level used for output
-d, --debug debug mode
--delay int Selects a random number of seconds between 1 and this value to stagger / jitter job execution. Assists in dealing with large volumes of work dealing with the same data set. May avoid NMTRAN issues about not being able read / close files
--git whether git is used
--json show JSON output, if possible
--licfile string RAW NMFE OPTION - NONMEM license file to use
--log_file string file into which to store the output / logging details from bbi
--maxlim int RAW NMFE OPTION - set the maximum values for the buffers used by NONMEM (if 0, don't pass -maxlim to nmfe) (default 2)
--mpi_exec_path string fully qualified path to mpiexec to use for NONMEM parallel operations (default "/usr/local/mpich3/bin/mpiexec")
--nm_version string version of NONMEM from the configuration list to use
--nmqual whether to execute with nmqual (autolog.pl)
--nobuild RAW NMFE OPTION - do not build a new NONMEM executable
-o, --output string output file
--output_dir string Go template for the output directory to use for storing details of each executed model (default "{{ .Name }}")
--overwrite whether to remove existing output directories
--parafile string location of a user-provided parafile to use for parallel execution
--parallel whether to run NONMEM in parallel mode
--parallel_timeout int amount of time to wait for parallel operations in NONMEM before timing out (default 2147483647)
--post_work_executable string script or binary to run when job execution completes or fails
--prcompile RAW NMFE OPTION - forces PREDPP compilation
--prdefault RAW NMFE OPTION - do not recompile any routines other than FSUBS
-p, --preview preview action, but don't actually run command
--prsame RAW NMFE OPTION - tell NONMEM to skip the PREDPP compilation step
--save_config whether to save the existing configuration to the output directory (default true)
--threads int number of threads to execute with locally or nodes to execute on in parallel (default 4)
--tprdefault RAW NMFE OPTION - test if is okay to do -prdefault
-v, --verbose verbose output
```

### SEE ALSO

* [bbi nonmem run](bbi_nonmem_run.md) - Run models locally or on the grid

8 changes: 8 additions & 0 deletions docs/validation/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@
- integration/nonmem/bbi_sge_test.go
- parsers/nmparser/add_path_level_test.go

- entrypoint: bbi nonmem run slurm
code: cmd/slurm.go
doc: docs/commands/bbi_nonmem_run_slurm.md
tests:
- cmd/nonmem_test.go
- cmd/grid_test.go
- integration/nonmem/bbi_slurm_test.go

- entrypoint: bbi nonmem scaffold
code: cmd/scaffold.go
doc: docs/commands/bbi_nonmem_scaffold.md
Expand Down
4 changes: 3 additions & 1 deletion integration/nonmem/bbi_sge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,16 @@ func TestBbiCompletesParallelSGEExecution(tt *testing.T) {
checkParallelGridExecution(tt, "sge", tests, WaitForSGEToTerminate)
}

func WaitForSGEToTerminate(gridNameIdentifier string) {
func WaitForSGEToTerminate(gridNameIdentifier string) error {
log.Info(fmt.Sprintf("Provided value for location job by name was : %s", gridNameIdentifier))
for CountOfPendingJobs(gridNameIdentifier) > 0 {
log.Infof("Located %d pending jobs. Waiting for 30 seconds to check again", CountOfPendingJobs(gridNameIdentifier))
time.Sleep(30 * time.Second)
}

log.Info("Looks like all queued and running jobs have terminated")

return nil
}

func CountOfPendingJobs(gridNameIdentifier string) int {
Expand Down
68 changes: 68 additions & 0 deletions integration/nonmem/bbi_slurm_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package nonmem

import (
"bytes"
"os/exec"
"testing"
"time"

log "github.com/sirupsen/logrus"
)

// TestBbiCompletesSlurmExecution runs checkGridExecution for the "slurm"
// command over the listed scenarios, using WaitForSlurmToTerminate as the
// wait function. The test is skipped unless the "SLURM" feature is enabled.
func TestBbiCompletesSlurmExecution(tt *testing.T) {
	if enabled := FeatureEnabled("SLURM"); !enabled {
		tt.Skip("Slurm is not enabled")
	}

	checkGridExecution(tt, "slurm", []string{
		"acop",
		"ctl_test",
		"leading-path-with space",
	}, WaitForSlurmToTerminate)
}

// TestBbiCompletesParallelSlurmExecution runs checkParallelGridExecution for
// the "slurm" command over the listed scenarios, using
// WaitForSlurmToTerminate as the wait function. The test is skipped unless
// the "SLURM" feature is enabled.
func TestBbiCompletesParallelSlurmExecution(tt *testing.T) {
	if enabled := FeatureEnabled("SLURM"); !enabled {
		tt.Skip("Slurm is not enabled")
	}

	checkParallelGridExecution(tt, "slurm", []string{
		"acop",
		"ctl_test",
		"leading-path-with space",
	}, WaitForSlurmToTerminate)
}

// WaitForSlurmToTerminate blocks until squeue no longer lists a job with
// the given name. It sleeps for 30 seconds before every check, including
// the first one, and returns the first error reported by squeueContains.
func WaitForSlurmToTerminate(name string) error {
	const pollInterval = 30 * time.Second

	log.Infof("waiting for Slurm job %s", name)

	for {
		time.Sleep(pollInterval)

		queued, err := squeueContains(name)
		switch {
		case err != nil:
			return err
		case !queued:
			return nil
		}

		log.Infof("%s is still in squeue output; checking again in %s", name, pollInterval)
	}
}

// squeueContains reports whether "squeue --name=<name>" prints anything
// besides whitespace. squeue is invoked with --noheader and --format=%i.
// If the command fails, the result is false together with that error.
func squeueContains(name string) (bool, error) {
	args := []string{"--noheader", "--format=%i", "--name=" + name}

	stdout, err := exec.Command("squeue", args...).Output()
	if err != nil {
		return false, err
	}

	listed := len(bytes.TrimSpace(stdout)) != 0
	return listed, nil
}
10 changes: 6 additions & 4 deletions integration/nonmem/grid.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
)

func checkGridExecution(
tt *testing.T, command string, scenarios []string, waitFn func(string),
tt *testing.T, command string, scenarios []string, waitFn func(string) error,
) {

tt.Helper()
Expand Down Expand Up @@ -43,7 +43,8 @@ func checkGridExecution(
_, err := model.Execute(scenario, nonMemArguments...)
t.R.NoError(err)

waitFn(getGridNameIdentifier(model))
err = waitFn(getGridNameIdentifier(model))
t.R.NoError(err)

testingDetails := NonMemTestingDetails{
OutputDir: filepath.Join(scenario.Workpath, model.identifier),
Expand All @@ -61,7 +62,7 @@ func checkGridExecution(
}

func checkParallelGridExecution(
tt *testing.T, command string, scenarios []string, waitFn func(string),
tt *testing.T, command string, scenarios []string, waitFn func(string) error,
) {

tt.Helper()
Expand Down Expand Up @@ -98,7 +99,8 @@ func checkParallelGridExecution(
_, err := m.Execute(scenario, nonMemArguments...)
t.R.NoError(err)

waitFn(getGridNameIdentifier(m))
err = waitFn(getGridNameIdentifier(m))
t.R.NoError(err)

testingDetails := NonMemTestingDetails{
OutputDir: filepath.Join(scenario.Workpath, m.identifier),
Expand Down
20 changes: 13 additions & 7 deletions scripts/run-integration-tests
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,19 @@ export NMVERSION_NMQUAL=nm74gf
export NONMEMROOT=/opt/NONMEM
export POST_EXECUTION=true
export ROOT_EXECUTION_DIR="$tdir"
export SGE=true
export SGE_ARCH=lx-amd64
export SGE_CELL=default
export SGE_CLUSTER_NAME=p6444
export SGE_EXECD_PORT=6445
export SGE_QMASTER_PORT=6444
export SGE_ROOT=/opt/sge

# Export SLURM=true when sbatch is found on PATH; otherwise export SGE=true
# together with the SGE_* settings. Only one of the two is exported per run.
if command -v sbatch >/dev/null
then
export SLURM=true
else
export SGE=true
export SGE_ARCH=lx-amd64
export SGE_CELL=default
export SGE_CLUSTER_NAME=p6444
export SGE_EXECD_PORT=6445
export SGE_QMASTER_PORT=6444
export SGE_ROOT=/opt/sge
fi

bin=$tdir/bin
mkdir "$bin"
Expand Down

0 comments on commit 7c53082

Please sign in to comment.