From 8da72a0184019d911c298100caef751ab2addbe4 Mon Sep 17 00:00:00 2001
From: becker33
Date: Wed, 1 Nov 2017 17:32:26 -0700
Subject: [PATCH] Bugfix slurm daemon hang (#99)

* Change scr_srun to kill daemon process explicitly

Will not make changes to other resource managers, as we have not seen
errors there and I have not evaluated this method for them.
---
Note: the added lines differ from the original commit in three ways.
The stderr redirection is written literally instead of being expanded
from $redirect, because the shell does not recognize redirection
operators that come out of a variable expansion (they were being passed
to scr_transfer as plain arguments). daemon_pid is initialized to empty
so an inherited value is never signaled. $daemon_pid is quoted where it
is tested and passed to kill. Hunk line counts and the diffstat are
updated accordingly; the index line is kept from the original commit.

 scripts/TLCC/scr_run.in | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/scripts/TLCC/scr_run.in b/scripts/TLCC/scr_run.in
index 4ddf8a8c..67caae33 100755
--- a/scripts/TLCC/scr_run.in
+++ b/scripts/TLCC/scr_run.in
@@ -126,8 +126,20 @@ fi
 
+# PID of the srun that hosts the transfer daemons; stays empty when no daemon
+# is started, so a daemon_pid inherited from the environment is never signaled
+daemon_pid=""
+
 # start background scr_transfer processes (1 per node) if async flush is enabled
 if [ "$SCR_FLUSH_ASYNC" == "1" ] ; then
   nnodes=`$bindir/scr_glob_hosts --count --hosts $SCR_NODELIST`
-  srun -W 0 -n${nnodes} -N${nnodes} $bindir/scr_transfer $cntldir/transfer.scrinfo &
+  # Discard srun's stderr unless SCR_DEBUG is set. The redirection must be
+  # written literally: a "2> /dev/null" held in a variable expands to plain
+  # arguments for scr_transfer instead of redirecting anything.
+  if [ -z "$SCR_DEBUG" ]; then
+    srun -q -Q --disable-status -W 0 -n"${nnodes}" -N"${nnodes}" "$bindir/scr_transfer" "$cntldir/transfer.scrinfo" 2> /dev/null &
+  else
+    srun -q -Q --disable-status -W 0 -n"${nnodes}" -N"${nnodes}" "$bindir/scr_transfer" "$cntldir/transfer.scrinfo" &
+  fi
+  daemon_pid=$!
 fi
 
 # enter the run loop
@@ -275,6 +287,13 @@ while [ 1 ] ; do
   fi
 done
 
+# Stop the transfer daemon (daemon_pid is only non-empty if one was started)
+if [ -n "$daemon_pid" ]; then
+  echo "Killing the transfer daemon process"
+  echo "This may result in an error message from slurmstepd"
+  kill -s SIGINT "$daemon_pid"
+fi
+
 # stop scr_transfer processes before we attempt to scavenge
 if [ "$SCR_FLUSH_ASYNC" == "1" ] ; then
   # TODO: this doesn't currently do anything