Skip to content

Commit

Permalink
Merge pull request #1316 from ilya-da/nvidia-smi_kill
Browse files Browse the repository at this point in the history
Update GPU process cleanup logic in SLURM epilog script
  • Loading branch information
dholt authored Jan 16, 2025
2 parents 78264a9 + 5efe4a5 commit eb1fa28
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions roles/slurm/templates/etc/slurm/epilog.d/50-exclusive-gpu
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@ set -ex
command -v nvidia-smi || exit 0

# Clean up processes still running. If the processes don't exit, the node is drained.
if nvidia-smi pmon -c 1 | tail -n+3 | awk '{print $2}' | grep -v - > /dev/null
if nvidia-smi pmon -c 1 | grep -v \# | awk '{print $2}' | grep -v - > /dev/null
then
for i in $(nvidia-smi pmon -c 1 | tail -n+3 | awk '{print $2}' | grep -v -)
for i in $(nvidia-smi pmon -c 1 | grep -v \# | awk '{print $2}' | grep -v -)
do
logger -s -t slurm-epilog "Killing residual GPU process $i ..."
kill -9 "$i"
done
fi
sleep 5
if nvidia-smi pmon -c 1 | tail -n+3 | awk '{print $2}' | grep -v - > /dev/null
if nvidia-smi pmon -c 1 | grep -v \# | awk '{print $2}' | grep -v - > /dev/null
then
logger -s -t slurm-epilog 'Failed to kill residual GPU processes. Draining node ...'
scontrol update nodename="$HOSTNAME" state=drain reason='Residual GPU processes found'
Expand Down

0 comments on commit eb1fa28

Please sign in to comment.