Skip to content

Commit

Permalink
merge from devel
Browse files: browse the repository at this point in the history
  • Loading branch information
andre-merzky committed Dec 4, 2024
2 parents 92f5ec6 + 0fcc590 commit 1c9f7be
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 9 deletions.
12 changes: 10 additions & 2 deletions src/radical/pilot/agent/executing/popen.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def initialize(self):
#
# Request cancellation of the task identified by `uid`.
#
# The method does not act on the task itself: it logs the request and puts
# a `[self.TO_CANCEL, uid]` entry on `self._watch_queue`.
#
# NOTE(review): presumably the watcher (`_watch` / `_check_running`) consumes
#               this entry and performs the actual termination -- confirm
#               against the full file, the consumer is not visible here.
def cancel_task(self, uid):

# record the incoming request at debug level
self._log.debug('request cancel task %s', uid)
# enqueue the request, tagged with the `TO_CANCEL` marker
self._watch_queue.put([self.TO_CANCEL, uid])


Expand Down Expand Up @@ -289,7 +290,6 @@ def _watch(self):
# next step. Also check for a requested cancellation for the tasks.
def _check_running(self, to_watch, to_cancel):

#
action = False

# `to_watch.remove()` in the loop requires copy to iterate over the list
Expand Down Expand Up @@ -317,9 +317,17 @@ def _check_running(self, to_watch, to_cancel):
# process group (which should include the actual launch
# method)
try:
# kill the whole process group
# kill the whole process group.
# Try SIGINT first to allow signal handlers, then
# SIGTERM to allow clean termination, then SIGKILL to
# enforce termination.
pgrp = os.getpgid(task['proc'].pid)
os.killpg(pgrp, signal.SIGINT)
time.sleep(0.1)
os.killpg(pgrp, signal.SIGTERM)
time.sleep(0.1)
os.killpg(pgrp, signal.SIGKILL)

except OSError:
# lost race: task is already gone, we ignore this
# FIXME: collect and move to DONE/FAILED
Expand Down
10 changes: 8 additions & 2 deletions src/radical/pilot/agent/launch_method/srun.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,14 @@ def get_launch_cmds(self, task, exec_path):
gpus_per_task = len(slots[0]['gpus'])

mapping = ''
if n_tasks > 1 and td['use_mpi'] is False:
mapping += '--kill-on-bad-exit=0 '
if n_tasks > 1:
if td['use_mpi'] is False:
mapping += '-K0 ' # '--kill-on-bad-exit=0 '
else:
# ensure that all ranks are killed if one rank fails
mapping += '-K1 ' # '--kill-on-bad-exit=1 '
# allow step cancellation with single SIGINT
mapping += '--quit-on-interrupt '

if self._exact:
mapping += '--exact '
Expand Down
5 changes: 3 additions & 2 deletions src/radical/pilot/utils/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,8 +378,9 @@ def _control_cb(self, topic, msg):
with self._cancel_lock:
self._cancel_list += uids

# scheduler handles cancelation itself
if 'AgentSchedulingComponent' in repr(self):
# scheduler and executor handle cancelation directly
if 'agent.scheduler' in repr(self) or \
'agent.executing' in repr(self):
self.control_cb(topic, msg)
return

Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/test_lm/test_cases/task.000010.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"results": {
"lm": {
"srun" : {
"launch_cmd" : "srun --export=ALL --nodes 2 --ntasks 2 --cpus-per-task 4 --mem 0",
"launch_cmd" : "srun --export=ALL -K1 --quit-on-interrupt --nodes 2 --ntasks 2 --cpus-per-task 4 --mem 0",
"rank_exec" : "/bin/sleep"
},
"aprun": {
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/test_lm/test_cases/task.000014.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"rank_exec" : "/bin/sleep \"15\""
},
"srun": {
"launch_cmd" : "srun --export=ALL --nodes 1 --ntasks 2 --cpus-per-task 2 --threads-per-core 2 --mem 1024 --gpus-per-task 2 --gpu-bind closest --nodelist=node1",
"launch_cmd" : "srun --export=ALL -K1 --quit-on-interrupt --nodes 1 --ntasks 2 --cpus-per-task 2 --threads-per-core 2 --mem 1024 --gpus-per-task 2 --gpu-bind closest --nodelist=node1",
"rank_exec" : "/bin/sleep \"15\""
}
}
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/test_lm/test_cases/task.000019.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
"results": {
"lm": {
"srun": {
"launch_cmd" : "srun --export=ALL --kill-on-bad-exit=0 --exact --nodes 1 --ntasks 4 --cpus-per-task 2 --mem 0 --nodelist=node1",
"launch_cmd" : "srun --export=ALL -K0 --exact --nodes 1 --ntasks 4 --cpus-per-task 2 --mem 0 --nodelist=node1",
"rank_exec" : "/bin/sleep \"12\""
}
}
Expand Down

0 comments on commit 1c9f7be

Please sign in to comment.