From 0b4527b02900ab10f66808ff052e4cdb78230757 Mon Sep 17 00:00:00 2001 From: Andre Merzky Date: Mon, 2 Dec 2024 09:09:08 +0100 Subject: [PATCH 1/4] safer srun termination --- src/radical/pilot/agent/executing/popen.py | 10 +++++++++- src/radical/pilot/agent/launch_method/srun.py | 10 ++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/radical/pilot/agent/executing/popen.py b/src/radical/pilot/agent/executing/popen.py index b456db69d..0bc114e33 100644 --- a/src/radical/pilot/agent/executing/popen.py +++ b/src/radical/pilot/agent/executing/popen.py @@ -317,9 +317,17 @@ def _check_running(self, to_watch, to_cancel): # process group (which should include the actual launch # method) try: - # kill the whole process group + # kill the whole process group. + # Try SIGINT first to allow signal handlers, then + # SIGTERM to allow clean termination, then SIGKILL to + # enforce termination. pgrp = os.getpgid(task['proc'].pid) + os.killpg(pgrp, signal.SIGINT) + time.sleep(0.1) + os.killpg(pgrp, signal.SIGTERM) + time.sleep(0.1) os.killpg(pgrp, signal.SIGKILL) + except OSError: # lost race: task is already gone, we ignore this # FIXME: collect and move to DONE/FAILED diff --git a/src/radical/pilot/agent/launch_method/srun.py b/src/radical/pilot/agent/launch_method/srun.py index ec4235e1b..de9cc7519 100644 --- a/src/radical/pilot/agent/launch_method/srun.py +++ b/src/radical/pilot/agent/launch_method/srun.py @@ -150,8 +150,14 @@ def get_launch_cmds(self, task, exec_path): gpus_per_task = len(slots[0]['gpus']) mapping = '' - if n_tasks > 1 and td['use_mpi'] is False: - mapping += '--kill-on-bad-exit=0 ' + if n_tasks > 1: + if td['use_mpi'] is False: + mapping += '-K0 ' # '--kill-on-bad-exit=0 ' + else: + # ensure that all ranks are killed if one rank fails + mapping += '-K1 ' # '--kill-on-bad-exit=1 ' + # allow step cancellation with single SIGINT + mapping += '--quit-on-interrupt ' if self._exact: mapping += '--exact ' From eda38f7541e41722ba48df669ce062af72638de2 Mon Sep 17 00:00:00 2001 From: Andre Merzky Date: Tue, 3 Dec 2024 02:22:06 +0100 Subject: [PATCH 2/4] ensure cancel arrives at executor --- src/radical/pilot/agent/executing/popen.py | 2 +- src/radical/pilot/utils/component.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/radical/pilot/agent/executing/popen.py b/src/radical/pilot/agent/executing/popen.py index 0bc114e33..8d020be07 100644 --- a/src/radical/pilot/agent/executing/popen.py +++ b/src/radical/pilot/agent/executing/popen.py @@ -70,6 +70,7 @@ def initialize(self): # def cancel_task(self, uid): + self._log.debug('request cancel task %s', uid) self._watch_queue.put([self.TO_CANCEL, uid]) @@ -289,7 +290,6 @@ def _watch(self): # next step. Also check for a requested cancellation for the tasks. def _check_running(self, to_watch, to_cancel): - # action = False # `to_watch.remove()` in the loop requires copy to iterate over the list diff --git a/src/radical/pilot/utils/component.py b/src/radical/pilot/utils/component.py index 11a137440..a209eed1e 100644 --- a/src/radical/pilot/utils/component.py +++ b/src/radical/pilot/utils/component.py @@ -378,8 +378,9 @@ def _control_cb(self, topic, msg): with self._cancel_lock: self._cancel_list += uids - # scheduler handles cancelation itself - if 'AgentSchedulingComponent' in repr(self): + # scheduler and executor handle cancelation directly + if 'agent.scheduling' in repr(self) or \ + 'agent.executing' in repr(self): self.control_cb(topic, msg) return From 79ce28281022f6a0d2d2a72a9b4fb9e032e9a43d Mon Sep 17 00:00:00 2001 From: Andre Merzky Date: Tue, 3 Dec 2024 08:33:04 +0100 Subject: [PATCH 3/4] respond to comments --- src/radical/pilot/utils/component.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/radical/pilot/utils/component.py b/src/radical/pilot/utils/component.py index a209eed1e..008271496 100644 --- a/src/radical/pilot/utils/component.py +++ b/src/radical/pilot/utils/component.py @@ -379,7 +379,7 @@ def _control_cb(self, topic, msg): self._cancel_list += uids # scheduler and executor handle cancelation directly - if 'agent.scheduling' in repr(self) or \ + if 'agent.scheduler' in repr(self) or \ 'agent.executing' in repr(self): self.control_cb(topic, msg) return From 98780bb1c42087af19a0052f8d159112fee1c041 Mon Sep 17 00:00:00 2001 From: Andre Merzky Date: Wed, 4 Dec 2024 11:24:07 +0100 Subject: [PATCH 4/4] fix tests --- tests/unit_tests/test_lm/test_cases/task.000010.json | 2 +- tests/unit_tests/test_lm/test_cases/task.000014.json | 2 +- tests/unit_tests/test_lm/test_cases/task.000019.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit_tests/test_lm/test_cases/task.000010.json b/tests/unit_tests/test_lm/test_cases/task.000010.json index 2d956d6dc..8df36f79d 100644 --- a/tests/unit_tests/test_lm/test_cases/task.000010.json +++ b/tests/unit_tests/test_lm/test_cases/task.000010.json @@ -23,7 +23,7 @@ "results": { "lm": { "srun" : { - "launch_cmd" : "srun --export=ALL --nodes 2 --ntasks 2 --cpus-per-task 4 --mem 0", + "launch_cmd" : "srun --export=ALL -K1 --quit-on-interrupt --nodes 2 --ntasks 2 --cpus-per-task 4 --mem 0", "rank_exec" : "/bin/sleep" }, "aprun": { diff --git a/tests/unit_tests/test_lm/test_cases/task.000014.json b/tests/unit_tests/test_lm/test_cases/task.000014.json index 8375a8cd1..12b82184b 100644 --- a/tests/unit_tests/test_lm/test_cases/task.000014.json +++ b/tests/unit_tests/test_lm/test_cases/task.000014.json @@ -51,7 +51,7 @@ "rank_exec" : "/bin/sleep \"15\"" }, "srun": { - "launch_cmd" : "srun --export=ALL --nodes 1 --ntasks 2 --cpus-per-task 2 --threads-per-core 2 --mem 1024 --gpus-per-task 2 --gpu-bind closest --nodelist=node1", + "launch_cmd" : "srun --export=ALL -K1 --quit-on-interrupt --nodes 1 --ntasks 2 --cpus-per-task 2 --threads-per-core 2 --mem 1024 --gpus-per-task 2 --gpu-bind closest --nodelist=node1", "rank_exec" : "/bin/sleep \"15\"" } } diff --git a/tests/unit_tests/test_lm/test_cases/task.000019.json b/tests/unit_tests/test_lm/test_cases/task.000019.json index a8df0bab8..60e24ee35 100644 --- a/tests/unit_tests/test_lm/test_cases/task.000019.json +++ b/tests/unit_tests/test_lm/test_cases/task.000019.json @@ -61,7 +61,7 @@ "results": { "lm": { "srun": { - "launch_cmd" : "srun --export=ALL --kill-on-bad-exit=0 --exact --nodes 1 --ntasks 4 --cpus-per-task 2 --mem 0 --nodelist=node1", + "launch_cmd" : "srun --export=ALL -K0 --exact --nodes 1 --ntasks 4 --cpus-per-task 2 --mem 0 --nodelist=node1", "rank_exec" : "/bin/sleep \"12\"" } }