From e74143f73eaea86d13553010dd8f7fdf2edb58b2 Mon Sep 17 00:00:00 2001
From: Mikhail Titov
Date: Thu, 21 Sep 2023 18:55:29 -0400
Subject: [PATCH 1/2] added option `--ppn` for `PALS` flavor in MPIEXEC LM

---
 .../pilot/agent/launch_method/mpiexec.py      | 23 +++++--
 .../test_lm/test_cases/task.000003.json       |  2 +-
 .../test_lm/test_cases/task.000020.json       | 61 +++++++++++++++++++
 3 files changed, 80 insertions(+), 6 deletions(-)
 create mode 100644 tests/unit_tests/test_lm/test_cases/task.000020.json

diff --git a/src/radical/pilot/agent/launch_method/mpiexec.py b/src/radical/pilot/agent/launch_method/mpiexec.py
index 0cb4ab4971..c53ba7eec3 100644
--- a/src/radical/pilot/agent/launch_method/mpiexec.py
+++ b/src/radical/pilot/agent/launch_method/mpiexec.py
@@ -209,8 +209,11 @@ def get_launch_cmds(self, task, exec_path):
 
         assert slots.get('ranks'), 'task.slots.ranks not defined'
 
-        n_ranks     = sum([len(slot['core_map']) for slot in slots['ranks']])
-        cmd_options = '-np %d ' % n_ranks
+        host_slots = defaultdict(int)
+        for rank in slots['ranks']:
+            host_slots[rank['node_name']] += len(rank['core_map'])
+
+        cmd_options = '-np %d ' % sum(host_slots.values())
 
         if self._use_rf:
             rankfile     = self._get_rank_file(slots, uid, sbox)
@@ -221,9 +224,19 @@ def get_launch_cmds(self, task, exec_path):
             hostfile     = self._get_host_file(slots, uid, sbox)
             core_ids     = ':'.join([
                 str(cores[0]) + ('-%s' % cores[-1] if len(cores) > 1 else '')
-                for cores in [rank['core_map'][0] for rank in slots['ranks']]])
-            cmd_options += '--hostfile %s ' % hostfile + \
-                           '--cpu-bind list:%s' % core_ids
+                for core_map in [rank['core_map'] for rank in slots['ranks']]
+                for cores in core_map])
+            cmd_options += '--ppn %d '           % max(host_slots.values()) + \
+                           '--cpu-bind list:%s ' % core_ids                 + \
+                           '--hostfile %s'       % hostfile
+
+            # NOTE: Option "--ppn" controls "node-depth" vs. "core-depth"
+            #       process placement. If we submit "mpiexec" command with
+            #       "--ppn" option, it will place processes within the same
+            #       node first. If we do not provide "--ppn" option, it will
+            #       place processes on the available nodes one by one and
+            #       round-robin when each available node is populated.
+
             # if over-subscription is allowed,
             # then the following approach is applicable too:
             #   cores_per_rank = len(slots['ranks'][0]['core_map'][0])
diff --git a/tests/unit_tests/test_lm/test_cases/task.000003.json b/tests/unit_tests/test_lm/test_cases/task.000003.json
index c4519289b2..1f489786a9 100644
--- a/tests/unit_tests/test_lm/test_cases/task.000003.json
+++ b/tests/unit_tests/test_lm/test_cases/task.000003.json
@@ -66,7 +66,7 @@
             "rank_exec"  : "/bin/sleep \"10\""
         },
         "mpiexec"  : {
-            "launch_cmd" : "mpiexec -np 1 --hostfile /tmp/task.000003.hf --cpu-bind list:0-3",
+            "launch_cmd" : "mpiexec -np 1 --ppn 1 --cpu-bind list:0-3 --hostfile /tmp/task.000003.hf",
             "rank_exec"  : "/bin/sleep \"10\""
         }
     },
diff --git a/tests/unit_tests/test_lm/test_cases/task.000020.json b/tests/unit_tests/test_lm/test_cases/task.000020.json
new file mode 100644
index 0000000000..c276e1b34b
--- /dev/null
+++ b/tests/unit_tests/test_lm/test_cases/task.000020.json
@@ -0,0 +1,61 @@
+
+{
+    "task": {
+        "uid"        : "task.000020",
+        "description": {
+            "executable"    : "/bin/sleep",
+            "arguments"     : ["25"],
+            "ranks"         : 5,
+            "cores_per_rank": 2,
+            "threading_type": "",
+            "gpus_per_rank" : 0,
+            "gpu_type"      : "",
+            "environment"   : {}
+        },
+        "task_sandbox_path" : "/tmp"
+    },
+
+    "setup": {
+        "lm": {
+            "slots": {
+                "cores_per_node": 8,
+                "gpus_per_node" : 0,
+                "lfs_per_node"  : 0,
+                "ranks"         : [{"node_name" : "node1",
+                                    "node_id"   : "1",
+                                    "core_map"  : [[0, 1], [2, 3]],
+                                    "gpu_map"   : [],
+                                    "lfs"       : 0},
+                                   {"node_name" : "node1",
+                                    "node_id"   : "1",
+                                    "core_map"  : [[4, 5], [6, 7]],
+                                    "gpu_map"   : [],
+                                    "lfs"       : 0},
+                                   {"node_name" : "node2",
+                                    "node_id"   : "2",
+                                    "core_map"  : [[0, 1]],
+                                    "gpu_map"   : [],
+                                    "lfs"       : 0}]
+            },
+            "task_sandbox": "./",
+            "mpi_flavor"  : "PALS"
+        }
+    },
+
+    "results": {
+        "lm": {
+            "mpiexec" : {
+                "launch_cmd" : "mpiexec -np 5 --ppn 4 --cpu-bind list:0-1:2-3:4-5:6-7:0-1 --hostfile /tmp/task.000020.hf",
+                "rank_exec"  : "/bin/sleep \"25\""
+            }
+        },
+        "resource_file": {
+            "mpiexec" : ["node1\n",
+                         "node2\n"]
+        },
+        "resource_filename": {
+            "mpiexec" : "/tmp/task.000020.hf"
+        }
+    }
+}
+

From cbecfa877d04d34fdccb8aa8d8daaf4596a7e92d Mon Sep 17 00:00:00 2001
From: Mikhail Titov
Date: Thu, 21 Sep 2023 18:59:19 -0400
Subject: [PATCH 2/2] updated documentation for Polaris (GPUs assignment)

---
 docs/source/supported/polaris.rst | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/docs/source/supported/polaris.rst b/docs/source/supported/polaris.rst
index 78bfd5aba4..869c0b7793 100644
--- a/docs/source/supported/polaris.rst
+++ b/docs/source/supported/polaris.rst
@@ -40,6 +40,21 @@ General description
        }
        EOF
 
+.. note::
+
+   `Binding MPI ranks to GPUs `_:
+   If you want to control GPUs assignment per task, then the following code
+   snippet provides an example of setting ``CUDA_VISIBLE_DEVICES`` for each MPI
+   rank on Polaris:
+
+   .. code-block:: python
+
+      import radical.pilot as rp
+
+      td = rp.TaskDescription()
+      td.pre_exec.append('export CUDA_VISIBLE_DEVICES=$((3 - $PMI_LOCAL_RANK % 4))')
+      td.gpu_type = ''  # reset GPU type, thus RP will not set "CUDA_VISIBLE_DEVICES"
+
 Setup execution environment
 ===========================
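
For reference, below is a standalone sketch of the per-host slot counting and
option ordering that PATCH 1/2 introduces for the PALS flavor; the helper name
`build_pals_options` is hypothetical, while the slot layout and the expected
command come from tests/unit_tests/test_lm/test_cases/task.000020.json:

    from collections import defaultdict

    def build_pals_options(slots, hostfile):
        # count how many processes land on each node: "--ppn" is the maximum
        # per node, "-np" is the total across all nodes
        host_slots = defaultdict(int)
        for rank in slots['ranks']:
            host_slots[rank['node_name']] += len(rank['core_map'])

        # one core range per process, e.g. [0, 1] -> "0-1", [4] -> "4"
        core_ids = ':'.join(
            str(cores[0]) + ('-%s' % cores[-1] if len(cores) > 1 else '')
            for rank in slots['ranks']
            for cores in rank['core_map'])

        return '-np %d --ppn %d --cpu-bind list:%s --hostfile %s' % (
            sum(host_slots.values()), max(host_slots.values()),
            core_ids, hostfile)

    # slot layout from task.000020.json: 4 processes on node1, 1 on node2
    slots = {'ranks': [{'node_name': 'node1', 'core_map': [[0, 1], [2, 3]]},
                       {'node_name': 'node1', 'core_map': [[4, 5], [6, 7]]},
                       {'node_name': 'node2', 'core_map': [[0, 1]]}]}

    print('mpiexec ' + build_pals_options(slots, '/tmp/task.000020.hf'))
    # mpiexec -np 5 --ppn 4 --cpu-bind list:0-1:2-3:4-5:6-7:0-1 \
    #         --hostfile /tmp/task.000020.hf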