
Commit

Merge pull request #357 from argonne-lcf/polaris
Polaris app_run changes
cms21 authored Aug 4, 2023
2 parents 51b9c7d + 3890bfa commit 10f5d27
Showing 4 changed files with 113 additions and 10 deletions.
2 changes: 0 additions & 2 deletions balsam/config/defaults/alcf_polaris/job-template.sh
@@ -8,8 +8,6 @@
export http_proxy="http://proxy:3128"
export https_proxy="http://proxy:3128"

export PYTHONPATH=/home/turam/dev/polaris/balsam:$PYTHONPATH

#remove export PMI_NO_FORK=1
export BALSAM_SITE_PATH={{balsam_site_path}}
cd $BALSAM_SITE_PATH
22 changes: 18 additions & 4 deletions balsam/platform/app_run/app_run.py
@@ -8,6 +8,7 @@

import psutil # type: ignore

from balsam.platform.compute_node import ComputeNode
from balsam.site.launcher import NodeSpec

logger = logging.getLogger(__name__)
@@ -67,10 +68,23 @@ def get_num_ranks(self) -> int:
        return self._ranks_per_node * len(self._node_spec.node_ids)

    def get_cpus_per_rank(self) -> int:
        cpu_per_rank = len(self._node_spec.cpu_ids[0]) // self._ranks_per_node
        if not cpu_per_rank:
            cpu_per_rank = max(1, int(self._threads_per_rank // self._threads_per_core))
        return cpu_per_rank
        # Get the list of cpus assigned to the job. If it is a single node job, that is stored in
        # the NodeSpec object. If it is a multinode job, the cpu_ids assigned to NodeSpec is empty,
        # so we will assume all cpus on a compute node are available to the job. The list of cpus is
        # just the list of cpus on the node in that case.
        cpu_ids = self._node_spec.cpu_ids[0]
        cpus_per_node = len(cpu_ids)
        if not cpu_ids:
            compute_node = ComputeNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0])
            cpus_per_node = len(compute_node.cpu_ids)

        cpus_per_rank = cpus_per_node // self._ranks_per_node

        # If ranks are oversubscribed to cpus (ranks_per_node > cpus_per_node), set it to a minimum of
        # 1 cpu per rank or the number of cores per rank from the threading settings
        if not cpus_per_rank:
            cpus_per_rank = max(1, int(self._threads_per_rank // self._threads_per_core))
        return cpus_per_rank

    @abstractmethod
    def start(self) -> None:
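The fallback logic in the new get_cpus_per_rank is easier to read in isolation. Below is a minimal sketch of the same arithmetic with plain integers; the names cpus_per_rank_sketch, single_node_cpu_ids, and node_cpu_count are illustrative and not Balsam APIs:

# Hedged sketch: mirrors the fallback arithmetic above with plain values,
# not the real NodeSpec/ComputeNode objects.
def cpus_per_rank_sketch(single_node_cpu_ids, node_cpu_count, ranks_per_node,
                         threads_per_rank=1, threads_per_core=1):
    # Single-node jobs carry an explicit cpu list; multi-node jobs carry an empty one,
    # so we fall back to the full cpu count of a compute node.
    cpus_per_node = len(single_node_cpu_ids) if single_node_cpu_ids else node_cpu_count
    cpus_per_rank = cpus_per_node // ranks_per_node
    # Oversubscribed ranks still get at least one cpu (or one per hardware-thread group).
    if not cpus_per_rank:
        cpus_per_rank = max(1, threads_per_rank // threads_per_core)
    return cpus_per_rank

# Example: 64 ranks on a 32-cpu node -> oversubscribed, falls back to 1 cpu per rank.
assert cpus_per_rank_sketch([], node_cpu_count=32, ranks_per_node=64) == 1
# Example: single-node job with 32 cpus and 4 ranks -> 8 cpus per rank.
assert cpus_per_rank_sketch(list(range(32)), node_cpu_count=32, ranks_per_node=4) == 8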
93 changes: 91 additions & 2 deletions balsam/platform/app_run/polaris.py
@@ -1,5 +1,12 @@
import logging
import os

from balsam.platform.compute_node import PolarisNode

from .app_run import SubprocessAppRun

logger = logging.getLogger(__name__)


class PolarisRun(SubprocessAppRun):
"""
Expand All @@ -8,7 +15,59 @@ class PolarisRun(SubprocessAppRun):

    def _build_cmdline(self) -> str:
        node_ids = [h for h in self._node_spec.hostnames]
        cpu_bind = self._launch_params.get("cpu_bind", "none")

        # If the user does not set a cpu_bind option,
        # this code sets cpu-bind to be optimal for the gpus being used.
        # It does not handle the case where the application uses fewer than
        # 8 cpus per gpu: the code will not skip the appropriate number of cpus
        # in the rank binding assignments.
        if "cpu_bind" in self._launch_params.keys():
            cpu_bind = self._launch_params.get("cpu_bind")
        elif "--cpu-bind" in self._launch_params.keys():
            cpu_bind = self._launch_params.get("--cpu-bind")
        else:
            # Here we grab the cpu_ids assigned to the job in the NodeSpec object.
            # If this is not set in NodeSpec (it is only set for single node jobs),
            # then we take the cpu_id list from the Polaris ComputeNode subclass,
            # assuming the job will have use of all the cpus in the nodes assigned to it.
            cpu_ids = self._node_spec.cpu_ids[0]
            polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0])
            if not cpu_ids:
                cpu_ids = polaris_node.cpu_ids

            cpus_per_rank = self.get_cpus_per_rank()

            # PolarisNode reverses the order of the gpu_ids, so assigning the cpu-bind
            # in ascending cpu order is what we want here.
            cpu_bind_list = ["list"]
            for irank in range(self._ranks_per_node):
                cpu_bind_list.append(":")
                for i in range(cpus_per_rank):
                    if i > 0:
                        cpu_bind_list.append(",")
                    cid = str(cpu_ids[i + cpus_per_rank * irank])
                    cpu_bind_list.append(cid)
                    # If the job is using 2 hardware threads per core, we need to add those threads to the list.
                    # The additional threads should go in the same ascending order (threads 0 and 32 are on the
                    # same physical core, threads 31 and 63 are on the same physical core).
                    if self._threads_per_core == 2:
                        cpu_bind_list.append(",")
                        cid = str(cpu_ids[i + cpus_per_rank * irank] + len(polaris_node.cpu_ids))
                        cpu_bind_list.append(cid)
            cpu_bind = "".join(cpu_bind_list)

        launch_params = []
        for k in self._launch_params.keys():
            if k != "cpu_bind" and k != "--cpu-bind":
                launch_params.append(str(self._launch_params[k]))

        # The value of -d depends on the setting of cpu_bind. If cpu-bind=core, -d is the number of
        # physical cores per rank; otherwise it is the number of hardware threads per rank.
        # https://docs.alcf.anl.gov/running-jobs/example-job-scripts/
        depth = self._threads_per_rank
        if "core" == cpu_bind:
            depth = self.get_cpus_per_rank()

        nid_str = ",".join(map(str, node_ids))
        args = [
            "mpiexec",
@@ -21,7 +80,37 @@ def _build_cmdline(self) -> str:
            "--cpu-bind",
            cpu_bind,
            "-d",
            self._threads_per_rank,
            depth,
            *launch_params,
            self._cmdline,
        ]
        return " ".join(str(arg) for arg in args)

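For concreteness, the else-branch above is easier to follow with numbers. A standalone sketch, assuming a single-node job with 4 ranks per node, 8 cpus per rank, and 2 hardware threads per core (example values, not taken from the commit):

# Hedged sketch of the cpu-bind string construction, outside of Balsam.
cpu_ids = list(range(32))          # Polaris exposes cpus 0-31 (hardware threads 32-63)
ranks_per_node, cpus_per_rank, threads_per_core = 4, 8, 2

parts = ["list"]
for irank in range(ranks_per_node):
    parts.append(":")
    for i in range(cpus_per_rank):
        if i > 0:
            parts.append(",")
        cid = cpu_ids[i + cpus_per_rank * irank]
        parts.append(str(cid))
        if threads_per_core == 2:
            # The sibling hardware thread lives 32 ids above its physical core.
            parts.append("," + str(cid + len(cpu_ids)))
print("".join(parts))
# -> list:0,32,1,33,...,7,39:8,40,...,15,47:16,48,...,23,55:24,56,...,31,63

With a hardware-thread list like this, -d stays at threads_per_rank; only an explicit cpu_bind=core launch parameter switches -d to get_cpus_per_rank().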
    def _set_envs(self) -> None:
        envs = os.environ.copy()
        envs.update(self._envs)

        # Here we grab the gpus assigned to the job from NodeSpec. NodeSpec only
        # sets this for single node jobs. For multinode jobs, gpu_ids below will
        # be an empty list of lists (e.g. [[], []]). The ordering of the gpu_ids
        # is reversed in PolarisNode, and therefore the reverse ordering of
        # cpus to gpus should be reflected here.
        gpu_ids = self._node_spec.gpu_ids[0]
        cpu_ids = self._node_spec.cpu_ids[0]
        logger.info(f"Polaris set_envs: gpu_ids={gpu_ids} cpu_ids={cpu_ids}")

        # Here we set CUDA_VISIBLE_DEVICES for single node jobs only. We assume
        # for multinode jobs that the job has access to all gpus and that
        # CUDA_VISIBLE_DEVICES is set by the user, for example by local rank with a
        # gpu_affinity.sh script that wraps around the user application in the
        # ApplicationDefinition.
        # One special case: if your job has one node, 2 ranks, and 1 gpu per rank, the
        # code here will set CUDA_VISIBLE_DEVICES to "3,2" or "1,0". A user-provided
        # gpu_affinity.sh script should take this assignment and use it to reset
        # CUDA_VISIBLE_DEVICES for each local rank. The user script should NOT
        # round-robin the setting of CUDA_VISIBLE_DEVICES starting from 3.
        if gpu_ids:
            envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
        envs["OMP_NUM_THREADS"] = str(self._threads_per_rank)
        self._envs = envs
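To see the single-node behaviour end to end, here is a hedged sketch that emulates the environment block above with hard-coded example values; polaris_job_envs is an illustrative helper, not part of Balsam:

# Hedged sketch: emulate the env handling for a hypothetical single-node,
# 2-rank job that was assigned gpus [3, 2] (reversed order) by the launcher.
import os

def polaris_job_envs(gpu_ids, threads_per_rank, extra_envs=None):
    envs = os.environ.copy()
    envs.update(extra_envs or {})
    if gpu_ids:  # single-node case: the spec carries explicit gpu ids
        envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
    envs["OMP_NUM_THREADS"] = str(threads_per_rank)
    return envs

envs = polaris_job_envs(gpu_ids=[3, 2], threads_per_rank=4)
print(envs["CUDA_VISIBLE_DEVICES"])  # -> "3,2"; a per-rank wrapper script then picks one entry per local rank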
6 changes: 4 additions & 2 deletions balsam/platform/compute_node/alcf_polaris_node.py
@@ -10,10 +10,12 @@


class PolarisNode(ComputeNode):
    # turam: confirm number of cpus
    cpu_ids = list(range(64))
    cpu_ids = list(range(32))
    gpu_ids: List[IntStr] = list(range(4))

    # cms21: optimal gpu/cpu binding on Polaris nodes goes in reverse order
    gpu_ids.reverse()

    @classmethod
    def get_job_nodelist(cls) -> List["PolarisNode"]:
        """
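Because gpu_ids is reversed while the cpu ids stay ascending, the first block of cpus pairs with the last gpu. A small sketch of that pairing, assuming 8 cpus per gpu as in the cpu-bind code above (the per-rank grouping is an illustration of the intent, not code from the commit):

# Hedged sketch: pair ascending 8-cpu blocks with the reversed gpu list,
# mirroring the cpu_ids / gpu_ids class attributes in the diff above.
cpu_ids = list(range(32))
gpu_ids = list(range(4))
gpu_ids.reverse()                      # [3, 2, 1, 0]

for rank, gpu in enumerate(gpu_ids):
    cpus = cpu_ids[rank * 8:(rank + 1) * 8]
    print(f"rank {rank}: gpu {gpu}, cpus {cpus[0]}-{cpus[-1]}")
# rank 0: gpu 3, cpus 0-7
# rank 1: gpu 2, cpus 8-15
# rank 2: gpu 1, cpus 16-23
# rank 3: gpu 0, cpus 24-31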
