From 4945700f63a79186b99b3090cb8b94e54b4aa77a Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Fri, 28 Apr 2023 16:27:45 -0500 Subject: [PATCH 01/20] first attempt to sketch out cpu affinity bindings --- balsam/platform/app_run/polaris.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 72834f39..4da45bfd 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -1,3 +1,5 @@ +import os + from .app_run import SubprocessAppRun @@ -9,6 +11,18 @@ class PolarisRun(SubprocessAppRun): def _build_cmdline(self) -> str: node_ids = [h for h in self._node_spec.hostnames] cpu_bind = self._launch_params.get("cpu_bind", "none") + if ( + cpu_bind == "none" + and self._gpus_per_rank > 0 + and self.get_num_ranks() == 8 + and self.get_cpus_per_rank == 1 + ): + gpu_device = int(os.getenv("CUDA_VISIBLE_DEVICES")) + cpu_bind_list = ["list"] + start_cpu = 32 - 8 * (1 + gpu_device) + for i in range(8): + cpu_bind_list.append(":" + str(start_cpu + i)) + cpu_bind = "".join(cpu_bind_list) nid_str = ",".join(map(str, node_ids)) args = [ "mpiexec", From 5c85221db688af073946807a6a95324e67adf07e Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Fri, 19 May 2023 18:09:04 -0500 Subject: [PATCH 02/20] updates to polaris app run --- balsam/platform/app_run/polaris.py | 56 +++++++++++++++---- .../compute_node/alcf_polaris_node.py | 2 + 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 4da45bfd..8eb806c0 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -1,7 +1,10 @@ +import logging import os from .app_run import SubprocessAppRun +logger = logging.getLogger(__name__) + class PolarisRun(SubprocessAppRun): """ @@ -10,19 +13,35 @@ class PolarisRun(SubprocessAppRun): def _build_cmdline(self) -> str: node_ids = [h for h in self._node_spec.hostnames] + cpu_bind = self._launch_params.get("cpu_bind", "none") - if ( - cpu_bind == "none" - and self._gpus_per_rank > 0 - and self.get_num_ranks() == 8 - and self.get_cpus_per_rank == 1 - ): - gpu_device = int(os.getenv("CUDA_VISIBLE_DEVICES")) - cpu_bind_list = ["list"] - start_cpu = 32 - 8 * (1 + gpu_device) - for i in range(8): - cpu_bind_list.append(":" + str(start_cpu + i)) + if cpu_bind == "none" and self._gpus_per_rank > 0: + gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] + gpu_ids = gpu_device.split(",") + gpu_ids.reverse() + cpu_ids = self._node_spec.cpu_ids[0] + + cpu_bind_list = ["verbose,list"] + for gid in gpu_ids: + start_cpu = 32 - int(gid) * 8 - self.get_cpus_per_rank() + cpu_bind_list.append(":") + for icpu in range(self.get_cpus_per_rank()): + if icpu > 0: + cpu_bind_list.append(",") + cpu_bind_list.append(str(start_cpu + icpu)) + + # start_cpu = 32 - 8 * (1 + gpu_device) + # for i in range(8): + # cpu_bind_list.append(":" + str(start_cpu + i)) cpu_bind = "".join(cpu_bind_list) + logger.info(f"Polaris app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} gpu_ids={gpu_ids}") + + launch_params = [] + for k in self._launch_params.keys(): + if k != "cpu_bind": + launch_params.append("--" + k) + launch_params.append(str(self._launch_params[k])) + nid_str = ",".join(map(str, node_ids)) args = [ "mpiexec", @@ -36,6 +55,21 @@ def _build_cmdline(self) -> str: cpu_bind, "-d", self._threads_per_rank, + *launch_params, self._cmdline, ] return " ".join(str(arg) for arg in args) + + # Overide default because sunspot does 
not use CUDA + def _set_envs(self) -> None: + + envs = os.environ.copy() + envs.update(self._envs) + # Check the assigned GPU ID list from the first compute node: + gpu_ids = self._node_spec.gpu_ids[0] + + if gpu_ids: + envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) + envs["OMP_NUM_THREADS"] = str(self._threads_per_rank) + self._envs = envs diff --git a/balsam/platform/compute_node/alcf_polaris_node.py b/balsam/platform/compute_node/alcf_polaris_node.py index b5283c3b..af5925fb 100644 --- a/balsam/platform/compute_node/alcf_polaris_node.py +++ b/balsam/platform/compute_node/alcf_polaris_node.py @@ -12,6 +12,8 @@ class PolarisNode(ComputeNode): # turam: confirm number of cpus cpu_ids = list(range(64)) + # cms21: recommended cpu affinity for polaris nodes is in reverse order to gpu ids + cpu_ids.reverse() gpu_ids: List[IntStr] = list(range(4)) @classmethod From 9f1b1ca717bb400c22e9d32c73cd39e2f7a96d8d Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Fri, 19 May 2023 18:27:51 -0500 Subject: [PATCH 03/20] updates to polaris app run --- balsam/platform/app_run/polaris.py | 1 - 1 file changed, 1 deletion(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 8eb806c0..6b5afdec 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -18,7 +18,6 @@ def _build_cmdline(self) -> str: if cpu_bind == "none" and self._gpus_per_rank > 0: gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] gpu_ids = gpu_device.split(",") - gpu_ids.reverse() cpu_ids = self._node_spec.cpu_ids[0] cpu_bind_list = ["verbose,list"] From c811f374163ec4c6e164a9fe0c6e670790712e68 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Wed, 24 May 2023 18:49:37 -0500 Subject: [PATCH 04/20] attempt to fix cpu affinity in Polaris app_run --- balsam/platform/app_run/polaris.py | 44 ++++++++++++------- .../compute_node/alcf_polaris_node.py | 9 ++-- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 6b5afdec..f04b6277 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -1,6 +1,8 @@ import logging import os +from balsam.platform.compute_node.alcf_polaris_node import PolarisNode + from .app_run import SubprocessAppRun logger = logging.getLogger(__name__) @@ -14,26 +16,31 @@ class PolarisRun(SubprocessAppRun): def _build_cmdline(self) -> str: node_ids = [h for h in self._node_spec.hostnames] + # cms21: currently this is broken for multinode jobs + cpu_bind = self._launch_params.get("cpu_bind", "none") if cpu_bind == "none" and self._gpus_per_rank > 0: - gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] - gpu_ids = gpu_device.split(",") - cpu_ids = self._node_spec.cpu_ids[0] + polaris_node = PolarisNode() + # gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] + # gpu_ids = gpu_device.split(",") + # cpu_ids = self._node_spec.cpu_ids[0] + cpu_ids = polaris_node.cpu_ids + gpu_ids = polaris_node.gpu_ids + cpus_per_rank = self.get_cpus_per_rank() + cpu_ids_ns = self._node_spec.cpu_ids cpu_bind_list = ["verbose,list"] - for gid in gpu_ids: - start_cpu = 32 - int(gid) * 8 - self.get_cpus_per_rank() + for irank in range(self._ranks_per_node): cpu_bind_list.append(":") - for icpu in range(self.get_cpus_per_rank()): - if icpu > 0: + for i in range(cpus_per_rank): + if i > 0: cpu_bind_list.append(",") - cpu_bind_list.append(str(start_cpu + icpu)) - - # start_cpu = 32 - 8 * (1 + gpu_device) - # for i 
in range(8): - # cpu_bind_list.append(":" + str(start_cpu + i)) + cid = str(cpu_ids[i + cpus_per_rank * irank]) + cpu_bind_list.append(cid) cpu_bind = "".join(cpu_bind_list) - logger.info(f"Polaris app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} gpu_ids={gpu_ids}") + logger.info( + f"Polaris app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} cpu_ids_ns={cpu_ids_ns} gpu_ids={gpu_ids}" + ) launch_params = [] for k in self._launch_params.keys(): @@ -65,9 +72,16 @@ def _set_envs(self) -> None: envs = os.environ.copy() envs.update(self._envs) # Check the assigned GPU ID list from the first compute node: - gpu_ids = self._node_spec.gpu_ids[0] + gpu_ids = self._node_spec.gpu_ids + cpu_ids = self._node_spec.cpu_ids + logger.info(f"Polaris set_envs: gpu_ids={gpu_ids} cpu_ids={cpu_ids}") + if gpu_ids[0] and len(self._node_spec.node_ids) == 1: + envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) + if not gpu_ids[0] and len(self._node_spec.node_ids) > 1 and self._gpus_per_rank > 0: + polaris_node = PolarisNode() + gpu_ids = polaris_node.gpu_ids - if gpu_ids: envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) envs["OMP_NUM_THREADS"] = str(self._threads_per_rank) diff --git a/balsam/platform/compute_node/alcf_polaris_node.py b/balsam/platform/compute_node/alcf_polaris_node.py index af5925fb..c2788e6d 100644 --- a/balsam/platform/compute_node/alcf_polaris_node.py +++ b/balsam/platform/compute_node/alcf_polaris_node.py @@ -10,12 +10,13 @@ class PolarisNode(ComputeNode): - # turam: confirm number of cpus - cpu_ids = list(range(64)) - # cms21: recommended cpu affinity for polaris nodes is in reverse order to gpu ids - cpu_ids.reverse() + + cpu_ids = list(range(32)) gpu_ids: List[IntStr] = list(range(4)) + # cms21: optimal gpu/cpu binding on Polaris nodes goes in reverse order + gpu_ids.reverse() + @classmethod def get_job_nodelist(cls) -> List["PolarisNode"]: """ From 0936c686177f1c67c0e1980da60ab3b507bd604e Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Thu, 25 May 2023 03:16:41 -0500 Subject: [PATCH 05/20] added polaris gpu affinity script --- balsam/platform/app_run/app_run.py | 11 ++++ balsam/platform/app_run/polaris.py | 84 +++++++++++++++++++++++++----- 2 files changed, 81 insertions(+), 14 deletions(-) diff --git a/balsam/platform/app_run/app_run.py b/balsam/platform/app_run/app_run.py index ff9f2cf7..5d5af973 100644 --- a/balsam/platform/app_run/app_run.py +++ b/balsam/platform/app_run/app_run.py @@ -8,6 +8,7 @@ import psutil # type: ignore +from balsam.platform.compute_node import ComputeNode from balsam.site.launcher import NodeSpec logger = logging.getLogger(__name__) @@ -72,6 +73,16 @@ def get_cpus_per_rank(self) -> int: cpu_per_rank = max(1, int(self._threads_per_rank // self._threads_per_core)) return cpu_per_rank + def get_gpus_per_node_for_job(self) -> int: + gpus_per_node = self._gpus_per_rank * self._ranks_per_node + compute_node = ComputeNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) + total_gpus_per_node = len(compute_node.gpu_ids) + if gpus_per_node > total_gpus_per_node: + logger.warning( + f"You have too many gpus per node! 
Physical gpus={total_gpus_per_node} gpus_per_rank={self._gpus_per_rank} ranks_per_node={self._ranks_per_node}" + ) + return min(gpus_per_node, total_gpus_per_node) + @abstractmethod def start(self) -> None: pass diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index f04b6277..1a0bfae7 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -1,5 +1,6 @@ import logging import os +import stat from balsam.platform.compute_node.alcf_polaris_node import PolarisNode @@ -19,13 +20,66 @@ def _build_cmdline(self) -> str: # cms21: currently this is broken for multinode jobs cpu_bind = self._launch_params.get("cpu_bind", "none") + gpu_affinity_script = "" if cpu_bind == "none" and self._gpus_per_rank > 0: - polaris_node = PolarisNode() - # gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] - # gpu_ids = gpu_device.split(",") - # cpu_ids = self._node_spec.cpu_ids[0] - cpu_ids = polaris_node.cpu_ids - gpu_ids = polaris_node.gpu_ids + if len(self._node_spec.node_ids) == 1 or self._ranks_per_node == 1: + cpu_ids = self._node_spec.cpu_ids[0] + gpu_ids = self._node_spec.gpu_ids[0] + else: + gpu_ids = self._envs["CUDA_VISIBLE_DEVICES"].split( + "," + ) # These should be distributed across local ranks + polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) + cpu_ids = polaris_node.cpu_ids + node_gpu_ids = polaris_node.gpu_ids + gpu_affinity_script = self._cwd.joinpath("set_affinity_gpu_polaris.sh") + with open(gpu_affinity_script, "w") as f: + f.write( + f"""#!/bin/bash -l + gpu_ids=( "{" ".join(gpu_ids)}" ) + num_gpus={len(node_gpu_ids)} + gpus_per_rank={self._gpus_per_rank} + ngpu=0 + gpu_string=""\n + """ + ) + f.write( + """while [ $ngpu -lt $gpus_per_rank ] + do + igpu=$(((${PMI_LOCAL_RANK} * ${gpus_per_rank}) + ${ngpu} % ${num_gpus})) + gpu=${gpu_ids[$igpu]} + ##gpu=$((${num_gpus} - 1 - ${ngpu} - (${PMI_LOCAL_RANK} * ${gpus_per_rank}) % ${num_gpus})) + sep="" + if [ $ngpu -gt 0 ] + then + sep="," + fi + gpu_string=$gpu_string$sep$gpu + ngpu=$((${igpu} + 1)) + done + export CUDA_VISIBLE_DEVICES=$gpu_string + echo “RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= $gpu_string” + exec "$@" + """ + ) + st = os.stat(gpu_affinity_script) + os.chmod(gpu_affinity_script, st.st_mode | stat.S_IEXEC) + + # gpu_ids = polaris_node.gpu_ids + # num_gpus = len(gpu_ids) + # gpu_affinity_script = self._cwd.joinpath("set_affinity_gpu_polaris.sh") + # with open(gpu_affinity_script,"w") as f: + # f.write(f"""#!/bin/bash -l + # num_gpus={num_gpus} + # gpus_per_rank={self._gpus_per_rank}\n"""+ + # """gpu=$((${num_gpus} - 1 - ${PMI_LOCAL_RANK} % ${num_gpus}))\n + # export CUDA_VISIBLE_DEVICES=$gpu\n + # echo “RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= ${gpu}”\n + # exec "$@"\n + # """) + # st = os.stat(gpu_affinity_script) + # os.chmod(gpu_affinity_script, st.st_mode | stat.S_IEXEC) + cpus_per_rank = self.get_cpus_per_rank() cpu_ids_ns = self._node_spec.cpu_ids @@ -62,6 +116,7 @@ def _build_cmdline(self) -> str: "-d", self._threads_per_rank, *launch_params, + gpu_affinity_script, self._cmdline, ] return " ".join(str(arg) for arg in args) @@ -72,17 +127,18 @@ def _set_envs(self) -> None: envs = os.environ.copy() envs.update(self._envs) # Check the assigned GPU ID list from the first compute node: - gpu_ids = self._node_spec.gpu_ids - cpu_ids = self._node_spec.cpu_ids + gpu_ids = self._node_spec.gpu_ids[0] + cpu_ids = self._node_spec.cpu_ids[0] logger.info(f"Polaris set_envs: gpu_ids={gpu_ids} cpu_ids={cpu_ids}") - 
if gpu_ids[0] and len(self._node_spec.node_ids) == 1: + if gpu_ids: envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) - if not gpu_ids[0] and len(self._node_spec.node_ids) > 1 and self._gpus_per_rank > 0: - polaris_node = PolarisNode() - gpu_ids = polaris_node.gpu_ids + else: + polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) + if self._gpus_per_rank > 0: + gpu_ids = polaris_node.gpu_ids[0 : self.get_gpus_per_node_for_job()] + envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) - envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) envs["OMP_NUM_THREADS"] = str(self._threads_per_rank) self._envs = envs From 08e09766a6fa1ee912e209b76f56e36479ac9424 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Thu, 25 May 2023 10:20:40 -0500 Subject: [PATCH 06/20] fixes to the affinity script --- balsam/platform/app_run/polaris.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 1a0bfae7..c30f4536 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -22,7 +22,7 @@ def _build_cmdline(self) -> str: cpu_bind = self._launch_params.get("cpu_bind", "none") gpu_affinity_script = "" if cpu_bind == "none" and self._gpus_per_rank > 0: - if len(self._node_spec.node_ids) == 1 or self._ranks_per_node == 1: + if len(self._node_spec.node_ids) == 1: cpu_ids = self._node_spec.cpu_ids[0] gpu_ids = self._node_spec.gpu_ids[0] else: @@ -31,12 +31,15 @@ def _build_cmdline(self) -> str: ) # These should be distributed across local ranks polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) cpu_ids = polaris_node.cpu_ids + + if len(self._node_spec.node_ids) > 1 or self._ranks_per_node > 1: + polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) node_gpu_ids = polaris_node.gpu_ids gpu_affinity_script = self._cwd.joinpath("set_affinity_gpu_polaris.sh") with open(gpu_affinity_script, "w") as f: f.write( f"""#!/bin/bash -l - gpu_ids=( "{" ".join(gpu_ids)}" ) + gpu_ids=( {" ".join(gpu_ids)} ) num_gpus={len(node_gpu_ids)} gpus_per_rank={self._gpus_per_rank} ngpu=0 @@ -65,21 +68,6 @@ def _build_cmdline(self) -> str: st = os.stat(gpu_affinity_script) os.chmod(gpu_affinity_script, st.st_mode | stat.S_IEXEC) - # gpu_ids = polaris_node.gpu_ids - # num_gpus = len(gpu_ids) - # gpu_affinity_script = self._cwd.joinpath("set_affinity_gpu_polaris.sh") - # with open(gpu_affinity_script,"w") as f: - # f.write(f"""#!/bin/bash -l - # num_gpus={num_gpus} - # gpus_per_rank={self._gpus_per_rank}\n"""+ - # """gpu=$((${num_gpus} - 1 - ${PMI_LOCAL_RANK} % ${num_gpus}))\n - # export CUDA_VISIBLE_DEVICES=$gpu\n - # echo “RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= ${gpu}”\n - # exec "$@"\n - # """) - # st = os.stat(gpu_affinity_script) - # os.chmod(gpu_affinity_script, st.st_mode | stat.S_IEXEC) - cpus_per_rank = self.get_cpus_per_rank() cpu_ids_ns = self._node_spec.cpu_ids From e61c12cbe79afad2e8ebbcb7aa492ab558aeb329 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Thu, 25 May 2023 10:49:10 -0500 Subject: [PATCH 07/20] some style changes --- balsam/platform/app_run/polaris.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 
c30f4536..0f258033 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -2,7 +2,7 @@ import os import stat -from balsam.platform.compute_node.alcf_polaris_node import PolarisNode +from balsam.platform.compute_node import PolarisNode from .app_run import SubprocessAppRun @@ -17,8 +17,6 @@ class PolarisRun(SubprocessAppRun): def _build_cmdline(self) -> str: node_ids = [h for h in self._node_spec.hostnames] - # cms21: currently this is broken for multinode jobs - cpu_bind = self._launch_params.get("cpu_bind", "none") gpu_affinity_script = "" if cpu_bind == "none" and self._gpus_per_rank > 0: From 7bcbc52c353a06977ee880c6cd02a69d21396519 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Thu, 25 May 2023 15:39:26 -0500 Subject: [PATCH 08/20] reverting affinity script addition, put in different branch --- balsam/platform/app_run/polaris.py | 62 ++++-------------------------- 1 file changed, 8 insertions(+), 54 deletions(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 0f258033..05b506c1 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -1,6 +1,5 @@ import logging import os -import stat from balsam.platform.compute_node import PolarisNode @@ -18,56 +17,15 @@ def _build_cmdline(self) -> str: node_ids = [h for h in self._node_spec.hostnames] cpu_bind = self._launch_params.get("cpu_bind", "none") - gpu_affinity_script = "" - if cpu_bind == "none" and self._gpus_per_rank > 0: + if cpu_bind == "none" and self._gpus_per_rank > 0 and self._ranks_per_node == 1: + gpu_ids = self._envs["CUDA_VISIBLE_DEVICES"].split(",") if len(self._node_spec.node_ids) == 1: cpu_ids = self._node_spec.cpu_ids[0] - gpu_ids = self._node_spec.gpu_ids[0] else: - gpu_ids = self._envs["CUDA_VISIBLE_DEVICES"].split( - "," - ) # These should be distributed across local ranks polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) cpu_ids = polaris_node.cpu_ids - if len(self._node_spec.node_ids) > 1 or self._ranks_per_node > 1: - polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) - node_gpu_ids = polaris_node.gpu_ids - gpu_affinity_script = self._cwd.joinpath("set_affinity_gpu_polaris.sh") - with open(gpu_affinity_script, "w") as f: - f.write( - f"""#!/bin/bash -l - gpu_ids=( {" ".join(gpu_ids)} ) - num_gpus={len(node_gpu_ids)} - gpus_per_rank={self._gpus_per_rank} - ngpu=0 - gpu_string=""\n - """ - ) - f.write( - """while [ $ngpu -lt $gpus_per_rank ] - do - igpu=$(((${PMI_LOCAL_RANK} * ${gpus_per_rank}) + ${ngpu} % ${num_gpus})) - gpu=${gpu_ids[$igpu]} - ##gpu=$((${num_gpus} - 1 - ${ngpu} - (${PMI_LOCAL_RANK} * ${gpus_per_rank}) % ${num_gpus})) - sep="" - if [ $ngpu -gt 0 ] - then - sep="," - fi - gpu_string=$gpu_string$sep$gpu - ngpu=$((${igpu} + 1)) - done - export CUDA_VISIBLE_DEVICES=$gpu_string - echo “RANK= ${PMI_RANK} LOCAL_RANK= ${PMI_LOCAL_RANK} gpu= $gpu_string” - exec "$@" - """ - ) - st = os.stat(gpu_affinity_script) - os.chmod(gpu_affinity_script, st.st_mode | stat.S_IEXEC) - cpus_per_rank = self.get_cpus_per_rank() - cpu_ids_ns = self._node_spec.cpu_ids cpu_bind_list = ["verbose,list"] for irank in range(self._ranks_per_node): @@ -78,9 +36,7 @@ def _build_cmdline(self) -> str: cid = str(cpu_ids[i + cpus_per_rank * irank]) cpu_bind_list.append(cid) cpu_bind = "".join(cpu_bind_list) - logger.info( - f"Polaris app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} cpu_ids_ns={cpu_ids_ns} gpu_ids={gpu_ids}" - ) + logger.info(f"Polaris 
app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} gpu_ids={gpu_ids}") launch_params = [] for k in self._launch_params.keys(): @@ -102,7 +58,6 @@ def _build_cmdline(self) -> str: "-d", self._threads_per_rank, *launch_params, - gpu_affinity_script, self._cmdline, ] return " ".join(str(arg) for arg in args) @@ -116,15 +71,14 @@ def _set_envs(self) -> None: gpu_ids = self._node_spec.gpu_ids[0] cpu_ids = self._node_spec.cpu_ids[0] logger.info(f"Polaris set_envs: gpu_ids={gpu_ids} cpu_ids={cpu_ids}") - if gpu_ids: + if gpu_ids and self._ranks_per_node == 1: envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) - else: + if not gpu_ids and self._ranks_per_node == 1 and self._gpus_per_rank > 0: polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) - if self._gpus_per_rank > 0: - gpu_ids = polaris_node.gpu_ids[0 : self.get_gpus_per_node_for_job()] - envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) + gpu_ids = polaris_node.gpu_ids[0 : self._gpus_per_rank] + envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) envs["OMP_NUM_THREADS"] = str(self._threads_per_rank) self._envs = envs From b0973cf47b60852b84ef77ac5556087152554b0f Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Fri, 26 May 2023 10:39:22 -0500 Subject: [PATCH 09/20] removed helper function --- balsam/platform/app_run/app_run.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/balsam/platform/app_run/app_run.py b/balsam/platform/app_run/app_run.py index 5d5af973..ff9f2cf7 100644 --- a/balsam/platform/app_run/app_run.py +++ b/balsam/platform/app_run/app_run.py @@ -8,7 +8,6 @@ import psutil # type: ignore -from balsam.platform.compute_node import ComputeNode from balsam.site.launcher import NodeSpec logger = logging.getLogger(__name__) @@ -73,16 +72,6 @@ def get_cpus_per_rank(self) -> int: cpu_per_rank = max(1, int(self._threads_per_rank // self._threads_per_core)) return cpu_per_rank - def get_gpus_per_node_for_job(self) -> int: - gpus_per_node = self._gpus_per_rank * self._ranks_per_node - compute_node = ComputeNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) - total_gpus_per_node = len(compute_node.gpu_ids) - if gpus_per_node > total_gpus_per_node: - logger.warning( - f"You have too many gpus per node! 
Physical gpus={total_gpus_per_node} gpus_per_rank={self._gpus_per_rank} ranks_per_node={self._ranks_per_node}" - ) - return min(gpus_per_node, total_gpus_per_node) - @abstractmethod def start(self) -> None: pass From 77f8941307b102c088ccda0f1bc2a65d5f24ce0b Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Fri, 26 May 2023 13:54:22 -0500 Subject: [PATCH 10/20] Updates to polaris cmdline implementation after dev discussion; includes notes --- balsam/platform/app_run/polaris.py | 47 +++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 05b506c1..6ca5b397 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -17,9 +17,20 @@ def _build_cmdline(self) -> str: node_ids = [h for h in self._node_spec.hostnames] cpu_bind = self._launch_params.get("cpu_bind", "none") - if cpu_bind == "none" and self._gpus_per_rank > 0 and self._ranks_per_node == 1: - gpu_ids = self._envs["CUDA_VISIBLE_DEVICES"].split(",") - if len(self._node_spec.node_ids) == 1: + + # If the user does not set a cpu_bind option and gpus are being used, + # this code sets cpu-bind to be optimal for the gpus being used. + # This does not handle the case where the application is using less than + # 8 cpus per gpu. This code will not skip the appropriate number of cpus + # in the rank binding assignments. + if cpu_bind == "none" and self._gpus_per_rank > 0: + + # Here we grab the cpu_ids assigned to the job in the NodeSpec object + # If this is not set in NodeSpec (it is only set for single node jobs), + # then we take the cpu_id list from the Polaris ComputeNode subclass, + # assuming the job will have use of all the cpus in nodes assigned to it. + cpu_ids_ns = self._node_spec.cpu_ids[0] + if cpu_ids_ns: cpu_ids = self._node_spec.cpu_ids[0] else: polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) @@ -27,6 +38,8 @@ def _build_cmdline(self) -> str: cpus_per_rank = self.get_cpus_per_rank() + # PolarisNode reverses the order of the gpu_ids, so assigning the cpu-bind + # in ascending cpu order is what we want here. cpu_bind_list = ["verbose,list"] for irank in range(self._ranks_per_node): cpu_bind_list.append(":") @@ -36,6 +49,8 @@ def _build_cmdline(self) -> str: cid = str(cpu_ids[i + cpus_per_rank * irank]) cpu_bind_list.append(cid) cpu_bind = "".join(cpu_bind_list) + gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] + gpu_ids = gpu_device.split(",") logger.info(f"Polaris app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} gpu_ids={gpu_ids}") launch_params = [] @@ -67,18 +82,28 @@ def _set_envs(self) -> None: envs = os.environ.copy() envs.update(self._envs) - # Check the assigned GPU ID list from the first compute node: + + # Here we grab the gpus assigned to the job from NodeSpec. NodeSpec only + # sets this for single node jobs. For multinode jobs, gpu_ids below will + # be an empty list of lists (e.g. [[], []]). 
The ordering of the gpu_ids + # is reversed in PolarisNode and therefore the reverse ordering of + # cpus to gpus should be reflected here gpu_ids = self._node_spec.gpu_ids[0] cpu_ids = self._node_spec.cpu_ids[0] logger.info(f"Polaris set_envs: gpu_ids={gpu_ids} cpu_ids={cpu_ids}") - if gpu_ids and self._ranks_per_node == 1: - envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) - if not gpu_ids and self._ranks_per_node == 1 and self._gpus_per_rank > 0: - polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) - gpu_ids = polaris_node.gpu_ids[0 : self._gpus_per_rank] + + # Here we set CUDA_VISIBLE_DEVICES for single node jobs only. We assume + # for multinode jobs that the job has access to all gpus, and + # CUDA_VISIBLE_DEVICES is set by the user, for example by local rank with an + # gpu_affinity.sh script that wraps around the user application in the + # ApplicationDefinition. + # One special case: if your job has one node, 2 ranks, and 1 gpu per rank, the + # code here will set CUDA_VISIBLE_DEVICES to "3,2" or "1,0". A user provided + # gpu_affinity.sh script should take this assigment and use it to reset + # CUDA_VISIBLE_DEVICES for each local rank. The user script should NOT + # round-robin the setting CUDA_VISIBLE_DEVICES starting from 3. + if gpu_ids: envs["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" envs["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids)) - envs["OMP_NUM_THREADS"] = str(self._threads_per_rank) self._envs = envs From 2efaa8ed82c87487188a1dac01ee0d08aafb5451 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Fri, 26 May 2023 17:23:17 -0500 Subject: [PATCH 11/20] remove turam path from polaris job-template.sh --- balsam/config/defaults/alcf_polaris/job-template.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/balsam/config/defaults/alcf_polaris/job-template.sh b/balsam/config/defaults/alcf_polaris/job-template.sh index 8dae69c2..dd090dee 100644 --- a/balsam/config/defaults/alcf_polaris/job-template.sh +++ b/balsam/config/defaults/alcf_polaris/job-template.sh @@ -8,8 +8,6 @@ export http_proxy="http://proxy:3128" export https_proxy="http://proxy:3128" -export PYTHONPATH=/home/turam/dev/polaris/balsam:$PYTHONPATH - #remove export PMI_NO_FORK=1 export BALSAM_SITE_PATH={{balsam_site_path}} cd $BALSAM_SITE_PATH From 1281a794650a3311bfb671e7656153ca60bfda10 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Fri, 26 May 2023 17:24:04 -0500 Subject: [PATCH 12/20] more updates to polaris cmdline --- balsam/platform/app_run/polaris.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 6ca5b397..46062f2d 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -49,8 +49,11 @@ def _build_cmdline(self) -> str: cid = str(cpu_ids[i + cpus_per_rank * irank]) cpu_bind_list.append(cid) cpu_bind = "".join(cpu_bind_list) - gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] - gpu_ids = gpu_device.split(",") + if "CUDA_VISIBLE_DEVICES" in self._envs.keys(): + gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] + gpu_ids = gpu_device.split(",") + else: + gpu_ids = [] logger.info(f"Polaris app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} gpu_ids={gpu_ids}") launch_params = [] @@ -71,7 +74,7 @@ def _build_cmdline(self) -> str: "--cpu-bind", cpu_bind, "-d", - self._threads_per_rank, + self.get_cpus_per_rank(), *launch_params, self._cmdline, ] From 
1b64cdb798f7804b652782149d0b4d07b5449089 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Wed, 31 May 2023 21:56:52 -0500 Subject: [PATCH 13/20] changes to make depth paramter for Polaris app_run consistent with docs --- balsam/platform/app_run/app_run.py | 23 +++++++++++++++++++---- balsam/platform/app_run/polaris.py | 18 +++++++++++++++++- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/balsam/platform/app_run/app_run.py b/balsam/platform/app_run/app_run.py index ff9f2cf7..7713b974 100644 --- a/balsam/platform/app_run/app_run.py +++ b/balsam/platform/app_run/app_run.py @@ -8,6 +8,7 @@ import psutil # type: ignore +from balsam.platform.compute_node import ComputeNode from balsam.site.launcher import NodeSpec logger = logging.getLogger(__name__) @@ -67,10 +68,24 @@ def get_num_ranks(self) -> int: return self._ranks_per_node * len(self._node_spec.node_ids) def get_cpus_per_rank(self) -> int: - cpu_per_rank = len(self._node_spec.cpu_ids[0]) // self._ranks_per_node - if not cpu_per_rank: - cpu_per_rank = max(1, int(self._threads_per_rank // self._threads_per_core)) - return cpu_per_rank + + # Get the list of cpus assigned to the job. If it is a single node job, that is stored in + # the NodeSpec object. If it is a multinode job, the cpu_ids assigned to NodeSpec is empty, + # so we will assume all cpus on a compute node are available to the job. The list of cpus is + # just the list of cpus on the node in that case. + cpu_ids = self._node_spec.cpu_ids[0] + if not cpu_ids: + compute_node = ComputeNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) + cpu_ids = compute_node.cpu_ids + + cpus_per_node = len(cpu_ids) + cpus_per_rank = cpus_per_node // self._ranks_per_node + + # If ranks are oversubscribed to cpus (ranks_per_node > cpus_per_node), set it to a minimum of + # 1 cpu per rank or the number of cores per rank from the threading settings + if not cpus_per_rank: + cpus_per_rank = max(1, int(self._threads_per_rank // self._threads_per_core)) + return cpus_per_rank @abstractmethod def start(self) -> None: diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 46062f2d..5f59cd78 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -32,6 +32,8 @@ def _build_cmdline(self) -> str: cpu_ids_ns = self._node_spec.cpu_ids[0] if cpu_ids_ns: cpu_ids = self._node_spec.cpu_ids[0] + if self._threads_per_core == 2: + polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) else: polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) cpu_ids = polaris_node.cpu_ids @@ -48,6 +50,13 @@ def _build_cmdline(self) -> str: cpu_bind_list.append(",") cid = str(cpu_ids[i + cpus_per_rank * irank]) cpu_bind_list.append(cid) + # If the job is using 2 hardware threads per core, we need to add those threads to the list + # The additional threads should go in the same ascending order (threads 0 and 32 are on the + # same physical core, threads 31 and 63 are on the same physical core) + if self._threads_per_core == 2: + cpu_bind_list.append(",") + cid = str(cpu_ids[i + cpus_per_rank * irank] + len(polaris_node.cpu_ids)) + cpu_bind_list.append(cid) cpu_bind = "".join(cpu_bind_list) if "CUDA_VISIBLE_DEVICES" in self._envs.keys(): gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] @@ -62,6 +71,13 @@ def _build_cmdline(self) -> str: launch_params.append("--" + k) launch_params.append(str(self._launch_params[k])) + # The value of -d depends on the setting of cpu_bind. 
If cpu-bind=core, -d is the number of + # physical cores per rank, otherwise it is the number of hardware threads per rank + # https://docs.alcf.anl.gov/running-jobs/example-job-scripts/ + depth = self._threads_per_rank + if "core" in cpu_bind: + depth = self.get_cpus_per_rank() + nid_str = ",".join(map(str, node_ids)) args = [ "mpiexec", @@ -74,7 +90,7 @@ def _build_cmdline(self) -> str: "--cpu-bind", cpu_bind, "-d", - self.get_cpus_per_rank(), + depth, *launch_params, self._cmdline, ] From 937947ecfa7f483bb8ee789a6ba4c003f6bd7e28 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Wed, 31 May 2023 22:41:17 -0500 Subject: [PATCH 14/20] Removed blank lines --- balsam/platform/app_run/app_run.py | 1 - balsam/platform/app_run/polaris.py | 2 -- balsam/platform/compute_node/alcf_polaris_node.py | 1 - 3 files changed, 4 deletions(-) diff --git a/balsam/platform/app_run/app_run.py b/balsam/platform/app_run/app_run.py index 7713b974..2aa06e39 100644 --- a/balsam/platform/app_run/app_run.py +++ b/balsam/platform/app_run/app_run.py @@ -68,7 +68,6 @@ def get_num_ranks(self) -> int: return self._ranks_per_node * len(self._node_spec.node_ids) def get_cpus_per_rank(self) -> int: - # Get the list of cpus assigned to the job. If it is a single node job, that is stored in # the NodeSpec object. If it is a multinode job, the cpu_ids assigned to NodeSpec is empty, # so we will assume all cpus on a compute node are available to the job. The list of cpus is diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 5f59cd78..761878a5 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -24,7 +24,6 @@ def _build_cmdline(self) -> str: # 8 cpus per gpu. This code will not skip the appropriate number of cpus # in the rank binding assignments. 
if cpu_bind == "none" and self._gpus_per_rank > 0: - # Here we grab the cpu_ids assigned to the job in the NodeSpec object # If this is not set in NodeSpec (it is only set for single node jobs), # then we take the cpu_id list from the Polaris ComputeNode subclass, @@ -98,7 +97,6 @@ def _build_cmdline(self) -> str: # Overide default because sunspot does not use CUDA def _set_envs(self) -> None: - envs = os.environ.copy() envs.update(self._envs) diff --git a/balsam/platform/compute_node/alcf_polaris_node.py b/balsam/platform/compute_node/alcf_polaris_node.py index c2788e6d..208490a1 100644 --- a/balsam/platform/compute_node/alcf_polaris_node.py +++ b/balsam/platform/compute_node/alcf_polaris_node.py @@ -10,7 +10,6 @@ class PolarisNode(ComputeNode): - cpu_ids = list(range(32)) gpu_ids: List[IntStr] = list(range(4)) From 8d6f5f00f1fd2f14c703998e8955c7074640f478 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Wed, 31 May 2023 22:51:30 -0500 Subject: [PATCH 15/20] lint fixes --- balsam/_api/model.py | 2 +- balsam/config/config.py | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/balsam/_api/model.py b/balsam/_api/model.py index 05d4831a..38ae2c48 100644 --- a/balsam/_api/model.py +++ b/balsam/_api/model.py @@ -186,7 +186,7 @@ def __repr__(self) -> str: def __str__(self) -> str: d = self.display_dict() - return yaml.dump(d, sort_keys=False, indent=4) # type: ignore + return yaml.dump(d, sort_keys=False, indent=4) def __eq__(self, other: Any) -> bool: if not isinstance(other, BalsamModel): diff --git a/balsam/config/config.py b/balsam/config/config.py index 5766afc5..00d95c69 100644 --- a/balsam/config/config.py +++ b/balsam/config/config.py @@ -235,13 +235,10 @@ def save(self, path: Union[str, Path]) -> None: fp.write(self.dump_yaml()) def dump_yaml(self) -> str: - return cast( - str, - yaml.dump( - json.loads(self.json()), - sort_keys=False, - indent=4, - ), + return yaml.dump( + json.loads(self.json()), + sort_keys=False, + indent=4, ) @classmethod From c57beb787ba78471dbf73950cb78434aaf0fc0a4 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Thu, 1 Jun 2023 11:56:46 -0500 Subject: [PATCH 16/20] fix type error --- balsam/platform/app_run/app_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/balsam/platform/app_run/app_run.py b/balsam/platform/app_run/app_run.py index 2aa06e39..c4efb45b 100644 --- a/balsam/platform/app_run/app_run.py +++ b/balsam/platform/app_run/app_run.py @@ -75,7 +75,7 @@ def get_cpus_per_rank(self) -> int: cpu_ids = self._node_spec.cpu_ids[0] if not cpu_ids: compute_node = ComputeNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) - cpu_ids = compute_node.cpu_ids + cpu_ids = list(compute_node.cpu_ids) cpus_per_node = len(cpu_ids) cpus_per_rank = cpus_per_node // self._ranks_per_node From 0691ed3d88c2170c4f355808e9c311d95145aecf Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Thu, 1 Jun 2023 12:00:49 -0500 Subject: [PATCH 17/20] fix type error --- balsam/platform/app_run/app_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/balsam/platform/app_run/app_run.py b/balsam/platform/app_run/app_run.py index c4efb45b..3c25c4f6 100644 --- a/balsam/platform/app_run/app_run.py +++ b/balsam/platform/app_run/app_run.py @@ -73,11 +73,11 @@ def get_cpus_per_rank(self) -> int: # so we will assume all cpus on a compute node are available to the job. The list of cpus is # just the list of cpus on the node in that case. 
cpu_ids = self._node_spec.cpu_ids[0] + cpus_per_node = len(cpu_ids) if not cpu_ids: compute_node = ComputeNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) - cpu_ids = list(compute_node.cpu_ids) + cpus_per_node = len(compute_node.cpu_ids) - cpus_per_node = len(cpu_ids) cpus_per_rank = cpus_per_node // self._ranks_per_node # If ranks are oversubscribed to cpus (ranks_per_node > cpus_per_node), set it to a minimum of From ad0e661b68a9bfd9f5c37ee537c75dac48043bc5 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Tue, 13 Jun 2023 16:28:22 -0500 Subject: [PATCH 18/20] made change to accept a user setting cpu_bind to none --- balsam/platform/app_run/polaris.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 761878a5..d18efb12 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -16,14 +16,14 @@ class PolarisRun(SubprocessAppRun): def _build_cmdline(self) -> str: node_ids = [h for h in self._node_spec.hostnames] - cpu_bind = self._launch_params.get("cpu_bind", "none") - - # If the user does not set a cpu_bind option and gpus are being used, + # If the user does not set a cpu_bind option, # this code sets cpu-bind to be optimal for the gpus being used. # This does not handle the case where the application is using less than # 8 cpus per gpu. This code will not skip the appropriate number of cpus # in the rank binding assignments. - if cpu_bind == "none" and self._gpus_per_rank > 0: + if "cpu_bind" in self._launch_params.keys(): + cpu_bind = self._launch_params.get("cpu_bind", "none") + else: # Here we grab the cpu_ids assigned to the job in the NodeSpec object # If this is not set in NodeSpec (it is only set for single node jobs), # then we take the cpu_id list from the Polaris ComputeNode subclass, @@ -57,12 +57,12 @@ def _build_cmdline(self) -> str: cid = str(cpu_ids[i + cpus_per_rank * irank] + len(polaris_node.cpu_ids)) cpu_bind_list.append(cid) cpu_bind = "".join(cpu_bind_list) - if "CUDA_VISIBLE_DEVICES" in self._envs.keys(): - gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] - gpu_ids = gpu_device.split(",") - else: - gpu_ids = [] - logger.info(f"Polaris app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} gpu_ids={gpu_ids}") + # if "CUDA_VISIBLE_DEVICES" in self._envs.keys(): + # gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] + # gpu_ids = gpu_device.split(",") + # else: + # gpu_ids = [] + # logger.info(f"Polaris app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} gpu_ids={gpu_ids}") launch_params = [] for k in self._launch_params.keys(): From 6a10eb72b8d421cce25a6166b90abf029d49596e Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Fri, 4 Aug 2023 16:52:42 -0500 Subject: [PATCH 19/20] polaris app_run cleanup --- balsam/platform/app_run/polaris.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index d18efb12..3bc86a65 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -22,26 +22,24 @@ def _build_cmdline(self) -> str: # 8 cpus per gpu. This code will not skip the appropriate number of cpus # in the rank binding assignments. 
if "cpu_bind" in self._launch_params.keys(): - cpu_bind = self._launch_params.get("cpu_bind", "none") + cpu_bind = self._launch_params.get("cpu_bind") + elif "--cpu-bind" in self._launch_params.keys(): + cpu_bind = self._launch_params.get("--cpu-bind") else: # Here we grab the cpu_ids assigned to the job in the NodeSpec object # If this is not set in NodeSpec (it is only set for single node jobs), # then we take the cpu_id list from the Polaris ComputeNode subclass, # assuming the job will have use of all the cpus in nodes assigned to it. - cpu_ids_ns = self._node_spec.cpu_ids[0] - if cpu_ids_ns: - cpu_ids = self._node_spec.cpu_ids[0] - if self._threads_per_core == 2: - polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) - else: - polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) + cpu_ids = self._node_spec.cpu_ids[0] + polaris_node = PolarisNode(self._node_spec.node_ids[0], self._node_spec.hostnames[0]) + if not cpu_ids: cpu_ids = polaris_node.cpu_ids cpus_per_rank = self.get_cpus_per_rank() # PolarisNode reverses the order of the gpu_ids, so assigning the cpu-bind # in ascending cpu order is what we want here. - cpu_bind_list = ["verbose,list"] + cpu_bind_list = ["list"] for irank in range(self._ranks_per_node): cpu_bind_list.append(":") for i in range(cpus_per_rank): @@ -57,17 +55,10 @@ def _build_cmdline(self) -> str: cid = str(cpu_ids[i + cpus_per_rank * irank] + len(polaris_node.cpu_ids)) cpu_bind_list.append(cid) cpu_bind = "".join(cpu_bind_list) - # if "CUDA_VISIBLE_DEVICES" in self._envs.keys(): - # gpu_device = self._envs["CUDA_VISIBLE_DEVICES"] - # gpu_ids = gpu_device.split(",") - # else: - # gpu_ids = [] - # logger.info(f"Polaris app_run: cpu_bind={cpu_bind} cpu_ids={cpu_ids} gpu_ids={gpu_ids}") launch_params = [] for k in self._launch_params.keys(): - if k != "cpu_bind": - launch_params.append("--" + k) + if k != "cpu_bind" and k != "--cpu-bind": launch_params.append(str(self._launch_params[k])) # The value of -d depends on the setting of cpu_bind. If cpu-bind=core, -d is the number of @@ -95,7 +86,6 @@ def _build_cmdline(self) -> str: ] return " ".join(str(arg) for arg in args) - # Overide default because sunspot does not use CUDA def _set_envs(self) -> None: envs = os.environ.copy() envs.update(self._envs) From 020ae447d1ade936cc71f4e4e243c80fab08ae09 Mon Sep 17 00:00:00 2001 From: Christine Simpson Date: Fri, 4 Aug 2023 16:58:11 -0500 Subject: [PATCH 20/20] lint fix --- balsam/platform/app_run/polaris.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/balsam/platform/app_run/polaris.py b/balsam/platform/app_run/polaris.py index 3bc86a65..20f6ea22 100644 --- a/balsam/platform/app_run/polaris.py +++ b/balsam/platform/app_run/polaris.py @@ -65,7 +65,7 @@ def _build_cmdline(self) -> str: # physical cores per rank, otherwise it is the number of hardware threads per rank # https://docs.alcf.anl.gov/running-jobs/example-job-scripts/ depth = self._threads_per_rank - if "core" in cpu_bind: + if "core" == cpu_bind: depth = self.get_cpus_per_rank() nid_str = ",".join(map(str, node_ids))