diff --git a/balsam/config/defaults/alcf_aurora/job-template.sh b/balsam/config/defaults/alcf_aurora/job-template.sh new file mode 100644 index 00000000..0a5ac69b --- /dev/null +++ b/balsam/config/defaults/alcf_aurora/job-template.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#PBS -l select={{ num_nodes }}:system=aurora,place=scatter +#PBS -l walltime={{ wall_time_min//60 | int }}:{{ wall_time_min % 60 | int }}:00 +#PBS -l filesystems=home +#PBS -A {{ project }} +#PBS -q {{ queue }} + +export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 +export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 + +#remove export PMI_NO_FORK=1 +export BALSAM_SITE_PATH={{balsam_site_path}} +cd "$BALSAM_SITE_PATH" + +echo "Starting balsam launcher at $(date)" +{{launcher_cmd}} -j {{job_mode}} -t {{wall_time_min - 2}} \ +{% for k, v in filter_tags.items() %} --tag {{k}}={{v}} {% endfor %} \ +{{partitions}} +echo "Balsam launcher done at $(date)" diff --git a/balsam/config/defaults/alcf_aurora/settings.yml b/balsam/config/defaults/alcf_aurora/settings.yml new file mode 100644 index 00000000..d69fa076 --- /dev/null +++ b/balsam/config/defaults/alcf_aurora/settings.yml @@ -0,0 +1,21 @@ +title: "Aurora (ALCF)" + +compute_node: balsam.platform.compute_node.AuroraNode +mpi_app_launcher: balsam.platform.app_run.AuroraRun +local_app_launcher: balsam.platform.app_run.LocalAppRun +mpirun_allows_node_packing: true + +serial_mode_startup_params: + cpu_affinity: none + +scheduler_class: balsam.platform.scheduler.PBSScheduler +allowed_queues: + workq: + max_nodes: 128 + max_queued_jobs: 1 + max_walltime: 240 + +allowed_projects: +- Aurora_deployment + +optional_batch_job_params: {} diff --git a/balsam/config/defaults/alcf_sunspot/settings.yml b/balsam/config/defaults/alcf_sunspot/settings.yml index 4c79cc12..daff990e 100644 --- a/balsam/config/defaults/alcf_sunspot/settings.yml +++ 
b/balsam/config/defaults/alcf_sunspot/settings.yml @@ -1,7 +1,7 @@ title: "Sunspot (ALCF)" -compute_node: balsam.platform.compute_node.SunspotNode -mpi_app_launcher: balsam.platform.app_run.SunspotRun +compute_node: balsam.platform.compute_node.AuroraNode +mpi_app_launcher: balsam.platform.app_run.AuroraRun local_app_launcher: balsam.platform.app_run.LocalAppRun mpirun_allows_node_packing: true diff --git a/balsam/config/defaults/settings.yml.j2 b/balsam/config/defaults/settings.yml.j2 index 6bdfb9ee..f4b5afe3 100644 --- a/balsam/config/defaults/settings.yml.j2 +++ b/balsam/config/defaults/settings.yml.j2 @@ -31,6 +31,7 @@ launcher: local_app_launcher: {{ local_app_launcher }} mpirun_allows_node_packing: {{ mpirun_allows_node_packing }} # mpi_app_launcher supports multiple concurrent runs per node serial_mode_prefetch_per_rank: 64 # How many jobs to prefetch from API in serial mode + # sort_by: long_large_first # Enable this option to run jobs with longest wall_time_min first, followed by jobs with largest num_nodes # Pass-through parameters to mpirun when starting the serial mode launcher: serial_mode_startup_params: {{ {} if not serial_mode_startup_params }} @@ -137,4 +138,4 @@ queue_maintainer: null file_cleaner: null # file_cleaner: # cleanup_batch_size: 180 # Clean up to this many Job workdirs at a time -# service_period: 30 # Cleanup files every `service_period` seconds \ No newline at end of file +# service_period: 30 # Cleanup files every `service_period` seconds diff --git a/balsam/config/defaults/nersc_corihaswell/job-template.sh b/balsam/config/retired/nersc_corihaswell/job-template.sh similarity index 100% rename from balsam/config/defaults/nersc_corihaswell/job-template.sh rename to balsam/config/retired/nersc_corihaswell/job-template.sh diff --git a/balsam/config/defaults/nersc_corihaswell/settings.yml b/balsam/config/retired/nersc_corihaswell/settings.yml similarity index 100% rename from balsam/config/defaults/nersc_corihaswell/settings.yml rename 
to balsam/config/retired/nersc_corihaswell/settings.yml diff --git a/balsam/config/defaults/nersc_coriknl/job-template.sh b/balsam/config/retired/nersc_coriknl/job-template.sh similarity index 100% rename from balsam/config/defaults/nersc_coriknl/job-template.sh rename to balsam/config/retired/nersc_coriknl/job-template.sh diff --git a/balsam/config/defaults/nersc_coriknl/settings.yml b/balsam/config/retired/nersc_coriknl/settings.yml similarity index 100% rename from balsam/config/defaults/nersc_coriknl/settings.yml rename to balsam/config/retired/nersc_coriknl/settings.yml diff --git a/balsam/platform/app_run/__init__.py b/balsam/platform/app_run/__init__.py index 7887597e..34229fc1 100644 --- a/balsam/platform/app_run/__init__.py +++ b/balsam/platform/app_run/__init__.py @@ -1,11 +1,11 @@ from .app_run import AppRun, LocalAppRun +from .aurora import AuroraRun from .mpich import MPICHRun from .openmpi import OpenMPIRun from .perlmutter import PerlmutterRun from .polaris import PolarisRun from .slurm import SlurmRun from .summit import SummitJsrun -from .sunspot import SunspotRun from .theta import ThetaAprun from .theta_gpu import ThetaGPURun @@ -19,6 +19,6 @@ "ThetaGPURun", "MPICHRun", "SummitJsrun", - "SunspotRun", + "AuroraRun", "PerlmutterRun", ] diff --git a/balsam/platform/app_run/sunspot.py b/balsam/platform/app_run/aurora.py similarity index 92% rename from balsam/platform/app_run/sunspot.py rename to balsam/platform/app_run/aurora.py index dbe178fd..af28644a 100644 --- a/balsam/platform/app_run/sunspot.py +++ b/balsam/platform/app_run/aurora.py @@ -3,7 +3,7 @@ from .app_run import SubprocessAppRun -class SunspotRun(SubprocessAppRun): +class AuroraRun(SubprocessAppRun): """ https://www.open-mpi.org/doc/v3.0/man1/mpiexec.1.php """ @@ -29,7 +29,7 @@ def _build_cmdline(self) -> str: ] return " ".join(str(arg) for arg in args) - # Overide default because sunspot does not use CUDA + # Overide default because aurora/sunspot does not use CUDA def _set_envs(self) 
-> None: envs = os.environ.copy() envs.update(self._envs) diff --git a/balsam/platform/compute_node/__init__.py b/balsam/platform/compute_node/__init__.py index 002cc229..d45d519f 100644 --- a/balsam/platform/compute_node/__init__.py +++ b/balsam/platform/compute_node/__init__.py @@ -1,12 +1,10 @@ +from .alcf_aurora_node import AuroraNode from .alcf_cooley_node import CooleyNode from .alcf_polaris_node import PolarisNode -from .alcf_sunspot_node import SunspotNode from .alcf_thetagpu_node import ThetaGPUNode from .alcf_thetaknl_node import ThetaKNLNode from .compute_node import ComputeNode from .default import DefaultNode -from .nersc_corihas_node import CoriHaswellNode -from .nersc_coriknl_node import CoriKNLNode from .nersc_perlmutter import PerlmutterNode from .summit_node import SummitNode @@ -16,10 +14,8 @@ "SummitNode", "ThetaGPUNode", "CooleyNode", - "CoriHaswellNode", - "CoriKNLNode", "PerlmutterNode", "PolarisNode", - "SunspotNode", + "AuroraNode", "ComputeNode", ] diff --git a/balsam/platform/compute_node/alcf_sunspot_node.py b/balsam/platform/compute_node/alcf_aurora_node.py similarity index 94% rename from balsam/platform/compute_node/alcf_sunspot_node.py rename to balsam/platform/compute_node/alcf_aurora_node.py index 715f4e42..8afcfaa1 100644 --- a/balsam/platform/compute_node/alcf_sunspot_node.py +++ b/balsam/platform/compute_node/alcf_aurora_node.py @@ -8,7 +8,7 @@ IntStr = Union[int, str] -class SunspotNode(ComputeNode): +class AuroraNode(ComputeNode): cpu_ids = list(range(104)) gpu_ids: List[IntStr] @@ -18,7 +18,7 @@ class SunspotNode(ComputeNode): gpu_ids.append(str(gid) + "." 
+ str(tid)) @classmethod - def get_job_nodelist(cls) -> List["SunspotNode"]: + def get_job_nodelist(cls) -> List["AuroraNode"]: """ Get all compute nodes allocated in the current job context """ diff --git a/balsam/platform/compute_node/nersc_corihas_node.py b/balsam/platform/compute_node/retired/nersc_corihas_node.py similarity index 100% rename from balsam/platform/compute_node/nersc_corihas_node.py rename to balsam/platform/compute_node/retired/nersc_corihas_node.py diff --git a/balsam/platform/compute_node/nersc_coriknl_node.py b/balsam/platform/compute_node/retired/nersc_coriknl_node.py similarity index 100% rename from balsam/platform/compute_node/nersc_coriknl_node.py rename to balsam/platform/compute_node/retired/nersc_coriknl_node.py diff --git a/docs/tutorials/theta-quickstart.md b/docs/tutorials/theta-quickstart.md index f4015611..2d729e62 100644 --- a/docs/tutorials/theta-quickstart.md +++ b/docs/tutorials/theta-quickstart.md @@ -9,9 +9,9 @@ the available default site setups: - Theta-GPU - Theta-KNL - Cooley -- Cori (Haswell or KNL partitions) - Perlmutter - Summit +- Aurora (coming soon) ## Install diff --git a/docs/user-guide/batchjob.md b/docs/user-guide/batchjob.md index 19050ecc..1c04756d 100644 --- a/docs/user-guide/batchjob.md +++ b/docs/user-guide/batchjob.md @@ -76,6 +76,26 @@ multiple runs per node. smaller BatchJobs can get through the queues faster and improve overall throughput. +## Ordering Job Execution + +By default, Balsam will sort jobs that are ready to run first by `num_nodes` +in ascending order, then by `node_packing_count` in descending order, and finally +by `wall_time_min` in descending order. This default behavior will result in +the smallest jobs by node count starting first. + +There is an alternative sorting model that can be enabled that sorts jobs first +by `wall_time_min` in descending order, then by `num_nodes` in descending order, +and finally by `node_packing_count` in descending order. 
This alternative +sorting behavior will start the longest running jobs, as estimated by +`wall_time_min`, first. If jobs have no `wall_time_min` set, it will start +the largest jobs by node count first. This alternative sorting model can be +enabled for the site by modifying the site's configuration `settings.yml` file. +Under `launcher`, add this option: +```yaml +sort_by: long_large_first # set this to enable alternative sorting model that starts the longest running and largest node count jobs first +``` +Restart the site after changing `settings.yml` for the changes to take effect. + ## Using the API A unique capability of the [Balsam Python API](./api.md) is that it allows us