From fd8a73659c89fc5d2678a6343002f7a2fc0a1a1f Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Fri, 20 Dec 2024 01:21:32 +0100
Subject: [PATCH] Add warnings and docs for affinity set failure (#1420)

When PyNVML fails to identify CPU affinity appropriately, it may cause an
error when launching Dask-CUDA. After extensive discussions in
https://github.com/rapidsai/dask-cuda/issues/1381, it seems appropriate to
allow continuing if CPU affinity identification fails and to print a warning
with a link to documentation instead. New documentation is also added to help
with the first steps of troubleshooting.

Unfortunately, testing warnings in Distributed plugins seems very hard to do;
I couldn't find a way to do that even with
`distributed.utils_tests.captured_logger`, which runs only after the cluster
is created with a `LocalCluster` (or `LocalCUDACluster`). For the
`dask cuda worker` CLI, there's no way for us to mock the value passed to
`CPUAffinity` to force a warning to be raised, so no tests are added at this
time.

Closes #1381.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: https://github.com/rapidsai/dask-cuda/pull/1420
---
 dask_cuda/plugins.py            | 11 ++++++-
 docs/source/troubleshooting.rst | 57 +++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py
index cd1928af9..ef60e52ec 100644
--- a/dask_cuda/plugins.py
+++ b/dask_cuda/plugins.py
@@ -1,4 +1,5 @@
 import importlib
+import logging
 import os
 from typing import Callable, Dict
 
@@ -12,7 +13,15 @@ def __init__(self, cores):
         self.cores = cores
 
     def setup(self, worker=None):
-        os.sched_setaffinity(0, self.cores)
+        try:
+            os.sched_setaffinity(0, self.cores)
+        except Exception:
+            logger = logging.getLogger("distributed.worker")
+            logger.warning(
+                "Setting CPU affinity for GPU failed. Please refer to the following "
+                "link for troubleshooting information: "
+                "https://docs.rapids.ai/api/dask-cuda/nightly/troubleshooting/#setting-cpu-affinity-failure"  # noqa: E501
+            )
 
 
 class CUDFSetup(WorkerPlugin):
diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
index 3af5e08dd..d83c55778 100644
--- a/docs/source/troubleshooting.rst
+++ b/docs/source/troubleshooting.rst
@@ -30,3 +30,60 @@ For the DGX Station A100, the display GPU is commonly the fourth in the PCI Bus
 
     >>> from dask_cuda import LocalCUDACluster
    >>> cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES=[0, 1, 2, 4])
+
+Setting CPU Affinity Failure
+----------------------------
+
+Setting the proper CPU affinity for a Dask-CUDA worker is important to ensure optimal performance, particularly when
+memory transfers to/from system memory are necessary. In Dask-CUDA this is an automatic feature that attempts to
+determine the appropriate CPU affinity for each worker according to the GPU that worker is targeting.
+
+There are situations where setting the CPU affinity may fail. The most common case involves workload managers and job
+schedulers used by large compute clusters, such as Slurm.
+
+On a node with multiple physical CPUs (i.e., multiple CPU sockets) and multiple GPUs, it is common for each GPU to be
+directly connected to a specific physical CPU to balance resources. Consider, for example, a node with 4 GPUs and 40
+CPU cores where the cores are split between two physical CPUs: GPUs 0 and 1 may be connected to CPUs 0-19, and GPUs 2
+and 3 to CPUs 20-39.
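+
+The sketch below (hypothetical, not part of Dask-CUDA; the name ``check_affinity.py`` is only illustrative) shows one
+way to inspect this mapping with PyNVML and compare it against the CPUs the current process, and therefore the job, is
+actually allowed to use:
+
+.. code-block:: python
+
+    # check_affinity.py (illustrative): compare each GPU's NVML-reported CPU
+    # affinity with the CPUs available to this process.
+    import math
+    import os
+    from multiprocessing import cpu_count
+
+    import pynvml
+
+    pynvml.nvmlInit()
+    n_words = math.ceil(cpu_count() / 64)  # affinity bitmask length in 64-bit words
+    for i in range(pynvml.nvmlDeviceGetCount()):
+        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+        words = pynvml.nvmlDeviceGetCpuAffinity(handle, n_words)
+        # Unpack the bitmask words into the set of CPU indices preferred by this GPU.
+        preferred = {
+            64 * w + bit
+            for w, word in enumerate(words)
+            for bit in range(64)
+            if word & (1 << bit)
+        }
+        allowed = os.sched_getaffinity(0)  # CPUs this process may run on
+        print(f"GPU {i}: preferred CPUs: {sorted(preferred)}")
+        print(f"GPU {i}: usable CPUs:    {sorted(preferred & allowed)}")
+
+If the usable set comes back empty for a GPU, the job does not have access to the CPUs that GPU prefers; this is the
+failure mode described next.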
+
+In a topology like the example above, if the node is entirely assigned to the Dask-CUDA job, setting the CPU affinity
+will most likely succeed. However, it is still possible for the job to be assigned mismatched resources, e.g., CPUs
+20-39 together with GPUs 0 and 1, or CPUs 0-19 together with GPUs 2 and 3; in that case setting the CPU affinity is
+impossible, since the correct CPU/GPU pairs are not available to the job. When this happens, the best Dask-CUDA can do
+is raise a warning that redirects you to this section and skip setting any CPU affinity, letting the operating system
+handle all transfers as it sees fit, even if they may follow a suboptimal path.
+
+If the problem persists after following the instructions in this section, including consulting your cluster's manual
+and administrators, please `file an issue in the Dask-CUDA repository <https://github.com/rapidsai/dask-cuda/issues>`_,
+including the output of all the commands below, which must be executed from within the allocated cluster job:
+
+- ``conda list``, if the environment was installed with conda or uses a RAPIDS-provided Docker image;
+- ``pip list``, if the environment was installed with pip;
+- ``nvidia-smi``;
+- ``nvidia-smi topo -m``;
+- ``python print_affinity.py``, the code for ``print_affinity.py`` immediately follows.
+
+.. code-block:: python
+
+    # print_affinity.py
+    import math
+    from multiprocessing import cpu_count
+
+    import pynvml
+
+    pynvml.nvmlInit()
+    for i in range(pynvml.nvmlDeviceGetCount()):
+        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+        # The affinity is reported as a bitmask packed into 64-bit words.
+        cpu_affinity = pynvml.nvmlDeviceGetCpuAffinity(handle, math.ceil(cpu_count() / 64))
+        print(f"GPU {i}: {list(cpu_affinity)}")
+
+Slurm
+~~~~~
+
+The most commonly observed cases of this issue have been reported on Slurm clusters. Common ways to resolve it involve
+providing a specific subset of CPUs to the job with one of the following arguments:
+
+- ``--cpus-per-task=N``: the number of CPUs allocated to the job; you may need to request all CPUs to ensure each GPU
+  has all of its relevant CPUs available;
+- ``--exclusive``: ensures exclusive allocation of CPUs to the job.
+
+Unfortunately, providing exact solutions for all existing cluster configurations is not possible, so make sure to
+consult your cluster's manual and administrators for detailed information and further troubleshooting.
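+
+After adjusting the job's resources, you may want to confirm the CPU affinity each Dask-CUDA worker actually received.
+The snippet below is a minimal sketch, assuming a ``LocalCUDACluster`` started locally; for an existing cluster,
+connect the ``Client`` to its scheduler address instead:
+
+.. code-block:: python
+
+    import os
+
+    from dask_cuda import LocalCUDACluster
+    from distributed import Client
+
+    def worker_affinity():
+        # CPUs this worker process is allowed to run on. If setting the affinity
+        # failed, this typically reflects the job's original allocation rather
+        # than the CPUs preferred by the worker's GPU.
+        return sorted(os.sched_getaffinity(0))
+
+    if __name__ == "__main__":
+        cluster = LocalCUDACluster()
+        client = Client(cluster)
+        # Maps each worker's address to the CPUs assigned to its process.
+        print(client.run(worker_affinity))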