From 093927b1c2befadb223ff704f0ebe5154b44d935 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 6 Jun 2023 19:53:02 +0200 Subject: [PATCH] Increase minimum timeout to wait for workers in CI (#1192) We have been getting timeouts waiting for workers in CI, which are not reproducible locally. The reason for that is probably some sort of congestion causing spinup to take longer in CI; therefore, this change introduces a variable that can be used to control the minimum timeout, and the minimum timeout is doubled in CI. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/dask-cuda/pull/1192 --- ci/test_python.sh | 1 + dask_cuda/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index b9610bcaf..c988ee15e 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -41,6 +41,7 @@ set +e rapids-logger "pytest dask-cuda" pushd dask_cuda DASK_CUDA_TEST_SINGLE_GPU=1 \ +DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ UCXPY_IFNAME=eth0 \ UCX_WARN_UNUSED_ENV_VARS=n \ UCX_MEMTYPE_CACHE=n \ diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 468c37f47..9fe31333b 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -446,7 +446,9 @@ def wait_workers( client: distributed.Client Instance of client, used to query for number of workers connected. min_timeout: float - Minimum number of seconds to wait before timeout. + Minimum number of seconds to wait before timeout. This value may be + overridden by setting the `DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT` environment variable to + a positive integer. seconds_per_gpu: float Seconds to wait for each GPU on the system. For example, if its value is 2 and there is a total of 8 GPUs (workers) being started, @@ -463,6 +465,8 @@ def wait_workers( ------- True if all workers were started, False if a timeout occurs. 
""" + min_timeout_env = os.environ.get("DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT", None) + min_timeout = min_timeout if min_timeout_env is None else int(min_timeout_env) n_gpus = n_gpus or get_n_gpus() timeout = max(min_timeout, seconds_per_gpu * n_gpus)