From b6212ea04b414012b292b35bee368e8f2b345acd Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 4 Oct 2023 19:05:43 +0200 Subject: [PATCH] Small reorganization and fixes for `test_spill` (#1255) Do a minor reorganization on how `client.run()` is invoked and centralize the functions to check host and disk chunks. The failures seem related to `del` not cleaning up objects in time, thus invoke garbage collection after `del` until its memory is actually released or the test times out. Local tests suggest that invoking garbage collection once or twice is enough to prevent the test from failing. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1255 --- dask_cuda/tests/test_spill.py | 94 ++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 30 deletions(-) diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index 6a542cfb9..6172b0bc6 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -1,3 +1,4 @@ +import gc import os from time import sleep @@ -58,7 +59,10 @@ def assert_device_host_file_size( def worker_assert( - dask_worker, total_size, device_chunk_overhead, serialized_chunk_overhead + total_size, + device_chunk_overhead, + serialized_chunk_overhead, + dask_worker=None, ) assert_device_host_file_size( dask_worker.data, total_size, device_chunk_overhead, serialized_chunk_overhead @@ -66,7 +70,10 @@ def worker_assert( def delayed_worker_assert( - dask_worker, total_size, device_chunk_overhead, serialized_chunk_overhead + total_size, + device_chunk_overhead, + serialized_chunk_overhead, + dask_worker=None, ) start = time() while not device_host_file_size_matches( @@ -82,6 +89,18 @@ def delayed_worker_assert( ) +def assert_host_chunks(spills_to_disk, dask_worker=None): + if spills_to_disk is False: + assert len(dask_worker.data.host) + + +def 
assert_disk_chunks(spills_to_disk, dask_worker=None): + if spills_to_disk is True: + assert len(dask_worker.data.disk or list()) > 0 + else: + assert len(dask_worker.data.disk or list()) == 0 + + @pytest.mark.parametrize( "params", [ @@ -122,7 +141,7 @@ def delayed_worker_assert( }, ], ) -@gen_test(timeout=120) +@gen_test(timeout=30) async def test_cupy_cluster_device_spill(params): cupy = pytest.importorskip("cupy") with dask.config.set( @@ -144,6 +163,8 @@ async def test_cupy_cluster_device_spill(params): ) as cluster: async with Client(cluster, asynchronous=True) as client: + await client.wait_for_workers(1) + rs = da.random.RandomState(RandomState=cupy.random.RandomState) x = rs.random(int(50e6), chunks=2e6) await wait(x) @@ -153,7 +174,10 @@ async def test_cupy_cluster_device_spill(params): # Allow up to 1024 bytes overhead per chunk serialized await client.run( - lambda dask_worker: worker_assert(dask_worker, x.nbytes, 1024, 1024) + worker_assert, + x.nbytes, + 1024, + 1024, ) y = client.compute(x.sum()) @@ -162,20 +186,19 @@ async def test_cupy_cluster_device_spill(params): assert (abs(res / x.size) - 0.5) < 1e-3 await client.run( - lambda dask_worker: worker_assert(dask_worker, x.nbytes, 1024, 1024) + worker_assert, + x.nbytes, + 1024, + 1024, ) - host_chunks = await client.run( - lambda dask_worker: len(dask_worker.data.host) + await client.run( + assert_host_chunks, + params["spills_to_disk"], ) - disk_chunks = await client.run( - lambda dask_worker: len(dask_worker.data.disk or list()) + await client.run( + assert_disk_chunks, + params["spills_to_disk"], ) - for hc, dc in zip(host_chunks.values(), disk_chunks.values()): - if params["spills_to_disk"]: - assert dc > 0 - else: - assert hc > 0 - assert dc == 0 @pytest.mark.parametrize( @@ -218,7 +241,7 @@ async def test_cupy_cluster_device_spill(params): }, ], ) -@gen_test(timeout=120) +@gen_test(timeout=30) async def test_cudf_cluster_device_spill(params): cudf = pytest.importorskip("cudf") @@ -243,6 +266,8 
@@ async def test_cudf_cluster_device_spill(params): ) as cluster: async with Client(cluster, asynchronous=True) as client: + await client.wait_for_workers(1) + # There's a known issue with datetime64: # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940 # The same error above happens when spilling datetime64 to disk @@ -264,26 +289,35 @@ async def test_cudf_cluster_device_spill(params): await wait(cdf2) del cdf + gc.collect() - host_chunks = await client.run( - lambda dask_worker: len(dask_worker.data.host) + await client.run( + assert_host_chunks, + params["spills_to_disk"], ) - disk_chunks = await client.run( - lambda dask_worker: len(dask_worker.data.disk or list()) + await client.run( + assert_disk_chunks, + params["spills_to_disk"], ) - for hc, dc in zip(host_chunks.values(), disk_chunks.values()): - if params["spills_to_disk"]: - assert dc > 0 - else: - assert hc > 0 - assert dc == 0 await client.run( - lambda dask_worker: worker_assert(dask_worker, nbytes, 32, 2048) + worker_assert, + nbytes, + 32, + 2048, ) del cdf2 - await client.run( - lambda dask_worker: delayed_worker_assert(dask_worker, 0, 0, 0) - ) + while True: + try: + await client.run( + delayed_worker_assert, + 0, + 0, + 0, + ) + except AssertionError: + gc.collect() + else: + break