do not force reinit memsys on every kernel launch, add another test
isVoid committed Dec 4, 2024
1 parent 8805e99 commit a7d2887
Showing 3 changed files with 42 additions and 15 deletions.
5 changes: 2 additions & 3 deletions numba_cuda/numba/cuda/dispatcher.py
@@ -362,11 +362,10 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):

         stream_handle = stream and stream.handle or zero_stream

-        rtsys.allocate(stream_handle)
+        rtsys.ensure_allocate(stream_handle)
         rtsys.set_memsys_to_module(cufunc.module, stream_handle)
-        rtsys.initialize(stream_handle)
+        rtsys.ensure_initialize(stream_handle)
         rtsys.enable(stream_handle)
-        rtsys.print_memsys(0)

         # Invoke kernel
         driver.launch_kernel(cufunc.handle,
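For readers skimming the diff: the launch path previously called `rtsys.allocate` and `rtsys.initialize` unconditionally, re-running device-side memsys setup on every kernel launch. The new `ensure_*` entry points return early once setup has happened, so only the first launch pays the cost. A minimal sketch of that pattern (illustrative only, not the numba-cuda implementation; `_alloc_device_buffer` and `_run_init_kernel` are hypothetical stand-ins for the real device work):

    # Illustrative sketch only -- not the numba-cuda implementation.
    class MemsysSketch:
        def __init__(self):
            self._memsys = None
            self._initialized = False

        def ensure_allocate(self, stream):
            # First launch allocates; every later launch returns early.
            if self._memsys is not None:
                return
            self._memsys = self._alloc_device_buffer(stream)

        def ensure_initialize(self, stream):
            # Device-side NRT_MemSys state is initialized exactly once.
            if self._initialized:
                return
            self._run_init_kernel(stream)
            self._initialized = True

        def _alloc_device_buffer(self, stream):
            return bytearray(40)  # hypothetical stand-in for a device buffer

        def _run_init_kernel(self, stream):
            pass  # hypothetical stand-in for the init kernel launch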
21 changes: 10 additions & 11 deletions numba_cuda/numba/cuda/runtime/nrt.py
@@ -38,7 +38,7 @@ def _compile_memsys_module(self):

         self._memsys_module = module

-    def _ensure_allocate(self, stream):
+    def ensure_allocate(self, stream):
         if self._memsys is not None:
             return

@@ -50,13 +50,12 @@ def allocate(self, stream):
         if self._memsys_module is None:
             self._compile_memsys_module()

-        if self._memsys is None:
-            # Allocate space for NRT_MemSys
-            # TODO: determine the size of NRT_MemSys at runtime
-            self._memsys = device_array((40,), dtype="i1", stream=stream)
-            # TODO: Memsys module needs a stream that's consistent with the
-            # system's stream.
-            self.set_memsys_to_module(self._memsys_module, stream=stream)
+        # Allocate space for NRT_MemSys
+        # TODO: determine the size of NRT_MemSys at runtime
+        self._memsys = device_array((40,), dtype="i1", stream=stream)
+        # TODO: Memsys module needs a stream that's consistent with the
+        # system's stream.
+        self.set_memsys_to_module(self._memsys_module, stream=stream)

     def _single_thread_launch(self, module, stream, name, params=()):
         func = module.get_function(name)

@@ -70,7 +69,7 @@ def _single_thread_launch(self, module, stream, name, params=()):
             cooperative=False
         )

-    def _ensure_initialize(self, stream):
+    def ensure_initialize(self, stream):
         if self._initialized:
             return

@@ -94,8 +93,8 @@ def disable(self, stream):
             self._memsys_module, stream, "NRT_MemSys_disable")

     def _copy_memsys_to_host(self, stream=0):
-        self._ensure_allocate(stream)
-        self._ensure_initialize(stream)
+        self.ensure_allocate(stream)
+        self.ensure_initialize(stream)

         # Q: What stream should we execute this on?
         dt = np.dtype([
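The `_copy_memsys_to_host` change shows the intended caller-side idiom: any code that needs a live memsys can call the `ensure_*` methods unconditionally, since repeated calls are no-ops. A hedged usage sketch of that idiom (`read_memsys_stats` is a hypothetical caller, not part of this diff; it assumes the `get_allocation_stats` accessor the tests below use):

    # Hypothetical caller following the idiom in _copy_memsys_to_host:
    # ensure_* calls are safe to repeat, so no caller-side guards are needed.
    def read_memsys_stats(rtsys, stream=0):
        rtsys.ensure_allocate(stream)    # no-op after the first call
        rtsys.ensure_initialize(stream)  # no-op once initialized
        return rtsys.get_allocation_stats()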
31 changes: 30 additions & 1 deletion numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -32,15 +32,44 @@ def kernel():
             return None

         init_stats = rtsys.get_allocation_stats()
+        print("init_stats", init_stats)

         with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
             kernel[1,1]()
+        print("After kernel launch...")
+        rtsys.print_memsys(0)
         cur_stats = rtsys.get_allocation_stats()
+        print("cur_stats", cur_stats)
         self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
         self.assertEqual(cur_stats.free - init_stats.free, n)

+    def test_escaping_var_init_in_loop(self):
+        """
+        Test issue #1297
+        """
+
+        @cuda.jit
+        def g(n):
+
+            x = cuda_empty((n, 2), np.float64)
+
+            for i in range(n):
+                y = x[i]
+
+            for i in range(n):
+                y = x[i]  # noqa: F841
+
+            return None
+
+        init_stats = rtsys.get_allocation_stats()
+        print("init_stats", init_stats)
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            g[1, 1](10)
+        print("After kernel launch...")
+        cur_stats = rtsys.get_allocation_stats()
+        print("cur_stats", cur_stats)
+        self.assertEqual(cur_stats.alloc - init_stats.alloc, 1)
+        self.assertEqual(cur_stats.free - init_stats.free, 1)


 class TestNrtBasic(CUDATestCase):
     def test_nrt_launches(self):
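Both tests follow the same stats-delta pattern: snapshot `rtsys.get_allocation_stats()`, launch under `CUDA_ENABLE_NRT`, snapshot again, and assert on the difference of the `.alloc` and `.free` counters. The new test expects exactly one alloc and one free because `cuda_empty((n, 2), np.float64)` performs the only NRT allocation, while the `y = x[i]` views in the loops only touch reference counts (my reading of the test; the diff does not state this). A small helper capturing the pattern (a sketch, not part of the repository; `assert_net_allocations` is a hypothetical name, and `rtsys` is passed in rather than imported since the diff does not show the test module's imports):

    from unittest.mock import patch

    # Sketch of the stats-delta assertion pattern used by both tests above.
    # Assumes get_allocation_stats() exposes .alloc and .free counters,
    # as the tests use.
    def assert_net_allocations(testcase, rtsys, launch, expected):
        before = rtsys.get_allocation_stats()
        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
            launch()  # e.g. lambda: g[1, 1](10)
        after = rtsys.get_allocation_stats()
        testcase.assertEqual(after.alloc - before.alloc, expected)
        testcase.assertEqual(after.free - before.free, expected)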
