diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index f85473b..f612e6b 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -362,11 +362,10 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
 
         stream_handle = stream and stream.handle or zero_stream
 
-        rtsys.allocate(stream_handle)
+        rtsys.ensure_allocate(stream_handle)
         rtsys.set_memsys_to_module(cufunc.module, stream_handle)
-        rtsys.initialize(stream_handle)
+        rtsys.ensure_initialize(stream_handle)
         rtsys.enable(stream_handle)
-        rtsys.print_memsys(0)
 
         # Invoke kernel
         driver.launch_kernel(cufunc.handle,
diff --git a/numba_cuda/numba/cuda/runtime/nrt.py b/numba_cuda/numba/cuda/runtime/nrt.py
index 8b1ea0a..4fefae1 100644
--- a/numba_cuda/numba/cuda/runtime/nrt.py
+++ b/numba_cuda/numba/cuda/runtime/nrt.py
@@ -38,7 +38,7 @@ def _compile_memsys_module(self):
 
         self._memsys_module = module
 
-    def _ensure_allocate(self, stream):
+    def ensure_allocate(self, stream):
         if self._memsys is not None:
             return
 
@@ -50,13 +50,12 @@ def allocate(self, stream):
         if self._memsys_module is None:
             self._compile_memsys_module()
 
-        if self._memsys is None:
-            # Allocate space for NRT_MemSys
-            # TODO: determine the size of NRT_MemSys at runtime
-            self._memsys = device_array((40,), dtype="i1", stream=stream)
-            # TODO: Memsys module needs a stream that's consistent with the
-            # system's stream.
-            self.set_memsys_to_module(self._memsys_module, stream=stream)
+        # Allocate space for NRT_MemSys
+        # TODO: determine the size of NRT_MemSys at runtime
+        self._memsys = device_array((40,), dtype="i1", stream=stream)
+        # TODO: Memsys module needs a stream that's consistent with the
+        # system's stream.
+        self.set_memsys_to_module(self._memsys_module, stream=stream)
 
     def _single_thread_launch(self, module, stream, name, params=()):
         func = module.get_function(name)
@@ -70,7 +69,7 @@ def _single_thread_launch(self, module, stream, name, params=()):
             cooperative=False
         )
 
-    def _ensure_initialize(self, stream):
+    def ensure_initialize(self, stream):
         if self._initialized:
             return
 
@@ -94,8 +93,8 @@ def disable(self, stream):
             self._memsys_module, stream, "NRT_MemSys_disable")
 
     def _copy_memsys_to_host(self, stream=0):
-        self._ensure_allocate(stream)
-        self._ensure_initialize(stream)
+        self.ensure_allocate(stream)
+        self.ensure_initialize(stream)
 
         # Q: What stream should we execute this on?
         dt = np.dtype([
diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
index cdb14e6..59aa93f 100644
--- a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
+++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -32,15 +32,44 @@ def kernel():
             return None
 
         init_stats = rtsys.get_allocation_stats()
+        print("init_stats", init_stats)
 
         with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
             kernel[1,1]()
         print("After kernel launch...")
-        rtsys.print_memsys(0)
         cur_stats = rtsys.get_allocation_stats()
+        print("cur_stats", cur_stats)
         self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
         self.assertEqual(cur_stats.free - init_stats.free, n)
 
+    def test_escaping_var_init_in_loop(self):
+        """
+        Test issue #1297
+        """
+
+        @cuda.jit
+        def g(n):
+
+            x = cuda_empty((n, 2), np.float64)
+
+            for i in range(n):
+                y = x[i]
+
+            for i in range(n):
+                y = x[i]  # noqa: F841
+
+            return None
+
+        init_stats = rtsys.get_allocation_stats()
+        print("init_stats", init_stats)
+        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+            g[1, 1](10)
+        print("After kernel launch...")
+        cur_stats = rtsys.get_allocation_stats()
+        print("cur_stats", cur_stats)
+        self.assertEqual(cur_stats.alloc - init_stats.alloc, 1)
+        self.assertEqual(cur_stats.free - init_stats.free, 1)
+
 
 class TestNrtBasic(CUDATestCase):
     def test_nrt_launches(self):