diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 4028828..f85473b 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -341,8 +341,6 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0): # Prepare kernel cufunc = self._codelibrary.get_cufunc() - rtsys.allocate() - if self.debug: excname = cufunc.name + "__errcode__" excmem, excsz = cufunc.module.get_global_symbol(excname) @@ -364,8 +362,11 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0): stream_handle = stream and stream.handle or zero_stream + rtsys.allocate(stream_handle) rtsys.set_memsys_to_module(cufunc.module, stream_handle) rtsys.initialize(stream_handle) + rtsys.enable(stream_handle) + rtsys.print_memsys(0) # Invoke kernel driver.launch_kernel(cufunc.handle, diff --git a/numba_cuda/numba/cuda/runtime/nrt.py b/numba_cuda/numba/cuda/runtime/nrt.py index fb22f05..8b1ea0a 100644 --- a/numba_cuda/numba/cuda/runtime/nrt.py +++ b/numba_cuda/numba/cuda/runtime/nrt.py @@ -38,13 +38,13 @@ def _compile_memsys_module(self): self._memsys_module = module - def _ensure_allocate(self): + def _ensure_allocate(self, stream): if self._memsys is not None: return - self.allocate() + self.allocate(stream) - def allocate(self): + def allocate(self, stream): from numba.cuda import device_array if self._memsys_module is None: @@ -53,7 +53,10 @@ def allocate(self): if self._memsys is None: # Allocate space for NRT_MemSys # TODO: determine the size of NRT_MemSys at runtime - self._memsys = device_array((40,), dtype="i1") + self._memsys = device_array((40,), dtype="i1", stream=stream) + # TODO: Memsys module needs a stream that's consistent with the + # system's stream. + self.set_memsys_to_module(self._memsys_module, stream=stream) def _single_thread_launch(self, module, stream, name, params=()): func = module.get_function(name) @@ -84,18 +87,17 @@ def initialize(self, stream): def enable(self, stream): self._single_thread_launch( - self._memsys_module, stream, "NR_MemSys_enable") + self._memsys_module, stream, "NRT_MemSys_enable") def disable(self, stream): self._single_thread_launch( - self._memsys_module, stream, "NR_MemSys_disable") + self._memsys_module, stream, "NRT_MemSys_disable") def _copy_memsys_to_host(self, stream=0): - self._ensure_allocate() + self._ensure_allocate(stream) self._ensure_initialize(stream) # Q: What stream should we execute this on? - # read the stats dt = np.dtype([ ('alloc', np.uint64), ('free', np.uint64), @@ -116,20 +118,12 @@ def _copy_memsys_to_host(self, stream=0): return stats_for_read[0] def get_allocation_stats(self): - # This is commented out to test the 700 error code from cuda. - # if self._memsys is None or (not self._initialized): - # return _nrt_mstats( - # alloc=0, - # free=0, - # mi_alloc=0, - # mi_free=0 - # ) memsys = self._copy_memsys_to_host() return _nrt_mstats( - alloc=memsys.alloc, - free=memsys.free, - mi_alloc=memsys.mi_alloc, - mi_free=memsys.mi_free + alloc=memsys["alloc"], + free=memsys["free"], + mi_alloc=memsys["mi_alloc"], + mi_free=memsys["mi_free"] ) def set_memsys_to_module(self, module, stream): @@ -137,6 +131,7 @@ def set_memsys_to_module(self, module, stream): raise RuntimeError( "Please allocate NRT Memsys first before initializing.") + print(f"Setting {self._memsys.device_ctypes_pointer} to {module}") self._single_thread_launch( module, stream, @@ -144,5 +139,13 @@ def set_memsys_to_module(self, module, stream): [self._memsys.device_ctypes_pointer,] ) + def print_memsys(self, stream): + cuda.synchronize() + self._single_thread_launch( + self._memsys_module, + stream, + "NRT_MemSys_print" + ) + rtsys = _Runtime() diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py index e60a128..cdb14e6 100644 --- a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py @@ -25,7 +25,7 @@ def test_no_return(self): """ n = 10 - @cuda.jit + @cuda.jit(debug=True) def kernel(): for i in range(n): temp = cuda_empty(2, np.float64) # noqa: F841 @@ -35,6 +35,8 @@ def kernel(): with patch('numba.config.CUDA_ENABLE_NRT', True, create=True): kernel[1,1]() + print("After kernel launch...") + rtsys.print_memsys(0) cur_stats = rtsys.get_allocation_stats() self.assertEqual(cur_stats.alloc - init_stats.alloc, n) self.assertEqual(cur_stats.free - init_stats.free, n)