diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index 4028828..f85473b 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -341,8 +341,6 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
         # Prepare kernel
         cufunc = self._codelibrary.get_cufunc()
 
-        rtsys.allocate()
-
         if self.debug:
             excname = cufunc.name + "__errcode__"
             excmem, excsz = cufunc.module.get_global_symbol(excname)
@@ -364,8 +362,11 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
 
         stream_handle = stream and stream.handle or zero_stream
 
+        # Allocate before binding; set_memsys_to_module rejects an unallocated memsys.
+        rtsys.allocate(stream_handle)
         rtsys.set_memsys_to_module(cufunc.module, stream_handle)
         rtsys.initialize(stream_handle)
+        rtsys.enable(stream_handle)
 
         # Invoke kernel
         driver.launch_kernel(cufunc.handle,
diff --git a/numba_cuda/numba/cuda/runtime/nrt.py b/numba_cuda/numba/cuda/runtime/nrt.py
index fb22f05..8b1ea0a 100644
--- a/numba_cuda/numba/cuda/runtime/nrt.py
+++ b/numba_cuda/numba/cuda/runtime/nrt.py
@@ -38,13 +38,13 @@ def _compile_memsys_module(self):
 
         self._memsys_module = module
 
-    def _ensure_allocate(self):
+    def _ensure_allocate(self, stream):
         if self._memsys is not None:
             return
 
-        self.allocate()
+        self.allocate(stream)
 
-    def allocate(self):
+    def allocate(self, stream):
         from numba.cuda import device_array
 
         if self._memsys_module is None:
@@ -53,7 +53,10 @@ def allocate(self):
         if self._memsys is None:
             # Allocate space for NRT_MemSys
             # TODO: determine the size of NRT_MemSys at runtime
-            self._memsys = device_array((40,), dtype="i1")
+            self._memsys = device_array((40,), dtype="i1", stream=stream)
+            # TODO: Memsys module needs a stream that's consistent with the
+            # system's stream.
+            self.set_memsys_to_module(self._memsys_module, stream=stream)
 
     def _single_thread_launch(self, module, stream, name, params=()):
         func = module.get_function(name)
@@ -84,18 +87,17 @@ def initialize(self, stream):
 
     def enable(self, stream):
         self._single_thread_launch(
-            self._memsys_module, stream, "NR_MemSys_enable")
+            self._memsys_module, stream, "NRT_MemSys_enable")
 
     def disable(self, stream):
         self._single_thread_launch(
-            self._memsys_module, stream, "NR_MemSys_disable")
+            self._memsys_module, stream, "NRT_MemSys_disable")
 
     def _copy_memsys_to_host(self, stream=0):
-        self._ensure_allocate()
+        self._ensure_allocate(stream)
         self._ensure_initialize(stream)
 
         # Q: What stream should we execute this on?
-        # read the stats
         dt = np.dtype([
             ('alloc', np.uint64),
             ('free', np.uint64),
@@ -116,20 +118,12 @@ def _copy_memsys_to_host(self, stream=0):
         return stats_for_read[0]
 
     def get_allocation_stats(self):
-        # This is commented out to test the 700 error code from cuda.
-        # if self._memsys is None or (not self._initialized):
-        #     return _nrt_mstats(
-        #         alloc=0,
-        #         free=0,
-        #         mi_alloc=0,
-        #         mi_free=0
-        #     )
         memsys = self._copy_memsys_to_host()
         return _nrt_mstats(
-            alloc=memsys.alloc,
-            free=memsys.free,
-            mi_alloc=memsys.mi_alloc,
-            mi_free=memsys.mi_free
+            alloc=memsys["alloc"],
+            free=memsys["free"],
+            mi_alloc=memsys["mi_alloc"],
+            mi_free=memsys["mi_free"]
         )
 
     def set_memsys_to_module(self, module, stream):
@@ -137,6 +131,7 @@ def set_memsys_to_module(self, module, stream):
             raise RuntimeError(
                 "Please allocate NRT Memsys first before initializing.")
 
+        # Bind the memsys device pointer to `module` (single-thread launch).
         self._single_thread_launch(
             module,
             stream,
@@ -144,5 +139,13 @@ def set_memsys_to_module(self, module, stream):
             [self._memsys.device_ctypes_pointer,]
         )
 
+    def print_memsys(self, stream):
+        cuda.synchronize()
+        self._single_thread_launch(
+            self._memsys_module,
+            stream,
+            "NRT_MemSys_print"
+        )
+
 
 rtsys = _Runtime()
diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
index e60a128..cdb14e6 100644
--- a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
+++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -25,7 +25,7 @@ def test_no_return(self):
         """
         n = 10
 
-        @cuda.jit
+        @cuda.jit(debug=True)
         def kernel():
             for i in range(n):
                 temp = cuda_empty(2, np.float64) # noqa: F841
@@ -35,6 +35,8 @@ def kernel():
 
         with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
             kernel[1,1]()
+        print("After kernel launch...")
+        rtsys.print_memsys(0)
         cur_stats = rtsys.get_allocation_stats()
         self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
         self.assertEqual(cur_stats.free - init_stats.free, n)