#3504 use asynchronous calls on a dedicated CUDA stream

Xpra-org · Aug 21, 2022 · b9c24fa · b9c24fa
1 parent 2a5e368
commit b9c24fa
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 9 deletions.
diff --git a/xpra/client/gl/gl_window_backing_base.py b/xpra/client/gl/gl_window_backing_base.py
@@ -1,6 +1,6 @@
 # This file is part of Xpra.
 # Copyright (C) 2013 Serviware (Arthur Huillet, <[email protected]>)
-# Copyright (C) 2012-2021 Antoine Martin <[email protected]>
+# Copyright (C) 2012-2022 Antoine Martin <[email protected]>
 # Xpra is released under the terms of the GNU GPL v2, or, at your option, any
 # later version. See the file COPYING for details.
 
@@ -1074,6 +1074,9 @@ def paint_nvjpeg(gl_context):
 
     def paint_nvjpeg(self, gl_context, encoding, img_data, x : int, y : int, width : int, height : int, options, callbacks):
         with self.assign_cuda_context(True):
+            from pycuda.driver import Stream  # @UnresolvedImport
+            stream = Stream()
+            options["stream"] = stream
             img = self.nvjpeg_decoder.decompress_with_device("RGB", img_data, options)
             log("paint_nvjpeg(%s) img=%s, downloading buffer to pbo", gl_context, img)
             #'pixels' is a cuda buffer:
@@ -1086,18 +1089,19 @@ def paint_nvjpeg(self, gl_context, encoding, img_data, x : int, y : int, width :
             glBufferData(GL_PIXEL_UNPACK_BUFFER, size, None, GL_STREAM_DRAW)
             glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)
             #pylint: disable=import-outside-toplevel
-            from pycuda.driver import memcpy_dtod   #pylint: disable=no-name-in-module
+            from pycuda.driver import memcpy_dtod_async   #pylint: disable=no-name-in-module
             from pycuda.gl import RegisteredBuffer, graphics_map_flags  # @UnresolvedImport
             cuda_pbo = RegisteredBuffer(int(pbo), graphics_map_flags.WRITE_DISCARD)
             log("RegisteredBuffer%s=%s", (pbo, graphics_map_flags.WRITE_DISCARD), cuda_pbo)
-            mapping = cuda_pbo.map()
+            mapping = cuda_pbo.map(stream)
             ptr, msize = mapping.device_ptr_and_size()
             assert msize>=size, "registered buffer size %i too small for pbo size %i" % (msize, size)
             log("copying %i bytes from %s to mapping=%s at %#x", size, cuda_buffer, mapping, ptr)
-            memcpy_dtod(ptr, cuda_buffer, size)
-            mapping.unmap()
+            memcpy_dtod_async(ptr, cuda_buffer, size, stream)
+            mapping.unmap(stream)
             cuda_pbo.unregister()
             cuda_buffer.free()
+            stream.synchronize()
 
         rgb_format = img.get_pixel_format()
         assert rgb_format in ("RGB", "BGR", "RGBA", "BGRA"), "unexpected rgb format %r" % (rgb_format,)

diff --git a/xpra/codecs/cuda_common/cuda_context.py b/xpra/codecs/cuda_common/cuda_context.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # This file is part of Xpra.
-# Copyright (C) 2013-2019 Antoine Martin <[email protected]>
+# Copyright (C) 2013-2022 Antoine Martin <[email protected]>
 # Xpra is released under the terms of the GNU GPL v2, or, at your option, any
 # later version. See the file COPYING for details.
 
@@ -461,7 +461,7 @@ def make_context(self):
         start = monotonic()
         cf = driver.ctx_flags
         if self.opengl:
-            from pycuda import gl
+            from pycuda import gl  # @UnresolvedImport
             self.context = gl.make_context(self.device)
         else:
             self.context = self.device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)

diff --git a/xpra/codecs/nvjpeg/decoder.pyx b/xpra/codecs/nvjpeg/decoder.pyx
@@ -116,7 +116,10 @@ def decompress_with_device(rgb_format, img_data, options=None):
     else:
         raise ValueError("invalid rgb format %r" % rgb_format)
     cdef nvjpegImage_t nv_image
+    stream = (options or {}).get("stream", None)
     cdef cudaStream_t nv_stream = NULL
+    if stream:
+        nv_stream = <cudaStream_t> (<uintptr_t> stream.handle)
     cdef nvjpegStatus_t r
     cdef uintptr_t dmem = 0
     cdef int rowstride = 0, width = 0, height = 0
@@ -200,7 +203,7 @@ def decompress_with_device(rgb_format, img_data, options=None):
                     memcpy.set_src_device(rgb)
                     memcpy.set_dst_device(rgba)
                     memcpy.height = width*height
-                    memcpy(aligned=False)
+                    memcpy(stream)
                     rgb.free()
                     #fill in the alpha channel:
                     memcpy = Memcpy2D()
@@ -212,7 +215,7 @@ def decompress_with_device(rgb_format, img_data, options=None):
                     memcpy.set_src_device(alpha)
                     memcpy.set_dst_device(rgba)
                     memcpy.height = alpha_size
-                    memcpy(aligned=False)
+                    memcpy(stream)
                     alpha.free()
                     end = monotonic()
                     log("alpha merge took %ims", 1000*(end-start))