From b7edb451302ea2ffea843177322b250fa66d02e7 Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Wed, 28 Aug 2024 09:17:01 +0200 Subject: [PATCH] added debug CUDA kernel duration measurement to measure the duration of the newly created kernel for ->R12L kernel It is actually 1.6 ms for 1920x1080 picture on GeForce GTX TITAN, which seems to be more or less OK for now (it could be perhaps optimized but doesn't seem to be a blocker for now). --- src/cuda_wrapper/kernels.cu | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu index ac99e52cd..43088a38f 100644 --- a/src/cuda_wrapper/kernels.cu +++ b/src/cuda_wrapper/kernels.cu @@ -174,6 +174,24 @@ kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x, unsigned size_y) src += 2; } +#ifdef DEBUG +#include +#define MEASURE_KERNEL_DURATION_START \ + cudaEvent_t t0, t1; \ + cudaEventCreate(&t0); \ + cudaEventCreate(&t1); \ + cudaEventRecord(t0, stream); +#define MEASURE_KERNEL_DURATION_STOP \ + cudaEventRecord(t1, stream); \ + cudaEventSynchronize(t1); \ + float elapsedTime = NAN; \ + cudaEventElapsedTime(&elapsedTime, t0, t1); \ + printf("elapsed time: %f\n", elapsedTime); +#else +#define MEASURE_KERNEL_DURATION_START +#define MEASURE_KERNEL_DURATION_STOP +#endif + /** * @sa cmpto_j2k_dec_postprocessor_run_callback_cuda */ @@ -191,14 +209,20 @@ int postprocess_rg48_to_r12l( size_t /* temp_buffer_size */, void * output_buffer, size_t /* output_buffer_size */, - void * stream + void * vstream ) { + cudaStream_t stream = (cudaStream_t) vstream; dim3 threads_per_block(256); dim3 blocks((((size_x + 7) / 8) + 255) / 256, size_y); + MEASURE_KERNEL_DURATION_START + kernel_rg48_to_r12l<<>>( (uint8_t *) input_samples, (uint8_t *) output_buffer, size_x, size_y); + + MEASURE_KERNEL_DURATION_STOP + return 0; }