From 3952a3c941e30758195ac1413196babb98f8834d Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 26 Jan 2022 15:06:21 +0000 Subject: [PATCH] Remove a cuda stream synchronize. --- include/detail/gpu_ctc.h | 71 +++++++------------------------------ include/detail/gpu_helper.h | 10 ++++++ 2 files changed, 22 insertions(+), 59 deletions(-) diff --git a/include/detail/gpu_ctc.h b/include/detail/gpu_ctc.h index c665e06..77de6dd 100644 --- a/include/detail/gpu_ctc.h +++ b/include/detail/gpu_ctc.h @@ -204,30 +204,14 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, Lmax = std::max(Lmax, L); } -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(&(repeats_[start_idx]), repeats, - (end_idx - start_idx) * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), repeats, - (end_idx - start_idx) * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + &(repeats_[start_idx]), repeats, (end_idx - start_idx) * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(&(label_offsets_[start_idx]), label_offsets, - (end_idx - start_idx) * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), label_offsets, - (end_idx - start_idx) * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + &(label_offsets_[start_idx]), label_offsets, (end_idx - start_idx) * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; } @@ -243,16 +227,8 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, gpu_bytes_used); gpu_bytes_used += minibatch_ * sizeof(int); -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(utt_length_, input_lengths, - minibatch_ * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(utt_length_, input_lengths, - minibatch_ * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + utt_length_, input_lengths, minibatch_ * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; @@ -261,16 +237,8 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, gpu_bytes_used); gpu_bytes_used += minibatch_ * sizeof(int); -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(label_sizes_, label_lengths, - minibatch_ * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(label_sizes_, label_lengths, - minibatch_ * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + label_sizes_, label_lengths, minibatch_ * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; @@ -279,16 +247,8 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, gpu_bytes_used); gpu_bytes_used += Lmax * minibatch_ * sizeof(int); -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(labels_without_blanks_, flat_labels, - total_label_length * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(labels_without_blanks_, flat_labels, - total_label_length * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + labels_without_blanks_, flat_labels, total_label_length * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; @@ -302,7 +262,6 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, gpu_bytes_used); gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT); - denoms_ = reinterpret_cast(static_cast(gpu_workspace_) + gpu_bytes_used); @@ -330,25 +289,19 @@ ctcStatus_t GpuCTC::launch_alpha_beta_kernels(const ProbT* const probs, // away const int stride = minibatch_; - if (compute_alpha) + if (compute_alpha) { compute_alpha_kernel<<>> (probs, label_sizes_, utt_length_, repeats_, labels_without_blanks_, label_offsets_, labels_with_blanks_, alphas_, nll_forward_, stride, out_dim_, S_, T_, blank_label_); - + } if (compute_beta) { compute_betas_and_grad_kernel<<>> (probs, label_sizes_, utt_length_, repeats_, labels_with_blanks_, alphas_, nll_forward_, nll_backward_, grads, stride, out_dim_, S_, T_, blank_label_); - -#ifdef __HIPCC__ - hipStreamSynchronize(stream_); -#else - cudaStreamSynchronize(stream_); -#endif } #ifdef __HIPCC__ diff --git a/include/detail/gpu_helper.h b/include/detail/gpu_helper.h index d323f9c..55473e0 100644 --- a/include/detail/gpu_helper.h +++ b/include/detail/gpu_helper.h @@ -14,6 +14,16 @@ static gpuError_t memcpy_d2h_async(void *dst, const void *src, size_t bytes, GPU return status; } +static gpuError_t memcpy_h2d_async(void *dst, const void *src, size_t bytes, GPUstream stream) { + gpuError_t status; +#ifdef __HIPCC__ + status = hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, stream); +#else + status = cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream); +#endif + return status; +} + static gpuError_t synchronize(GPUstream stream) { gpuError_t status; #ifdef __HIPCC__