Remove 2 cuda stream synchronizes.
Xreki committed Jan 26, 2022
1 parent 3a2ca8f commit c63af94
Showing 3 changed files with 22 additions and 61 deletions.
71 changes: 12 additions & 59 deletions include/detail/gpu_ctc.h
@@ -204,30 +204,14 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
Lmax = std::max(Lmax, L);
}

- #ifdef __HIPCC__
- cuda_status = hipMemcpyAsync(&(repeats_[start_idx]), repeats,
- (end_idx - start_idx) * sizeof(int),
- hipMemcpyHostToDevice, stream_);
- #else
- cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), repeats,
- (end_idx - start_idx) * sizeof(int),
- cudaMemcpyHostToDevice, stream_);
- #endif

+ cuda_status = warpctc::memcpy_h2d_async(
+ &(repeats_[start_idx]), repeats, (end_idx - start_idx) * sizeof(int), stream_);
if (cuda_status != gpuSuccess)
return CTC_STATUS_MEMOPS_FAILED;


- #ifdef __HIPCC__
- cuda_status = hipMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
- (end_idx - start_idx) * sizeof(int),
- hipMemcpyHostToDevice, stream_);
- #else
- cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), label_offsets,
- (end_idx - start_idx) * sizeof(int),
- cudaMemcpyHostToDevice, stream_);
- #endif

+ cuda_status = warpctc::memcpy_h2d_async(
+ &(label_offsets_[start_idx]), label_offsets, (end_idx - start_idx) * sizeof(int), stream_);
if (cuda_status != gpuSuccess)
return CTC_STATUS_MEMOPS_FAILED;
}
@@ -243,16 +227,8 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);

- #ifdef __HIPCC__
- cuda_status = hipMemcpyAsync(utt_length_, input_lengths,
- minibatch_ * sizeof(int),
- hipMemcpyHostToDevice, stream_);
- #else
- cuda_status = cudaMemcpyAsync(utt_length_, input_lengths,
- minibatch_ * sizeof(int),
- cudaMemcpyHostToDevice, stream_);
- #endif

+ cuda_status = warpctc::memcpy_h2d_async(
+ utt_length_, input_lengths, minibatch_ * sizeof(int), stream_);
if (cuda_status != gpuSuccess)
return CTC_STATUS_MEMOPS_FAILED;

@@ -261,16 +237,8 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);

- #ifdef __HIPCC__
- cuda_status = hipMemcpyAsync(label_sizes_, label_lengths,
- minibatch_ * sizeof(int),
- hipMemcpyHostToDevice, stream_);
- #else
- cuda_status = cudaMemcpyAsync(label_sizes_, label_lengths,
- minibatch_ * sizeof(int),
- cudaMemcpyHostToDevice, stream_);
- #endif

+ cuda_status = warpctc::memcpy_h2d_async(
+ label_sizes_, label_lengths, minibatch_ * sizeof(int), stream_);
if (cuda_status != gpuSuccess)
return CTC_STATUS_MEMOPS_FAILED;

@@ -279,16 +247,8 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
gpu_bytes_used);
gpu_bytes_used += Lmax * minibatch_ * sizeof(int);

- #ifdef __HIPCC__
- cuda_status = hipMemcpyAsync(labels_without_blanks_, flat_labels,
- total_label_length * sizeof(int),
- hipMemcpyHostToDevice, stream_);
- #else
- cuda_status = cudaMemcpyAsync(labels_without_blanks_, flat_labels,
- total_label_length * sizeof(int),
- cudaMemcpyHostToDevice, stream_);
- #endif

+ cuda_status = warpctc::memcpy_h2d_async(
+ labels_without_blanks_, flat_labels, total_label_length * sizeof(int), stream_);
if (cuda_status != gpuSuccess)
return CTC_STATUS_MEMOPS_FAILED;

@@ -302,7 +262,6 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
gpu_bytes_used);
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);


denoms_ =
reinterpret_cast<ProbT *>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
@@ -330,25 +289,19 @@ ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const probs,
// away
const int stride = minibatch_;

- if (compute_alpha)
+ if (compute_alpha) {
compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_,
repeats_, labels_without_blanks_, label_offsets_,
labels_with_blanks_, alphas_, nll_forward_,
stride, out_dim_, S_, T_, blank_label_);

+ }

if (compute_beta) {
compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>
(probs, label_sizes_, utt_length_, repeats_,
labels_with_blanks_, alphas_, nll_forward_, nll_backward_,
grads, stride, out_dim_, S_, T_, blank_label_);

- #ifdef __HIPCC__
- hipStreamSynchronize(stream_);
- #else
- cudaStreamSynchronize(stream_);
- #endif
}

#ifdef __HIPCC__
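
Note: with the post-beta-kernel hipStreamSynchronize/cudaStreamSynchronize removed above, launch_alpha_beta_kernels only enqueues work on stream_ and no longer blocks the host. Whoever reads the per-utterance losses on the host still synchronizes once, for example through the warpctc::memcpy_d2h_async and warpctc::synchronize helpers from gpu_helper.h. A minimal sketch of that caller-side pattern (copy_costs_to_host is a hypothetical member introduced only for illustration):

// Sketch only: bring the losses back to the host with a single blocking point.
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::copy_costs_to_host(ProbT* host_costs) {
    gpuError_t status = warpctc::memcpy_d2h_async(
        host_costs, nll_forward_, minibatch_ * sizeof(ProbT), stream_);
    if (status != gpuSuccess)
        return CTC_STATUS_MEMOPS_FAILED;

    // One synchronize here replaces the two that this commit removes.
    status = warpctc::synchronize(stream_);
    if (status != gpuSuccess)
        return CTC_STATUS_EXECUTION_FAILED;
    return CTC_STATUS_SUCCESS;
}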
10 changes: 10 additions & 0 deletions include/detail/gpu_helper.h
@@ -14,6 +14,16 @@ static gpuError_t memcpy_d2h_async(void *dst, const void *src, size_t bytes, GPU
return status;
}

+ static gpuError_t memcpy_h2d_async(void *dst, const void *src, size_t bytes, GPUstream stream) {
+ gpuError_t status;
+ #ifdef __HIPCC__
+ status = hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, stream);
+ #else
+ status = cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream);
+ #endif
+ return status;
+ }

static gpuError_t synchronize(GPUstream stream) {
gpuError_t status;
#ifdef __HIPCC__
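
The new memcpy_h2d_async mirrors the existing memcpy_d2h_async and synchronize wrappers: the __HIPCC__/CUDA dispatch lives in one place, and call sites need no per-copy #ifdef. A minimal usage sketch, assuming d_lengths already points at device memory (upload_lengths and its arguments are illustrative, not part of this commit):

ctcStatus_t upload_lengths(int* d_lengths, const int* h_lengths,
                           int minibatch, GPUstream stream) {
    gpuError_t status = warpctc::memcpy_h2d_async(
        d_lengths, h_lengths, minibatch * sizeof(int), stream);
    if (status != gpuSuccess)
        return CTC_STATUS_MEMOPS_FAILED;
    // The copy is asynchronous: h_lengths must stay valid until the stream has
    // consumed it, and pinned host memory is needed for real overlap.
    return CTC_STATUS_SUCCESS;
}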
2 changes: 0 additions & 2 deletions src/reduce.cu
@@ -143,10 +143,8 @@ ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output, int rows, int cols,
ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);

#ifdef __HIPCC__
- hipStreamSynchronize(stream);
gpuError_t err = hipGetLastError();
#else
- cudaStreamSynchronize(stream);
gpuError_t err = cudaGetLastError();
#endif

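Dropping the synchronize before hipGetLastError/cudaGetLastError means this check now only catches launch-time failures (for example an invalid launch configuration); errors raised while the kernel executes surface at the next call that synchronizes the stream. A small CUDA-side sketch of that launch-check pattern (scale_kernel and launch_scale are illustrative, not code from this repository):

__global__ void scale_kernel(float* data, int n, float alpha) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= alpha;
}

ctcStatus_t launch_scale(float* d_data, int n, float alpha, cudaStream_t stream) {
    scale_kernel<<<(n + 255) / 256, 256, 0, stream>>>(d_data, n, alpha);
    cudaError_t err = cudaGetLastError();  // immediate, reports launch errors only
    if (err != cudaSuccess)
        return CTC_STATUS_EXECUTION_FAILED;
    // Execution-time errors (e.g. an out-of-bounds access inside the kernel)
    // are reported later, by the first call that synchronizes this stream.
    return CTC_STATUS_SUCCESS;
}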
