diff --git a/CMakeLists.txt b/CMakeLists.txt index cd8ccaa..e9ca779 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND}) option(WITH_OMP "compile warp-ctc with OpenMP." ON) option(BUILD_TESTS "build warp-ctc unit tests." ON) option(BUILD_SHARED "build warp-ctc shared library." ON) -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) if(WITH_ROCM) add_definitions(-DWARPCTC_WITH_HIP) @@ -65,32 +65,38 @@ endif() # need to be at least 30 or __shfl_down in reduce wont compile IF (CUDA_VERSION VERSION_LESS "11.0") + # CUDA_VERSION < 11.0 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") ENDIF() -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") IF (CUDA_VERSION VERSION_GREATER "7.6") + # CUDA_VERSION > 7.6 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") ENDIF() IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0")) + # CUDA_VERSION >= 9.0 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") ENDIF() IF ((CUDA_VERSION VERSION_GREATER "10.0") OR (CUDA_VERSION VERSION_EQUAL "10.0")) + # CUDA_VERSION >= 10.0 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75") ENDIF() IF ((CUDA_VERSION VERSION_GREATER "11.0") OR (CUDA_VERSION VERSION_EQUAL "11.0")) + # CUDA_VERSION >= 11.0 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_80,code=sm_80") ENDIF() IF ((CUDA_VERSION VERSION_GREATER "11.2") OR (CUDA_VERSION VERSION_EQUAL "11.2")) + # CUDA_VERSION >= 11.2 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86") ENDIF() diff --git a/include/ctc.h b/include/ctc.h index 11d9de6..18455e8 100644 --- a/include/ctc.h +++ b/include/ctc.h @@ -52,6 +52,15 @@ typedef enum { CTC_GPU = 1 } ctcComputeLocation; +struct ctcParamConfigs { + /// input params of compute_ctc_loss + ctcComputeLocation flat_labels_loc{CTC_CPU}; + ctcComputeLocation label_lengths_loc{CTC_CPU}; + ctcComputeLocation input_lengths_loc{CTC_CPU}; + /// output param of compute_ctc_loss + ctcComputeLocation costs_loc{CTC_CPU}; +}; + /** Structure used for options to the CTC compution. Applications * should zero out the array using memset and sizeof(struct * ctcOptions) in C or default initialization (e.g. 'ctcOptions @@ -70,6 +79,8 @@ struct ctcOptions { /// the label value/index that the CTC calculation should use as the blank label int blank_label; + /// where to hold the input and output params + ctcParamConfigs params; }; /** Compute the connectionist temporal classification loss between diff --git a/include/detail/gpu_ctc.h b/include/detail/gpu_ctc.h index cafe6ae..77de6dd 100644 --- a/include/detail/gpu_ctc.h +++ b/include/detail/gpu_ctc.h @@ -3,6 +3,7 @@ #include "ctc_helper.h" #include "gpu_ctc_kernels.h" #include "reduce.h" +#include "gpu_helper.h" template class GpuCTC { @@ -11,10 +12,19 @@ class GpuCTC { int minibatch, void *workspace, GPUstream stream, - int blank_label) : + int blank_label, + bool has_cpu_flat_labels, + bool has_cpu_label_lengths, + bool has_cpu_input_lengths, + bool copy_costs_to_cpu) : out_dim_(alphabet_size), minibatch_(minibatch), gpu_workspace_(workspace), stream_(stream), - blank_label_(blank_label) {}; + blank_label_(blank_label) { + param_configs_.has_cpu_flat_labels = has_cpu_flat_labels; + param_configs_.has_cpu_label_lengths = has_cpu_label_lengths; + param_configs_.has_cpu_input_lengths = has_cpu_input_lengths; + param_configs_.copy_costs_to_cpu = copy_costs_to_cpu; + } // Noncopyable GpuCTC(const GpuCTC&) = delete; @@ -36,7 +46,6 @@ class GpuCTC { const int* const input_lengths); private: - template ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs, ProbT *grads, @@ -53,12 +62,14 @@ class GpuCTC { ctcStatus_t setup_gpu_metadata(const int* const flat_labels, const int* const label_lengths, - const int* const input_lengths); + const int* const input_lengths, + ProbT* costs); ctcStatus_t create_metadata_and_choose_config(const int* const label_lengths, const int* const flat_labels, const int* const input_lengths, + ProbT* costs, size_t& best_config); ctcStatus_t @@ -74,6 +85,12 @@ class GpuCTC { bool compute_alpha, bool compute_betas_and_grad); + struct ParamConfigs { + bool has_cpu_flat_labels{true}; + bool has_cpu_label_lengths{true}; + bool has_cpu_input_lengths{true}; + bool copy_costs_to_cpu{true}; + }; int out_dim_; // Number of characters plus blank int minibatch_; @@ -86,34 +103,40 @@ class GpuCTC { GPUstream stream_; int blank_label_; - - void *gpu_workspace_; // Buffer for all temporary GPU memory - int *utt_length_; // T - int *label_sizes_; // L - int *repeats_; // repeats_ - int *label_offsets_; - int *labels_without_blanks_; - int *labels_with_blanks_; - ProbT *alphas_; - ProbT *nll_forward_; - ProbT *nll_backward_; - ProbT *denoms_; // Temporary storage for denoms for softmax - ProbT *probs_; // Temporary storage for probabilities (softmax output) + ParamConfigs param_configs_; + + void *gpu_workspace_{nullptr}; // Buffer for all temporary GPU memory + int *utt_length_{nullptr}; // T + int *label_sizes_{nullptr}; // L + int *repeats_{nullptr}; // repeats_ + int *label_offsets_{nullptr}; + int *labels_without_blanks_{nullptr}; + int *labels_with_blanks_{nullptr}; + ProbT *alphas_{nullptr}; + ProbT *nll_forward_{nullptr}; + ProbT *nll_backward_{nullptr}; + ProbT *denoms_{nullptr}; // Temporary storage for denoms for softmax + ProbT *probs_{nullptr}; // Temporary storage for probabilities (softmax output) }; template ctcStatus_t GpuCTC::setup_gpu_metadata(const int* const flat_labels, const int* const label_lengths, - const int* const input_lengths) + const int* const input_lengths, + ProbT* costs) { size_t gpu_bytes_used = 0; - nll_forward_ = - reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += minibatch_ * sizeof(ProbT); - + if (param_configs_.copy_costs_to_cpu) { + nll_forward_ = + reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(ProbT); + } else { + // input costs should be a device ptr. + nll_forward_ = costs; + } nll_backward_ = reinterpret_cast(static_cast(gpu_workspace_) + @@ -181,30 +204,14 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, Lmax = std::max(Lmax, L); } -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(&(repeats_[start_idx]), repeats, - (end_idx - start_idx) * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), repeats, - (end_idx - start_idx) * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + &(repeats_[start_idx]), repeats, (end_idx - start_idx) * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(&(label_offsets_[start_idx]), label_offsets, - (end_idx - start_idx) * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), label_offsets, - (end_idx - start_idx) * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + &(label_offsets_[start_idx]), label_offsets, (end_idx - start_idx) * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; } @@ -220,16 +227,8 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, gpu_bytes_used); gpu_bytes_used += minibatch_ * sizeof(int); -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(utt_length_, input_lengths, - minibatch_ * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(utt_length_, input_lengths, - minibatch_ * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + utt_length_, input_lengths, minibatch_ * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; @@ -238,16 +237,8 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, gpu_bytes_used); gpu_bytes_used += minibatch_ * sizeof(int); -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(label_sizes_, label_lengths, - minibatch_ * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(label_sizes_, label_lengths, - minibatch_ * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + label_sizes_, label_lengths, minibatch_ * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; @@ -256,16 +247,8 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, gpu_bytes_used); gpu_bytes_used += Lmax * minibatch_ * sizeof(int); -#ifdef __HIPCC__ - cuda_status = hipMemcpyAsync(labels_without_blanks_, flat_labels, - total_label_length * sizeof(int), - hipMemcpyHostToDevice, stream_); -#else - cuda_status = cudaMemcpyAsync(labels_without_blanks_, flat_labels, - total_label_length * sizeof(int), - cudaMemcpyHostToDevice, stream_); -#endif - + cuda_status = warpctc::memcpy_h2d_async( + labels_without_blanks_, flat_labels, total_label_length * sizeof(int), stream_); if (cuda_status != gpuSuccess) return CTC_STATUS_MEMOPS_FAILED; @@ -279,7 +262,6 @@ GpuCTC::setup_gpu_metadata(const int* const flat_labels, gpu_bytes_used); gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT); - denoms_ = reinterpret_cast(static_cast(gpu_workspace_) + gpu_bytes_used); @@ -307,25 +289,19 @@ ctcStatus_t GpuCTC::launch_alpha_beta_kernels(const ProbT* const probs, // away const int stride = minibatch_; - if (compute_alpha) + if (compute_alpha) { compute_alpha_kernel<<>> (probs, label_sizes_, utt_length_, repeats_, labels_without_blanks_, label_offsets_, labels_with_blanks_, alphas_, nll_forward_, stride, out_dim_, S_, T_, blank_label_); - + } if (compute_beta) { compute_betas_and_grad_kernel<<>> (probs, label_sizes_, utt_length_, repeats_, labels_with_blanks_, alphas_, nll_forward_, nll_backward_, grads, stride, out_dim_, S_, T_, blank_label_); - -#ifdef __HIPCC__ - hipStreamSynchronize(stream_); -#else - cudaStreamSynchronize(stream_); -#endif } #ifdef __HIPCC__ @@ -345,10 +321,11 @@ ctcStatus_t GpuCTC::create_metadata_and_choose_config(const int* const flat_labels, const int* const label_lengths, const int* const input_lengths, + ProbT* costs, size_t& best_config) { // Setup the metadata for GPU - ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths); + ctcStatus_t status = setup_gpu_metadata(flat_labels, label_lengths, input_lengths, costs); if (status != CTC_STATUS_SUCCESS) return status; @@ -472,6 +449,7 @@ GpuCTC::compute_cost_and_score(const ProbT* const activations, ctcStatus_t status = create_metadata_and_choose_config(flat_labels, label_lengths, input_lengths, + costs, best_config); if (status != CTC_STATUS_SUCCESS) return status; @@ -483,26 +461,13 @@ GpuCTC::compute_cost_and_score(const ProbT* const activations, launch_gpu_kernels(probs_, grads, best_config, compute_alpha, compute_betas_and_grad); - gpuError_t cuda_status_mem, cuda_status_sync; - -#ifdef __HIPCC__ - cuda_status_mem = hipMemcpyAsync(costs, nll_forward_, - sizeof(ProbT) * minibatch_, - hipMemcpyDeviceToHost, stream_); -#else - cuda_status_mem = cudaMemcpyAsync(costs, nll_forward_, - sizeof(ProbT) * minibatch_, - cudaMemcpyDeviceToHost, stream_); -#endif - -#ifdef __HIPCC__ - cuda_status_sync = hipStreamSynchronize(stream_); -#else - cuda_status_sync = cudaStreamSynchronize(stream_); -#endif - - if (cuda_status_mem != gpuSuccess || cuda_status_sync != gpuSuccess) - return CTC_STATUS_MEMOPS_FAILED; + if (param_configs_.copy_costs_to_cpu) { + // Copy the costs back to CPU. + gpuError_t cuda_status_mem = warpctc::memcpy_d2h_sync( + costs, nll_forward_, sizeof(ProbT) * minibatch_, stream_); + if (cuda_status_mem != gpuSuccess) + return CTC_STATUS_MEMOPS_FAILED; + } return CTC_STATUS_SUCCESS; } diff --git a/include/detail/gpu_helper.h b/include/detail/gpu_helper.h new file mode 100644 index 0000000..55473e0 --- /dev/null +++ b/include/detail/gpu_helper.h @@ -0,0 +1,45 @@ +#pragma once + +#include "type_defs.h" + +namespace warpctc { + +static gpuError_t memcpy_d2h_async(void *dst, const void *src, size_t bytes, GPUstream stream) { + gpuError_t status; +#ifdef __HIPCC__ + status = hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, stream); +#else + status = cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, stream); +#endif + return status; +} + +static gpuError_t memcpy_h2d_async(void *dst, const void *src, size_t bytes, GPUstream stream) { + gpuError_t status; +#ifdef __HIPCC__ + status = hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, stream); +#else + status = cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream); +#endif + return status; +} + +static gpuError_t synchronize(GPUstream stream) { + gpuError_t status; +#ifdef __HIPCC__ + status = hipStreamSynchronize(stream); +#else + status = cudaStreamSynchronize(stream); +#endif + return status; +} + +static gpuError_t memcpy_d2h_sync(void *dst, const void *src, size_t bytes, GPUstream stream) { + gpuError_t status = memcpy_d2h_async(dst, src, bytes, stream); + if (status != gpuSuccess) { + return status; + } + return synchronize(stream); +} + +} // namespace warpctc diff --git a/src/ctc_entrypoint.cpp b/src/ctc_entrypoint.cpp index f56f6a8..01aefb2 100644 --- a/src/ctc_entrypoint.cpp +++ b/src/ctc_entrypoint.cpp @@ -72,7 +72,10 @@ ctcStatus_t compute_ctc_loss(const float* const activations, } else if (options.loc == CTC_GPU) { #if (defined(__HIPCC__) || defined(__CUDACC__)) GpuCTC ctc(alphabet_size, minibatch, workspace, options.stream, - options.blank_label); + options.blank_label, options.params.flat_labels_loc == CTC_CPU, + options.params.label_lengths_loc == CTC_CPU, + options.params.input_lengths_loc == CTC_CPU, + options.params.costs_loc == CTC_CPU); if (gradients != NULL) return ctc.cost_and_grad(activations, gradients, costs, @@ -112,7 +115,7 @@ ctcStatus_t compute_ctc_loss_double(const double* const activations, if (options.loc == CTC_CPU) { CpuCTC ctc(alphabet_size, minibatch, workspace, options.num_threads, - options.blank_label); + options.blank_label); if (gradients != NULL) return ctc.cost_and_grad(activations, gradients, @@ -125,7 +128,10 @@ ctcStatus_t compute_ctc_loss_double(const double* const activations, } else if (options.loc == CTC_GPU) { #if (defined(__HIPCC__) || defined(__CUDACC__)) GpuCTC ctc(alphabet_size, minibatch, workspace, options.stream, - options.blank_label); + options.blank_label, options.params.flat_labels_loc == CTC_CPU, + options.params.label_lengths_loc == CTC_CPU, + options.params.input_lengths_loc == CTC_CPU, + options.params.costs_loc == CTC_CPU); if (gradients != NULL) return ctc.cost_and_grad(activations, gradients, costs, @@ -167,8 +173,13 @@ ctcStatus_t get_workspace_size(const int* const label_lengths, if (options.loc == CTC_GPU) { // GPU storage - //nll_forward, nll_backward - *size_bytes += 2 * sizeof(float) * minibatch; + if (options.params.costs_loc == CTC_CPU) { + // nll_forward, nll_backward + *size_bytes += 2 * sizeof(float) * minibatch; + } else { + // nll_backward + *size_bytes += 1 * sizeof(float) * minibatch; + } //repeats *size_bytes += sizeof(int) * minibatch; @@ -249,8 +260,13 @@ ctcStatus_t get_workspace_size_double(const int* const label_lengths, if (options.loc == CTC_GPU) { // GPU storage - //nll_forward, nll_backward - *size_bytes += 2 * sizeof(double) * minibatch; + if (options.params.costs_loc == CTC_CPU) { + // nll_forward, nll_backward + *size_bytes += 2 * sizeof(double) * minibatch; + } else { + // nll_backward + *size_bytes += 1 * sizeof(double) * minibatch; + } //repeats *size_bytes += sizeof(int) * minibatch; diff --git a/src/reduce.cu b/src/reduce.cu index e3c8217..9aac755 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -143,10 +143,8 @@ ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output, int rows, int cols, ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream); #ifdef __HIPCC__ - hipStreamSynchronize(stream); gpuError_t err = hipGetLastError(); #else - cudaStreamSynchronize(stream); gpuError_t err = cudaGetLastError(); #endif diff --git a/tests/test.h b/tests/test.h index 22f87b0..1bf804f 100644 --- a/tests/test.h +++ b/tests/test.h @@ -5,9 +5,22 @@ #include #include #include +#include #include +#define CHECK_STATUS(func_call, status) \ + { \ + bool s = func_call; \ + std::string funcname = #func_call; \ + if (s) { \ + std::cout << "-- TEST " << funcname << " PASS!\n"; \ + } else { \ + std::cout << "-- TEST " << funcname << " FAILED!\n"; \ + } \ + status &= s; \ + } + inline void throw_on_error(ctcStatus_t status, const char* message) { if (status != CTC_STATUS_SUCCESS) { throw std::runtime_error(message + (", stat = " + diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp index e710fbc..048496b 100644 --- a/tests/test_cpu.cpp +++ b/tests/test_cpu.cpp @@ -374,16 +374,16 @@ int main(void) { std::cout << "Running CPU tests" << std::endl; bool status = true; - status &= small_test(); - status &= options_test(); - status &= inf_test(); - status &= run_tests(); + CHECK_STATUS(small_test(), status); + CHECK_STATUS(options_test(), status); + CHECK_STATUS(inf_test(), status); + CHECK_STATUS(run_tests(), status); if (status) { - std::cout << "Tests pass" << std::endl; + std::cout << "All tests PASS!" << std::endl; return 0; } else { - std::cout << "Some or all tests fail" << std::endl; + std::cout << "Some or all tests FAILED!" << std::endl; return 1; } } diff --git a/tests/test_gpu.cu b/tests/test_gpu.cu index a94d93a..784114c 100644 --- a/tests/test_gpu.cu +++ b/tests/test_gpu.cu @@ -678,16 +678,16 @@ int main(void) { #endif bool status = true; - status &= small_test(); - status &= options_test(); - status &= inf_test(); - status &= run_tests(); + CHECK_STATUS(small_test(), status); + CHECK_STATUS(options_test(), status); + CHECK_STATUS(inf_test(), status); + CHECK_STATUS(run_tests(), status); if (status) { - std::cout << "Tests pass" << std::endl; + std::cout << "All tests PASS!" << std::endl; return 0; } else { - std::cout << "Some or all tests fail" << std::endl; + std::cout << "Some or all tests FAILED!" << std::endl; return 1; } }