Commit

Merge remote-tracking branch 'origin/branch-23.12' into bot-submodule-sync-branch-23.12
nvauto committed Nov 21, 2023
2 parents b5e2980 + ae68c6b commit b01d8fd
Showing 2 changed files with 42 additions and 3 deletions.
33 changes: 33 additions & 0 deletions CONTRIBUTING.md
@@ -259,6 +259,39 @@ class NormalCaseTest {
}
```

### Debugging
You can add debug symbols selectively to C++ files in spark-rapids-jni by modifying the appropriate
`CMakeLists.txt` files. You will need to add a specific flag depending on what kind of code you are
debugging. For CUDA code, you need to add the `-G` flag to add device debug symbols:

```cmake
set_source_files_properties(src/row_conversion.cu PROPERTIES COMPILE_OPTIONS "-G")
```

For C++ code, you will need to add the `-g` flag to add host debug symbols:

```cmake
set_source_files_properties(row_conversion.cpp PROPERTIES COMPILE_OPTIONS "-g")
```

For debugging C++ tests, you need to add both device debug symbols to the CUDA kernel files involved
in testing (in `src/main/cpp/CMakeLists.txt`) **and** host debug symbols to the C++ files used for
testing (in `src/main/cpp/tests/CMakeLists.txt`), as in the sketch below.
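Taken together, a hedged sketch of the two settings, reusing the file names from the snippets above (substitute the files you actually need to debug):

```cmake
# In src/main/cpp/CMakeLists.txt: device debug symbols for the CUDA sources under test
set_source_files_properties(src/row_conversion.cu PROPERTIES COMPILE_OPTIONS "-G")

# In src/main/cpp/tests/CMakeLists.txt: host debug symbols for the test sources
set_source_files_properties(row_conversion.cpp PROPERTIES COMPILE_OPTIONS "-g")
```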

You can then use `cuda-gdb` to debug the gtest (note: with Docker, start an interactive shell first
and then run `cuda-gdb` inside it; running `cuda-gdb` under Docker is not strictly required):

```bash
./build/run-in-docker
bash-4.2$ cuda-gdb target/cmake-build/gtests/ROW_CONVERSION
```
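
Once `cuda-gdb` has loaded the test binary, a session might look like the following hedged sketch; the breakpoint location and gtest filter value are illustrative, not taken from the repository:

```bash
bash-4.2$ cuda-gdb --args target/cmake-build/gtests/ROW_CONVERSION --gtest_filter='*FixedWidth*'
(cuda-gdb) break row_conversion.cu:1560    # file:line breakpoints work for host and device code
(cuda-gdb) run
(cuda-gdb) info cuda kernels               # list kernels resident on the GPU at the stop point
(cuda-gdb) backtrace                       # stack for the current host or device focus
(cuda-gdb) continue
```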

You can also use the [NVIDIA Nsight Visual Studio Code Edition](https://docs.nvidia.com/nsight-visual-studio-code-edition/cuda-debugger/index.html)
integration to debug within Visual Studio Code.

To debug libcudf code, please see [Debugging cuDF](thirdparty/cudf/CONTRIBUTING.md#debugging-cudf)
in the cuDF [CONTRIBUTING](thirdparty/cudf/CONTRIBUTING.md) guide.

### Benchmarks
C++ benchmarks using NVBench live in the `src/main/cpp/benchmarks` directory.
Building these benchmarks requires the `-DBUILD_BENCHMARKS` build option. Once built, the benchmarks
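As a hedged, self-contained sketch of what an NVBench benchmark looks like (the benchmark name, axis, and measured operation are illustrative and not taken from the repository):

```cpp
#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>

// Illustrative benchmark: time an in-place inclusive scan over n integers.
// Built as usual against NVBench's provided main().
static void inclusive_scan_bench(nvbench::state& state)
{
  auto const n = static_cast<std::size_t>(state.get_int64("num_elements"));
  thrust::device_vector<int> data(n, 1);

  state.exec([&](nvbench::launch& launch) {
    // Run the measured work on the stream NVBench is timing.
    thrust::inclusive_scan(thrust::cuda::par.on(launch.get_stream()),
                           data.begin(), data.end(), data.begin());
  });
}

NVBENCH_BENCH(inclusive_scan_bench).add_int64_axis("num_elements", {1 << 16, 1 << 20});
```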
12 changes: 9 additions & 3 deletions src/main/cpp/src/row_conversion.cu
Expand Up @@ -81,6 +81,7 @@ constexpr auto NUM_WARPS_IN_BLOCK = 32;
 #pragma nv_diag_suppress static_var_with_dynamic_init
 
 using namespace cudf;
+using detail::make_device_uvector_sync;
 using detail::make_device_uvector_async;
 using rmm::device_uvector;

@@ -231,7 +232,7 @@ build_string_row_offsets(table_view const& tbl,
 offsets_iter + tbl.num_columns(),
 std::back_inserter(offsets_iterators),
 [](auto const& offset_ptr) { return offset_ptr != nullptr; });
-return make_device_uvector_async(
+return make_device_uvector_sync(
 offsets_iterators, stream, rmm::mr::get_current_device_resource());
 }();
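
For context on the change above from `make_device_uvector_async` to `make_device_uvector_sync`: the asynchronous variant does not synchronize the stream, so the host source must stay alive until the copy actually executes, while the synchronous variant synchronizes before returning. A hedged, illustrative sketch of why that matters when the source is a local host vector (not code from the commit):

```cpp
#include <cudf/detail/utilities/vector_factories.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <vector>

rmm::device_uvector<int> copy_to_device(rmm::cuda_stream_view stream)
{
  std::vector<int> host_vals{1, 2, 3};  // local host storage

  // make_device_uvector_sync synchronizes `stream` after enqueueing the copy,
  // so destroying `host_vals` when this function returns is safe. With the
  // async variant, the copy might still be reading `host_vals` at that point.
  return cudf::detail::make_device_uvector_sync(
    host_vals, stream, rmm::mr::get_current_device_resource());
}
```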

@@ -1556,8 +1557,13 @@ batch_data build_batches(size_type num_rows,
 batch_row_boundaries.push_back(0);
 size_type last_row_end = 0;
 device_uvector<uint64_t> cumulative_row_sizes(num_rows, stream);
-thrust::inclusive_scan(
-  rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin());
+
+// Evaluate the row size values before calling `inclusive_scan` to workaround
+// memory issue in https://github.com/NVIDIA/spark-rapids-jni/issues/1567.
+thrust::copy(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows,
+             cumulative_row_sizes.begin());
+thrust::inclusive_scan(rmm::exec_policy(stream), cumulative_row_sizes.begin(),
+                       cumulative_row_sizes.end(), cumulative_row_sizes.begin());
 
 // This needs to be split this into 2 gig batches. Care must be taken to avoid a batch larger than
 // 2 gigs. Imagine a table with 900 meg rows. The batches should occur every 2 rows, but if a
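To make the workaround in the last hunk easier to follow in isolation, here is a hedged, self-contained sketch of the same pattern with made-up row sizes: the lazily computed values are first materialized into the scan's output buffer with `thrust::copy`, and the inclusive scan then runs in place over that buffer.

```cpp
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/scan.h>

#include <cstdint>
#include <iostream>

// Stand-in for the real per-row size computation: every row is 16 bytes here.
struct fixed_row_size {
  __host__ __device__ std::uint64_t operator()(int) const { return 16; }
};

int main()
{
  int const num_rows = 8;
  auto row_sizes = thrust::make_transform_iterator(
    thrust::make_counting_iterator<int>(0), fixed_row_size{});

  thrust::device_vector<std::uint64_t> cumulative_row_sizes(num_rows);

  // Step 1: materialize the lazily computed sizes into device memory.
  thrust::copy(row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin());

  // Step 2: inclusive scan in place over the materialized values.
  thrust::inclusive_scan(cumulative_row_sizes.begin(),
                         cumulative_row_sizes.end(),
                         cumulative_row_sizes.begin());

  std::uint64_t const total_bytes = cumulative_row_sizes.back();
  std::cout << "total row bytes: " << total_bytes << "\n";
  return 0;
}
```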
