Fix the ORC decoding bug for timestamp data #17570

Open
wants to merge 18 commits into base: branch-25.02
Changes from 13 commits
173 changes: 171 additions & 2 deletions cpp/src/io/orc/stripe_data.cu
@@ -132,6 +132,146 @@ struct orcdec_state_s {
} vals;
};

/**
* @brief Manage caching of the first run of TIMESTAMP's DATA stream for a row group.
*
 * This class addresses a special case where the first run spans two adjacent row groups
 * and its length exceeds the maximum length allowed to be consumed, a limit imposed by
 * the decoder when processing the SECONDARY stream. The class is instantiated in shared
 * memory and caches the DATA stream with a decoded data type of `int64_t`. As an
 * optimization, the actual cache is a per-thread local variable and does not reside in
 * shared memory.
*/
class run_cache_manager {
private:
enum class status : uint8_t {
DISABLED, ///< Run cache manager is disabled. No caching will be performed.
CAN_WRITE_TO_CACHE, ///< Run cache manager is ready for write. This is expected to happen in
///< the first iteration of the top-level while-loop in
///< gpuDecodeOrcColumnData().
CAN_READ_FROM_CACHE, ///< Run cache manager is ready for read. This is expected to happen in
///< the second iteration of the while-loop.
};

public:
/**
* @brief Initialize the run cache manager.
*
* @param[in] s ORC decoder state.
*/
__device__ void initialize(orcdec_state_s* s)
{
_status = (s->top.data.index.run_pos[CI_DATA2] > 0 and s->chunk.type_kind == TIMESTAMP)
? status::CAN_WRITE_TO_CACHE
: status::DISABLED;
_reusable_length = 0;
_run_length = 0;
}

/**
 * @brief Set the reusable length.
*
* @param[in] run_length The length of the first run (spanning two adjacent row groups) of the
* DATA stream.
* @param[in] max_length The maximum length allowed to be consumed. This limit is imposed
* by the decoder when processing the SECONDARY stream.
*/
__device__ void set_reusable_length(uint32_t run_length, uint32_t max_length)
{
if (_status == status::CAN_WRITE_TO_CACHE) {
_run_length = run_length;
_reusable_length = (_run_length > max_length) ? (_run_length - max_length) : 0;
}
}

/**
 * @brief Adjust the maximum length allowed to be consumed when the length of the first run
 * exceeds it.
*
* @param[in] max_length The maximum length allowed to be consumed for the DATA stream.
* @return A new maximum length.
*/
[[nodiscard]] __device__ uint32_t adjust_max_length(uint32_t max_length)
{
auto new_max_length{max_length};
if (_status == status::CAN_READ_FROM_CACHE) { new_max_length -= _reusable_length; }
return new_max_length;
}

/**
* @brief Copy the excess data from the intermediate buffer for the DATA stream to the cache.
*
* @param[in] src Intermediate buffer for the DATA stream.
* @param[out] cache Local variable serving as the cache for the DATA stream.
*/
__device__ void write_to_cache(int64_t* src, int64_t& cache)
{
if (_status != status::CAN_WRITE_TO_CACHE) { return; }

const auto tid = threadIdx.x;

// All threads in the block always take a uniform code path for the following branches.
// _reusable_length ranges between [0, 512].
if (_reusable_length > 0) {
const auto length_to_skip = _run_length - _reusable_length;
if (tid < _reusable_length) {
const auto src_idx = tid + length_to_skip;
cache = src[src_idx];
}
// Block until all writes are done to safely change _status.
__syncthreads();
if (tid == 0) { _status = status::CAN_READ_FROM_CACHE; }
} else {
__syncthreads();
if (tid == 0) { _status = status::DISABLED; }
}

__syncthreads();
}

/**
* @brief Copy the cached data to the intermediate buffer for the DATA stream.
*
* @param[in,out] dst Intermediate buffer for the DATA stream.
* @param[in,out] rle Run length decoder state object.
* @param[in] cache Local variable serving as the cache for the DATA stream.
*/
__device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle, int64_t cache)
{
if (_status != status::CAN_READ_FROM_CACHE) { return; }

const auto tid = threadIdx.x;

// First, shift the data up
const auto dst_idx = tid + _reusable_length;
const auto v = (dst_idx < rle->num_vals + _reusable_length) ? dst[tid] : 0;
__syncthreads();

if (dst_idx < rle->num_vals + _reusable_length) { dst[dst_idx] = v; }
__syncthreads();

// Second, insert the cached data
if (tid < _reusable_length) { dst[tid] = cache; }
__syncthreads();

if (tid == 0) {
// Disable the run cache manager, since cache write-and-read happens at most once per row
// group.
_status = status::DISABLED;
rle->num_vals += _reusable_length;
}

__syncthreads();
}

private:
status _status; ///< The status of the run cache manager.
  uint32_t _reusable_length;  ///< The number of data elements to be cached and reused later. For
                              ///< example, if a run has a length of 512 but the maximum length
                              ///< allowed to be consumed is capped at 162, then 350 (512 - 162)
                              ///< elements will be cached.
uint32_t _run_length; ///< The length of the run, 512 in the above example.
};
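To make the length bookkeeping concrete, here is a minimal host-side model of the manager's arithmetic (illustrative only, not part of the patch; it reuses the 512/162 example from the member docs):

#include <cassert>
#include <cstdint>

// Iteration 1: the first run (512 values) exceeds the cap imposed while the
// SECONDARY stream is decoded (162), so the excess 350 values are cached.
// Iteration 2: the cap is reduced by 350 so the cached values fit in front
// of the newly decoded ones.
int main()
{
  uint32_t const run_length = 512;  // first run, spanning two row groups
  uint32_t const max_length = 162;  // cap imposed by the SECONDARY stream

  // set_reusable_length():
  uint32_t const reusable = (run_length > max_length) ? run_length - max_length : 0;
  assert(reusable == 350);

  // adjust_max_length() during the next iteration:
  uint32_t const next_max_length = 512;       // hypothetical next cap
  assert(next_max_length - reusable == 162);  // room left for new values
  return 0;
}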

/**
* @brief Initializes byte stream, modifying length and start position to keep the read pointer
* 8-byte aligned.
@@ -631,6 +771,9 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = {
* @param[in] maxvals maximum number of values to decode
* @param[in] t thread id
 * @param[in] has_buffered_values If true, there are already buffered values
* @param[in] run_cache_manager_inst If non-null, the run cache manager will be used to manage
* caching of the first run of the DATA stream.
* @param[in] cache Local variable serving as the cache.
*
* @return number of values decoded
*/
@@ -640,9 +783,14 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
T* vals,
uint32_t maxvals,
int t,
bool has_buffered_values = false)
bool has_buffered_values = false,
run_cache_manager* run_cache_manager_inst = nullptr,
Contributor:

Could we use a cuda::std::optional instead?

Aside: if we had C++23, the optional usage in this file would be a great use case for std::optional::and_then.

Contributor:

A naive question from someone with very limited knowledge of our I/O: I only see one usage of this function outside of the gpuDecodeOrcColumnData kernel, and that's inside gpuDecodeNullsAndStringDictionaries. Are we guaranteed not to need this cache in that case (again, asking naively since I don't really know what that function does)?

Contributor Author:

I haven't studied the function gpuDecodeNullsAndStringDictionaries yet, and I'm on the conservative side about applying the cache to existing code.

I'm not quite sure how to use cuda::std::optional or what its advantage is when it comes to function default arguments. Since the function will have a side effect on the optional object, should I pass it by reference, cuda::std::optional<X>&? But then I can't give it a default value of cuda::std::nullopt, and would have to pass cuda::std::nullopt to every call that doesn't need X. Is my understanding correct?
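For reference, a minimal sketch of what the cuda::std::optional variant might look like, assuming libcu++'s <cuda/std/optional>. Since an optional cannot hold a reference, it would still wrap the pointer, so call sites end up close to the raw-pointer default used in this PR (the function name is illustrative):

#include <cuda/std/optional>

// Illustrative only: optional-of-pointer variant of the new parameter.
// cuda::std::optional<T&> is ill-formed, so the optional carries a pointer,
// and a default of cuda::std::nullopt plays the role of nullptr.
__device__ uint32_t integer_rlev2_sketch(
  uint32_t maxvals,
  cuda::std::optional<run_cache_manager*> run_cache = cuda::std::nullopt)
{
  if (run_cache.has_value()) { maxvals = run_cache.value()->adjust_max_length(maxvals); }
  return maxvals;
}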

int64_t* cache = nullptr)
{
if (t == 0) {
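    // If cached values will be read back this iteration, reduce the consumable length so
    // the cached values fit once they are re-inserted.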
if (run_cache_manager_inst != nullptr) {
maxvals = run_cache_manager_inst->adjust_max_length(maxvals);
}
uint32_t maxpos = min(bs->len, bs->pos + (bytestream_buffer_size - 8u));
uint32_t lastpos = bs->pos;
auto numvals = 0;
@@ -685,6 +833,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
l += deltapos;
}
}

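    // Record how much of the current run exceeds the cap; the excess is what will be
    // written to the cache once the run has been decoded.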
if (run_cache_manager_inst != nullptr) {
run_cache_manager_inst->set_reusable_length(n, maxvals);
}

if ((numvals != 0) and (numvals + n > maxvals)) break;
// case where there are buffered values and can't consume a whole chunk
// from decoded values, so skip adding any more to buffer, work on buffered values and then
@@ -866,6 +1019,17 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
__syncwarp();
}
__syncthreads();
  // Currently run_cache_manager is only designed to fix the TIMESTAMP DATA stream bug, where
  // the decoded data type is int64_t.
if constexpr (cuda::std::is_same_v<T, int64_t>) {
if (run_cache_manager_inst != nullptr) {
// Run cache is read from during the 2nd iteration of the top-level while-loop in
// gpuDecodeOrcColumnData().
run_cache_manager_inst->read_from_cache(vals, rle, *cache);
// Run cache is written to during the 1st iteration of the loop.
run_cache_manager_inst->write_to_cache(vals, *cache);
}
}
return rle->num_vals;
}

@@ -1401,6 +1565,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
// Struct doesn't have any data in itself, so skip
bool const is_valid = s->chunk.type_kind != STRUCT;
size_t const max_num_rows = s->chunk.column_num_rows;
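// The manager lives in shared memory; the cached value itself is a per-thread local variable.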
__shared__ run_cache_manager run_cache_manager_inst;
int64_t cache{};
if (t == 0 and is_valid) {
// If we have an index, seek to the initial run and update row positions
if (num_rowgroups > 0) {
@@ -1443,6 +1609,8 @@

bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]);
bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]);

run_cache_manager_inst.initialize(s);
}
__syncthreads();

@@ -1602,7 +1770,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
if (is_rlev1(s->chunk.encoding_kind)) {
numvals = Integer_RLEv1<int64_t>(bs, &s->u.rlev1, s->vals.i64, numvals, t);
} else {
numvals = Integer_RLEv2<int64_t>(bs, &s->u.rlev2, s->vals.i64, numvals, t);
numvals = Integer_RLEv2<int64_t>(
bs, &s->u.rlev2, s->vals.i64, numvals, t, false, &run_cache_manager_inst, &cache);
}
if (s->chunk.type_kind == DECIMAL) {
// If we're using an index, we may have to drop values from the initial run
Binary file not shown.
Binary file not shown.
22 changes: 22 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
@@ -1970,3 +1970,25 @@ def test_row_group_alignment(datadir):
got = cudf.read_orc(buffer)

assert_eq(expected, got)


@pytest.mark.parametrize(
"inputfile",
[
"TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc",
"TestOrcFile.timestamp.desynced.snappy.RLEv2.orc",
],
)
def test_orc_reader_desynced_timestamp(datadir, inputfile):
# Test a special case where the DATA stream (seconds) in a TIMESTAMP column
# has progressed further than the SECONDARY stream (nanoseconds) at the start of a
# row group. In this case, the "run cache manager" in the decoder kernel is used to
# orchestrate the dual-stream processing.
# For more information, see https://github.com/rapidsai/cudf/issues/17155.

path = datadir / inputfile

expect = pd.read_orc(path)
got = cudf.read_orc(path)

assert_frame_equal(cudf.from_pandas(expect), got)