From 91742afb2e25d31610846a69b77b4f64ae6fd07c Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 2 Aug 2023 22:21:10 +0530
Subject: [PATCH 01/47] warp per string parsing of string columns (unicode)

---
 cpp/include/cudf/io/detail/data_casting.cuh | 361 +++++++++++++++++++-
 1 file changed, 353 insertions(+), 8 deletions(-)
diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index d764e8533c6..98c06ee09eb 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include <io/utilities/parsing_utils.cuh>
 
 #include <cudf/column/column.hpp>
@@ -21,6 +23,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/types.hpp>
@@ -295,6 +298,287 @@ process_string(in_iterator_t in_begin,
   return {bytes, data_casting_result::PARSING_SUCCESS};
 }
 
+// 1 warp per string.
+template <typename str_tuple_it>
+__global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
+                                         size_type total_out_strings,
+                                         bitmask_type* null_mask,
+                                         size_type* null_count_data,
+                                         cudf::io::parse_options_view const options,
+                                         size_type* d_offsets,
+                                         char* d_chars)
+{
+  int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  int global_warp_id   = global_thread_id / cudf::detail::warp_size;
+  int warp_lane        = global_thread_id % cudf::detail::warp_size;
+  int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
+  // TODO alignment - aligned access possible?
+
+  // grid-stride loop.
+  for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
+    if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
+      if (!d_chars) d_offsets[istring] = 0;
+      continue;  // gride-stride return;
+    }
+
+    auto in_begin           = str_tuples[istring].first;
+    auto in_end             = in_begin + str_tuples[istring].second;
+    auto const num_in_chars = str_tuples[istring].second;
+
+    // Check if the value corresponds to the null literal
+    auto const is_null_literal =
+      (!d_chars) &&
+      serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
+    if (is_null_literal && null_mask != nullptr) {
+      if (warp_lane == 0) {
+        clear_bit(null_mask, istring);
+        atomicAdd(null_count_data, 1);
+        if (!d_chars) d_offsets[istring] = 0;
+      }
+      continue;  // gride-stride return;
+    }
+    // String values are indicated by keeping the quote character
+    bool const is_string_value =
+      num_in_chars >= 2LL &&
+      (options.quotechar == '\0' ||
+       (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar));
+    char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr;
+
+    // Copy literal/numeric value
+    if (not is_string_value) {
+      if (!d_chars) {
+        if (warp_lane == 0) { d_offsets[istring] = in_end - in_begin; }
+      } else {
+        for (size_type char_index = warp_lane; char_index < (in_end - in_begin);
+             char_index += cudf::detail::warp_size) {
+          d_buffer[char_index] = in_begin[char_index];
+        }
+      }
+      continue;  // gride-stride return;
+    }
+
+    // Exclude beginning and ending quote chars from string range
+    if (!options.keepquotes) {
+      ++in_begin;
+      --in_end;
+    }
+    // auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
+
+    // \uXXXX         6->2/3/4
+    // \uXXXX\uXXXX  12->2/3/4
+    // \"             2->1
+    // _              1->1
+    //
+    // error conditions. (propagate)
+    // c=='\' & curr_idx == end_idx-1; ERROR
+    // [c-1]=='\' &  get_escape[c]==NEC
+    // [c-1]=='\' &  [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT
+    // [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex
+
+    // skip conditions. (scan for size)
+    // c=='\' skip.
+    // [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip.
+
+    // write conditions. (write to d_buffer)
+    // [c-1]!='\' &  [c]!='\' write [c]
+    // [c-1]!='\' &  [c]=='\' skip (unnecessary? already covered? in skip conditions)
+    // [c-1]=='\' &  [c]!=NEC && [c]!=UNICODE_SEQ, write [c]
+    // [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE
+    // [c+1:4]=curr_hex_val
+    //        // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT &&
+    //        hex,DECODE [c+7:4]=next_hex_val
+    //        // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT &&
+    //        hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val
+    //        // if prev_hex_val in high, curr_hex_val in low, skip.
+    //        // if curr_hex_val in high, next_hex_val in low, write u16.
+    // if curr_hex_val not in high, write u8.
+    // before writing, find size, then intra-warp scan for out_idx
+    // propagate offset from 32nd thread to others in warp to carry forward.
+    auto is_hex = [](auto ch) {
+      return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
+    };
+    bool init_state{false};  // for backslash scan calculation
+    auto last_offset = 0;
+    // 0-31, 32-63, ... i*32-n.
+    for (size_type char_index = warp_lane; char_index < (in_end - in_begin);
+         char_index += cudf::detail::warp_size) {
+      auto c            = in_begin[char_index];
+      auto prev_c       = char_index > 0 ? in_begin[char_index - 1] : 'a';
+      auto escaped_char = get_escape_char(c);
+      bool error        = false;
+      // FIXME: \\ at end is a problem here.
+      // \uXXXXe e-u=5 4<=4
+      //  012345
+      error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
+      error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
+      error |= (prev_c == '\\' && c == 'u' &&
+                // TODO check if following condition is right or off by one error.
+                ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
+                 // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) |
+                 !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
+                 !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
+      // propagate error using warp shuffle.
+      error = __any_sync(0xffffffff, error);
+      if (error) {
+        if (warp_lane == 0) {
+          if (null_mask != nullptr) {
+            clear_bit(null_mask, istring);
+            atomicAdd(null_count_data, 1);
+          }
+          last_offset = 0;
+          if (!d_chars) d_offsets[istring] = 0;
+        }
+        break;  // return to grid-stride loop for next string.
+      }
+      // TODO one more error condition of second \uXXXX is not hex.
+      bool skip = false;
+      // TODO FIXME: continue slashes are a problem!
+      // skip |= (prev_c != '\\') && (c=='\\'); // skip '\'
+      // corner case \\uXXXX TODO
+      // skip XXXX in \uXXXX
+      skip |=
+        char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && in_begin[char_index - 1] == 'u';
+      skip |=
+        char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' && in_begin[char_index - 2] == 'u';
+      skip |=
+        char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && in_begin[char_index - 3] == 'u';
+      skip |=
+        char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && in_begin[char_index - 4] == 'u';
+      int this_num_out = 0;
+      cudf::char_utf8 write_char{'a'};
+
+      // auto prev = 0; // carry for is_escape_slash
+      // if prev == escape_slash, then this is escaped_char, so copy.
+      // then this is not escaped_slash regardless of c.
+      // if prev != escape_slash, then curr == '\' is escaped_slash,.
+      // curr_escape_slash = prev==true then 0, if prev=false & c=='\'
+      // curr = !prev && c=='\'
+      //        0  & *
+      //        1  '\'?
+      //                 c=='\'
+      // inclusivesum of custom operator.
+      // check if any c=='\' in warp, if no, then prev=false for all.
+      // else do the scan.
+
+      // curr = !prev & c=='\\'
+      // !curr = !(!prev & c=='\\')
+      // !curr = prev | c!='\\' is it associative? NO
+
+      // not associative!
+      // curr[0] curr[1] curr[2]. op = !prev & c;
+      // op = !a & b;
+      // op( op(x, y), z) = op(!x&y, z) = (!(!x&y))&z = (x | !y)&z = xz | (!y)z
+      // op(x, op(y, z))  =op(x, !y&z)  = !x&(!y &z) = (!x)&(!y)&z
+      auto warp_id = threadIdx.x / 32;
+
+      // problem is when there is continuous \\\\\\\\\\\\\ we don't know which one is escaping
+      // backslash.
+
+      struct state_table {
+        bool state[2];
+      };
+      // using state_table = bool[2]; Try this. and see if compiler errors
+      __shared__ typename cub::WarpScan<state_table>::TempStorage temp_slash[4];
+      state_table curr{c == '\\', false};  // state transition vector.
+      auto composite_op = [](state_table op1, state_table op2) {
+        return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
+      };
+      state_table scanned;
+      // inclusive scan? how?
+      cub::WarpScan<state_table>(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
+      auto is_escaping_backslash = scanned.state[init_state];
+      // init_state                 = __shfl_sync(0xffffffff, is_escaping_backslash, 31);
+      auto last_active_lane = 31 - __clz(__activemask());  // TODO simplify 0xFF case?
+      init_state            = __shfl_sync(0xffffffff, is_escaping_backslash, last_active_lane);
+      // TODO replace/add prev_c with proper scan of escapes
+      skip |= is_escaping_backslash;
+
+      if (!skip) {
+        // is prev_is_not backslash?
+        if (prev_c != '\\') {  // FIXME: enable this after debugging.
+          // if (true) {
+          this_num_out = 1;
+          if (d_chars) write_char = c;
+          // d_buffer[last_offset+ this_num_out_scaned] = c;
+        } else {
+          // already taken care early.
+          // if (escaped_char == NON_ESCAPE_CHAR) {
+          //    this_num_out = 0;
+          //    error = true;
+          // } else
+          if (escaped_char != UNICODE_SEQ) {
+            this_num_out = 1;
+            // if(d_chars)
+            write_char = escaped_char;
+            // d_buffer[last_offset+ this_num_out_scaned] = escaped_char;
+          } else {
+            // \uXXXX- u
+            // Unicode
+            auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);
+            auto hex_low_val = 0;
+#if 1
+            // if next is \uXXXX
+            // in_begin + char_index
+            // 01234567890
+            //\uXXXX\uXXXX
+            // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 &&
+            if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' &&
+                in_begin[char_index + 1 + 5] == 'u') {
+              hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6);
+            }
+            if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
+                hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) {
+              // Compute UTF16-encoded code point
+              uint32_t unicode_code_point = 0x10000 +
+                                            ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) +
+                                            (hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
+              write_char   = strings::detail::codepoint_to_utf8(unicode_code_point);
+              this_num_out = strings::detail::bytes_in_char_utf8(write_char);
+              // this_num_out = 0; skip=true;
+            } else {
+              // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6);
+              if (
+                // hex_high_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_high_val <
+                // UTF16_HIGH_SURROGATE_END &&
+                hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) {
+                skip         = true;
+                this_num_out = 0;
+                write_char   = 0;
+              } else {
+                // if u8
+                write_char   = strings::detail::codepoint_to_utf8(hex_val);
+                this_num_out = strings::detail::bytes_in_char_utf8(write_char);
+              }
+            }
+#endif
+          }
+        }
+      }  // !skip end.
+      {
+        // TODO think about writing error conditions as normal, so that program flow is easy to read
+        // and can process error here.
+        // WRITE now (compute out_idx offset then write)
+        // intra-warp scan of this_num_out.
+        // TODO union to save shared memory
+        __shared__ cub::WarpScan<size_type>::TempStorage temp_storage[4];
+        size_type offset;
+        cub::WarpScan<size_type>(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset);
+        offset += last_offset;
+        // TODO add last active lane this_num_out for correct last_offset.
+        if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); }
+        __shared__ cub::WarpReduce<size_type>::TempStorage temp_storage2[4];
+        last_offset += cub::WarpReduce<size_type>(temp_storage2[warp_id]).Sum(this_num_out);
+        last_offset = __shfl_sync(0xffffffff, last_offset, 0);
+        // offset += this_num_out;
+        // auto last_active_lane = __ffs(__brev(__activemask())); // TODO simplify 0xFF case?
+        // last_offset = __shfl_sync(0xffffffff, offset, 31-last_active_lane);  // TODO is mask
+        // right?
+      }
+    }  // char for-loop
+    if (!d_chars && warp_lane == 0) { d_offsets[istring] = last_offset; }
+  }    // grid-stride for-loop
+}
+
 template <typename str_tuple_it>
 struct string_parse {
   str_tuple_it str_tuples;
@@ -365,20 +649,81 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
   auto d_null_count    = rmm::device_scalar<size_type>(null_count, stream);
   auto null_count_data = d_null_count.data();
 
+  auto d_null_count2    = rmm::device_scalar<size_type>(null_count, stream);
+  auto null_count_data2 = d_null_count2.data();
+
   if (col_type == cudf::data_type{cudf::type_id::STRING}) {
     // this utility calls the functor to build the offsets and chars columns;
     // the bitmask and null count may be updated by parse failures
-    auto [offsets, chars] = cudf::strings::detail::make_strings_children(
-      string_parse<decltype(str_tuples)>{
-        str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
+    nvtxRangePush("make_strings_children");
+    // auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+    //   string_parse<decltype(str_tuples)>{
+    //     str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
+    //   col_size,
+    //   stream,
+    //   mr);
+    nvtxRangePop();
+
+    // {
+    nvtxRangePush("string_parallel");
+    auto offsets2 = cudf::make_numeric_column(
+      data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
+    auto d_offsets = offsets2->mutable_view().data<size_type>();
+    parse_fn_string_parallel<<<min(65536, col_size / 4 + 1), 32 * 4, 0, stream.value()>>>(
+      str_tuples,
+      col_size,
+      static_cast<bitmask_type*>(null_mask.data()),
+      null_count_data2,
+      options,
+      d_offsets,
+      nullptr);
+    // if (0) {
+    // auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span<size_type const>(d_offsets,
+    // offsets2->size()), stream); for(auto i: h_offsets2) std::cout<<i<<","; std::cout<<std::endl;
+    // }
+    auto const bytes =
+      cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
+
+    // CHARS column
+    std::unique_ptr<column> chars2 =
+      strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
+    auto d_chars2 = chars2->mutable_view().data<char>();
+    cudaMemsetAsync(d_chars2, 'c', bytes, stream.value());
+
+    parse_fn_string_parallel<<<min(65536, col_size / 4 + 1), 32 * 4, 0, stream.value()>>>(
+      str_tuples,
       col_size,
-      stream,
-      mr);
+      static_cast<bitmask_type*>(null_mask.data()),
+      null_count_data2,
+      options,
+      d_offsets,
+      d_chars2);
+
+    // if(bytes!=chars->size()) {
+    // std::cout<<"new bytes="<<bytes<<std::endl;
+    // auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span<size_type const>(d_offsets,
+    // offsets2->size()), stream); for(auto i: h_offsets2) std::cout<<i<<","; std::cout<<std::endl;
+
+    // auto h_chars2 = cudf::detail::make_std_vector_sync(device_span<char const>(d_chars2, bytes),
+    // stream); for(auto i: h_chars2) std::cout<<i; std::cout<<std::endl;
+    // }
+    // if(bytes!=chars->size()) {
+    // std::cout<<"old bytes="<<chars->size()<<std::endl;
+    // auto d_offsetsa = (offsets->mutable_view())
+    // .template data<size_type>();
+    // auto h_offsets = cudf::detail::make_std_vector_sync(device_span<size_type const>(d_offsetsa,
+    // offsets->size()), stream); for(auto i: h_offsets) std::cout<<i<<","; std::cout<<std::endl;
+    // auto d_chars = chars->mutable_view().template data<char>();
+    // auto h_chars = cudf::detail::make_std_vector_sync(device_span<char const>(d_chars,
+    // chars->size()), stream); for(auto i: h_chars) std::cout<<i; std::cout<<std::endl;
+    // }
+    nvtxRangePop();
+    // }
 
     return make_strings_column(col_size,
-                               std::move(offsets),
-                               std::move(chars),
-                               d_null_count.value(stream),
+                               std::move(offsets2),
+                               std::move(chars2),
+                               d_null_count2.value(stream),
                                std::move(null_mask));
   }
 

From 51e70e2684c8fab01498e25a9c4f395703a1ba89 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 2 Aug 2023 22:22:26 +0530
Subject: [PATCH 02/47] remove dependency of data_casting.cuh in write_json.cu

---
 cpp/src/io/json/write_json.cu | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index 9ecf77a798a..48a95cb3ec2 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -27,9 +27,9 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/data_sink.hpp>
-#include <cudf/io/detail/data_casting.cuh>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
@@ -61,6 +61,21 @@
 #include <vector>
 
 namespace cudf::io::json::detail {
+__device__ __forceinline__ thrust::pair<char, char> get_escaped_char(char escaped_char)
+{
+  switch (escaped_char) {
+    case '"': return {'\\', '"'};
+    case '\\': return {'\\', '\\'};
+    case '/': return {'\\', '/'};
+    case '\b': return {'\\', 'b'};
+    case '\f': return {'\\', 'f'};
+    case '\n': return {'\\', 'n'};
+    case '\r': return {'\\', 'r'};
+    case '\t': return {'\\', 't'};
+    // case 'u': return UNICODE_SEQ;
+    default: return {'\0', escaped_char};
+  }
+}
 
 std::unique_ptr<column> make_column_names_column(host_span<column_name_info const> column_names,
                                                  size_type num_columns,
@@ -148,7 +163,7 @@ struct escape_strings_fn {
         }
         continue;
       }
-      auto escaped_chars = cudf::io::json::experimental::detail::get_escaped_char(utf8_char);
+      auto escaped_chars = get_escaped_char(utf8_char);
       if (escaped_chars.first == '\0') {
         write_char(escaped_chars.second, d_buffer, bytes);
       } else {

From 318b4a3e7d2d2f421e62477d8667af714dbeafac Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 3 Aug 2023 00:41:19 +0530
Subject: [PATCH 03/47] cleanup

---
 cpp/include/cudf/io/detail/data_casting.cuh | 119 +++++++++++---------
 1 file changed, 64 insertions(+), 55 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 98c06ee09eb..bb58c79e7a3 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -298,6 +298,39 @@ process_string(in_iterator_t in_begin,
   return {bytes, data_casting_result::PARSING_SUCCESS};
 }
 
+// 1 warp per string.
+// algorithm
+
+// \uXXXX         6->2/3/4
+// \uXXXX\uXXXX  12->2/3/4
+// \"             2->1
+// _              1->1
+//
+// error conditions. (propagate)
+// c=='\' & curr_idx == end_idx-1; ERROR
+// [c-1]=='\' &  get_escape[c]==NEC
+// [c-1]=='\' &  [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT
+// [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex
+
+// skip conditions. (scan for size)
+// c=='\' skip.
+// [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip.
+
+// write conditions. (write to d_buffer)
+// [c-1]!='\' &  [c]!='\' write [c]
+// [c-1]!='\' &  [c]=='\' skip (unnecessary? already covered? in skip conditions)
+// [c-1]=='\' &  [c]!=NEC && [c]!=UNICODE_SEQ, write [c]
+// [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE
+// [c+1:4]=curr_hex_val
+//        // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT &&
+//        hex,DECODE [c+7:4]=next_hex_val
+//        // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT &&
+//        hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val
+//        // if prev_hex_val in high, curr_hex_val in low, skip.
+//        // if curr_hex_val in high, next_hex_val in low, write u16.
+// if curr_hex_val not in high, write u8.
+// before writing, find size, then intra-warp scan for out_idx
+// propagate offset from 32nd thread to others in warp to carry forward.
 // 1 warp per string.
 template <typename str_tuple_it>
 __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
@@ -364,36 +397,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     }
     // auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
 
-    // \uXXXX         6->2/3/4
-    // \uXXXX\uXXXX  12->2/3/4
-    // \"             2->1
-    // _              1->1
-    //
-    // error conditions. (propagate)
-    // c=='\' & curr_idx == end_idx-1; ERROR
-    // [c-1]=='\' &  get_escape[c]==NEC
-    // [c-1]=='\' &  [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT
-    // [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex
-
-    // skip conditions. (scan for size)
-    // c=='\' skip.
-    // [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip.
-
-    // write conditions. (write to d_buffer)
-    // [c-1]!='\' &  [c]!='\' write [c]
-    // [c-1]!='\' &  [c]=='\' skip (unnecessary? already covered? in skip conditions)
-    // [c-1]=='\' &  [c]!=NEC && [c]!=UNICODE_SEQ, write [c]
-    // [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE
-    // [c+1:4]=curr_hex_val
-    //        // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT &&
-    //        hex,DECODE [c+7:4]=next_hex_val
-    //        // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT &&
-    //        hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val
-    //        // if prev_hex_val in high, curr_hex_val in low, skip.
-    //        // if curr_hex_val in high, next_hex_val in low, write u16.
-    // if curr_hex_val not in high, write u8.
-    // before writing, find size, then intra-warp scan for out_idx
-    // propagate offset from 32nd thread to others in warp to carry forward.
     auto is_hex = [](auto ch) {
       return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
     };
@@ -622,6 +625,16 @@ struct string_parse {
     }
   }
 };
+
+template <typename T>
+void print_raw(T const* ptr, size_type size, rmm::cuda_stream_view stream)
+{
+  auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span<T const>(ptr, size), stream);
+  for (auto i : h_offsets2)
+    std::cout << i << ",";
+  std::cout << std::endl;
+}
+
 /**
  * @brief Parses the data from an iterator of string views, casting it to the given target data type
  *
@@ -655,14 +668,20 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
   if (col_type == cudf::data_type{cudf::type_id::STRING}) {
     // this utility calls the functor to build the offsets and chars columns;
     // the bitmask and null count may be updated by parse failures
+
+#define WARP_PARALLEL
+#ifndef WARP_PARALLEL
     nvtxRangePush("make_strings_children");
-    // auto [offsets, chars] = cudf::strings::detail::make_strings_children(
-    //   string_parse<decltype(str_tuples)>{
-    //     str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
-    //   col_size,
-    //   stream,
-    //   mr);
+    auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+      string_parse<decltype(str_tuples)>{
+        str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
+      col_size,
+      stream,
+      mr);
+    auto& offsets2 = offsets;
+    auto& chars2   = chars;
     nvtxRangePop();
+#else
 
     // {
     nvtxRangePush("string_parallel");
@@ -677,10 +696,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       options,
       d_offsets,
       nullptr);
-    // if (0) {
-    // auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span<size_type const>(d_offsets,
-    // offsets2->size()), stream); for(auto i: h_offsets2) std::cout<<i<<","; std::cout<<std::endl;
-    // }
+    // print_raw(d_offsets, offsets2->size(), stream);
     auto const bytes =
       cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
 
@@ -700,25 +716,18 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       d_chars2);
 
     // if(bytes!=chars->size()) {
-    // std::cout<<"new bytes="<<bytes<<std::endl;
-    // auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span<size_type const>(d_offsets,
-    // offsets2->size()), stream); for(auto i: h_offsets2) std::cout<<i<<","; std::cout<<std::endl;
-
-    // auto h_chars2 = cudf::detail::make_std_vector_sync(device_span<char const>(d_chars2, bytes),
-    // stream); for(auto i: h_chars2) std::cout<<i; std::cout<<std::endl;
+    //   std::cout<<"new bytes="<<bytes<<std::endl;
+    //   print_raw(d_offsets, offsets2->size(), stream);
+    //   print_raw(d_chars2, chars2->size(), stream);
     // }
     // if(bytes!=chars->size()) {
-    // std::cout<<"old bytes="<<chars->size()<<std::endl;
-    // auto d_offsetsa = (offsets->mutable_view())
-    // .template data<size_type>();
-    // auto h_offsets = cudf::detail::make_std_vector_sync(device_span<size_type const>(d_offsetsa,
-    // offsets->size()), stream); for(auto i: h_offsets) std::cout<<i<<","; std::cout<<std::endl;
-    // auto d_chars = chars->mutable_view().template data<char>();
-    // auto h_chars = cudf::detail::make_std_vector_sync(device_span<char const>(d_chars,
-    // chars->size()), stream); for(auto i: h_chars) std::cout<<i; std::cout<<std::endl;
+    //   std::cout<<"old bytes="<<chars->size()<<std::endl;
+    //   print_raw(offsets->view().data<size_type>(), offsets->size(), stream);
+    //   print_raw(chars->view().data<size_type>(), chars->size(), stream);
     // }
     nvtxRangePop();
-    // }
+// }
+#endif
 
     return make_strings_column(col_size,
                                std::move(offsets2),

From 363d5ab65d17b3c4fffef5422f969f321dc36eeb Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 7 Aug 2023 17:36:53 +0530
Subject: [PATCH 04/47] try load balancing with global counter for string index

---
 cpp/include/cudf/io/detail/data_casting.cuh | 38 ++++++++++++++++++---
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index bb58c79e7a3..9977c69a441 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -335,6 +335,7 @@ process_string(in_iterator_t in_begin,
 template <typename str_tuple_it>
 __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
                                          size_type total_out_strings,
+                                         size_type* str_counter,
                                          bitmask_type* null_mask,
                                          size_type* null_count_data,
                                          cudf::io::parse_options_view const options,
@@ -342,13 +343,24 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
                                          char* d_chars)
 {
   int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  int global_warp_id   = global_thread_id / cudf::detail::warp_size;
   int warp_lane        = global_thread_id % cudf::detail::warp_size;
-  int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
+  // int global_warp_id   = global_thread_id / cudf::detail::warp_size;
+  // int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
   // TODO alignment - aligned access possible?
 
   // grid-stride loop.
-  for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
+  auto warp_inc_count = [&]() {
+    size_type istring=0;
+    if(warp_lane==0) {
+      istring = atomicAdd(str_counter, 1);
+    }
+    __syncwarp();
+    return __shfl_sync(0xffffffff, istring, 0);
+  };
+  // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
+  for (size_type istring = warp_inc_count(); istring < total_out_strings; istring = warp_inc_count()) {
+    // if (!d_chars)
+    // printf("%d:%d<%d\n", global_thread_id, istring, total_out_strings);
     if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
       if (!d_chars) d_offsets[istring] = 0;
       continue;  // gride-stride return;
@@ -688,9 +700,23 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
     auto offsets2 = cudf::make_numeric_column(
       data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
     auto d_offsets = offsets2->mutable_view().data<size_type>();
-    parse_fn_string_parallel<<<min(65536, col_size / 4 + 1), 32 * 4, 0, stream.value()>>>(
+
+    int max_blocks = 0;
+    constexpr auto warps_per_block = 8;
+    int threads_per_block = cudf::detail::warp_size * warps_per_block;
+    CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, parse_fn_string_parallel<str_tuple_it>, threads_per_block, 0));
+
+    int device = 0;
+    CUDF_CUDA_TRY(cudaGetDevice(&device));
+    int num_sms = 0;
+    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
+    auto num_blocks = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1));
+    auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream);
+
+    parse_fn_string_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
       str_tuples,
       col_size,
+      str_counter.data(),
       static_cast<bitmask_type*>(null_mask.data()),
       null_count_data2,
       options,
@@ -699,6 +725,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
     // print_raw(d_offsets, offsets2->size(), stream);
     auto const bytes =
       cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
+    str_counter.set_value(0, stream);
 
     // CHARS column
     std::unique_ptr<column> chars2 =
@@ -706,9 +733,10 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
     auto d_chars2 = chars2->mutable_view().data<char>();
     cudaMemsetAsync(d_chars2, 'c', bytes, stream.value());
 
-    parse_fn_string_parallel<<<min(65536, col_size / 4 + 1), 32 * 4, 0, stream.value()>>>(
+    parse_fn_string_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
       str_tuples,
       col_size,
+      str_counter.data(),
       static_cast<bitmask_type*>(null_mask.data()),
       null_count_data2,
       options,

From 58e0d6c502dccd214ccec35ccc91cee7d6433d9c Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 7 Aug 2023 17:40:04 +0530
Subject: [PATCH 05/47] fix intra-warp divergence issue with cub::WarpScan
 stuck

---
 cpp/include/cudf/io/detail/data_casting.cuh | 31 +++++++++++++--------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 9977c69a441..7e46e311dde 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -415,8 +415,13 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     bool init_state{false};  // for backslash scan calculation
     auto last_offset = 0;
     // 0-31, 32-63, ... i*32-n.
-    for (size_type char_index = warp_lane; char_index < (in_end - in_begin);
+    // condition as __ballot_sync(0xffffffff, char_index < (in_end - in_begin)) != 0
+    // to allow entire warp execute but with mask.
+    auto MASK = 0xffffffff;
+    for (size_type char_index = warp_lane; (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0;
+    // char_index<(in_end - in_begin);
          char_index += cudf::detail::warp_size) {
+      bool is_within_bounds = char_index < (in_end - in_begin);
       auto c            = in_begin[char_index];
       auto prev_c       = char_index > 0 ? in_begin[char_index - 1] : 'a';
       auto escaped_char = get_escape_char(c);
@@ -424,6 +429,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       // FIXME: \\ at end is a problem here.
       // \uXXXXe e-u=5 4<=4
       //  012345
+      if(is_within_bounds) {
       error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
       error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
       error |= (prev_c == '\\' && c == 'u' &&
@@ -432,8 +438,9 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
                  // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) |
                  !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
                  !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
+      }
       // propagate error using warp shuffle.
-      error = __any_sync(0xffffffff, error);
+      error = __any_sync(MASK, error);
       if (error) {
         if (warp_lane == 0) {
           if (null_mask != nullptr) {
@@ -446,11 +453,12 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         break;  // return to grid-stride loop for next string.
       }
       // TODO one more error condition of second \uXXXX is not hex.
-      bool skip = false;
+      bool skip = !is_within_bounds; //false;
       // TODO FIXME: continue slashes are a problem!
       // skip |= (prev_c != '\\') && (c=='\\'); // skip '\'
       // corner case \\uXXXX TODO
       // skip XXXX in \uXXXX
+      if (is_within_bounds) {
       skip |=
         char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && in_begin[char_index - 1] == 'u';
       skip |=
@@ -459,6 +467,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && in_begin[char_index - 3] == 'u';
       skip |=
         char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && in_begin[char_index - 4] == 'u';
+      }
       int this_num_out = 0;
       cudf::char_utf8 write_char{'a'};
 
@@ -484,7 +493,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       // op = !a & b;
       // op( op(x, y), z) = op(!x&y, z) = (!(!x&y))&z = (x | !y)&z = xz | (!y)z
       // op(x, op(y, z))  =op(x, !y&z)  = !x&(!y &z) = (!x)&(!y)&z
-      auto warp_id = threadIdx.x / 32;
 
       // problem is when there is continuous \\\\\\\\\\\\\ we don't know which one is escaping
       // backslash.
@@ -493,18 +501,19 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         bool state[2];
       };
       // using state_table = bool[2]; Try this. and see if compiler errors
-      __shared__ typename cub::WarpScan<state_table>::TempStorage temp_slash[4];
-      state_table curr{c == '\\', false};  // state transition vector.
+      __shared__ typename cub::WarpScan<state_table>::TempStorage temp_slash[8];
+      state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
       auto composite_op = [](state_table op1, state_table op2) {
         return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
       };
       state_table scanned;
-      // inclusive scan? how?
+      auto warp_id = threadIdx.x / 32;
+      // inclusive scan. TODO both inclusive and exclusive available in cub.
       cub::WarpScan<state_table>(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
       auto is_escaping_backslash = scanned.state[init_state];
       // init_state                 = __shfl_sync(0xffffffff, is_escaping_backslash, 31);
-      auto last_active_lane = 31 - __clz(__activemask());  // TODO simplify 0xFF case?
-      init_state            = __shfl_sync(0xffffffff, is_escaping_backslash, last_active_lane);
+      auto last_active_lane = 31 - __clz(MASK);  // TODO simplify 0xFF case?
+      init_state            = __shfl_sync(MASK, is_escaping_backslash, last_active_lane);
       // TODO replace/add prev_c with proper scan of escapes
       skip |= is_escaping_backslash;
 
@@ -531,7 +540,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
             // Unicode
             auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);
             auto hex_low_val = 0;
-#if 1
             // if next is \uXXXX
             // in_begin + char_index
             // 01234567890
@@ -565,7 +573,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
                 this_num_out = strings::detail::bytes_in_char_utf8(write_char);
               }
             }
-#endif
           }
         }
       }  // !skip end.
@@ -575,7 +582,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         // WRITE now (compute out_idx offset then write)
         // intra-warp scan of this_num_out.
         // TODO union to save shared memory
-        __shared__ cub::WarpScan<size_type>::TempStorage temp_storage[4];
+        __shared__ cub::WarpScan<size_type>::TempStorage temp_storage[8];
         size_type offset;
         cub::WarpScan<size_type>(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset);
         offset += last_offset;

From c0edf8f8d3d38d4ca0711a2aea5d23de5dbefcc6 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 7 Aug 2023 17:55:16 +0530
Subject: [PATCH 06/47] remove unnecessary WarpReduce, reduce shmem usage

---
 cpp/include/cudf/io/detail/data_casting.cuh | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 7e46e311dde..48bf540f455 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -369,6 +369,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     auto in_begin           = str_tuples[istring].first;
     auto in_end             = in_begin + str_tuples[istring].second;
     auto const num_in_chars = str_tuples[istring].second;
+    // if(!(num_in_chars >= 1024)) continue;
 
     // Check if the value corresponds to the null literal
     auto const is_null_literal =
@@ -588,13 +589,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         offset += last_offset;
         // TODO add last active lane this_num_out for correct last_offset.
         if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); }
-        __shared__ cub::WarpReduce<size_type>::TempStorage temp_storage2[4];
-        last_offset += cub::WarpReduce<size_type>(temp_storage2[warp_id]).Sum(this_num_out);
-        last_offset = __shfl_sync(0xffffffff, last_offset, 0);
-        // offset += this_num_out;
-        // auto last_active_lane = __ffs(__brev(__activemask())); // TODO simplify 0xFF case?
-        // last_offset = __shfl_sync(0xffffffff, offset, 31-last_active_lane);  // TODO is mask
-        // right?
+        offset += this_num_out;
+        last_offset = __shfl_sync(0xffffffff, offset, 31);
       }
     }  // char for-loop
     if (!d_chars && warp_lane == 0) { d_offsets[istring] = last_offset; }

From 086dfa9de1ad658386b103c6e0ced37f64f90802 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 7 Aug 2023 18:08:04 +0530
Subject: [PATCH 07/47] cleanup comments, unused code

---
 cpp/include/cudf/io/detail/data_casting.cuh | 52 +++++++--------------
 1 file changed, 16 insertions(+), 36 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 48bf540f455..3b06ad08724 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -348,7 +348,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
   // int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
   // TODO alignment - aligned access possible?
 
-  // grid-stride loop.
+  // get 1-string index per warp
   auto warp_inc_count = [&]() {
     size_type istring=0;
     if(warp_lane==0) {
@@ -357,6 +357,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     __syncwarp();
     return __shfl_sync(0xffffffff, istring, 0);
   };
+  // grid-stride loop.
   // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
   for (size_type istring = warp_inc_count(); istring < total_out_strings; istring = warp_inc_count()) {
     // if (!d_chars)
@@ -416,13 +417,11 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     bool init_state{false};  // for backslash scan calculation
     auto last_offset = 0;
     // 0-31, 32-63, ... i*32-n.
-    // condition as __ballot_sync(0xffffffff, char_index < (in_end - in_begin)) != 0
-    // to allow entire warp execute but with mask.
+    // entire warp executes but with mask.
     auto MASK = 0xffffffff;
     for (size_type char_index = warp_lane; (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0;
-    // char_index<(in_end - in_begin);
          char_index += cudf::detail::warp_size) {
-      bool is_within_bounds = char_index < (in_end - in_begin);
+      bool is_within_bounds = char_index < (in_end - in_begin); //TODO more conditions below to avoid out-of-bound memory access.
       auto c            = in_begin[char_index];
       auto prev_c       = char_index > 0 ? in_begin[char_index - 1] : 'a';
       auto escaped_char = get_escape_char(c);
@@ -472,31 +471,16 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       int this_num_out = 0;
       cudf::char_utf8 write_char{'a'};
 
-      // auto prev = 0; // carry for is_escape_slash
-      // if prev == escape_slash, then this is escaped_char, so copy.
-      // then this is not escaped_slash regardless of c.
-      // if prev != escape_slash, then curr == '\' is escaped_slash,.
-      // curr_escape_slash = prev==true then 0, if prev=false & c=='\'
-      // curr = !prev && c=='\'
-      //        0  & *
-      //        1  '\'?
-      //                 c=='\'
-      // inclusivesum of custom operator.
-      // check if any c=='\' in warp, if no, then prev=false for all.
-      // else do the scan.
-
+      // To check current is backslash by checking if previous is backslash.
       // curr = !prev & c=='\\'
-      // !curr = !(!prev & c=='\\')
-      // !curr = prev | c!='\\' is it associative? NO
-
-      // not associative!
-      // curr[0] curr[1] curr[2]. op = !prev & c;
-      // op = !a & b;
-      // op( op(x, y), z) = op(!x&y, z) = (!(!x&y))&z = (x | !y)&z = xz | (!y)z
-      // op(x, op(y, z))  =op(x, !y&z)  = !x&(!y &z) = (!x)&(!y)&z
-
-      // problem is when there is continuous \\\\\\\\\\\\\ we don't know which one is escaping
-      // backslash.
+      // So, scan is required from beginning of string.
+      // State table approach (intra-warp FST)
+      // 2 states: Not-Slash(NS), Slash(S).
+      // prev  /   *
+      // NS    S  NS
+      //  S   NS  NS
+      // After inclusive scan, all current S states translate to escaping backslash.
+      // All escaping backslash should be skipped.
 
       struct state_table {
         bool state[2];
@@ -512,8 +496,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       // inclusive scan. TODO both inclusive and exclusive available in cub.
       cub::WarpScan<state_table>(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
       auto is_escaping_backslash = scanned.state[init_state];
-      // init_state                 = __shfl_sync(0xffffffff, is_escaping_backslash, 31);
-      auto last_active_lane = 31 - __clz(MASK);  // TODO simplify 0xFF case?
+      auto last_active_lane = 31 - __clz(MASK);  // TODO simplify 0xffffffff case?
       init_state            = __shfl_sync(MASK, is_escaping_backslash, last_active_lane);
       // TODO replace/add prev_c with proper scan of escapes
       skip |= is_escaping_backslash;
@@ -524,7 +507,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
           // if (true) {
           this_num_out = 1;
           if (d_chars) write_char = c;
-          // d_buffer[last_offset+ this_num_out_scaned] = c;
+          //FIXME: can you skip write like this for string_size count at other places?
         } else {
           // already taken care early.
           // if (escaped_char == NON_ESCAPE_CHAR) {
@@ -533,9 +516,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
           // } else
           if (escaped_char != UNICODE_SEQ) {
             this_num_out = 1;
-            // if(d_chars)
             write_char = escaped_char;
-            // d_buffer[last_offset+ this_num_out_scaned] = escaped_char;
           } else {
             // \uXXXX- u
             // Unicode
@@ -544,7 +525,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
             // if next is \uXXXX
             // in_begin + char_index
             // 01234567890
-            //\uXXXX\uXXXX
+            //\uXXXX\uXXXX TODO cleanup these conditions.
             // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 &&
             if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' &&
                 in_begin[char_index + 1 + 5] == 'u') {
@@ -558,7 +539,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
                                             (hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
               write_char   = strings::detail::codepoint_to_utf8(unicode_code_point);
               this_num_out = strings::detail::bytes_in_char_utf8(write_char);
-              // this_num_out = 0; skip=true;
             } else {
               // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6);
               if (

From 0aa2c0e0c674c7633027e5c4022655a6edb37c3f Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 11 Aug 2023 20:38:45 +0530
Subject: [PATCH 08/47] add block per string algorithm

---
 cpp/include/cudf/io/detail/data_casting.cuh | 356 ++++++++++++++++++--
 1 file changed, 320 insertions(+), 36 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 3b06ad08724..855f226a0b0 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -350,16 +350,15 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
 
   // get 1-string index per warp
   auto warp_inc_count = [&]() {
-    size_type istring=0;
-    if(warp_lane==0) {
-      istring = atomicAdd(str_counter, 1);
-    }
+    size_type istring = 0;
+    if (warp_lane == 0) { istring = atomicAdd(str_counter, 1); }
     __syncwarp();
     return __shfl_sync(0xffffffff, istring, 0);
   };
   // grid-stride loop.
   // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
-  for (size_type istring = warp_inc_count(); istring < total_out_strings; istring = warp_inc_count()) {
+  for (size_type istring = warp_inc_count(); istring < total_out_strings;
+       istring           = warp_inc_count()) {
     // if (!d_chars)
     // printf("%d:%d<%d\n", global_thread_id, istring, total_out_strings);
     if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
@@ -370,7 +369,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     auto in_begin           = str_tuples[istring].first;
     auto in_end             = in_begin + str_tuples[istring].second;
     auto const num_in_chars = str_tuples[istring].second;
-    // if(!(num_in_chars >= 1024)) continue;
+    if (num_in_chars >= 1024) continue;
 
     // Check if the value corresponds to the null literal
     auto const is_null_literal =
@@ -419,25 +418,28 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     // 0-31, 32-63, ... i*32-n.
     // entire warp executes but with mask.
     auto MASK = 0xffffffff;
-    for (size_type char_index = warp_lane; (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0;
+    for (size_type char_index = warp_lane;
+         (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0;
          char_index += cudf::detail::warp_size) {
-      bool is_within_bounds = char_index < (in_end - in_begin); //TODO more conditions below to avoid out-of-bound memory access.
-      auto c            = in_begin[char_index];
-      auto prev_c       = char_index > 0 ? in_begin[char_index - 1] : 'a';
+      bool is_within_bounds =
+        char_index <
+        (in_end - in_begin);  // TODO more conditions below to avoid out-of-bound memory access.
+      auto c            = is_within_bounds ? in_begin[char_index] : '\0';
+      auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
       auto escaped_char = get_escape_char(c);
       bool error        = false;
       // FIXME: \\ at end is a problem here.
       // \uXXXXe e-u=5 4<=4
       //  012345
-      if(is_within_bounds) {
-      error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
-      error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
-      error |= (prev_c == '\\' && c == 'u' &&
-                // TODO check if following condition is right or off by one error.
-                ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
-                 // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) |
-                 !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
-                 !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
+      if (is_within_bounds) {
+        error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
+        error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
+        error |= (prev_c == '\\' && c == 'u' &&
+                  // TODO check if following condition is right or off by one error.
+                  ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
+                   // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) |
+                   !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
+                   !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
       }
       // propagate error using warp shuffle.
       error = __any_sync(MASK, error);
@@ -453,20 +455,20 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         break;  // return to grid-stride loop for next string.
       }
       // TODO one more error condition of second \uXXXX is not hex.
-      bool skip = !is_within_bounds; //false;
+      bool skip = !is_within_bounds;  // false;
       // TODO FIXME: continue slashes are a problem!
       // skip |= (prev_c != '\\') && (c=='\\'); // skip '\'
       // corner case \\uXXXX TODO
       // skip XXXX in \uXXXX
       if (is_within_bounds) {
-      skip |=
-        char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && in_begin[char_index - 1] == 'u';
-      skip |=
-        char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' && in_begin[char_index - 2] == 'u';
-      skip |=
-        char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && in_begin[char_index - 3] == 'u';
-      skip |=
-        char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && in_begin[char_index - 4] == 'u';
+        skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' &&
+                in_begin[char_index - 1] == 'u';
+        skip |= char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' &&
+                in_begin[char_index - 2] == 'u';
+        skip |= char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' &&
+                in_begin[char_index - 3] == 'u';
+        skip |= char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' &&
+                in_begin[char_index - 4] == 'u';
       }
       int this_num_out = 0;
       cudf::char_utf8 write_char{'a'};
@@ -496,8 +498,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       // inclusive scan. TODO both inclusive and exclusive available in cub.
       cub::WarpScan<state_table>(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
       auto is_escaping_backslash = scanned.state[init_state];
-      auto last_active_lane = 31 - __clz(MASK);  // TODO simplify 0xffffffff case?
-      init_state            = __shfl_sync(MASK, is_escaping_backslash, last_active_lane);
+      auto last_active_lane      = 31 - __clz(MASK);  // TODO simplify 0xffffffff case?
+      init_state                 = __shfl_sync(MASK, is_escaping_backslash, last_active_lane);
       // TODO replace/add prev_c with proper scan of escapes
       skip |= is_escaping_backslash;
 
@@ -507,7 +509,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
           // if (true) {
           this_num_out = 1;
           if (d_chars) write_char = c;
-          //FIXME: can you skip write like this for string_size count at other places?
+          // FIXME: can you skip write like this for string_size count at other places?
         } else {
           // already taken care early.
           // if (escaped_char == NON_ESCAPE_CHAR) {
@@ -516,7 +518,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
           // } else
           if (escaped_char != UNICODE_SEQ) {
             this_num_out = 1;
-            write_char = escaped_char;
+            write_char   = escaped_char;
           } else {
             // \uXXXX- u
             // Unicode
@@ -577,6 +579,269 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
   }    // grid-stride for-loop
 }
 
+template <typename str_tuple_it>
+__global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
+                                        size_type total_out_strings,
+                                        size_type* str_counter,
+                                        bitmask_type* null_mask,
+                                        size_type* null_count_data,
+                                        cudf::io::parse_options_view const options,
+                                        size_type* d_offsets,
+                                        char* d_chars)
+{
+  // int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  int lane = threadIdx.x;
+  // int global_warp_id   = blockIdx.x;
+  // int nwarps           = gridDim.x;
+  // TODO alignment - aligned access possible?
+
+  // get 1-string index per warp
+  auto warp_inc_count = [&]() {
+    __shared__ size_type istring;
+    if (lane == 0) { istring = atomicAdd(str_counter, 1); }
+    __syncthreads();  // memory fence?
+    return istring;
+  };
+  // grid-stride loop.
+  // TODO if large number of small strings, then this loop is not efficient. So, switch to old
+  // method. for (size_type istring = global_warp_id; istring < total_out_strings; istring +=
+  // nwarps) {
+  for (size_type istring = warp_inc_count(); istring < total_out_strings;
+       istring           = warp_inc_count()) {
+    // if (!d_chars)
+    // printf("%d:%d<%d\n", global_thread_id, istring, total_out_strings);
+    if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
+      if (!d_chars) d_offsets[istring] = 0;
+      continue;  // gride-stride return;
+    }
+
+    auto in_begin           = str_tuples[istring].first;
+    auto in_end             = in_begin + str_tuples[istring].second;
+    auto const num_in_chars = str_tuples[istring].second;
+    if (num_in_chars < 1024) continue;
+
+    // Check if the value corresponds to the null literal
+    auto const is_null_literal =
+      (!d_chars) &&
+      serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
+    if (is_null_literal && null_mask != nullptr) {
+      if (lane == 0) {
+        clear_bit(null_mask, istring);
+        atomicAdd(null_count_data, 1);
+        if (!d_chars) d_offsets[istring] = 0;
+      }
+      continue;  // gride-stride return;
+    }
+    // String values are indicated by keeping the quote character
+    bool const is_string_value =
+      num_in_chars >= 2LL &&
+      (options.quotechar == '\0' ||
+       (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar));
+    char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr;
+
+    // Copy literal/numeric value
+    if (not is_string_value) {
+      if (!d_chars) {
+        if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
+      } else {
+        for (size_type char_index = lane; char_index < (in_end - in_begin);
+             char_index += blockDim.x) {
+          d_buffer[char_index] = in_begin[char_index];
+        }
+      }
+      continue;  // gride-stride return;
+    }
+
+    // Exclude beginning and ending quote chars from string range
+    if (!options.keepquotes) {
+      ++in_begin;
+      --in_end;
+    }
+    // auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
+
+    auto is_hex = [](auto ch) {
+      return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
+    };
+    __shared__ bool init_state;        //{false};  // for backslash scan calculation
+    __shared__ size_type last_offset;  // = 0;
+    if (lane == 0) {
+      init_state  = false;
+      last_offset = 0;
+    }
+    // 0-31, 32-63, ... i*32-n.
+    // entire warp executes but with mask.
+    // auto MASK = 0xffffffff;
+    for (size_type char_index = lane;
+         char_index < (in_end - in_begin + 32 * 8 - 1) / (32 * 8) * (32 * 8);
+         char_index += blockDim.x) {
+      bool is_within_bounds =
+        char_index <
+        (in_end - in_begin);  // TODO more conditions below to avoid out-of-bound memory access.
+      auto c            = is_within_bounds ? in_begin[char_index] : '\0';
+      auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : 'a';
+      auto escaped_char = get_escape_char(c);
+      bool error        = false;
+      // FIXME: \\ at end is a problem here.
+      // \uXXXXe e-u=5 4<=4
+      //  012345
+      if (is_within_bounds) {
+        error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
+        error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
+        error |= (prev_c == '\\' && c == 'u' &&
+                  // TODO check if following condition is right or off by one error.
+                  ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
+                   // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) |
+                   !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
+                   !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
+      }
+      // propagate error using warp shuffle.
+      // error = __any_sync(MASK, error);
+      using BlockReduce = cub::BlockReduce<bool, 32 * 8>;
+      __shared__ typename BlockReduce::TempStorage temp_storage_error;
+      __shared__ bool error_reduced;
+      error_reduced = BlockReduce(temp_storage_error).Sum(error);  // TODO use cub::LogicalOR.
+      // only valid in thread0.
+      __syncthreads();
+      if (error_reduced) {
+        if (lane == 0) {
+          if (null_mask != nullptr) {
+            clear_bit(null_mask, istring);
+            atomicAdd(null_count_data, 1);
+          }
+          last_offset = 0;
+          if (!d_chars) d_offsets[istring] = 0;
+        }
+        break;  // return to grid-stride loop for next string.
+      }
+      // TODO one more error condition of second \uXXXX is not hex.
+      bool skip = !is_within_bounds;  // false;
+      // TODO FIXME: continue slashes are a problem!
+      // skip |= (prev_c != '\\') && (c=='\\'); // skip '\'
+      // corner case \\uXXXX TODO
+      // skip XXXX in \uXXXX
+      if (is_within_bounds) {
+        skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' &&
+                in_begin[char_index - 1] == 'u';
+        skip |= char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' &&
+                in_begin[char_index - 2] == 'u';
+        skip |= char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' &&
+                in_begin[char_index - 3] == 'u';
+        skip |= char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' &&
+                in_begin[char_index - 4] == 'u';
+      }
+      int this_num_out = 0;
+      cudf::char_utf8 write_char{'a'};
+
+      // To check current is backslash by checking if previous is backslash.
+      // curr = !prev & c=='\\'
+      // So, scan is required from beginning of string.
+      // State table approach (intra-warp FST)
+      // 2 states: Not-Slash(NS), Slash(S).
+      // prev  /   *
+      // NS    S  NS
+      //  S   NS  NS
+      // After inclusive scan, all current S states translate to escaping backslash.
+      // All escaping backslash should be skipped.
+
+      struct state_table {
+        bool state[2];
+      };
+      // using state_table = bool[2]; Try this. and see if compiler errors
+      using BlockScan = cub::BlockScan<state_table, 32 * 8>;
+      __shared__ typename BlockScan::TempStorage temp_slash;
+      state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
+      auto composite_op = [](state_table op1, state_table op2) {
+        return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
+      };
+      state_table scanned;
+      // auto warp_id = threadIdx.x / 32;
+      // inclusive scan. TODO both inclusive and exclusive available in cub.
+      BlockScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
+      auto is_escaping_backslash = scanned.state[init_state];
+      auto last_active_lane      = blockDim.x - 1;  // TODO simplify 0xffffffff case?
+      if (threadIdx.x == last_active_lane) init_state = is_escaping_backslash;
+      // There is another __syncthreads() at the end of for-loop.
+      // TODO replace/add prev_c with proper scan of escapes
+      skip |= is_escaping_backslash;
+
+      if (!skip) {
+        // is prev_is_not backslash?
+        if (prev_c != '\\') {  // FIXME: enable this after debugging.
+          // if (true) {
+          this_num_out = 1;
+          if (d_chars) write_char = c;
+          // FIXME: can you skip write like this for string_size count at other places?
+        } else {
+          // already taken care early.
+          // if (escaped_char == NON_ESCAPE_CHAR) {
+          //    this_num_out = 0;
+          //    error = true;
+          // } else
+          if (escaped_char != UNICODE_SEQ) {
+            this_num_out = 1;
+            write_char   = escaped_char;
+          } else {
+            // \uXXXX- u
+            // Unicode
+            auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);
+            auto hex_low_val = 0;
+            // if next is \uXXXX
+            // in_begin + char_index
+            // 01234567890
+            //\uXXXX\uXXXX TODO cleanup these conditions.
+            // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 &&
+            if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' &&
+                in_begin[char_index + 1 + 5] == 'u') {
+              hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6);
+            }
+            if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
+                hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) {
+              // Compute UTF16-encoded code point
+              uint32_t unicode_code_point = 0x10000 +
+                                            ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) +
+                                            (hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
+              write_char   = strings::detail::codepoint_to_utf8(unicode_code_point);
+              this_num_out = strings::detail::bytes_in_char_utf8(write_char);
+            } else {
+              // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6);
+              if (
+                // hex_high_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_high_val <
+                // UTF16_HIGH_SURROGATE_END &&
+                hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) {
+                skip         = true;
+                this_num_out = 0;
+                write_char   = 0;
+              } else {
+                // if u8
+                write_char   = strings::detail::codepoint_to_utf8(hex_val);
+                this_num_out = strings::detail::bytes_in_char_utf8(write_char);
+              }
+            }
+          }
+        }
+      }  // !skip end.
+      {
+        // TODO think about writing error conditions as normal, so that program flow is easy to read
+        // and can process error here.
+        // WRITE now (compute out_idx offset then write)
+        // intra-warp scan of this_num_out.
+        // TODO union to save shared memory
+        using BlockScan2 = cub::BlockScan<size_type, 32 * 8>;
+        __shared__ BlockScan2::TempStorage temp_storage;
+        size_type offset;
+        BlockScan2(temp_storage).ExclusiveSum(this_num_out, offset);
+        offset += last_offset;
+        // TODO add last active lane this_num_out for correct last_offset.
+        if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); }
+        offset += this_num_out;
+        if (threadIdx.x == last_active_lane) last_offset = offset;
+        __syncthreads();
+      }
+    }  // char for-loop
+    if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
+  }    // grid-stride for-loop
+}
+
 template <typename str_tuple_it>
 struct string_parse {
   str_tuple_it str_tuples;
@@ -684,16 +949,17 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
     auto d_offsets = offsets2->mutable_view().data<size_type>();
 
-    int max_blocks = 0;
+    int max_blocks                 = 0;
     constexpr auto warps_per_block = 8;
-    int threads_per_block = cudf::detail::warp_size * warps_per_block;
-    CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, parse_fn_string_parallel<str_tuple_it>, threads_per_block, 0));
+    int threads_per_block          = cudf::detail::warp_size * warps_per_block;
+    CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &max_blocks, parse_fn_string_parallel<str_tuple_it>, threads_per_block, 0));
 
     int device = 0;
     CUDF_CUDA_TRY(cudaGetDevice(&device));
     int num_sms = 0;
     CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
-    auto num_blocks = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1));
+    auto num_blocks  = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1));
     auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream);
 
     parse_fn_string_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
@@ -705,6 +971,15 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       options,
       d_offsets,
       nullptr);
+    parse_fn_block_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
+      str_tuples,
+      col_size,
+      str_counter.data(),
+      static_cast<bitmask_type*>(null_mask.data()),
+      null_count_data2,
+      options,
+      d_offsets,
+      nullptr);
     // print_raw(d_offsets, offsets2->size(), stream);
     auto const bytes =
       cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
@@ -726,6 +1001,15 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       d_offsets,
       d_chars2);
 
+    parse_fn_block_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
+      str_tuples,
+      col_size,
+      str_counter.data(),
+      static_cast<bitmask_type*>(null_mask.data()),
+      null_count_data2,
+      options,
+      d_offsets,
+      d_chars2);
     // if(bytes!=chars->size()) {
     //   std::cout<<"new bytes="<<bytes<<std::endl;
     //   print_raw(d_offsets, offsets2->size(), stream);

From 57ea0564815c8a710382811855fc9502db175080 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 12 Aug 2023 02:25:31 +0530
Subject: [PATCH 09/47] cleanup, kernel name

---
 cpp/include/cudf/io/detail/data_casting.cuh | 122 +++++++++++---------
 1 file changed, 66 insertions(+), 56 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 855f226a0b0..4444714b5db 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -24,6 +24,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/types.hpp>
@@ -333,25 +334,27 @@ process_string(in_iterator_t in_begin,
 // propagate offset from 32nd thread to others in warp to carry forward.
 // 1 warp per string.
 template <typename str_tuple_it>
-__global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
-                                         size_type total_out_strings,
-                                         size_type* str_counter,
-                                         bitmask_type* null_mask,
-                                         size_type* null_count_data,
-                                         cudf::io::parse_options_view const options,
-                                         size_type* d_offsets,
-                                         char* d_chars)
+__global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
+                                       size_type total_out_strings,
+                                       size_type* str_counter,
+                                       bitmask_type* null_mask,
+                                       size_type* null_count_data,
+                                       cudf::io::parse_options_view const options,
+                                       size_type* d_offsets,
+                                       char* d_chars)
 {
-  int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  int warp_lane        = global_thread_id % cudf::detail::warp_size;
-  // int global_warp_id   = global_thread_id / cudf::detail::warp_size;
-  // int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
+  constexpr auto BLOCK_SIZE = cudf::detail::warp_size;
+  int global_thread_id      = blockIdx.x * blockDim.x + threadIdx.x;
+  int lane                  = global_thread_id % BLOCK_SIZE;
+  // int global_warp_id   = global_thread_id / BLOCK_SIZE;
+  // int nwarps           = gridDim.x * blockDim.x / BLOCK_SIZE;
   // TODO alignment - aligned access possible?
 
   // get 1-string index per warp
+  // TODO if #num(33-1024) > SOME_LIMIT, then fixed load.
   auto warp_inc_count = [&]() {
-    size_type istring = 0;
-    if (warp_lane == 0) { istring = atomicAdd(str_counter, 1); }
+    size_type istring;
+    if (lane == 0) { istring = atomicAdd(str_counter, 1); }
     __syncwarp();
     return __shfl_sync(0xffffffff, istring, 0);
   };
@@ -359,8 +362,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
   // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
   for (size_type istring = warp_inc_count(); istring < total_out_strings;
        istring           = warp_inc_count()) {
-    // if (!d_chars)
-    // printf("%d:%d<%d\n", global_thread_id, istring, total_out_strings);
+    // skip nulls
     if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
       if (!d_chars) d_offsets[istring] = 0;
       continue;  // gride-stride return;
@@ -376,7 +378,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       (!d_chars) &&
       serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
     if (is_null_literal && null_mask != nullptr) {
-      if (warp_lane == 0) {
+      if (lane == 0) {
         clear_bit(null_mask, istring);
         atomicAdd(null_count_data, 1);
         if (!d_chars) d_offsets[istring] = 0;
@@ -393,10 +395,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     // Copy literal/numeric value
     if (not is_string_value) {
       if (!d_chars) {
-        if (warp_lane == 0) { d_offsets[istring] = in_end - in_begin; }
+        if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
       } else {
-        for (size_type char_index = warp_lane; char_index < (in_end - in_begin);
-             char_index += cudf::detail::warp_size) {
+        for (size_type char_index = lane; char_index < (in_end - in_begin);
+             char_index += BLOCK_SIZE) {
           d_buffer[char_index] = in_begin[char_index];
         }
       }
@@ -413,17 +415,18 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     auto is_hex = [](auto ch) {
       return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
     };
-    bool init_state{false};  // for backslash scan calculation
+
+    // for backslash scan calculation: is_previous_escaping_backslash
+    bool init_state{false};
     auto last_offset = 0;
     // 0-31, 32-63, ... i*32-n.
     // entire warp executes but with mask.
     auto MASK = 0xffffffff;
-    for (size_type char_index = warp_lane;
+    for (size_type char_index = lane;
          (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0;
-         char_index += cudf::detail::warp_size) {
-      bool is_within_bounds =
-        char_index <
-        (in_end - in_begin);  // TODO more conditions below to avoid out-of-bound memory access.
+         char_index += BLOCK_SIZE) {
+      bool is_within_bounds = char_index < (in_end - in_begin);
+      // TODO more conditions below to avoid out-of-bound memory access.
       auto c            = is_within_bounds ? in_begin[char_index] : '\0';
       auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
       auto escaped_char = get_escape_char(c);
@@ -432,19 +435,19 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       // \uXXXXe e-u=5 4<=4
       //  012345
       if (is_within_bounds) {
+        // TODO instead of '\\', use is_escaping_backslash, and previous index value also here.
         error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
         error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
         error |= (prev_c == '\\' && c == 'u' &&
                   // TODO check if following condition is right or off by one error.
                   ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
-                   // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) |
                    !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
                    !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
       }
       // propagate error using warp shuffle.
       error = __any_sync(MASK, error);
       if (error) {
-        if (warp_lane == 0) {
+        if (lane == 0) {
           if (null_mask != nullptr) {
             clear_bit(null_mask, istring);
             atomicAdd(null_count_data, 1);
@@ -452,7 +455,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
           last_offset = 0;
           if (!d_chars) d_offsets[istring] = 0;
         }
-        break;  // return to grid-stride loop for next string.
+        break;  // gride-stride return;
       }
       // TODO one more error condition of second \uXXXX is not hex.
       bool skip = !is_within_bounds;  // false;
@@ -471,7 +474,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
                 in_begin[char_index - 4] == 'u';
       }
       int this_num_out = 0;
-      cudf::char_utf8 write_char{'a'};
+      cudf::char_utf8 write_char{};
 
       // To check current is backslash by checking if previous is backslash.
       // curr = !prev & c=='\\'
@@ -487,19 +490,19 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       struct state_table {
         bool state[2];
       };
-      // using state_table = bool[2]; Try this. and see if compiler errors
+      // TODO Use union to reduce shared memory usage.
       __shared__ typename cub::WarpScan<state_table>::TempStorage temp_slash[8];
       state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
       auto composite_op = [](state_table op1, state_table op2) {
         return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
       };
       state_table scanned;
-      auto warp_id = threadIdx.x / 32;
+      auto warp_id = threadIdx.x / BLOCK_SIZE;
       // inclusive scan. TODO both inclusive and exclusive available in cub.
       cub::WarpScan<state_table>(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
       auto is_escaping_backslash = scanned.state[init_state];
-      auto last_active_lane      = 31 - __clz(MASK);  // TODO simplify 0xffffffff case?
-      init_state                 = __shfl_sync(MASK, is_escaping_backslash, last_active_lane);
+      auto last_active_lane = (BLOCK_SIZE - 1) - __clz(MASK);  // TODO simplify 0xffffffff case?
+      init_state            = __shfl_sync(MASK, is_escaping_backslash, last_active_lane);
       // TODO replace/add prev_c with proper scan of escapes
       skip |= is_escaping_backslash;
 
@@ -528,7 +531,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
             // in_begin + char_index
             // 01234567890
             //\uXXXX\uXXXX TODO cleanup these conditions.
-            // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 &&
+            // Note: no need for scanned_backslash below because we already know. only '\u' check is
+            // enough. if ((in_end - (in_begin + char_index + 1 + 4)) > 6 &&
             if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' &&
                 in_begin[char_index + 1 + 5] == 'u') {
               hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6);
@@ -565,6 +569,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         // WRITE now (compute out_idx offset then write)
         // intra-warp scan of this_num_out.
         // TODO union to save shared memory
+        // TODO, use only Reduce instead of scan for size calculation - if(!dchars)
         __shared__ cub::WarpScan<size_type>::TempStorage temp_storage[8];
         size_type offset;
         cub::WarpScan<size_type>(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset);
@@ -572,10 +577,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         // TODO add last active lane this_num_out for correct last_offset.
         if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); }
         offset += this_num_out;
-        last_offset = __shfl_sync(0xffffffff, offset, 31);
+        last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1);
       }
     }  // char for-loop
-    if (!d_chars && warp_lane == 0) { d_offsets[istring] = last_offset; }
+    if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
   }    // grid-stride for-loop
 }
 
@@ -589,6 +594,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
                                         size_type* d_offsets,
                                         char* d_chars)
 {
+  const long BLOCK_SIZE = blockDim.x;
   // int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   int lane = threadIdx.x;
   // int global_warp_id   = blockIdx.x;
@@ -596,6 +602,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
   // TODO alignment - aligned access possible?
 
   // get 1-string index per warp
+  // TODO if #num(>1024) > SOME_LIMIT, then fixed load.
   auto warp_inc_count = [&]() {
     __shared__ size_type istring;
     if (lane == 0) { istring = atomicAdd(str_counter, 1); }
@@ -604,12 +611,11 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
   };
   // grid-stride loop.
   // TODO if large number of small strings, then this loop is not efficient. So, switch to old
-  // method. for (size_type istring = global_warp_id; istring < total_out_strings; istring +=
-  // nwarps) {
+  // method.
+  // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
   for (size_type istring = warp_inc_count(); istring < total_out_strings;
        istring           = warp_inc_count()) {
-    // if (!d_chars)
-    // printf("%d:%d<%d\n", global_thread_id, istring, total_out_strings);
+    // skip nulls
     if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
       if (!d_chars) d_offsets[istring] = 0;
       continue;  // gride-stride return;
@@ -645,7 +651,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
         if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
       } else {
         for (size_type char_index = lane; char_index < (in_end - in_begin);
-             char_index += blockDim.x) {
+             char_index += BLOCK_SIZE) {
           d_buffer[char_index] = in_begin[char_index];
         }
       }
@@ -672,25 +678,24 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
     // entire warp executes but with mask.
     // auto MASK = 0xffffffff;
     for (size_type char_index = lane;
-         char_index < (in_end - in_begin + 32 * 8 - 1) / (32 * 8) * (32 * 8);
-         char_index += blockDim.x) {
-      bool is_within_bounds =
-        char_index <
-        (in_end - in_begin);  // TODO more conditions below to avoid out-of-bound memory access.
+         char_index < cudf::util::round_up_unsafe(in_end - in_begin, BLOCK_SIZE);
+         char_index += BLOCK_SIZE) {
+      bool is_within_bounds = char_index < (in_end - in_begin);
+      // TODO more conditions below to avoid out-of-bound memory access.
       auto c            = is_within_bounds ? in_begin[char_index] : '\0';
-      auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : 'a';
+      auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
       auto escaped_char = get_escape_char(c);
       bool error        = false;
       // FIXME: \\ at end is a problem here.
       // \uXXXXe e-u=5 4<=4
       //  012345
       if (is_within_bounds) {
+        // TODO instead of '\\', use is_escaping_backslash, and previous index value also here.
         error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
         error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
         error |= (prev_c == '\\' && c == 'u' &&
                   // TODO check if following condition is right or off by one error.
                   ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
-                   // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) |
                    !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
                    !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
       }
@@ -711,7 +716,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
           last_offset = 0;
           if (!d_chars) d_offsets[istring] = 0;
         }
-        break;  // return to grid-stride loop for next string.
+        break;  // gride-stride return;
       }
       // TODO one more error condition of second \uXXXX is not hex.
       bool skip = !is_within_bounds;  // false;
@@ -730,7 +735,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
                 in_begin[char_index - 4] == 'u';
       }
       int this_num_out = 0;
-      cudf::char_utf8 write_char{'a'};
+      cudf::char_utf8 write_char{};
 
       // To check current is backslash by checking if previous is backslash.
       // curr = !prev & c=='\\'
@@ -754,7 +759,6 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
         return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
       };
       state_table scanned;
-      // auto warp_id = threadIdx.x / 32;
       // inclusive scan. TODO both inclusive and exclusive available in cub.
       BlockScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
       auto is_escaping_backslash = scanned.state[init_state];
@@ -789,7 +793,8 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
             // in_begin + char_index
             // 01234567890
             //\uXXXX\uXXXX TODO cleanup these conditions.
-            // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 &&
+            // Note: no need for scanned_backslash below because we already know. only '\u' check is
+            // enough. if ((in_end - (in_begin + char_index + 1 + 4)) > 6 &&
             if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' &&
                 in_begin[char_index + 1 + 5] == 'u') {
               hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6);
@@ -826,6 +831,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
         // WRITE now (compute out_idx offset then write)
         // intra-warp scan of this_num_out.
         // TODO union to save shared memory
+        // TODO, use only Reduce instead of scan for size calculation - if(!dchars)
         using BlockScan2 = cub::BlockScan<size_type, 32 * 8>;
         __shared__ BlockScan2::TempStorage temp_storage;
         size_type offset;
@@ -924,6 +930,10 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
 
   auto d_null_count2    = rmm::device_scalar<size_type>(null_count, stream);
   auto null_count_data2 = d_null_count2.data();
+  // Write 3 sizes <=32, 33-1024, 1024-1M, >1M
+  // if all<32, (33, 1024, 1M)==0, run serial kernel.
+  // if >33, warp-per-string kernel. (we want warp_parallel to be called only if this #rows is less.
+  // or else atomicAdd congestion might happen) if >1024, block-per-string kernel.
 
   if (col_type == cudf::data_type{cudf::type_id::STRING}) {
     // this utility calls the functor to build the offsets and chars columns;
@@ -953,7 +963,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
     constexpr auto warps_per_block = 8;
     int threads_per_block          = cudf::detail::warp_size * warps_per_block;
     CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &max_blocks, parse_fn_string_parallel<str_tuple_it>, threads_per_block, 0));
+      &max_blocks, parse_fn_warp_parallel<str_tuple_it>, threads_per_block, 0));
 
     int device = 0;
     CUDF_CUDA_TRY(cudaGetDevice(&device));
@@ -962,7 +972,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
     auto num_blocks  = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1));
     auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream);
 
-    parse_fn_string_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
+    parse_fn_warp_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
       str_tuples,
       col_size,
       str_counter.data(),
@@ -991,7 +1001,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
     auto d_chars2 = chars2->mutable_view().data<char>();
     cudaMemsetAsync(d_chars2, 'c', bytes, stream.value());
 
-    parse_fn_string_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
+    parse_fn_warp_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
       str_tuples,
       col_size,
       str_counter.data(),

From 7e4cfd219ef2458e5b57708e69f9dd56be76bb79 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 17 Aug 2023 17:06:38 +0530
Subject: [PATCH 10/47] add BLOCK_SIZE to block kernel

---
 cpp/include/cudf/io/detail/data_casting.cuh | 51 +++++++++++----------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 4444714b5db..8fd23199e93 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -584,7 +584,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
   }    // grid-stride for-loop
 }
 
-template <typename str_tuple_it>
+template <int BLOCK_SIZE, typename str_tuple_it>
 __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
                                         size_type total_out_strings,
                                         size_type* str_counter,
@@ -594,7 +594,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
                                         size_type* d_offsets,
                                         char* d_chars)
 {
-  const long BLOCK_SIZE = blockDim.x;
+  // const long BLOCK_SIZE = blockDim.x;
   // int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   int lane = threadIdx.x;
   // int global_warp_id   = blockIdx.x;
@@ -678,7 +678,8 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
     // entire warp executes but with mask.
     // auto MASK = 0xffffffff;
     for (size_type char_index = lane;
-         char_index < cudf::util::round_up_unsafe(in_end - in_begin, BLOCK_SIZE);
+         char_index <
+         cudf::util::round_up_unsafe(in_end - in_begin, static_cast<std::size_t>(BLOCK_SIZE));
          char_index += BLOCK_SIZE) {
       bool is_within_bounds = char_index < (in_end - in_begin);
       // TODO more conditions below to avoid out-of-bound memory access.
@@ -959,9 +960,9 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
     auto d_offsets = offsets2->mutable_view().data<size_type>();
 
-    int max_blocks                 = 0;
-    constexpr auto warps_per_block = 8;
-    int threads_per_block          = cudf::detail::warp_size * warps_per_block;
+    int max_blocks                  = 0;
+    constexpr auto warps_per_block  = 8;
+    constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
     CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
       &max_blocks, parse_fn_warp_parallel<str_tuple_it>, threads_per_block, 0));
 
@@ -981,15 +982,16 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       options,
       d_offsets,
       nullptr);
-    parse_fn_block_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
-      str_tuples,
-      col_size,
-      str_counter.data(),
-      static_cast<bitmask_type*>(null_mask.data()),
-      null_count_data2,
-      options,
-      d_offsets,
-      nullptr);
+    parse_fn_block_parallel<threads_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data2,
+        options,
+        d_offsets,
+        nullptr);
     // print_raw(d_offsets, offsets2->size(), stream);
     auto const bytes =
       cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
@@ -1011,15 +1013,16 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       d_offsets,
       d_chars2);
 
-    parse_fn_block_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
-      str_tuples,
-      col_size,
-      str_counter.data(),
-      static_cast<bitmask_type*>(null_mask.data()),
-      null_count_data2,
-      options,
-      d_offsets,
-      d_chars2);
+    parse_fn_block_parallel<threads_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data2,
+        options,
+        d_offsets,
+        d_chars2);
     // if(bytes!=chars->size()) {
     //   std::cout<<"new bytes="<<bytes<<std::endl;
     //   print_raw(d_offsets, offsets2->size(), stream);

From 6622460f5732c5c08dbd6fec729f2440386dfff3 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 18 Aug 2023 00:40:41 +0530
Subject: [PATCH 11/47] clean up, add constants

---
 cpp/include/cudf/io/detail/data_casting.cuh | 354 ++++++++------------
 1 file changed, 134 insertions(+), 220 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 8fd23199e93..a9cb45f5714 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -32,12 +32,18 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/functional.h>
+#include <thrust/transform_reduce.h>
+
 #include <cub/cub.cuh>
 
 #include <memory>
 
 namespace cudf::io::json::experimental::detail {
 
+constexpr auto SINGLE_THREAD_THRESHOLD = 128;
+constexpr auto WARP_THRESHOLD          = 1024;
+
 // Unicode code point escape sequence
 static constexpr char UNICODE_SEQ = 0x7F;
 
@@ -344,24 +350,19 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
                                        char* d_chars)
 {
   constexpr auto BLOCK_SIZE = cudf::detail::warp_size;
-  int global_thread_id      = blockIdx.x * blockDim.x + threadIdx.x;
-  int lane                  = global_thread_id % BLOCK_SIZE;
-  // int global_warp_id   = global_thread_id / BLOCK_SIZE;
-  // int nwarps           = gridDim.x * blockDim.x / BLOCK_SIZE;
-  // TODO alignment - aligned access possible?
+  int lane                  = threadIdx.x % BLOCK_SIZE;
 
   // get 1-string index per warp
-  // TODO if #num(33-1024) > SOME_LIMIT, then fixed load.
-  auto warp_inc_count = [&]() {
+  auto get_next_string = [&]() {
     size_type istring;
     if (lane == 0) { istring = atomicAdd(str_counter, 1); }
     __syncwarp();
     return __shfl_sync(0xffffffff, istring, 0);
   };
+
   // grid-stride loop.
-  // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
-  for (size_type istring = warp_inc_count(); istring < total_out_strings;
-       istring           = warp_inc_count()) {
+  for (size_type istring = get_next_string(); istring < total_out_strings;
+       istring           = get_next_string()) {
     // skip nulls
     if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
       if (!d_chars) d_offsets[istring] = 0;
@@ -371,7 +372,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
     auto in_begin           = str_tuples[istring].first;
     auto in_end             = in_begin + str_tuples[istring].second;
     auto const num_in_chars = str_tuples[istring].second;
-    if (num_in_chars >= 1024) continue;
+    if (num_in_chars > WARP_THRESHOLD) continue;
 
     // Check if the value corresponds to the null literal
     auto const is_null_literal =
@@ -410,7 +411,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
       ++in_begin;
       --in_end;
     }
-    // auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
+    // warp-parallelized process_string(in_begin, in_end, d_buffer, options);
 
     auto is_hex = [](auto ch) {
       return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
@@ -426,14 +427,10 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
          (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0;
          char_index += BLOCK_SIZE) {
       bool is_within_bounds = char_index < (in_end - in_begin);
-      // TODO more conditions below to avoid out-of-bound memory access.
-      auto c            = is_within_bounds ? in_begin[char_index] : '\0';
+      auto c                = is_within_bounds ? in_begin[char_index] : '\0';
       auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
       auto escaped_char = get_escape_char(c);
       bool error        = false;
-      // FIXME: \\ at end is a problem here.
-      // \uXXXXe e-u=5 4<=4
-      //  012345
       if (is_within_bounds) {
         // TODO instead of '\\', use is_escaping_backslash, and previous index value also here.
         error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
@@ -455,14 +452,9 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
           last_offset = 0;
           if (!d_chars) d_offsets[istring] = 0;
         }
-        break;  // gride-stride return;
+        break;                        // gride-stride return;
       }
-      // TODO one more error condition of second \uXXXX is not hex.
       bool skip = !is_within_bounds;  // false;
-      // TODO FIXME: continue slashes are a problem!
-      // skip |= (prev_c != '\\') && (c=='\\'); // skip '\'
-      // corner case \\uXXXX TODO
-      // skip XXXX in \uXXXX
       if (is_within_bounds) {
         skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' &&
                 in_begin[char_index - 1] == 'u';
@@ -490,49 +482,36 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
       struct state_table {
         bool state[2];
       };
-      // TODO Use union to reduce shared memory usage.
-      __shared__ typename cub::WarpScan<state_table>::TempStorage temp_slash[8];
       state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
       auto composite_op = [](state_table op1, state_table op2) {
         return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
       };
       state_table scanned;
       auto warp_id = threadIdx.x / BLOCK_SIZE;
-      // inclusive scan. TODO both inclusive and exclusive available in cub.
-      cub::WarpScan<state_table>(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
+      // inclusive scan of escaping backslashes
+      using SlashScan = cub::WarpScan<state_table>;
+      __shared__ typename SlashScan::TempStorage temp_slash[8];
+      SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
       auto is_escaping_backslash = scanned.state[init_state];
-      auto last_active_lane = (BLOCK_SIZE - 1) - __clz(MASK);  // TODO simplify 0xffffffff case?
-      init_state            = __shfl_sync(MASK, is_escaping_backslash, last_active_lane);
-      // TODO replace/add prev_c with proper scan of escapes
+      init_state                 = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1);
       skip |= is_escaping_backslash;
 
       if (!skip) {
-        // is prev_is_not backslash?
-        if (prev_c != '\\') {  // FIXME: enable this after debugging.
-          // if (true) {
+        if (prev_c != '\\') {
           this_num_out = 1;
           if (d_chars) write_char = c;
-          // FIXME: can you skip write like this for string_size count at other places?
         } else {
-          // already taken care early.
-          // if (escaped_char == NON_ESCAPE_CHAR) {
-          //    this_num_out = 0;
-          //    error = true;
-          // } else
           if (escaped_char != UNICODE_SEQ) {
             this_num_out = 1;
             write_char   = escaped_char;
           } else {
-            // \uXXXX- u
             // Unicode
+            // \uXXXX
             auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);
             auto hex_low_val = 0;
-            // if next is \uXXXX
-            // in_begin + char_index
-            // 01234567890
-            //\uXXXX\uXXXX TODO cleanup these conditions.
-            // Note: no need for scanned_backslash below because we already know. only '\u' check is
-            // enough. if ((in_end - (in_begin + char_index + 1 + 4)) > 6 &&
+            // \uXXXX\uXXXX
+            // Note: no need for scanned_backslash below because we already know that
+            // only '\u' check is enough.
             if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' &&
                 in_begin[char_index + 1 + 5] == 'u') {
               hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6);
@@ -546,11 +525,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
               write_char   = strings::detail::codepoint_to_utf8(unicode_code_point);
               this_num_out = strings::detail::bytes_in_char_utf8(write_char);
             } else {
-              // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6);
-              if (
-                // hex_high_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_high_val <
-                // UTF16_HIGH_SURROGATE_END &&
-                hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) {
+              if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) {
                 skip         = true;
                 this_num_out = 0;
                 write_char   = 0;
@@ -564,17 +539,10 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
         }
       }  // !skip end.
       {
-        // TODO think about writing error conditions as normal, so that program flow is easy to read
-        // and can process error here.
-        // WRITE now (compute out_idx offset then write)
-        // intra-warp scan of this_num_out.
-        // TODO union to save shared memory
-        // TODO, use only Reduce instead of scan for size calculation - if(!dchars)
         __shared__ cub::WarpScan<size_type>::TempStorage temp_storage[8];
         size_type offset;
         cub::WarpScan<size_type>(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset);
         offset += last_offset;
-        // TODO add last active lane this_num_out for correct last_offset.
         if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); }
         offset += this_num_out;
         last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1);
@@ -594,27 +562,18 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
                                         size_type* d_offsets,
                                         char* d_chars)
 {
-  // const long BLOCK_SIZE = blockDim.x;
-  // int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   int lane = threadIdx.x;
-  // int global_warp_id   = blockIdx.x;
-  // int nwarps           = gridDim.x;
-  // TODO alignment - aligned access possible?
 
   // get 1-string index per warp
-  // TODO if #num(>1024) > SOME_LIMIT, then fixed load.
-  auto warp_inc_count = [&]() {
+  auto get_next_string = [&]() {
     __shared__ size_type istring;
     if (lane == 0) { istring = atomicAdd(str_counter, 1); }
     __syncthreads();  // memory fence?
     return istring;
   };
   // grid-stride loop.
-  // TODO if large number of small strings, then this loop is not efficient. So, switch to old
-  // method.
-  // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
-  for (size_type istring = warp_inc_count(); istring < total_out_strings;
-       istring           = warp_inc_count()) {
+  for (size_type istring = get_next_string(); istring < total_out_strings;
+       istring           = get_next_string()) {
     // skip nulls
     if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
       if (!d_chars) d_offsets[istring] = 0;
@@ -624,7 +583,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
     auto in_begin           = str_tuples[istring].first;
     auto in_end             = in_begin + str_tuples[istring].second;
     auto const num_in_chars = str_tuples[istring].second;
-    if (num_in_chars < 1024) continue;
+    if (num_in_chars <= WARP_THRESHOLD) continue;
 
     // Check if the value corresponds to the null literal
     auto const is_null_literal =
@@ -663,7 +622,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
       ++in_begin;
       --in_end;
     }
-    // auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
+    // block-parallelized process_string(in_begin, in_end, d_buffer, options);
 
     auto is_hex = [](auto ch) {
       return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
@@ -674,34 +633,24 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
       init_state  = false;
       last_offset = 0;
     }
-    // 0-31, 32-63, ... i*32-n.
-    // entire warp executes but with mask.
-    // auto MASK = 0xffffffff;
+    // entire block executes
     for (size_type char_index = lane;
-         char_index <
-         cudf::util::round_up_unsafe(in_end - in_begin, static_cast<std::size_t>(BLOCK_SIZE));
+         char_index < cudf::util::round_up_unsafe(in_end - in_begin, static_cast<long>(BLOCK_SIZE));
          char_index += BLOCK_SIZE) {
       bool is_within_bounds = char_index < (in_end - in_begin);
-      // TODO more conditions below to avoid out-of-bound memory access.
-      auto c            = is_within_bounds ? in_begin[char_index] : '\0';
+      auto c                = is_within_bounds ? in_begin[char_index] : '\0';
       auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
       auto escaped_char = get_escape_char(c);
       bool error        = false;
-      // FIXME: \\ at end is a problem here.
-      // \uXXXXe e-u=5 4<=4
-      //  012345
       if (is_within_bounds) {
-        // TODO instead of '\\', use is_escaping_backslash, and previous index value also here.
         error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
         error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
         error |= (prev_c == '\\' && c == 'u' &&
-                  // TODO check if following condition is right or off by one error.
                   ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
                    !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
                    !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
       }
-      // propagate error using warp shuffle.
-      // error = __any_sync(MASK, error);
+      // propagate error across entire block
       using BlockReduce = cub::BlockReduce<bool, 32 * 8>;
       __shared__ typename BlockReduce::TempStorage temp_storage_error;
       __shared__ bool error_reduced;
@@ -717,14 +666,9 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
           last_offset = 0;
           if (!d_chars) d_offsets[istring] = 0;
         }
-        break;  // gride-stride return;
+        break;                        // gride-stride return;
       }
-      // TODO one more error condition of second \uXXXX is not hex.
       bool skip = !is_within_bounds;  // false;
-      // TODO FIXME: continue slashes are a problem!
-      // skip |= (prev_c != '\\') && (c=='\\'); // skip '\'
-      // corner case \\uXXXX TODO
-      // skip XXXX in \uXXXX
       if (is_within_bounds) {
         skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' &&
                 in_begin[char_index - 1] == 'u';
@@ -763,39 +707,24 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
       // inclusive scan. TODO both inclusive and exclusive available in cub.
       BlockScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
       auto is_escaping_backslash = scanned.state[init_state];
-      auto last_active_lane      = blockDim.x - 1;  // TODO simplify 0xffffffff case?
-      if (threadIdx.x == last_active_lane) init_state = is_escaping_backslash;
+      if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash;
       // There is another __syncthreads() at the end of for-loop.
-      // TODO replace/add prev_c with proper scan of escapes
       skip |= is_escaping_backslash;
 
       if (!skip) {
-        // is prev_is_not backslash?
-        if (prev_c != '\\') {  // FIXME: enable this after debugging.
-          // if (true) {
+        if (prev_c != '\\') {
           this_num_out = 1;
           if (d_chars) write_char = c;
-          // FIXME: can you skip write like this for string_size count at other places?
         } else {
-          // already taken care early.
-          // if (escaped_char == NON_ESCAPE_CHAR) {
-          //    this_num_out = 0;
-          //    error = true;
-          // } else
           if (escaped_char != UNICODE_SEQ) {
             this_num_out = 1;
             write_char   = escaped_char;
           } else {
-            // \uXXXX- u
             // Unicode
+            // \uXXXX
             auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);
             auto hex_low_val = 0;
-            // if next is \uXXXX
-            // in_begin + char_index
-            // 01234567890
-            //\uXXXX\uXXXX TODO cleanup these conditions.
-            // Note: no need for scanned_backslash below because we already know. only '\u' check is
-            // enough. if ((in_end - (in_begin + char_index + 1 + 4)) > 6 &&
+            // \uXXXX\uXXXX
             if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' &&
                 in_begin[char_index + 1 + 5] == 'u') {
               hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6);
@@ -809,11 +738,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
               write_char   = strings::detail::codepoint_to_utf8(unicode_code_point);
               this_num_out = strings::detail::bytes_in_char_utf8(write_char);
             } else {
-              // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6);
-              if (
-                // hex_high_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_high_val <
-                // UTF16_HIGH_SURROGATE_END &&
-                hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) {
+              if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) {
                 skip         = true;
                 this_num_out = 0;
                 write_char   = 0;
@@ -827,12 +752,6 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
         }
       }  // !skip end.
       {
-        // TODO think about writing error conditions as normal, so that program flow is easy to read
-        // and can process error here.
-        // WRITE now (compute out_idx offset then write)
-        // intra-warp scan of this_num_out.
-        // TODO union to save shared memory
-        // TODO, use only Reduce instead of scan for size calculation - if(!dchars)
         using BlockScan2 = cub::BlockScan<size_type, 32 * 8>;
         __shared__ BlockScan2::TempStorage temp_storage;
         size_type offset;
@@ -841,7 +760,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
         // TODO add last active lane this_num_out for correct last_offset.
         if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); }
         offset += this_num_out;
-        if (threadIdx.x == last_active_lane) last_offset = offset;
+        if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset;
         __syncthreads();
       }
     }  // char for-loop
@@ -929,119 +848,114 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
   auto d_null_count    = rmm::device_scalar<size_type>(null_count, stream);
   auto null_count_data = d_null_count.data();
 
-  auto d_null_count2    = rmm::device_scalar<size_type>(null_count, stream);
-  auto null_count_data2 = d_null_count2.data();
-  // Write 3 sizes <=32, 33-1024, 1024-1M, >1M
-  // if all<32, (33, 1024, 1M)==0, run serial kernel.
-  // if >33, warp-per-string kernel. (we want warp_parallel to be called only if this #rows is less.
-  // or else atomicAdd congestion might happen) if >1024, block-per-string kernel.
-
   if (col_type == cudf::data_type{cudf::type_id::STRING}) {
-    // this utility calls the functor to build the offsets and chars columns;
-    // the bitmask and null count may be updated by parse failures
-
-#define WARP_PARALLEL
-#ifndef WARP_PARALLEL
-    nvtxRangePush("make_strings_children");
-    auto [offsets, chars] = cudf::strings::detail::make_strings_children(
-      string_parse<decltype(str_tuples)>{
-        str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
-      col_size,
-      stream,
-      mr);
-    auto& offsets2 = offsets;
-    auto& chars2   = chars;
-    nvtxRangePop();
-#else
-
-    // {
-    nvtxRangePush("string_parallel");
-    auto offsets2 = cudf::make_numeric_column(
-      data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
-    auto d_offsets = offsets2->mutable_view().data<size_type>();
-
-    int max_blocks                  = 0;
-    constexpr auto warps_per_block  = 8;
-    constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
-    CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &max_blocks, parse_fn_warp_parallel<str_tuple_it>, threads_per_block, 0));
-
-    int device = 0;
-    CUDF_CUDA_TRY(cudaGetDevice(&device));
-    int num_sms = 0;
-    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
-    auto num_blocks  = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1));
-    auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream);
-
-    parse_fn_warp_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
+    auto const max_length = thrust::transform_reduce(
+      rmm::exec_policy(stream),
       str_tuples,
-      col_size,
-      str_counter.data(),
-      static_cast<bitmask_type*>(null_mask.data()),
-      null_count_data2,
-      options,
-      d_offsets,
-      nullptr);
-    parse_fn_block_parallel<threads_per_block>
-      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+      str_tuples + col_size,
+      [] __device__(auto t) { return t.second; },
+      size_type{0},
+      thrust::maximum<size_type>{});
+
+    if (max_length < SINGLE_THREAD_THRESHOLD) {
+      // this utility calls the functor to build the offsets and chars columns;
+      // the bitmask and null count may be updated by parse failures
+      nvtxRangePush("make_strings_children");
+      auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+        string_parse<decltype(str_tuples)>{
+          str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
+        col_size,
+        stream,
+        mr);
+      nvtxRangePop();
+
+      return make_strings_column(col_size,
+                                 std::move(offsets),
+                                 std::move(chars),
+                                 d_null_count.value(stream),
+                                 std::move(null_mask));
+    } else {
+      nvtxRangePush("string_parallel");
+      auto offsets2 = cudf::make_numeric_column(
+        data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
+      auto d_offsets = offsets2->mutable_view().data<size_type>();
+
+      constexpr auto warps_per_block  = 8;
+      constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
+      auto num_blocks                 = min(65535, col_size / warps_per_block + 1);
+      auto str_counter                = cudf::numeric_scalar(size_type{0}, true, stream);
+
+      parse_fn_warp_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
         str_tuples,
         col_size,
         str_counter.data(),
         static_cast<bitmask_type*>(null_mask.data()),
-        null_count_data2,
+        null_count_data,
         options,
         d_offsets,
         nullptr);
-    // print_raw(d_offsets, offsets2->size(), stream);
-    auto const bytes =
-      cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
-    str_counter.set_value(0, stream);
-
-    // CHARS column
-    std::unique_ptr<column> chars2 =
-      strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
-    auto d_chars2 = chars2->mutable_view().data<char>();
-    cudaMemsetAsync(d_chars2, 'c', bytes, stream.value());
-
-    parse_fn_warp_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
-      str_tuples,
-      col_size,
-      str_counter.data(),
-      static_cast<bitmask_type*>(null_mask.data()),
-      null_count_data2,
-      options,
-      d_offsets,
-      d_chars2);
-
-    parse_fn_block_parallel<threads_per_block>
-      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+      str_counter.set_value(0, stream);
+      // if (max_length > WARP_THRESHOLD)
+      parse_fn_block_parallel<threads_per_block>
+        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+          str_tuples,
+          col_size,
+          str_counter.data(),
+          static_cast<bitmask_type*>(null_mask.data()),
+          null_count_data,
+          options,
+          d_offsets,
+          nullptr);
+      // print_raw(d_offsets, offsets2->size(), stream);
+      auto const bytes =
+        cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
+      str_counter.set_value(0, stream);
+
+      // CHARS column
+      std::unique_ptr<column> chars2 =
+        strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
+      auto d_chars2 = chars2->mutable_view().data<char>();
+
+      parse_fn_warp_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
         str_tuples,
         col_size,
         str_counter.data(),
         static_cast<bitmask_type*>(null_mask.data()),
-        null_count_data2,
+        null_count_data,
         options,
         d_offsets,
         d_chars2);
-    // if(bytes!=chars->size()) {
-    //   std::cout<<"new bytes="<<bytes<<std::endl;
-    //   print_raw(d_offsets, offsets2->size(), stream);
-    //   print_raw(d_chars2, chars2->size(), stream);
-    // }
-    // if(bytes!=chars->size()) {
-    //   std::cout<<"old bytes="<<chars->size()<<std::endl;
-    //   print_raw(offsets->view().data<size_type>(), offsets->size(), stream);
-    //   print_raw(chars->view().data<size_type>(), chars->size(), stream);
-    // }
-    nvtxRangePop();
-// }
-#endif
-
-    return make_strings_column(col_size,
-                               std::move(offsets2),
-                               std::move(chars2),
-                               d_null_count2.value(stream),
-                               std::move(null_mask));
+      str_counter.set_value(0, stream);
+
+      // if (max_length > WARP_THRESHOLD)
+      parse_fn_block_parallel<threads_per_block>
+        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+          str_tuples,
+          col_size,
+          str_counter.data(),
+          static_cast<bitmask_type*>(null_mask.data()),
+          null_count_data,
+          options,
+          d_offsets,
+          d_chars2);
+      // if(bytes!=chars->size()) {
+      //   std::cout<<"new bytes="<<bytes<<std::endl;
+      //   print_raw(d_offsets, offsets2->size(), stream);
+      //   print_raw(d_chars2, chars2->size(), stream);
+      // }
+      // if(bytes!=chars->size()) {
+      //   std::cout<<"old bytes="<<chars->size()<<std::endl;
+      //   print_raw(offsets->view().data<size_type>(), offsets->size(), stream);
+      //   print_raw(chars->view().data<size_type>(), chars->size(), stream);
+      // }
+      nvtxRangePop();
+
+      return make_strings_column(col_size,
+                                 std::move(offsets2),
+                                 std::move(chars2),
+                                 d_null_count.value(stream),
+                                 std::move(null_mask));
+    }
   }
 
   auto out_col =

From efe78983cbfe2036c6b3523f012ccc87cd9b3e18 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 18 Aug 2023 00:41:25 +0530
Subject: [PATCH 12/47] add long string json test

---
 cpp/tests/io/json_test.cpp | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 97d5846294a..fea307d6047 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1370,6 +1370,39 @@ TEST_F(JsonReaderTest, JsonExperimentalLines)
   CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view());
 }
 
+TEST_F(JsonReaderTest, JsonLongString)
+{
+  std::string json_string =
+    R"({"a":"a0"}
+    {"a":"a1"}
+    {"a":"a2", "b":"b2"}
+    {"a":"a3", "c":"c3"})";
+  json_string += std::string("\n") + R"({"a":"a4\u20ac)" + std::string(103, 'a') + "\"}";
+  json_string += std::string("\n") + R"({"a":"a5\u20ac)" + std::string(1034, 'a') + "\"}";
+  json_string += std::string("\n") + R"({"a":"a6\u20ac)" + std::string(10340, 'a') + "\"}";
+
+  cudf::test::strings_column_wrapper col1{"a0",
+                                          "a1",
+                                          "a2",
+                                          "a3",
+                                          "a4€" + std::string(103, 'a'),
+                                          "a5€" + std::string(1034, 'a'),
+                                          "a6€" + std::string(10340, 'a')};
+  cudf::test::strings_column_wrapper col2{{"", "", "b2", "", "", "", ""}, {0, 0, 1, 0, 0, 0, 0}};
+  cudf::test::strings_column_wrapper col3{{"", "", "", "c3", "", "", ""}, {0, 0, 0, 1, 0, 0, 0}};
+  cudf::table_view expected({col1, col2, col3});
+
+  // Initialize parsing options (reading json lines)
+  cudf::io::json_reader_options json_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{json_string.c_str(), json_string.size()})
+      .lines(true);
+
+  // Read test data via nested JSON reader
+  auto const table = cudf::io::read_json(json_lines_options);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view());
+}
+
 TEST_F(JsonReaderTest, TokenAllocation)
 {
   std::array<std::string const, 3> const json_inputs{

From 589e0a3c9ed8b25119741f04e8076413784a6bb3 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 18 Aug 2023 00:43:14 +0530
Subject: [PATCH 13/47] remove debug prints

---
 cpp/include/cudf/io/detail/data_casting.cuh | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index a9cb45f5714..a9968be36d8 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -812,15 +812,6 @@ struct string_parse {
   }
 };
 
-template <typename T>
-void print_raw(T const* ptr, size_type size, rmm::cuda_stream_view stream)
-{
-  auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span<T const>(ptr, size), stream);
-  for (auto i : h_offsets2)
-    std::cout << i << ",";
-  std::cout << std::endl;
-}
-
 /**
  * @brief Parses the data from an iterator of string views, casting it to the given target data type
  *
@@ -906,7 +897,6 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
           options,
           d_offsets,
           nullptr);
-      // print_raw(d_offsets, offsets2->size(), stream);
       auto const bytes =
         cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
       str_counter.set_value(0, stream);
@@ -938,16 +928,6 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
           options,
           d_offsets,
           d_chars2);
-      // if(bytes!=chars->size()) {
-      //   std::cout<<"new bytes="<<bytes<<std::endl;
-      //   print_raw(d_offsets, offsets2->size(), stream);
-      //   print_raw(d_chars2, chars2->size(), stream);
-      // }
-      // if(bytes!=chars->size()) {
-      //   std::cout<<"old bytes="<<chars->size()<<std::endl;
-      //   print_raw(offsets->view().data<size_type>(), offsets->size(), stream);
-      //   print_raw(chars->view().data<size_type>(), chars->size(), stream);
-      // }
       nvtxRangePop();
 
       return make_strings_column(col_size,

From d3dc8cfd53f1f86c381bab2c8a8ec65b545fa36f Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 18 Aug 2023 00:47:16 +0530
Subject: [PATCH 14/47] comment

---
 cpp/include/cudf/io/detail/data_casting.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 808a250ef5b..e672ab0feee 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -552,6 +552,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
   }    // grid-stride for-loop
 }
 
+// Similar to warp-parallel algorithm but with 1 block per string. 
 template <int BLOCK_SIZE, typename str_tuple_it>
 __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
                                         size_type total_out_strings,

From e17589e620159264e2ba33c51624ba33860a516e Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 18 Aug 2023 00:59:05 +0530
Subject: [PATCH 15/47] style fix, add constants

---
 cpp/include/cudf/io/detail/data_casting.cuh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index e672ab0feee..0cc917a3498 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -539,7 +539,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
         }
       }  // !skip end.
       {
-        __shared__ cub::WarpScan<size_type>::TempStorage temp_storage[8];
+        __shared__ typename cub::WarpScan<size_type>::TempStorage temp_storage[8];
         size_type offset;
         cub::WarpScan<size_type>(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset);
         offset += last_offset;
@@ -552,7 +552,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
   }    // grid-stride for-loop
 }
 
-// Similar to warp-parallel algorithm but with 1 block per string. 
+// Similar to warp-parallel algorithm but 1 block per string.
 template <int BLOCK_SIZE, typename str_tuple_it>
 __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
                                         size_type total_out_strings,
@@ -652,7 +652,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
                    !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
       }
       // propagate error across entire block
-      using BlockReduce = cub::BlockReduce<bool, 32 * 8>;
+      using BlockReduce = cub::BlockReduce<bool, BLOCK_SIZE>;
       __shared__ typename BlockReduce::TempStorage temp_storage_error;
       __shared__ bool error_reduced;
       error_reduced = BlockReduce(temp_storage_error).Sum(error);  // TODO use cub::LogicalOR.
@@ -698,7 +698,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
         bool state[2];
       };
       // using state_table = bool[2]; Try this. and see if compiler errors
-      using BlockScan = cub::BlockScan<state_table, 32 * 8>;
+      using BlockScan = cub::BlockScan<state_table, BLOCK_SIZE>;
       __shared__ typename BlockScan::TempStorage temp_slash;
       state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
       auto composite_op = [](state_table op1, state_table op2) {
@@ -753,8 +753,8 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
         }
       }  // !skip end.
       {
-        using BlockScan2 = cub::BlockScan<size_type, 32 * 8>;
-        __shared__ BlockScan2::TempStorage temp_storage;
+        using BlockScan2 = cub::BlockScan<size_type, BLOCK_SIZE>;
+        __shared__ typename BlockScan2::TempStorage temp_storage;
         size_type offset;
         BlockScan2(temp_storage).ExclusiveSum(this_num_out, offset);
         offset += last_offset;

From 631528aba9032d02ea763cbfc5d495166e81c96e Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 25 Aug 2023 19:59:57 +0530
Subject: [PATCH 16/47] unified kernel for warp and block

---
 cpp/include/cudf/io/detail/data_casting.cuh | 187 +++++++++++++++-----
 1 file changed, 141 insertions(+), 46 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 0cc917a3498..0bf04df9072 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -305,6 +305,35 @@ process_string(in_iterator_t in_begin,
   return {bytes, data_casting_result::PARSING_SUCCESS};
 }
 
+// Attempted, but didn't work for warp_parallel kernel
+template <bool b>
+__device__ bool& init_state()
+{
+  static bool data;
+  return data;
+}
+
+template <>
+__device__ bool& init_state<false>()
+{
+  __shared__ bool data;
+  return data;
+};
+
+template <bool b>
+__device__ int& last_offset()
+{
+  static size_type data;
+  return data;
+}
+
+template <>
+__device__ int& last_offset<false>()
+{
+  __shared__ size_type data;
+  return data;
+};
+
 // 1 warp per string.
 // algorithm
 
@@ -339,7 +368,7 @@ process_string(in_iterator_t in_begin,
 // before writing, find size, then intra-warp scan for out_idx
 // propagate offset from 32nd thread to others in warp to carry forward.
 // 1 warp per string.
-template <typename str_tuple_it>
+template <bool is_warp, size_type num_warps, typename str_tuple_it>
 __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
                                        size_type total_out_strings,
                                        size_type* str_counter,
@@ -349,17 +378,24 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
                                        size_type* d_offsets,
                                        char* d_chars)
 {
-  constexpr auto BLOCK_SIZE = cudf::detail::warp_size;
-  int lane                  = threadIdx.x % BLOCK_SIZE;
+  constexpr auto BLOCK_SIZE =
+    is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps;
+  size_type lane = is_warp ? (threadIdx.x % BLOCK_SIZE) : threadIdx.x;
 
   // get 1-string index per warp
   auto get_next_string = [&]() {
-    size_type istring;
-    if (lane == 0) { istring = atomicAdd(str_counter, 1); }
-    __syncwarp();
-    return __shfl_sync(0xffffffff, istring, 0);
+    if constexpr (is_warp) {
+      size_type istring;
+      if (lane == 0) { istring = atomicAdd(str_counter, 1); }
+      __syncwarp();
+      return __shfl_sync(0xffffffff, istring, 0);
+    } else {
+      __shared__ size_type istring;
+      if (lane == 0) { istring = atomicAdd(str_counter, 1); }
+      __syncthreads();  // memory fence?
+      return istring;
+    }
   };
-
   // grid-stride loop.
   for (size_type istring = get_next_string(); istring < total_out_strings;
        istring           = get_next_string()) {
@@ -372,7 +408,11 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
     auto in_begin           = str_tuples[istring].first;
     auto in_end             = in_begin + str_tuples[istring].second;
     auto const num_in_chars = str_tuples[istring].second;
-    if (num_in_chars > WARP_THRESHOLD) continue;
+    if constexpr (is_warp) {
+      if (num_in_chars > WARP_THRESHOLD) continue;
+    } else {
+      if (num_in_chars <= WARP_THRESHOLD) continue;
+    }
 
     // Check if the value corresponds to the null literal
     auto const is_null_literal =
@@ -418,14 +458,26 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
     };
 
     // for backslash scan calculation: is_previous_escaping_backslash
-    bool init_state{false};
-    auto last_offset = 0;
+    // bool init_state{false};
+    // auto last_offset = 0;
+    bool init_state_reg;
+    __shared__ bool init_state_shared;
+    size_type last_offset_reg;
+    __shared__ size_type last_offset_shared;
+    bool& init_state(is_warp ? init_state_reg : init_state_shared);
+    size_type& last_offset(is_warp ? last_offset_reg : last_offset_shared);
+    if (is_warp || lane == 0) {
+      init_state  = false;
+      last_offset = 0;
+    }
+    // if constexpr(!is_warp) { __syncthreads(); }
     // 0-31, 32-63, ... i*32-n.
     // entire warp executes but with mask.
-    auto MASK = 0xffffffff;
+    // auto MASK = 0xffffffff;
     for (size_type char_index = lane;
-         (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0;
+         char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast<long>(BLOCK_SIZE));
          char_index += BLOCK_SIZE) {
+      auto MASK             = __ballot_sync(0xffffffff, char_index < (in_end - in_begin));
       bool is_within_bounds = char_index < (in_end - in_begin);
       auto c                = is_within_bounds ? in_begin[char_index] : '\0';
       auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
@@ -442,7 +494,17 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
                    !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
       }
       // propagate error using warp shuffle.
-      error = __any_sync(MASK, error);
+      if constexpr (is_warp) {
+        error = __any_sync(MASK, error);
+      } else {
+        using ErrorReduce = cub::BlockReduce<bool, BLOCK_SIZE>;
+        __shared__ typename ErrorReduce::TempStorage temp_storage_error;
+        __shared__ bool error_reduced;
+        error_reduced = ErrorReduce(temp_storage_error).Sum(error);  // TODO use cub::LogicalOR.
+        // only valid in thread0.
+        __syncthreads();
+        error = error_reduced;
+      }
       if (error) {
         if (lane == 0) {
           if (null_mask != nullptr) {
@@ -487,14 +549,25 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
         return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
       };
       state_table scanned;
-      auto warp_id = threadIdx.x / BLOCK_SIZE;
+      [[maybe_unused]] auto warp_id = threadIdx.x / BLOCK_SIZE;
       // inclusive scan of escaping backslashes
-      using SlashScan = cub::WarpScan<state_table>;
-      __shared__ typename SlashScan::TempStorage temp_slash[8];
-      SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
-      auto is_escaping_backslash = scanned.state[init_state];
-      init_state                 = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1);
-      skip |= is_escaping_backslash;
+      // TODO both inclusive and exclusive available in cub.
+      if constexpr (is_warp) {
+        using SlashScan = cub::WarpScan<state_table>;
+        __shared__ typename SlashScan::TempStorage temp_slash[8];
+        SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
+        auto is_escaping_backslash = scanned.state[init_state];
+        init_state                 = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1);
+        skip |= is_escaping_backslash;
+      } else {
+        using SlashScan = cub::BlockScan<state_table, BLOCK_SIZE>;
+        __shared__ typename SlashScan::TempStorage temp_slash;
+        SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
+        auto is_escaping_backslash = scanned.state[init_state];
+        if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash;
+        // There is another __syncthreads() at the end of for-loop.
+        skip |= is_escaping_backslash;
+      }
 
       if (!skip) {
         if (prev_c != '\\') {
@@ -539,13 +612,26 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
         }
       }  // !skip end.
       {
-        __shared__ typename cub::WarpScan<size_type>::TempStorage temp_storage[8];
         size_type offset;
-        cub::WarpScan<size_type>(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset);
+        if constexpr (is_warp) {
+          using OffsetScan = cub::WarpScan<size_type>;
+          __shared__ typename OffsetScan::TempStorage temp_storage[8];
+          OffsetScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset);
+        } else {
+          using OffsetScan = cub::BlockScan<size_type, BLOCK_SIZE>;
+          __shared__ typename OffsetScan::TempStorage temp_storage;
+          OffsetScan(temp_storage).ExclusiveSum(this_num_out, offset);
+          __syncthreads();
+        }
         offset += last_offset;
         if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); }
         offset += this_num_out;
-        last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1);
+        if constexpr (is_warp) {
+          last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1);
+        } else {
+          if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset;
+          __syncthreads();
+        }
       }
     }  // char for-loop
     if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
@@ -813,6 +899,17 @@ struct string_parse {
   }
 };
 
+template <typename SymbolT>
+struct to_string_view_pair {
+  SymbolT const* data;
+  to_string_view_pair(SymbolT const* _data) : data(_data) {}
+  __device__ auto operator()(thrust::tuple<size_type, size_type> ip)
+  {
+    return thrust::pair<char const*, std::size_t>{data + thrust::get<0>(ip),
+                                                  static_cast<std::size_t>(thrust::get<1>(ip))};
+  }
+};
+
 /**
  * @brief Parses the data from an iterator of string views, casting it to the given target data type
  *
@@ -877,7 +974,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       auto num_blocks                 = min(65535, col_size / warps_per_block + 1);
       auto str_counter                = cudf::numeric_scalar(size_type{0}, true, stream);
 
-      parse_fn_warp_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
+      parse_fn_warp_parallel<true, 8><<<num_blocks, threads_per_block, 0, stream.value()>>>(
         str_tuples,
         col_size,
         str_counter.data(),
@@ -888,16 +985,15 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
         nullptr);
       str_counter.set_value(0, stream);
       // if (max_length > WARP_THRESHOLD)
-      parse_fn_block_parallel<threads_per_block>
-        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
-          str_tuples,
-          col_size,
-          str_counter.data(),
-          static_cast<bitmask_type*>(null_mask.data()),
-          null_count_data,
-          options,
-          d_offsets,
-          nullptr);
+      parse_fn_warp_parallel<false, 8><<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        nullptr);
       auto const bytes =
         cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
       str_counter.set_value(0, stream);
@@ -907,7 +1003,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
         strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
       auto d_chars2 = chars2->mutable_view().data<char>();
 
-      parse_fn_warp_parallel<<<num_blocks, threads_per_block, 0, stream.value()>>>(
+      parse_fn_warp_parallel<true, 8><<<num_blocks, threads_per_block, 0, stream.value()>>>(
         str_tuples,
         col_size,
         str_counter.data(),
@@ -919,16 +1015,15 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       str_counter.set_value(0, stream);
 
       // if (max_length > WARP_THRESHOLD)
-      parse_fn_block_parallel<threads_per_block>
-        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
-          str_tuples,
-          col_size,
-          str_counter.data(),
-          static_cast<bitmask_type*>(null_mask.data()),
-          null_count_data,
-          options,
-          d_offsets,
-          d_chars2);
+      parse_fn_warp_parallel<false, 8><<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        d_chars2);
       nvtxRangePop();
 
       return make_strings_column(col_size,

From 6fe5afa46c52afa7b9e8b0af3d6b1c8e6f91528d Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 25 Aug 2023 20:23:36 +0530
Subject: [PATCH 17/47] remove duplicate block kernel, cleanup names

---
 cpp/include/cudf/io/detail/data_casting.cuh | 353 +++-----------------
 1 file changed, 55 insertions(+), 298 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 0bf04df9072..5a90a39025a 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -305,42 +305,13 @@ process_string(in_iterator_t in_begin,
   return {bytes, data_casting_result::PARSING_SUCCESS};
 }
 
-// Attempted, but didn't work for warp_parallel kernel
-template <bool b>
-__device__ bool& init_state()
-{
-  static bool data;
-  return data;
-}
-
-template <>
-__device__ bool& init_state<false>()
-{
-  __shared__ bool data;
-  return data;
-};
-
-template <bool b>
-__device__ int& last_offset()
-{
-  static size_type data;
-  return data;
-}
-
-template <>
-__device__ int& last_offset<false>()
-{
-  __shared__ size_type data;
-  return data;
-};
-
 // 1 warp per string.
 // algorithm
-
+// character      count: input->output
 // \uXXXX         6->2/3/4
 // \uXXXX\uXXXX  12->2/3/4
 // \"             2->1
-// _              1->1
+// *              1->1
 //
 // error conditions. (propagate)
 // c=='\' & curr_idx == end_idx-1; ERROR
@@ -367,16 +338,16 @@ __device__ int& last_offset<false>()
 // if curr_hex_val not in high, write u8.
 // before writing, find size, then intra-warp scan for out_idx
 // propagate offset from 32nd thread to others in warp to carry forward.
-// 1 warp per string.
+// 1 warp per string or 1 block per string
 template <bool is_warp, size_type num_warps, typename str_tuple_it>
-__global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
-                                       size_type total_out_strings,
-                                       size_type* str_counter,
-                                       bitmask_type* null_mask,
-                                       size_type* null_count_data,
-                                       cudf::io::parse_options_view const options,
-                                       size_type* d_offsets,
-                                       char* d_chars)
+__global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
+                                         size_type total_out_strings,
+                                         size_type* str_counter,
+                                         bitmask_type* null_mask,
+                                         size_type* null_count_data,
+                                         cudf::io::parse_options_view const options,
+                                         size_type* d_offsets,
+                                         char* d_chars)
 {
   constexpr auto BLOCK_SIZE =
     is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps;
@@ -458,8 +429,6 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
     };
 
     // for backslash scan calculation: is_previous_escaping_backslash
-    // bool init_state{false};
-    // auto last_offset = 0;
     bool init_state_reg;
     __shared__ bool init_state_shared;
     size_type last_offset_reg;
@@ -477,7 +446,8 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
     for (size_type char_index = lane;
          char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast<long>(BLOCK_SIZE));
          char_index += BLOCK_SIZE) {
-      auto MASK             = __ballot_sync(0xffffffff, char_index < (in_end - in_begin));
+      auto MASK =
+        is_warp ? __ballot_sync(0xffffffff, char_index < (in_end - in_begin)) : 0xffffffff;
       bool is_within_bounds = char_index < (in_end - in_begin);
       auto c                = is_within_bounds ? in_begin[char_index] : '\0';
       auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
@@ -554,7 +524,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
       // TODO both inclusive and exclusive available in cub.
       if constexpr (is_warp) {
         using SlashScan = cub::WarpScan<state_table>;
-        __shared__ typename SlashScan::TempStorage temp_slash[8];
+        __shared__ typename SlashScan::TempStorage temp_slash[num_warps];
         SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
         auto is_escaping_backslash = scanned.state[init_state];
         init_state                 = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1);
@@ -615,7 +585,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
         size_type offset;
         if constexpr (is_warp) {
           using OffsetScan = cub::WarpScan<size_type>;
-          __shared__ typename OffsetScan::TempStorage temp_storage[8];
+          __shared__ typename OffsetScan::TempStorage temp_storage[num_warps];
           OffsetScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset);
         } else {
           using OffsetScan = cub::BlockScan<size_type, BLOCK_SIZE>;
@@ -638,223 +608,6 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples,
   }    // grid-stride for-loop
 }
 
-// Similar to warp-parallel algorithm but 1 block per string.
-template <int BLOCK_SIZE, typename str_tuple_it>
-__global__ void parse_fn_block_parallel(str_tuple_it str_tuples,
-                                        size_type total_out_strings,
-                                        size_type* str_counter,
-                                        bitmask_type* null_mask,
-                                        size_type* null_count_data,
-                                        cudf::io::parse_options_view const options,
-                                        size_type* d_offsets,
-                                        char* d_chars)
-{
-  int lane = threadIdx.x;
-
-  // get 1-string index per warp
-  auto get_next_string = [&]() {
-    __shared__ size_type istring;
-    if (lane == 0) { istring = atomicAdd(str_counter, 1); }
-    __syncthreads();  // memory fence?
-    return istring;
-  };
-  // grid-stride loop.
-  for (size_type istring = get_next_string(); istring < total_out_strings;
-       istring           = get_next_string()) {
-    // skip nulls
-    if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
-      if (!d_chars) d_offsets[istring] = 0;
-      continue;  // gride-stride return;
-    }
-
-    auto in_begin           = str_tuples[istring].first;
-    auto in_end             = in_begin + str_tuples[istring].second;
-    auto const num_in_chars = str_tuples[istring].second;
-    if (num_in_chars <= WARP_THRESHOLD) continue;
-
-    // Check if the value corresponds to the null literal
-    auto const is_null_literal =
-      (!d_chars) &&
-      serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
-    if (is_null_literal && null_mask != nullptr) {
-      if (lane == 0) {
-        clear_bit(null_mask, istring);
-        atomicAdd(null_count_data, 1);
-        if (!d_chars) d_offsets[istring] = 0;
-      }
-      continue;  // gride-stride return;
-    }
-    // String values are indicated by keeping the quote character
-    bool const is_string_value =
-      num_in_chars >= 2LL &&
-      (options.quotechar == '\0' ||
-       (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar));
-    char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr;
-
-    // Copy literal/numeric value
-    if (not is_string_value) {
-      if (!d_chars) {
-        if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
-      } else {
-        for (size_type char_index = lane; char_index < (in_end - in_begin);
-             char_index += BLOCK_SIZE) {
-          d_buffer[char_index] = in_begin[char_index];
-        }
-      }
-      continue;  // gride-stride return;
-    }
-
-    // Exclude beginning and ending quote chars from string range
-    if (!options.keepquotes) {
-      ++in_begin;
-      --in_end;
-    }
-    // block-parallelized process_string(in_begin, in_end, d_buffer, options);
-
-    auto is_hex = [](auto ch) {
-      return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
-    };
-    __shared__ bool init_state;        //{false};  // for backslash scan calculation
-    __shared__ size_type last_offset;  // = 0;
-    if (lane == 0) {
-      init_state  = false;
-      last_offset = 0;
-    }
-    // entire block executes
-    for (size_type char_index = lane;
-         char_index < cudf::util::round_up_unsafe(in_end - in_begin, static_cast<long>(BLOCK_SIZE));
-         char_index += BLOCK_SIZE) {
-      bool is_within_bounds = char_index < (in_end - in_begin);
-      auto c                = is_within_bounds ? in_begin[char_index] : '\0';
-      auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
-      auto escaped_char = get_escape_char(c);
-      bool error        = false;
-      if (is_within_bounds) {
-        error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
-        error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
-        error |= (prev_c == '\\' && c == 'u' &&
-                  ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
-                   !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
-                   !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
-      }
-      // propagate error across entire block
-      using BlockReduce = cub::BlockReduce<bool, BLOCK_SIZE>;
-      __shared__ typename BlockReduce::TempStorage temp_storage_error;
-      __shared__ bool error_reduced;
-      error_reduced = BlockReduce(temp_storage_error).Sum(error);  // TODO use cub::LogicalOR.
-      // only valid in thread0.
-      __syncthreads();
-      if (error_reduced) {
-        if (lane == 0) {
-          if (null_mask != nullptr) {
-            clear_bit(null_mask, istring);
-            atomicAdd(null_count_data, 1);
-          }
-          last_offset = 0;
-          if (!d_chars) d_offsets[istring] = 0;
-        }
-        break;                        // gride-stride return;
-      }
-      bool skip = !is_within_bounds;  // false;
-      if (is_within_bounds) {
-        skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' &&
-                in_begin[char_index - 1] == 'u';
-        skip |= char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' &&
-                in_begin[char_index - 2] == 'u';
-        skip |= char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' &&
-                in_begin[char_index - 3] == 'u';
-        skip |= char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' &&
-                in_begin[char_index - 4] == 'u';
-      }
-      int this_num_out = 0;
-      cudf::char_utf8 write_char{};
-
-      // To check current is backslash by checking if previous is backslash.
-      // curr = !prev & c=='\\'
-      // So, scan is required from beginning of string.
-      // State table approach (intra-warp FST)
-      // 2 states: Not-Slash(NS), Slash(S).
-      // prev  /   *
-      // NS    S  NS
-      //  S   NS  NS
-      // After inclusive scan, all current S states translate to escaping backslash.
-      // All escaping backslash should be skipped.
-
-      struct state_table {
-        bool state[2];
-      };
-      // using state_table = bool[2]; Try this. and see if compiler errors
-      using BlockScan = cub::BlockScan<state_table, BLOCK_SIZE>;
-      __shared__ typename BlockScan::TempStorage temp_slash;
-      state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
-      auto composite_op = [](state_table op1, state_table op2) {
-        return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
-      };
-      state_table scanned;
-      // inclusive scan. TODO both inclusive and exclusive available in cub.
-      BlockScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
-      auto is_escaping_backslash = scanned.state[init_state];
-      if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash;
-      // There is another __syncthreads() at the end of for-loop.
-      skip |= is_escaping_backslash;
-
-      if (!skip) {
-        if (prev_c != '\\') {
-          this_num_out = 1;
-          if (d_chars) write_char = c;
-        } else {
-          if (escaped_char != UNICODE_SEQ) {
-            this_num_out = 1;
-            write_char   = escaped_char;
-          } else {
-            // Unicode
-            // \uXXXX
-            auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);
-            auto hex_low_val = 0;
-            // \uXXXX\uXXXX
-            if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' &&
-                in_begin[char_index + 1 + 5] == 'u') {
-              hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6);
-            }
-            if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
-                hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) {
-              // Compute UTF16-encoded code point
-              uint32_t unicode_code_point = 0x10000 +
-                                            ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) +
-                                            (hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
-              write_char   = strings::detail::codepoint_to_utf8(unicode_code_point);
-              this_num_out = strings::detail::bytes_in_char_utf8(write_char);
-            } else {
-              if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) {
-                skip         = true;
-                this_num_out = 0;
-                write_char   = 0;
-              } else {
-                // if u8
-                write_char   = strings::detail::codepoint_to_utf8(hex_val);
-                this_num_out = strings::detail::bytes_in_char_utf8(write_char);
-              }
-            }
-          }
-        }
-      }  // !skip end.
-      {
-        using BlockScan2 = cub::BlockScan<size_type, BLOCK_SIZE>;
-        __shared__ typename BlockScan2::TempStorage temp_storage;
-        size_type offset;
-        BlockScan2(temp_storage).ExclusiveSum(this_num_out, offset);
-        offset += last_offset;
-        // TODO add last active lane this_num_out for correct last_offset.
-        if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); }
-        offset += this_num_out;
-        if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset;
-        __syncthreads();
-      }
-    }  // char for-loop
-    if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
-  }    // grid-stride for-loop
-}
-
 template <typename str_tuple_it>
 struct string_parse {
   str_tuple_it str_tuples;
@@ -974,26 +727,28 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       auto num_blocks                 = min(65535, col_size / warps_per_block + 1);
       auto str_counter                = cudf::numeric_scalar(size_type{0}, true, stream);
 
-      parse_fn_warp_parallel<true, 8><<<num_blocks, threads_per_block, 0, stream.value()>>>(
-        str_tuples,
-        col_size,
-        str_counter.data(),
-        static_cast<bitmask_type*>(null_mask.data()),
-        null_count_data,
-        options,
-        d_offsets,
-        nullptr);
+      parse_fn_string_parallel<true, warps_per_block>
+        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+          str_tuples,
+          col_size,
+          str_counter.data(),
+          static_cast<bitmask_type*>(null_mask.data()),
+          null_count_data,
+          options,
+          d_offsets,
+          nullptr);
       str_counter.set_value(0, stream);
       // if (max_length > WARP_THRESHOLD)
-      parse_fn_warp_parallel<false, 8><<<num_blocks, threads_per_block, 0, stream.value()>>>(
-        str_tuples,
-        col_size,
-        str_counter.data(),
-        static_cast<bitmask_type*>(null_mask.data()),
-        null_count_data,
-        options,
-        d_offsets,
-        nullptr);
+      parse_fn_string_parallel<false, warps_per_block>
+        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+          str_tuples,
+          col_size,
+          str_counter.data(),
+          static_cast<bitmask_type*>(null_mask.data()),
+          null_count_data,
+          options,
+          d_offsets,
+          nullptr);
       auto const bytes =
         cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
       str_counter.set_value(0, stream);
@@ -1003,27 +758,29 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
         strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
       auto d_chars2 = chars2->mutable_view().data<char>();
 
-      parse_fn_warp_parallel<true, 8><<<num_blocks, threads_per_block, 0, stream.value()>>>(
-        str_tuples,
-        col_size,
-        str_counter.data(),
-        static_cast<bitmask_type*>(null_mask.data()),
-        null_count_data,
-        options,
-        d_offsets,
-        d_chars2);
+      parse_fn_string_parallel<true, warps_per_block>
+        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+          str_tuples,
+          col_size,
+          str_counter.data(),
+          static_cast<bitmask_type*>(null_mask.data()),
+          null_count_data,
+          options,
+          d_offsets,
+          d_chars2);
       str_counter.set_value(0, stream);
 
       // if (max_length > WARP_THRESHOLD)
-      parse_fn_warp_parallel<false, 8><<<num_blocks, threads_per_block, 0, stream.value()>>>(
-        str_tuples,
-        col_size,
-        str_counter.data(),
-        static_cast<bitmask_type*>(null_mask.data()),
-        null_count_data,
-        options,
-        d_offsets,
-        d_chars2);
+      parse_fn_string_parallel<false, warps_per_block>
+        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+          str_tuples,
+          col_size,
+          str_counter.data(),
+          static_cast<bitmask_type*>(null_mask.data()),
+          null_count_data,
+          options,
+          d_offsets,
+          d_chars2);
       nvtxRangePop();
 
       return make_strings_column(col_size,

From d3a35b1a6c790acc81660e049d70ff8f88ef71d4 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sun, 27 Aug 2023 23:48:58 +0530
Subject: [PATCH 18/47] address review comments

---
 cpp/include/cudf/io/detail/data_casting.cuh | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 5a90a39025a..61128ccc6e6 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -738,7 +738,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
           d_offsets,
           nullptr);
       str_counter.set_value(0, stream);
-      // if (max_length > WARP_THRESHOLD)
+      // for strings longer than WARP_THRESHOLD, 1 block per string
       parse_fn_string_parallel<false, warps_per_block>
         <<<num_blocks, threads_per_block, 0, stream.value()>>>(
           str_tuples,
@@ -754,9 +754,9 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
       str_counter.set_value(0, stream);
 
       // CHARS column
-      std::unique_ptr<column> chars2 =
+      std::unique_ptr<column> chars =
         strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
-      auto d_chars2 = chars2->mutable_view().data<char>();
+      auto d_chars = chars->mutable_view().data<char>();
 
       parse_fn_string_parallel<true, warps_per_block>
         <<<num_blocks, threads_per_block, 0, stream.value()>>>(
@@ -767,10 +767,10 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
           null_count_data,
           options,
           d_offsets,
-          d_chars2);
+          d_chars);
       str_counter.set_value(0, stream);
 
-      // if (max_length > WARP_THRESHOLD)
+      // for strings longer than WARP_THRESHOLD, 1 block per string
       parse_fn_string_parallel<false, warps_per_block>
         <<<num_blocks, threads_per_block, 0, stream.value()>>>(
           str_tuples,
@@ -780,12 +780,12 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
           null_count_data,
           options,
           d_offsets,
-          d_chars2);
+          d_chars);
       nvtxRangePop();
 
       return make_strings_column(col_size,
                                  std::move(offsets2),
-                                 std::move(chars2),
+                                 std::move(chars),
                                  d_null_count.value(stream),
                                  std::move(null_mask));
     }

From 658d8ba2ba0a37b5a8078d8a93826d704a8e5940 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 28 Aug 2023 08:37:53 +0530
Subject: [PATCH 19/47] cleanup infer_data_type signature

---
 cpp/src/io/json/json_column.cu          | 20 +++++++------
 cpp/src/io/json/nested_json_gpu.cu      |  9 +-----
 cpp/src/io/utilities/type_inference.cuh | 37 ++++++++++++-------------
 cpp/tests/io/type_inference_test.cu     | 28 +++++++++----------
 4 files changed, 44 insertions(+), 50 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index bdad16bd9f1..479f7daf3ac 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -16,7 +16,7 @@
 
 #include "nested_json.hpp"
 #include <io/utilities/parsing_utils.cuh>
-#include <io/utilities/type_inference.cuh>
+// #include <io/utilities/type_inference.cuh>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
@@ -52,6 +52,16 @@
 #include <algorithm>
 #include <cstdint>
 
+namespace cudf::io::detail {
+
+cudf::data_type infer_data_type(
+  cudf::io::json_inference_options_view const& options,
+  device_span<char const> data,
+  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  std::size_t const size,
+  rmm::cuda_stream_view stream);
+}
+
 namespace cudf::io::json::detail {
 
 // DEBUG prints
@@ -763,12 +773,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
       // TODO how about directly storing pair<char*, size_t> in json_column?
       auto offset_length_it =
         thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin());
-      // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference
-      auto string_ranges_it =
-        thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) {
-          return thrust::pair<json_column::row_offset_t, std::size_t>{
-            thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
-        });
 
       // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
       auto string_spans_it = thrust::make_transform_iterator(
@@ -790,7 +794,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
       // Infer column type, if we don't have an explicit type for it
       else {
         target_type = cudf::io::detail::infer_data_type(
-          options.json_view(), d_input, string_ranges_it, col_size, stream);
+          options.json_view(), d_input, offset_length_it, col_size, stream);
       }
 
       auto [result_bitmask, null_count] = make_validity(json_col);
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index b691eaa8caf..c6d9b69d6cc 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1949,13 +1949,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
       auto offset_length_it =
         thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin());
 
-      // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference
-      auto string_ranges_it =
-        thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) {
-          return thrust::pair<json_column::row_offset_t, std::size_t>{
-            thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
-        });
-
       // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
       auto string_spans_it = thrust::make_transform_iterator(
         offset_length_it, [data = d_input.data()] __device__(auto ip) {
@@ -1978,7 +1971,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
         target_type =
           cudf::io::detail::infer_data_type(parsing_options(options, stream).json_view(),
                                             d_input,
-                                            string_ranges_it,
+                                            offset_length_it,
                                             col_size,
                                             stream);
       }
diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh
index a9ccc80ca33..0b6c51fc647 100644
--- a/cpp/src/io/utilities/type_inference.cuh
+++ b/cpp/src/io/utilities/type_inference.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@
 #include <rmm/device_scalar.hpp>
 
 #include <thrust/distance.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/tuple.h>
 
 #include <cub/block/block_reduce.cuh>
@@ -114,14 +115,14 @@ __device__ __inline__ bool is_like_float(std::size_t len,
  *
  * @param[in] options View of inference options
  * @param[in] data JSON string input
- * @param[in] column_strings_begin The beginning of an offset-length tuple sequence
+ * @param[in] offset_length_begin The beginning of an offset-length tuple sequence
  * @param[in] size Size of the string input
  * @param[out] column_info Histogram of column type counters
  */
 template <int BlockSize, typename OptionsView, typename ColumnStringIter>
 __global__ void infer_column_type_kernel(OptionsView options,
                                          device_span<char const> data,
-                                         ColumnStringIter column_strings_begin,
+                                         ColumnStringIter offset_length_begin,
                                          std::size_t size,
                                          cudf::io::column_type_histogram* column_info)
 {
@@ -129,8 +130,8 @@ __global__ void infer_column_type_kernel(OptionsView options,
 
   for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size;
        idx += gridDim.x * blockDim.x) {
-    auto const field_offset = thrust::get<0>(*(column_strings_begin + idx));
-    auto const field_len    = thrust::get<1>(*(column_strings_begin + idx));
+    auto const field_offset = thrust::get<0>(*(offset_length_begin + idx));
+    auto const field_len    = thrust::get<1>(*(offset_length_begin + idx));
     auto const field_begin  = data.begin() + field_offset;
 
     if (cudf::detail::serialized_trie_contains(
@@ -234,7 +235,7 @@ __global__ void infer_column_type_kernel(OptionsView options,
  *
  * @param options View of inference options
  * @param data JSON string input
- * @param column_strings_begin The beginning of an offset-length tuple sequence
+ * @param offset_length_begin The beginning of an offset-length tuple sequence
  * @param size Size of the string input
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @return A histogram containing column-specific type counters
@@ -242,7 +243,7 @@ __global__ void infer_column_type_kernel(OptionsView options,
 template <typename OptionsView, typename ColumnStringIter>
 cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
                                                   cudf::device_span<char const> data,
-                                                  ColumnStringIter column_strings_begin,
+                                                  ColumnStringIter offset_length_begin,
                                                   std::size_t const size,
                                                   rmm::cuda_stream_view stream)
 {
@@ -254,7 +255,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
     d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value()));
 
   infer_column_type_kernel<block_size><<<grid_size, block_size, 0, stream.value()>>>(
-    options, data, column_strings_begin, size, d_column_info.data());
+    options, data, offset_length_begin, size, d_column_info.data());
 
   return d_column_info.value(stream);
 }
@@ -266,28 +267,24 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
  * @throw cudf::logic_error if date time is not inferred as string
  * @throw cudf::logic_error if data type inference failed
  *
- * @tparam OptionsView Type of inference options view
- * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to
- * `thrust::tuple<device_span, string_view>`
- *
  * @param options View of inference options
  * @param data JSON string input
- * @param column_strings_begin The beginning of an offset-length tuple sequence
+ * @param offset_length_begin The beginning of an offset-length tuple sequence
  * @param size Size of the string input
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @return The inferred data type
  */
-template <typename OptionsView, typename ColumnStringIter>
-cudf::data_type infer_data_type(OptionsView const& options,
-                                device_span<char const> data,
-                                ColumnStringIter column_strings_begin,
-                                std::size_t const size,
-                                rmm::cuda_stream_view stream)
+cudf::data_type infer_data_type(
+  cudf::io::json_inference_options_view const& options,
+  device_span<char const> data,
+  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  std::size_t const size,
+  rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(size != 0, "No data available for data type inference.\n");
 
-  auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream);
+  auto const h_column_info = infer_column_type(options, data, offset_length_begin, size, stream);
 
   auto get_type_id = [&](auto const& cinfo) {
     auto int_count_total =
diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu
index b2eb1b94f9c..7eec44af06d 100644
--- a/cpp/tests/io/type_inference_test.cu
+++ b/cpp/tests/io/type_inference_test.cu
@@ -50,8 +50,8 @@ TEST_F(TypeInference, Basic)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 4, 7};
-  auto const string_length   = std::vector<std::size_t>{2, 2, 1};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 4, 7};
+  auto const string_length   = std::vector<cudf::size_type>{2, 2, 1};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -83,8 +83,8 @@ TEST_F(TypeInference, Null)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 1, 4};
-  auto const string_length   = std::vector<std::size_t>{0, 2, 1};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 1, 4};
+  auto const string_length   = std::vector<cudf::size_type>{0, 2, 1};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -116,8 +116,8 @@ TEST_F(TypeInference, AllNull)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 1, 1};
-  auto const string_length   = std::vector<std::size_t>{0, 0, 4};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 1, 1};
+  auto const string_length   = std::vector<cudf::size_type>{0, 0, 4};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -149,8 +149,8 @@ TEST_F(TypeInference, String)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 8, 12};
-  auto const string_length   = std::vector<std::size_t>{6, 3, 4};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 8, 12};
+  auto const string_length   = std::vector<cudf::size_type>{6, 3, 4};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -182,8 +182,8 @@ TEST_F(TypeInference, Bool)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 6, 12};
-  auto const string_length   = std::vector<std::size_t>{4, 5, 5};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 6, 12};
+  auto const string_length   = std::vector<cudf::size_type>{4, 5, 5};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -215,8 +215,8 @@ TEST_F(TypeInference, Timestamp)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 10};
-  auto const string_length   = std::vector<std::size_t>{8, 9};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 10};
+  auto const string_length   = std::vector<cudf::size_type>{8, 9};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
@@ -249,8 +249,8 @@ TEST_F(TypeInference, InvalidInput)
   auto d_data           = cudf::make_string_scalar(data);
   auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
 
-  auto const string_offset   = std::vector<int32_t>{1, 3, 5, 7, 9};
-  auto const string_length   = std::vector<std::size_t>{1, 1, 1, 1, 1};
+  auto const string_offset   = std::vector<cudf::size_type>{1, 3, 5, 7, 9};
+  auto const string_length   = std::vector<cudf::size_type>{1, 1, 1, 1, 1};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
     string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const d_string_length = cudf::detail::make_device_uvector_async(

From 3e9a88c47d1c5f749483fd4ac214338f612b70b1 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 28 Aug 2023 08:50:43 +0530
Subject: [PATCH 20/47] Rename type_inference.cuh to .cu file

---
 cpp/CMakeLists.txt                            |  1 +
 cpp/src/io/json/json_column.cu                | 12 +------
 cpp/src/io/json/nested_json_gpu.cu            |  2 +-
 cpp/src/io/utilities/string_parsing.hpp       | 36 +++++++++++++++++++
 .../{type_inference.cuh => type_inference.cu} | 10 +-----
 cpp/tests/io/type_inference_test.cu           |  2 +-
 6 files changed, 41 insertions(+), 22 deletions(-)
 create mode 100644 cpp/src/io/utilities/string_parsing.hpp
 rename cpp/src/io/utilities/{type_inference.cuh => type_inference.cu} (98%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 516865e5782..cbe5972049e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -418,6 +418,7 @@ add_library(
   src/io/utilities/file_io_utilities.cpp
   src/io/utilities/parsing_utils.cu
   src/io/utilities/row_selection.cpp
+  src/io/utilities/type_inference.cu
   src/io/utilities/trie.cu
   src/jit/cache.cpp
   src/jit/parser.cpp
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 479f7daf3ac..5e32ed5c990 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -16,7 +16,7 @@
 
 #include "nested_json.hpp"
 #include <io/utilities/parsing_utils.cuh>
-// #include <io/utilities/type_inference.cuh>
+#include <io/utilities/string_parsing.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
@@ -52,16 +52,6 @@
 #include <algorithm>
 #include <cstdint>
 
-namespace cudf::io::detail {
-
-cudf::data_type infer_data_type(
-  cudf::io::json_inference_options_view const& options,
-  device_span<char const> data,
-  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
-  std::size_t const size,
-  rmm::cuda_stream_view stream);
-}
-
 namespace cudf::io::json::detail {
 
 // DEBUG prints
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index c6d9b69d6cc..baad52cd729 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -19,7 +19,7 @@
 #include <io/fst/logical_stack.cuh>
 #include <io/fst/lookup_tables.cuh>
 #include <io/utilities/parsing_utils.cuh>
-#include <io/utilities/type_inference.cuh>
+#include <io/utilities/string_parsing.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp
new file mode 100644
index 00000000000..b26df3ac844
--- /dev/null
+++ b/cpp/src/io/utilities/string_parsing.hpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <io/utilities/parsing_utils.cuh>
+
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+
+namespace cudf::io::detail {
+
+cudf::data_type infer_data_type(
+  cudf::io::json_inference_options_view const& options,
+  device_span<char const> data,
+  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  std::size_t const size,
+  rmm::cuda_stream_view stream);
+}
diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cu
similarity index 98%
rename from cpp/src/io/utilities/type_inference.cuh
rename to cpp/src/io/utilities/type_inference.cu
index 0b6c51fc647..892c34bf269 100644
--- a/cpp/src/io/utilities/type_inference.cuh
+++ b/cpp/src/io/utilities/type_inference.cu
@@ -13,24 +13,16 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#pragma once
 
 #include <io/utilities/column_type_histogram.hpp>
-#include <io/utilities/parsing_utils.cuh>
+#include <io/utilities/string_parsing.hpp>
 #include <io/utilities/trie.cuh>
 
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/span.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 
-#include <thrust/distance.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/tuple.h>
-
 #include <cub/block/block_reduce.cuh>
 
 #include <cstddef>
diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu
index 7eec44af06d..a14e7ecf5b3 100644
--- a/cpp/tests/io/type_inference_test.cu
+++ b/cpp/tests/io/type_inference_test.cu
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
+#include <io/utilities/string_parsing.hpp>
 #include <io/utilities/trie.cuh>
-#include <io/utilities/type_inference.cuh>
 
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/scalar/scalar_factories.hpp>

From 1838341ddf49e19048aef7e5495ee6808531d10e Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 28 Aug 2023 15:03:55 +0530
Subject: [PATCH 21/47] Cleanup parse_data signature

---
 cpp/include/cudf/io/detail/data_casting.cuh |  36 +++---
 cpp/src/io/json/json_column.cu              |  29 +++--
 cpp/src/io/json/nested_json_gpu.cu          |  10 +-
 cpp/src/io/utilities/string_parsing.hpp     |  47 +++++++-
 cpp/src/io/utilities/type_inference.cu      |  14 ---
 cpp/tests/io/json_type_cast_test.cu         | 118 +++++++++++---------
 6 files changed, 141 insertions(+), 113 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 61128ccc6e6..47b6cca0df7 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <io/utilities/parsing_utils.cuh>
+#include <io/utilities/string_parsing.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
@@ -30,6 +31,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/functional.h>
@@ -663,33 +665,25 @@ struct to_string_view_pair {
   }
 };
 
-/**
- * @brief Parses the data from an iterator of string views, casting it to the given target data type
- *
- * @param str_tuples Iterator returning a string view, i.e., a (ptr, length) pair
- * @param col_size The total number of items of this column
- * @param col_type The column's target data type
- * @param null_mask A null mask that renders certain items from the input invalid
- * @param options Settings for controlling the processing behavior
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr The resource to be used for device memory allocation
- * @return The column that contains the parsed data
- */
-template <typename str_tuple_it, typename B>
-std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
-                                   size_type col_size,
-                                   data_type col_type,
-                                   B&& null_mask,
-                                   size_type null_count,
-                                   cudf::io::parse_options_view const& options,
-                                   rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> parse_data(
+  const char* data,
+  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  size_type col_size,
+  data_type col_type,
+  rmm::device_buffer&& null_mask,
+  size_type null_count,
+  cudf::io::parse_options_view const& options,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
 
   auto d_null_count    = rmm::device_scalar<size_type>(null_count, stream);
   auto null_count_data = d_null_count.data();
 
+  // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
+  auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data});
+
   if (col_type == cudf::data_type{cudf::type_id::STRING}) {
     auto const max_length = thrust::transform_reduce(
       rmm::exec_policy(stream),
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 5e32ed5c990..04264a44cea 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -23,7 +23,6 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
-#include <cudf/io/detail/data_casting.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
@@ -331,23 +330,27 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
 {
   CUDF_FUNC_RANGE();
   auto const num_strings = node_range_begin.size();
-  rmm::device_uvector<thrust::pair<char const*, size_type>> string_views(num_strings, stream);
+  rmm::device_uvector<size_type> string_offsets(num_strings, stream);
+  rmm::device_uvector<size_type> string_lengths(num_strings, stream);
   auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin());
   thrust::transform(rmm::exec_policy(stream),
                     d_offset_pairs,
                     d_offset_pairs + num_strings,
-                    string_views.begin(),
-                    [data = input.data()] __device__(auto const& offsets) {
+                    thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()),
+                    [] __device__(auto const& offsets) {
                       // Note: first character for non-field columns
-                      return thrust::make_pair(
-                        data + thrust::get<0>(offsets),
+                      return thrust::make_tuple(
+                        static_cast<size_type>(thrust::get<0>(offsets)),
                         static_cast<size_type>(thrust::get<1>(offsets) - thrust::get<0>(offsets)));
                     });
 
   cudf::io::parse_options_view options_view{};
   options_view.quotechar  = '\0';  // no quotes
   options_view.keepquotes = true;
-  auto d_column_names     = parse_data(string_views.begin(),
+  auto d_offset_length_it =
+    thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin());
+  auto d_column_names = parse_data(input.data(),
+                                   d_offset_length_it,
                                    num_strings,
                                    data_type{type_id::STRING},
                                    rmm::device_buffer{},
@@ -355,7 +358,7 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
                                    options_view,
                                    stream,
                                    rmm::mr::get_current_device_resource());
-  auto to_host            = [stream](auto const& col) {
+  auto to_host        = [stream](auto const& col) {
     if (col.is_empty()) return std::vector<std::string>{};
     auto const scv     = cudf::strings_column_view(col);
     auto const h_chars = cudf::detail::make_std_vector_sync<char>(
@@ -764,13 +767,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
       auto offset_length_it =
         thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin());
 
-      // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
-      auto string_spans_it = thrust::make_transform_iterator(
-        offset_length_it, [data = d_input.data()] __device__(auto ip) {
-          return thrust::pair<char const*, std::size_t>{
-            data + thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
-        });
-
       data_type target_type{};
 
       if (schema.has_value()) {
@@ -789,7 +785,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
 
       auto [result_bitmask, null_count] = make_validity(json_col);
       // Convert strings to the inferred data type
-      auto col = parse_data(string_spans_it,
+      auto col = parse_data(d_input.data(),
+                            offset_length_it,
                             col_size,
                             target_type,
                             std::move(result_bitmask),
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index baad52cd729..b99b8d2647d 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1949,13 +1949,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
       auto offset_length_it =
         thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin());
 
-      // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
-      auto string_spans_it = thrust::make_transform_iterator(
-        offset_length_it, [data = d_input.data()] __device__(auto ip) {
-          return thrust::pair<char const*, std::size_t>{
-            data + thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
-        });
-
       data_type target_type{};
 
       if (schema.has_value()) {
@@ -1979,7 +1972,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
       auto [result_bitmask, null_count] = make_validity(json_col);
 
       // Convert strings to the inferred data type
-      auto col = parse_data(string_spans_it,
+      auto col = parse_data(d_input.data(),
+                            offset_length_it,
                             col_size,
                             target_type,
                             std::move(result_bitmask),
diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp
index b26df3ac844..12fc0a5b2e7 100644
--- a/cpp/src/io/utilities/string_parsing.hpp
+++ b/cpp/src/io/utilities/string_parsing.hpp
@@ -25,12 +25,55 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/tuple.h>
 
-namespace cudf::io::detail {
+namespace cudf::io {
+namespace detail {
 
+/**
+ * @brief Infers data type for a given JSON string input `data`.
+ *
+ * @throw cudf::logic_error if input size is 0
+ * @throw cudf::logic_error if date time is not inferred as string
+ * @throw cudf::logic_error if data type inference failed
+ *
+ * @param options View of inference options
+ * @param data JSON string input
+ * @param offset_length_begin The beginning of an offset-length tuple sequence
+ * @param size Size of the string input
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return The inferred data type
+ */
 cudf::data_type infer_data_type(
   cudf::io::json_inference_options_view const& options,
   device_span<char const> data,
   thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
   std::size_t const size,
   rmm::cuda_stream_view stream);
-}
+}  // namespace detail
+
+namespace json::detail {
+
+/**
+ * @brief Parses the data from an iterator of string views, casting it to the given target data type
+ *
+ * @param data string input base pointer
+ * @param offset_length_begin The beginning of an offset-length tuple sequence
+ * @param col_size The total number of items of this column
+ * @param col_type The column's target data type
+ * @param null_mask A null mask that renders certain items from the input invalid
+ * @param options Settings for controlling the processing behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr The resource to be used for device memory allocation
+ * @return The column that contains the parsed data
+ */
+std::unique_ptr<column> parse_data(
+  const char* data,
+  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  size_type col_size,
+  data_type col_type,
+  rmm::device_buffer&& null_mask,
+  size_type null_count,
+  cudf::io::parse_options_view const& options,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
+}  // namespace json::detail
+}  // namespace cudf::io
diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu
index 892c34bf269..79a5c8f1c4c 100644
--- a/cpp/src/io/utilities/type_inference.cu
+++ b/cpp/src/io/utilities/type_inference.cu
@@ -252,20 +252,6 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
   return d_column_info.value(stream);
 }
 
-/**
- * @brief Infers data type for a given JSON string input `data`.
- *
- * @throw cudf::logic_error if input size is 0
- * @throw cudf::logic_error if date time is not inferred as string
- * @throw cudf::logic_error if data type inference failed
- *
- * @param options View of inference options
- * @param data JSON string input
- * @param offset_length_begin The beginning of an offset-length tuple sequence
- * @param size Size of the string input
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @return The inferred data type
- */
 cudf::data_type infer_data_type(
   cudf::io::json_inference_options_view const& options,
   device_span<char const> data,
diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu
index 5c32131114d..48a866fa5bc 100644
--- a/cpp/tests/io/json_type_cast_test.cu
+++ b/cpp/tests/io/json_type_cast_test.cu
@@ -37,11 +37,10 @@ using namespace cudf::test::iterators;
 struct JSONTypeCastTest : public cudf::test::BaseFixture {};
 
 namespace {
-struct to_thrust_pair_fn {
-  __device__ thrust::pair<char const*, cudf::size_type> operator()(
-    thrust::pair<cudf::string_view, bool> const& p)
+struct offsets_to_length {
+  __device__ cudf::size_type operator()(thrust::tuple<cudf::size_type, cudf::size_type> const& p)
   {
-    return {p.first.data(), p.first.size_bytes()};
+    return thrust::get<1>(p) - thrust::get<0>(p);
   }
 };
 }  // namespace
@@ -67,26 +66,31 @@ TEST_F(JSONTypeCastTest, String)
   std::vector<char const*> input_values{"this", "is", "null", "of", "", "strings", R"("null")"};
   cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids);
 
-  auto d_column = cudf::column_device_view::create(input);
-  rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> svs(d_column->size(), stream);
+  auto column        = cudf::strings_column_view(input);
+  auto offsets_begin = column.offsets_begin();
+  auto offsets_pair =
+    thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin)));
+  rmm::device_uvector<cudf::size_type> svs_length(column.size(), stream);
   thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
-                    d_column->pair_begin<cudf::string_view, false>(),
-                    d_column->pair_end<cudf::string_view, false>(),
-                    svs.begin(),
-                    to_thrust_pair_fn{});
+                    offsets_pair,
+                    offsets_pair + column.size(),
+                    svs_length.begin(),
+                    offsets_to_length{});
 
   auto null_mask_it = no_nulls();
   auto null_mask =
-    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));
-
-  auto str_col = cudf::io::json::detail::parse_data(svs.data(),
-                                                    svs.size(),
-                                                    type,
-                                                    std::move(null_mask),
-                                                    0,
-                                                    default_json_options().view(),
-                                                    stream,
-                                                    mr);
+    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size()));
+
+  auto str_col = cudf::io::json::detail::parse_data(
+    column.chars().data<char>(),
+    thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
+    column.size(),
+    type,
+    std::move(null_mask),
+    0,
+    default_json_options().view(),
+    stream,
+    mr);
 
   auto out_valids =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; });
@@ -103,26 +107,31 @@ TEST_F(JSONTypeCastTest, Int)
   auto const type   = cudf::data_type{cudf::type_id::INT64};
 
   cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"});
-  auto d_column = cudf::column_device_view::create(data);
-  rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> svs(d_column->size(), stream);
+  auto column        = cudf::strings_column_view(data);
+  auto offsets_begin = column.offsets_begin();
+  auto offsets_pair =
+    thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin)));
+  rmm::device_uvector<cudf::size_type> svs_length(column.size(), stream);
   thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
-                    d_column->pair_begin<cudf::string_view, false>(),
-                    d_column->pair_end<cudf::string_view, false>(),
-                    svs.begin(),
-                    to_thrust_pair_fn{});
+                    offsets_pair,
+                    offsets_pair + column.size(),
+                    svs_length.begin(),
+                    offsets_to_length{});
 
   auto null_mask_it = no_nulls();
   auto null_mask =
-    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));
-
-  auto col = cudf::io::json::detail::parse_data(svs.data(),
-                                                svs.size(),
-                                                type,
-                                                std::move(null_mask),
-                                                0,
-                                                default_json_options().view(),
-                                                stream,
-                                                mr);
+    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size()));
+
+  auto col = cudf::io::json::detail::parse_data(
+    column.chars().data<char>(),
+    thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
+    column.size(),
+    type,
+    std::move(null_mask),
+    0,
+    default_json_options().view(),
+    stream,
+    mr);
 
   auto expected =
     cudf::test::fixed_width_column_wrapper<int64_t>{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}};
@@ -146,26 +155,31 @@ TEST_F(JSONTypeCastTest, StringEscapes)
     R"("escape with nothing to escape \")",
     R"("\"\\\/\b\f\n\r\t")",
   });
-  auto d_column = cudf::column_device_view::create(data);
-  rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> svs(d_column->size(), stream);
+  auto column        = cudf::strings_column_view(data);
+  auto offsets_begin = column.offsets_begin();
+  auto offsets_pair =
+    thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin)));
+  rmm::device_uvector<cudf::size_type> svs_length(column.size(), stream);
   thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
-                    d_column->pair_begin<cudf::string_view, false>(),
-                    d_column->pair_end<cudf::string_view, false>(),
-                    svs.begin(),
-                    to_thrust_pair_fn{});
+                    offsets_pair,
+                    offsets_pair + column.size(),
+                    svs_length.begin(),
+                    offsets_to_length{});
 
   auto null_mask_it = no_nulls();
   auto null_mask =
-    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));
-
-  auto col = cudf::io::json::detail::parse_data(svs.data(),
-                                                svs.size(),
-                                                type,
-                                                std::move(null_mask),
-                                                0,
-                                                default_json_options().view(),
-                                                stream,
-                                                mr);
+    std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size()));
+
+  auto col = cudf::io::json::detail::parse_data(
+    column.chars().data<char>(),
+    thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
+    column.size(),
+    type,
+    std::move(null_mask),
+    0,
+    default_json_options().view(),
+    stream,
+    mr);
 
   auto expected = cudf::test::strings_column_wrapper{
     {"🚀", "Ａ🚀ＡＡ", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"},

From efe9712cdeef333011c2f3b9c4856db60f55937f Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 28 Aug 2023 15:22:19 +0530
Subject: [PATCH 22/47] rename data_casting.cuh to .cu

---
 cpp/CMakeLists.txt                                           | 1 +
 cpp/src/io/json/nested_json_gpu.cu                           | 1 -
 .../data_casting.cuh => src/io/utilities/data_casting.cu}    | 2 --
 cpp/tests/io/json_type_cast_test.cu                          | 5 ++++-
 4 files changed, 5 insertions(+), 4 deletions(-)
 rename cpp/{include/cudf/io/detail/data_casting.cuh => src/io/utilities/data_casting.cu} (99%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index cbe5972049e..05e0c33ace9 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -413,6 +413,7 @@ add_library(
   src/io/utilities/arrow_io_source.cpp
   src/io/utilities/column_buffer.cpp
   src/io/utilities/config_utils.cpp
+  src/io/utilities/data_casting.cu
   src/io/utilities/data_sink.cpp
   src/io/utilities/datasource.cpp
   src/io/utilities/file_io_utilities.cpp
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index b99b8d2647d..ea185699ce4 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -26,7 +26,6 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/detail/valid_if.cuh>
-#include <cudf/io/detail/data_casting.cuh>
 #include <cudf/io/detail/tokenize_json.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/table/table.hpp>
diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/src/io/utilities/data_casting.cu
similarity index 99%
rename from cpp/include/cudf/io/detail/data_casting.cuh
rename to cpp/src/io/utilities/data_casting.cu
index 47b6cca0df7..30173cc2272 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#pragma once
-
 #include <io/utilities/parsing_utils.cuh>
 #include <io/utilities/string_parsing.hpp>
 
diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu
index 48a866fa5bc..74085c7ebbb 100644
--- a/cpp/tests/io/json_type_cast_test.cu
+++ b/cpp/tests/io/json_type_cast_test.cu
@@ -21,15 +21,18 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <io/utilities/string_parsing.hpp>
+
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/datasource.hpp>
-#include <cudf/io/detail/data_casting.cuh>
 #include <cudf/io/json.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
+#include <rmm/exec_policy.hpp>
+
 #include <type_traits>
 
 using namespace cudf::test::iterators;

From 1d869962d61e6cdb2d5f4d4085f30556c90ab0af Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 29 Aug 2023 22:26:56 +0530
Subject: [PATCH 23/47] move get_escaped_char to parsing_utils.cuh

---
 cpp/src/io/json/write_json.cu          | 16 +---------------
 cpp/src/io/utilities/data_casting.cu   | 21 ---------------------
 cpp/src/io/utilities/parsing_utils.cuh | 24 +++++++++++++++++++++++-
 3 files changed, 24 insertions(+), 37 deletions(-)

diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index 5014140991d..303c32ea78d 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -20,6 +20,7 @@
  */
 
 #include <io/csv/durations.hpp>
+#include <io/utilities/parsing_utils.cuh>
 #include <lists/utilities.hpp>
 
 #include <cudf/column/column_device_view.cuh>
@@ -61,21 +62,6 @@
 #include <vector>
 
 namespace cudf::io::json::detail {
-__device__ __forceinline__ thrust::pair<char, char> get_escaped_char(char escaped_char)
-{
-  switch (escaped_char) {
-    case '"': return {'\\', '"'};
-    case '\\': return {'\\', '\\'};
-    case '/': return {'\\', '/'};
-    case '\b': return {'\\', 'b'};
-    case '\f': return {'\\', 'f'};
-    case '\n': return {'\\', 'n'};
-    case '\r': return {'\\', 'r'};
-    case '\t': return {'\\', 't'};
-    // case 'u': return UNICODE_SEQ;
-    default: return {'\0', escaped_char};
-  }
-}
 
 std::unique_ptr<column> make_column_names_column(host_span<column_name_info const> column_names,
                                                  size_type num_columns,
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 30173cc2272..7b076800f2c 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -103,27 +103,6 @@ __device__ __forceinline__ char get_escape_char(char escaped_char)
   }
 }
 
-/**
- * @brief Returns the escaped characters for a given character.
- *
- * @param escaped_char The character to escape.
- * @return The escaped characters for a given character.
- */
-__device__ __forceinline__ thrust::pair<char, char> get_escaped_char(char escaped_char)
-{
-  switch (escaped_char) {
-    case '"': return {'\\', '"'};
-    case '\\': return {'\\', '\\'};
-    case '/': return {'\\', '/'};
-    case '\b': return {'\\', 'b'};
-    case '\f': return {'\\', 'f'};
-    case '\n': return {'\\', 'n'};
-    case '\r': return {'\\', 'r'};
-    case '\t': return {'\\', 't'};
-    // case 'u': return UNICODE_SEQ;
-    default: return {'\0', escaped_char};
-  }
-}
 /**
  * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence
  * \uXXXX.
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 5c3af588411..43d62fcd513 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -116,6 +116,28 @@ struct parse_options {
   }
 };
 
+/**
+ * @brief Returns the escaped characters for a given character.
+ *
+ * @param escaped_char The character to escape.
+ * @return The escaped characters for a given character.
+ */
+__device__ __forceinline__ thrust::pair<char, char> get_escaped_char(char escaped_char)
+{
+  switch (escaped_char) {
+    case '"': return {'\\', '"'};
+    case '\\': return {'\\', '\\'};
+    case '/': return {'\\', '/'};
+    case '\b': return {'\\', 'b'};
+    case '\f': return {'\\', 'f'};
+    case '\n': return {'\\', 'n'};
+    case '\r': return {'\\', 'r'};
+    case '\t': return {'\\', 't'};
+    // case 'u': return UNICODE_SEQ;
+    default: return {'\0', escaped_char};
+  }
+}
+
 /**
  * @brief Returns the numeric value of an ASCII/UTF-8 character.
  * Handles hexadecimal digits, both uppercase and lowercase

From e11452f43032f785967a2fc26762212740ce5609 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 29 Aug 2023 22:46:47 +0530
Subject: [PATCH 24/47] last backslash errored bug fix add unit tests with
 unicode to use both kernels

---
 cpp/src/io/utilities/data_casting.cu | 85 ++++++++++++++--------------
 cpp/tests/io/json_test.cpp           | 72 ++++++++++++++++-------
 2 files changed, 93 insertions(+), 64 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 7b076800f2c..08e0fdb0096 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -431,10 +431,49 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       auto c                = is_within_bounds ? in_begin[char_index] : '\0';
       auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
       auto escaped_char = get_escape_char(c);
-      bool error        = false;
+
+      bool is_escaping_backslash{false};
+      // To check current is backslash by checking if previous is backslash.
+      // curr = !prev & c=='\\'
+      // So, scan is required from beginning of string.
+      // State table approach (intra-warp FST)
+      // 2 states: Not-Slash(NS), Slash(S).
+      // prev  /   *
+      // NS    S  NS
+      //  S   NS  NS
+      // After inclusive scan, all current S states translate to escaping backslash.
+      // All escaping backslash should be skipped.
+
+      struct state_table {
+        bool state[2];
+      };
+      state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
+      auto composite_op = [](state_table op1, state_table op2) {
+        return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
+      };
+      state_table scanned;
+      [[maybe_unused]] auto warp_id = threadIdx.x / BLOCK_SIZE;
+      // inclusive scan of escaping backslashes
+      // TODO both inclusive and exclusive available in cub.
+      if constexpr (is_warp) {
+        using SlashScan = cub::WarpScan<state_table>;
+        __shared__ typename SlashScan::TempStorage temp_slash[num_warps];
+        SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
+        is_escaping_backslash = scanned.state[init_state];
+        init_state            = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1);
+      } else {
+        using SlashScan = cub::BlockScan<state_table, BLOCK_SIZE>;
+        __shared__ typename SlashScan::TempStorage temp_slash;
+        SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
+        is_escaping_backslash = scanned.state[init_state];
+        if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash;
+        // There is another __syncthreads() at the end of for-loop.
+      }
+
+      bool error = false;
       if (is_within_bounds) {
-        // TODO instead of '\\', use is_escaping_backslash, and previous index value also here.
-        error |= (c == '\\' && char_index == (in_end - in_begin) - 1);
+        // instead of '\\', using is_escaping_backslash, and previous index value also here.
+        error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1);
         error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
         error |= (prev_c == '\\' && c == 'u' &&
                   // TODO check if following condition is right or off by one error.
@@ -466,6 +505,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         break;                        // gride-stride return;
       }
       bool skip = !is_within_bounds;  // false;
+      skip |= is_escaping_backslash;
       if (is_within_bounds) {
         skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' &&
                 in_begin[char_index - 1] == 'u';
@@ -479,45 +519,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       int this_num_out = 0;
       cudf::char_utf8 write_char{};
 
-      // To check current is backslash by checking if previous is backslash.
-      // curr = !prev & c=='\\'
-      // So, scan is required from beginning of string.
-      // State table approach (intra-warp FST)
-      // 2 states: Not-Slash(NS), Slash(S).
-      // prev  /   *
-      // NS    S  NS
-      //  S   NS  NS
-      // After inclusive scan, all current S states translate to escaping backslash.
-      // All escaping backslash should be skipped.
-
-      struct state_table {
-        bool state[2];
-      };
-      state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
-      auto composite_op = [](state_table op1, state_table op2) {
-        return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
-      };
-      state_table scanned;
-      [[maybe_unused]] auto warp_id = threadIdx.x / BLOCK_SIZE;
-      // inclusive scan of escaping backslashes
-      // TODO both inclusive and exclusive available in cub.
-      if constexpr (is_warp) {
-        using SlashScan = cub::WarpScan<state_table>;
-        __shared__ typename SlashScan::TempStorage temp_slash[num_warps];
-        SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
-        auto is_escaping_backslash = scanned.state[init_state];
-        init_state                 = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1);
-        skip |= is_escaping_backslash;
-      } else {
-        using SlashScan = cub::BlockScan<state_table, BLOCK_SIZE>;
-        __shared__ typename SlashScan::TempStorage temp_slash;
-        SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
-        auto is_escaping_backslash = scanned.state[init_state];
-        if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash;
-        // There is another __syncthreads() at the end of for-loop.
-        skip |= is_escaping_backslash;
-      }
-
       if (!skip) {
         if (prev_c != '\\') {
           this_num_out = 1;
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 0b9ff854618..4bced6397d1 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -26,6 +26,7 @@
 #include <cudf/io/arrow_io_source.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/strings/convert/convert_fixed_point.hpp>
+#include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
@@ -1372,35 +1373,62 @@ TEST_F(JsonReaderTest, JsonExperimentalLines)
 
 TEST_F(JsonReaderTest, JsonLongString)
 {
-  std::string json_string =
-    R"({"a":"a0"}
-    {"a":"a1"}
-    {"a":"a2", "b":"b2"}
-    {"a":"a3", "c":"c3"})";
-  json_string += std::string("\n") + R"({"a":"a4\u20ac)" + std::string(103, 'a') + "\"}";
-  json_string += std::string("\n") + R"({"a":"a5\u20ac)" + std::string(1034, 'a') + "\"}";
-  json_string += std::string("\n") + R"({"a":"a6\u20ac)" + std::string(10340, 'a') + "\"}";
-
-  cudf::test::strings_column_wrapper col1{"a0",
-                                          "a1",
-                                          "a2",
-                                          "a3",
-                                          "a4€" + std::string(103, 'a'),
-                                          "a5€" + std::string(1034, 'a'),
-                                          "a6€" + std::string(10340, 'a')};
-  cudf::test::strings_column_wrapper col2{{"", "", "b2", "", "", "", ""}, {0, 0, 1, 0, 0, 0, 0}};
-  cudf::test::strings_column_wrapper col3{{"", "", "", "c3", "", "", ""}, {0, 0, 0, 1, 0, 0, 0}};
-  cudf::table_view expected({col1, col2, col3});
+  // Unicode
+  // 0000-FFFF     Basic Multilingual Plane
+  // 10000-10FFFF  Supplementary Plane
+  cudf::test::strings_column_wrapper col1{
+    {"\"\\/\b\f\n\r\t",
+     "\"",
+     "\\",
+     "/",
+     "\b",
+     "\f\n",
+     "\r\t",
+     "$€",
+     "ராபிட்ஸ்",
+     "C𝞵𝓓𝒻",
+     "",                                                // null
+     "",                                                // null
+     "கார்த்தி",
+     "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ",  //  0000-FFFF
+     "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰",                            // 10000-1FFFF
+     "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮",                // 20000-2FFFF
+     "𰾑𱔈𲍉"},                                         // 30000-3FFFF
+    cudf::test::iterators::nulls_at({10, 11})};
+
+  cudf::test::fixed_width_column_wrapper<int16_t> repeat_times{
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 80, 160, 320, 640, 1280};
+  auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times);
+  auto col2   = d_col2->view();
+  cudf::table_view tbl_view{{col1, col2, repeat_times}};
+  cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}};
+
+  std::vector<char> out_buffer;
+  auto destination     = cudf::io::sink_info(&out_buffer);
+  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
+                           .include_nulls(true)
+                           .metadata(mt)
+                           .lines(true)
+                           .na_rep("null");
+
+  cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource());
+
+  cudf::table_view expected({col1, col2, repeat_times});
+  std::map<std::string, data_type> types;
+  types["col1"]  = data_type{type_id::STRING};
+  types["col2"]  = data_type{type_id::STRING};
+  types["int16"] = data_type{type_id::INT16};
 
   // Initialize parsing options (reading json lines)
   cudf::io::json_reader_options json_lines_options =
     cudf::io::json_reader_options::builder(
-      cudf::io::source_info{json_string.c_str(), json_string.size()})
-      .lines(true);
+      cudf::io::source_info{out_buffer.data(), out_buffer.size()})
+      .lines(true)
+      .dtypes(types);
 
   // Read test data via nested JSON reader
   auto const table = cudf::io::read_json(json_lines_options);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view());
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view());
 }
 
 TEST_F(JsonReaderTest, TokenAllocation)

From 9426dda58b52d6832c918a80508b8f0ea589df18 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 30 Aug 2023 20:02:30 +0530
Subject: [PATCH 25/47] address review comments, update docs

---
 cpp/src/io/utilities/data_casting.cu | 56 ++++++++++++++++++----------
 cpp/tests/io/json_test.cpp           |  2 +-
 2 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 08e0fdb0096..7f619eae3c1 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -186,8 +186,6 @@ process_string(in_iterator_t in_begin,
     }
     return {bytes, data_casting_result::PARSING_SUCCESS};
   }
-  // Whether in the original JSON this was a string value enclosed in quotes
-  // ({"a":"foo"} vs. {"a":1.23})
   char const backslash_char = '\\';
 
   // Escape-flag, set after encountering a backslash character
@@ -284,27 +282,27 @@ process_string(in_iterator_t in_begin,
   return {bytes, data_casting_result::PARSING_SUCCESS};
 }
 
-// 1 warp per string.
-// algorithm
+// Algorithm: warp/block parallel version of string_parse and process_string()
+// Decoding character classes (u8, u16, \*, *):
 // character      count: input->output
 // \uXXXX         6->2/3/4
 // \uXXXX\uXXXX  12->2/3/4
 // \"             2->1
 // *              1->1
 //
-// error conditions. (propagate)
-// c=='\' & curr_idx == end_idx-1; ERROR
+// ERROR conditions. (all collaborating threads quit)
+// c=='\' & curr_idx == end_idx-1;
 // [c-1]=='\' &  get_escape[c]==NEC
 // [c-1]=='\' &  [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT
 // [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex
-
-// skip conditions. (scan for size)
-// c=='\' skip.
+//
+// skip conditions. (current thread skips this char, no output)
+// c=='\' skip. (Escaping char only)
 // [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip.
-
+//
 // write conditions. (write to d_buffer)
 // [c-1]!='\' &  [c]!='\' write [c]
-// [c-1]!='\' &  [c]=='\' skip (unnecessary? already covered? in skip conditions)
+// [c-1]!='\' &  [c]=='\' skip (already covered in skip conditions)
 // [c-1]=='\' &  [c]!=NEC && [c]!=UNICODE_SEQ, write [c]
 // [c-1]=='\' &  [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE
 // [c+1:4]=curr_hex_val
@@ -313,11 +311,28 @@ process_string(in_iterator_t in_begin,
 //        // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT &&
 //        hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val
 //        // if prev_hex_val in high, curr_hex_val in low, skip.
-//        // if curr_hex_val in high, next_hex_val in low, write u16.
-// if curr_hex_val not in high, write u8.
-// before writing, find size, then intra-warp scan for out_idx
-// propagate offset from 32nd thread to others in warp to carry forward.
-// 1 warp per string or 1 block per string
+//        // if curr_hex_val in high, next_hex_val in low, write [u16]
+// if curr_hex_val not in high, write [u8]
+// before writing, find num of output characters per threads,
+// then do intra-warp/intra-block scan for out_idx
+// propagate offset from next iteration to carry forward.
+// Uses 1 warp per string or 1 block per string
+
+/**
+ * @brief Warp/Block parallel version of string_parse functor
+ *
+ * @tparam is_warp True if 1 warp per string, False if 1 block per string
+ * @tparam num_warps Number of warps per block
+ * @tparam str_tuple_it Iterator type for tuple with string pointer and its length
+ * @param str_tuples iterator of tuple with string pointer and its length
+ * @param total_out_strings Number of string rows to be processed
+ * @param str_counter Counter to keep track of processed number of strings
+ * @param null_mask Null mask
+ * @param null_count_data pointer to store null count
+ * @param options Settings for controlling string processing behavior
+ * @param d_offsets Offsets to identify where to store the results for each string
+ * @param d_chars Character array to store the characters of strings
+ */
 template <bool is_warp, size_type num_warps, typename str_tuple_it>
 __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
                                          size_type total_out_strings,
@@ -401,7 +416,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       ++in_begin;
       --in_end;
     }
-    // warp-parallelized process_string(in_begin, in_end, d_buffer, options);
+    // warp-parallelized or block-parallelized process_string()
 
     auto is_hex = [](auto ch) {
       return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
@@ -436,7 +451,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       // To check current is backslash by checking if previous is backslash.
       // curr = !prev & c=='\\'
       // So, scan is required from beginning of string.
-      // State table approach (intra-warp FST)
+      // State table approach (intra-warp FST) (intra-block FST)
       // 2 states: Not-Slash(NS), Slash(S).
       // prev  /   *
       // NS    S  NS
@@ -528,6 +543,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
             this_num_out = 1;
             write_char   = escaped_char;
           } else {
+            // TODO if prev is not escaping backslash, copy \uXXXX.
             // Unicode
             // \uXXXX
             auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);
@@ -696,8 +712,8 @@ std::unique_ptr<column> parse_data(
 
       constexpr auto warps_per_block  = 8;
       constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
-      auto num_blocks                 = min(65535, col_size / warps_per_block + 1);
-      auto str_counter                = cudf::numeric_scalar(size_type{0}, true, stream);
+      auto num_blocks  = min(65535, cudf::util::div_rounding_up_unsafe(col_size, warps_per_block));
+      auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream);
 
       parse_fn_string_parallel<true, warps_per_block>
         <<<num_blocks, threads_per_block, 0, stream.value()>>>(
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 4bced6397d1..8ad4fec0c3c 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1397,7 +1397,7 @@ TEST_F(JsonReaderTest, JsonLongString)
     cudf::test::iterators::nulls_at({10, 11})};
 
   cudf::test::fixed_width_column_wrapper<int16_t> repeat_times{
-    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 80, 160, 320, 640, 1280};
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279};
   auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times);
   auto col2   = d_col2->view();
   cudf::table_view tbl_view{{col1, col2, repeat_times}};

From b481d7cfa0bcddaf4b227bd6cf99a13e99c64de3 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 31 Aug 2023 23:43:50 +0530
Subject: [PATCH 26/47] add complex test cases

---
 cpp/tests/io/json_test.cpp | 50 +++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 8ad4fec0c3c..050f75dcb83 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1377,30 +1377,40 @@ TEST_F(JsonReaderTest, JsonLongString)
   // 0000-FFFF     Basic Multilingual Plane
   // 10000-10FFFF  Supplementary Plane
   cudf::test::strings_column_wrapper col1{
-    {"\"\\/\b\f\n\r\t",
-     "\"",
-     "\\",
-     "/",
-     "\b",
-     "\f\n",
-     "\r\t",
-     "$€",
-     "ராபிட்ஸ்",
-     "C𝞵𝓓𝒻",
-     "",                                                // null
-     "",                                                // null
-     "கார்த்தி",
-     "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ",  //  0000-FFFF
-     "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰",                            // 10000-1FFFF
-     "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮",                // 20000-2FFFF
-     "𰾑𱔈𲍉"},                                         // 30000-3FFFF
+    {
+      "\"\\/\b\f\n\r\t",
+      "\"",
+      "\\",
+      "/",
+      "\b",
+      "\f\n",
+      "\r\t",
+      "$€",
+      "ராபிட்ஸ்",
+      "C𝞵𝓓𝒻",
+      "",                                                // null
+      "",                                                // null
+      "கார்த்தி",
+      "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ",  //  0000-FFFF
+      "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰",                            // 10000-1FFFF
+      "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮",                // 20000-2FFFF
+      "𰾑𱔈𲍉",                                          // 30000-3FFFF
+      R"("$€ \u0024\u20ac \\u0024\\u20ac  \\\u0024\\\u20ac \\\\u0024\\\\u20ac)",
+      R"(        \\\\\\\\\\\\\\\\)",
+      R"(\\\\\\\\\\\\\\\\)",
+      R"(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)",
+      R"( \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)",
+      R"(                      \\abcd)",
+      R"(                 \\\\\\\\\\\\\\\\                 \\\\\\\\\\\\\\\\)",
+      R"(                \\\\\\\\\\\\\\\\                 \\\\\\\\\\\\\\\\)",
+    },
     cudf::test::iterators::nulls_at({10, 11})};
 
   cudf::test::fixed_width_column_wrapper<int16_t> repeat_times{
-    1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279};
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3};
   auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times);
   auto col2   = d_col2->view();
-  cudf::table_view tbl_view{{col1, col2, repeat_times}};
+  cudf::table_view const tbl_view{{col1, col2, repeat_times}};
   cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}};
 
   std::vector<char> out_buffer;
@@ -1413,7 +1423,7 @@ TEST_F(JsonReaderTest, JsonLongString)
 
   cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource());
 
-  cudf::table_view expected({col1, col2, repeat_times});
+  cudf::table_view const expected = tbl_view;
   std::map<std::string, data_type> types;
   types["col1"]  = data_type{type_id::STRING};
   types["col2"]  = data_type{type_id::STRING};

From a5acda6042b00e2f56e05fcb428675b91c1c1253 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 31 Aug 2023 23:44:17 +0530
Subject: [PATCH 27/47] add is escaping backslash lookback

---
 cpp/src/io/utilities/data_casting.cu | 84 ++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 11 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 7f619eae3c1..9711e1947a6 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -38,6 +38,7 @@
 #include <cub/cub.cuh>
 
 #include <memory>
+#include <type_traits>
 
 namespace cudf::io::json::detail {
 
@@ -282,6 +283,51 @@ process_string(in_iterator_t in_begin,
   return {bytes, data_casting_result::PARSING_SUCCESS};
 }
 
+template <unsigned num_warps>
+struct bitfield_warp {
+  // 5+32 for each warp.
+  bool is_slash[num_warps][5 + 32];
+  __device__ void reset(unsigned warp_id)
+  {
+    is_slash[warp_id][threadIdx.x % 32]     = 0;
+    is_slash[warp_id][threadIdx.x % 32 + 5] = 0;
+  }
+  __device__ void shift(unsigned warp_id)
+  {
+    if (threadIdx.x % 32 < 5)
+      is_slash[warp_id][threadIdx.x % 32] = is_slash[warp_id][32 + threadIdx.x % 32];
+  }
+  __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
+  {
+    is_slash[warp_id][5 + threadIdx.x % 32] = is_escaping_backslash;
+  }
+  __device__ bool get_bit(unsigned warp_id, int bit_index)
+  {
+    return is_slash[warp_id][5 + bit_index];
+  }
+};
+
+template <unsigned num_warps>
+struct bitfield_block {
+  // 5 + num_warps*32 for entire block
+  bool is_slash[5 + num_warps * 32];
+
+  __device__ void reset(unsigned warp_id)
+  {
+    is_slash[threadIdx.x]     = 0;
+    is_slash[threadIdx.x + 5] = 0;
+  }
+  __device__ void shift(unsigned warp_id)
+  {
+    if (threadIdx.x < 5) is_slash[threadIdx.x] = is_slash[num_warps * 32 + threadIdx.x];
+  }
+  __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
+  {
+    is_slash[5 + threadIdx.x] = is_escaping_backslash;
+  }
+  __device__ bool get_bit(unsigned warp_id, int bit_index) { return is_slash[5 + bit_index]; }
+};
+
 // Algorithm: warp/block parallel version of string_parse and process_string()
 // Decoding character classes (u8, u16, \*, *):
 // character      count: input->output
@@ -423,6 +469,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     };
 
     // for backslash scan calculation: is_previous_escaping_backslash
+    [[maybe_unused]] auto warp_id = threadIdx.x / cudf::detail::warp_size;
     bool init_state_reg;
     __shared__ bool init_state_shared;
     size_type last_offset_reg;
@@ -433,10 +480,13 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       init_state  = false;
       last_offset = 0;
     }
-    // if constexpr(!is_warp) { __syncthreads(); }
+    using bitfield =
+      std::conditional_t<is_warp, bitfield_warp<num_warps>, bitfield_block<num_warps>>;
+    __shared__ bitfield is_slash;
+    is_slash.reset(warp_id);
+    __syncthreads();
     // 0-31, 32-63, ... i*32-n.
     // entire warp executes but with mask.
-    // auto MASK = 0xffffffff;
     for (size_type char_index = lane;
          char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast<long>(BLOCK_SIZE));
          char_index += BLOCK_SIZE) {
@@ -448,6 +498,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       auto escaped_char = get_escape_char(c);
 
       bool is_escaping_backslash{false};
+      [[maybe_unused]] bool is_prev_escaping_backslash{false};
       // To check current is backslash by checking if previous is backslash.
       // curr = !prev & c=='\\'
       // So, scan is required from beginning of string.
@@ -467,7 +518,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
       };
       state_table scanned;
-      [[maybe_unused]] auto warp_id = threadIdx.x / BLOCK_SIZE;
       // inclusive scan of escaping backslashes
       // TODO both inclusive and exclusive available in cub.
       if constexpr (is_warp) {
@@ -476,12 +526,24 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
         is_escaping_backslash = scanned.state[init_state];
         init_state            = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1);
+        __syncwarp();
+        is_slash.shift(warp_id);
+        __syncwarp();
+        is_slash.set_bits(warp_id, is_escaping_backslash);
+        __syncwarp();
+        is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1);
       } else {
         using SlashScan = cub::BlockScan<state_table, BLOCK_SIZE>;
         __shared__ typename SlashScan::TempStorage temp_slash;
         SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
         is_escaping_backslash = scanned.state[init_state];
         if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash;
+        __syncthreads();
+        is_slash.shift(warp_id);
+        __syncthreads();
+        is_slash.set_bits(warp_id, is_escaping_backslash);
+        __syncthreads();
+        is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1);
         // There is another __syncthreads() at the end of for-loop.
       }
 
@@ -489,8 +551,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       if (is_within_bounds) {
         // instead of '\\', using is_escaping_backslash, and previous index value also here.
         error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1);
-        error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR);
-        error |= (prev_c == '\\' && c == 'u' &&
+        error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR);
+        error |= (is_prev_escaping_backslash && c == 'u' &&
                   // TODO check if following condition is right or off by one error.
                   ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
                    !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
@@ -522,20 +584,21 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       bool skip = !is_within_bounds;  // false;
       skip |= is_escaping_backslash;
       if (is_within_bounds) {
-        skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' &&
+        // \uXXXX check for "\u" for each X
+        skip |= char_index - 2 >= 0 && is_slash.get_bit(warp_id, lane - 2) &&
                 in_begin[char_index - 1] == 'u';
-        skip |= char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' &&
+        skip |= char_index - 3 >= 0 && is_slash.get_bit(warp_id, lane - 3) &&
                 in_begin[char_index - 2] == 'u';
-        skip |= char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' &&
+        skip |= char_index - 4 >= 0 && is_slash.get_bit(warp_id, lane - 4) &&
                 in_begin[char_index - 3] == 'u';
-        skip |= char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' &&
+        skip |= char_index - 5 >= 0 && is_slash.get_bit(warp_id, lane - 5) &&
                 in_begin[char_index - 4] == 'u';
       }
       int this_num_out = 0;
       cudf::char_utf8 write_char{};
 
       if (!skip) {
-        if (prev_c != '\\') {
+        if (!is_prev_escaping_backslash) {
           this_num_out = 1;
           if (d_chars) write_char = c;
         } else {
@@ -543,7 +606,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
             this_num_out = 1;
             write_char   = escaped_char;
           } else {
-            // TODO if prev is not escaping backslash, copy \uXXXX.
             // Unicode
             // \uXXXX
             auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);

From 4a3941fd6a01e5d064042c17f9e25e274d70b22d Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 1 Sep 2023 02:53:44 +0530
Subject: [PATCH 28/47] call thread kernel for small size, adjust sizes

---
 cpp/src/io/utilities/data_casting.cu | 108 +++++++++++++++------------
 1 file changed, 61 insertions(+), 47 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 9711e1947a6..c3a9c8bbb91 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -42,8 +42,8 @@
 
 namespace cudf::io::json::detail {
 
-constexpr auto SINGLE_THREAD_THRESHOLD = 128;
-constexpr auto WARP_THRESHOLD          = 1024;
+constexpr auto SINGLE_THREAD_THRESHOLD = 512;
+constexpr auto WARP_THRESHOLD          = 1024 * 4;
 
 // Unicode code point escape sequence
 static constexpr char UNICODE_SEQ = 0x7F;
@@ -420,6 +420,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     auto in_end             = in_begin + str_tuples[istring].second;
     auto const num_in_chars = str_tuples[istring].second;
     if constexpr (is_warp) {
+      if (!(num_in_chars > SINGLE_THREAD_THRESHOLD)) continue;
       if (num_in_chars > WARP_THRESHOLD) continue;
     } else {
       if (num_in_chars <= WARP_THRESHOLD) continue;
@@ -685,6 +686,8 @@ struct string_parse {
     auto const in_end       = in_begin + str_tuples[idx].second;
     auto const num_in_chars = str_tuples[idx].second;
 
+    if (num_in_chars > SINGLE_THREAD_THRESHOLD) return;
+
     // Check if the value corresponds to the null literal
     auto const is_null_literal =
       (!d_chars) &&
@@ -749,34 +752,29 @@ std::unique_ptr<column> parse_data(
       size_type{0},
       thrust::maximum<size_type>{});
 
-    if (max_length < SINGLE_THREAD_THRESHOLD) {
-      // this utility calls the functor to build the offsets and chars columns;
-      // the bitmask and null count may be updated by parse failures
-      nvtxRangePush("make_strings_children");
-      auto [offsets, chars] = cudf::strings::detail::make_strings_children(
-        string_parse<decltype(str_tuples)>{
-          str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
-        col_size,
-        stream,
-        mr);
-      nvtxRangePop();
-
-      return make_strings_column(col_size,
-                                 std::move(offsets),
-                                 std::move(chars),
-                                 d_null_count.value(stream),
-                                 std::move(null_mask));
-    } else {
-      nvtxRangePush("string_parallel");
-      auto offsets2 = cudf::make_numeric_column(
-        data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
-      auto d_offsets = offsets2->mutable_view().data<size_type>();
-
-      constexpr auto warps_per_block  = 8;
-      constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
-      auto num_blocks  = min(65535, cudf::util::div_rounding_up_unsafe(col_size, warps_per_block));
-      auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream);
-
+    nvtxRangePush("string_parallel");
+    auto offsets = cudf::make_numeric_column(
+      data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
+    auto d_offsets = offsets->mutable_view().data<size_type>();
+
+    auto single_thread_fn =
+      string_parse<decltype(str_tuples)>{str_tuples,
+                                         static_cast<bitmask_type*>(null_mask.data()),
+                                         null_count_data,
+                                         options,
+                                         d_offsets};
+    thrust::for_each_n(rmm::exec_policy(stream),
+                       thrust::make_counting_iterator<size_type>(0),
+                       col_size,
+                       single_thread_fn);
+
+    constexpr auto warps_per_block  = 8;
+    constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
+    auto num_blocks  = min(65535, cudf::util::div_rounding_up_unsafe(col_size, warps_per_block));
+    auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream);
+
+    // TODO run these independent kernels in parallel streams.
+    if (max_length > SINGLE_THREAD_THRESHOLD) {
       parse_fn_string_parallel<true, warps_per_block>
         <<<num_blocks, threads_per_block, 0, stream.value()>>>(
           str_tuples,
@@ -787,8 +785,11 @@ std::unique_ptr<column> parse_data(
           options,
           d_offsets,
           nullptr);
-      str_counter.set_value(0, stream);
+    }
+
+    if (max_length > WARP_THRESHOLD) {
       // for strings longer than WARP_THRESHOLD, 1 block per string
+      str_counter.set_value(0, stream);
       parse_fn_string_parallel<false, warps_per_block>
         <<<num_blocks, threads_per_block, 0, stream.value()>>>(
           str_tuples,
@@ -799,15 +800,26 @@ std::unique_ptr<column> parse_data(
           options,
           d_offsets,
           nullptr);
-      auto const bytes =
-        cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
+    }
+    auto const bytes =
+      cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
+    CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
+                 "Size of output exceeds the column size limit",
+                 std::overflow_error);
+
+    // CHARS column
+    std::unique_ptr<column> chars =
+      strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
+    auto d_chars = chars->mutable_view().data<char>();
+
+    single_thread_fn.d_chars = d_chars;
+    thrust::for_each_n(rmm::exec_policy(stream),
+                       thrust::make_counting_iterator<size_type>(0),
+                       col_size,
+                       single_thread_fn);
+
+    if (max_length > SINGLE_THREAD_THRESHOLD) {
       str_counter.set_value(0, stream);
-
-      // CHARS column
-      std::unique_ptr<column> chars =
-        strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
-      auto d_chars = chars->mutable_view().data<char>();
-
       parse_fn_string_parallel<true, warps_per_block>
         <<<num_blocks, threads_per_block, 0, stream.value()>>>(
           str_tuples,
@@ -818,8 +830,10 @@ std::unique_ptr<column> parse_data(
           options,
           d_offsets,
           d_chars);
-      str_counter.set_value(0, stream);
+    }
 
+    if (max_length > WARP_THRESHOLD) {
+      str_counter.set_value(0, stream);
       // for strings longer than WARP_THRESHOLD, 1 block per string
       parse_fn_string_parallel<false, warps_per_block>
         <<<num_blocks, threads_per_block, 0, stream.value()>>>(
@@ -831,14 +845,14 @@ std::unique_ptr<column> parse_data(
           options,
           d_offsets,
           d_chars);
-      nvtxRangePop();
-
-      return make_strings_column(col_size,
-                                 std::move(offsets2),
-                                 std::move(chars),
-                                 d_null_count.value(stream),
-                                 std::move(null_mask));
     }
+    nvtxRangePop();
+
+    return make_strings_column(col_size,
+                               std::move(offsets),
+                               std::move(chars),
+                               d_null_count.value(stream),
+                               std::move(null_mask));
   }
 
   auto out_col =

From d227dad02b49d0359b05a117dc3e2391eddb3f2d Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 1 Sep 2023 22:27:06 +0530
Subject: [PATCH 29/47] address review comments

---
 cpp/src/io/utilities/data_casting.cu | 75 ++++++++++++++++++----------
 1 file changed, 48 insertions(+), 27 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index c3a9c8bbb91..42f4491cec3 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -32,6 +32,7 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/copy.h>
 #include <thrust/functional.h>
 #include <thrust/transform_reduce.h>
 
@@ -180,17 +181,14 @@ process_string(in_iterator_t in_begin,
 
   // Copy literal/numeric value
   if (not is_string_value) {
-    while (in_begin != in_end) {
-      if (d_buffer) *d_buffer++ = *in_begin;
-      ++in_begin;
-      ++bytes;
-    }
+    bytes += (in_end - in_begin);
+    if (d_buffer) d_buffer = thrust::copy(thrust::seq, in_begin, in_end, d_buffer);
     return {bytes, data_casting_result::PARSING_SUCCESS};
   }
-  char const backslash_char = '\\';
+  char constexpr backslash_char = '\\';
 
   // Escape-flag, set after encountering a backslash character
-  bool escape = false;
+  bool is_prev_char_escape = false;
 
   // Exclude beginning and ending quote chars from string range
   if (!options.keepquotes) {
@@ -201,9 +199,9 @@ process_string(in_iterator_t in_begin,
   // Iterate over the input
   while (in_begin != in_end) {
     // Copy single character to output
-    if (!escape) {
-      escape = (*in_begin == backslash_char);
-      if (!escape) {
+    if (!is_prev_char_escape) {
+      is_prev_char_escape = (*in_begin == backslash_char);
+      if (!is_prev_char_escape) {
         if (d_buffer) *d_buffer++ = *in_begin;
         ++bytes;
       }
@@ -213,7 +211,7 @@ process_string(in_iterator_t in_begin,
 
     // Previous char indicated beginning of escape sequence
     // Reset escape flag for next loop iteration
-    escape = false;
+    is_prev_char_escape = false;
 
     // Check the character that is supposed to be escaped
     auto escaped_char = get_escape_char(*in_begin);
@@ -279,53 +277,77 @@ process_string(in_iterator_t in_begin,
   }
 
   // The last character of the input is a backslash -> "fail"/null for this item
-  if (escape) { return {bytes, data_casting_result::PARSING_FAILURE}; }
+  if (is_prev_char_escape) { return {bytes, data_casting_result::PARSING_FAILURE}; }
   return {bytes, data_casting_result::PARSING_SUCCESS};
 }
 
+/**
+ * @brief Datastructure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a
+ * warp.
+ *
+ * @tparam num_warps number of warps in the block
+ */
 template <unsigned num_warps>
 struct bitfield_warp {
+  static constexpr auto UNICODE_LOOK_BACK{5};
+  // 5 because for skipping unicode hex chars, look back upto 5 chars are needed.
   // 5+32 for each warp.
-  bool is_slash[num_warps][5 + 32];
+  bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size];
   __device__ void reset(unsigned warp_id)
   {
-    is_slash[warp_id][threadIdx.x % 32]     = 0;
-    is_slash[warp_id][threadIdx.x % 32 + 5] = 0;
+    if (threadIdx.x < UNICODE_LOOK_BACK) {
+      is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = 0;
+    }
+    is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0;
   }
   __device__ void shift(unsigned warp_id)
   {
-    if (threadIdx.x % 32 < 5)
-      is_slash[warp_id][threadIdx.x % 32] = is_slash[warp_id][32 + threadIdx.x % 32];
+    if (threadIdx.x % 32 < UNICODE_LOOK_BACK)
+      is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] =
+        is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size];
   }
   __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
   {
-    is_slash[warp_id][5 + threadIdx.x % 32] = is_escaping_backslash;
+    is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] =
+      is_escaping_backslash;
   }
   __device__ bool get_bit(unsigned warp_id, int bit_index)
   {
-    return is_slash[warp_id][5 + bit_index];
+    return is_slash[warp_id][UNICODE_LOOK_BACK + bit_index];
   }
 };
 
+/**
+ * @brief Datastructure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a
+ * block.
+ *
+ * @tparam num_warps number of warps in the block
+ */
 template <unsigned num_warps>
 struct bitfield_block {
+  static constexpr auto UNICODE_LOOK_BACK{5};
+  // 5 because for skipping unicode hex chars, look back upto 5 chars are needed.
   // 5 + num_warps*32 for entire block
-  bool is_slash[5 + num_warps * 32];
+  bool is_slash[UNICODE_LOOK_BACK + num_warps * 32];
 
   __device__ void reset(unsigned warp_id)
   {
-    is_slash[threadIdx.x]     = 0;
-    is_slash[threadIdx.x + 5] = 0;
+    if (threadIdx.x < UNICODE_LOOK_BACK) { is_slash[threadIdx.x] = 0; }
+    is_slash[threadIdx.x + UNICODE_LOOK_BACK] = 0;
   }
   __device__ void shift(unsigned warp_id)
   {
-    if (threadIdx.x < 5) is_slash[threadIdx.x] = is_slash[num_warps * 32 + threadIdx.x];
+    if (threadIdx.x < UNICODE_LOOK_BACK)
+      is_slash[threadIdx.x] = is_slash[num_warps * 32 + threadIdx.x];
   }
   __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
   {
-    is_slash[5 + threadIdx.x] = is_escaping_backslash;
+    is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash;
+  }
+  __device__ bool get_bit(unsigned warp_id, int bit_index)
+  {
+    return is_slash[UNICODE_LOOK_BACK + bit_index];
   }
-  __device__ bool get_bit(unsigned warp_id, int bit_index) { return is_slash[5 + bit_index]; }
 };
 
 // Algorithm: warp/block parallel version of string_parse and process_string()
@@ -420,8 +442,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     auto in_end             = in_begin + str_tuples[istring].second;
     auto const num_in_chars = str_tuples[istring].second;
     if constexpr (is_warp) {
-      if (!(num_in_chars > SINGLE_THREAD_THRESHOLD)) continue;
-      if (num_in_chars > WARP_THRESHOLD) continue;
+      if (num_in_chars <= SINGLE_THREAD_THRESHOLD or num_in_chars > WARP_THRESHOLD) continue;
     } else {
       if (num_in_chars <= WARP_THRESHOLD) continue;
     }

From bcade0f379d4c8d653dba9762f53775590910e84 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 4 Sep 2023 23:03:34 +0530
Subject: [PATCH 30/47] address review comments, fix 2 data hazards

---
 cpp/src/io/utilities/data_casting.cu | 73 +++++++++++++++-------------
 1 file changed, 39 insertions(+), 34 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 42f4491cec3..35c0f34a393 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -302,14 +302,16 @@ struct bitfield_warp {
   }
   __device__ void shift(unsigned warp_id)
   {
-    if (threadIdx.x % 32 < UNICODE_LOOK_BACK)
+    if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK)
       is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] =
         is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size];
+    __syncwarp();
   }
   __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
   {
     is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] =
       is_escaping_backslash;
+    __syncwarp();
   }
   __device__ bool get_bit(unsigned warp_id, int bit_index)
   {
@@ -328,7 +330,7 @@ struct bitfield_block {
   static constexpr auto UNICODE_LOOK_BACK{5};
   // 5 because for skipping unicode hex chars, look back upto 5 chars are needed.
   // 5 + num_warps*32 for entire block
-  bool is_slash[UNICODE_LOOK_BACK + num_warps * 32];
+  bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size];
 
   __device__ void reset(unsigned warp_id)
   {
@@ -338,11 +340,13 @@ struct bitfield_block {
   __device__ void shift(unsigned warp_id)
   {
     if (threadIdx.x < UNICODE_LOOK_BACK)
-      is_slash[threadIdx.x] = is_slash[num_warps * 32 + threadIdx.x];
+      is_slash[threadIdx.x] = is_slash[num_warps * cudf::detail::warp_size + threadIdx.x];
+    __syncthreads();
   }
   __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
   {
     is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash;
+    __syncthreads();
   }
   __device__ bool get_bit(unsigned warp_id, int bit_index)
   {
@@ -415,17 +419,19 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps;
   size_type lane = is_warp ? (threadIdx.x % BLOCK_SIZE) : threadIdx.x;
 
-  // get 1-string index per warp
+  // get 1-string index per warp/block
   auto get_next_string = [&]() {
     if constexpr (is_warp) {
       size_type istring;
       if (lane == 0) { istring = atomicAdd(str_counter, 1); }
-      __syncwarp();
       return __shfl_sync(0xffffffff, istring, 0);
     } else {
+      // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's
+      // istring value
+      __syncthreads();
       __shared__ size_type istring;
       if (lane == 0) { istring = atomicAdd(str_counter, 1); }
-      __syncthreads();  // memory fence?
+      __syncthreads();
       return istring;
     }
   };
@@ -434,7 +440,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
        istring           = get_next_string()) {
     // skip nulls
     if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
-      if (!d_chars) d_offsets[istring] = 0;
+      if (!d_chars && lane == 0) d_offsets[istring] = 0;
       continue;  // gride-stride return;
     }
 
@@ -448,16 +454,17 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     }
 
     // Check if the value corresponds to the null literal
-    auto const is_null_literal =
-      (!d_chars) &&
-      serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
-    if (is_null_literal && null_mask != nullptr) {
-      if (lane == 0) {
-        clear_bit(null_mask, istring);
-        atomicAdd(null_count_data, 1);
-        if (!d_chars) d_offsets[istring] = 0;
+    if (!d_chars) {
+      auto const is_null_literal = serialized_trie_contains(
+        options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
+      if (is_null_literal && null_mask != nullptr) {
+        if (lane == 0) {
+          clear_bit(null_mask, istring);
+          atomicAdd(null_count_data, 1);
+          if (!d_chars) d_offsets[istring] = 0;
+        }
+        continue;  // gride-stride return;
       }
-      continue;  // gride-stride return;
     }
     // String values are indicated by keeping the quote character
     bool const is_string_value =
@@ -512,12 +519,11 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     for (size_type char_index = lane;
          char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast<long>(BLOCK_SIZE));
          char_index += BLOCK_SIZE) {
-      auto MASK =
-        is_warp ? __ballot_sync(0xffffffff, char_index < (in_end - in_begin)) : 0xffffffff;
-      bool is_within_bounds = char_index < (in_end - in_begin);
-      auto c                = is_within_bounds ? in_begin[char_index] : '\0';
-      auto prev_c       = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
-      auto escaped_char = get_escape_char(c);
+      bool const is_within_bounds = char_index < (in_end - in_begin);
+      auto const MASK   = is_warp ? __ballot_sync(0xffffffff, is_within_bounds) : 0xffffffff;
+      auto const c      = is_within_bounds ? in_begin[char_index] : '\0';
+      auto const prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0';
+      auto const escaped_char = get_escape_char(c);
 
       bool is_escaping_backslash{false};
       [[maybe_unused]] bool is_prev_escaping_backslash{false};
@@ -550,21 +556,18 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         init_state            = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1);
         __syncwarp();
         is_slash.shift(warp_id);
-        __syncwarp();
         is_slash.set_bits(warp_id, is_escaping_backslash);
-        __syncwarp();
         is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1);
       } else {
         using SlashScan = cub::BlockScan<state_table, BLOCK_SIZE>;
         __shared__ typename SlashScan::TempStorage temp_slash;
         SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
         is_escaping_backslash = scanned.state[init_state];
+        __syncthreads();
         if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash;
         __syncthreads();
         is_slash.shift(warp_id);
-        __syncthreads();
         is_slash.set_bits(warp_id, is_escaping_backslash);
-        __syncthreads();
         is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1);
         // There is another __syncthreads() at the end of for-loop.
       }
@@ -679,6 +682,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         if constexpr (is_warp) {
           last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1);
         } else {
+          __syncthreads();
           if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset;
           __syncthreads();
         }
@@ -710,14 +714,15 @@ struct string_parse {
     if (num_in_chars > SINGLE_THREAD_THRESHOLD) return;
 
     // Check if the value corresponds to the null literal
-    auto const is_null_literal =
-      (!d_chars) &&
-      serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
-    if (is_null_literal && null_mask != nullptr) {
-      clear_bit(null_mask, idx);
-      atomicAdd(null_count_data, 1);
-      if (!d_chars) d_offsets[idx] = 0;
-      return;
+    if (!d_chars) {
+      auto const is_null_literal = serialized_trie_contains(
+        options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
+      if (is_null_literal && null_mask != nullptr) {
+        clear_bit(null_mask, idx);
+        atomicAdd(null_count_data, 1);
+        if (!d_chars) d_offsets[idx] = 0;
+        return;
+      }
     }
 
     char* d_buffer        = d_chars ? d_chars + d_offsets[idx] : nullptr;

From 4d1e048a61976b59bc7117cd83c7cd553f2da6b7 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 4 Sep 2023 23:34:06 +0530
Subject: [PATCH 31/47] update comments

---
 cpp/src/io/utilities/data_casting.cu | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 35c0f34a393..d3d7de39eb9 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -547,7 +547,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       };
       state_table scanned;
       // inclusive scan of escaping backslashes
-      // TODO both inclusive and exclusive available in cub.
       if constexpr (is_warp) {
         using SlashScan = cub::WarpScan<state_table>;
         __shared__ typename SlashScan::TempStorage temp_slash[num_warps];
@@ -578,7 +577,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1);
         error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR);
         error |= (is_prev_escaping_backslash && c == 'u' &&
-                  // TODO check if following condition is right or off by one error.
                   ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) |
                    !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
                    !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
@@ -591,7 +589,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         __shared__ typename ErrorReduce::TempStorage temp_storage_error;
         __shared__ bool error_reduced;
         error_reduced = ErrorReduce(temp_storage_error).Sum(error);  // TODO use cub::LogicalOR.
-        // only valid in thread0.
+        // only valid in thread0, so shared memory is used for broadcast.
         __syncthreads();
         error = error_reduced;
       }

From 896141de2fb8b270a547e5e4b0884cd528e9e6e5 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 5 Sep 2023 22:11:01 +0530
Subject: [PATCH 32/47] using bitfields for state_table, no local mem

---
 cpp/src/io/utilities/data_casting.cu | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index d3d7de39eb9..bccd64703be 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -282,7 +282,7 @@ process_string(in_iterator_t in_begin,
 }
 
 /**
- * @brief Datastructure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a
+ * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a
  * warp.
  *
  * @tparam num_warps number of warps in the block
@@ -290,12 +290,12 @@ process_string(in_iterator_t in_begin,
 template <unsigned num_warps>
 struct bitfield_warp {
   static constexpr auto UNICODE_LOOK_BACK{5};
-  // 5 because for skipping unicode hex chars, look back upto 5 chars are needed.
+  // 5 because for skipping unicode hex chars, look back up to 5 chars are needed.
   // 5+32 for each warp.
   bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size];
   __device__ void reset(unsigned warp_id)
   {
-    if (threadIdx.x < UNICODE_LOOK_BACK) {
+    if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) {
       is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = 0;
     }
     is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0;
@@ -320,7 +320,7 @@ struct bitfield_warp {
 };
 
 /**
- * @brief Datastructure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a
+ * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a
  * block.
  *
  * @tparam num_warps number of warps in the block
@@ -328,7 +328,7 @@ struct bitfield_warp {
 template <unsigned num_warps>
 struct bitfield_block {
   static constexpr auto UNICODE_LOOK_BACK{5};
-  // 5 because for skipping unicode hex chars, look back upto 5 chars are needed.
+  // 5 because for skipping unicode hex chars, look back up to 5 chars are needed.
   // 5 + num_warps*32 for entire block
   bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size];
 
@@ -539,11 +539,16 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       // All escaping backslash should be skipped.
 
       struct state_table {
-        bool state[2];
+        // using bit fields instead of state[2]
+        bool state0 : 1;
+        bool state1 : 1;
+        bool inline __device__ get(bool init_state) const { return init_state ? state1 : state0; }
       };
       state_table curr{is_within_bounds && c == '\\', false};  // state transition vector.
       auto composite_op = [](state_table op1, state_table op2) {
-        return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
+        // equivalent of state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]};
+        return state_table{op1.state0 ? op2.state1 : op2.state0,
+                           op1.state1 ? op2.state1 : op2.state0};
       };
       state_table scanned;
       // inclusive scan of escaping backslashes
@@ -551,7 +556,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         using SlashScan = cub::WarpScan<state_table>;
         __shared__ typename SlashScan::TempStorage temp_slash[num_warps];
         SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op);
-        is_escaping_backslash = scanned.state[init_state];
+        is_escaping_backslash = scanned.get(init_state);
         init_state            = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1);
         __syncwarp();
         is_slash.shift(warp_id);
@@ -561,7 +566,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         using SlashScan = cub::BlockScan<state_table, BLOCK_SIZE>;
         __shared__ typename SlashScan::TempStorage temp_slash;
         SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op);
-        is_escaping_backslash = scanned.state[init_state];
+        is_escaping_backslash = scanned.get(init_state);
         __syncthreads();
         if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash;
         __syncthreads();

From 7095aab8dd20ec0020eaeff40f7f89d98cb55f1b Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 11 Sep 2023 15:02:51 +0530
Subject: [PATCH 33/47] review comments syncthreads()

---
 cpp/src/io/utilities/data_casting.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index bccd64703be..a4926daecf6 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -599,14 +599,16 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         error = error_reduced;
       }
       if (error) {
-        if (lane == 0) {
+        if (!d_chars && lane == 0) {
           if (null_mask != nullptr) {
             clear_bit(null_mask, istring);
             atomicAdd(null_count_data, 1);
           }
           last_offset = 0;
-          if (!d_chars) d_offsets[istring] = 0;
+          d_offsets[istring] = 0;
         }
+        if constexpr (!is_warp)
+          __syncthreads();
         break;                        // gride-stride return;
       }
       bool skip = !is_within_bounds;  // false;

From 0d723e00457c87e6a773255b8b6fd073b69e1a4a Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 11 Sep 2023 16:19:04 +0530
Subject: [PATCH 34/47] address review comments syncthreads()

---
 cpp/src/io/utilities/data_casting.cu | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index a4926daecf6..02742969cd2 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -604,11 +604,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
             clear_bit(null_mask, istring);
             atomicAdd(null_count_data, 1);
           }
-          last_offset = 0;
+          last_offset        = 0;
           d_offsets[istring] = 0;
         }
-        if constexpr (!is_warp)
-          __syncthreads();
+        if constexpr (!is_warp) { __syncthreads(); }
         break;                        // gride-stride return;
       }
       bool skip = !is_within_bounds;  // false;

From 3dce8d9d1eceba33a154a34a935e9117f347eb18 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 11 Sep 2023 16:20:23 +0530
Subject: [PATCH 35/47] fix consts, zero size column, roundup

---
 cpp/src/io/utilities/data_casting.cu | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 02742969cd2..4cef2c8e50d 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -642,8 +642,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
             // \uXXXX\uXXXX
             // Note: no need for scanned_backslash below because we already know that
             // only '\u' check is enough.
-            if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' &&
-                in_begin[char_index + 1 + 5] == 'u') {
+            if ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) <
+                  in_end &&
+                in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS - 1] == '\\' &&
+                in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS] == 'u') {
               hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6);
             }
             if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
@@ -767,6 +769,7 @@ std::unique_ptr<column> parse_data(
 {
   CUDF_FUNC_RANGE();
 
+  if (col_size == 0) { return make_empty_column(col_type); }
   auto d_null_count    = rmm::device_scalar<size_type>(null_count, stream);
   auto null_count_data = d_null_count.data();
 
@@ -800,8 +803,8 @@ std::unique_ptr<column> parse_data(
 
     constexpr auto warps_per_block  = 8;
     constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
-    auto num_blocks  = min(65535, cudf::util::div_rounding_up_unsafe(col_size, warps_per_block));
-    auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream);
+    auto num_blocks                 = cudf::util::div_rounding_up_safe(col_size, warps_per_block);
+    auto str_counter                = cudf::numeric_scalar(size_type{0}, true, stream);
 
     // TODO run these independent kernels in parallel streams.
     if (max_length > SINGLE_THREAD_THRESHOLD) {

From 8de7d76a8319f37a1cee0f346cecea63913db921 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 11 Sep 2023 16:21:03 +0530
Subject: [PATCH 36/47] optimzie single character write case, also fixes direct
 unicode bug

---
 cpp/src/io/utilities/data_casting.cu | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 4cef2c8e50d..ee2ba4b2d25 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -629,11 +629,11 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       if (!skip) {
         if (!is_prev_escaping_backslash) {
           this_num_out = 1;
-          if (d_chars) write_char = c;
+          // writes char directly
         } else {
           if (escaped_char != UNICODE_SEQ) {
             this_num_out = 1;
-            write_char   = escaped_char;
+            // writes char directly
           } else {
             // Unicode
             // \uXXXX
@@ -683,7 +683,14 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
           __syncthreads();
         }
         offset += last_offset;
-        if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); }
+        if (d_chars && !skip) {
+          auto const is_not_unicode = (!is_prev_escaping_backslash) || escaped_char != UNICODE_SEQ;
+          if (is_not_unicode) {
+            *(d_buffer + offset) = (!is_prev_escaping_backslash) ? c : escaped_char;
+          } else {
+            strings::detail::from_char_utf8(write_char, d_buffer + offset);
+          }
+        }
         offset += this_num_out;
         if constexpr (is_warp) {
           last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1);

From cb0e0ba8f83dd4064e162f9ba017fc3765a9d695 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 11 Sep 2023 16:24:18 +0530
Subject: [PATCH 37/47] add unit test JsonReaderTest.ErrorStrings

---
 cpp/tests/io/json_test.cpp | 46 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 050f75dcb83..0d4454d933c 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1441,6 +1441,52 @@ TEST_F(JsonReaderTest, JsonLongString)
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view());
 }
 
+TEST_F(JsonReaderTest, ErrorStrings)
+{
+  // cases of invalid escape characters, invalid unicode encodings.
+  auto const buffer = std::string{R"(
+    {"col0": "\"\a"}
+    {"col0": "\u"}
+    {"col0": "\u0"}
+    {"col0": "\u0b"}
+    {"col0": "\u00b"}
+    {"col0": "\u00bz"}
+    {"col0": "\t34567890123456\t9012345678901\ug0bc"}
+    {"col0": "\t34567890123456\t90123456789012\u0hbc"}
+    {"col0": "\t34567890123456\t90123456789012\u00ic"}
+    {"col0": "\u0b95\u0bbe\u0bb0\u0bcd\u0ba4\u0bcd\u0ba4\u0bbfகார்த்தி"}
+)"};
+  // Last one is not an error case, but shows that unicode in json is copied string column output.
+
+  cudf::io::json_reader_options const in_opts =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
+      .dtypes({data_type{cudf::type_id::STRING}})
+      .lines(true)
+      .legacy(false);
+
+  auto const result      = cudf::io::read_json(in_opts);
+  auto const result_view = result.tbl->view().column(0);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "col0");
+  EXPECT_EQ(result_view.null_count(), 9);
+  cudf::test::strings_column_wrapper expected{
+    {"",
+     "",
+     "",
+     "",
+     "",
+     "",
+     "",
+     "",
+     "",
+     "கார்த்தி\xe0\xae\x95\xe0\xae\xbe\xe0\xae\xb0\xe0\xaf\x8d\xe0\xae\xa4\xe0\xaf\x8d\xe0\xae\xa4"
+     "\xe0\xae\xbf"},
+    // unicode hex 0xe0 0xae 0x95 0xe0 0xae 0xbe 0xe0 0xae 0xb0 0xe0 0xaf 0x8d
+    //             0xe0 0xae 0xa4 0xe0 0xaf 0x8d 0xe0 0xae 0xa4 0xe0 0xae 0xbf
+    cudf::test::iterators::nulls_at({0, 1, 2, 3, 4, 5, 6, 7, 8})};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected);
+}
+
 TEST_F(JsonReaderTest, TokenAllocation)
 {
   std::array<std::string const, 3> const json_inputs{

From 4b46027cdf27f4d762fc69ba2891ac5aba63a897 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 11 Sep 2023 19:55:48 +0530
Subject: [PATCH 38/47] add comment

---
 cpp/tests/io/json_test.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 0d4454d933c..16124f1b5f1 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1444,6 +1444,7 @@ TEST_F(JsonReaderTest, JsonLongString)
 TEST_F(JsonReaderTest, ErrorStrings)
 {
   // cases of invalid escape characters, invalid unicode encodings.
+  // Error strings will decode to nulls
   auto const buffer = std::string{R"(
     {"col0": "\"\a"}
     {"col0": "\u"}

From 51390fd5f6e71fa7c8d39b3f2d117fffa0a38213 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 11 Sep 2023 23:22:52 +0530
Subject: [PATCH 39/47] update comments

---
 cpp/src/io/utilities/data_casting.cu | 30 ++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index ee2ba4b2d25..ab2b37ff6a7 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -576,9 +576,11 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         // There is another __syncthreads() at the end of for-loop.
       }
 
+      // String with parsing errors are made as null
       bool error = false;
       if (is_within_bounds) {
-        // instead of '\\', using is_escaping_backslash, and previous index value also here.
+        // curr=='\' and end, or prev=='\' and curr=='u' and end-curr < UNICODE_HEX_DIGIT_COUNT
+        // or prev=='\' and curr=='u' and end-curr >= UNICODE_HEX_DIGIT_COUNT and any non-hex
         error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1);
         error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR);
         error |= (is_prev_escaping_backslash && c == 'u' &&
@@ -586,7 +588,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
                    !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) |
                    !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4])));
       }
-      // propagate error using warp shuffle.
+      // Make sure all threads have no errors before continuing
       if constexpr (is_warp) {
         error = __any_sync(MASK, error);
       } else {
@@ -598,6 +600,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         __syncthreads();
         error = error_reduced;
       }
+      // If any thread has an error, skip the rest of the string and make this string as null
       if (error) {
         if (!d_chars && lane == 0) {
           if (null_mask != nullptr) {
@@ -608,12 +611,15 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
           d_offsets[istring] = 0;
         }
         if constexpr (!is_warp) { __syncthreads(); }
-        break;                        // gride-stride return;
+        break;  // gride-stride return;
       }
+
+      // Skipping non-copied escaped characters
       bool skip = !is_within_bounds;  // false;
+      // skip \ for \" \\ \/ \b \f \n \r \t \uXXXX
       skip |= is_escaping_backslash;
       if (is_within_bounds) {
-        // \uXXXX check for "\u" for each X
+        // skip X for each X in \uXXXX
         skip |= char_index - 2 >= 0 && is_slash.get_bit(warp_id, lane - 2) &&
                 in_begin[char_index - 1] == 'u';
         skip |= char_index - 3 >= 0 && is_slash.get_bit(warp_id, lane - 3) &&
@@ -627,19 +633,21 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       cudf::char_utf8 write_char{};
 
       if (!skip) {
+        // 1. Unescaped character
         if (!is_prev_escaping_backslash) {
           this_num_out = 1;
-          // writes char directly
+          // writes char directly for non-unicode
         } else {
+          // 2. Escaped character
           if (escaped_char != UNICODE_SEQ) {
             this_num_out = 1;
-            // writes char directly
+            // writes char directly for non-unicode
           } else {
-            // Unicode
-            // \uXXXX
+            // 3. Unicode
+            // UTF8 \uXXXX
             auto hex_val     = parse_unicode_hex(in_begin + char_index + 1);
             auto hex_low_val = 0;
-            // \uXXXX\uXXXX
+            // UTF16 \uXXXX\uXXXX
             // Note: no need for scanned_backslash below because we already know that
             // only '\u' check is enough.
             if ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) <
@@ -662,7 +670,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
                 this_num_out = 0;
                 write_char   = 0;
               } else {
-                // if u8
+                // if UTF8
                 write_char   = strings::detail::codepoint_to_utf8(hex_val);
                 this_num_out = strings::detail::bytes_in_char_utf8(write_char);
               }
@@ -671,6 +679,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
         }
       }  // !skip end.
       {
+        // compute offset to write output for each thread
         size_type offset;
         if constexpr (is_warp) {
           using OffsetScan = cub::WarpScan<size_type>;
@@ -683,6 +692,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
           __syncthreads();
         }
         offset += last_offset;
+        // Write output
         if (d_chars && !skip) {
           auto const is_not_unicode = (!is_prev_escaping_backslash) || escaped_char != UNICODE_SEQ;
           if (is_not_unicode) {

From d7bb5ac78737779af48a45fce7fe5338e2fd76c3 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 12 Sep 2023 00:50:56 +0530
Subject: [PATCH 40/47] adjust kernel string limits

---
 cpp/src/io/utilities/data_casting.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index ab2b37ff6a7..6ffa1b444b6 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -43,8 +43,8 @@
 
 namespace cudf::io::json::detail {
 
-constexpr auto SINGLE_THREAD_THRESHOLD = 512;
-constexpr auto WARP_THRESHOLD          = 1024 * 4;
+constexpr auto SINGLE_THREAD_THRESHOLD = 128;
+constexpr auto WARP_THRESHOLD          = 128 * 128;  // 16K
 
 // Unicode code point escape sequence
 static constexpr char UNICODE_SEQ = 0x7F;

From d0c86126eb79cf5be155c6a36f7c93eaf8df3836 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 13 Sep 2023 20:30:38 +0530
Subject: [PATCH 41/47] reorg json type test code

---
 cpp/tests/io/json_type_cast_test.cu | 51 ++++++++++++-----------------
 1 file changed, 21 insertions(+), 30 deletions(-)

diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu
index 74085c7ebbb..a82d97b13dc 100644
--- a/cpp/tests/io/json_type_cast_test.cu
+++ b/cpp/tests/io/json_type_cast_test.cu
@@ -46,6 +46,21 @@ struct offsets_to_length {
     return thrust::get<1>(p) - thrust::get<0>(p);
   }
 };
+
+/// Returns length of each string in the column
+auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream)
+{
+  auto offsets_begin = column.offsets_begin();
+  auto offsets_pair =
+    thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin)));
+  rmm::device_uvector<cudf::size_type> svs_length(column.size(), stream);
+  thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
+                    offsets_pair,
+                    offsets_pair + column.size(),
+                    svs_length.begin(),
+                    offsets_to_length{});
+  return svs_length;
+}
 }  // namespace
 
 auto default_json_options()
@@ -69,16 +84,8 @@ TEST_F(JSONTypeCastTest, String)
   std::vector<char const*> input_values{"this", "is", "null", "of", "", "strings", R"("null")"};
   cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids);
 
-  auto column        = cudf::strings_column_view(input);
-  auto offsets_begin = column.offsets_begin();
-  auto offsets_pair =
-    thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin)));
-  rmm::device_uvector<cudf::size_type> svs_length(column.size(), stream);
-  thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
-                    offsets_pair,
-                    offsets_pair + column.size(),
-                    svs_length.begin(),
-                    offsets_to_length{});
+  auto column                                     = cudf::strings_column_view(input);
+  rmm::device_uvector<cudf::size_type> svs_length = string_offset_to_length(column, stream);
 
   auto null_mask_it = no_nulls();
   auto null_mask =
@@ -110,16 +117,8 @@ TEST_F(JSONTypeCastTest, Int)
   auto const type   = cudf::data_type{cudf::type_id::INT64};
 
   cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"});
-  auto column        = cudf::strings_column_view(data);
-  auto offsets_begin = column.offsets_begin();
-  auto offsets_pair =
-    thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin)));
-  rmm::device_uvector<cudf::size_type> svs_length(column.size(), stream);
-  thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
-                    offsets_pair,
-                    offsets_pair + column.size(),
-                    svs_length.begin(),
-                    offsets_to_length{});
+  auto column                                     = cudf::strings_column_view(data);
+  rmm::device_uvector<cudf::size_type> svs_length = string_offset_to_length(column, stream);
 
   auto null_mask_it = no_nulls();
   auto null_mask =
@@ -158,16 +157,8 @@ TEST_F(JSONTypeCastTest, StringEscapes)
     R"("escape with nothing to escape \")",
     R"("\"\\\/\b\f\n\r\t")",
   });
-  auto column        = cudf::strings_column_view(data);
-  auto offsets_begin = column.offsets_begin();
-  auto offsets_pair =
-    thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin)));
-  rmm::device_uvector<cudf::size_type> svs_length(column.size(), stream);
-  thrust::transform(rmm::exec_policy(cudf::get_default_stream()),
-                    offsets_pair,
-                    offsets_pair + column.size(),
-                    svs_length.begin(),
-                    offsets_to_length{});
+  auto column                                     = cudf::strings_column_view(data);
+  rmm::device_uvector<cudf::size_type> svs_length = string_offset_to_length(column, stream);
 
   auto null_mask_it = no_nulls();
   auto null_mask =

From 56d7fb614d22f610d9107f4151057595908ea0d6 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 14 Sep 2023 02:24:45 +0530
Subject: [PATCH 42/47] add error cases for parse_data

---
 cpp/tests/io/json_type_cast_test.cu | 69 +++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu
index a82d97b13dc..9eb5e8f5230 100644
--- a/cpp/tests/io/json_type_cast_test.cu
+++ b/cpp/tests/io/json_type_cast_test.cu
@@ -33,6 +33,8 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <algorithm>
+#include <iterator>
 #include <type_traits>
 
 using namespace cudf::test::iterators;
@@ -181,4 +183,71 @@ TEST_F(JSONTypeCastTest, StringEscapes)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected);
 }
 
+TEST_F(JSONTypeCastTest, ErrorNulls)
+{
+  auto const stream = cudf::get_default_stream();
+  auto mr           = rmm::mr::get_current_device_resource();
+  auto const type   = cudf::data_type{cudf::type_id::STRING};
+
+  // error in decoding
+  std::vector<char const*> input_values{R"("\"\a")",
+                                        R"("\u")",
+                                        R"("\u0")",
+                                        R"("\u0b")",
+                                        R"("\u00b")",
+                                        R"("\u00bz")",
+                                        R"("\t34567890123456\t9012345678901\ug0bc")",
+                                        R"("\t34567890123456\t90123456789012\u0hbc")",
+                                        R"("\t34567890123456\t90123456789012\u00ic")",
+                                        R"("\t34567890123456\t9012345678901\")",
+                                        R"("\t34567890123456\t90123456789012\")",
+                                        R"(null)"};
+  // Note: without quotes are copied without decoding
+  cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end());
+
+  auto column        = cudf::strings_column_view(input);
+  auto space_length  = 128;
+  auto prepend_space = [&space_length](auto const& s) {
+    if (s[0] == '"') return "\"" + std::string(space_length, ' ') + std::string(s + 1);
+    return std::string(s);
+  };
+  std::vector<std::string> small_input;
+  std::transform(
+    input_values.begin(), input_values.end(), std::back_inserter(small_input), prepend_space);
+  cudf::test::strings_column_wrapper small_col(small_input.begin(), small_input.end());
+
+  std::vector<std::string> large_input;
+  space_length = 128 * 128;
+  std::transform(
+    input_values.begin(), input_values.end(), std::back_inserter(large_input), prepend_space);
+  cudf::test::strings_column_wrapper large_col(large_input.begin(), large_input.end());
+
+  std::vector<char const*> expected_values{"", "", "", "", "", "", "", "", "", "", "", ""};
+  cudf::test::strings_column_wrapper expected(
+    expected_values.begin(), expected_values.end(), cudf::test::iterators::all_nulls());
+
+  // single threads, warp, block.
+  for (auto const& column :
+       {column, cudf::strings_column_view(small_col), cudf::strings_column_view(large_col)}) {
+    rmm::device_uvector<cudf::size_type> svs_length = string_offset_to_length(column, stream);
+
+    auto null_mask_it = no_nulls();
+    auto null_mask =
+      std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size()));
+
+    auto str_col = cudf::io::json::detail::parse_data(
+      column.chars().data<char>(),
+      thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())),
+      column.size(),
+      type,
+      std::move(null_mask),
+      0,
+      default_json_options().view(),
+      stream,
+      mr);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(str_col->view(), expected);
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From d088e8ea8824a356ce9c2e7c39b53671d4c44d2d Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 14 Sep 2023 02:25:12 +0530
Subject: [PATCH 43/47] address review comments (vuule)

---
 cpp/src/io/utilities/data_casting.cu | 2 +-
 cpp/tests/io/json_test.cpp           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 6ffa1b444b6..b259a0f4d4d 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -909,7 +909,7 @@ std::unique_ptr<column> parse_data(
     make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr);
   auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream);
 
-  // use existing code (`ConvertFunctor`) to convert values
+  // use `ConvertFunctor` to convert non-string values
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<size_type>(0),
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 16124f1b5f1..d792520e5e3 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1438,7 +1438,7 @@ TEST_F(JsonReaderTest, JsonLongString)
 
   // Read test data via nested JSON reader
   auto const table = cudf::io::read_json(json_lines_options);
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view());
 }
 
 TEST_F(JsonReaderTest, ErrorStrings)

From 79b4f389eea9ba002c07d82180da328cd1564ccd Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 14 Sep 2023 02:28:22 +0530
Subject: [PATCH 44/47] fix review comments, remove nvtx ranges

---
 cpp/src/io/utilities/data_casting.cu | 2 --
 cpp/tests/io/json_test.cpp           | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index b259a0f4d4d..d09ea408c23 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -802,7 +802,6 @@ std::unique_ptr<column> parse_data(
       size_type{0},
       thrust::maximum<size_type>{});
 
-    nvtxRangePush("string_parallel");
     auto offsets = cudf::make_numeric_column(
       data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
     auto d_offsets = offsets->mutable_view().data<size_type>();
@@ -896,7 +895,6 @@ std::unique_ptr<column> parse_data(
           d_offsets,
           d_chars);
     }
-    nvtxRangePop();
 
     return make_strings_column(col_size,
                                std::move(offsets),
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index d792520e5e3..16124f1b5f1 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1438,7 +1438,7 @@ TEST_F(JsonReaderTest, JsonLongString)
 
   // Read test data via nested JSON reader
   auto const table = cudf::io::read_json(json_lines_options);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view());
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view());
 }
 
 TEST_F(JsonReaderTest, ErrorStrings)

From 403a3741b49e3e8c8814052127c501a4dc17f6ab Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 14 Sep 2023 02:59:21 +0530
Subject: [PATCH 45/47] fix unit test cases nullability

---
 cpp/tests/io/json_test.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 16124f1b5f1..80282871a3c 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1407,7 +1407,8 @@ TEST_F(JsonReaderTest, JsonLongString)
     cudf::test::iterators::nulls_at({10, 11})};
 
   cudf::test::fixed_width_column_wrapper<int16_t> repeat_times{
-    1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3};
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3},
+    cudf::test::iterators::no_nulls()};
   auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times);
   auto col2   = d_col2->view();
   cudf::table_view const tbl_view{{col1, col2, repeat_times}};
@@ -1438,7 +1439,7 @@ TEST_F(JsonReaderTest, JsonLongString)
 
   // Read test data via nested JSON reader
   auto const table = cudf::io::read_json(json_lines_options);
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view());
 }
 
 TEST_F(JsonReaderTest, ErrorStrings)

From 72d23fbba4b625113b8825b8513b98bd75b62845 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 19 Sep 2023 21:07:46 +0530
Subject: [PATCH 46/47] address review comments, split code for string type

---
 cpp/src/io/utilities/data_casting.cu | 279 +++++++++++++++------------
 1 file changed, 155 insertions(+), 124 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index d09ea408c23..f2868f80a23 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -249,7 +249,8 @@ process_string(in_iterator_t in_begin,
     // If this may be a UTF-16 encoded surrogate pair:
     // we expect another \uXXXX sequence
     int32_t hex_low_val = 0;
-    if (thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS &&
+    if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
+        thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS &&
         *in_begin == backslash_char && *thrust::next(in_begin) == 'u') {
       // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low
       // surrogate
@@ -267,10 +268,8 @@ process_string(in_iterator_t in_begin,
                                     (hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
       auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point);
       bytes += write_utf8_char(utf8_chars, d_buffer);
-    }
-
-    // Just a single \uXXXX sequence
-    else {
+    } else {
+      // Just a single \uXXXX sequence
       auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val);
       bytes += write_utf8_char(utf8_chars, d_buffer);
     }
@@ -293,6 +292,8 @@ struct bitfield_warp {
   // 5 because for skipping unicode hex chars, look back up to 5 chars are needed.
   // 5+32 for each warp.
   bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size];
+
+  /// Sets all bits to 0
   __device__ void reset(unsigned warp_id)
   {
     if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) {
@@ -300,6 +301,8 @@ struct bitfield_warp {
     }
     is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0;
   }
+
+  /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits
   __device__ void shift(unsigned warp_id)
   {
     if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK)
@@ -307,12 +310,16 @@ struct bitfield_warp {
         is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size];
     __syncwarp();
   }
+
+  /// Each thread in a warp sets its own bit.
   __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
   {
     is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] =
       is_escaping_backslash;
     __syncwarp();
   }
+
+  /// Each thread in a warp gets the requested bit.
   __device__ bool get_bit(unsigned warp_id, int bit_index)
   {
     return is_slash[warp_id][UNICODE_LOOK_BACK + bit_index];
@@ -332,22 +339,29 @@ struct bitfield_block {
   // 5 + num_warps*32 for entire block
   bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size];
 
+  /// Sets all bits to 0
   __device__ void reset(unsigned warp_id)
   {
     if (threadIdx.x < UNICODE_LOOK_BACK) { is_slash[threadIdx.x] = 0; }
     is_slash[threadIdx.x + UNICODE_LOOK_BACK] = 0;
   }
+
+  /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits
   __device__ void shift(unsigned warp_id)
   {
     if (threadIdx.x < UNICODE_LOOK_BACK)
       is_slash[threadIdx.x] = is_slash[num_warps * cudf::detail::warp_size + threadIdx.x];
     __syncthreads();
   }
+
+  /// Each thread in a block sets its own bit.
   __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash)
   {
     is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash;
     __syncthreads();
   }
+
+  /// Each thread in a block gets the requested bit.
   __device__ bool get_bit(unsigned warp_id, int bit_index)
   {
     return is_slash[UNICODE_LOOK_BACK + bit_index];
@@ -478,8 +492,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       if (!d_chars) {
         if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
       } else {
-        for (size_type char_index = lane; char_index < (in_end - in_begin);
-             char_index += BLOCK_SIZE) {
+        for (size_t char_index = lane; char_index < (in_end - in_begin); char_index += BLOCK_SIZE) {
           d_buffer[char_index] = in_begin[char_index];
         }
       }
@@ -516,7 +529,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     __syncthreads();
     // 0-31, 32-63, ... i*32-n.
     // entire warp executes but with mask.
-    for (size_type char_index = lane;
+    for (size_t char_index = lane;
          char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast<long>(BLOCK_SIZE));
          char_index += BLOCK_SIZE) {
       bool const is_within_bounds = char_index < (in_end - in_begin);
@@ -620,14 +633,14 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       skip |= is_escaping_backslash;
       if (is_within_bounds) {
         // skip X for each X in \uXXXX
-        skip |= char_index - 2 >= 0 && is_slash.get_bit(warp_id, lane - 2) &&
-                in_begin[char_index - 1] == 'u';
-        skip |= char_index - 3 >= 0 && is_slash.get_bit(warp_id, lane - 3) &&
-                in_begin[char_index - 2] == 'u';
-        skip |= char_index - 4 >= 0 && is_slash.get_bit(warp_id, lane - 4) &&
-                in_begin[char_index - 3] == 'u';
-        skip |= char_index - 5 >= 0 && is_slash.get_bit(warp_id, lane - 5) &&
-                in_begin[char_index - 4] == 'u';
+        skip |=
+          char_index >= 2 && is_slash.get_bit(warp_id, lane - 2) && in_begin[char_index - 1] == 'u';
+        skip |=
+          char_index >= 3 && is_slash.get_bit(warp_id, lane - 3) && in_begin[char_index - 2] == 'u';
+        skip |=
+          char_index >= 4 && is_slash.get_bit(warp_id, lane - 4) && in_begin[char_index - 3] == 'u';
+        skip |=
+          char_index >= 5 && is_slash.get_bit(warp_id, lane - 5) && in_begin[char_index - 4] == 'u';
       }
       int this_num_out = 0;
       cudf::char_utf8 write_char{};
@@ -650,7 +663,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
             // UTF16 \uXXXX\uXXXX
             // Note: no need for scanned_backslash below because we already know that
             // only '\u' check is enough.
-            if ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) <
+            if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END &&
+                (in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) <
                   in_end &&
                 in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS - 1] == '\\' &&
                 in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS] == 'u') {
@@ -773,6 +787,123 @@ struct to_string_view_pair {
   }
 };
 
+template <typename string_view_pair_it>
+static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
+                                            size_type col_size,
+                                            rmm::device_buffer&& null_mask,
+                                            rmm::device_scalar<size_type>& d_null_count,
+                                            cudf::io::parse_options_view const& options,
+                                            rmm::cuda_stream_view stream,
+                                            rmm::mr::device_memory_resource* mr)
+{
+  //  CUDF_FUNC_RANGE();
+
+  auto const max_length = thrust::transform_reduce(
+    rmm::exec_policy(stream),
+    str_tuples,
+    str_tuples + col_size,
+    [] __device__(auto t) { return t.second; },
+    size_type{0},
+    thrust::maximum<size_type>{});
+
+  auto offsets = cudf::make_numeric_column(
+    data_type{type_to_id<size_type>()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
+  auto d_offsets       = offsets->mutable_view().data<size_type>();
+  auto null_count_data = d_null_count.data();
+
+  auto single_thread_fn = string_parse<decltype(str_tuples)>{
+    str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_offsets};
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<size_type>(0),
+                     col_size,
+                     single_thread_fn);
+
+  constexpr auto warps_per_block  = 8;
+  constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
+  auto num_blocks                 = cudf::util::div_rounding_up_safe(col_size, warps_per_block);
+  auto str_counter                = cudf::numeric_scalar(size_type{0}, true, stream);
+
+  // TODO run these independent kernels in parallel streams.
+  if (max_length > SINGLE_THREAD_THRESHOLD) {
+    parse_fn_string_parallel<true, warps_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        nullptr);
+  }
+
+  if (max_length > WARP_THRESHOLD) {
+    // for strings longer than WARP_THRESHOLD, 1 block per string
+    str_counter.set_value(0, stream);
+    parse_fn_string_parallel<false, warps_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        nullptr);
+  }
+  auto const bytes =
+    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
+  CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
+               "Size of output exceeds the column size limit",
+               std::overflow_error);
+
+  // CHARS column
+  std::unique_ptr<column> chars =
+    strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
+  auto d_chars = chars->mutable_view().data<char>();
+
+  single_thread_fn.d_chars = d_chars;
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<size_type>(0),
+                     col_size,
+                     single_thread_fn);
+
+  if (max_length > SINGLE_THREAD_THRESHOLD) {
+    str_counter.set_value(0, stream);
+    parse_fn_string_parallel<true, warps_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        d_chars);
+  }
+
+  if (max_length > WARP_THRESHOLD) {
+    str_counter.set_value(0, stream);
+    // for strings longer than WARP_THRESHOLD, 1 block per string
+    parse_fn_string_parallel<false, warps_per_block>
+      <<<num_blocks, threads_per_block, 0, stream.value()>>>(
+        str_tuples,
+        col_size,
+        str_counter.data(),
+        static_cast<bitmask_type*>(null_mask.data()),
+        null_count_data,
+        options,
+        d_offsets,
+        d_chars);
+  }
+
+  return make_strings_column(col_size,
+                             std::move(offsets),
+                             std::move(chars),
+                             d_null_count.value(stream),
+                             std::move(null_mask));
+}
+
 std::unique_ptr<column> parse_data(
   const char* data,
   thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
@@ -794,113 +925,13 @@ std::unique_ptr<column> parse_data(
   auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data});
 
   if (col_type == cudf::data_type{cudf::type_id::STRING}) {
-    auto const max_length = thrust::transform_reduce(
-      rmm::exec_policy(stream),
-      str_tuples,
-      str_tuples + col_size,
-      [] __device__(auto t) { return t.second; },
-      size_type{0},
-      thrust::maximum<size_type>{});
-
-    auto offsets = cudf::make_numeric_column(
-      data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
-    auto d_offsets = offsets->mutable_view().data<size_type>();
-
-    auto single_thread_fn =
-      string_parse<decltype(str_tuples)>{str_tuples,
-                                         static_cast<bitmask_type*>(null_mask.data()),
-                                         null_count_data,
-                                         options,
-                                         d_offsets};
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       col_size,
-                       single_thread_fn);
-
-    constexpr auto warps_per_block  = 8;
-    constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block;
-    auto num_blocks                 = cudf::util::div_rounding_up_safe(col_size, warps_per_block);
-    auto str_counter                = cudf::numeric_scalar(size_type{0}, true, stream);
-
-    // TODO run these independent kernels in parallel streams.
-    if (max_length > SINGLE_THREAD_THRESHOLD) {
-      parse_fn_string_parallel<true, warps_per_block>
-        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
-          str_tuples,
-          col_size,
-          str_counter.data(),
-          static_cast<bitmask_type*>(null_mask.data()),
-          null_count_data,
-          options,
-          d_offsets,
-          nullptr);
-    }
-
-    if (max_length > WARP_THRESHOLD) {
-      // for strings longer than WARP_THRESHOLD, 1 block per string
-      str_counter.set_value(0, stream);
-      parse_fn_string_parallel<false, warps_per_block>
-        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
-          str_tuples,
-          col_size,
-          str_counter.data(),
-          static_cast<bitmask_type*>(null_mask.data()),
-          null_count_data,
-          options,
-          d_offsets,
-          nullptr);
-    }
-    auto const bytes =
-      cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
-    CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
-                 "Size of output exceeds the column size limit",
-                 std::overflow_error);
-
-    // CHARS column
-    std::unique_ptr<column> chars =
-      strings::detail::create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
-    auto d_chars = chars->mutable_view().data<char>();
-
-    single_thread_fn.d_chars = d_chars;
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       col_size,
-                       single_thread_fn);
-
-    if (max_length > SINGLE_THREAD_THRESHOLD) {
-      str_counter.set_value(0, stream);
-      parse_fn_string_parallel<true, warps_per_block>
-        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
-          str_tuples,
-          col_size,
-          str_counter.data(),
-          static_cast<bitmask_type*>(null_mask.data()),
-          null_count_data,
-          options,
-          d_offsets,
-          d_chars);
-    }
-
-    if (max_length > WARP_THRESHOLD) {
-      str_counter.set_value(0, stream);
-      // for strings longer than WARP_THRESHOLD, 1 block per string
-      parse_fn_string_parallel<false, warps_per_block>
-        <<<num_blocks, threads_per_block, 0, stream.value()>>>(
-          str_tuples,
-          col_size,
-          str_counter.data(),
-          static_cast<bitmask_type*>(null_mask.data()),
-          null_count_data,
-          options,
-          d_offsets,
-          d_chars);
-    }
-
-    return make_strings_column(col_size,
-                               std::move(offsets),
-                               std::move(chars),
-                               d_null_count.value(stream),
-                               std::move(null_mask));
+    return parse_string(str_tuples,
+                        col_size,
+                        std::forward<rmm::device_buffer>(null_mask),
+                        d_null_count,
+                        options,
+                        stream,
+                        mr);
   }
 
   auto out_col =

From d0a5e23956a94d4e9a2e1f2a01ab7be0226c8c4f Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 19 Sep 2023 21:51:59 +0530
Subject: [PATCH 47/47] add comments, style fix

---
 cpp/src/io/utilities/data_casting.cu | 10 +++++++---
 cpp/tests/io/json_test.cpp           |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index f2868f80a23..1772e5e43fa 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -492,7 +492,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       if (!d_chars) {
         if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
       } else {
-        for (size_t char_index = lane; char_index < (in_end - in_begin); char_index += BLOCK_SIZE) {
+        for (thread_index_type char_index = lane; char_index < (in_end - in_begin);
+             char_index += BLOCK_SIZE) {
           d_buffer[char_index] = in_begin[char_index];
         }
       }
@@ -529,7 +530,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
     __syncthreads();
     // 0-31, 32-63, ... i*32-n.
     // entire warp executes but with mask.
-    for (size_t char_index = lane;
+    for (thread_index_type char_index = lane;
          char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast<long>(BLOCK_SIZE));
          char_index += BLOCK_SIZE) {
       bool const is_within_bounds = char_index < (in_end - in_begin);
@@ -679,7 +680,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
               write_char   = strings::detail::codepoint_to_utf8(unicode_code_point);
               this_num_out = strings::detail::bytes_in_char_utf8(write_char);
             } else {
+              // if hex_val is high surrogate, ideally it should be parsing failure.
+              // but skipping it as other parsers do this too.
               if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) {
+                // Ideally this should be skipped if previous char is high surrogate.
                 skip         = true;
                 this_num_out = 0;
                 write_char   = 0;
@@ -726,7 +730,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples,
       }
     }  // char for-loop
     if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
-  }    // grid-stride for-loop
+  }  // grid-stride for-loop
 }
 
 template <typename str_tuple_it>
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 80282871a3c..7c911ac2e04 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1388,8 +1388,8 @@ TEST_F(JsonReaderTest, JsonLongString)
       "$€",
       "ராபிட்ஸ்",
       "C𝞵𝓓𝒻",
-      "",                                                // null
-      "",                                                // null
+      "",  // null
+      "",  // null
       "கார்த்தி",
       "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ",  //  0000-FFFF
       "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰",                            // 10000-1FFFF