From 91742afb2e25d31610846a69b77b4f64ae6fd07c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 2 Aug 2023 22:21:10 +0530 Subject: [PATCH 01/47] warp per string parsing of string columns (unicode) --- cpp/include/cudf/io/detail/data_casting.cuh | 361 +++++++++++++++++++- 1 file changed, 353 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index d764e8533c6..98c06ee09eb 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include @@ -21,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -295,6 +298,287 @@ process_string(in_iterator_t in_begin, return {bytes, data_casting_result::PARSING_SUCCESS}; } +// 1 warp per string. +template +__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) +{ + int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int global_warp_id = global_thread_id / cudf::detail::warp_size; + int warp_lane = global_thread_id % cudf::detail::warp_size; + int nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; + // TODO alignment - aligned access possible? + + // grid-stride loop. + for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { + if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { + if (!d_chars) d_offsets[istring] = 0; + continue; // gride-stride return; + } + + auto in_begin = str_tuples[istring].first; + auto in_end = in_begin + str_tuples[istring].second; + auto const num_in_chars = str_tuples[istring].second; + + // Check if the value corresponds to the null literal + auto const is_null_literal = + (!d_chars) && + serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + if (warp_lane == 0) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[istring] = 0; + } + continue; // gride-stride return; + } + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); + char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr; + + // Copy literal/numeric value + if (not is_string_value) { + if (!d_chars) { + if (warp_lane == 0) { d_offsets[istring] = in_end - in_begin; } + } else { + for (size_type char_index = warp_lane; char_index < (in_end - in_begin); + char_index += cudf::detail::warp_size) { + d_buffer[char_index] = in_begin[char_index]; + } + } + continue; // gride-stride return; + } + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + // auto str_process_info = process_string(in_begin, in_end, d_buffer, options); + + // \uXXXX 6->2/3/4 + // \uXXXX\uXXXX 12->2/3/4 + // \" 2->1 + // _ 1->1 + // + // error conditions. (propagate) + // c=='\' & curr_idx == end_idx-1; ERROR + // [c-1]=='\' & get_escape[c]==NEC + // [c-1]=='\' & [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT + // [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex + + // skip conditions. (scan for size) + // c=='\' skip. + // [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip. + + // write conditions. (write to d_buffer) + // [c-1]!='\' & [c]!='\' write [c] + // [c-1]!='\' & [c]=='\' skip (unnecessary? already covered? in skip conditions) + // [c-1]=='\' & [c]!=NEC && [c]!=UNICODE_SEQ, write [c] + // [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE + // [c+1:4]=curr_hex_val + // // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && + // hex,DECODE [c+7:4]=next_hex_val + // // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && + // hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val + // // if prev_hex_val in high, curr_hex_val in low, skip. + // // if curr_hex_val in high, next_hex_val in low, write u16. + // if curr_hex_val not in high, write u8. + // before writing, find size, then intra-warp scan for out_idx + // propagate offset from 32nd thread to others in warp to carry forward. + auto is_hex = [](auto ch) { + return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); + }; + bool init_state{false}; // for backslash scan calculation + auto last_offset = 0; + // 0-31, 32-63, ... i*32-n. + for (size_type char_index = warp_lane; char_index < (in_end - in_begin); + char_index += cudf::detail::warp_size) { + auto c = in_begin[char_index]; + auto prev_c = char_index > 0 ? in_begin[char_index - 1] : 'a'; + auto escaped_char = get_escape_char(c); + bool error = false; + // FIXME: \\ at end is a problem here. + // \uXXXXe e-u=5 4<=4 + // 012345 + error |= (c == '\\' && char_index == (in_end - in_begin) - 1); + error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); + error |= (prev_c == '\\' && c == 'u' && + // TODO check if following condition is right or off by one error. + ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | + // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) | + !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | + !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); + // propagate error using warp shuffle. + error = __any_sync(0xffffffff, error); + if (error) { + if (warp_lane == 0) { + if (null_mask != nullptr) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + } + last_offset = 0; + if (!d_chars) d_offsets[istring] = 0; + } + break; // return to grid-stride loop for next string. + } + // TODO one more error condition of second \uXXXX is not hex. + bool skip = false; + // TODO FIXME: continue slashes are a problem! + // skip |= (prev_c != '\\') && (c=='\\'); // skip '\' + // corner case \\uXXXX TODO + // skip XXXX in \uXXXX + skip |= + char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && in_begin[char_index - 1] == 'u'; + skip |= + char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' && in_begin[char_index - 2] == 'u'; + skip |= + char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && in_begin[char_index - 3] == 'u'; + skip |= + char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && in_begin[char_index - 4] == 'u'; + int this_num_out = 0; + cudf::char_utf8 write_char{'a'}; + + // auto prev = 0; // carry for is_escape_slash + // if prev == escape_slash, then this is escaped_char, so copy. + // then this is not escaped_slash regardless of c. + // if prev != escape_slash, then curr == '\' is escaped_slash,. + // curr_escape_slash = prev==true then 0, if prev=false & c=='\' + // curr = !prev && c=='\' + // 0 & * + // 1 '\'? + // c=='\' + // inclusivesum of custom operator. + // check if any c=='\' in warp, if no, then prev=false for all. + // else do the scan. + + // curr = !prev & c=='\\' + // !curr = !(!prev & c=='\\') + // !curr = prev | c!='\\' is it associative? NO + + // not associative! + // curr[0] curr[1] curr[2]. op = !prev & c; + // op = !a & b; + // op( op(x, y), z) = op(!x&y, z) = (!(!x&y))&z = (x | !y)&z = xz | (!y)z + // op(x, op(y, z)) =op(x, !y&z) = !x&(!y &z) = (!x)&(!y)&z + auto warp_id = threadIdx.x / 32; + + // problem is when there is continuous \\\\\\\\\\\\\ we don't know which one is escaping + // backslash. + + struct state_table { + bool state[2]; + }; + // using state_table = bool[2]; Try this. and see if compiler errors + __shared__ typename cub::WarpScan::TempStorage temp_slash[4]; + state_table curr{c == '\\', false}; // state transition vector. + auto composite_op = [](state_table op1, state_table op2) { + return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; + }; + state_table scanned; + // inclusive scan? how? + cub::WarpScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); + auto is_escaping_backslash = scanned.state[init_state]; + // init_state = __shfl_sync(0xffffffff, is_escaping_backslash, 31); + auto last_active_lane = 31 - __clz(__activemask()); // TODO simplify 0xFF case? + init_state = __shfl_sync(0xffffffff, is_escaping_backslash, last_active_lane); + // TODO replace/add prev_c with proper scan of escapes + skip |= is_escaping_backslash; + + if (!skip) { + // is prev_is_not backslash? + if (prev_c != '\\') { // FIXME: enable this after debugging. + // if (true) { + this_num_out = 1; + if (d_chars) write_char = c; + // d_buffer[last_offset+ this_num_out_scaned] = c; + } else { + // already taken care early. + // if (escaped_char == NON_ESCAPE_CHAR) { + // this_num_out = 0; + // error = true; + // } else + if (escaped_char != UNICODE_SEQ) { + this_num_out = 1; + // if(d_chars) + write_char = escaped_char; + // d_buffer[last_offset+ this_num_out_scaned] = escaped_char; + } else { + // \uXXXX- u + // Unicode + auto hex_val = parse_unicode_hex(in_begin + char_index + 1); + auto hex_low_val = 0; +#if 1 + // if next is \uXXXX + // in_begin + char_index + // 01234567890 + //\uXXXX\uXXXX + // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 && + if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' && + in_begin[char_index + 1 + 5] == 'u') { + hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); + } + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + write_char = strings::detail::codepoint_to_utf8(unicode_code_point); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + // this_num_out = 0; skip=true; + } else { + // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6); + if ( + // hex_high_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_high_val < + // UTF16_HIGH_SURROGATE_END && + hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { + skip = true; + this_num_out = 0; + write_char = 0; + } else { + // if u8 + write_char = strings::detail::codepoint_to_utf8(hex_val); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } + } +#endif + } + } + } // !skip end. + { + // TODO think about writing error conditions as normal, so that program flow is easy to read + // and can process error here. + // WRITE now (compute out_idx offset then write) + // intra-warp scan of this_num_out. + // TODO union to save shared memory + __shared__ cub::WarpScan::TempStorage temp_storage[4]; + size_type offset; + cub::WarpScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); + offset += last_offset; + // TODO add last active lane this_num_out for correct last_offset. + if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); } + __shared__ cub::WarpReduce::TempStorage temp_storage2[4]; + last_offset += cub::WarpReduce(temp_storage2[warp_id]).Sum(this_num_out); + last_offset = __shfl_sync(0xffffffff, last_offset, 0); + // offset += this_num_out; + // auto last_active_lane = __ffs(__brev(__activemask())); // TODO simplify 0xFF case? + // last_offset = __shfl_sync(0xffffffff, offset, 31-last_active_lane); // TODO is mask + // right? + } + } // char for-loop + if (!d_chars && warp_lane == 0) { d_offsets[istring] = last_offset; } + } // grid-stride for-loop +} + template struct string_parse { str_tuple_it str_tuples; @@ -365,20 +649,81 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto d_null_count = rmm::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); + auto d_null_count2 = rmm::device_scalar(null_count, stream); + auto null_count_data2 = d_null_count2.data(); + if (col_type == cudf::data_type{cudf::type_id::STRING}) { // this utility calls the functor to build the offsets and chars columns; // the bitmask and null count may be updated by parse failures - auto [offsets, chars] = cudf::strings::detail::make_strings_children( - string_parse{ - str_tuples, static_cast(null_mask.data()), null_count_data, options}, + nvtxRangePush("make_strings_children"); + // auto [offsets, chars] = cudf::strings::detail::make_strings_children( + // string_parse{ + // str_tuples, static_cast(null_mask.data()), null_count_data, options}, + // col_size, + // stream, + // mr); + nvtxRangePop(); + + // { + nvtxRangePush("string_parallel"); + auto offsets2 = cudf::make_numeric_column( + data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets2->mutable_view().data(); + parse_fn_string_parallel<<>>( + str_tuples, + col_size, + static_cast(null_mask.data()), + null_count_data2, + options, + d_offsets, + nullptr); + // if (0) { + // auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span(d_offsets, + // offsets2->size()), stream); for(auto i: h_offsets2) std::cout< chars2 = + strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); + auto d_chars2 = chars2->mutable_view().data(); + cudaMemsetAsync(d_chars2, 'c', bytes, stream.value()); + + parse_fn_string_parallel<<>>( + str_tuples, col_size, - stream, - mr); + static_cast(null_mask.data()), + null_count_data2, + options, + d_offsets, + d_chars2); + + // if(bytes!=chars->size()) { + // std::cout<<"new bytes="<(d_offsets, + // offsets2->size()), stream); for(auto i: h_offsets2) std::cout<(d_chars2, bytes), + // stream); for(auto i: h_chars2) std::cout<size()) { + // std::cout<<"old bytes="<size()<mutable_view()) + // .template data(); + // auto h_offsets = cudf::detail::make_std_vector_sync(device_span(d_offsetsa, + // offsets->size()), stream); for(auto i: h_offsets) std::cout<mutable_view().template data(); + // auto h_chars = cudf::detail::make_std_vector_sync(device_span(d_chars, + // chars->size()), stream); for(auto i: h_chars) std::cout< Date: Wed, 2 Aug 2023 22:22:26 +0530 Subject: [PATCH 02/47] remove dependency of data_casting.cuh in write_json.cu --- cpp/src/io/json/write_json.cu | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 9ecf77a798a..48a95cb3ec2 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -27,9 +27,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -61,6 +61,21 @@ #include namespace cudf::io::json::detail { +__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return {'\\', '"'}; + case '\\': return {'\\', '\\'}; + case '/': return {'\\', '/'}; + case '\b': return {'\\', 'b'}; + case '\f': return {'\\', 'f'}; + case '\n': return {'\\', 'n'}; + case '\r': return {'\\', 'r'}; + case '\t': return {'\\', 't'}; + // case 'u': return UNICODE_SEQ; + default: return {'\0', escaped_char}; + } +} std::unique_ptr make_column_names_column(host_span column_names, size_type num_columns, @@ -148,7 +163,7 @@ struct escape_strings_fn { } continue; } - auto escaped_chars = cudf::io::json::experimental::detail::get_escaped_char(utf8_char); + auto escaped_chars = get_escaped_char(utf8_char); if (escaped_chars.first == '\0') { write_char(escaped_chars.second, d_buffer, bytes); } else { From 318b4a3e7d2d2f421e62477d8667af714dbeafac Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 3 Aug 2023 00:41:19 +0530 Subject: [PATCH 03/47] cleanup --- cpp/include/cudf/io/detail/data_casting.cuh | 119 +++++++++++--------- 1 file changed, 64 insertions(+), 55 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 98c06ee09eb..bb58c79e7a3 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -298,6 +298,39 @@ process_string(in_iterator_t in_begin, return {bytes, data_casting_result::PARSING_SUCCESS}; } +// 1 warp per string. +// algorithm + +// \uXXXX 6->2/3/4 +// \uXXXX\uXXXX 12->2/3/4 +// \" 2->1 +// _ 1->1 +// +// error conditions. (propagate) +// c=='\' & curr_idx == end_idx-1; ERROR +// [c-1]=='\' & get_escape[c]==NEC +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex + +// skip conditions. (scan for size) +// c=='\' skip. +// [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip. + +// write conditions. (write to d_buffer) +// [c-1]!='\' & [c]!='\' write [c] +// [c-1]!='\' & [c]=='\' skip (unnecessary? already covered? in skip conditions) +// [c-1]=='\' & [c]!=NEC && [c]!=UNICODE_SEQ, write [c] +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE +// [c+1:4]=curr_hex_val +// // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c+7:4]=next_hex_val +// // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val +// // if prev_hex_val in high, curr_hex_val in low, skip. +// // if curr_hex_val in high, next_hex_val in low, write u16. +// if curr_hex_val not in high, write u8. +// before writing, find size, then intra-warp scan for out_idx +// propagate offset from 32nd thread to others in warp to carry forward. // 1 warp per string. template __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, @@ -364,36 +397,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, } // auto str_process_info = process_string(in_begin, in_end, d_buffer, options); - // \uXXXX 6->2/3/4 - // \uXXXX\uXXXX 12->2/3/4 - // \" 2->1 - // _ 1->1 - // - // error conditions. (propagate) - // c=='\' & curr_idx == end_idx-1; ERROR - // [c-1]=='\' & get_escape[c]==NEC - // [c-1]=='\' & [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT - // [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex - - // skip conditions. (scan for size) - // c=='\' skip. - // [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip. - - // write conditions. (write to d_buffer) - // [c-1]!='\' & [c]!='\' write [c] - // [c-1]!='\' & [c]=='\' skip (unnecessary? already covered? in skip conditions) - // [c-1]=='\' & [c]!=NEC && [c]!=UNICODE_SEQ, write [c] - // [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE - // [c+1:4]=curr_hex_val - // // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && - // hex,DECODE [c+7:4]=next_hex_val - // // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && - // hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val - // // if prev_hex_val in high, curr_hex_val in low, skip. - // // if curr_hex_val in high, next_hex_val in low, write u16. - // if curr_hex_val not in high, write u8. - // before writing, find size, then intra-warp scan for out_idx - // propagate offset from 32nd thread to others in warp to carry forward. auto is_hex = [](auto ch) { return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); }; @@ -622,6 +625,16 @@ struct string_parse { } } }; + +template +void print_raw(T const* ptr, size_type size, rmm::cuda_stream_view stream) +{ + auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span(ptr, size), stream); + for (auto i : h_offsets2) + std::cout << i << ","; + std::cout << std::endl; +} + /** * @brief Parses the data from an iterator of string views, casting it to the given target data type * @@ -655,14 +668,20 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, if (col_type == cudf::data_type{cudf::type_id::STRING}) { // this utility calls the functor to build the offsets and chars columns; // the bitmask and null count may be updated by parse failures + +#define WARP_PARALLEL +#ifndef WARP_PARALLEL nvtxRangePush("make_strings_children"); - // auto [offsets, chars] = cudf::strings::detail::make_strings_children( - // string_parse{ - // str_tuples, static_cast(null_mask.data()), null_count_data, options}, - // col_size, - // stream, - // mr); + auto [offsets, chars] = cudf::strings::detail::make_strings_children( + string_parse{ + str_tuples, static_cast(null_mask.data()), null_count_data, options}, + col_size, + stream, + mr); + auto& offsets2 = offsets; + auto& chars2 = chars; nvtxRangePop(); +#else // { nvtxRangePush("string_parallel"); @@ -677,10 +696,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, options, d_offsets, nullptr); - // if (0) { - // auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span(d_offsets, - // offsets2->size()), stream); for(auto i: h_offsets2) std::cout<size(), stream); auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); @@ -700,25 +716,18 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, d_chars2); // if(bytes!=chars->size()) { - // std::cout<<"new bytes="<(d_offsets, - // offsets2->size()), stream); for(auto i: h_offsets2) std::cout<(d_chars2, bytes), - // stream); for(auto i: h_chars2) std::cout<mutable_view().template data(); - // auto h_chars = cudf::detail::make_std_vector_sync(device_span(d_chars, - // chars->size()), stream); for(auto i: h_chars) std::cout< parse_data(str_tuple_it str_tuples, auto offsets2 = cudf::make_numeric_column( data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets2->mutable_view().data(); - parse_fn_string_parallel<<>>( + + int max_blocks = 0; + constexpr auto warps_per_block = 8; + int threads_per_block = cudf::detail::warp_size * warps_per_block; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, parse_fn_string_parallel, threads_per_block, 0)); + + int device = 0; + CUDF_CUDA_TRY(cudaGetDevice(&device)); + int num_sms = 0; + CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device)); + auto num_blocks = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1)); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + + parse_fn_string_parallel<<>>( str_tuples, col_size, + str_counter.data(), static_cast(null_mask.data()), null_count_data2, options, @@ -699,6 +725,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, // print_raw(d_offsets, offsets2->size(), stream); auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); + str_counter.set_value(0, stream); // CHARS column std::unique_ptr chars2 = @@ -706,9 +733,10 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto d_chars2 = chars2->mutable_view().data(); cudaMemsetAsync(d_chars2, 'c', bytes, stream.value()); - parse_fn_string_parallel<<>>( + parse_fn_string_parallel<<>>( str_tuples, col_size, + str_counter.data(), static_cast(null_mask.data()), null_count_data2, options, From 58e0d6c502dccd214ccec35ccc91cee7d6433d9c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 7 Aug 2023 17:40:04 +0530 Subject: [PATCH 05/47] fix intra-warp divergence issue with cub::WarpScan stuck --- cpp/include/cudf/io/detail/data_casting.cuh | 31 +++++++++++++-------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 9977c69a441..7e46e311dde 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -415,8 +415,13 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, bool init_state{false}; // for backslash scan calculation auto last_offset = 0; // 0-31, 32-63, ... i*32-n. - for (size_type char_index = warp_lane; char_index < (in_end - in_begin); + // condition as __ballot_sync(0xffffffff, char_index < (in_end - in_begin)) != 0 + // to allow entire warp execute but with mask. + auto MASK = 0xffffffff; + for (size_type char_index = warp_lane; (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0; + // char_index<(in_end - in_begin); char_index += cudf::detail::warp_size) { + bool is_within_bounds = char_index < (in_end - in_begin); auto c = in_begin[char_index]; auto prev_c = char_index > 0 ? in_begin[char_index - 1] : 'a'; auto escaped_char = get_escape_char(c); @@ -424,6 +429,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // FIXME: \\ at end is a problem here. // \uXXXXe e-u=5 4<=4 // 012345 + if(is_within_bounds) { error |= (c == '\\' && char_index == (in_end - in_begin) - 1); error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); error |= (prev_c == '\\' && c == 'u' && @@ -432,8 +438,9 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) | !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); + } // propagate error using warp shuffle. - error = __any_sync(0xffffffff, error); + error = __any_sync(MASK, error); if (error) { if (warp_lane == 0) { if (null_mask != nullptr) { @@ -446,11 +453,12 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, break; // return to grid-stride loop for next string. } // TODO one more error condition of second \uXXXX is not hex. - bool skip = false; + bool skip = !is_within_bounds; //false; // TODO FIXME: continue slashes are a problem! // skip |= (prev_c != '\\') && (c=='\\'); // skip '\' // corner case \\uXXXX TODO // skip XXXX in \uXXXX + if (is_within_bounds) { skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && in_begin[char_index - 1] == 'u'; skip |= @@ -459,6 +467,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && in_begin[char_index - 3] == 'u'; skip |= char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && in_begin[char_index - 4] == 'u'; + } int this_num_out = 0; cudf::char_utf8 write_char{'a'}; @@ -484,7 +493,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // op = !a & b; // op( op(x, y), z) = op(!x&y, z) = (!(!x&y))&z = (x | !y)&z = xz | (!y)z // op(x, op(y, z)) =op(x, !y&z) = !x&(!y &z) = (!x)&(!y)&z - auto warp_id = threadIdx.x / 32; // problem is when there is continuous \\\\\\\\\\\\\ we don't know which one is escaping // backslash. @@ -493,18 +501,19 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, bool state[2]; }; // using state_table = bool[2]; Try this. and see if compiler errors - __shared__ typename cub::WarpScan::TempStorage temp_slash[4]; - state_table curr{c == '\\', false}; // state transition vector. + __shared__ typename cub::WarpScan::TempStorage temp_slash[8]; + state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. auto composite_op = [](state_table op1, state_table op2) { return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; }; state_table scanned; - // inclusive scan? how? + auto warp_id = threadIdx.x / 32; + // inclusive scan. TODO both inclusive and exclusive available in cub. cub::WarpScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); auto is_escaping_backslash = scanned.state[init_state]; // init_state = __shfl_sync(0xffffffff, is_escaping_backslash, 31); - auto last_active_lane = 31 - __clz(__activemask()); // TODO simplify 0xFF case? - init_state = __shfl_sync(0xffffffff, is_escaping_backslash, last_active_lane); + auto last_active_lane = 31 - __clz(MASK); // TODO simplify 0xFF case? + init_state = __shfl_sync(MASK, is_escaping_backslash, last_active_lane); // TODO replace/add prev_c with proper scan of escapes skip |= is_escaping_backslash; @@ -531,7 +540,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // Unicode auto hex_val = parse_unicode_hex(in_begin + char_index + 1); auto hex_low_val = 0; -#if 1 // if next is \uXXXX // in_begin + char_index // 01234567890 @@ -565,7 +573,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, this_num_out = strings::detail::bytes_in_char_utf8(write_char); } } -#endif } } } // !skip end. @@ -575,7 +582,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // WRITE now (compute out_idx offset then write) // intra-warp scan of this_num_out. // TODO union to save shared memory - __shared__ cub::WarpScan::TempStorage temp_storage[4]; + __shared__ cub::WarpScan::TempStorage temp_storage[8]; size_type offset; cub::WarpScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); offset += last_offset; From c0edf8f8d3d38d4ca0711a2aea5d23de5dbefcc6 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 7 Aug 2023 17:55:16 +0530 Subject: [PATCH 06/47] remove unnecessary WarpReduce, reduce shmem usage --- cpp/include/cudf/io/detail/data_casting.cuh | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 7e46e311dde..48bf540f455 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -369,6 +369,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, auto in_begin = str_tuples[istring].first; auto in_end = in_begin + str_tuples[istring].second; auto const num_in_chars = str_tuples[istring].second; + // if(!(num_in_chars >= 1024)) continue; // Check if the value corresponds to the null literal auto const is_null_literal = @@ -588,13 +589,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, offset += last_offset; // TODO add last active lane this_num_out for correct last_offset. if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); } - __shared__ cub::WarpReduce::TempStorage temp_storage2[4]; - last_offset += cub::WarpReduce(temp_storage2[warp_id]).Sum(this_num_out); - last_offset = __shfl_sync(0xffffffff, last_offset, 0); - // offset += this_num_out; - // auto last_active_lane = __ffs(__brev(__activemask())); // TODO simplify 0xFF case? - // last_offset = __shfl_sync(0xffffffff, offset, 31-last_active_lane); // TODO is mask - // right? + offset += this_num_out; + last_offset = __shfl_sync(0xffffffff, offset, 31); } } // char for-loop if (!d_chars && warp_lane == 0) { d_offsets[istring] = last_offset; } From 086dfa9de1ad658386b103c6e0ced37f64f90802 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 7 Aug 2023 18:08:04 +0530 Subject: [PATCH 07/47] cleanup comments, unused code --- cpp/include/cudf/io/detail/data_casting.cuh | 52 +++++++-------------- 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 48bf540f455..3b06ad08724 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -348,7 +348,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // int nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; // TODO alignment - aligned access possible? - // grid-stride loop. + // get 1-string index per warp auto warp_inc_count = [&]() { size_type istring=0; if(warp_lane==0) { @@ -357,6 +357,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, __syncwarp(); return __shfl_sync(0xffffffff, istring, 0); }; + // grid-stride loop. // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { for (size_type istring = warp_inc_count(); istring < total_out_strings; istring = warp_inc_count()) { // if (!d_chars) @@ -416,13 +417,11 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, bool init_state{false}; // for backslash scan calculation auto last_offset = 0; // 0-31, 32-63, ... i*32-n. - // condition as __ballot_sync(0xffffffff, char_index < (in_end - in_begin)) != 0 - // to allow entire warp execute but with mask. + // entire warp executes but with mask. auto MASK = 0xffffffff; for (size_type char_index = warp_lane; (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0; - // char_index<(in_end - in_begin); char_index += cudf::detail::warp_size) { - bool is_within_bounds = char_index < (in_end - in_begin); + bool is_within_bounds = char_index < (in_end - in_begin); //TODO more conditions below to avoid out-of-bound memory access. auto c = in_begin[char_index]; auto prev_c = char_index > 0 ? in_begin[char_index - 1] : 'a'; auto escaped_char = get_escape_char(c); @@ -472,31 +471,16 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, int this_num_out = 0; cudf::char_utf8 write_char{'a'}; - // auto prev = 0; // carry for is_escape_slash - // if prev == escape_slash, then this is escaped_char, so copy. - // then this is not escaped_slash regardless of c. - // if prev != escape_slash, then curr == '\' is escaped_slash,. - // curr_escape_slash = prev==true then 0, if prev=false & c=='\' - // curr = !prev && c=='\' - // 0 & * - // 1 '\'? - // c=='\' - // inclusivesum of custom operator. - // check if any c=='\' in warp, if no, then prev=false for all. - // else do the scan. - + // To check current is backslash by checking if previous is backslash. // curr = !prev & c=='\\' - // !curr = !(!prev & c=='\\') - // !curr = prev | c!='\\' is it associative? NO - - // not associative! - // curr[0] curr[1] curr[2]. op = !prev & c; - // op = !a & b; - // op( op(x, y), z) = op(!x&y, z) = (!(!x&y))&z = (x | !y)&z = xz | (!y)z - // op(x, op(y, z)) =op(x, !y&z) = !x&(!y &z) = (!x)&(!y)&z - - // problem is when there is continuous \\\\\\\\\\\\\ we don't know which one is escaping - // backslash. + // So, scan is required from beginning of string. + // State table approach (intra-warp FST) + // 2 states: Not-Slash(NS), Slash(S). + // prev / * + // NS S NS + // S NS NS + // After inclusive scan, all current S states translate to escaping backslash. + // All escaping backslash should be skipped. struct state_table { bool state[2]; @@ -512,8 +496,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // inclusive scan. TODO both inclusive and exclusive available in cub. cub::WarpScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); auto is_escaping_backslash = scanned.state[init_state]; - // init_state = __shfl_sync(0xffffffff, is_escaping_backslash, 31); - auto last_active_lane = 31 - __clz(MASK); // TODO simplify 0xFF case? + auto last_active_lane = 31 - __clz(MASK); // TODO simplify 0xffffffff case? init_state = __shfl_sync(MASK, is_escaping_backslash, last_active_lane); // TODO replace/add prev_c with proper scan of escapes skip |= is_escaping_backslash; @@ -524,7 +507,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // if (true) { this_num_out = 1; if (d_chars) write_char = c; - // d_buffer[last_offset+ this_num_out_scaned] = c; + //FIXME: can you skip write like this for string_size count at other places? } else { // already taken care early. // if (escaped_char == NON_ESCAPE_CHAR) { @@ -533,9 +516,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // } else if (escaped_char != UNICODE_SEQ) { this_num_out = 1; - // if(d_chars) write_char = escaped_char; - // d_buffer[last_offset+ this_num_out_scaned] = escaped_char; } else { // \uXXXX- u // Unicode @@ -544,7 +525,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // if next is \uXXXX // in_begin + char_index // 01234567890 - //\uXXXX\uXXXX + //\uXXXX\uXXXX TODO cleanup these conditions. // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 && if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' && in_begin[char_index + 1 + 5] == 'u') { @@ -558,7 +539,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); write_char = strings::detail::codepoint_to_utf8(unicode_code_point); this_num_out = strings::detail::bytes_in_char_utf8(write_char); - // this_num_out = 0; skip=true; } else { // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6); if ( From 0aa2c0e0c674c7633027e5c4022655a6edb37c3f Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 11 Aug 2023 20:38:45 +0530 Subject: [PATCH 08/47] add block per string algorithm --- cpp/include/cudf/io/detail/data_casting.cuh | 356 ++++++++++++++++++-- 1 file changed, 320 insertions(+), 36 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 3b06ad08724..855f226a0b0 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -350,16 +350,15 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // get 1-string index per warp auto warp_inc_count = [&]() { - size_type istring=0; - if(warp_lane==0) { - istring = atomicAdd(str_counter, 1); - } + size_type istring = 0; + if (warp_lane == 0) { istring = atomicAdd(str_counter, 1); } __syncwarp(); return __shfl_sync(0xffffffff, istring, 0); }; // grid-stride loop. // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { - for (size_type istring = warp_inc_count(); istring < total_out_strings; istring = warp_inc_count()) { + for (size_type istring = warp_inc_count(); istring < total_out_strings; + istring = warp_inc_count()) { // if (!d_chars) // printf("%d:%d<%d\n", global_thread_id, istring, total_out_strings); if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { @@ -370,7 +369,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, auto in_begin = str_tuples[istring].first; auto in_end = in_begin + str_tuples[istring].second; auto const num_in_chars = str_tuples[istring].second; - // if(!(num_in_chars >= 1024)) continue; + if (num_in_chars >= 1024) continue; // Check if the value corresponds to the null literal auto const is_null_literal = @@ -419,25 +418,28 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // 0-31, 32-63, ... i*32-n. // entire warp executes but with mask. auto MASK = 0xffffffff; - for (size_type char_index = warp_lane; (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0; + for (size_type char_index = warp_lane; + (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0; char_index += cudf::detail::warp_size) { - bool is_within_bounds = char_index < (in_end - in_begin); //TODO more conditions below to avoid out-of-bound memory access. - auto c = in_begin[char_index]; - auto prev_c = char_index > 0 ? in_begin[char_index - 1] : 'a'; + bool is_within_bounds = + char_index < + (in_end - in_begin); // TODO more conditions below to avoid out-of-bound memory access. + auto c = is_within_bounds ? in_begin[char_index] : '\0'; + auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; auto escaped_char = get_escape_char(c); bool error = false; // FIXME: \\ at end is a problem here. // \uXXXXe e-u=5 4<=4 // 012345 - if(is_within_bounds) { - error |= (c == '\\' && char_index == (in_end - in_begin) - 1); - error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); - error |= (prev_c == '\\' && c == 'u' && - // TODO check if following condition is right or off by one error. - ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | - // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) | - !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | - !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); + if (is_within_bounds) { + error |= (c == '\\' && char_index == (in_end - in_begin) - 1); + error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); + error |= (prev_c == '\\' && c == 'u' && + // TODO check if following condition is right or off by one error. + ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | + // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) | + !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | + !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); } // propagate error using warp shuffle. error = __any_sync(MASK, error); @@ -453,20 +455,20 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, break; // return to grid-stride loop for next string. } // TODO one more error condition of second \uXXXX is not hex. - bool skip = !is_within_bounds; //false; + bool skip = !is_within_bounds; // false; // TODO FIXME: continue slashes are a problem! // skip |= (prev_c != '\\') && (c=='\\'); // skip '\' // corner case \\uXXXX TODO // skip XXXX in \uXXXX if (is_within_bounds) { - skip |= - char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && in_begin[char_index - 1] == 'u'; - skip |= - char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' && in_begin[char_index - 2] == 'u'; - skip |= - char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && in_begin[char_index - 3] == 'u'; - skip |= - char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && in_begin[char_index - 4] == 'u'; + skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && + in_begin[char_index - 1] == 'u'; + skip |= char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' && + in_begin[char_index - 2] == 'u'; + skip |= char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && + in_begin[char_index - 3] == 'u'; + skip |= char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && + in_begin[char_index - 4] == 'u'; } int this_num_out = 0; cudf::char_utf8 write_char{'a'}; @@ -496,8 +498,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // inclusive scan. TODO both inclusive and exclusive available in cub. cub::WarpScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); auto is_escaping_backslash = scanned.state[init_state]; - auto last_active_lane = 31 - __clz(MASK); // TODO simplify 0xffffffff case? - init_state = __shfl_sync(MASK, is_escaping_backslash, last_active_lane); + auto last_active_lane = 31 - __clz(MASK); // TODO simplify 0xffffffff case? + init_state = __shfl_sync(MASK, is_escaping_backslash, last_active_lane); // TODO replace/add prev_c with proper scan of escapes skip |= is_escaping_backslash; @@ -507,7 +509,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // if (true) { this_num_out = 1; if (d_chars) write_char = c; - //FIXME: can you skip write like this for string_size count at other places? + // FIXME: can you skip write like this for string_size count at other places? } else { // already taken care early. // if (escaped_char == NON_ESCAPE_CHAR) { @@ -516,7 +518,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // } else if (escaped_char != UNICODE_SEQ) { this_num_out = 1; - write_char = escaped_char; + write_char = escaped_char; } else { // \uXXXX- u // Unicode @@ -577,6 +579,269 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, } // grid-stride for-loop } +template +__global__ void parse_fn_block_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) +{ + // int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int lane = threadIdx.x; + // int global_warp_id = blockIdx.x; + // int nwarps = gridDim.x; + // TODO alignment - aligned access possible? + + // get 1-string index per warp + auto warp_inc_count = [&]() { + __shared__ size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + __syncthreads(); // memory fence? + return istring; + }; + // grid-stride loop. + // TODO if large number of small strings, then this loop is not efficient. So, switch to old + // method. for (size_type istring = global_warp_id; istring < total_out_strings; istring += + // nwarps) { + for (size_type istring = warp_inc_count(); istring < total_out_strings; + istring = warp_inc_count()) { + // if (!d_chars) + // printf("%d:%d<%d\n", global_thread_id, istring, total_out_strings); + if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { + if (!d_chars) d_offsets[istring] = 0; + continue; // gride-stride return; + } + + auto in_begin = str_tuples[istring].first; + auto in_end = in_begin + str_tuples[istring].second; + auto const num_in_chars = str_tuples[istring].second; + if (num_in_chars < 1024) continue; + + // Check if the value corresponds to the null literal + auto const is_null_literal = + (!d_chars) && + serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + if (lane == 0) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[istring] = 0; + } + continue; // gride-stride return; + } + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); + char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr; + + // Copy literal/numeric value + if (not is_string_value) { + if (!d_chars) { + if (lane == 0) { d_offsets[istring] = in_end - in_begin; } + } else { + for (size_type char_index = lane; char_index < (in_end - in_begin); + char_index += blockDim.x) { + d_buffer[char_index] = in_begin[char_index]; + } + } + continue; // gride-stride return; + } + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + // auto str_process_info = process_string(in_begin, in_end, d_buffer, options); + + auto is_hex = [](auto ch) { + return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); + }; + __shared__ bool init_state; //{false}; // for backslash scan calculation + __shared__ size_type last_offset; // = 0; + if (lane == 0) { + init_state = false; + last_offset = 0; + } + // 0-31, 32-63, ... i*32-n. + // entire warp executes but with mask. + // auto MASK = 0xffffffff; + for (size_type char_index = lane; + char_index < (in_end - in_begin + 32 * 8 - 1) / (32 * 8) * (32 * 8); + char_index += blockDim.x) { + bool is_within_bounds = + char_index < + (in_end - in_begin); // TODO more conditions below to avoid out-of-bound memory access. + auto c = is_within_bounds ? in_begin[char_index] : '\0'; + auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : 'a'; + auto escaped_char = get_escape_char(c); + bool error = false; + // FIXME: \\ at end is a problem here. + // \uXXXXe e-u=5 4<=4 + // 012345 + if (is_within_bounds) { + error |= (c == '\\' && char_index == (in_end - in_begin) - 1); + error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); + error |= (prev_c == '\\' && c == 'u' && + // TODO check if following condition is right or off by one error. + ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | + // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) | + !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | + !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); + } + // propagate error using warp shuffle. + // error = __any_sync(MASK, error); + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage_error; + __shared__ bool error_reduced; + error_reduced = BlockReduce(temp_storage_error).Sum(error); // TODO use cub::LogicalOR. + // only valid in thread0. + __syncthreads(); + if (error_reduced) { + if (lane == 0) { + if (null_mask != nullptr) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + } + last_offset = 0; + if (!d_chars) d_offsets[istring] = 0; + } + break; // return to grid-stride loop for next string. + } + // TODO one more error condition of second \uXXXX is not hex. + bool skip = !is_within_bounds; // false; + // TODO FIXME: continue slashes are a problem! + // skip |= (prev_c != '\\') && (c=='\\'); // skip '\' + // corner case \\uXXXX TODO + // skip XXXX in \uXXXX + if (is_within_bounds) { + skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && + in_begin[char_index - 1] == 'u'; + skip |= char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' && + in_begin[char_index - 2] == 'u'; + skip |= char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && + in_begin[char_index - 3] == 'u'; + skip |= char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && + in_begin[char_index - 4] == 'u'; + } + int this_num_out = 0; + cudf::char_utf8 write_char{'a'}; + + // To check current is backslash by checking if previous is backslash. + // curr = !prev & c=='\\' + // So, scan is required from beginning of string. + // State table approach (intra-warp FST) + // 2 states: Not-Slash(NS), Slash(S). + // prev / * + // NS S NS + // S NS NS + // After inclusive scan, all current S states translate to escaping backslash. + // All escaping backslash should be skipped. + + struct state_table { + bool state[2]; + }; + // using state_table = bool[2]; Try this. and see if compiler errors + using BlockScan = cub::BlockScan; + __shared__ typename BlockScan::TempStorage temp_slash; + state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. + auto composite_op = [](state_table op1, state_table op2) { + return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; + }; + state_table scanned; + // auto warp_id = threadIdx.x / 32; + // inclusive scan. TODO both inclusive and exclusive available in cub. + BlockScan(temp_slash).InclusiveScan(curr, scanned, composite_op); + auto is_escaping_backslash = scanned.state[init_state]; + auto last_active_lane = blockDim.x - 1; // TODO simplify 0xffffffff case? + if (threadIdx.x == last_active_lane) init_state = is_escaping_backslash; + // There is another __syncthreads() at the end of for-loop. + // TODO replace/add prev_c with proper scan of escapes + skip |= is_escaping_backslash; + + if (!skip) { + // is prev_is_not backslash? + if (prev_c != '\\') { // FIXME: enable this after debugging. + // if (true) { + this_num_out = 1; + if (d_chars) write_char = c; + // FIXME: can you skip write like this for string_size count at other places? + } else { + // already taken care early. + // if (escaped_char == NON_ESCAPE_CHAR) { + // this_num_out = 0; + // error = true; + // } else + if (escaped_char != UNICODE_SEQ) { + this_num_out = 1; + write_char = escaped_char; + } else { + // \uXXXX- u + // Unicode + auto hex_val = parse_unicode_hex(in_begin + char_index + 1); + auto hex_low_val = 0; + // if next is \uXXXX + // in_begin + char_index + // 01234567890 + //\uXXXX\uXXXX TODO cleanup these conditions. + // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 && + if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' && + in_begin[char_index + 1 + 5] == 'u') { + hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); + } + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + write_char = strings::detail::codepoint_to_utf8(unicode_code_point); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } else { + // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6); + if ( + // hex_high_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_high_val < + // UTF16_HIGH_SURROGATE_END && + hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { + skip = true; + this_num_out = 0; + write_char = 0; + } else { + // if u8 + write_char = strings::detail::codepoint_to_utf8(hex_val); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } + } + } + } + } // !skip end. + { + // TODO think about writing error conditions as normal, so that program flow is easy to read + // and can process error here. + // WRITE now (compute out_idx offset then write) + // intra-warp scan of this_num_out. + // TODO union to save shared memory + using BlockScan2 = cub::BlockScan; + __shared__ BlockScan2::TempStorage temp_storage; + size_type offset; + BlockScan2(temp_storage).ExclusiveSum(this_num_out, offset); + offset += last_offset; + // TODO add last active lane this_num_out for correct last_offset. + if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); } + offset += this_num_out; + if (threadIdx.x == last_active_lane) last_offset = offset; + __syncthreads(); + } + } // char for-loop + if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } + } // grid-stride for-loop +} + template struct string_parse { str_tuple_it str_tuples; @@ -684,16 +949,17 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets2->mutable_view().data(); - int max_blocks = 0; + int max_blocks = 0; constexpr auto warps_per_block = 8; - int threads_per_block = cudf::detail::warp_size * warps_per_block; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, parse_fn_string_parallel, threads_per_block, 0)); + int threads_per_block = cudf::detail::warp_size * warps_per_block; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks, parse_fn_string_parallel, threads_per_block, 0)); int device = 0; CUDF_CUDA_TRY(cudaGetDevice(&device)); int num_sms = 0; CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device)); - auto num_blocks = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1)); + auto num_blocks = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1)); auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); parse_fn_string_parallel<<>>( @@ -705,6 +971,15 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, options, d_offsets, nullptr); + parse_fn_block_parallel<<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data2, + options, + d_offsets, + nullptr); // print_raw(d_offsets, offsets2->size(), stream); auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); @@ -726,6 +1001,15 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, d_offsets, d_chars2); + parse_fn_block_parallel<<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data2, + options, + d_offsets, + d_chars2); // if(bytes!=chars->size()) { // std::cout<<"new bytes="<size(), stream); From 57ea0564815c8a710382811855fc9502db175080 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 12 Aug 2023 02:25:31 +0530 Subject: [PATCH 09/47] cleanup, kernel name --- cpp/include/cudf/io/detail/data_casting.cuh | 122 +++++++++++--------- 1 file changed, 66 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 855f226a0b0..4444714b5db 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -333,25 +334,27 @@ process_string(in_iterator_t in_begin, // propagate offset from 32nd thread to others in warp to carry forward. // 1 warp per string. template -__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, - size_type total_out_strings, - size_type* str_counter, - bitmask_type* null_mask, - size_type* null_count_data, - cudf::io::parse_options_view const options, - size_type* d_offsets, - char* d_chars) +__global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) { - int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_lane = global_thread_id % cudf::detail::warp_size; - // int global_warp_id = global_thread_id / cudf::detail::warp_size; - // int nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; + constexpr auto BLOCK_SIZE = cudf::detail::warp_size; + int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int lane = global_thread_id % BLOCK_SIZE; + // int global_warp_id = global_thread_id / BLOCK_SIZE; + // int nwarps = gridDim.x * blockDim.x / BLOCK_SIZE; // TODO alignment - aligned access possible? // get 1-string index per warp + // TODO if #num(33-1024) > SOME_LIMIT, then fixed load. auto warp_inc_count = [&]() { - size_type istring = 0; - if (warp_lane == 0) { istring = atomicAdd(str_counter, 1); } + size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } __syncwarp(); return __shfl_sync(0xffffffff, istring, 0); }; @@ -359,8 +362,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { for (size_type istring = warp_inc_count(); istring < total_out_strings; istring = warp_inc_count()) { - // if (!d_chars) - // printf("%d:%d<%d\n", global_thread_id, istring, total_out_strings); + // skip nulls if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { if (!d_chars) d_offsets[istring] = 0; continue; // gride-stride return; @@ -376,7 +378,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, (!d_chars) && serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); if (is_null_literal && null_mask != nullptr) { - if (warp_lane == 0) { + if (lane == 0) { clear_bit(null_mask, istring); atomicAdd(null_count_data, 1); if (!d_chars) d_offsets[istring] = 0; @@ -393,10 +395,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // Copy literal/numeric value if (not is_string_value) { if (!d_chars) { - if (warp_lane == 0) { d_offsets[istring] = in_end - in_begin; } + if (lane == 0) { d_offsets[istring] = in_end - in_begin; } } else { - for (size_type char_index = warp_lane; char_index < (in_end - in_begin); - char_index += cudf::detail::warp_size) { + for (size_type char_index = lane; char_index < (in_end - in_begin); + char_index += BLOCK_SIZE) { d_buffer[char_index] = in_begin[char_index]; } } @@ -413,17 +415,18 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, auto is_hex = [](auto ch) { return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); }; - bool init_state{false}; // for backslash scan calculation + + // for backslash scan calculation: is_previous_escaping_backslash + bool init_state{false}; auto last_offset = 0; // 0-31, 32-63, ... i*32-n. // entire warp executes but with mask. auto MASK = 0xffffffff; - for (size_type char_index = warp_lane; + for (size_type char_index = lane; (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0; - char_index += cudf::detail::warp_size) { - bool is_within_bounds = - char_index < - (in_end - in_begin); // TODO more conditions below to avoid out-of-bound memory access. + char_index += BLOCK_SIZE) { + bool is_within_bounds = char_index < (in_end - in_begin); + // TODO more conditions below to avoid out-of-bound memory access. auto c = is_within_bounds ? in_begin[char_index] : '\0'; auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; auto escaped_char = get_escape_char(c); @@ -432,19 +435,19 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // \uXXXXe e-u=5 4<=4 // 012345 if (is_within_bounds) { + // TODO instead of '\\', use is_escaping_backslash, and previous index value also here. error |= (c == '\\' && char_index == (in_end - in_begin) - 1); error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); error |= (prev_c == '\\' && c == 'u' && // TODO check if following condition is right or off by one error. ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | - // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) | !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); } // propagate error using warp shuffle. error = __any_sync(MASK, error); if (error) { - if (warp_lane == 0) { + if (lane == 0) { if (null_mask != nullptr) { clear_bit(null_mask, istring); atomicAdd(null_count_data, 1); @@ -452,7 +455,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, last_offset = 0; if (!d_chars) d_offsets[istring] = 0; } - break; // return to grid-stride loop for next string. + break; // gride-stride return; } // TODO one more error condition of second \uXXXX is not hex. bool skip = !is_within_bounds; // false; @@ -471,7 +474,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, in_begin[char_index - 4] == 'u'; } int this_num_out = 0; - cudf::char_utf8 write_char{'a'}; + cudf::char_utf8 write_char{}; // To check current is backslash by checking if previous is backslash. // curr = !prev & c=='\\' @@ -487,19 +490,19 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, struct state_table { bool state[2]; }; - // using state_table = bool[2]; Try this. and see if compiler errors + // TODO Use union to reduce shared memory usage. __shared__ typename cub::WarpScan::TempStorage temp_slash[8]; state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. auto composite_op = [](state_table op1, state_table op2) { return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; }; state_table scanned; - auto warp_id = threadIdx.x / 32; + auto warp_id = threadIdx.x / BLOCK_SIZE; // inclusive scan. TODO both inclusive and exclusive available in cub. cub::WarpScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); auto is_escaping_backslash = scanned.state[init_state]; - auto last_active_lane = 31 - __clz(MASK); // TODO simplify 0xffffffff case? - init_state = __shfl_sync(MASK, is_escaping_backslash, last_active_lane); + auto last_active_lane = (BLOCK_SIZE - 1) - __clz(MASK); // TODO simplify 0xffffffff case? + init_state = __shfl_sync(MASK, is_escaping_backslash, last_active_lane); // TODO replace/add prev_c with proper scan of escapes skip |= is_escaping_backslash; @@ -528,7 +531,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // in_begin + char_index // 01234567890 //\uXXXX\uXXXX TODO cleanup these conditions. - // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 && + // Note: no need for scanned_backslash below because we already know. only '\u' check is + // enough. if ((in_end - (in_begin + char_index + 1 + 4)) > 6 && if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' && in_begin[char_index + 1 + 5] == 'u') { hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); @@ -565,6 +569,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // WRITE now (compute out_idx offset then write) // intra-warp scan of this_num_out. // TODO union to save shared memory + // TODO, use only Reduce instead of scan for size calculation - if(!dchars) __shared__ cub::WarpScan::TempStorage temp_storage[8]; size_type offset; cub::WarpScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); @@ -572,10 +577,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // TODO add last active lane this_num_out for correct last_offset. if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); } offset += this_num_out; - last_offset = __shfl_sync(0xffffffff, offset, 31); + last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); } } // char for-loop - if (!d_chars && warp_lane == 0) { d_offsets[istring] = last_offset; } + if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } } // grid-stride for-loop } @@ -589,6 +594,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, size_type* d_offsets, char* d_chars) { + const long BLOCK_SIZE = blockDim.x; // int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; int lane = threadIdx.x; // int global_warp_id = blockIdx.x; @@ -596,6 +602,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, // TODO alignment - aligned access possible? // get 1-string index per warp + // TODO if #num(>1024) > SOME_LIMIT, then fixed load. auto warp_inc_count = [&]() { __shared__ size_type istring; if (lane == 0) { istring = atomicAdd(str_counter, 1); } @@ -604,12 +611,11 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, }; // grid-stride loop. // TODO if large number of small strings, then this loop is not efficient. So, switch to old - // method. for (size_type istring = global_warp_id; istring < total_out_strings; istring += - // nwarps) { + // method. + // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { for (size_type istring = warp_inc_count(); istring < total_out_strings; istring = warp_inc_count()) { - // if (!d_chars) - // printf("%d:%d<%d\n", global_thread_id, istring, total_out_strings); + // skip nulls if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { if (!d_chars) d_offsets[istring] = 0; continue; // gride-stride return; @@ -645,7 +651,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, if (lane == 0) { d_offsets[istring] = in_end - in_begin; } } else { for (size_type char_index = lane; char_index < (in_end - in_begin); - char_index += blockDim.x) { + char_index += BLOCK_SIZE) { d_buffer[char_index] = in_begin[char_index]; } } @@ -672,25 +678,24 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, // entire warp executes but with mask. // auto MASK = 0xffffffff; for (size_type char_index = lane; - char_index < (in_end - in_begin + 32 * 8 - 1) / (32 * 8) * (32 * 8); - char_index += blockDim.x) { - bool is_within_bounds = - char_index < - (in_end - in_begin); // TODO more conditions below to avoid out-of-bound memory access. + char_index < cudf::util::round_up_unsafe(in_end - in_begin, BLOCK_SIZE); + char_index += BLOCK_SIZE) { + bool is_within_bounds = char_index < (in_end - in_begin); + // TODO more conditions below to avoid out-of-bound memory access. auto c = is_within_bounds ? in_begin[char_index] : '\0'; - auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : 'a'; + auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; auto escaped_char = get_escape_char(c); bool error = false; // FIXME: \\ at end is a problem here. // \uXXXXe e-u=5 4<=4 // 012345 if (is_within_bounds) { + // TODO instead of '\\', use is_escaping_backslash, and previous index value also here. error |= (c == '\\' && char_index == (in_end - in_begin) - 1); error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); error |= (prev_c == '\\' && c == 'u' && // TODO check if following condition is right or off by one error. ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | - // ((in_end - (in_begin + char_index) <= UNICODE_HEX_DIGIT_COUNT) | !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); } @@ -711,7 +716,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, last_offset = 0; if (!d_chars) d_offsets[istring] = 0; } - break; // return to grid-stride loop for next string. + break; // gride-stride return; } // TODO one more error condition of second \uXXXX is not hex. bool skip = !is_within_bounds; // false; @@ -730,7 +735,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, in_begin[char_index - 4] == 'u'; } int this_num_out = 0; - cudf::char_utf8 write_char{'a'}; + cudf::char_utf8 write_char{}; // To check current is backslash by checking if previous is backslash. // curr = !prev & c=='\\' @@ -754,7 +759,6 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; }; state_table scanned; - // auto warp_id = threadIdx.x / 32; // inclusive scan. TODO both inclusive and exclusive available in cub. BlockScan(temp_slash).InclusiveScan(curr, scanned, composite_op); auto is_escaping_backslash = scanned.state[init_state]; @@ -789,7 +793,8 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, // in_begin + char_index // 01234567890 //\uXXXX\uXXXX TODO cleanup these conditions. - // if ((in_end - (in_begin + char_index + 1 + 4)) > 6 && + // Note: no need for scanned_backslash below because we already know. only '\u' check is + // enough. if ((in_end - (in_begin + char_index + 1 + 4)) > 6 && if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' && in_begin[char_index + 1 + 5] == 'u') { hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); @@ -826,6 +831,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, // WRITE now (compute out_idx offset then write) // intra-warp scan of this_num_out. // TODO union to save shared memory + // TODO, use only Reduce instead of scan for size calculation - if(!dchars) using BlockScan2 = cub::BlockScan; __shared__ BlockScan2::TempStorage temp_storage; size_type offset; @@ -924,6 +930,10 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto d_null_count2 = rmm::device_scalar(null_count, stream); auto null_count_data2 = d_null_count2.data(); + // Write 3 sizes <=32, 33-1024, 1024-1M, >1M + // if all<32, (33, 1024, 1M)==0, run serial kernel. + // if >33, warp-per-string kernel. (we want warp_parallel to be called only if this #rows is less. + // or else atomicAdd congestion might happen) if >1024, block-per-string kernel. if (col_type == cudf::data_type{cudf::type_id::STRING}) { // this utility calls the functor to build the offsets and chars columns; @@ -953,7 +963,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, constexpr auto warps_per_block = 8; int threads_per_block = cudf::detail::warp_size * warps_per_block; CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_blocks, parse_fn_string_parallel, threads_per_block, 0)); + &max_blocks, parse_fn_warp_parallel, threads_per_block, 0)); int device = 0; CUDF_CUDA_TRY(cudaGetDevice(&device)); @@ -962,7 +972,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto num_blocks = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1)); auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); - parse_fn_string_parallel<<>>( + parse_fn_warp_parallel<<>>( str_tuples, col_size, str_counter.data(), @@ -991,7 +1001,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto d_chars2 = chars2->mutable_view().data(); cudaMemsetAsync(d_chars2, 'c', bytes, stream.value()); - parse_fn_string_parallel<<>>( + parse_fn_warp_parallel<<>>( str_tuples, col_size, str_counter.data(), From 7e4cfd219ef2458e5b57708e69f9dd56be76bb79 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 17 Aug 2023 17:06:38 +0530 Subject: [PATCH 10/47] add BLOCK_SIZE to block kernel --- cpp/include/cudf/io/detail/data_casting.cuh | 51 +++++++++++---------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 4444714b5db..8fd23199e93 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -584,7 +584,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, } // grid-stride for-loop } -template +template __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, size_type total_out_strings, size_type* str_counter, @@ -594,7 +594,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, size_type* d_offsets, char* d_chars) { - const long BLOCK_SIZE = blockDim.x; + // const long BLOCK_SIZE = blockDim.x; // int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; int lane = threadIdx.x; // int global_warp_id = blockIdx.x; @@ -678,7 +678,8 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, // entire warp executes but with mask. // auto MASK = 0xffffffff; for (size_type char_index = lane; - char_index < cudf::util::round_up_unsafe(in_end - in_begin, BLOCK_SIZE); + char_index < + cudf::util::round_up_unsafe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { bool is_within_bounds = char_index < (in_end - in_begin); // TODO more conditions below to avoid out-of-bound memory access. @@ -959,9 +960,9 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets2->mutable_view().data(); - int max_blocks = 0; - constexpr auto warps_per_block = 8; - int threads_per_block = cudf::detail::warp_size * warps_per_block; + int max_blocks = 0; + constexpr auto warps_per_block = 8; + constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_blocks, parse_fn_warp_parallel, threads_per_block, 0)); @@ -981,15 +982,16 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, options, d_offsets, nullptr); - parse_fn_block_parallel<<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data2, - options, - d_offsets, - nullptr); + parse_fn_block_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data2, + options, + d_offsets, + nullptr); // print_raw(d_offsets, offsets2->size(), stream); auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); @@ -1011,15 +1013,16 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, d_offsets, d_chars2); - parse_fn_block_parallel<<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data2, - options, - d_offsets, - d_chars2); + parse_fn_block_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data2, + options, + d_offsets, + d_chars2); // if(bytes!=chars->size()) { // std::cout<<"new bytes="<size(), stream); From 6622460f5732c5c08dbd6fec729f2440386dfff3 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 18 Aug 2023 00:40:41 +0530 Subject: [PATCH 11/47] clean up, add constants --- cpp/include/cudf/io/detail/data_casting.cuh | 354 ++++++++------------ 1 file changed, 134 insertions(+), 220 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 8fd23199e93..a9cb45f5714 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -32,12 +32,18 @@ #include #include +#include +#include + #include #include namespace cudf::io::json::experimental::detail { +constexpr auto SINGLE_THREAD_THRESHOLD = 128; +constexpr auto WARP_THRESHOLD = 1024; + // Unicode code point escape sequence static constexpr char UNICODE_SEQ = 0x7F; @@ -344,24 +350,19 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, char* d_chars) { constexpr auto BLOCK_SIZE = cudf::detail::warp_size; - int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane = global_thread_id % BLOCK_SIZE; - // int global_warp_id = global_thread_id / BLOCK_SIZE; - // int nwarps = gridDim.x * blockDim.x / BLOCK_SIZE; - // TODO alignment - aligned access possible? + int lane = threadIdx.x % BLOCK_SIZE; // get 1-string index per warp - // TODO if #num(33-1024) > SOME_LIMIT, then fixed load. - auto warp_inc_count = [&]() { + auto get_next_string = [&]() { size_type istring; if (lane == 0) { istring = atomicAdd(str_counter, 1); } __syncwarp(); return __shfl_sync(0xffffffff, istring, 0); }; + // grid-stride loop. - // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { - for (size_type istring = warp_inc_count(); istring < total_out_strings; - istring = warp_inc_count()) { + for (size_type istring = get_next_string(); istring < total_out_strings; + istring = get_next_string()) { // skip nulls if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { if (!d_chars) d_offsets[istring] = 0; @@ -371,7 +372,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, auto in_begin = str_tuples[istring].first; auto in_end = in_begin + str_tuples[istring].second; auto const num_in_chars = str_tuples[istring].second; - if (num_in_chars >= 1024) continue; + if (num_in_chars > WARP_THRESHOLD) continue; // Check if the value corresponds to the null literal auto const is_null_literal = @@ -410,7 +411,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, ++in_begin; --in_end; } - // auto str_process_info = process_string(in_begin, in_end, d_buffer, options); + // warp-parallelized process_string(in_begin, in_end, d_buffer, options); auto is_hex = [](auto ch) { return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); @@ -426,14 +427,10 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0; char_index += BLOCK_SIZE) { bool is_within_bounds = char_index < (in_end - in_begin); - // TODO more conditions below to avoid out-of-bound memory access. - auto c = is_within_bounds ? in_begin[char_index] : '\0'; + auto c = is_within_bounds ? in_begin[char_index] : '\0'; auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; auto escaped_char = get_escape_char(c); bool error = false; - // FIXME: \\ at end is a problem here. - // \uXXXXe e-u=5 4<=4 - // 012345 if (is_within_bounds) { // TODO instead of '\\', use is_escaping_backslash, and previous index value also here. error |= (c == '\\' && char_index == (in_end - in_begin) - 1); @@ -455,14 +452,9 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, last_offset = 0; if (!d_chars) d_offsets[istring] = 0; } - break; // gride-stride return; + break; // gride-stride return; } - // TODO one more error condition of second \uXXXX is not hex. bool skip = !is_within_bounds; // false; - // TODO FIXME: continue slashes are a problem! - // skip |= (prev_c != '\\') && (c=='\\'); // skip '\' - // corner case \\uXXXX TODO - // skip XXXX in \uXXXX if (is_within_bounds) { skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && in_begin[char_index - 1] == 'u'; @@ -490,49 +482,36 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, struct state_table { bool state[2]; }; - // TODO Use union to reduce shared memory usage. - __shared__ typename cub::WarpScan::TempStorage temp_slash[8]; state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. auto composite_op = [](state_table op1, state_table op2) { return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; }; state_table scanned; auto warp_id = threadIdx.x / BLOCK_SIZE; - // inclusive scan. TODO both inclusive and exclusive available in cub. - cub::WarpScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); + // inclusive scan of escaping backslashes + using SlashScan = cub::WarpScan; + __shared__ typename SlashScan::TempStorage temp_slash[8]; + SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); auto is_escaping_backslash = scanned.state[init_state]; - auto last_active_lane = (BLOCK_SIZE - 1) - __clz(MASK); // TODO simplify 0xffffffff case? - init_state = __shfl_sync(MASK, is_escaping_backslash, last_active_lane); - // TODO replace/add prev_c with proper scan of escapes + init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); skip |= is_escaping_backslash; if (!skip) { - // is prev_is_not backslash? - if (prev_c != '\\') { // FIXME: enable this after debugging. - // if (true) { + if (prev_c != '\\') { this_num_out = 1; if (d_chars) write_char = c; - // FIXME: can you skip write like this for string_size count at other places? } else { - // already taken care early. - // if (escaped_char == NON_ESCAPE_CHAR) { - // this_num_out = 0; - // error = true; - // } else if (escaped_char != UNICODE_SEQ) { this_num_out = 1; write_char = escaped_char; } else { - // \uXXXX- u // Unicode + // \uXXXX auto hex_val = parse_unicode_hex(in_begin + char_index + 1); auto hex_low_val = 0; - // if next is \uXXXX - // in_begin + char_index - // 01234567890 - //\uXXXX\uXXXX TODO cleanup these conditions. - // Note: no need for scanned_backslash below because we already know. only '\u' check is - // enough. if ((in_end - (in_begin + char_index + 1 + 4)) > 6 && + // \uXXXX\uXXXX + // Note: no need for scanned_backslash below because we already know that + // only '\u' check is enough. if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' && in_begin[char_index + 1 + 5] == 'u') { hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); @@ -546,11 +525,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, write_char = strings::detail::codepoint_to_utf8(unicode_code_point); this_num_out = strings::detail::bytes_in_char_utf8(write_char); } else { - // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6); - if ( - // hex_high_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_high_val < - // UTF16_HIGH_SURROGATE_END && - hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { + if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { skip = true; this_num_out = 0; write_char = 0; @@ -564,17 +539,10 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, } } // !skip end. { - // TODO think about writing error conditions as normal, so that program flow is easy to read - // and can process error here. - // WRITE now (compute out_idx offset then write) - // intra-warp scan of this_num_out. - // TODO union to save shared memory - // TODO, use only Reduce instead of scan for size calculation - if(!dchars) __shared__ cub::WarpScan::TempStorage temp_storage[8]; size_type offset; cub::WarpScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); offset += last_offset; - // TODO add last active lane this_num_out for correct last_offset. if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); } offset += this_num_out; last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); @@ -594,27 +562,18 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, size_type* d_offsets, char* d_chars) { - // const long BLOCK_SIZE = blockDim.x; - // int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; int lane = threadIdx.x; - // int global_warp_id = blockIdx.x; - // int nwarps = gridDim.x; - // TODO alignment - aligned access possible? // get 1-string index per warp - // TODO if #num(>1024) > SOME_LIMIT, then fixed load. - auto warp_inc_count = [&]() { + auto get_next_string = [&]() { __shared__ size_type istring; if (lane == 0) { istring = atomicAdd(str_counter, 1); } __syncthreads(); // memory fence? return istring; }; // grid-stride loop. - // TODO if large number of small strings, then this loop is not efficient. So, switch to old - // method. - // for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { - for (size_type istring = warp_inc_count(); istring < total_out_strings; - istring = warp_inc_count()) { + for (size_type istring = get_next_string(); istring < total_out_strings; + istring = get_next_string()) { // skip nulls if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { if (!d_chars) d_offsets[istring] = 0; @@ -624,7 +583,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, auto in_begin = str_tuples[istring].first; auto in_end = in_begin + str_tuples[istring].second; auto const num_in_chars = str_tuples[istring].second; - if (num_in_chars < 1024) continue; + if (num_in_chars <= WARP_THRESHOLD) continue; // Check if the value corresponds to the null literal auto const is_null_literal = @@ -663,7 +622,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, ++in_begin; --in_end; } - // auto str_process_info = process_string(in_begin, in_end, d_buffer, options); + // block-parallelized process_string(in_begin, in_end, d_buffer, options); auto is_hex = [](auto ch) { return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); @@ -674,34 +633,24 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, init_state = false; last_offset = 0; } - // 0-31, 32-63, ... i*32-n. - // entire warp executes but with mask. - // auto MASK = 0xffffffff; + // entire block executes for (size_type char_index = lane; - char_index < - cudf::util::round_up_unsafe(in_end - in_begin, static_cast(BLOCK_SIZE)); + char_index < cudf::util::round_up_unsafe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { bool is_within_bounds = char_index < (in_end - in_begin); - // TODO more conditions below to avoid out-of-bound memory access. - auto c = is_within_bounds ? in_begin[char_index] : '\0'; + auto c = is_within_bounds ? in_begin[char_index] : '\0'; auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; auto escaped_char = get_escape_char(c); bool error = false; - // FIXME: \\ at end is a problem here. - // \uXXXXe e-u=5 4<=4 - // 012345 if (is_within_bounds) { - // TODO instead of '\\', use is_escaping_backslash, and previous index value also here. error |= (c == '\\' && char_index == (in_end - in_begin) - 1); error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); error |= (prev_c == '\\' && c == 'u' && - // TODO check if following condition is right or off by one error. ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); } - // propagate error using warp shuffle. - // error = __any_sync(MASK, error); + // propagate error across entire block using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage_error; __shared__ bool error_reduced; @@ -717,14 +666,9 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, last_offset = 0; if (!d_chars) d_offsets[istring] = 0; } - break; // gride-stride return; + break; // gride-stride return; } - // TODO one more error condition of second \uXXXX is not hex. bool skip = !is_within_bounds; // false; - // TODO FIXME: continue slashes are a problem! - // skip |= (prev_c != '\\') && (c=='\\'); // skip '\' - // corner case \\uXXXX TODO - // skip XXXX in \uXXXX if (is_within_bounds) { skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && in_begin[char_index - 1] == 'u'; @@ -763,39 +707,24 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, // inclusive scan. TODO both inclusive and exclusive available in cub. BlockScan(temp_slash).InclusiveScan(curr, scanned, composite_op); auto is_escaping_backslash = scanned.state[init_state]; - auto last_active_lane = blockDim.x - 1; // TODO simplify 0xffffffff case? - if (threadIdx.x == last_active_lane) init_state = is_escaping_backslash; + if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; // There is another __syncthreads() at the end of for-loop. - // TODO replace/add prev_c with proper scan of escapes skip |= is_escaping_backslash; if (!skip) { - // is prev_is_not backslash? - if (prev_c != '\\') { // FIXME: enable this after debugging. - // if (true) { + if (prev_c != '\\') { this_num_out = 1; if (d_chars) write_char = c; - // FIXME: can you skip write like this for string_size count at other places? } else { - // already taken care early. - // if (escaped_char == NON_ESCAPE_CHAR) { - // this_num_out = 0; - // error = true; - // } else if (escaped_char != UNICODE_SEQ) { this_num_out = 1; write_char = escaped_char; } else { - // \uXXXX- u // Unicode + // \uXXXX auto hex_val = parse_unicode_hex(in_begin + char_index + 1); auto hex_low_val = 0; - // if next is \uXXXX - // in_begin + char_index - // 01234567890 - //\uXXXX\uXXXX TODO cleanup these conditions. - // Note: no need for scanned_backslash below because we already know. only '\u' check is - // enough. if ((in_end - (in_begin + char_index + 1 + 4)) > 6 && + // \uXXXX\uXXXX if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' && in_begin[char_index + 1 + 5] == 'u') { hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); @@ -809,11 +738,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, write_char = strings::detail::codepoint_to_utf8(unicode_code_point); this_num_out = strings::detail::bytes_in_char_utf8(write_char); } else { - // auto hex_high_val = parse_unicode_hex(in_begin + char_index + 1 - 6); - if ( - // hex_high_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_high_val < - // UTF16_HIGH_SURROGATE_END && - hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { + if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { skip = true; this_num_out = 0; write_char = 0; @@ -827,12 +752,6 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, } } // !skip end. { - // TODO think about writing error conditions as normal, so that program flow is easy to read - // and can process error here. - // WRITE now (compute out_idx offset then write) - // intra-warp scan of this_num_out. - // TODO union to save shared memory - // TODO, use only Reduce instead of scan for size calculation - if(!dchars) using BlockScan2 = cub::BlockScan; __shared__ BlockScan2::TempStorage temp_storage; size_type offset; @@ -841,7 +760,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, // TODO add last active lane this_num_out for correct last_offset. if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); } offset += this_num_out; - if (threadIdx.x == last_active_lane) last_offset = offset; + if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset; __syncthreads(); } } // char for-loop @@ -929,119 +848,114 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto d_null_count = rmm::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); - auto d_null_count2 = rmm::device_scalar(null_count, stream); - auto null_count_data2 = d_null_count2.data(); - // Write 3 sizes <=32, 33-1024, 1024-1M, >1M - // if all<32, (33, 1024, 1M)==0, run serial kernel. - // if >33, warp-per-string kernel. (we want warp_parallel to be called only if this #rows is less. - // or else atomicAdd congestion might happen) if >1024, block-per-string kernel. - if (col_type == cudf::data_type{cudf::type_id::STRING}) { - // this utility calls the functor to build the offsets and chars columns; - // the bitmask and null count may be updated by parse failures - -#define WARP_PARALLEL -#ifndef WARP_PARALLEL - nvtxRangePush("make_strings_children"); - auto [offsets, chars] = cudf::strings::detail::make_strings_children( - string_parse{ - str_tuples, static_cast(null_mask.data()), null_count_data, options}, - col_size, - stream, - mr); - auto& offsets2 = offsets; - auto& chars2 = chars; - nvtxRangePop(); -#else - - // { - nvtxRangePush("string_parallel"); - auto offsets2 = cudf::make_numeric_column( - data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); - auto d_offsets = offsets2->mutable_view().data(); - - int max_blocks = 0; - constexpr auto warps_per_block = 8; - constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_blocks, parse_fn_warp_parallel, threads_per_block, 0)); - - int device = 0; - CUDF_CUDA_TRY(cudaGetDevice(&device)); - int num_sms = 0; - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device)); - auto num_blocks = min(num_sms * max_blocks, min(65535, col_size / warps_per_block + 1)); - auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); - - parse_fn_warp_parallel<<>>( + auto const max_length = thrust::transform_reduce( + rmm::exec_policy(stream), str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data2, - options, - d_offsets, - nullptr); - parse_fn_block_parallel - <<>>( + str_tuples + col_size, + [] __device__(auto t) { return t.second; }, + size_type{0}, + thrust::maximum{}); + + if (max_length < SINGLE_THREAD_THRESHOLD) { + // this utility calls the functor to build the offsets and chars columns; + // the bitmask and null count may be updated by parse failures + nvtxRangePush("make_strings_children"); + auto [offsets, chars] = cudf::strings::detail::make_strings_children( + string_parse{ + str_tuples, static_cast(null_mask.data()), null_count_data, options}, + col_size, + stream, + mr); + nvtxRangePop(); + + return make_strings_column(col_size, + std::move(offsets), + std::move(chars), + d_null_count.value(stream), + std::move(null_mask)); + } else { + nvtxRangePush("string_parallel"); + auto offsets2 = cudf::make_numeric_column( + data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets2->mutable_view().data(); + + constexpr auto warps_per_block = 8; + constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; + auto num_blocks = min(65535, col_size / warps_per_block + 1); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + + parse_fn_warp_parallel<<>>( str_tuples, col_size, str_counter.data(), static_cast(null_mask.data()), - null_count_data2, + null_count_data, options, d_offsets, nullptr); - // print_raw(d_offsets, offsets2->size(), stream); - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); - str_counter.set_value(0, stream); - - // CHARS column - std::unique_ptr chars2 = - strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); - auto d_chars2 = chars2->mutable_view().data(); - cudaMemsetAsync(d_chars2, 'c', bytes, stream.value()); - - parse_fn_warp_parallel<<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data2, - options, - d_offsets, - d_chars2); - - parse_fn_block_parallel - <<>>( + str_counter.set_value(0, stream); + // if (max_length > WARP_THRESHOLD) + parse_fn_block_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + // print_raw(d_offsets, offsets2->size(), stream); + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); + str_counter.set_value(0, stream); + + // CHARS column + std::unique_ptr chars2 = + strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); + auto d_chars2 = chars2->mutable_view().data(); + + parse_fn_warp_parallel<<>>( str_tuples, col_size, str_counter.data(), static_cast(null_mask.data()), - null_count_data2, + null_count_data, options, d_offsets, d_chars2); - // if(bytes!=chars->size()) { - // std::cout<<"new bytes="<size(), stream); - // print_raw(d_chars2, chars2->size(), stream); - // } - // if(bytes!=chars->size()) { - // std::cout<<"old bytes="<size()<view().data(), offsets->size(), stream); - // print_raw(chars->view().data(), chars->size(), stream); - // } - nvtxRangePop(); -// } -#endif - - return make_strings_column(col_size, - std::move(offsets2), - std::move(chars2), - d_null_count2.value(stream), - std::move(null_mask)); + str_counter.set_value(0, stream); + + // if (max_length > WARP_THRESHOLD) + parse_fn_block_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars2); + // if(bytes!=chars->size()) { + // std::cout<<"new bytes="<size(), stream); + // print_raw(d_chars2, chars2->size(), stream); + // } + // if(bytes!=chars->size()) { + // std::cout<<"old bytes="<size()<view().data(), offsets->size(), stream); + // print_raw(chars->view().data(), chars->size(), stream); + // } + nvtxRangePop(); + + return make_strings_column(col_size, + std::move(offsets2), + std::move(chars2), + d_null_count.value(stream), + std::move(null_mask)); + } } auto out_col = From efe78983cbfe2036c6b3523f012ccc87cd9b3e18 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 18 Aug 2023 00:41:25 +0530 Subject: [PATCH 12/47] add long string json test --- cpp/tests/io/json_test.cpp | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 97d5846294a..fea307d6047 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1370,6 +1370,39 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view()); } +TEST_F(JsonReaderTest, JsonLongString) +{ + std::string json_string = + R"({"a":"a0"} + {"a":"a1"} + {"a":"a2", "b":"b2"} + {"a":"a3", "c":"c3"})"; + json_string += std::string("\n") + R"({"a":"a4\u20ac)" + std::string(103, 'a') + "\"}"; + json_string += std::string("\n") + R"({"a":"a5\u20ac)" + std::string(1034, 'a') + "\"}"; + json_string += std::string("\n") + R"({"a":"a6\u20ac)" + std::string(10340, 'a') + "\"}"; + + cudf::test::strings_column_wrapper col1{"a0", + "a1", + "a2", + "a3", + "a4€" + std::string(103, 'a'), + "a5€" + std::string(1034, 'a'), + "a6€" + std::string(10340, 'a')}; + cudf::test::strings_column_wrapper col2{{"", "", "b2", "", "", "", ""}, {0, 0, 1, 0, 0, 0, 0}}; + cudf::test::strings_column_wrapper col3{{"", "", "", "c3", "", "", ""}, {0, 0, 0, 1, 0, 0, 0}}; + cudf::table_view expected({col1, col2, col3}); + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true); + + // Read test data via nested JSON reader + auto const table = cudf::io::read_json(json_lines_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view()); +} + TEST_F(JsonReaderTest, TokenAllocation) { std::array const json_inputs{ From 589e0a3c9ed8b25119741f04e8076413784a6bb3 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 18 Aug 2023 00:43:14 +0530 Subject: [PATCH 13/47] remove debug prints --- cpp/include/cudf/io/detail/data_casting.cuh | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index a9cb45f5714..a9968be36d8 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -812,15 +812,6 @@ struct string_parse { } }; -template -void print_raw(T const* ptr, size_type size, rmm::cuda_stream_view stream) -{ - auto h_offsets2 = cudf::detail::make_std_vector_sync(device_span(ptr, size), stream); - for (auto i : h_offsets2) - std::cout << i << ","; - std::cout << std::endl; -} - /** * @brief Parses the data from an iterator of string views, casting it to the given target data type * @@ -906,7 +897,6 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, options, d_offsets, nullptr); - // print_raw(d_offsets, offsets2->size(), stream); auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); str_counter.set_value(0, stream); @@ -938,16 +928,6 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, options, d_offsets, d_chars2); - // if(bytes!=chars->size()) { - // std::cout<<"new bytes="<size(), stream); - // print_raw(d_chars2, chars2->size(), stream); - // } - // if(bytes!=chars->size()) { - // std::cout<<"old bytes="<size()<view().data(), offsets->size(), stream); - // print_raw(chars->view().data(), chars->size(), stream); - // } nvtxRangePop(); return make_strings_column(col_size, From d3dc8cfd53f1f86c381bab2c8a8ec65b545fa36f Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 18 Aug 2023 00:47:16 +0530 Subject: [PATCH 14/47] comment --- cpp/include/cudf/io/detail/data_casting.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 808a250ef5b..e672ab0feee 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -552,6 +552,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, } // grid-stride for-loop } +// Similar to warp-parallel algorithm but with 1 block per string. template __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, size_type total_out_strings, From e17589e620159264e2ba33c51624ba33860a516e Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 18 Aug 2023 00:59:05 +0530 Subject: [PATCH 15/47] style fix, add constants --- cpp/include/cudf/io/detail/data_casting.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index e672ab0feee..0cc917a3498 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -539,7 +539,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, } } // !skip end. { - __shared__ cub::WarpScan::TempStorage temp_storage[8]; + __shared__ typename cub::WarpScan::TempStorage temp_storage[8]; size_type offset; cub::WarpScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); offset += last_offset; @@ -552,7 +552,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, } // grid-stride for-loop } -// Similar to warp-parallel algorithm but with 1 block per string. +// Similar to warp-parallel algorithm but 1 block per string. template __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, size_type total_out_strings, @@ -652,7 +652,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); } // propagate error across entire block - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage_error; __shared__ bool error_reduced; error_reduced = BlockReduce(temp_storage_error).Sum(error); // TODO use cub::LogicalOR. @@ -698,7 +698,7 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, bool state[2]; }; // using state_table = bool[2]; Try this. and see if compiler errors - using BlockScan = cub::BlockScan; + using BlockScan = cub::BlockScan; __shared__ typename BlockScan::TempStorage temp_slash; state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. auto composite_op = [](state_table op1, state_table op2) { @@ -753,8 +753,8 @@ __global__ void parse_fn_block_parallel(str_tuple_it str_tuples, } } // !skip end. { - using BlockScan2 = cub::BlockScan; - __shared__ BlockScan2::TempStorage temp_storage; + using BlockScan2 = cub::BlockScan; + __shared__ typename BlockScan2::TempStorage temp_storage; size_type offset; BlockScan2(temp_storage).ExclusiveSum(this_num_out, offset); offset += last_offset; From 631528aba9032d02ea763cbfc5d495166e81c96e Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 25 Aug 2023 19:59:57 +0530 Subject: [PATCH 16/47] unified kernel for warp and block --- cpp/include/cudf/io/detail/data_casting.cuh | 187 +++++++++++++++----- 1 file changed, 141 insertions(+), 46 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 0cc917a3498..0bf04df9072 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -305,6 +305,35 @@ process_string(in_iterator_t in_begin, return {bytes, data_casting_result::PARSING_SUCCESS}; } +// Attempted, but didn't work for warp_parallel kernel +template +__device__ bool& init_state() +{ + static bool data; + return data; +} + +template <> +__device__ bool& init_state() +{ + __shared__ bool data; + return data; +}; + +template +__device__ int& last_offset() +{ + static size_type data; + return data; +} + +template <> +__device__ int& last_offset() +{ + __shared__ size_type data; + return data; +}; + // 1 warp per string. // algorithm @@ -339,7 +368,7 @@ process_string(in_iterator_t in_begin, // before writing, find size, then intra-warp scan for out_idx // propagate offset from 32nd thread to others in warp to carry forward. // 1 warp per string. -template +template __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, size_type total_out_strings, size_type* str_counter, @@ -349,17 +378,24 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, size_type* d_offsets, char* d_chars) { - constexpr auto BLOCK_SIZE = cudf::detail::warp_size; - int lane = threadIdx.x % BLOCK_SIZE; + constexpr auto BLOCK_SIZE = + is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps; + size_type lane = is_warp ? (threadIdx.x % BLOCK_SIZE) : threadIdx.x; // get 1-string index per warp auto get_next_string = [&]() { - size_type istring; - if (lane == 0) { istring = atomicAdd(str_counter, 1); } - __syncwarp(); - return __shfl_sync(0xffffffff, istring, 0); + if constexpr (is_warp) { + size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + __syncwarp(); + return __shfl_sync(0xffffffff, istring, 0); + } else { + __shared__ size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + __syncthreads(); // memory fence? + return istring; + } }; - // grid-stride loop. for (size_type istring = get_next_string(); istring < total_out_strings; istring = get_next_string()) { @@ -372,7 +408,11 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, auto in_begin = str_tuples[istring].first; auto in_end = in_begin + str_tuples[istring].second; auto const num_in_chars = str_tuples[istring].second; - if (num_in_chars > WARP_THRESHOLD) continue; + if constexpr (is_warp) { + if (num_in_chars > WARP_THRESHOLD) continue; + } else { + if (num_in_chars <= WARP_THRESHOLD) continue; + } // Check if the value corresponds to the null literal auto const is_null_literal = @@ -418,14 +458,26 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, }; // for backslash scan calculation: is_previous_escaping_backslash - bool init_state{false}; - auto last_offset = 0; + // bool init_state{false}; + // auto last_offset = 0; + bool init_state_reg; + __shared__ bool init_state_shared; + size_type last_offset_reg; + __shared__ size_type last_offset_shared; + bool& init_state(is_warp ? init_state_reg : init_state_shared); + size_type& last_offset(is_warp ? last_offset_reg : last_offset_shared); + if (is_warp || lane == 0) { + init_state = false; + last_offset = 0; + } + // if constexpr(!is_warp) { __syncthreads(); } // 0-31, 32-63, ... i*32-n. // entire warp executes but with mask. - auto MASK = 0xffffffff; + // auto MASK = 0xffffffff; for (size_type char_index = lane; - (MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin))) != 0; + char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { + auto MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin)); bool is_within_bounds = char_index < (in_end - in_begin); auto c = is_within_bounds ? in_begin[char_index] : '\0'; auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; @@ -442,7 +494,17 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); } // propagate error using warp shuffle. - error = __any_sync(MASK, error); + if constexpr (is_warp) { + error = __any_sync(MASK, error); + } else { + using ErrorReduce = cub::BlockReduce; + __shared__ typename ErrorReduce::TempStorage temp_storage_error; + __shared__ bool error_reduced; + error_reduced = ErrorReduce(temp_storage_error).Sum(error); // TODO use cub::LogicalOR. + // only valid in thread0. + __syncthreads(); + error = error_reduced; + } if (error) { if (lane == 0) { if (null_mask != nullptr) { @@ -487,14 +549,25 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; }; state_table scanned; - auto warp_id = threadIdx.x / BLOCK_SIZE; + [[maybe_unused]] auto warp_id = threadIdx.x / BLOCK_SIZE; // inclusive scan of escaping backslashes - using SlashScan = cub::WarpScan; - __shared__ typename SlashScan::TempStorage temp_slash[8]; - SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); - auto is_escaping_backslash = scanned.state[init_state]; - init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); - skip |= is_escaping_backslash; + // TODO both inclusive and exclusive available in cub. + if constexpr (is_warp) { + using SlashScan = cub::WarpScan; + __shared__ typename SlashScan::TempStorage temp_slash[8]; + SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); + auto is_escaping_backslash = scanned.state[init_state]; + init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); + skip |= is_escaping_backslash; + } else { + using SlashScan = cub::BlockScan; + __shared__ typename SlashScan::TempStorage temp_slash; + SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); + auto is_escaping_backslash = scanned.state[init_state]; + if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; + // There is another __syncthreads() at the end of for-loop. + skip |= is_escaping_backslash; + } if (!skip) { if (prev_c != '\\') { @@ -539,13 +612,26 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, } } // !skip end. { - __shared__ typename cub::WarpScan::TempStorage temp_storage[8]; size_type offset; - cub::WarpScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); + if constexpr (is_warp) { + using OffsetScan = cub::WarpScan; + __shared__ typename OffsetScan::TempStorage temp_storage[8]; + OffsetScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); + } else { + using OffsetScan = cub::BlockScan; + __shared__ typename OffsetScan::TempStorage temp_storage; + OffsetScan(temp_storage).ExclusiveSum(this_num_out, offset); + __syncthreads(); + } offset += last_offset; if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); } offset += this_num_out; - last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); + if constexpr (is_warp) { + last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); + } else { + if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset; + __syncthreads(); + } } } // char for-loop if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } @@ -813,6 +899,17 @@ struct string_parse { } }; +template +struct to_string_view_pair { + SymbolT const* data; + to_string_view_pair(SymbolT const* _data) : data(_data) {} + __device__ auto operator()(thrust::tuple ip) + { + return thrust::pair{data + thrust::get<0>(ip), + static_cast(thrust::get<1>(ip))}; + } +}; + /** * @brief Parses the data from an iterator of string views, casting it to the given target data type * @@ -877,7 +974,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto num_blocks = min(65535, col_size / warps_per_block + 1); auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); - parse_fn_warp_parallel<<>>( + parse_fn_warp_parallel<<>>( str_tuples, col_size, str_counter.data(), @@ -888,16 +985,15 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, nullptr); str_counter.set_value(0, stream); // if (max_length > WARP_THRESHOLD) - parse_fn_block_parallel - <<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - nullptr); + parse_fn_warp_parallel<<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); str_counter.set_value(0, stream); @@ -907,7 +1003,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); auto d_chars2 = chars2->mutable_view().data(); - parse_fn_warp_parallel<<>>( + parse_fn_warp_parallel<<>>( str_tuples, col_size, str_counter.data(), @@ -919,16 +1015,15 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, str_counter.set_value(0, stream); // if (max_length > WARP_THRESHOLD) - parse_fn_block_parallel - <<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - d_chars2); + parse_fn_warp_parallel<<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars2); nvtxRangePop(); return make_strings_column(col_size, From 6fe5afa46c52afa7b9e8b0af3d6b1c8e6f91528d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 25 Aug 2023 20:23:36 +0530 Subject: [PATCH 17/47] remove duplicate block kernel, cleanup names --- cpp/include/cudf/io/detail/data_casting.cuh | 353 +++----------------- 1 file changed, 55 insertions(+), 298 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 0bf04df9072..5a90a39025a 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -305,42 +305,13 @@ process_string(in_iterator_t in_begin, return {bytes, data_casting_result::PARSING_SUCCESS}; } -// Attempted, but didn't work for warp_parallel kernel -template -__device__ bool& init_state() -{ - static bool data; - return data; -} - -template <> -__device__ bool& init_state() -{ - __shared__ bool data; - return data; -}; - -template -__device__ int& last_offset() -{ - static size_type data; - return data; -} - -template <> -__device__ int& last_offset() -{ - __shared__ size_type data; - return data; -}; - // 1 warp per string. // algorithm - +// character count: input->output // \uXXXX 6->2/3/4 // \uXXXX\uXXXX 12->2/3/4 // \" 2->1 -// _ 1->1 +// * 1->1 // // error conditions. (propagate) // c=='\' & curr_idx == end_idx-1; ERROR @@ -367,16 +338,16 @@ __device__ int& last_offset() // if curr_hex_val not in high, write u8. // before writing, find size, then intra-warp scan for out_idx // propagate offset from 32nd thread to others in warp to carry forward. -// 1 warp per string. +// 1 warp per string or 1 block per string template -__global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, - size_type total_out_strings, - size_type* str_counter, - bitmask_type* null_mask, - size_type* null_count_data, - cudf::io::parse_options_view const options, - size_type* d_offsets, - char* d_chars) +__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) { constexpr auto BLOCK_SIZE = is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps; @@ -458,8 +429,6 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, }; // for backslash scan calculation: is_previous_escaping_backslash - // bool init_state{false}; - // auto last_offset = 0; bool init_state_reg; __shared__ bool init_state_shared; size_type last_offset_reg; @@ -477,7 +446,8 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, for (size_type char_index = lane; char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { - auto MASK = __ballot_sync(0xffffffff, char_index < (in_end - in_begin)); + auto MASK = + is_warp ? __ballot_sync(0xffffffff, char_index < (in_end - in_begin)) : 0xffffffff; bool is_within_bounds = char_index < (in_end - in_begin); auto c = is_within_bounds ? in_begin[char_index] : '\0'; auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; @@ -554,7 +524,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, // TODO both inclusive and exclusive available in cub. if constexpr (is_warp) { using SlashScan = cub::WarpScan; - __shared__ typename SlashScan::TempStorage temp_slash[8]; + __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); auto is_escaping_backslash = scanned.state[init_state]; init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); @@ -615,7 +585,7 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, size_type offset; if constexpr (is_warp) { using OffsetScan = cub::WarpScan; - __shared__ typename OffsetScan::TempStorage temp_storage[8]; + __shared__ typename OffsetScan::TempStorage temp_storage[num_warps]; OffsetScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); } else { using OffsetScan = cub::BlockScan; @@ -638,223 +608,6 @@ __global__ void parse_fn_warp_parallel(str_tuple_it str_tuples, } // grid-stride for-loop } -// Similar to warp-parallel algorithm but 1 block per string. -template -__global__ void parse_fn_block_parallel(str_tuple_it str_tuples, - size_type total_out_strings, - size_type* str_counter, - bitmask_type* null_mask, - size_type* null_count_data, - cudf::io::parse_options_view const options, - size_type* d_offsets, - char* d_chars) -{ - int lane = threadIdx.x; - - // get 1-string index per warp - auto get_next_string = [&]() { - __shared__ size_type istring; - if (lane == 0) { istring = atomicAdd(str_counter, 1); } - __syncthreads(); // memory fence? - return istring; - }; - // grid-stride loop. - for (size_type istring = get_next_string(); istring < total_out_strings; - istring = get_next_string()) { - // skip nulls - if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { - if (!d_chars) d_offsets[istring] = 0; - continue; // gride-stride return; - } - - auto in_begin = str_tuples[istring].first; - auto in_end = in_begin + str_tuples[istring].second; - auto const num_in_chars = str_tuples[istring].second; - if (num_in_chars <= WARP_THRESHOLD) continue; - - // Check if the value corresponds to the null literal - auto const is_null_literal = - (!d_chars) && - serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); - if (is_null_literal && null_mask != nullptr) { - if (lane == 0) { - clear_bit(null_mask, istring); - atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[istring] = 0; - } - continue; // gride-stride return; - } - // String values are indicated by keeping the quote character - bool const is_string_value = - num_in_chars >= 2LL && - (options.quotechar == '\0' || - (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); - char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr; - - // Copy literal/numeric value - if (not is_string_value) { - if (!d_chars) { - if (lane == 0) { d_offsets[istring] = in_end - in_begin; } - } else { - for (size_type char_index = lane; char_index < (in_end - in_begin); - char_index += BLOCK_SIZE) { - d_buffer[char_index] = in_begin[char_index]; - } - } - continue; // gride-stride return; - } - - // Exclude beginning and ending quote chars from string range - if (!options.keepquotes) { - ++in_begin; - --in_end; - } - // block-parallelized process_string(in_begin, in_end, d_buffer, options); - - auto is_hex = [](auto ch) { - return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); - }; - __shared__ bool init_state; //{false}; // for backslash scan calculation - __shared__ size_type last_offset; // = 0; - if (lane == 0) { - init_state = false; - last_offset = 0; - } - // entire block executes - for (size_type char_index = lane; - char_index < cudf::util::round_up_unsafe(in_end - in_begin, static_cast(BLOCK_SIZE)); - char_index += BLOCK_SIZE) { - bool is_within_bounds = char_index < (in_end - in_begin); - auto c = is_within_bounds ? in_begin[char_index] : '\0'; - auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; - auto escaped_char = get_escape_char(c); - bool error = false; - if (is_within_bounds) { - error |= (c == '\\' && char_index == (in_end - in_begin) - 1); - error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); - error |= (prev_c == '\\' && c == 'u' && - ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | - !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | - !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); - } - // propagate error across entire block - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage_error; - __shared__ bool error_reduced; - error_reduced = BlockReduce(temp_storage_error).Sum(error); // TODO use cub::LogicalOR. - // only valid in thread0. - __syncthreads(); - if (error_reduced) { - if (lane == 0) { - if (null_mask != nullptr) { - clear_bit(null_mask, istring); - atomicAdd(null_count_data, 1); - } - last_offset = 0; - if (!d_chars) d_offsets[istring] = 0; - } - break; // gride-stride return; - } - bool skip = !is_within_bounds; // false; - if (is_within_bounds) { - skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && - in_begin[char_index - 1] == 'u'; - skip |= char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' && - in_begin[char_index - 2] == 'u'; - skip |= char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && - in_begin[char_index - 3] == 'u'; - skip |= char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && - in_begin[char_index - 4] == 'u'; - } - int this_num_out = 0; - cudf::char_utf8 write_char{}; - - // To check current is backslash by checking if previous is backslash. - // curr = !prev & c=='\\' - // So, scan is required from beginning of string. - // State table approach (intra-warp FST) - // 2 states: Not-Slash(NS), Slash(S). - // prev / * - // NS S NS - // S NS NS - // After inclusive scan, all current S states translate to escaping backslash. - // All escaping backslash should be skipped. - - struct state_table { - bool state[2]; - }; - // using state_table = bool[2]; Try this. and see if compiler errors - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_slash; - state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. - auto composite_op = [](state_table op1, state_table op2) { - return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; - }; - state_table scanned; - // inclusive scan. TODO both inclusive and exclusive available in cub. - BlockScan(temp_slash).InclusiveScan(curr, scanned, composite_op); - auto is_escaping_backslash = scanned.state[init_state]; - if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; - // There is another __syncthreads() at the end of for-loop. - skip |= is_escaping_backslash; - - if (!skip) { - if (prev_c != '\\') { - this_num_out = 1; - if (d_chars) write_char = c; - } else { - if (escaped_char != UNICODE_SEQ) { - this_num_out = 1; - write_char = escaped_char; - } else { - // Unicode - // \uXXXX - auto hex_val = parse_unicode_hex(in_begin + char_index + 1); - auto hex_low_val = 0; - // \uXXXX\uXXXX - if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' && - in_begin[char_index + 1 + 5] == 'u') { - hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); - } - if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && - hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { - // Compute UTF16-encoded code point - uint32_t unicode_code_point = 0x10000 + - ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + - (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); - write_char = strings::detail::codepoint_to_utf8(unicode_code_point); - this_num_out = strings::detail::bytes_in_char_utf8(write_char); - } else { - if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { - skip = true; - this_num_out = 0; - write_char = 0; - } else { - // if u8 - write_char = strings::detail::codepoint_to_utf8(hex_val); - this_num_out = strings::detail::bytes_in_char_utf8(write_char); - } - } - } - } - } // !skip end. - { - using BlockScan2 = cub::BlockScan; - __shared__ typename BlockScan2::TempStorage temp_storage; - size_type offset; - BlockScan2(temp_storage).ExclusiveSum(this_num_out, offset); - offset += last_offset; - // TODO add last active lane this_num_out for correct last_offset. - if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); } - offset += this_num_out; - if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset; - __syncthreads(); - } - } // char for-loop - if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } - } // grid-stride for-loop -} - template struct string_parse { str_tuple_it str_tuples; @@ -974,26 +727,28 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto num_blocks = min(65535, col_size / warps_per_block + 1); auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); - parse_fn_warp_parallel<<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - nullptr); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); str_counter.set_value(0, stream); // if (max_length > WARP_THRESHOLD) - parse_fn_warp_parallel<<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - nullptr); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); str_counter.set_value(0, stream); @@ -1003,27 +758,29 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); auto d_chars2 = chars2->mutable_view().data(); - parse_fn_warp_parallel<<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - d_chars2); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars2); str_counter.set_value(0, stream); // if (max_length > WARP_THRESHOLD) - parse_fn_warp_parallel<<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - d_chars2); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars2); nvtxRangePop(); return make_strings_column(col_size, From d3a35b1a6c790acc81660e049d70ff8f88ef71d4 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sun, 27 Aug 2023 23:48:58 +0530 Subject: [PATCH 18/47] address review comments --- cpp/include/cudf/io/detail/data_casting.cuh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 5a90a39025a..61128ccc6e6 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -738,7 +738,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, d_offsets, nullptr); str_counter.set_value(0, stream); - // if (max_length > WARP_THRESHOLD) + // for strings longer than WARP_THRESHOLD, 1 block per string parse_fn_string_parallel <<>>( str_tuples, @@ -754,9 +754,9 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, str_counter.set_value(0, stream); // CHARS column - std::unique_ptr chars2 = + std::unique_ptr chars = strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); - auto d_chars2 = chars2->mutable_view().data(); + auto d_chars = chars->mutable_view().data(); parse_fn_string_parallel <<>>( @@ -767,10 +767,10 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, null_count_data, options, d_offsets, - d_chars2); + d_chars); str_counter.set_value(0, stream); - // if (max_length > WARP_THRESHOLD) + // for strings longer than WARP_THRESHOLD, 1 block per string parse_fn_string_parallel <<>>( str_tuples, @@ -780,12 +780,12 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, null_count_data, options, d_offsets, - d_chars2); + d_chars); nvtxRangePop(); return make_strings_column(col_size, std::move(offsets2), - std::move(chars2), + std::move(chars), d_null_count.value(stream), std::move(null_mask)); } From 658d8ba2ba0a37b5a8078d8a93826d704a8e5940 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 28 Aug 2023 08:37:53 +0530 Subject: [PATCH 19/47] cleanup infer_data_type signature --- cpp/src/io/json/json_column.cu | 20 +++++++------ cpp/src/io/json/nested_json_gpu.cu | 9 +----- cpp/src/io/utilities/type_inference.cuh | 37 ++++++++++++------------- cpp/tests/io/type_inference_test.cu | 28 +++++++++---------- 4 files changed, 44 insertions(+), 50 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index bdad16bd9f1..479f7daf3ac 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -16,7 +16,7 @@ #include "nested_json.hpp" #include -#include +// #include #include #include @@ -52,6 +52,16 @@ #include #include +namespace cudf::io::detail { + +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream); +} + namespace cudf::io::json::detail { // DEBUG prints @@ -763,12 +773,6 @@ std::pair, std::vector> device_json_co // TODO how about directly storing pair in json_column? auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion auto string_spans_it = thrust::make_transform_iterator( @@ -790,7 +794,7 @@ std::pair, std::vector> device_json_co // Infer column type, if we don't have an explicit type for it else { target_type = cudf::io::detail::infer_data_type( - options.json_view(), d_input, string_ranges_it, col_size, stream); + options.json_view(), d_input, offset_length_it, col_size, stream); } auto [result_bitmask, null_count] = make_validity(json_col); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b691eaa8caf..c6d9b69d6cc 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1949,13 +1949,6 @@ std::pair, std::vector> json_column_to auto offset_length_it = thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion auto string_spans_it = thrust::make_transform_iterator( offset_length_it, [data = d_input.data()] __device__(auto ip) { @@ -1978,7 +1971,7 @@ std::pair, std::vector> json_column_to target_type = cudf::io::detail::infer_data_type(parsing_options(options, stream).json_view(), d_input, - string_ranges_it, + offset_length_it, col_size, stream); } diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index a9ccc80ca33..0b6c51fc647 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -114,14 +115,14 @@ __device__ __inline__ bool is_like_float(std::size_t len, * * @param[in] options View of inference options * @param[in] data JSON string input - * @param[in] column_strings_begin The beginning of an offset-length tuple sequence + * @param[in] offset_length_begin The beginning of an offset-length tuple sequence * @param[in] size Size of the string input * @param[out] column_info Histogram of column type counters */ template __global__ void infer_column_type_kernel(OptionsView options, device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t size, cudf::io::column_type_histogram* column_info) { @@ -129,8 +130,8 @@ __global__ void infer_column_type_kernel(OptionsView options, for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; idx += gridDim.x * blockDim.x) { - auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); - auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); + auto const field_offset = thrust::get<0>(*(offset_length_begin + idx)); + auto const field_len = thrust::get<1>(*(offset_length_begin + idx)); auto const field_begin = data.begin() + field_offset; if (cudf::detail::serialized_trie_contains( @@ -234,7 +235,7 @@ __global__ void infer_column_type_kernel(OptionsView options, * * @param options View of inference options * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence + * @param offset_length_begin The beginning of an offset-length tuple sequence * @param size Size of the string input * @param stream CUDA stream used for device memory operations and kernel launches * @return A histogram containing column-specific type counters @@ -242,7 +243,7 @@ __global__ void infer_column_type_kernel(OptionsView options, template cudf::io::column_type_histogram infer_column_type(OptionsView const& options, cudf::device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t const size, rmm::cuda_stream_view stream) { @@ -254,7 +255,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); infer_column_type_kernel<<>>( - options, data, column_strings_begin, size, d_column_info.data()); + options, data, offset_length_begin, size, d_column_info.data()); return d_column_info.value(stream); } @@ -266,28 +267,24 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, * @throw cudf::logic_error if date time is not inferred as string * @throw cudf::logic_error if data type inference failed * - * @tparam OptionsView Type of inference options view - * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to - * `thrust::tuple` - * * @param options View of inference options * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence + * @param offset_length_begin The beginning of an offset-length tuple sequence * @param size Size of the string input * @param stream CUDA stream used for device memory operations and kernel launches * @return The inferred data type */ -template -cudf::data_type infer_data_type(OptionsView const& options, - device_span data, - ColumnStringIter column_strings_begin, - std::size_t const size, - rmm::cuda_stream_view stream) +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(size != 0, "No data available for data type inference.\n"); - auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream); + auto const h_column_info = infer_column_type(options, data, offset_length_begin, size, stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index b2eb1b94f9c..7eec44af06d 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -50,8 +50,8 @@ TEST_F(TypeInference, Basic) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 4, 7}; - auto const string_length = std::vector{2, 2, 1}; + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -83,8 +83,8 @@ TEST_F(TypeInference, Null) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 4}; - auto const string_length = std::vector{0, 2, 1}; + auto const string_offset = std::vector{1, 1, 4}; + auto const string_length = std::vector{0, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -116,8 +116,8 @@ TEST_F(TypeInference, AllNull) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 1}; - auto const string_length = std::vector{0, 0, 4}; + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -149,8 +149,8 @@ TEST_F(TypeInference, String) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 8, 12}; - auto const string_length = std::vector{6, 3, 4}; + auto const string_offset = std::vector{1, 8, 12}; + auto const string_length = std::vector{6, 3, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -182,8 +182,8 @@ TEST_F(TypeInference, Bool) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 6, 12}; - auto const string_length = std::vector{4, 5, 5}; + auto const string_offset = std::vector{1, 6, 12}; + auto const string_length = std::vector{4, 5, 5}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -215,8 +215,8 @@ TEST_F(TypeInference, Timestamp) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 10}; - auto const string_length = std::vector{8, 9}; + auto const string_offset = std::vector{1, 10}; + auto const string_length = std::vector{8, 9}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -249,8 +249,8 @@ TEST_F(TypeInference, InvalidInput) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 3, 5, 7, 9}; - auto const string_length = std::vector{1, 1, 1, 1, 1}; + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( From 3e9a88c47d1c5f749483fd4ac214338f612b70b1 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 28 Aug 2023 08:50:43 +0530 Subject: [PATCH 20/47] Rename type_inference.cuh to .cu file --- cpp/CMakeLists.txt | 1 + cpp/src/io/json/json_column.cu | 12 +------ cpp/src/io/json/nested_json_gpu.cu | 2 +- cpp/src/io/utilities/string_parsing.hpp | 36 +++++++++++++++++++ .../{type_inference.cuh => type_inference.cu} | 10 +----- cpp/tests/io/type_inference_test.cu | 2 +- 6 files changed, 41 insertions(+), 22 deletions(-) create mode 100644 cpp/src/io/utilities/string_parsing.hpp rename cpp/src/io/utilities/{type_inference.cuh => type_inference.cu} (98%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 516865e5782..cbe5972049e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -418,6 +418,7 @@ add_library( src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu src/io/utilities/row_selection.cpp + src/io/utilities/type_inference.cu src/io/utilities/trie.cu src/jit/cache.cpp src/jit/parser.cpp diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 479f7daf3ac..5e32ed5c990 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -16,7 +16,7 @@ #include "nested_json.hpp" #include -// #include +#include #include #include @@ -52,16 +52,6 @@ #include #include -namespace cudf::io::detail { - -cudf::data_type infer_data_type( - cudf::io::json_inference_options_view const& options, - device_span data, - thrust::zip_iterator> offset_length_begin, - std::size_t const size, - rmm::cuda_stream_view stream); -} - namespace cudf::io::json::detail { // DEBUG prints diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index c6d9b69d6cc..baad52cd729 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp new file mode 100644 index 00000000000..b26df3ac844 --- /dev/null +++ b/cpp/src/io/utilities/string_parsing.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace cudf::io::detail { + +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream); +} diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cu similarity index 98% rename from cpp/src/io/utilities/type_inference.cuh rename to cpp/src/io/utilities/type_inference.cu index 0b6c51fc647..892c34bf269 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cu @@ -13,24 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once #include -#include +#include #include #include -#include #include -#include -#include #include -#include -#include -#include - #include #include diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 7eec44af06d..a14e7ecf5b3 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -14,8 +14,8 @@ * limitations under the License. */ +#include #include -#include #include #include From 1838341ddf49e19048aef7e5495ee6808531d10e Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 28 Aug 2023 15:03:55 +0530 Subject: [PATCH 21/47] Cleanup parse_data signature --- cpp/include/cudf/io/detail/data_casting.cuh | 36 +++--- cpp/src/io/json/json_column.cu | 29 +++-- cpp/src/io/json/nested_json_gpu.cu | 10 +- cpp/src/io/utilities/string_parsing.hpp | 47 +++++++- cpp/src/io/utilities/type_inference.cu | 14 --- cpp/tests/io/json_type_cast_test.cu | 118 +++++++++++--------- 6 files changed, 141 insertions(+), 113 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 61128ccc6e6..47b6cca0df7 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -30,6 +31,7 @@ #include #include +#include #include #include @@ -663,33 +665,25 @@ struct to_string_view_pair { } }; -/** - * @brief Parses the data from an iterator of string views, casting it to the given target data type - * - * @param str_tuples Iterator returning a string view, i.e., a (ptr, length) pair - * @param col_size The total number of items of this column - * @param col_type The column's target data type - * @param null_mask A null mask that renders certain items from the input invalid - * @param options Settings for controlling the processing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr The resource to be used for device memory allocation - * @return The column that contains the parsed data - */ -template -std::unique_ptr parse_data(str_tuple_it str_tuples, - size_type col_size, - data_type col_type, - B&& null_mask, - size_type null_count, - cudf::io::parse_options_view const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr parse_data( + const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto d_null_count = rmm::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); + // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion + auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); + if (col_type == cudf::data_type{cudf::type_id::STRING}) { auto const max_length = thrust::transform_reduce( rmm::exec_policy(stream), diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 5e32ed5c990..04264a44cea 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -331,23 +330,27 @@ std::vector copy_strings_to_host(device_span input, { CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); - rmm::device_uvector> string_views(num_strings, stream); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); thrust::transform(rmm::exec_policy(stream), d_offset_pairs, d_offset_pairs + num_strings, - string_views.begin(), - [data = input.data()] __device__(auto const& offsets) { + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto const& offsets) { // Note: first character for non-field columns - return thrust::make_pair( - data + thrust::get<0>(offsets), + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); }); cudf::io::parse_options_view options_view{}; options_view.quotechar = '\0'; // no quotes options_view.keepquotes = true; - auto d_column_names = parse_data(string_views.begin(), + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, num_strings, data_type{type_id::STRING}, rmm::device_buffer{}, @@ -355,7 +358,7 @@ std::vector copy_strings_to_host(device_span input, options_view, stream, rmm::mr::get_current_device_resource()); - auto to_host = [stream](auto const& col) { + auto to_host = [stream](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( @@ -764,13 +767,6 @@ std::pair, std::vector> device_json_co auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - data_type target_type{}; if (schema.has_value()) { @@ -789,7 +785,8 @@ std::pair, std::vector> device_json_co auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index baad52cd729..b99b8d2647d 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1949,13 +1949,6 @@ std::pair, std::vector> json_column_to auto offset_length_it = thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin()); - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - data_type target_type{}; if (schema.has_value()) { @@ -1979,7 +1972,8 @@ std::pair, std::vector> json_column_to auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp index b26df3ac844..12fc0a5b2e7 100644 --- a/cpp/src/io/utilities/string_parsing.hpp +++ b/cpp/src/io/utilities/string_parsing.hpp @@ -25,12 +25,55 @@ #include #include -namespace cudf::io::detail { +namespace cudf::io { +namespace detail { +/** + * @brief Infers data type for a given JSON string input `data`. + * + * @throw cudf::logic_error if input size is 0 + * @throw cudf::logic_error if date time is not inferred as string + * @throw cudf::logic_error if data type inference failed + * + * @param options View of inference options + * @param data JSON string input + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param size Size of the string input + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The inferred data type + */ cudf::data_type infer_data_type( cudf::io::json_inference_options_view const& options, device_span data, thrust::zip_iterator> offset_length_begin, std::size_t const size, rmm::cuda_stream_view stream); -} +} // namespace detail + +namespace json::detail { + +/** + * @brief Parses the data from an iterator of string views, casting it to the given target data type + * + * @param data string input base pointer + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param col_size The total number of items of this column + * @param col_type The column's target data type + * @param null_mask A null mask that renders certain items from the input invalid + * @param options Settings for controlling the processing behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr The resource to be used for device memory allocation + * @return The column that contains the parsed data + */ +std::unique_ptr parse_data( + const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); +} // namespace json::detail +} // namespace cudf::io diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index 892c34bf269..79a5c8f1c4c 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -252,20 +252,6 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, return d_column_info.value(stream); } -/** - * @brief Infers data type for a given JSON string input `data`. - * - * @throw cudf::logic_error if input size is 0 - * @throw cudf::logic_error if date time is not inferred as string - * @throw cudf::logic_error if data type inference failed - * - * @param options View of inference options - * @param data JSON string input - * @param offset_length_begin The beginning of an offset-length tuple sequence - * @param size Size of the string input - * @param stream CUDA stream used for device memory operations and kernel launches - * @return The inferred data type - */ cudf::data_type infer_data_type( cudf::io::json_inference_options_view const& options, device_span data, diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 5c32131114d..48a866fa5bc 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -37,11 +37,10 @@ using namespace cudf::test::iterators; struct JSONTypeCastTest : public cudf::test::BaseFixture {}; namespace { -struct to_thrust_pair_fn { - __device__ thrust::pair operator()( - thrust::pair const& p) +struct offsets_to_length { + __device__ cudf::size_type operator()(thrust::tuple const& p) { - return {p.first.data(), p.first.size_bytes()}; + return thrust::get<1>(p) - thrust::get<0>(p); } }; } // namespace @@ -67,26 +66,31 @@ TEST_F(JSONTypeCastTest, String) std::vector input_values{"this", "is", "null", "of", "", "strings", R"("null")"}; cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids); - auto d_column = cudf::column_device_view::create(input); - rmm::device_uvector> svs(d_column->size(), stream); + auto column = cudf::strings_column_view(input); + auto offsets_begin = column.offsets_begin(); + auto offsets_pair = + thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); + rmm::device_uvector svs_length(column.size(), stream); thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + offsets_pair, + offsets_pair + column.size(), + svs_length.begin(), + offsets_to_length{}); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto str_col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); @@ -103,26 +107,31 @@ TEST_F(JSONTypeCastTest, Int) auto const type = cudf::data_type{cudf::type_id::INT64}; cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"}); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); + auto column = cudf::strings_column_view(data); + auto offsets_begin = column.offsets_begin(); + auto offsets_pair = + thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); + rmm::device_uvector svs_length(column.size(), stream); thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + offsets_pair, + offsets_pair + column.size(), + svs_length.begin(), + offsets_to_length{}); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; @@ -146,26 +155,31 @@ TEST_F(JSONTypeCastTest, StringEscapes) R"("escape with nothing to escape \")", R"("\"\\\/\b\f\n\r\t")", }); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); + auto column = cudf::strings_column_view(data); + auto offsets_begin = column.offsets_begin(); + auto offsets_pair = + thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); + rmm::device_uvector svs_length(column.size(), stream); thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + offsets_pair, + offsets_pair + column.size(), + svs_length.begin(), + offsets_to_length{}); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::strings_column_wrapper{ {"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"}, From efe9712cdeef333011c2f3b9c4856db60f55937f Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 28 Aug 2023 15:22:19 +0530 Subject: [PATCH 22/47] rename data_casting.cuh to .cu --- cpp/CMakeLists.txt | 1 + cpp/src/io/json/nested_json_gpu.cu | 1 - .../data_casting.cuh => src/io/utilities/data_casting.cu} | 2 -- cpp/tests/io/json_type_cast_test.cu | 5 ++++- 4 files changed, 5 insertions(+), 4 deletions(-) rename cpp/{include/cudf/io/detail/data_casting.cuh => src/io/utilities/data_casting.cu} (99%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cbe5972049e..05e0c33ace9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -413,6 +413,7 @@ add_library( src/io/utilities/arrow_io_source.cpp src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp + src/io/utilities/data_casting.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b99b8d2647d..ea185699ce4 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/src/io/utilities/data_casting.cu similarity index 99% rename from cpp/include/cudf/io/detail/data_casting.cuh rename to cpp/src/io/utilities/data_casting.cu index 47b6cca0df7..30173cc2272 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/src/io/utilities/data_casting.cu @@ -14,8 +14,6 @@ * limitations under the License. */ -#pragma once - #include #include diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 48a866fa5bc..74085c7ebbb 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -21,15 +21,18 @@ #include #include +#include + #include #include #include -#include #include #include #include #include +#include + #include using namespace cudf::test::iterators; From 1d869962d61e6cdb2d5f4d4085f30556c90ab0af Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 29 Aug 2023 22:26:56 +0530 Subject: [PATCH 23/47] move get_escaped_char to parsing_utils.cuh --- cpp/src/io/json/write_json.cu | 16 +--------------- cpp/src/io/utilities/data_casting.cu | 21 --------------------- cpp/src/io/utilities/parsing_utils.cuh | 24 +++++++++++++++++++++++- 3 files changed, 24 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 5014140991d..303c32ea78d 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -20,6 +20,7 @@ */ #include +#include #include #include @@ -61,21 +62,6 @@ #include namespace cudf::io::json::detail { -__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return {'\\', '"'}; - case '\\': return {'\\', '\\'}; - case '/': return {'\\', '/'}; - case '\b': return {'\\', 'b'}; - case '\f': return {'\\', 'f'}; - case '\n': return {'\\', 'n'}; - case '\r': return {'\\', 'r'}; - case '\t': return {'\\', 't'}; - // case 'u': return UNICODE_SEQ; - default: return {'\0', escaped_char}; - } -} std::unique_ptr make_column_names_column(host_span column_names, size_type num_columns, diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 30173cc2272..7b076800f2c 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -103,27 +103,6 @@ __device__ __forceinline__ char get_escape_char(char escaped_char) } } -/** - * @brief Returns the escaped characters for a given character. - * - * @param escaped_char The character to escape. - * @return The escaped characters for a given character. - */ -__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return {'\\', '"'}; - case '\\': return {'\\', '\\'}; - case '/': return {'\\', '/'}; - case '\b': return {'\\', 'b'}; - case '\f': return {'\\', 'f'}; - case '\n': return {'\\', 'n'}; - case '\r': return {'\\', 'r'}; - case '\t': return {'\\', 't'}; - // case 'u': return UNICODE_SEQ; - default: return {'\0', escaped_char}; - } -} /** * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence * \uXXXX. diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 5c3af588411..43d62fcd513 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,6 +116,28 @@ struct parse_options { } }; +/** + * @brief Returns the escaped characters for a given character. + * + * @param escaped_char The character to escape. + * @return The escaped characters for a given character. + */ +__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return {'\\', '"'}; + case '\\': return {'\\', '\\'}; + case '/': return {'\\', '/'}; + case '\b': return {'\\', 'b'}; + case '\f': return {'\\', 'f'}; + case '\n': return {'\\', 'n'}; + case '\r': return {'\\', 'r'}; + case '\t': return {'\\', 't'}; + // case 'u': return UNICODE_SEQ; + default: return {'\0', escaped_char}; + } +} + /** * @brief Returns the numeric value of an ASCII/UTF-8 character. * Handles hexadecimal digits, both uppercase and lowercase From e11452f43032f785967a2fc26762212740ce5609 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 29 Aug 2023 22:46:47 +0530 Subject: [PATCH 24/47] last backslash errored bug fix add unit tests with unicode to use both kernels --- cpp/src/io/utilities/data_casting.cu | 85 ++++++++++++++-------------- cpp/tests/io/json_test.cpp | 72 ++++++++++++++++------- 2 files changed, 93 insertions(+), 64 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 7b076800f2c..08e0fdb0096 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -431,10 +431,49 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, auto c = is_within_bounds ? in_begin[char_index] : '\0'; auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; auto escaped_char = get_escape_char(c); - bool error = false; + + bool is_escaping_backslash{false}; + // To check current is backslash by checking if previous is backslash. + // curr = !prev & c=='\\' + // So, scan is required from beginning of string. + // State table approach (intra-warp FST) + // 2 states: Not-Slash(NS), Slash(S). + // prev / * + // NS S NS + // S NS NS + // After inclusive scan, all current S states translate to escaping backslash. + // All escaping backslash should be skipped. + + struct state_table { + bool state[2]; + }; + state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. + auto composite_op = [](state_table op1, state_table op2) { + return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; + }; + state_table scanned; + [[maybe_unused]] auto warp_id = threadIdx.x / BLOCK_SIZE; + // inclusive scan of escaping backslashes + // TODO both inclusive and exclusive available in cub. + if constexpr (is_warp) { + using SlashScan = cub::WarpScan; + __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; + SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.state[init_state]; + init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); + } else { + using SlashScan = cub::BlockScan; + __shared__ typename SlashScan::TempStorage temp_slash; + SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.state[init_state]; + if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; + // There is another __syncthreads() at the end of for-loop. + } + + bool error = false; if (is_within_bounds) { - // TODO instead of '\\', use is_escaping_backslash, and previous index value also here. - error |= (c == '\\' && char_index == (in_end - in_begin) - 1); + // instead of '\\', using is_escaping_backslash, and previous index value also here. + error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1); error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); error |= (prev_c == '\\' && c == 'u' && // TODO check if following condition is right or off by one error. @@ -466,6 +505,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, break; // gride-stride return; } bool skip = !is_within_bounds; // false; + skip |= is_escaping_backslash; if (is_within_bounds) { skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && in_begin[char_index - 1] == 'u'; @@ -479,45 +519,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, int this_num_out = 0; cudf::char_utf8 write_char{}; - // To check current is backslash by checking if previous is backslash. - // curr = !prev & c=='\\' - // So, scan is required from beginning of string. - // State table approach (intra-warp FST) - // 2 states: Not-Slash(NS), Slash(S). - // prev / * - // NS S NS - // S NS NS - // After inclusive scan, all current S states translate to escaping backslash. - // All escaping backslash should be skipped. - - struct state_table { - bool state[2]; - }; - state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. - auto composite_op = [](state_table op1, state_table op2) { - return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; - }; - state_table scanned; - [[maybe_unused]] auto warp_id = threadIdx.x / BLOCK_SIZE; - // inclusive scan of escaping backslashes - // TODO both inclusive and exclusive available in cub. - if constexpr (is_warp) { - using SlashScan = cub::WarpScan; - __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; - SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); - auto is_escaping_backslash = scanned.state[init_state]; - init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); - skip |= is_escaping_backslash; - } else { - using SlashScan = cub::BlockScan; - __shared__ typename SlashScan::TempStorage temp_slash; - SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); - auto is_escaping_backslash = scanned.state[init_state]; - if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; - // There is another __syncthreads() at the end of for-loop. - skip |= is_escaping_backslash; - } - if (!skip) { if (prev_c != '\\') { this_num_out = 1; diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 0b9ff854618..4bced6397d1 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1372,35 +1373,62 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) TEST_F(JsonReaderTest, JsonLongString) { - std::string json_string = - R"({"a":"a0"} - {"a":"a1"} - {"a":"a2", "b":"b2"} - {"a":"a3", "c":"c3"})"; - json_string += std::string("\n") + R"({"a":"a4\u20ac)" + std::string(103, 'a') + "\"}"; - json_string += std::string("\n") + R"({"a":"a5\u20ac)" + std::string(1034, 'a') + "\"}"; - json_string += std::string("\n") + R"({"a":"a6\u20ac)" + std::string(10340, 'a') + "\"}"; - - cudf::test::strings_column_wrapper col1{"a0", - "a1", - "a2", - "a3", - "a4€" + std::string(103, 'a'), - "a5€" + std::string(1034, 'a'), - "a6€" + std::string(10340, 'a')}; - cudf::test::strings_column_wrapper col2{{"", "", "b2", "", "", "", ""}, {0, 0, 1, 0, 0, 0, 0}}; - cudf::test::strings_column_wrapper col3{{"", "", "", "c3", "", "", ""}, {0, 0, 0, 1, 0, 0, 0}}; - cudf::table_view expected({col1, col2, col3}); + // Unicode + // 0000-FFFF Basic Multilingual Plane + // 10000-10FFFF Supplementary Plane + cudf::test::strings_column_wrapper col1{ + {"\"\\/\b\f\n\r\t", + "\"", + "\\", + "/", + "\b", + "\f\n", + "\r\t", + "$€", + "ராபிட்ஸ்", + "C𝞵𝓓𝒻", + "", // null + "", // null + "கார்த்தி", + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉"}, // 30000-3FFFF + cudf::test::iterators::nulls_at({10, 11})}; + + cudf::test::fixed_width_column_wrapper repeat_times{ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 80, 160, 320, 640, 1280}; + auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times); + auto col2 = d_col2->view(); + cudf::table_view tbl_view{{col1, col2, repeat_times}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(true) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + + cudf::table_view expected({col1, col2, repeat_times}); + std::map types; + types["col1"] = data_type{type_id::STRING}; + types["col2"] = data_type{type_id::STRING}; + types["int16"] = data_type{type_id::INT16}; // Initialize parsing options (reading json lines) cudf::io::json_reader_options json_lines_options = cudf::io::json_reader_options::builder( - cudf::io::source_info{json_string.c_str(), json_string.size()}) - .lines(true); + cudf::io::source_info{out_buffer.data(), out_buffer.size()}) + .lines(true) + .dtypes(types); // Read test data via nested JSON reader auto const table = cudf::io::read_json(json_lines_options); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view()); } TEST_F(JsonReaderTest, TokenAllocation) From 9426dda58b52d6832c918a80508b8f0ea589df18 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 30 Aug 2023 20:02:30 +0530 Subject: [PATCH 25/47] address review comments, update docs --- cpp/src/io/utilities/data_casting.cu | 56 ++++++++++++++++++---------- cpp/tests/io/json_test.cpp | 2 +- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 08e0fdb0096..7f619eae3c1 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -186,8 +186,6 @@ process_string(in_iterator_t in_begin, } return {bytes, data_casting_result::PARSING_SUCCESS}; } - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. {"a":1.23}) char const backslash_char = '\\'; // Escape-flag, set after encountering a backslash character @@ -284,27 +282,27 @@ process_string(in_iterator_t in_begin, return {bytes, data_casting_result::PARSING_SUCCESS}; } -// 1 warp per string. -// algorithm +// Algorithm: warp/block parallel version of string_parse and process_string() +// Decoding character classes (u8, u16, \*, *): // character count: input->output // \uXXXX 6->2/3/4 // \uXXXX\uXXXX 12->2/3/4 // \" 2->1 // * 1->1 // -// error conditions. (propagate) -// c=='\' & curr_idx == end_idx-1; ERROR +// ERROR conditions. (all collaborating threads quit) +// c=='\' & curr_idx == end_idx-1; // [c-1]=='\' & get_escape[c]==NEC // [c-1]=='\' & [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT // [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex - -// skip conditions. (scan for size) -// c=='\' skip. +// +// skip conditions. (current thread skips this char, no output) +// c=='\' skip. (Escaping char only) // [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip. - +// // write conditions. (write to d_buffer) // [c-1]!='\' & [c]!='\' write [c] -// [c-1]!='\' & [c]=='\' skip (unnecessary? already covered? in skip conditions) +// [c-1]!='\' & [c]=='\' skip (already covered in skip conditions) // [c-1]=='\' & [c]!=NEC && [c]!=UNICODE_SEQ, write [c] // [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE // [c+1:4]=curr_hex_val @@ -313,11 +311,28 @@ process_string(in_iterator_t in_begin, // // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && // hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val // // if prev_hex_val in high, curr_hex_val in low, skip. -// // if curr_hex_val in high, next_hex_val in low, write u16. -// if curr_hex_val not in high, write u8. -// before writing, find size, then intra-warp scan for out_idx -// propagate offset from 32nd thread to others in warp to carry forward. -// 1 warp per string or 1 block per string +// // if curr_hex_val in high, next_hex_val in low, write [u16] +// if curr_hex_val not in high, write [u8] +// before writing, find num of output characters per threads, +// then do intra-warp/intra-block scan for out_idx +// propagate offset from next iteration to carry forward. +// Uses 1 warp per string or 1 block per string + +/** + * @brief Warp/Block parallel version of string_parse functor + * + * @tparam is_warp True if 1 warp per string, False if 1 block per string + * @tparam num_warps Number of warps per block + * @tparam str_tuple_it Iterator type for tuple with string pointer and its length + * @param str_tuples iterator of tuple with string pointer and its length + * @param total_out_strings Number of string rows to be processed + * @param str_counter Counter to keep track of processed number of strings + * @param null_mask Null mask + * @param null_count_data pointer to store null count + * @param options Settings for controlling string processing behavior + * @param d_offsets Offsets to identify where to store the results for each string + * @param d_chars Character array to store the characters of strings + */ template __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, size_type total_out_strings, @@ -401,7 +416,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, ++in_begin; --in_end; } - // warp-parallelized process_string(in_begin, in_end, d_buffer, options); + // warp-parallelized or block-parallelized process_string() auto is_hex = [](auto ch) { return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); @@ -436,7 +451,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // To check current is backslash by checking if previous is backslash. // curr = !prev & c=='\\' // So, scan is required from beginning of string. - // State table approach (intra-warp FST) + // State table approach (intra-warp FST) (intra-block FST) // 2 states: Not-Slash(NS), Slash(S). // prev / * // NS S NS @@ -528,6 +543,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, this_num_out = 1; write_char = escaped_char; } else { + // TODO if prev is not escaping backslash, copy \uXXXX. // Unicode // \uXXXX auto hex_val = parse_unicode_hex(in_begin + char_index + 1); @@ -696,8 +712,8 @@ std::unique_ptr parse_data( constexpr auto warps_per_block = 8; constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; - auto num_blocks = min(65535, col_size / warps_per_block + 1); - auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + auto num_blocks = min(65535, cudf::util::div_rounding_up_unsafe(col_size, warps_per_block)); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); parse_fn_string_parallel <<>>( diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 4bced6397d1..8ad4fec0c3c 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1397,7 +1397,7 @@ TEST_F(JsonReaderTest, JsonLongString) cudf::test::iterators::nulls_at({10, 11})}; cudf::test::fixed_width_column_wrapper repeat_times{ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 80, 160, 320, 640, 1280}; + 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279}; auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times); auto col2 = d_col2->view(); cudf::table_view tbl_view{{col1, col2, repeat_times}}; From b481d7cfa0bcddaf4b227bd6cf99a13e99c64de3 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 31 Aug 2023 23:43:50 +0530 Subject: [PATCH 26/47] add complex test cases --- cpp/tests/io/json_test.cpp | 50 +++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 8ad4fec0c3c..050f75dcb83 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1377,30 +1377,40 @@ TEST_F(JsonReaderTest, JsonLongString) // 0000-FFFF Basic Multilingual Plane // 10000-10FFFF Supplementary Plane cudf::test::strings_column_wrapper col1{ - {"\"\\/\b\f\n\r\t", - "\"", - "\\", - "/", - "\b", - "\f\n", - "\r\t", - "$€", - "ராபிட்ஸ்", - "C𝞵𝓓𝒻", - "", // null - "", // null - "கார்த்தி", - "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF - "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF - "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF - "𰾑𱔈𲍉"}, // 30000-3FFFF + { + "\"\\/\b\f\n\r\t", + "\"", + "\\", + "/", + "\b", + "\f\n", + "\r\t", + "$€", + "ராபிட்ஸ்", + "C𝞵𝓓𝒻", + "", // null + "", // null + "கார்த்தி", + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉", // 30000-3FFFF + R"("$€ \u0024\u20ac \\u0024\\u20ac \\\u0024\\\u20ac \\\\u0024\\\\u20ac)", + R"( \\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\abcd)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + }, cudf::test::iterators::nulls_at({10, 11})}; cudf::test::fixed_width_column_wrapper repeat_times{ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279}; + 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3}; auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times); auto col2 = d_col2->view(); - cudf::table_view tbl_view{{col1, col2, repeat_times}}; + cudf::table_view const tbl_view{{col1, col2, repeat_times}}; cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}}; std::vector out_buffer; @@ -1413,7 +1423,7 @@ TEST_F(JsonReaderTest, JsonLongString) cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); - cudf::table_view expected({col1, col2, repeat_times}); + cudf::table_view const expected = tbl_view; std::map types; types["col1"] = data_type{type_id::STRING}; types["col2"] = data_type{type_id::STRING}; From a5acda6042b00e2f56e05fcb428675b91c1c1253 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 31 Aug 2023 23:44:17 +0530 Subject: [PATCH 27/47] add is escaping backslash lookback --- cpp/src/io/utilities/data_casting.cu | 84 ++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 7f619eae3c1..9711e1947a6 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -38,6 +38,7 @@ #include #include +#include namespace cudf::io::json::detail { @@ -282,6 +283,51 @@ process_string(in_iterator_t in_begin, return {bytes, data_casting_result::PARSING_SUCCESS}; } +template +struct bitfield_warp { + // 5+32 for each warp. + bool is_slash[num_warps][5 + 32]; + __device__ void reset(unsigned warp_id) + { + is_slash[warp_id][threadIdx.x % 32] = 0; + is_slash[warp_id][threadIdx.x % 32 + 5] = 0; + } + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x % 32 < 5) + is_slash[warp_id][threadIdx.x % 32] = is_slash[warp_id][32 + threadIdx.x % 32]; + } + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[warp_id][5 + threadIdx.x % 32] = is_escaping_backslash; + } + __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[warp_id][5 + bit_index]; + } +}; + +template +struct bitfield_block { + // 5 + num_warps*32 for entire block + bool is_slash[5 + num_warps * 32]; + + __device__ void reset(unsigned warp_id) + { + is_slash[threadIdx.x] = 0; + is_slash[threadIdx.x + 5] = 0; + } + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x < 5) is_slash[threadIdx.x] = is_slash[num_warps * 32 + threadIdx.x]; + } + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[5 + threadIdx.x] = is_escaping_backslash; + } + __device__ bool get_bit(unsigned warp_id, int bit_index) { return is_slash[5 + bit_index]; } +}; + // Algorithm: warp/block parallel version of string_parse and process_string() // Decoding character classes (u8, u16, \*, *): // character count: input->output @@ -423,6 +469,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, }; // for backslash scan calculation: is_previous_escaping_backslash + [[maybe_unused]] auto warp_id = threadIdx.x / cudf::detail::warp_size; bool init_state_reg; __shared__ bool init_state_shared; size_type last_offset_reg; @@ -433,10 +480,13 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, init_state = false; last_offset = 0; } - // if constexpr(!is_warp) { __syncthreads(); } + using bitfield = + std::conditional_t, bitfield_block>; + __shared__ bitfield is_slash; + is_slash.reset(warp_id); + __syncthreads(); // 0-31, 32-63, ... i*32-n. // entire warp executes but with mask. - // auto MASK = 0xffffffff; for (size_type char_index = lane; char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { @@ -448,6 +498,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, auto escaped_char = get_escape_char(c); bool is_escaping_backslash{false}; + [[maybe_unused]] bool is_prev_escaping_backslash{false}; // To check current is backslash by checking if previous is backslash. // curr = !prev & c=='\\' // So, scan is required from beginning of string. @@ -467,7 +518,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; }; state_table scanned; - [[maybe_unused]] auto warp_id = threadIdx.x / BLOCK_SIZE; // inclusive scan of escaping backslashes // TODO both inclusive and exclusive available in cub. if constexpr (is_warp) { @@ -476,12 +526,24 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); is_escaping_backslash = scanned.state[init_state]; init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); + __syncwarp(); + is_slash.shift(warp_id); + __syncwarp(); + is_slash.set_bits(warp_id, is_escaping_backslash); + __syncwarp(); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); } else { using SlashScan = cub::BlockScan; __shared__ typename SlashScan::TempStorage temp_slash; SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); is_escaping_backslash = scanned.state[init_state]; if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; + __syncthreads(); + is_slash.shift(warp_id); + __syncthreads(); + is_slash.set_bits(warp_id, is_escaping_backslash); + __syncthreads(); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); // There is another __syncthreads() at the end of for-loop. } @@ -489,8 +551,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, if (is_within_bounds) { // instead of '\\', using is_escaping_backslash, and previous index value also here. error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1); - error |= (prev_c == '\\' && escaped_char == NON_ESCAPE_CHAR); - error |= (prev_c == '\\' && c == 'u' && + error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR); + error |= (is_prev_escaping_backslash && c == 'u' && // TODO check if following condition is right or off by one error. ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | @@ -522,20 +584,21 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, bool skip = !is_within_bounds; // false; skip |= is_escaping_backslash; if (is_within_bounds) { - skip |= char_index - 2 >= 0 && in_begin[char_index - 2] == '\\' && + // \uXXXX check for "\u" for each X + skip |= char_index - 2 >= 0 && is_slash.get_bit(warp_id, lane - 2) && in_begin[char_index - 1] == 'u'; - skip |= char_index - 3 >= 0 && in_begin[char_index - 3] == '\\' && + skip |= char_index - 3 >= 0 && is_slash.get_bit(warp_id, lane - 3) && in_begin[char_index - 2] == 'u'; - skip |= char_index - 4 >= 0 && in_begin[char_index - 4] == '\\' && + skip |= char_index - 4 >= 0 && is_slash.get_bit(warp_id, lane - 4) && in_begin[char_index - 3] == 'u'; - skip |= char_index - 5 >= 0 && in_begin[char_index - 5] == '\\' && + skip |= char_index - 5 >= 0 && is_slash.get_bit(warp_id, lane - 5) && in_begin[char_index - 4] == 'u'; } int this_num_out = 0; cudf::char_utf8 write_char{}; if (!skip) { - if (prev_c != '\\') { + if (!is_prev_escaping_backslash) { this_num_out = 1; if (d_chars) write_char = c; } else { @@ -543,7 +606,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, this_num_out = 1; write_char = escaped_char; } else { - // TODO if prev is not escaping backslash, copy \uXXXX. // Unicode // \uXXXX auto hex_val = parse_unicode_hex(in_begin + char_index + 1); From 4a3941fd6a01e5d064042c17f9e25e274d70b22d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 1 Sep 2023 02:53:44 +0530 Subject: [PATCH 28/47] call thread kernel for small size, adjust sizes --- cpp/src/io/utilities/data_casting.cu | 108 +++++++++++++++------------ 1 file changed, 61 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 9711e1947a6..c3a9c8bbb91 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -42,8 +42,8 @@ namespace cudf::io::json::detail { -constexpr auto SINGLE_THREAD_THRESHOLD = 128; -constexpr auto WARP_THRESHOLD = 1024; +constexpr auto SINGLE_THREAD_THRESHOLD = 512; +constexpr auto WARP_THRESHOLD = 1024 * 4; // Unicode code point escape sequence static constexpr char UNICODE_SEQ = 0x7F; @@ -420,6 +420,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, auto in_end = in_begin + str_tuples[istring].second; auto const num_in_chars = str_tuples[istring].second; if constexpr (is_warp) { + if (!(num_in_chars > SINGLE_THREAD_THRESHOLD)) continue; if (num_in_chars > WARP_THRESHOLD) continue; } else { if (num_in_chars <= WARP_THRESHOLD) continue; @@ -685,6 +686,8 @@ struct string_parse { auto const in_end = in_begin + str_tuples[idx].second; auto const num_in_chars = str_tuples[idx].second; + if (num_in_chars > SINGLE_THREAD_THRESHOLD) return; + // Check if the value corresponds to the null literal auto const is_null_literal = (!d_chars) && @@ -749,34 +752,29 @@ std::unique_ptr parse_data( size_type{0}, thrust::maximum{}); - if (max_length < SINGLE_THREAD_THRESHOLD) { - // this utility calls the functor to build the offsets and chars columns; - // the bitmask and null count may be updated by parse failures - nvtxRangePush("make_strings_children"); - auto [offsets, chars] = cudf::strings::detail::make_strings_children( - string_parse{ - str_tuples, static_cast(null_mask.data()), null_count_data, options}, - col_size, - stream, - mr); - nvtxRangePop(); - - return make_strings_column(col_size, - std::move(offsets), - std::move(chars), - d_null_count.value(stream), - std::move(null_mask)); - } else { - nvtxRangePush("string_parallel"); - auto offsets2 = cudf::make_numeric_column( - data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); - auto d_offsets = offsets2->mutable_view().data(); - - constexpr auto warps_per_block = 8; - constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; - auto num_blocks = min(65535, cudf::util::div_rounding_up_unsafe(col_size, warps_per_block)); - auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); - + nvtxRangePush("string_parallel"); + auto offsets = cudf::make_numeric_column( + data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + auto single_thread_fn = + string_parse{str_tuples, + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets}; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + constexpr auto warps_per_block = 8; + constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; + auto num_blocks = min(65535, cudf::util::div_rounding_up_unsafe(col_size, warps_per_block)); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + + // TODO run these independent kernels in parallel streams. + if (max_length > SINGLE_THREAD_THRESHOLD) { parse_fn_string_parallel <<>>( str_tuples, @@ -787,8 +785,11 @@ std::unique_ptr parse_data( options, d_offsets, nullptr); - str_counter.set_value(0, stream); + } + + if (max_length > WARP_THRESHOLD) { // for strings longer than WARP_THRESHOLD, 1 block per string + str_counter.set_value(0, stream); parse_fn_string_parallel <<>>( str_tuples, @@ -799,15 +800,26 @@ std::unique_ptr parse_data( options, d_offsets, nullptr); - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); + } + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", + std::overflow_error); + + // CHARS column + std::unique_ptr chars = + strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); + auto d_chars = chars->mutable_view().data(); + + single_thread_fn.d_chars = d_chars; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + if (max_length > SINGLE_THREAD_THRESHOLD) { str_counter.set_value(0, stream); - - // CHARS column - std::unique_ptr chars = - strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); - auto d_chars = chars->mutable_view().data(); - parse_fn_string_parallel <<>>( str_tuples, @@ -818,8 +830,10 @@ std::unique_ptr parse_data( options, d_offsets, d_chars); - str_counter.set_value(0, stream); + } + if (max_length > WARP_THRESHOLD) { + str_counter.set_value(0, stream); // for strings longer than WARP_THRESHOLD, 1 block per string parse_fn_string_parallel <<>>( @@ -831,14 +845,14 @@ std::unique_ptr parse_data( options, d_offsets, d_chars); - nvtxRangePop(); - - return make_strings_column(col_size, - std::move(offsets2), - std::move(chars), - d_null_count.value(stream), - std::move(null_mask)); } + nvtxRangePop(); + + return make_strings_column(col_size, + std::move(offsets), + std::move(chars), + d_null_count.value(stream), + std::move(null_mask)); } auto out_col = From d227dad02b49d0359b05a117dc3e2391eddb3f2d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 1 Sep 2023 22:27:06 +0530 Subject: [PATCH 29/47] address review comments --- cpp/src/io/utilities/data_casting.cu | 75 ++++++++++++++++++---------- 1 file changed, 48 insertions(+), 27 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index c3a9c8bbb91..42f4491cec3 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -180,17 +181,14 @@ process_string(in_iterator_t in_begin, // Copy literal/numeric value if (not is_string_value) { - while (in_begin != in_end) { - if (d_buffer) *d_buffer++ = *in_begin; - ++in_begin; - ++bytes; - } + bytes += (in_end - in_begin); + if (d_buffer) d_buffer = thrust::copy(thrust::seq, in_begin, in_end, d_buffer); return {bytes, data_casting_result::PARSING_SUCCESS}; } - char const backslash_char = '\\'; + char constexpr backslash_char = '\\'; // Escape-flag, set after encountering a backslash character - bool escape = false; + bool is_prev_char_escape = false; // Exclude beginning and ending quote chars from string range if (!options.keepquotes) { @@ -201,9 +199,9 @@ process_string(in_iterator_t in_begin, // Iterate over the input while (in_begin != in_end) { // Copy single character to output - if (!escape) { - escape = (*in_begin == backslash_char); - if (!escape) { + if (!is_prev_char_escape) { + is_prev_char_escape = (*in_begin == backslash_char); + if (!is_prev_char_escape) { if (d_buffer) *d_buffer++ = *in_begin; ++bytes; } @@ -213,7 +211,7 @@ process_string(in_iterator_t in_begin, // Previous char indicated beginning of escape sequence // Reset escape flag for next loop iteration - escape = false; + is_prev_char_escape = false; // Check the character that is supposed to be escaped auto escaped_char = get_escape_char(*in_begin); @@ -279,53 +277,77 @@ process_string(in_iterator_t in_begin, } // The last character of the input is a backslash -> "fail"/null for this item - if (escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } + if (is_prev_char_escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } return {bytes, data_casting_result::PARSING_SUCCESS}; } +/** + * @brief Datastructure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * warp. + * + * @tparam num_warps number of warps in the block + */ template struct bitfield_warp { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back upto 5 chars are needed. // 5+32 for each warp. - bool is_slash[num_warps][5 + 32]; + bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size]; __device__ void reset(unsigned warp_id) { - is_slash[warp_id][threadIdx.x % 32] = 0; - is_slash[warp_id][threadIdx.x % 32 + 5] = 0; + if (threadIdx.x < UNICODE_LOOK_BACK) { + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = 0; + } + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0; } __device__ void shift(unsigned warp_id) { - if (threadIdx.x % 32 < 5) - is_slash[warp_id][threadIdx.x % 32] = is_slash[warp_id][32 + threadIdx.x % 32]; + if (threadIdx.x % 32 < UNICODE_LOOK_BACK) + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = + is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size]; } __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) { - is_slash[warp_id][5 + threadIdx.x % 32] = is_escaping_backslash; + is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] = + is_escaping_backslash; } __device__ bool get_bit(unsigned warp_id, int bit_index) { - return is_slash[warp_id][5 + bit_index]; + return is_slash[warp_id][UNICODE_LOOK_BACK + bit_index]; } }; +/** + * @brief Datastructure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * block. + * + * @tparam num_warps number of warps in the block + */ template struct bitfield_block { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back upto 5 chars are needed. // 5 + num_warps*32 for entire block - bool is_slash[5 + num_warps * 32]; + bool is_slash[UNICODE_LOOK_BACK + num_warps * 32]; __device__ void reset(unsigned warp_id) { - is_slash[threadIdx.x] = 0; - is_slash[threadIdx.x + 5] = 0; + if (threadIdx.x < UNICODE_LOOK_BACK) { is_slash[threadIdx.x] = 0; } + is_slash[threadIdx.x + UNICODE_LOOK_BACK] = 0; } __device__ void shift(unsigned warp_id) { - if (threadIdx.x < 5) is_slash[threadIdx.x] = is_slash[num_warps * 32 + threadIdx.x]; + if (threadIdx.x < UNICODE_LOOK_BACK) + is_slash[threadIdx.x] = is_slash[num_warps * 32 + threadIdx.x]; } __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) { - is_slash[5 + threadIdx.x] = is_escaping_backslash; + is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash; + } + __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[UNICODE_LOOK_BACK + bit_index]; } - __device__ bool get_bit(unsigned warp_id, int bit_index) { return is_slash[5 + bit_index]; } }; // Algorithm: warp/block parallel version of string_parse and process_string() @@ -420,8 +442,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, auto in_end = in_begin + str_tuples[istring].second; auto const num_in_chars = str_tuples[istring].second; if constexpr (is_warp) { - if (!(num_in_chars > SINGLE_THREAD_THRESHOLD)) continue; - if (num_in_chars > WARP_THRESHOLD) continue; + if (num_in_chars <= SINGLE_THREAD_THRESHOLD or num_in_chars > WARP_THRESHOLD) continue; } else { if (num_in_chars <= WARP_THRESHOLD) continue; } From bcade0f379d4c8d653dba9762f53775590910e84 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 4 Sep 2023 23:03:34 +0530 Subject: [PATCH 30/47] address review comments, fix 2 data hazards --- cpp/src/io/utilities/data_casting.cu | 73 +++++++++++++++------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 42f4491cec3..35c0f34a393 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -302,14 +302,16 @@ struct bitfield_warp { } __device__ void shift(unsigned warp_id) { - if (threadIdx.x % 32 < UNICODE_LOOK_BACK) + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size]; + __syncwarp(); } __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) { is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] = is_escaping_backslash; + __syncwarp(); } __device__ bool get_bit(unsigned warp_id, int bit_index) { @@ -328,7 +330,7 @@ struct bitfield_block { static constexpr auto UNICODE_LOOK_BACK{5}; // 5 because for skipping unicode hex chars, look back upto 5 chars are needed. // 5 + num_warps*32 for entire block - bool is_slash[UNICODE_LOOK_BACK + num_warps * 32]; + bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size]; __device__ void reset(unsigned warp_id) { @@ -338,11 +340,13 @@ struct bitfield_block { __device__ void shift(unsigned warp_id) { if (threadIdx.x < UNICODE_LOOK_BACK) - is_slash[threadIdx.x] = is_slash[num_warps * 32 + threadIdx.x]; + is_slash[threadIdx.x] = is_slash[num_warps * cudf::detail::warp_size + threadIdx.x]; + __syncthreads(); } __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) { is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash; + __syncthreads(); } __device__ bool get_bit(unsigned warp_id, int bit_index) { @@ -415,17 +419,19 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps; size_type lane = is_warp ? (threadIdx.x % BLOCK_SIZE) : threadIdx.x; - // get 1-string index per warp + // get 1-string index per warp/block auto get_next_string = [&]() { if constexpr (is_warp) { size_type istring; if (lane == 0) { istring = atomicAdd(str_counter, 1); } - __syncwarp(); return __shfl_sync(0xffffffff, istring, 0); } else { + // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's + // istring value + __syncthreads(); __shared__ size_type istring; if (lane == 0) { istring = atomicAdd(str_counter, 1); } - __syncthreads(); // memory fence? + __syncthreads(); return istring; } }; @@ -434,7 +440,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, istring = get_next_string()) { // skip nulls if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { - if (!d_chars) d_offsets[istring] = 0; + if (!d_chars && lane == 0) d_offsets[istring] = 0; continue; // gride-stride return; } @@ -448,16 +454,17 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, } // Check if the value corresponds to the null literal - auto const is_null_literal = - (!d_chars) && - serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); - if (is_null_literal && null_mask != nullptr) { - if (lane == 0) { - clear_bit(null_mask, istring); - atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[istring] = 0; + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + if (lane == 0) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[istring] = 0; + } + continue; // gride-stride return; } - continue; // gride-stride return; } // String values are indicated by keeping the quote character bool const is_string_value = @@ -512,12 +519,11 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, for (size_type char_index = lane; char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { - auto MASK = - is_warp ? __ballot_sync(0xffffffff, char_index < (in_end - in_begin)) : 0xffffffff; - bool is_within_bounds = char_index < (in_end - in_begin); - auto c = is_within_bounds ? in_begin[char_index] : '\0'; - auto prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; - auto escaped_char = get_escape_char(c); + bool const is_within_bounds = char_index < (in_end - in_begin); + auto const MASK = is_warp ? __ballot_sync(0xffffffff, is_within_bounds) : 0xffffffff; + auto const c = is_within_bounds ? in_begin[char_index] : '\0'; + auto const prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; + auto const escaped_char = get_escape_char(c); bool is_escaping_backslash{false}; [[maybe_unused]] bool is_prev_escaping_backslash{false}; @@ -550,21 +556,18 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); __syncwarp(); is_slash.shift(warp_id); - __syncwarp(); is_slash.set_bits(warp_id, is_escaping_backslash); - __syncwarp(); is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); } else { using SlashScan = cub::BlockScan; __shared__ typename SlashScan::TempStorage temp_slash; SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); is_escaping_backslash = scanned.state[init_state]; + __syncthreads(); if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; __syncthreads(); is_slash.shift(warp_id); - __syncthreads(); is_slash.set_bits(warp_id, is_escaping_backslash); - __syncthreads(); is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); // There is another __syncthreads() at the end of for-loop. } @@ -679,6 +682,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, if constexpr (is_warp) { last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); } else { + __syncthreads(); if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset; __syncthreads(); } @@ -710,14 +714,15 @@ struct string_parse { if (num_in_chars > SINGLE_THREAD_THRESHOLD) return; // Check if the value corresponds to the null literal - auto const is_null_literal = - (!d_chars) && - serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); - if (is_null_literal && null_mask != nullptr) { - clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[idx] = 0; - return; + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[idx] = 0; + return; + } } char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; From 4d1e048a61976b59bc7117cd83c7cd553f2da6b7 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 4 Sep 2023 23:34:06 +0530 Subject: [PATCH 31/47] update comments --- cpp/src/io/utilities/data_casting.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 35c0f34a393..d3d7de39eb9 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -547,7 +547,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, }; state_table scanned; // inclusive scan of escaping backslashes - // TODO both inclusive and exclusive available in cub. if constexpr (is_warp) { using SlashScan = cub::WarpScan; __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; @@ -578,7 +577,6 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1); error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR); error |= (is_prev_escaping_backslash && c == 'u' && - // TODO check if following condition is right or off by one error. ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); @@ -591,7 +589,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, __shared__ typename ErrorReduce::TempStorage temp_storage_error; __shared__ bool error_reduced; error_reduced = ErrorReduce(temp_storage_error).Sum(error); // TODO use cub::LogicalOR. - // only valid in thread0. + // only valid in thread0, so shared memory is used for broadcast. __syncthreads(); error = error_reduced; } From 896141de2fb8b270a547e5e4b0884cd528e9e6e5 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 5 Sep 2023 22:11:01 +0530 Subject: [PATCH 32/47] using bitfields for state_table, no local mem --- cpp/src/io/utilities/data_casting.cu | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index d3d7de39eb9..bccd64703be 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -282,7 +282,7 @@ process_string(in_iterator_t in_begin, } /** - * @brief Datastructure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a * warp. * * @tparam num_warps number of warps in the block @@ -290,12 +290,12 @@ process_string(in_iterator_t in_begin, template struct bitfield_warp { static constexpr auto UNICODE_LOOK_BACK{5}; - // 5 because for skipping unicode hex chars, look back upto 5 chars are needed. + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. // 5+32 for each warp. bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size]; __device__ void reset(unsigned warp_id) { - if (threadIdx.x < UNICODE_LOOK_BACK) { + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) { is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = 0; } is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0; @@ -320,7 +320,7 @@ struct bitfield_warp { }; /** - * @brief Datastructure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a * block. * * @tparam num_warps number of warps in the block @@ -328,7 +328,7 @@ struct bitfield_warp { template struct bitfield_block { static constexpr auto UNICODE_LOOK_BACK{5}; - // 5 because for skipping unicode hex chars, look back upto 5 chars are needed. + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. // 5 + num_warps*32 for entire block bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size]; @@ -539,11 +539,16 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // All escaping backslash should be skipped. struct state_table { - bool state[2]; + // using bit fields instead of state[2] + bool state0 : 1; + bool state1 : 1; + bool inline __device__ get(bool init_state) const { return init_state ? state1 : state0; } }; state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. auto composite_op = [](state_table op1, state_table op2) { - return state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; + // equivalent of state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; + return state_table{op1.state0 ? op2.state1 : op2.state0, + op1.state1 ? op2.state1 : op2.state0}; }; state_table scanned; // inclusive scan of escaping backslashes @@ -551,7 +556,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, using SlashScan = cub::WarpScan; __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); - is_escaping_backslash = scanned.state[init_state]; + is_escaping_backslash = scanned.get(init_state); init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); __syncwarp(); is_slash.shift(warp_id); @@ -561,7 +566,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, using SlashScan = cub::BlockScan; __shared__ typename SlashScan::TempStorage temp_slash; SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); - is_escaping_backslash = scanned.state[init_state]; + is_escaping_backslash = scanned.get(init_state); __syncthreads(); if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; __syncthreads(); From 7095aab8dd20ec0020eaeff40f7f89d98cb55f1b Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 11 Sep 2023 15:02:51 +0530 Subject: [PATCH 33/47] review comments syncthreads() --- cpp/src/io/utilities/data_casting.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index bccd64703be..a4926daecf6 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -599,14 +599,16 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, error = error_reduced; } if (error) { - if (lane == 0) { + if (!d_chars && lane == 0) { if (null_mask != nullptr) { clear_bit(null_mask, istring); atomicAdd(null_count_data, 1); } last_offset = 0; - if (!d_chars) d_offsets[istring] = 0; + d_offsets[istring] = 0; } + if constexpr (!is_warp) + __syncthreads(); break; // gride-stride return; } bool skip = !is_within_bounds; // false; From 0d723e00457c87e6a773255b8b6fd073b69e1a4a Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 11 Sep 2023 16:19:04 +0530 Subject: [PATCH 34/47] address review comments syncthreads() --- cpp/src/io/utilities/data_casting.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index a4926daecf6..02742969cd2 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -604,11 +604,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, clear_bit(null_mask, istring); atomicAdd(null_count_data, 1); } - last_offset = 0; + last_offset = 0; d_offsets[istring] = 0; } - if constexpr (!is_warp) - __syncthreads(); + if constexpr (!is_warp) { __syncthreads(); } break; // gride-stride return; } bool skip = !is_within_bounds; // false; From 3dce8d9d1eceba33a154a34a935e9117f347eb18 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 11 Sep 2023 16:20:23 +0530 Subject: [PATCH 35/47] fix consts, zero size column, roundup --- cpp/src/io/utilities/data_casting.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 02742969cd2..4cef2c8e50d 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -642,8 +642,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // \uXXXX\uXXXX // Note: no need for scanned_backslash below because we already know that // only '\u' check is enough. - if ((in_begin + char_index + 4 + 6) < in_end && in_begin[char_index + 1 + 4] == '\\' && - in_begin[char_index + 1 + 5] == 'u') { + if ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) < + in_end && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS - 1] == '\\' && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS] == 'u') { hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); } if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && @@ -767,6 +769,7 @@ std::unique_ptr parse_data( { CUDF_FUNC_RANGE(); + if (col_size == 0) { return make_empty_column(col_type); } auto d_null_count = rmm::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); @@ -800,8 +803,8 @@ std::unique_ptr parse_data( constexpr auto warps_per_block = 8; constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; - auto num_blocks = min(65535, cudf::util::div_rounding_up_unsafe(col_size, warps_per_block)); - auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + auto num_blocks = cudf::util::div_rounding_up_safe(col_size, warps_per_block); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); // TODO run these independent kernels in parallel streams. if (max_length > SINGLE_THREAD_THRESHOLD) { From 8de7d76a8319f37a1cee0f346cecea63913db921 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 11 Sep 2023 16:21:03 +0530 Subject: [PATCH 36/47] optimzie single character write case, also fixes direct unicode bug --- cpp/src/io/utilities/data_casting.cu | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 4cef2c8e50d..ee2ba4b2d25 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -629,11 +629,11 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, if (!skip) { if (!is_prev_escaping_backslash) { this_num_out = 1; - if (d_chars) write_char = c; + // writes char directly } else { if (escaped_char != UNICODE_SEQ) { this_num_out = 1; - write_char = escaped_char; + // writes char directly } else { // Unicode // \uXXXX @@ -683,7 +683,14 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, __syncthreads(); } offset += last_offset; - if (d_chars && !skip) { strings::detail::from_char_utf8(write_char, d_buffer + offset); } + if (d_chars && !skip) { + auto const is_not_unicode = (!is_prev_escaping_backslash) || escaped_char != UNICODE_SEQ; + if (is_not_unicode) { + *(d_buffer + offset) = (!is_prev_escaping_backslash) ? c : escaped_char; + } else { + strings::detail::from_char_utf8(write_char, d_buffer + offset); + } + } offset += this_num_out; if constexpr (is_warp) { last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); From cb0e0ba8f83dd4064e162f9ba017fc3765a9d695 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 11 Sep 2023 16:24:18 +0530 Subject: [PATCH 37/47] add unit test JsonReaderTest.ErrorStrings --- cpp/tests/io/json_test.cpp | 46 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 050f75dcb83..0d4454d933c 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1441,6 +1441,52 @@ TEST_F(JsonReaderTest, JsonLongString) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view()); } +TEST_F(JsonReaderTest, ErrorStrings) +{ + // cases of invalid escape characters, invalid unicode encodings. + auto const buffer = std::string{R"( + {"col0": "\"\a"} + {"col0": "\u"} + {"col0": "\u0"} + {"col0": "\u0b"} + {"col0": "\u00b"} + {"col0": "\u00bz"} + {"col0": "\t34567890123456\t9012345678901\ug0bc"} + {"col0": "\t34567890123456\t90123456789012\u0hbc"} + {"col0": "\t34567890123456\t90123456789012\u00ic"} + {"col0": "\u0b95\u0bbe\u0bb0\u0bcd\u0ba4\u0bcd\u0ba4\u0bbfகார்த்தி"} +)"}; + // Last one is not an error case, but shows that unicode in json is copied string column output. + + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) + .dtypes({data_type{cudf::type_id::STRING}}) + .lines(true) + .legacy(false); + + auto const result = cudf::io::read_json(in_opts); + auto const result_view = result.tbl->view().column(0); + + EXPECT_EQ(result.metadata.schema_info[0].name, "col0"); + EXPECT_EQ(result_view.null_count(), 9); + cudf::test::strings_column_wrapper expected{ + {"", + "", + "", + "", + "", + "", + "", + "", + "", + "கார்த்தி\xe0\xae\x95\xe0\xae\xbe\xe0\xae\xb0\xe0\xaf\x8d\xe0\xae\xa4\xe0\xaf\x8d\xe0\xae\xa4" + "\xe0\xae\xbf"}, + // unicode hex 0xe0 0xae 0x95 0xe0 0xae 0xbe 0xe0 0xae 0xb0 0xe0 0xaf 0x8d + // 0xe0 0xae 0xa4 0xe0 0xaf 0x8d 0xe0 0xae 0xa4 0xe0 0xae 0xbf + cudf::test::iterators::nulls_at({0, 1, 2, 3, 4, 5, 6, 7, 8})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected); +} + TEST_F(JsonReaderTest, TokenAllocation) { std::array const json_inputs{ From 4b46027cdf27f4d762fc69ba2891ac5aba63a897 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 11 Sep 2023 19:55:48 +0530 Subject: [PATCH 38/47] add comment --- cpp/tests/io/json_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 0d4454d933c..16124f1b5f1 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1444,6 +1444,7 @@ TEST_F(JsonReaderTest, JsonLongString) TEST_F(JsonReaderTest, ErrorStrings) { // cases of invalid escape characters, invalid unicode encodings. + // Error strings will decode to nulls auto const buffer = std::string{R"( {"col0": "\"\a"} {"col0": "\u"} From 51390fd5f6e71fa7c8d39b3f2d117fffa0a38213 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 11 Sep 2023 23:22:52 +0530 Subject: [PATCH 39/47] update comments --- cpp/src/io/utilities/data_casting.cu | 30 ++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index ee2ba4b2d25..ab2b37ff6a7 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -576,9 +576,11 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // There is another __syncthreads() at the end of for-loop. } + // String with parsing errors are made as null bool error = false; if (is_within_bounds) { - // instead of '\\', using is_escaping_backslash, and previous index value also here. + // curr=='\' and end, or prev=='\' and curr=='u' and end-curr < UNICODE_HEX_DIGIT_COUNT + // or prev=='\' and curr=='u' and end-curr >= UNICODE_HEX_DIGIT_COUNT and any non-hex error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1); error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR); error |= (is_prev_escaping_backslash && c == 'u' && @@ -586,7 +588,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); } - // propagate error using warp shuffle. + // Make sure all threads have no errors before continuing if constexpr (is_warp) { error = __any_sync(MASK, error); } else { @@ -598,6 +600,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, __syncthreads(); error = error_reduced; } + // If any thread has an error, skip the rest of the string and make this string as null if (error) { if (!d_chars && lane == 0) { if (null_mask != nullptr) { @@ -608,12 +611,15 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, d_offsets[istring] = 0; } if constexpr (!is_warp) { __syncthreads(); } - break; // gride-stride return; + break; // gride-stride return; } + + // Skipping non-copied escaped characters bool skip = !is_within_bounds; // false; + // skip \ for \" \\ \/ \b \f \n \r \t \uXXXX skip |= is_escaping_backslash; if (is_within_bounds) { - // \uXXXX check for "\u" for each X + // skip X for each X in \uXXXX skip |= char_index - 2 >= 0 && is_slash.get_bit(warp_id, lane - 2) && in_begin[char_index - 1] == 'u'; skip |= char_index - 3 >= 0 && is_slash.get_bit(warp_id, lane - 3) && @@ -627,19 +633,21 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, cudf::char_utf8 write_char{}; if (!skip) { + // 1. Unescaped character if (!is_prev_escaping_backslash) { this_num_out = 1; - // writes char directly + // writes char directly for non-unicode } else { + // 2. Escaped character if (escaped_char != UNICODE_SEQ) { this_num_out = 1; - // writes char directly + // writes char directly for non-unicode } else { - // Unicode - // \uXXXX + // 3. Unicode + // UTF8 \uXXXX auto hex_val = parse_unicode_hex(in_begin + char_index + 1); auto hex_low_val = 0; - // \uXXXX\uXXXX + // UTF16 \uXXXX\uXXXX // Note: no need for scanned_backslash below because we already know that // only '\u' check is enough. if ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) < @@ -662,7 +670,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, this_num_out = 0; write_char = 0; } else { - // if u8 + // if UTF8 write_char = strings::detail::codepoint_to_utf8(hex_val); this_num_out = strings::detail::bytes_in_char_utf8(write_char); } @@ -671,6 +679,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, } } // !skip end. { + // compute offset to write output for each thread size_type offset; if constexpr (is_warp) { using OffsetScan = cub::WarpScan; @@ -683,6 +692,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, __syncthreads(); } offset += last_offset; + // Write output if (d_chars && !skip) { auto const is_not_unicode = (!is_prev_escaping_backslash) || escaped_char != UNICODE_SEQ; if (is_not_unicode) { From d7bb5ac78737779af48a45fce7fe5338e2fd76c3 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 12 Sep 2023 00:50:56 +0530 Subject: [PATCH 40/47] adjust kernel string limits --- cpp/src/io/utilities/data_casting.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index ab2b37ff6a7..6ffa1b444b6 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -43,8 +43,8 @@ namespace cudf::io::json::detail { -constexpr auto SINGLE_THREAD_THRESHOLD = 512; -constexpr auto WARP_THRESHOLD = 1024 * 4; +constexpr auto SINGLE_THREAD_THRESHOLD = 128; +constexpr auto WARP_THRESHOLD = 128 * 128; // 16K // Unicode code point escape sequence static constexpr char UNICODE_SEQ = 0x7F; From d0c86126eb79cf5be155c6a36f7c93eaf8df3836 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 13 Sep 2023 20:30:38 +0530 Subject: [PATCH 41/47] reorg json type test code --- cpp/tests/io/json_type_cast_test.cu | 51 ++++++++++++----------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 74085c7ebbb..a82d97b13dc 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -46,6 +46,21 @@ struct offsets_to_length { return thrust::get<1>(p) - thrust::get<0>(p); } }; + +/// Returns length of each string in the column +auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream) +{ + auto offsets_begin = column.offsets_begin(); + auto offsets_pair = + thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); + rmm::device_uvector svs_length(column.size(), stream); + thrust::transform(rmm::exec_policy(cudf::get_default_stream()), + offsets_pair, + offsets_pair + column.size(), + svs_length.begin(), + offsets_to_length{}); + return svs_length; +} } // namespace auto default_json_options() @@ -69,16 +84,8 @@ TEST_F(JSONTypeCastTest, String) std::vector input_values{"this", "is", "null", "of", "", "strings", R"("null")"}; cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids); - auto column = cudf::strings_column_view(input); - auto offsets_begin = column.offsets_begin(); - auto offsets_pair = - thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); - rmm::device_uvector svs_length(column.size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - offsets_pair, - offsets_pair + column.size(), - svs_length.begin(), - offsets_to_length{}); + auto column = cudf::strings_column_view(input); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = @@ -110,16 +117,8 @@ TEST_F(JSONTypeCastTest, Int) auto const type = cudf::data_type{cudf::type_id::INT64}; cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"}); - auto column = cudf::strings_column_view(data); - auto offsets_begin = column.offsets_begin(); - auto offsets_pair = - thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); - rmm::device_uvector svs_length(column.size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - offsets_pair, - offsets_pair + column.size(), - svs_length.begin(), - offsets_to_length{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = @@ -158,16 +157,8 @@ TEST_F(JSONTypeCastTest, StringEscapes) R"("escape with nothing to escape \")", R"("\"\\\/\b\f\n\r\t")", }); - auto column = cudf::strings_column_view(data); - auto offsets_begin = column.offsets_begin(); - auto offsets_pair = - thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); - rmm::device_uvector svs_length(column.size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - offsets_pair, - offsets_pair + column.size(), - svs_length.begin(), - offsets_to_length{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = From 56d7fb614d22f610d9107f4151057595908ea0d6 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 14 Sep 2023 02:24:45 +0530 Subject: [PATCH 42/47] add error cases for parse_data --- cpp/tests/io/json_type_cast_test.cu | 69 +++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index a82d97b13dc..9eb5e8f5230 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -33,6 +33,8 @@ #include +#include +#include #include using namespace cudf::test::iterators; @@ -181,4 +183,71 @@ TEST_F(JSONTypeCastTest, StringEscapes) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } +TEST_F(JSONTypeCastTest, ErrorNulls) +{ + auto const stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + auto const type = cudf::data_type{cudf::type_id::STRING}; + + // error in decoding + std::vector input_values{R"("\"\a")", + R"("\u")", + R"("\u0")", + R"("\u0b")", + R"("\u00b")", + R"("\u00bz")", + R"("\t34567890123456\t9012345678901\ug0bc")", + R"("\t34567890123456\t90123456789012\u0hbc")", + R"("\t34567890123456\t90123456789012\u00ic")", + R"("\t34567890123456\t9012345678901\")", + R"("\t34567890123456\t90123456789012\")", + R"(null)"}; + // Note: without quotes are copied without decoding + cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end()); + + auto column = cudf::strings_column_view(input); + auto space_length = 128; + auto prepend_space = [&space_length](auto const& s) { + if (s[0] == '"') return "\"" + std::string(space_length, ' ') + std::string(s + 1); + return std::string(s); + }; + std::vector small_input; + std::transform( + input_values.begin(), input_values.end(), std::back_inserter(small_input), prepend_space); + cudf::test::strings_column_wrapper small_col(small_input.begin(), small_input.end()); + + std::vector large_input; + space_length = 128 * 128; + std::transform( + input_values.begin(), input_values.end(), std::back_inserter(large_input), prepend_space); + cudf::test::strings_column_wrapper large_col(large_input.begin(), large_input.end()); + + std::vector expected_values{"", "", "", "", "", "", "", "", "", "", "", ""}; + cudf::test::strings_column_wrapper expected( + expected_values.begin(), expected_values.end(), cudf::test::iterators::all_nulls()); + + // single threads, warp, block. + for (auto const& column : + {column, cudf::strings_column_view(small_col), cudf::strings_column_view(large_col)}) { + rmm::device_uvector svs_length = string_offset_to_length(column, stream); + + auto null_mask_it = no_nulls(); + auto null_mask = + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(str_col->view(), expected); + } +} + CUDF_TEST_PROGRAM_MAIN() From d088e8ea8824a356ce9c2e7c39b53671d4c44d2d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 14 Sep 2023 02:25:12 +0530 Subject: [PATCH 43/47] address review comments (vuule) --- cpp/src/io/utilities/data_casting.cu | 2 +- cpp/tests/io/json_test.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 6ffa1b444b6..b259a0f4d4d 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -909,7 +909,7 @@ std::unique_ptr parse_data( make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); - // use existing code (`ConvertFunctor`) to convert values + // use `ConvertFunctor` to convert non-string values thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 16124f1b5f1..d792520e5e3 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1438,7 +1438,7 @@ TEST_F(JsonReaderTest, JsonLongString) // Read test data via nested JSON reader auto const table = cudf::io::read_json(json_lines_options); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view()); } TEST_F(JsonReaderTest, ErrorStrings) From 79b4f389eea9ba002c07d82180da328cd1564ccd Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 14 Sep 2023 02:28:22 +0530 Subject: [PATCH 44/47] fix review comments, remove nvtx ranges --- cpp/src/io/utilities/data_casting.cu | 2 -- cpp/tests/io/json_test.cpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index b259a0f4d4d..d09ea408c23 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -802,7 +802,6 @@ std::unique_ptr parse_data( size_type{0}, thrust::maximum{}); - nvtxRangePush("string_parallel"); auto offsets = cudf::make_numeric_column( data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets->mutable_view().data(); @@ -896,7 +895,6 @@ std::unique_ptr parse_data( d_offsets, d_chars); } - nvtxRangePop(); return make_strings_column(col_size, std::move(offsets), diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index d792520e5e3..16124f1b5f1 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1438,7 +1438,7 @@ TEST_F(JsonReaderTest, JsonLongString) // Read test data via nested JSON reader auto const table = cudf::io::read_json(json_lines_options); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view()); } TEST_F(JsonReaderTest, ErrorStrings) From 403a3741b49e3e8c8814052127c501a4dc17f6ab Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 14 Sep 2023 02:59:21 +0530 Subject: [PATCH 45/47] fix unit test cases nullability --- cpp/tests/io/json_test.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 16124f1b5f1..80282871a3c 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1407,7 +1407,8 @@ TEST_F(JsonReaderTest, JsonLongString) cudf::test::iterators::nulls_at({10, 11})}; cudf::test::fixed_width_column_wrapper repeat_times{ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3}; + {1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3}, + cudf::test::iterators::no_nulls()}; auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times); auto col2 = d_col2->view(); cudf::table_view const tbl_view{{col1, col2, repeat_times}}; @@ -1438,7 +1439,7 @@ TEST_F(JsonReaderTest, JsonLongString) // Read test data via nested JSON reader auto const table = cudf::io::read_json(json_lines_options); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, table.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view()); } TEST_F(JsonReaderTest, ErrorStrings) From 72d23fbba4b625113b8825b8513b98bd75b62845 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 19 Sep 2023 21:07:46 +0530 Subject: [PATCH 46/47] address review comments, split code for string type --- cpp/src/io/utilities/data_casting.cu | 279 +++++++++++++++------------ 1 file changed, 155 insertions(+), 124 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index d09ea408c23..f2868f80a23 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -249,7 +249,8 @@ process_string(in_iterator_t in_begin, // If this may be a UTF-16 encoded surrogate pair: // we expect another \uXXXX sequence int32_t hex_low_val = 0; - if (thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low // surrogate @@ -267,10 +268,8 @@ process_string(in_iterator_t in_begin, (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); bytes += write_utf8_char(utf8_chars, d_buffer); - } - - // Just a single \uXXXX sequence - else { + } else { + // Just a single \uXXXX sequence auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); bytes += write_utf8_char(utf8_chars, d_buffer); } @@ -293,6 +292,8 @@ struct bitfield_warp { // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. // 5+32 for each warp. bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size]; + + /// Sets all bits to 0 __device__ void reset(unsigned warp_id) { if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) { @@ -300,6 +301,8 @@ struct bitfield_warp { } is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0; } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits __device__ void shift(unsigned warp_id) { if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) @@ -307,12 +310,16 @@ struct bitfield_warp { is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size]; __syncwarp(); } + + /// Each thread in a warp sets its own bit. __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) { is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] = is_escaping_backslash; __syncwarp(); } + + /// Each thread in a warp gets the requested bit. __device__ bool get_bit(unsigned warp_id, int bit_index) { return is_slash[warp_id][UNICODE_LOOK_BACK + bit_index]; @@ -332,22 +339,29 @@ struct bitfield_block { // 5 + num_warps*32 for entire block bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size]; + /// Sets all bits to 0 __device__ void reset(unsigned warp_id) { if (threadIdx.x < UNICODE_LOOK_BACK) { is_slash[threadIdx.x] = 0; } is_slash[threadIdx.x + UNICODE_LOOK_BACK] = 0; } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits __device__ void shift(unsigned warp_id) { if (threadIdx.x < UNICODE_LOOK_BACK) is_slash[threadIdx.x] = is_slash[num_warps * cudf::detail::warp_size + threadIdx.x]; __syncthreads(); } + + /// Each thread in a block sets its own bit. __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) { is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash; __syncthreads(); } + + /// Each thread in a block gets the requested bit. __device__ bool get_bit(unsigned warp_id, int bit_index) { return is_slash[UNICODE_LOOK_BACK + bit_index]; @@ -478,8 +492,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, if (!d_chars) { if (lane == 0) { d_offsets[istring] = in_end - in_begin; } } else { - for (size_type char_index = lane; char_index < (in_end - in_begin); - char_index += BLOCK_SIZE) { + for (size_t char_index = lane; char_index < (in_end - in_begin); char_index += BLOCK_SIZE) { d_buffer[char_index] = in_begin[char_index]; } } @@ -516,7 +529,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, __syncthreads(); // 0-31, 32-63, ... i*32-n. // entire warp executes but with mask. - for (size_type char_index = lane; + for (size_t char_index = lane; char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { bool const is_within_bounds = char_index < (in_end - in_begin); @@ -620,14 +633,14 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, skip |= is_escaping_backslash; if (is_within_bounds) { // skip X for each X in \uXXXX - skip |= char_index - 2 >= 0 && is_slash.get_bit(warp_id, lane - 2) && - in_begin[char_index - 1] == 'u'; - skip |= char_index - 3 >= 0 && is_slash.get_bit(warp_id, lane - 3) && - in_begin[char_index - 2] == 'u'; - skip |= char_index - 4 >= 0 && is_slash.get_bit(warp_id, lane - 4) && - in_begin[char_index - 3] == 'u'; - skip |= char_index - 5 >= 0 && is_slash.get_bit(warp_id, lane - 5) && - in_begin[char_index - 4] == 'u'; + skip |= + char_index >= 2 && is_slash.get_bit(warp_id, lane - 2) && in_begin[char_index - 1] == 'u'; + skip |= + char_index >= 3 && is_slash.get_bit(warp_id, lane - 3) && in_begin[char_index - 2] == 'u'; + skip |= + char_index >= 4 && is_slash.get_bit(warp_id, lane - 4) && in_begin[char_index - 3] == 'u'; + skip |= + char_index >= 5 && is_slash.get_bit(warp_id, lane - 5) && in_begin[char_index - 4] == 'u'; } int this_num_out = 0; cudf::char_utf8 write_char{}; @@ -650,7 +663,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, // UTF16 \uXXXX\uXXXX // Note: no need for scanned_backslash below because we already know that // only '\u' check is enough. - if ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) < + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + (in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) < in_end && in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS - 1] == '\\' && in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS] == 'u') { @@ -773,6 +787,123 @@ struct to_string_view_pair { } }; +template +static std::unique_ptr parse_string(string_view_pair_it str_tuples, + size_type col_size, + rmm::device_buffer&& null_mask, + rmm::device_scalar& d_null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // CUDF_FUNC_RANGE(); + + auto const max_length = thrust::transform_reduce( + rmm::exec_policy(stream), + str_tuples, + str_tuples + col_size, + [] __device__(auto t) { return t.second; }, + size_type{0}, + thrust::maximum{}); + + auto offsets = cudf::make_numeric_column( + data_type{type_to_id()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + auto null_count_data = d_null_count.data(); + + auto single_thread_fn = string_parse{ + str_tuples, static_cast(null_mask.data()), null_count_data, options, d_offsets}; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + constexpr auto warps_per_block = 8; + constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; + auto num_blocks = cudf::util::div_rounding_up_safe(col_size, warps_per_block); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + + // TODO run these independent kernels in parallel streams. + if (max_length > SINGLE_THREAD_THRESHOLD) { + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + + if (max_length > WARP_THRESHOLD) { + // for strings longer than WARP_THRESHOLD, 1 block per string + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", + std::overflow_error); + + // CHARS column + std::unique_ptr chars = + strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); + auto d_chars = chars->mutable_view().data(); + + single_thread_fn.d_chars = d_chars; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + if (max_length > SINGLE_THREAD_THRESHOLD) { + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + if (max_length > WARP_THRESHOLD) { + str_counter.set_value(0, stream); + // for strings longer than WARP_THRESHOLD, 1 block per string + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + return make_strings_column(col_size, + std::move(offsets), + std::move(chars), + d_null_count.value(stream), + std::move(null_mask)); +} + std::unique_ptr parse_data( const char* data, thrust::zip_iterator> offset_length_begin, @@ -794,113 +925,13 @@ std::unique_ptr parse_data( auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); if (col_type == cudf::data_type{cudf::type_id::STRING}) { - auto const max_length = thrust::transform_reduce( - rmm::exec_policy(stream), - str_tuples, - str_tuples + col_size, - [] __device__(auto t) { return t.second; }, - size_type{0}, - thrust::maximum{}); - - auto offsets = cudf::make_numeric_column( - data_type{cudf::type_id::INT32}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); - auto d_offsets = offsets->mutable_view().data(); - - auto single_thread_fn = - string_parse{str_tuples, - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - single_thread_fn); - - constexpr auto warps_per_block = 8; - constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; - auto num_blocks = cudf::util::div_rounding_up_safe(col_size, warps_per_block); - auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); - - // TODO run these independent kernels in parallel streams. - if (max_length > SINGLE_THREAD_THRESHOLD) { - parse_fn_string_parallel - <<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - nullptr); - } - - if (max_length > WARP_THRESHOLD) { - // for strings longer than WARP_THRESHOLD, 1 block per string - str_counter.set_value(0, stream); - parse_fn_string_parallel - <<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - nullptr); - } - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= std::numeric_limits::max(), - "Size of output exceeds the column size limit", - std::overflow_error); - - // CHARS column - std::unique_ptr chars = - strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); - auto d_chars = chars->mutable_view().data(); - - single_thread_fn.d_chars = d_chars; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - single_thread_fn); - - if (max_length > SINGLE_THREAD_THRESHOLD) { - str_counter.set_value(0, stream); - parse_fn_string_parallel - <<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - d_chars); - } - - if (max_length > WARP_THRESHOLD) { - str_counter.set_value(0, stream); - // for strings longer than WARP_THRESHOLD, 1 block per string - parse_fn_string_parallel - <<>>( - str_tuples, - col_size, - str_counter.data(), - static_cast(null_mask.data()), - null_count_data, - options, - d_offsets, - d_chars); - } - - return make_strings_column(col_size, - std::move(offsets), - std::move(chars), - d_null_count.value(stream), - std::move(null_mask)); + return parse_string(str_tuples, + col_size, + std::forward(null_mask), + d_null_count, + options, + stream, + mr); } auto out_col = From d0a5e23956a94d4e9a2e1f2a01ab7be0226c8c4f Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 19 Sep 2023 21:51:59 +0530 Subject: [PATCH 47/47] add comments, style fix --- cpp/src/io/utilities/data_casting.cu | 10 +++++++--- cpp/tests/io/json_test.cpp | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index f2868f80a23..1772e5e43fa 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -492,7 +492,8 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, if (!d_chars) { if (lane == 0) { d_offsets[istring] = in_end - in_begin; } } else { - for (size_t char_index = lane; char_index < (in_end - in_begin); char_index += BLOCK_SIZE) { + for (thread_index_type char_index = lane; char_index < (in_end - in_begin); + char_index += BLOCK_SIZE) { d_buffer[char_index] = in_begin[char_index]; } } @@ -529,7 +530,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, __syncthreads(); // 0-31, 32-63, ... i*32-n. // entire warp executes but with mask. - for (size_t char_index = lane; + for (thread_index_type char_index = lane; char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { bool const is_within_bounds = char_index < (in_end - in_begin); @@ -679,7 +680,10 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, write_char = strings::detail::codepoint_to_utf8(unicode_code_point); this_num_out = strings::detail::bytes_in_char_utf8(write_char); } else { + // if hex_val is high surrogate, ideally it should be parsing failure. + // but skipping it as other parsers do this too. if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { + // Ideally this should be skipped if previous char is high surrogate. skip = true; this_num_out = 0; write_char = 0; @@ -726,7 +730,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, } } // char for-loop if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } - } // grid-stride for-loop + } // grid-stride for-loop } template diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 80282871a3c..7c911ac2e04 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1388,8 +1388,8 @@ TEST_F(JsonReaderTest, JsonLongString) "$€", "ராபிட்ஸ்", "C𝞵𝓓𝒻", - "", // null - "", // null + "", // null + "", // null "கார்த்தி", "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF