diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 900e9eed98e..a84f7bd5224 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -413,11 +413,13 @@ add_library( src/io/utilities/arrow_io_source.cpp src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp + src/io/utilities/data_casting.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu src/io/utilities/row_selection.cpp + src/io/utilities/type_inference.cu src/io/utilities/trie.cu src/jit/cache.cpp src/jit/parser.cpp diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh deleted file mode 100644 index b7ee5e05e96..00000000000 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include - -namespace cudf::io::json::detail { - -// Unicode code point escape sequence -static constexpr char UNICODE_SEQ = 0x7F; - -// Invalid escape sequence -static constexpr char NON_ESCAPE_CHAR = 0x7E; - -// Unicode code point escape sequence prefix comprises '\' and 'u' characters -static constexpr size_type UNICODE_ESC_PREFIX = 2; - -// Unicode code point escape sequence comprises four hex characters -static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4; - -// A unicode code point escape sequence is \uXXXX -static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT; - -static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800; -static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; -static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; -static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; - -/** - * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or - * whether type casting failed. - */ -enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE }; - -/** - * @brief Providing additional information about the type casting result. - */ -struct data_casting_result_info { - // Number of bytes written to output - size_type bytes; - // Whether parsing succeeded, item was parsed to null, or failed - data_casting_result result; -}; - -/** - * @brief Returns the character to output for a given escaped character that's following a - * backslash. - * - * @param escaped_char The character following the backslash. - * @return The character to output for a given character that's following a backslash - */ -__device__ __forceinline__ char get_escape_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return '"'; - case '\\': return '\\'; - case '/': return '/'; - case 'b': return '\b'; - case 'f': return '\f'; - case 'n': return '\n'; - case 'r': return '\r'; - case 't': return '\t'; - case 'u': return UNICODE_SEQ; - default: return NON_ESCAPE_CHAR; - } -} - -/** - * @brief Returns the escaped characters for a given character. - * - * @param escaped_char The character to escape. - * @return The escaped characters for a given character. - */ -__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return {'\\', '"'}; - case '\\': return {'\\', '\\'}; - case '/': return {'\\', '/'}; - case '\b': return {'\\', 'b'}; - case '\f': return {'\\', 'f'}; - case '\n': return {'\\', 'n'}; - case '\r': return {'\\', 'r'}; - case '\t': return {'\\', 't'}; - // case 'u': return UNICODE_SEQ; - default: return {'\0', escaped_char}; - } -} -/** - * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence - * \uXXXX. - * - * @param str Pointer to the first (most-significant) hex digit - * @return The parsed hex value if successful, -1 otherwise. - */ -__device__ __forceinline__ int32_t parse_unicode_hex(char const* str) -{ - // Prepare result - int32_t result = 0, base = 1; - constexpr int32_t hex_radix = 16; - - // Iterate over hex digits right-to-left - size_type index = UNICODE_HEX_DIGIT_COUNT; - while (index-- > 0) { - char const ch = str[index]; - if (ch >= '0' && ch <= '9') { - result += static_cast((ch - '0') + 0) * base; - base *= hex_radix; - } else if (ch >= 'A' && ch <= 'F') { - result += static_cast((ch - 'A') + 10) * base; - base *= hex_radix; - } else if (ch >= 'a' && ch <= 'f') { - result += static_cast((ch - 'a') + 10) * base; - base *= hex_radix; - } else { - return -1; - } - } - return result; -} - -/** - * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to - * \p out_it - */ -constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) -{ - auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) - : strings::detail::from_char_utf8(character, out_it); - if (out_it) out_it += bytes; - return bytes; -} - -/** - * @brief Processes a string, replaces escape sequences and optionally strips off the quote - * characters. - * - * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to - * char - * @param in_begin Iterator to the first item to process - * @param in_end Iterator to one past the last item to process - * @param d_buffer Output character buffer to the first item to write - * @param options Settings for controlling string processing behavior - * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is - * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether - * parsing succeeded, item was parsed to null, or failed. - */ -template -__device__ __forceinline__ data_casting_result_info -process_string(in_iterator_t in_begin, - in_iterator_t in_end, - char* d_buffer, - cudf::io::parse_options_view const& options) -{ - int32_t bytes = 0; - auto const num_in_chars = thrust::distance(in_begin, in_end); - // String values are indicated by keeping the quote character - bool const is_string_value = - num_in_chars >= 2LL && - (options.quotechar == '\0' || - (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); - - // Copy literal/numeric value - if (not is_string_value) { - while (in_begin != in_end) { - if (d_buffer) *d_buffer++ = *in_begin; - ++in_begin; - ++bytes; - } - return {bytes, data_casting_result::PARSING_SUCCESS}; - } - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. {"a":1.23}) - char const backslash_char = '\\'; - - // Escape-flag, set after encountering a backslash character - bool escape = false; - - // Exclude beginning and ending quote chars from string range - if (!options.keepquotes) { - ++in_begin; - --in_end; - } - - // Iterate over the input - while (in_begin != in_end) { - // Copy single character to output - if (!escape) { - escape = (*in_begin == backslash_char); - if (!escape) { - if (d_buffer) *d_buffer++ = *in_begin; - ++bytes; - } - ++in_begin; - continue; - } - - // Previous char indicated beginning of escape sequence - // Reset escape flag for next loop iteration - escape = false; - - // Check the character that is supposed to be escaped - auto escaped_char = get_escape_char(*in_begin); - - // We escaped an invalid escape character -> "fail"/null for this item - if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; } - - // Regular, single-character escape - if (escaped_char != UNICODE_SEQ) { - if (d_buffer) *d_buffer++ = escaped_char; - ++bytes; - ++in_begin; - continue; - } - - // This is an escape sequence of a unicode code point: \uXXXX, - // where each X in XXXX represents a hex digit - // Skip over the 'u' char from \uXXXX to the first hex digit - ++in_begin; - - // Make sure that there's at least 4 characters left from the - // input, which are expected to be hex digits - if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) { - return {bytes, data_casting_result::PARSING_FAILURE}; - } - - auto hex_val = parse_unicode_hex(in_begin); - - // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item - if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; } - - // Skip over the four hex digits - thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); - - // If this may be a UTF-16 encoded surrogate pair: - // we expect another \uXXXX sequence - int32_t hex_low_val = 0; - if (thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && - *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { - // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low - // surrogate - hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2)); - } - - // This is indeed a UTF16 surrogate pair - if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && - hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { - // Skip over the second \uXXXX sequence - thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS); - - // Compute UTF16-encoded code point - uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + - (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); - auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); - bytes += write_utf8_char(utf8_chars, d_buffer); - } - - // Just a single \uXXXX sequence - else { - auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); - bytes += write_utf8_char(utf8_chars, d_buffer); - } - } - - // The last character of the input is a backslash -> "fail"/null for this item - if (escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } - return {bytes, data_casting_result::PARSING_SUCCESS}; -} - -template -struct string_parse { - str_tuple_it str_tuples; - bitmask_type* null_mask; - size_type* null_count_data; - cudf::io::parse_options_view const options; - size_type* d_offsets{}; - char* d_chars{}; - - __device__ void operator()(size_type idx) - { - if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const in_begin = str_tuples[idx].first; - auto const in_end = in_begin + str_tuples[idx].second; - auto const num_in_chars = str_tuples[idx].second; - - // Check if the value corresponds to the null literal - auto const is_null_literal = - (!d_chars) && - serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); - if (is_null_literal && null_mask != nullptr) { - clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[idx] = 0; - return; - } - - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - auto str_process_info = process_string(in_begin, in_end, d_buffer, options); - if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { - if (null_mask != nullptr) { - clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); - } - if (!d_chars) d_offsets[idx] = 0; - } else { - if (!d_chars) d_offsets[idx] = str_process_info.bytes; - } - } -}; -/** - * @brief Parses the data from an iterator of string views, casting it to the given target data type - * - * @param str_tuples Iterator returning a string view, i.e., a (ptr, length) pair - * @param col_size The total number of items of this column - * @param col_type The column's target data type - * @param null_mask A null mask that renders certain items from the input invalid - * @param options Settings for controlling the processing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr The resource to be used for device memory allocation - * @return The column that contains the parsed data - */ -template -std::unique_ptr parse_data(str_tuple_it str_tuples, - size_type col_size, - data_type col_type, - B&& null_mask, - size_type null_count, - cudf::io::parse_options_view const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - - auto d_null_count = rmm::device_scalar(null_count, stream); - auto null_count_data = d_null_count.data(); - - if (col_type == cudf::data_type{cudf::type_id::STRING}) { - // this utility calls the functor to build the offsets and chars columns; - // the bitmask and null count may be updated by parse failures - auto [offsets, chars] = cudf::strings::detail::make_strings_children( - string_parse{ - str_tuples, static_cast(null_mask.data()), null_count_data, options}, - col_size, - stream, - mr); - - return make_strings_column(col_size, - std::move(offsets), - std::move(chars), - d_null_count.value(stream), - std::move(null_mask)); - } - - auto out_col = - make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); - auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); - - // use existing code (`ConvertFunctor`) to convert values - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( - size_type row) { - if (col.is_null(row)) { return; } - auto const in = str_tuples[row]; - - auto const is_null_literal = - serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); - - if (is_null_literal) { - col.set_null(row); - atomicAdd(null_count_data, 1); - return; - } - - // If this is a string value, remove quotes - auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); - - auto const is_parsed = cudf::type_dispatcher(col_type, - ConvertFunctor{}, - in_begin, - in_end, - col.data(), - row, - col_type, - options, - false); - if (not is_parsed) { - col.set_null(row); - atomicAdd(null_count_data, 1); - } - }); - - out_col->set_null_count(d_null_count.value(stream)); - - return out_col; -} - -} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index cabf904f020..5d7fb9d6b43 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -16,14 +16,13 @@ #include "nested_json.hpp" #include -#include +#include #include #include #include #include #include -#include #include #include #include @@ -331,23 +330,27 @@ std::vector copy_strings_to_host(device_span input, { CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); - rmm::device_uvector> string_views(num_strings, stream); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); thrust::transform(rmm::exec_policy(stream), d_offset_pairs, d_offset_pairs + num_strings, - string_views.begin(), - [data = input.data()] __device__(auto const& offsets) { + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto const& offsets) { // Note: first character for non-field columns - return thrust::make_pair( - data + thrust::get<0>(offsets), + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); }); cudf::io::parse_options_view options_view{}; options_view.quotechar = '\0'; // no quotes options_view.keepquotes = true; - auto d_column_names = parse_data(string_views.begin(), + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, num_strings, data_type{type_id::STRING}, rmm::device_buffer{}, @@ -355,7 +358,7 @@ std::vector copy_strings_to_host(device_span input, options_view, stream, rmm::mr::get_current_device_resource()); - auto to_host = [stream](auto const& col) { + auto to_host = [stream](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( @@ -763,19 +766,6 @@ std::pair, std::vector> device_json_co // TODO how about directly storing pair in json_column? auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); data_type target_type{}; @@ -790,12 +780,13 @@ std::pair, std::vector> device_json_co // Infer column type, if we don't have an explicit type for it else { target_type = cudf::io::detail::infer_data_type( - options.json_view(), d_input, string_ranges_it, col_size, stream); + options.json_view(), d_input, offset_length_it, col_size, stream); } auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 0b49f97597d..06ac11485cb 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -19,14 +19,13 @@ #include #include #include -#include +#include #include #include #include #include #include -#include #include #include #include @@ -1949,20 +1948,6 @@ std::pair, std::vector> json_column_to auto offset_length_it = thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - data_type target_type{}; if (schema.has_value()) { @@ -1978,7 +1963,7 @@ std::pair, std::vector> json_column_to target_type = cudf::io::detail::infer_data_type(parsing_options(options, stream).json_view(), d_input, - string_ranges_it, + offset_length_it, col_size, stream); } @@ -1986,7 +1971,8 @@ std::pair, std::vector> json_column_to auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1e44522ed33..2d363c51fce 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -20,6 +20,7 @@ */ #include +#include #include #include @@ -27,9 +28,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu new file mode 100644 index 00000000000..1772e5e43fa --- /dev/null +++ b/cpp/src/io/utilities/data_casting.cu @@ -0,0 +1,987 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cudf::io::json::detail { + +constexpr auto SINGLE_THREAD_THRESHOLD = 128; +constexpr auto WARP_THRESHOLD = 128 * 128; // 16K + +// Unicode code point escape sequence +static constexpr char UNICODE_SEQ = 0x7F; + +// Invalid escape sequence +static constexpr char NON_ESCAPE_CHAR = 0x7E; + +// Unicode code point escape sequence prefix comprises '\' and 'u' characters +static constexpr size_type UNICODE_ESC_PREFIX = 2; + +// Unicode code point escape sequence comprises four hex characters +static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4; + +// A unicode code point escape sequence is \uXXXX +static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT; + +static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800; +static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; + +/** + * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or + * whether type casting failed. + */ +enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE }; + +/** + * @brief Providing additional information about the type casting result. + */ +struct data_casting_result_info { + // Number of bytes written to output + size_type bytes; + // Whether parsing succeeded, item was parsed to null, or failed + data_casting_result result; +}; + +/** + * @brief Returns the character to output for a given escaped character that's following a + * backslash. + * + * @param escaped_char The character following the backslash. + * @return The character to output for a given character that's following a backslash + */ +__device__ __forceinline__ char get_escape_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return '"'; + case '\\': return '\\'; + case '/': return '/'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'u': return UNICODE_SEQ; + default: return NON_ESCAPE_CHAR; + } +} + +/** + * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence + * \uXXXX. + * + * @param str Pointer to the first (most-significant) hex digit + * @return The parsed hex value if successful, -1 otherwise. + */ +__device__ __forceinline__ int32_t parse_unicode_hex(char const* str) +{ + // Prepare result + int32_t result = 0, base = 1; + constexpr int32_t hex_radix = 16; + + // Iterate over hex digits right-to-left + size_type index = UNICODE_HEX_DIGIT_COUNT; + while (index-- > 0) { + char const ch = str[index]; + if (ch >= '0' && ch <= '9') { + result += static_cast((ch - '0') + 0) * base; + base *= hex_radix; + } else if (ch >= 'A' && ch <= 'F') { + result += static_cast((ch - 'A') + 10) * base; + base *= hex_radix; + } else if (ch >= 'a' && ch <= 'f') { + result += static_cast((ch - 'a') + 10) * base; + base *= hex_radix; + } else { + return -1; + } + } + return result; +} + +/** + * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to + * \p out_it + */ +constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) +{ + auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) + : strings::detail::from_char_utf8(character, out_it); + if (out_it) out_it += bytes; + return bytes; +} + +/** + * @brief Processes a string, replaces escape sequences and optionally strips off the quote + * characters. + * + * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to + * char + * @param in_begin Iterator to the first item to process + * @param in_end Iterator to one past the last item to process + * @param d_buffer Output character buffer to the first item to write + * @param options Settings for controlling string processing behavior + * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is + * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether + * parsing succeeded, item was parsed to null, or failed. + */ +template +__device__ __forceinline__ data_casting_result_info +process_string(in_iterator_t in_begin, + in_iterator_t in_end, + char* d_buffer, + cudf::io::parse_options_view const& options) +{ + int32_t bytes = 0; + auto const num_in_chars = thrust::distance(in_begin, in_end); + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); + + // Copy literal/numeric value + if (not is_string_value) { + bytes += (in_end - in_begin); + if (d_buffer) d_buffer = thrust::copy(thrust::seq, in_begin, in_end, d_buffer); + return {bytes, data_casting_result::PARSING_SUCCESS}; + } + char constexpr backslash_char = '\\'; + + // Escape-flag, set after encountering a backslash character + bool is_prev_char_escape = false; + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + + // Iterate over the input + while (in_begin != in_end) { + // Copy single character to output + if (!is_prev_char_escape) { + is_prev_char_escape = (*in_begin == backslash_char); + if (!is_prev_char_escape) { + if (d_buffer) *d_buffer++ = *in_begin; + ++bytes; + } + ++in_begin; + continue; + } + + // Previous char indicated beginning of escape sequence + // Reset escape flag for next loop iteration + is_prev_char_escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(*in_begin); + + // We escaped an invalid escape character -> "fail"/null for this item + if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; } + + // Regular, single-character escape + if (escaped_char != UNICODE_SEQ) { + if (d_buffer) *d_buffer++ = escaped_char; + ++bytes; + ++in_begin; + continue; + } + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + // Skip over the 'u' char from \uXXXX to the first hex digit + ++in_begin; + + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) { + return {bytes, data_casting_result::PARSING_FAILURE}; + } + + auto hex_val = parse_unicode_hex(in_begin); + + // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item + if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; } + + // Skip over the four hex digits + thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + int32_t hex_low_val = 0; + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && + *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { + // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low + // surrogate + hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2)); + } + + // This is indeed a UTF16 surrogate pair + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Skip over the second \uXXXX sequence + thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS); + + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + bytes += write_utf8_char(utf8_chars, d_buffer); + } else { + // Just a single \uXXXX sequence + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + bytes += write_utf8_char(utf8_chars, d_buffer); + } + } + + // The last character of the input is a backslash -> "fail"/null for this item + if (is_prev_char_escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } + return {bytes, data_casting_result::PARSING_SUCCESS}; +} + +/** + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * warp. + * + * @tparam num_warps number of warps in the block + */ +template +struct bitfield_warp { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. + // 5+32 for each warp. + bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size]; + + /// Sets all bits to 0 + __device__ void reset(unsigned warp_id) + { + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) { + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = 0; + } + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0; + } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = + is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size]; + __syncwarp(); + } + + /// Each thread in a warp sets its own bit. + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] = + is_escaping_backslash; + __syncwarp(); + } + + /// Each thread in a warp gets the requested bit. + __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[warp_id][UNICODE_LOOK_BACK + bit_index]; + } +}; + +/** + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * block. + * + * @tparam num_warps number of warps in the block + */ +template +struct bitfield_block { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. + // 5 + num_warps*32 for entire block + bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size]; + + /// Sets all bits to 0 + __device__ void reset(unsigned warp_id) + { + if (threadIdx.x < UNICODE_LOOK_BACK) { is_slash[threadIdx.x] = 0; } + is_slash[threadIdx.x + UNICODE_LOOK_BACK] = 0; + } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x < UNICODE_LOOK_BACK) + is_slash[threadIdx.x] = is_slash[num_warps * cudf::detail::warp_size + threadIdx.x]; + __syncthreads(); + } + + /// Each thread in a block sets its own bit. + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash; + __syncthreads(); + } + + /// Each thread in a block gets the requested bit. + __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[UNICODE_LOOK_BACK + bit_index]; + } +}; + +// Algorithm: warp/block parallel version of string_parse and process_string() +// Decoding character classes (u8, u16, \*, *): +// character count: input->output +// \uXXXX 6->2/3/4 +// \uXXXX\uXXXX 12->2/3/4 +// \" 2->1 +// * 1->1 +// +// ERROR conditions. (all collaborating threads quit) +// c=='\' & curr_idx == end_idx-1; +// [c-1]=='\' & get_escape[c]==NEC +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex +// +// skip conditions. (current thread skips this char, no output) +// c=='\' skip. (Escaping char only) +// [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip. +// +// write conditions. (write to d_buffer) +// [c-1]!='\' & [c]!='\' write [c] +// [c-1]!='\' & [c]=='\' skip (already covered in skip conditions) +// [c-1]=='\' & [c]!=NEC && [c]!=UNICODE_SEQ, write [c] +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE +// [c+1:4]=curr_hex_val +// // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c+7:4]=next_hex_val +// // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val +// // if prev_hex_val in high, curr_hex_val in low, skip. +// // if curr_hex_val in high, next_hex_val in low, write [u16] +// if curr_hex_val not in high, write [u8] +// before writing, find num of output characters per threads, +// then do intra-warp/intra-block scan for out_idx +// propagate offset from next iteration to carry forward. +// Uses 1 warp per string or 1 block per string + +/** + * @brief Warp/Block parallel version of string_parse functor + * + * @tparam is_warp True if 1 warp per string, False if 1 block per string + * @tparam num_warps Number of warps per block + * @tparam str_tuple_it Iterator type for tuple with string pointer and its length + * @param str_tuples iterator of tuple with string pointer and its length + * @param total_out_strings Number of string rows to be processed + * @param str_counter Counter to keep track of processed number of strings + * @param null_mask Null mask + * @param null_count_data pointer to store null count + * @param options Settings for controlling string processing behavior + * @param d_offsets Offsets to identify where to store the results for each string + * @param d_chars Character array to store the characters of strings + */ +template +__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) +{ + constexpr auto BLOCK_SIZE = + is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps; + size_type lane = is_warp ? (threadIdx.x % BLOCK_SIZE) : threadIdx.x; + + // get 1-string index per warp/block + auto get_next_string = [&]() { + if constexpr (is_warp) { + size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + return __shfl_sync(0xffffffff, istring, 0); + } else { + // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's + // istring value + __syncthreads(); + __shared__ size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + __syncthreads(); + return istring; + } + }; + // grid-stride loop. + for (size_type istring = get_next_string(); istring < total_out_strings; + istring = get_next_string()) { + // skip nulls + if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { + if (!d_chars && lane == 0) d_offsets[istring] = 0; + continue; // gride-stride return; + } + + auto in_begin = str_tuples[istring].first; + auto in_end = in_begin + str_tuples[istring].second; + auto const num_in_chars = str_tuples[istring].second; + if constexpr (is_warp) { + if (num_in_chars <= SINGLE_THREAD_THRESHOLD or num_in_chars > WARP_THRESHOLD) continue; + } else { + if (num_in_chars <= WARP_THRESHOLD) continue; + } + + // Check if the value corresponds to the null literal + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + if (lane == 0) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[istring] = 0; + } + continue; // gride-stride return; + } + } + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); + char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr; + + // Copy literal/numeric value + if (not is_string_value) { + if (!d_chars) { + if (lane == 0) { d_offsets[istring] = in_end - in_begin; } + } else { + for (thread_index_type char_index = lane; char_index < (in_end - in_begin); + char_index += BLOCK_SIZE) { + d_buffer[char_index] = in_begin[char_index]; + } + } + continue; // gride-stride return; + } + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + // warp-parallelized or block-parallelized process_string() + + auto is_hex = [](auto ch) { + return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); + }; + + // for backslash scan calculation: is_previous_escaping_backslash + [[maybe_unused]] auto warp_id = threadIdx.x / cudf::detail::warp_size; + bool init_state_reg; + __shared__ bool init_state_shared; + size_type last_offset_reg; + __shared__ size_type last_offset_shared; + bool& init_state(is_warp ? init_state_reg : init_state_shared); + size_type& last_offset(is_warp ? last_offset_reg : last_offset_shared); + if (is_warp || lane == 0) { + init_state = false; + last_offset = 0; + } + using bitfield = + std::conditional_t, bitfield_block>; + __shared__ bitfield is_slash; + is_slash.reset(warp_id); + __syncthreads(); + // 0-31, 32-63, ... i*32-n. + // entire warp executes but with mask. + for (thread_index_type char_index = lane; + char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); + char_index += BLOCK_SIZE) { + bool const is_within_bounds = char_index < (in_end - in_begin); + auto const MASK = is_warp ? __ballot_sync(0xffffffff, is_within_bounds) : 0xffffffff; + auto const c = is_within_bounds ? in_begin[char_index] : '\0'; + auto const prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; + auto const escaped_char = get_escape_char(c); + + bool is_escaping_backslash{false}; + [[maybe_unused]] bool is_prev_escaping_backslash{false}; + // To check current is backslash by checking if previous is backslash. + // curr = !prev & c=='\\' + // So, scan is required from beginning of string. + // State table approach (intra-warp FST) (intra-block FST) + // 2 states: Not-Slash(NS), Slash(S). + // prev / * + // NS S NS + // S NS NS + // After inclusive scan, all current S states translate to escaping backslash. + // All escaping backslash should be skipped. + + struct state_table { + // using bit fields instead of state[2] + bool state0 : 1; + bool state1 : 1; + bool inline __device__ get(bool init_state) const { return init_state ? state1 : state0; } + }; + state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. + auto composite_op = [](state_table op1, state_table op2) { + // equivalent of state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; + return state_table{op1.state0 ? op2.state1 : op2.state0, + op1.state1 ? op2.state1 : op2.state0}; + }; + state_table scanned; + // inclusive scan of escaping backslashes + if constexpr (is_warp) { + using SlashScan = cub::WarpScan; + __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; + SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.get(init_state); + init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); + __syncwarp(); + is_slash.shift(warp_id); + is_slash.set_bits(warp_id, is_escaping_backslash); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); + } else { + using SlashScan = cub::BlockScan; + __shared__ typename SlashScan::TempStorage temp_slash; + SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.get(init_state); + __syncthreads(); + if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; + __syncthreads(); + is_slash.shift(warp_id); + is_slash.set_bits(warp_id, is_escaping_backslash); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); + // There is another __syncthreads() at the end of for-loop. + } + + // String with parsing errors are made as null + bool error = false; + if (is_within_bounds) { + // curr=='\' and end, or prev=='\' and curr=='u' and end-curr < UNICODE_HEX_DIGIT_COUNT + // or prev=='\' and curr=='u' and end-curr >= UNICODE_HEX_DIGIT_COUNT and any non-hex + error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1); + error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR); + error |= (is_prev_escaping_backslash && c == 'u' && + ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | + !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | + !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); + } + // Make sure all threads have no errors before continuing + if constexpr (is_warp) { + error = __any_sync(MASK, error); + } else { + using ErrorReduce = cub::BlockReduce; + __shared__ typename ErrorReduce::TempStorage temp_storage_error; + __shared__ bool error_reduced; + error_reduced = ErrorReduce(temp_storage_error).Sum(error); // TODO use cub::LogicalOR. + // only valid in thread0, so shared memory is used for broadcast. + __syncthreads(); + error = error_reduced; + } + // If any thread has an error, skip the rest of the string and make this string as null + if (error) { + if (!d_chars && lane == 0) { + if (null_mask != nullptr) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + } + last_offset = 0; + d_offsets[istring] = 0; + } + if constexpr (!is_warp) { __syncthreads(); } + break; // gride-stride return; + } + + // Skipping non-copied escaped characters + bool skip = !is_within_bounds; // false; + // skip \ for \" \\ \/ \b \f \n \r \t \uXXXX + skip |= is_escaping_backslash; + if (is_within_bounds) { + // skip X for each X in \uXXXX + skip |= + char_index >= 2 && is_slash.get_bit(warp_id, lane - 2) && in_begin[char_index - 1] == 'u'; + skip |= + char_index >= 3 && is_slash.get_bit(warp_id, lane - 3) && in_begin[char_index - 2] == 'u'; + skip |= + char_index >= 4 && is_slash.get_bit(warp_id, lane - 4) && in_begin[char_index - 3] == 'u'; + skip |= + char_index >= 5 && is_slash.get_bit(warp_id, lane - 5) && in_begin[char_index - 4] == 'u'; + } + int this_num_out = 0; + cudf::char_utf8 write_char{}; + + if (!skip) { + // 1. Unescaped character + if (!is_prev_escaping_backslash) { + this_num_out = 1; + // writes char directly for non-unicode + } else { + // 2. Escaped character + if (escaped_char != UNICODE_SEQ) { + this_num_out = 1; + // writes char directly for non-unicode + } else { + // 3. Unicode + // UTF8 \uXXXX + auto hex_val = parse_unicode_hex(in_begin + char_index + 1); + auto hex_low_val = 0; + // UTF16 \uXXXX\uXXXX + // Note: no need for scanned_backslash below because we already know that + // only '\u' check is enough. + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + (in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) < + in_end && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS - 1] == '\\' && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS] == 'u') { + hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); + } + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + write_char = strings::detail::codepoint_to_utf8(unicode_code_point); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } else { + // if hex_val is high surrogate, ideally it should be parsing failure. + // but skipping it as other parsers do this too. + if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { + // Ideally this should be skipped if previous char is high surrogate. + skip = true; + this_num_out = 0; + write_char = 0; + } else { + // if UTF8 + write_char = strings::detail::codepoint_to_utf8(hex_val); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } + } + } + } + } // !skip end. + { + // compute offset to write output for each thread + size_type offset; + if constexpr (is_warp) { + using OffsetScan = cub::WarpScan; + __shared__ typename OffsetScan::TempStorage temp_storage[num_warps]; + OffsetScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); + } else { + using OffsetScan = cub::BlockScan; + __shared__ typename OffsetScan::TempStorage temp_storage; + OffsetScan(temp_storage).ExclusiveSum(this_num_out, offset); + __syncthreads(); + } + offset += last_offset; + // Write output + if (d_chars && !skip) { + auto const is_not_unicode = (!is_prev_escaping_backslash) || escaped_char != UNICODE_SEQ; + if (is_not_unicode) { + *(d_buffer + offset) = (!is_prev_escaping_backslash) ? c : escaped_char; + } else { + strings::detail::from_char_utf8(write_char, d_buffer + offset); + } + } + offset += this_num_out; + if constexpr (is_warp) { + last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); + } else { + __syncthreads(); + if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset; + __syncthreads(); + } + } + } // char for-loop + if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } + } // grid-stride for-loop +} + +template +struct string_parse { + str_tuple_it str_tuples; + bitmask_type* null_mask; + size_type* null_count_data; + cudf::io::parse_options_view const options; + size_type* d_offsets{}; + char* d_chars{}; + + __device__ void operator()(size_type idx) + { + if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const in_begin = str_tuples[idx].first; + auto const in_end = in_begin + str_tuples[idx].second; + auto const num_in_chars = str_tuples[idx].second; + + if (num_in_chars > SINGLE_THREAD_THRESHOLD) return; + + // Check if the value corresponds to the null literal + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[idx] = 0; + return; + } + } + + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto str_process_info = process_string(in_begin, in_end, d_buffer, options); + if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { + if (null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + } + if (!d_chars) d_offsets[idx] = 0; + } else { + if (!d_chars) d_offsets[idx] = str_process_info.bytes; + } + } +}; + +template +struct to_string_view_pair { + SymbolT const* data; + to_string_view_pair(SymbolT const* _data) : data(_data) {} + __device__ auto operator()(thrust::tuple ip) + { + return thrust::pair{data + thrust::get<0>(ip), + static_cast(thrust::get<1>(ip))}; + } +}; + +template +static std::unique_ptr parse_string(string_view_pair_it str_tuples, + size_type col_size, + rmm::device_buffer&& null_mask, + rmm::device_scalar& d_null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // CUDF_FUNC_RANGE(); + + auto const max_length = thrust::transform_reduce( + rmm::exec_policy(stream), + str_tuples, + str_tuples + col_size, + [] __device__(auto t) { return t.second; }, + size_type{0}, + thrust::maximum{}); + + auto offsets = cudf::make_numeric_column( + data_type{type_to_id()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + auto null_count_data = d_null_count.data(); + + auto single_thread_fn = string_parse{ + str_tuples, static_cast(null_mask.data()), null_count_data, options, d_offsets}; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + constexpr auto warps_per_block = 8; + constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; + auto num_blocks = cudf::util::div_rounding_up_safe(col_size, warps_per_block); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + + // TODO run these independent kernels in parallel streams. + if (max_length > SINGLE_THREAD_THRESHOLD) { + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + + if (max_length > WARP_THRESHOLD) { + // for strings longer than WARP_THRESHOLD, 1 block per string + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", + std::overflow_error); + + // CHARS column + std::unique_ptr chars = + strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); + auto d_chars = chars->mutable_view().data(); + + single_thread_fn.d_chars = d_chars; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + if (max_length > SINGLE_THREAD_THRESHOLD) { + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + if (max_length > WARP_THRESHOLD) { + str_counter.set_value(0, stream); + // for strings longer than WARP_THRESHOLD, 1 block per string + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + return make_strings_column(col_size, + std::move(offsets), + std::move(chars), + d_null_count.value(stream), + std::move(null_mask)); +} + +std::unique_ptr parse_data( + const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + if (col_size == 0) { return make_empty_column(col_type); } + auto d_null_count = rmm::device_scalar(null_count, stream); + auto null_count_data = d_null_count.data(); + + // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion + auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); + + if (col_type == cudf::data_type{cudf::type_id::STRING}) { + return parse_string(str_tuples, + col_size, + std::forward(null_mask), + d_null_count, + options, + stream, + mr); + } + + auto out_col = + make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); + auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); + + // use `ConvertFunctor` to convert non-string values + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( + size_type row) { + if (col.is_null(row)) { return; } + auto const in = str_tuples[row]; + + auto const is_null_literal = + serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); + + if (is_null_literal) { + col.set_null(row); + atomicAdd(null_count_data, 1); + return; + } + + // If this is a string value, remove quotes + auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); + + auto const is_parsed = cudf::type_dispatcher(col_type, + ConvertFunctor{}, + in_begin, + in_end, + col.data(), + row, + col_type, + options, + false); + if (not is_parsed) { + col.set_null(row); + atomicAdd(null_count_data, 1); + } + }); + + out_col->set_null_count(d_null_count.value(stream)); + + return out_col; +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 5c3af588411..43d62fcd513 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,6 +116,28 @@ struct parse_options { } }; +/** + * @brief Returns the escaped characters for a given character. + * + * @param escaped_char The character to escape. + * @return The escaped characters for a given character. + */ +__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return {'\\', '"'}; + case '\\': return {'\\', '\\'}; + case '/': return {'\\', '/'}; + case '\b': return {'\\', 'b'}; + case '\f': return {'\\', 'f'}; + case '\n': return {'\\', 'n'}; + case '\r': return {'\\', 'r'}; + case '\t': return {'\\', 't'}; + // case 'u': return UNICODE_SEQ; + default: return {'\0', escaped_char}; + } +} + /** * @brief Returns the numeric value of an ASCII/UTF-8 character. * Handles hexadecimal digits, both uppercase and lowercase diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp new file mode 100644 index 00000000000..12fc0a5b2e7 --- /dev/null +++ b/cpp/src/io/utilities/string_parsing.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace cudf::io { +namespace detail { + +/** + * @brief Infers data type for a given JSON string input `data`. + * + * @throw cudf::logic_error if input size is 0 + * @throw cudf::logic_error if date time is not inferred as string + * @throw cudf::logic_error if data type inference failed + * + * @param options View of inference options + * @param data JSON string input + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param size Size of the string input + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The inferred data type + */ +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream); +} // namespace detail + +namespace json::detail { + +/** + * @brief Parses the data from an iterator of string views, casting it to the given target data type + * + * @param data string input base pointer + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param col_size The total number of items of this column + * @param col_type The column's target data type + * @param null_mask A null mask that renders certain items from the input invalid + * @param options Settings for controlling the processing behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr The resource to be used for device memory allocation + * @return The column that contains the parsed data + */ +std::unique_ptr parse_data( + const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); +} // namespace json::detail +} // namespace cudf::io diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cu similarity index 84% rename from cpp/src/io/utilities/type_inference.cuh rename to cpp/src/io/utilities/type_inference.cu index a9ccc80ca33..79a5c8f1c4c 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,23 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once #include -#include +#include #include #include -#include #include -#include -#include #include -#include -#include - #include #include @@ -114,14 +107,14 @@ __device__ __inline__ bool is_like_float(std::size_t len, * * @param[in] options View of inference options * @param[in] data JSON string input - * @param[in] column_strings_begin The beginning of an offset-length tuple sequence + * @param[in] offset_length_begin The beginning of an offset-length tuple sequence * @param[in] size Size of the string input * @param[out] column_info Histogram of column type counters */ template __global__ void infer_column_type_kernel(OptionsView options, device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t size, cudf::io::column_type_histogram* column_info) { @@ -129,8 +122,8 @@ __global__ void infer_column_type_kernel(OptionsView options, for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; idx += gridDim.x * blockDim.x) { - auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); - auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); + auto const field_offset = thrust::get<0>(*(offset_length_begin + idx)); + auto const field_len = thrust::get<1>(*(offset_length_begin + idx)); auto const field_begin = data.begin() + field_offset; if (cudf::detail::serialized_trie_contains( @@ -234,7 +227,7 @@ __global__ void infer_column_type_kernel(OptionsView options, * * @param options View of inference options * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence + * @param offset_length_begin The beginning of an offset-length tuple sequence * @param size Size of the string input * @param stream CUDA stream used for device memory operations and kernel launches * @return A histogram containing column-specific type counters @@ -242,7 +235,7 @@ __global__ void infer_column_type_kernel(OptionsView options, template cudf::io::column_type_histogram infer_column_type(OptionsView const& options, cudf::device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t const size, rmm::cuda_stream_view stream) { @@ -254,40 +247,22 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); infer_column_type_kernel<<>>( - options, data, column_strings_begin, size, d_column_info.data()); + options, data, offset_length_begin, size, d_column_info.data()); return d_column_info.value(stream); } -/** - * @brief Infers data type for a given JSON string input `data`. - * - * @throw cudf::logic_error if input size is 0 - * @throw cudf::logic_error if date time is not inferred as string - * @throw cudf::logic_error if data type inference failed - * - * @tparam OptionsView Type of inference options view - * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to - * `thrust::tuple` - * - * @param options View of inference options - * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence - * @param size Size of the string input - * @param stream CUDA stream used for device memory operations and kernel launches - * @return The inferred data type - */ -template -cudf::data_type infer_data_type(OptionsView const& options, - device_span data, - ColumnStringIter column_strings_begin, - std::size_t const size, - rmm::cuda_stream_view stream) +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(size != 0, "No data available for data type inference.\n"); - auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream); + auto const h_column_info = infer_column_type(options, data, offset_length_begin, size, stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 220f1a3391f..7c911ac2e04 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1370,6 +1371,124 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view()); } +TEST_F(JsonReaderTest, JsonLongString) +{ + // Unicode + // 0000-FFFF Basic Multilingual Plane + // 10000-10FFFF Supplementary Plane + cudf::test::strings_column_wrapper col1{ + { + "\"\\/\b\f\n\r\t", + "\"", + "\\", + "/", + "\b", + "\f\n", + "\r\t", + "$€", + "ராபிட்ஸ்", + "C𝞵𝓓𝒻", + "", // null + "", // null + "கார்த்தி", + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉", // 30000-3FFFF + R"("$€ \u0024\u20ac \\u0024\\u20ac \\\u0024\\\u20ac \\\\u0024\\\\u20ac)", + R"( \\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\abcd)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + }, + cudf::test::iterators::nulls_at({10, 11})}; + + cudf::test::fixed_width_column_wrapper repeat_times{ + {1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3}, + cudf::test::iterators::no_nulls()}; + auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times); + auto col2 = d_col2->view(); + cudf::table_view const tbl_view{{col1, col2, repeat_times}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(true) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + + cudf::table_view const expected = tbl_view; + std::map types; + types["col1"] = data_type{type_id::STRING}; + types["col2"] = data_type{type_id::STRING}; + types["int16"] = data_type{type_id::INT16}; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}) + .lines(true) + .dtypes(types); + + // Read test data via nested JSON reader + auto const table = cudf::io::read_json(json_lines_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view()); +} + +TEST_F(JsonReaderTest, ErrorStrings) +{ + // cases of invalid escape characters, invalid unicode encodings. + // Error strings will decode to nulls + auto const buffer = std::string{R"( + {"col0": "\"\a"} + {"col0": "\u"} + {"col0": "\u0"} + {"col0": "\u0b"} + {"col0": "\u00b"} + {"col0": "\u00bz"} + {"col0": "\t34567890123456\t9012345678901\ug0bc"} + {"col0": "\t34567890123456\t90123456789012\u0hbc"} + {"col0": "\t34567890123456\t90123456789012\u00ic"} + {"col0": "\u0b95\u0bbe\u0bb0\u0bcd\u0ba4\u0bcd\u0ba4\u0bbfகார்த்தி"} +)"}; + // Last one is not an error case, but shows that unicode in json is copied string column output. + + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) + .dtypes({data_type{cudf::type_id::STRING}}) + .lines(true) + .legacy(false); + + auto const result = cudf::io::read_json(in_opts); + auto const result_view = result.tbl->view().column(0); + + EXPECT_EQ(result.metadata.schema_info[0].name, "col0"); + EXPECT_EQ(result_view.null_count(), 9); + cudf::test::strings_column_wrapper expected{ + {"", + "", + "", + "", + "", + "", + "", + "", + "", + "கார்த்தி\xe0\xae\x95\xe0\xae\xbe\xe0\xae\xb0\xe0\xaf\x8d\xe0\xae\xa4\xe0\xaf\x8d\xe0\xae\xa4" + "\xe0\xae\xbf"}, + // unicode hex 0xe0 0xae 0x95 0xe0 0xae 0xbe 0xe0 0xae 0xb0 0xe0 0xaf 0x8d + // 0xe0 0xae 0xa4 0xe0 0xaf 0x8d 0xe0 0xae 0xa4 0xe0 0xae 0xbf + cudf::test::iterators::nulls_at({0, 1, 2, 3, 4, 5, 6, 7, 8})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected); +} + TEST_F(JsonReaderTest, TokenAllocation) { std::array const json_inputs{ diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 5c32131114d..9eb5e8f5230 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -21,15 +21,20 @@ #include #include +#include + #include #include #include -#include #include #include #include #include +#include + +#include +#include #include using namespace cudf::test::iterators; @@ -37,13 +42,27 @@ using namespace cudf::test::iterators; struct JSONTypeCastTest : public cudf::test::BaseFixture {}; namespace { -struct to_thrust_pair_fn { - __device__ thrust::pair operator()( - thrust::pair const& p) +struct offsets_to_length { + __device__ cudf::size_type operator()(thrust::tuple const& p) { - return {p.first.data(), p.first.size_bytes()}; + return thrust::get<1>(p) - thrust::get<0>(p); } }; + +/// Returns length of each string in the column +auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream) +{ + auto offsets_begin = column.offsets_begin(); + auto offsets_pair = + thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); + rmm::device_uvector svs_length(column.size(), stream); + thrust::transform(rmm::exec_policy(cudf::get_default_stream()), + offsets_pair, + offsets_pair + column.size(), + svs_length.begin(), + offsets_to_length{}); + return svs_length; +} } // namespace auto default_json_options() @@ -67,26 +86,23 @@ TEST_F(JSONTypeCastTest, String) std::vector input_values{"this", "is", "null", "of", "", "strings", R"("null")"}; cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids); - auto d_column = cudf::column_device_view::create(input); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(input); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto str_col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); @@ -103,26 +119,23 @@ TEST_F(JSONTypeCastTest, Int) auto const type = cudf::data_type{cudf::type_id::INT64}; cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"}); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; @@ -146,26 +159,23 @@ TEST_F(JSONTypeCastTest, StringEscapes) R"("escape with nothing to escape \")", R"("\"\\\/\b\f\n\r\t")", }); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::strings_column_wrapper{ {"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"}, @@ -173,4 +183,71 @@ TEST_F(JSONTypeCastTest, StringEscapes) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } +TEST_F(JSONTypeCastTest, ErrorNulls) +{ + auto const stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + auto const type = cudf::data_type{cudf::type_id::STRING}; + + // error in decoding + std::vector input_values{R"("\"\a")", + R"("\u")", + R"("\u0")", + R"("\u0b")", + R"("\u00b")", + R"("\u00bz")", + R"("\t34567890123456\t9012345678901\ug0bc")", + R"("\t34567890123456\t90123456789012\u0hbc")", + R"("\t34567890123456\t90123456789012\u00ic")", + R"("\t34567890123456\t9012345678901\")", + R"("\t34567890123456\t90123456789012\")", + R"(null)"}; + // Note: without quotes are copied without decoding + cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end()); + + auto column = cudf::strings_column_view(input); + auto space_length = 128; + auto prepend_space = [&space_length](auto const& s) { + if (s[0] == '"') return "\"" + std::string(space_length, ' ') + std::string(s + 1); + return std::string(s); + }; + std::vector small_input; + std::transform( + input_values.begin(), input_values.end(), std::back_inserter(small_input), prepend_space); + cudf::test::strings_column_wrapper small_col(small_input.begin(), small_input.end()); + + std::vector large_input; + space_length = 128 * 128; + std::transform( + input_values.begin(), input_values.end(), std::back_inserter(large_input), prepend_space); + cudf::test::strings_column_wrapper large_col(large_input.begin(), large_input.end()); + + std::vector expected_values{"", "", "", "", "", "", "", "", "", "", "", ""}; + cudf::test::strings_column_wrapper expected( + expected_values.begin(), expected_values.end(), cudf::test::iterators::all_nulls()); + + // single threads, warp, block. + for (auto const& column : + {column, cudf::strings_column_view(small_col), cudf::strings_column_view(large_col)}) { + rmm::device_uvector svs_length = string_offset_to_length(column, stream); + + auto null_mask_it = no_nulls(); + auto null_mask = + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(str_col->view(), expected); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index b2eb1b94f9c..a14e7ecf5b3 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -14,8 +14,8 @@ * limitations under the License. */ +#include #include -#include #include #include @@ -50,8 +50,8 @@ TEST_F(TypeInference, Basic) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 4, 7}; - auto const string_length = std::vector{2, 2, 1}; + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -83,8 +83,8 @@ TEST_F(TypeInference, Null) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 4}; - auto const string_length = std::vector{0, 2, 1}; + auto const string_offset = std::vector{1, 1, 4}; + auto const string_length = std::vector{0, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -116,8 +116,8 @@ TEST_F(TypeInference, AllNull) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 1}; - auto const string_length = std::vector{0, 0, 4}; + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -149,8 +149,8 @@ TEST_F(TypeInference, String) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 8, 12}; - auto const string_length = std::vector{6, 3, 4}; + auto const string_offset = std::vector{1, 8, 12}; + auto const string_length = std::vector{6, 3, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -182,8 +182,8 @@ TEST_F(TypeInference, Bool) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 6, 12}; - auto const string_length = std::vector{4, 5, 5}; + auto const string_offset = std::vector{1, 6, 12}; + auto const string_length = std::vector{4, 5, 5}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -215,8 +215,8 @@ TEST_F(TypeInference, Timestamp) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 10}; - auto const string_length = std::vector{8, 9}; + auto const string_offset = std::vector{1, 10}; + auto const string_length = std::vector{8, 9}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -249,8 +249,8 @@ TEST_F(TypeInference, InvalidInput) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 3, 5, 7, 9}; - auto const string_length = std::vector{1, 1, 1, 1, 1}; + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async(