diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 12cec712d5d..b662b660557 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -125,8 +125,6 @@ class json_reader_options { // Normalize unquoted spaces and tabs bool _normalize_whitespace = false; - bool _nullify_empty_lines = false; - // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; @@ -315,13 +313,6 @@ class json_reader_options { */ [[nodiscard]] bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; } - /** - * @brief Whether the reader should nullify empty lines for json lines format with recovery mode - * - * @returns true if the reader should nullify empty lines, false otherwise - */ - [[nodiscard]] bool is_nullify_empty_lines() const { return _nullify_empty_lines; } - /** * @brief Queries the JSON reader's behavior on invalid JSON lines. * @@ -511,14 +502,6 @@ class json_reader_options { */ void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; } - /** - * @brief Set whether the reader should nullify empty lines for json lines format with recovery - * mode - * - * @param val Boolean value to indicate whether the reader should nullify empty lines - */ - void nullify_empty_lines(bool val) { _nullify_empty_lines = val; } - /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * @@ -796,19 +779,6 @@ class json_reader_options_builder { return *this; } - /** - * @brief Set whether the reader should nullify empty lines for json lines format with recovery - * mode - * - * @param val Boolean value to indicate whether the reader should nullify empty lines - * @return this for chaining - */ - json_reader_options_builder& nullify_empty_lines(bool val) - { - options._nullify_empty_lines = val; - return *this; - } - /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 23414f80e0c..3d9a51833e0 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -265,7 +265,6 @@ void get_stack_context(device_span json_in, * * @param tokens The tokens to be post-processed * @param token_indices The tokens' corresponding indices that are post-processed - * @param nullify_empty_lines Whether to nullify empty lines * @param stream The cuda stream to dispatch GPU kernels to * @return Returns the post-processed token stream */ @@ -273,7 +272,6 @@ CUDF_EXPORT std::pair, rmm::device_uvector> process_token_stream( device_span tokens, device_span token_indices, - bool nullify_empty_lines, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 6e2c905c8cc..76816071d8c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -196,12 +196,11 @@ using SymbolGroupT = uint8_t; /** * @brief Definition of the DFA's states */ -enum class dfa_states : StateT { START, VALID, INVALID, NUM_STATES }; +enum class dfa_states : StateT { VALID, INVALID, NUM_STATES }; // Aliases for readability of the transition table -constexpr auto TT_START = dfa_states::START; -constexpr auto TT_INV = dfa_states::INVALID; -constexpr auto TT_VLD = dfa_states::VALID; +constexpr auto TT_INV = dfa_states::INVALID; +constexpr auto TT_VLD = dfa_states::VALID; /** * @brief Definition of the symbol groups @@ -240,17 +239,14 @@ struct UnwrapTokenFromSymbolOp { * invalid lines. */ struct TransduceToken { - bool nullify_empty_lines; template constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, SymbolGroupT const match_id, RelativeOffsetT const relative_offset, SymbolT const read_symbol) const { - bool const is_empty_invalid = - (nullify_empty_lines && state_id == static_cast(TT_START)); bool const is_end_of_invalid_line = - ((state_id == static_cast(TT_INV) or is_empty_invalid) && + (state_id == static_cast(TT_INV) && match_id == static_cast(dfa_symbol_group_id::DELIMITER)); if (is_end_of_invalid_line) { @@ -270,17 +266,14 @@ struct TransduceToken { constexpr int32_t num_inv_tokens = 2; bool const is_delimiter = match_id == static_cast(dfa_symbol_group_id::DELIMITER); - bool const is_empty_invalid = - (nullify_empty_lines && state_id == static_cast(TT_START)); // If state is either invalid or we're entering an invalid state, we discard tokens bool const is_part_of_invalid_line = (match_id != static_cast(dfa_symbol_group_id::ERROR) && - (state_id == static_cast(TT_VLD) or state_id == static_cast(TT_START))); + state_id == static_cast(TT_VLD)); // Indicates whether we transition from an invalid line to a potentially valid line - bool const is_end_of_invalid_line = - ((state_id == static_cast(TT_INV) or is_empty_invalid) && is_delimiter); + bool const is_end_of_invalid_line = (state_id == static_cast(TT_INV) && is_delimiter); int32_t const emit_count = is_end_of_invalid_line ? num_inv_tokens : (is_part_of_invalid_line && !is_delimiter ? 1 : 0); @@ -291,9 +284,8 @@ struct TransduceToken { // Transition table std::array, TT_NUM_STATES> const transition_table{ {/* IN_STATE ERROR DELIM OTHER */ - /* START */ {{TT_INV, TT_START, TT_VLD}}, - /* VALID */ {{TT_INV, TT_START, TT_VLD}}, - /* INVALID */ {{TT_INV, TT_START, TT_INV}}}}; + /* VALID */ {{TT_INV, TT_VLD, TT_VLD}}, + /* INVALID */ {{TT_INV, TT_VLD, TT_INV}}}}; // The DFA's starting state constexpr auto start_state = static_cast(TT_VLD); @@ -1515,19 +1507,17 @@ void get_stack_context(device_span json_in, std::pair, rmm::device_uvector> process_token_stream( device_span tokens, device_span token_indices, - bool nullify_empty_lines, rmm::cuda_stream_view stream) { // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; - using symbol_t = thrust::tuple; - auto filter_fst = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), - fst::detail::make_transition_table(token_filter::transition_table), - fst::detail::make_translation_functor( - token_filter::TransduceToken{nullify_empty_lines}), - stream); + using symbol_t = thrust::tuple; + auto filter_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceToken{}), + stream); auto const mr = cudf::get_current_device_resource_ref(); rmm::device_scalar d_num_selected_tokens(stream, mr); @@ -1674,7 +1664,7 @@ std::pair, rmm::device_uvector> ge tokens.set_element(0, token_t::LineEnd, stream); validate_token_stream(json_in, tokens, tokens_indices, options, stream); auto [filtered_tokens, filtered_tokens_indices] = - process_token_stream(tokens, tokens_indices, options.is_nullify_empty_lines(), stream); + process_token_stream(tokens, tokens_indices, stream); tokens = std::move(filtered_tokens); tokens_indices = std::move(filtered_tokens_indices); } @@ -2100,7 +2090,6 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt std::vector na_values{"", "null"}; na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); - return parse_opts; } diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index fea3118cde9..d51ab5d1a48 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2976,30 +2976,16 @@ TEST_F(JsonReaderTest, JsonDtypeSchema) cudf::test::debug_output_level::ALL_ERRORS); } -/** - * @brief Test fixture for parametrized JSON reader tests - */ -struct JsonReaderEmptyRecordTest : public cudf::test::BaseFixture, - public testing::WithParamInterface {}; - -// Parametrize qualifying JSON tests for optionally nullifying empty records -INSTANTIATE_TEST_CASE_P(JsonReaderEmptyRecordTest, - JsonReaderEmptyRecordTest, - ::testing::Values(true, false)); - -TEST_P(JsonReaderEmptyRecordTest, HandlingEmptyRecords) +TEST_F(JsonReaderTest, LastRecordInvalid) { - std::string data = R"( - {"key": "1"} + std::string data = R"({"key": "1"} {"key": "})"; - bool const enable_nullify_empty_rows = GetParam(); std::map schema{{"key", {dtype()}}}; auto opts = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) .dtypes(schema) .lines(true) .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) - .nullify_empty_lines(enable_nullify_empty_rows) .build(); auto const result = cudf::io::read_json(opts); @@ -3008,16 +2994,9 @@ TEST_P(JsonReaderEmptyRecordTest, HandlingEmptyRecords) EXPECT_EQ(result.metadata.schema_info[0].name, "key"); auto const result_view = result.tbl->view().column(0); - if (!enable_nullify_empty_rows) { - EXPECT_EQ(result.tbl->num_rows(), 2); - cudf::test::strings_column_wrapper expected{{"1", ""}, cudf::test::iterators::nulls_at({1})}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected); - } else { - EXPECT_EQ(result.tbl->num_rows(), 3); - cudf::test::strings_column_wrapper expected{{"", "1", ""}, - cudf::test::iterators::nulls_at({0, 2})}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected); - } + EXPECT_EQ(result.tbl->num_rows(), 2); + cudf::test::strings_column_wrapper expected{{"1", ""}, cudf::test::iterators::nulls_at({1})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index 8481cbb8483..f32aba0e632 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -864,7 +864,7 @@ TEST_F(JsonTest, PostProcessTokenStream) // Run system-under-test auto [d_filtered_tokens, d_filtered_indices] = - cuio_json::detail::process_token_stream(d_tokens, d_offsets, false, stream); + cuio_json::detail::process_token_stream(d_tokens, d_offsets, stream); auto const filtered_tokens = cudf::detail::make_std_vector_async(d_filtered_tokens, stream); auto const filtered_indices = cudf::detail::make_std_vector_async(d_filtered_indices, stream);