Skip to content

Commit

Permalink
removing from modifications to dfa
Browse files Browse the repository at this point in the history
  • Loading branch information
shrshi committed Oct 16, 2024
1 parent 9d2a2f0 commit 3d0a51d
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 85 deletions.
30 changes: 0 additions & 30 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,6 @@ class json_reader_options {
// Normalize unquoted spaces and tabs
bool _normalize_whitespace = false;

bool _nullify_empty_lines = false;

// Whether to recover after an invalid JSON line
json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;

Expand Down Expand Up @@ -315,13 +313,6 @@ class json_reader_options {
*/
[[nodiscard]] bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; }

/**
* @brief Whether the reader should nullify empty lines for json lines format with recovery mode
*
* @returns true if the reader should nullify empty lines, false otherwise
*/
[[nodiscard]] bool is_nullify_empty_lines() const { return _nullify_empty_lines; }

/**
* @brief Queries the JSON reader's behavior on invalid JSON lines.
*
Expand Down Expand Up @@ -511,14 +502,6 @@ class json_reader_options {
*/
void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; }

/**
* @brief Set whether the reader should nullify empty lines for json lines format with recovery
* mode
*
* @param val Boolean value to indicate whether the reader should nullify empty lines
*/
void nullify_empty_lines(bool val) { _nullify_empty_lines = val; }

/**
* @brief Specifies the JSON reader's behavior on invalid JSON lines.
*
Expand Down Expand Up @@ -796,19 +779,6 @@ class json_reader_options_builder {
return *this;
}

/**
* @brief Set whether the reader should nullify empty lines for json lines format with recovery
* mode
*
* @param val Boolean value to indicate whether the reader should nullify empty lines
* @return this for chaining
*/
json_reader_options_builder& nullify_empty_lines(bool val)
{
options._nullify_empty_lines = val;
return *this;
}

/**
* @brief Specifies the JSON reader's behavior on invalid JSON lines.
*
Expand Down
2 changes: 0 additions & 2 deletions cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,15 +265,13 @@ void get_stack_context(device_span<SymbolT const> json_in,
*
* @param tokens The tokens to be post-processed
* @param token_indices The tokens' corresponding indices that are post-processed
* @param nullify_empty_lines Whether to nullify empty lines
* @param stream The cuda stream to dispatch GPU kernels to
* @return Returns the post-processed token stream
*/
CUDF_EXPORT
std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> process_token_stream(
device_span<PdaTokenT const> tokens,
device_span<SymbolOffsetT const> token_indices,
bool nullify_empty_lines,
rmm::cuda_stream_view stream);

/**
Expand Down
41 changes: 15 additions & 26 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,11 @@ using SymbolGroupT = uint8_t;
/**
* @brief Definition of the DFA's states
*/
enum class dfa_states : StateT { START, VALID, INVALID, NUM_STATES };
enum class dfa_states : StateT { VALID, INVALID, NUM_STATES };

// Aliases for readability of the transition table
constexpr auto TT_START = dfa_states::START;
constexpr auto TT_INV = dfa_states::INVALID;
constexpr auto TT_VLD = dfa_states::VALID;
constexpr auto TT_INV = dfa_states::INVALID;
constexpr auto TT_VLD = dfa_states::VALID;

/**
* @brief Definition of the symbol groups
Expand Down Expand Up @@ -240,17 +239,14 @@ struct UnwrapTokenFromSymbolOp {
* invalid lines.
*/
struct TransduceToken {
bool nullify_empty_lines;
template <typename RelativeOffsetT, typename SymbolT>
constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
SymbolGroupT const match_id,
RelativeOffsetT const relative_offset,
SymbolT const read_symbol) const
{
bool const is_empty_invalid =
(nullify_empty_lines && state_id == static_cast<StateT>(TT_START));
bool const is_end_of_invalid_line =
((state_id == static_cast<StateT>(TT_INV) or is_empty_invalid) &&
(state_id == static_cast<StateT>(TT_INV) &&
match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DELIMITER));

if (is_end_of_invalid_line) {
Expand All @@ -270,17 +266,14 @@ struct TransduceToken {
constexpr int32_t num_inv_tokens = 2;

bool const is_delimiter = match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DELIMITER);
bool const is_empty_invalid =
(nullify_empty_lines && state_id == static_cast<StateT>(TT_START));

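// Despite its name, this flag is true when the state is valid and the symbol is not an error;
// if the state is invalid or we're entering an invalid state, it is false and we discard tokens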
bool const is_part_of_invalid_line =
(match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::ERROR) &&
(state_id == static_cast<StateT>(TT_VLD) or state_id == static_cast<StateT>(TT_START)));
state_id == static_cast<StateT>(TT_VLD));

// Indicates whether we transition from an invalid line to a potentially valid line
bool const is_end_of_invalid_line =
((state_id == static_cast<StateT>(TT_INV) or is_empty_invalid) && is_delimiter);
bool const is_end_of_invalid_line = (state_id == static_cast<StateT>(TT_INV) && is_delimiter);

int32_t const emit_count =
is_end_of_invalid_line ? num_inv_tokens : (is_part_of_invalid_line && !is_delimiter ? 1 : 0);
Expand All @@ -291,9 +284,8 @@ struct TransduceToken {
// Transition table
std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const transition_table{
{/* IN_STATE ERROR DELIM OTHER */
/* START */ {{TT_INV, TT_START, TT_VLD}},
/* VALID */ {{TT_INV, TT_START, TT_VLD}},
/* INVALID */ {{TT_INV, TT_START, TT_INV}}}};
/* VALID */ {{TT_INV, TT_VLD, TT_VLD}},
/* INVALID */ {{TT_INV, TT_VLD, TT_INV}}}};

// The DFA's starting state
constexpr auto start_state = static_cast<StateT>(TT_VLD);
Expand Down Expand Up @@ -1515,19 +1507,17 @@ void get_stack_context(device_span<SymbolT const> json_in,
std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> process_token_stream(
device_span<PdaTokenT const> tokens,
device_span<SymbolOffsetT const> token_indices,
bool nullify_empty_lines,
rmm::cuda_stream_view stream)
{
// Instantiate FST for post-processing the token stream to remove all tokens that belong to an
// invalid JSON line
token_filter::UnwrapTokenFromSymbolOp sgid_op{};
using symbol_t = thrust::tuple<PdaTokenT, SymbolOffsetT>;
auto filter_fst =
fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op),
fst::detail::make_transition_table(token_filter::transition_table),
fst::detail::make_translation_functor<symbol_t, 0, 2>(
token_filter::TransduceToken{nullify_empty_lines}),
stream);
using symbol_t = thrust::tuple<PdaTokenT, SymbolOffsetT>;
auto filter_fst = fst::detail::make_fst(
fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op),
fst::detail::make_transition_table(token_filter::transition_table),
fst::detail::make_translation_functor<symbol_t, 0, 2>(token_filter::TransduceToken{}),
stream);

auto const mr = cudf::get_current_device_resource_ref();
rmm::device_scalar<SymbolOffsetT> d_num_selected_tokens(stream, mr);
Expand Down Expand Up @@ -1674,7 +1664,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
tokens.set_element(0, token_t::LineEnd, stream);
validate_token_stream(json_in, tokens, tokens_indices, options, stream);
auto [filtered_tokens, filtered_tokens_indices] =
process_token_stream(tokens, tokens_indices, options.is_nullify_empty_lines(), stream);
process_token_stream(tokens, tokens_indices, stream);
tokens = std::move(filtered_tokens);
tokens_indices = std::move(filtered_tokens_indices);
}
Expand Down Expand Up @@ -2100,7 +2090,6 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
std::vector<std::string> na_values{"", "null"};
na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end());
parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream);

return parse_opts;
}

Expand Down
31 changes: 5 additions & 26 deletions cpp/tests/io/json/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2976,30 +2976,16 @@ TEST_F(JsonReaderTest, JsonDtypeSchema)
cudf::test::debug_output_level::ALL_ERRORS);
}

/**
* @brief Test fixture for parametrized JSON reader tests
*/
struct JsonReaderEmptyRecordTest : public cudf::test::BaseFixture,
public testing::WithParamInterface<bool> {};

// Parametrize qualifying JSON tests for optionally nullifying empty records
INSTANTIATE_TEST_CASE_P(JsonReaderEmptyRecordTest,
JsonReaderEmptyRecordTest,
::testing::Values(true, false));

TEST_P(JsonReaderEmptyRecordTest, HandlingEmptyRecords)
TEST_F(JsonReaderTest, LastRecordInvalid)
{
std::string data = R"(
{"key": "1"}
std::string data = R"({"key": "1"}
{"key": "})";
bool const enable_nullify_empty_rows = GetParam();
std::map<std::string, cudf::io::schema_element> schema{{"key", {dtype<cudf::string_view>()}}};
auto opts =
cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
.dtypes(schema)
.lines(true)
.recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
.nullify_empty_lines(enable_nullify_empty_rows)
.build();
auto const result = cudf::io::read_json(opts);

Expand All @@ -3008,16 +2994,9 @@ TEST_P(JsonReaderEmptyRecordTest, HandlingEmptyRecords)
EXPECT_EQ(result.metadata.schema_info[0].name, "key");
auto const result_view = result.tbl->view().column(0);

if (!enable_nullify_empty_rows) {
EXPECT_EQ(result.tbl->num_rows(), 2);
cudf::test::strings_column_wrapper expected{{"1", ""}, cudf::test::iterators::nulls_at({1})};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected);
} else {
EXPECT_EQ(result.tbl->num_rows(), 3);
cudf::test::strings_column_wrapper expected{{"", "1", ""},
cudf::test::iterators::nulls_at({0, 2})};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected);
}
EXPECT_EQ(result.tbl->num_rows(), 2);
cudf::test::strings_column_wrapper expected{{"1", ""}, cudf::test::iterators::nulls_at({1})};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected);
}

CUDF_TEST_PROGRAM_MAIN()
2 changes: 1 addition & 1 deletion cpp/tests/io/json/nested_json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -864,7 +864,7 @@ TEST_F(JsonTest, PostProcessTokenStream)

// Run system-under-test
auto [d_filtered_tokens, d_filtered_indices] =
cuio_json::detail::process_token_stream(d_tokens, d_offsets, false, stream);
cuio_json::detail::process_token_stream(d_tokens, d_offsets, stream);

auto const filtered_tokens = cudf::detail::make_std_vector_async(d_filtered_tokens, stream);
auto const filtered_indices = cudf::detail::make_std_vector_async(d_filtered_indices, stream);
Expand Down

0 comments on commit 3d0a51d

Please sign in to comment.