Skip to content

Commit

Permalink
Detect mismatches in begin and end tokens returned by JSON tokenizer …
Browse files Browse the repository at this point in the history
…FST (#17471)

Addresses #15820

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: #17471
  • Loading branch information
shrshi authored Dec 19, 2024
1 parent 88df0ad commit dfb7c11
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 6 deletions.
6 changes: 6 additions & 0 deletions cpp/src/io/fst/logical_stack.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,12 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols,
stream));
}

// Check that the stack level of the last element of d_kv_operations is 0. If it is not, the
// logical stack is not empty at the end of the input, which is reported as an error.
if (num_symbols_in && !supports_reset_op) {
StackOpT last_symbol = d_kv_ops_current.element(num_symbols_in - 1, stream);
CUDF_EXPECTS(last_symbol.stack_level == 0, "The logical stack is not empty!");
}

// Stable radix sort, sorting by stack level of the operations
d_kv_operations_unsigned = cub::DoubleBuffer<StackOpUnsignedT>{
reinterpret_cast<StackOpUnsignedT*>(d_kv_operations.Current()),
Expand Down
10 changes: 4 additions & 6 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1473,10 +1473,11 @@ void get_stack_context(device_span<SymbolT const> json_in,
to_stack_op::start_state,
stream);

auto stack_ops_bufsize = d_num_stack_ops.value(stream);
// Copy back to actual number of stack operations
auto num_stack_ops = d_num_stack_ops.value(stream);
// Sequence of stack symbols and their position in the original input (sparse representation)
rmm::device_uvector<StackSymbolT> stack_ops{stack_ops_bufsize, stream};
rmm::device_uvector<SymbolOffsetT> stack_op_indices{stack_ops_bufsize, stream};
rmm::device_uvector<StackSymbolT> stack_ops{num_stack_ops, stream};
rmm::device_uvector<SymbolOffsetT> stack_op_indices{num_stack_ops, stream};

// Run bracket-brace FST to retrieve starting positions of structs and lists
json_to_stack_ops_fst.Transduce(json_in.begin(),
Expand All @@ -1487,9 +1488,6 @@ void get_stack_context(device_span<SymbolT const> json_in,
to_stack_op::start_state,
stream);

// Copy back to actual number of stack operations
auto const num_stack_ops = d_num_stack_ops.value(stream);

// Stack operations with indices are converted to top of the stack for each character in the input
if (stack_behavior == stack_behavior_t::ResetOnDelimiter) {
fst::sparse_stack_op_to_top_of_stack<fst::stack_op_support::WITH_RESET_SUPPORT, StackLevelT>(
Expand Down
11 changes: 11 additions & 0 deletions cpp/tests/io/json/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3450,4 +3450,15 @@ TEST_P(JsonCompressedIOTest, BasicJsonLines)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3}});
}

// Reading JSON Lines input that ends inside an unterminated string (and with an
// unclosed brace) must throw cudf::logic_error when the recovery mode is FAIL.
TEST_F(JsonReaderTest, MismatchedBeginEndTokens)
{
  // Deliberately malformed: the string value and the enclosing object are never closed.
  std::string const malformed_json = R"({"not_valid": "json)";

  auto const reader_options =
    cudf::io::json_reader_options::builder(
      cudf::io::source_info{malformed_json.data(), malformed_json.size()})
      .lines(true)
      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL)
      .build();

  EXPECT_THROW(cudf::io::read_json(reader_options), cudf::logic_error);
}

CUDF_TEST_PROGRAM_MAIN()

0 comments on commit dfb7c11

Please sign in to comment.