Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in recovering invalid lines in JSONL inputs #17098

Merged
merged 41 commits into from
Oct 30, 2024
Merged
Changes from 1 commit
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
9ff3129
add option to nullify empty lines
karthikeyann Oct 9, 2024
624743b
printf debugging
shrshi Oct 11, 2024
bcecb25
Merge branch 'branch-24.12' into json-quote-char-parsing-fix
shrshi Oct 14, 2024
f9b7e08
Merge branch 'enh-json_nullify_empty_lines' into json-quote-char-pars…
shrshi Oct 15, 2024
55c13a0
added test; fixed small bug in nullifying empty rows
shrshi Oct 16, 2024
9d2a2f0
formatting
shrshi Oct 16, 2024
3d0a51d
removing from modifications to dfa
shrshi Oct 16, 2024
911e065
remove hardcoding of delimiter
shrshi Oct 16, 2024
ab7659b
Merge branch 'branch-24.12' into enh-json_nullify_empty_lines
karthikeyann Oct 17, 2024
0ef5108
Merge branch 'branch-24.12' into enh-json_nullify_empty_lines
shrshi Oct 18, 2024
1dffbf0
Merge branch 'enh-json_nullify_empty_lines' of github.com:karthikeyan…
shrshi Oct 18, 2024
293521f
Update cpp/tests/io/json/json_test.cpp
shrshi Oct 21, 2024
ca8ee32
Merge branch 'branch-24.12' into json-quote-char-parsing-fix
ttnghia Oct 21, 2024
ebc5275
pre-process concat
shrshi Oct 21, 2024
679833b
formatting
shrshi Oct 21, 2024
b192fd2
Merge branch 'branch-24.12' into enh-json_nullify_empty_lines
shrshi Oct 21, 2024
31d5cab
some logic fixes
shrshi Oct 22, 2024
7c3e0f0
formatting
shrshi Oct 22, 2024
35b7177
test
shrshi Oct 22, 2024
9370dc5
formatting
shrshi Oct 22, 2024
6d87031
test cleanup
shrshi Oct 22, 2024
b9005ae
formatting
shrshi Oct 22, 2024
4382ef8
pr reviews
shrshi Oct 22, 2024
f75d8ee
formatting
shrshi Oct 22, 2024
bb9584e
formatting fix
shrshi Oct 22, 2024
6ad06ca
Merge branch 'branch-24.12' into enh-json_nullify_empty_lines
shrshi Oct 22, 2024
424f90f
pr reviews
shrshi Oct 24, 2024
8b48297
Merge branch 'enh-json_nullify_empty_lines' of github.com:karthikeyan…
shrshi Oct 24, 2024
f651087
merge
shrshi Oct 24, 2024
dfba4cd
Merge branch 'json-quote-char-parsing-fix' of github.com:shrshi/cudf …
shrshi Oct 24, 2024
eb82450
Merge branch 'branch-24.12' into json-quote-char-parsing-fix
shrshi Oct 29, 2024
d3193e3
Merge branch 'branch-24.12' into json-quote-char-parsing-fix
shrshi Oct 29, 2024
18f1a6e
Merge branch 'branch-24.12' into json-quote-char-parsing-fix
shrshi Oct 29, 2024
96dce9d
pr reviews
shrshi Oct 29, 2024
f8c5de3
formatting
shrshi Oct 29, 2024
c0d0b3e
Merge branch 'json-quote-char-parsing-fix' of github.com:shrshi/cudf …
shrshi Oct 29, 2024
234c19d
Merge branch 'branch-24.12' into json-quote-char-parsing-fix
shrshi Oct 29, 2024
77b2f99
oops, undoing accidental merge
shrshi Oct 29, 2024
2e37ed4
Merge branch 'json-quote-char-parsing-fix' of github.com:shrshi/cudf …
shrshi Oct 29, 2024
3784be9
Merge branch 'branch-24.12' into json-quote-char-parsing-fix
shrshi Oct 29, 2024
f351242
Merge branch 'branch-24.12' into json-quote-char-parsing-fix
shrshi Oct 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
formatting
  • Loading branch information
shrshi committed Oct 22, 2024
commit 7c3e0f0c08027db3c8816e432d00e85d84bd18d2
47 changes: 24 additions & 23 deletions cpp/src/io/json/read_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,12 @@

#include <cub/device/device_copy.cuh>
#include <cub/device/device_histogram.cuh>
#include <cuda/std/span>
#include <thrust/distance.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/scatter.h>

#include <cuda/std/span>

#include <numeric>

namespace cudf::io::json::detail {
Expand Down Expand Up @@ -526,37 +525,39 @@ std::tuple<rmm::device_buffer, char> preprocess(cudf::strings_column_view const&

auto d_offsets_colview = input.offsets();
CUDF_EXPECTS(d_offsets_colview.null_count() == 0, "how can offsets have null count");
device_span<cudf::size_type const> d_offsets(d_offsets_colview.data<cudf::size_type>(), d_offsets_colview.size());
device_span<cudf::size_type const> d_offsets(d_offsets_colview.data<cudf::size_type>(),
d_offsets_colview.size());

rmm::device_buffer concatenated_buffer(num_chars + d_offsets.size() - 2, stream);

thrust::scatter(rmm::exec_policy_nosync(stream),
thrust::make_constant_iterator(delimiter),
thrust::make_constant_iterator(delimiter) + d_offsets.size() - 2,
thrust::make_transform_iterator(
thrust::make_counting_iterator(1),
cuda::proclaim_return_type<cudf::size_type>(
[d_offsets = d_offsets.begin()] __device__(cudf::size_type idx) -> cudf::size_type {
return d_offsets[idx] + idx - 1;
})),
reinterpret_cast<char*>(concatenated_buffer.data()));
thrust::scatter(
rmm::exec_policy_nosync(stream),
thrust::make_constant_iterator(delimiter),
thrust::make_constant_iterator(delimiter) + d_offsets.size() - 2,
thrust::make_transform_iterator(
thrust::make_counting_iterator(1),
cuda::proclaim_return_type<cudf::size_type>(
[d_offsets = d_offsets.begin()] __device__(cudf::size_type idx) -> cudf::size_type {
return d_offsets[idx] + idx - 1;
})),
reinterpret_cast<char*>(concatenated_buffer.data()));

{
// cub device batched copy
auto input_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
cuda::proclaim_return_type<char const*>(
[input = input.chars_begin(stream), d_offsets = d_offsets.begin()] __device__(cudf::size_type idx) -> char const* {
return input + d_offsets[idx];
}));
thrust::make_counting_iterator(0),
cuda::proclaim_return_type<char const*>(
[input = input.chars_begin(stream), d_offsets = d_offsets.begin()] __device__(
cudf::size_type idx) -> char const* { return input + d_offsets[idx]; }));
auto output_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
cuda::proclaim_return_type<char*>(
[output = reinterpret_cast<char*>(concatenated_buffer.data()), d_offsets = d_offsets.begin()] __device__(cudf::size_type idx) -> char* {
return output + d_offsets[idx] + idx;
thrust::make_counting_iterator(0),
cuda::proclaim_return_type<char*>(
[output = reinterpret_cast<char*>(concatenated_buffer.data()),
d_offsets = d_offsets.begin()] __device__(cudf::size_type idx) -> char* {
return output + d_offsets[idx] + idx;
}));
auto sizes_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(0),
cuda::proclaim_return_type<cudf::size_type>(
[d_offsets = d_offsets.begin()] __device__(cudf::size_type idx) -> cudf::size_type {
return d_offsets[idx + 1] - d_offsets[idx];
Expand Down
Loading