Skip to content

Commit

Permalink
Fix strings replace for adjacent, identical multi-byte UTF-8 characte…
Browse files Browse the repository at this point in the history
…r targets (#14235)

Fixes a bug that can occur when replacing all occurrences in a string using a multi-byte UTF-8 target: when the target matched at adjacent positions in the same string, some occurrences were missed.
A specialized gtest is also added.

Found while working on #13891

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14235
  • Loading branch information
davidwendt authored Oct 5, 2023
1 parent b120f7e commit 5d311ea
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
2 changes: 1 addition & 1 deletion cpp/src/strings/replace/replace.cu
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ struct replace_row_parallel_fn {
} else {
bytes += d_repl.size_bytes() - d_target.size_bytes();
}
position = d_str.find(d_target, position + d_target.size_bytes());
position = d_str.find(d_target, position + d_target.length());
--max_n;
}
if (out_ptr) // copy whats left (or right depending on your point of view)
Expand Down
22 changes: 22 additions & 0 deletions cpp/tests/strings/replace_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,28 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

// Checks that every occurrence of a multi-byte UTF-8 target is replaced when
// occurrences sit directly next to each other in the same row. The same
// expectation is verified through the public replace API and through both
// explicitly selected detail algorithms.
TEST_F(StringsReplaceTest, ReplaceAdjacentMultiByteTarget)
{
  auto stream = cudf::get_default_stream();
  auto mr     = rmm::mr::get_current_device_resource();

  // Rows: only the target, target alternating with 'e', and no target at all.
  cudf::test::strings_column_wrapper input({"ééééééé", "eéeéeée", "eeeeeee"});
  auto strings_view = cudf::strings_column_view(input);

  // Replacing each 'é' with 'e' must give the same all-'e' string in every row.
  auto expected = cudf::test::strings_column_wrapper({"eeeeeee", "eeeeeee", "eeeeeee"});

  cudf::string_scalar target("é", true, stream);
  cudf::string_scalar repl("e", true, stream);

  // Public entry point.
  auto output = cudf::strings::replace(strings_view, target, repl);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);

  // Detail entry point with the character-parallel algorithm selected.
  output = cudf::strings::detail::replace<algorithm::CHAR_PARALLEL>(
    strings_view, target, repl, -1, stream, mr);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);

  // Detail entry point with the row-parallel algorithm selected.
  output = cudf::strings::detail::replace<algorithm::ROW_PARALLEL>(
    strings_view, target, repl, -1, stream, mr);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);
}

TEST_F(StringsReplaceTest, ReplaceSlice)
{
std::vector<char const*> h_strings{"Héllo", "thesé", nullptr, "ARE THE", "tést strings", ""};
Expand Down

0 comments on commit 5d311ea

Please sign in to comment.