Skip to content

Commit

Permalink
Expose stream parameter in public nvtext replace APIs (#14329)
Browse files Browse the repository at this point in the history
Add stream parameter to public APIs:

- `nvtext::replace_tokens()`
- `nvtext::filter_tokens()`
- `nvtext::normalize_spaces()` 
- `nvtext::normalize_characters()`

Also cleaned up some of the doxygen comments and added stream gtests.

Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - MithunR (https://github.com/mythrocks)

URL: #14329
  • Loading branch information
davidwendt authored Nov 7, 2023
1 parent f102ba8 commit 16051a7
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 30 deletions.
18 changes: 11 additions & 7 deletions cpp/include/nvtext/normalize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,12 +44,14 @@ namespace nvtext {
* A null input element at row `i` produces a corresponding null entry
* for row `i` in the output column.
*
* @param strings Strings column to normalize.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @param input Strings column to normalize
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of normalized strings.
*/
std::unique_ptr<cudf::column> normalize_spaces(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -89,16 +91,18 @@ std::unique_ptr<cudf::column> normalize_spaces(
* This function requires about 16x the number of character bytes in the input
* strings column as working memory.
*
* @param strings The input strings to normalize.
* @param input The input strings to normalize
* @param do_lower_case If true, upper-case characters are converted to
* lower-case and accents are stripped from those characters.
* If false, accented and upper-case characters are not transformed.
* @param mr Memory resource to allocate any returned objects.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to allocate any returned objects
* @return Normalized strings column
*/
std::unique_ptr<cudf::column> normalize_characters(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
bool do_lower_case,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
28 changes: 16 additions & 12 deletions cpp/include/nvtext/replace.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -73,19 +73,21 @@ namespace nvtext {
* @throw cudf::logic_error if targets or replacements contain nulls
* @throw cudf::logic_error if delimiter is invalid
*
* @param strings Strings column to replace.
* @param targets Strings to compare against tokens found in `strings`
* @param input Strings column to replace
* @param targets Strings to compare against tokens found in `input`
* @param replacements Replacement strings for each string in `targets`
* @param delimiter Characters used to separate each string into tokens.
* The default of empty string will identify tokens using whitespace.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of with replaced strings.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column with replaced strings
*/
std::unique_ptr<cudf::column> replace_tokens(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::strings_column_view const& targets,
cudf::strings_column_view const& replacements,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -120,19 +122,21 @@ std::unique_ptr<cudf::column> replace_tokens(
*
* @throw cudf::logic_error if `delimiter` or `replacement` is invalid
*
* @param strings Strings column to replace.
* @param min_token_length The minimum number of characters to retain a token in the output string.
* @param replacement Optional replacement string to be used in place of removed tokens.
* @param input Strings column to replace
* @param min_token_length The minimum number of characters to retain a token in the output string
* @param replacement Optional replacement string to be used in place of removed tokens
* @param delimiter Characters used to separate each string into tokens.
* The default of empty string will identify tokens using whitespace.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of with replaced strings.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column with replaced strings
*/
std::unique_ptr<cudf::column> filter_tokens(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::size_type min_token_length,
cudf::string_scalar const& replacement = cudf::string_scalar{""},
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
10 changes: 6 additions & 4 deletions cpp/src/text/normalize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -242,22 +242,24 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con

// external APIs

std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::normalize_spaces(strings, cudf::get_default_stream(), mr);
return detail::normalize_spaces(input, stream, mr);
}

/**
* @copydoc nvtext::normalize_characters
*/
std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view const& input,
bool do_lower_case,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::normalize_characters(strings, do_lower_case, cudf::get_default_stream(), mr);
return detail::normalize_characters(input, do_lower_case, stream, mr);
}

} // namespace nvtext
12 changes: 6 additions & 6 deletions cpp/src/text/replace.cu
Original file line number Diff line number Diff line change
Expand Up @@ -274,26 +274,26 @@ std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& str

// external APIs

std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& input,
cudf::strings_column_view const& targets,
cudf::strings_column_view const& replacements,
cudf::string_scalar const& delimiter,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::replace_tokens(
strings, targets, replacements, delimiter, cudf::get_default_stream(), mr);
return detail::replace_tokens(input, targets, replacements, delimiter, stream, mr);
}

std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& strings,
std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& input,
cudf::size_type min_token_length,
cudf::string_scalar const& replacement,
cudf::string_scalar const& delimiter,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::filter_tokens(
strings, min_token_length, replacement, delimiter, cudf::get_default_stream(), mr);
return detail::filter_tokens(input, min_token_length, replacement, delimiter, stream, mr);
}

} // namespace nvtext
3 changes: 2 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -657,7 +657,8 @@ ConfigureTest(
testing
)
ConfigureTest(
STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing
STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/replace_test.cpp
streams/text/tokenize_test.cpp STREAM_MODE testing
)
ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing)

Expand Down
60 changes: 60 additions & 0 deletions cpp/tests/streams/text/replace_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <nvtext/normalize.hpp>
#include <nvtext/replace.hpp>

// Fixture shared by the nvtext replace/normalize stream tests below.
// It declares no members of its own; everything comes from cudf::test::BaseFixture.
struct TextReplaceTest : public cudf::test::BaseFixture {};

// Calls nvtext::replace_tokens with the test stream passed explicitly.
// The returned column is discarded; no output values are checked here.
TEST_F(TextReplaceTest, Replace)
{
  auto const sentences   = cudf::test::strings_column_wrapper({"the fox jumped over the dog"});
  auto const search_for  = cudf::test::strings_column_wrapper({"the", "dog"});
  auto const substitutes = cudf::test::strings_column_wrapper({"_", ""});
  auto const separator   = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};

  auto const sentences_view   = cudf::strings_column_view(sentences);
  auto const search_for_view  = cudf::strings_column_view(search_for);
  auto const substitutes_view = cudf::strings_column_view(substitutes);

  nvtext::replace_tokens(
    sentences_view, search_for_view, substitutes_view, separator, cudf::test::get_default_stream());
}

// Calls nvtext::filter_tokens with the test stream passed explicitly.
// The returned column is discarded; no output values are checked here.
TEST_F(TextReplaceTest, Filter)
{
  auto const input     = cudf::test::strings_column_wrapper({"one two three", "four five six"});
  auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
  auto const repl      = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
  // Argument order is (input, min_token_length, replacement, delimiter, stream):
  // the replacement scalar must come before the delimiter scalar.
  nvtext::filter_tokens(
    cudf::strings_column_view(input), 1, repl, delimiter, cudf::test::get_default_stream());
}

// Calls nvtext::normalize_spaces with the test stream passed explicitly.
// The input rows mix tabs, newlines, carriage returns and spaces; the result is discarded.
TEST_F(TextReplaceTest, NormalizeSpaces)
{
  auto messy_text =
    cudf::test::strings_column_wrapper({"the\tquick brown\nfox", "jumped\rover the lazy\r\t\n"});
  auto const messy_view = cudf::strings_column_view(messy_text);
  nvtext::normalize_spaces(messy_view, cudf::test::get_default_stream());
}

// Calls nvtext::normalize_characters with the test stream passed explicitly.
// do_lower_case is false for this call; the result is discarded.
TEST_F(TextReplaceTest, NormalizeCharacters)
{
  auto accented_text =
    cudf::test::strings_column_wrapper({"abc£def", "éè â îô\taeio", "\tĂĆĖÑ Ü"});
  auto const accented_view = cudf::strings_column_view(accented_text);
  bool const do_lower_case = false;
  nvtext::normalize_characters(accented_view, do_lower_case, cudf::test::get_default_stream());
}

0 comments on commit 16051a7

Please sign in to comment.