diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp deleted file mode 100644 index ae48fed4e79..00000000000 --- a/cpp/include/nvtext/detail/generate_ngrams.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -namespace CUDF_EXPORT nvtext { -namespace detail { - -/** - * @copydoc hash_character_ngrams(cudf::strings_column_view const&, - * cudf::size_type, rmm::device_async_resource_ref) - * - * @param stream CUDA stream used for allocating/copying device memory and launching kernels - */ -std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, - cudf::size_type ngrams, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace detail -} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 54282b8ef3c..b2ba1798a8f 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -117,6 +117,7 @@ std::unique_ptr generate_character_ngrams( * * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param seed The seed value to use with the hash algorithm. Default is 0. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return A lists column of hash values @@ -124,6 +125,7 @@ std::unique_ptr generate_character_ngrams( std::unique_ptr hash_character_ngrams( cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + uint32_t seed = 0, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 997b0278fe2..33d52ccd570 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include @@ -315,6 +315,7 @@ namespace { */ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + uint32_t seed, cudf::size_type const* d_ngram_offsets, cudf::hash_value_type* d_results) { @@ -332,7 +333,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st __shared__ cudf::hash_value_type hvs[block_size]; // temp store for hash values auto const ngram_offset = d_ngram_offsets[str_idx]; - auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{seed}; auto const end = d_str.data() + d_str.size_bytes(); auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; @@ -368,6 +369,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& input, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -400,7 +402,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto d_hashes = hashes->mutable_view().data(); character_ngram_hash_kernel<<>>( - *d_strings, ngrams, d_offsets, d_hashes); + *d_strings, ngrams, seed, d_offsets, d_hashes); return make_lists_column( input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr); @@ -419,11 +421,12 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, stream, mr); + return detail::hash_character_ngrams(strings, ngrams, seed, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp index 221c0a62f3e..47b9ac46d12 100644 --- a/cpp/tests/streams/text/ngrams_test.cpp +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ TEST_F(TextNGramsTest, HashCharacterNgrams) auto input = cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); nvtext::hash_character_ngrams( - cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); + cudf::strings_column_view(input), 5, 5, cudf::test::get_default_stream()); } TEST_F(TextNGramsTest, NgramsTokenize) diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index c72c7cfc80e..1a737231389 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -159,6 +159,17 @@ TEST_F(TextGenerateNgramsTest, NgramsHash) 2319357747u}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::hash_character_ngrams(view, 10, 10); + // clang-format off + LCW expected2({LCW{2818025299u, 4026424618u, 578054337u, 2107870805u, 3942221995u, + 2802685757u, 2686450821u, 584898501u, 2206824201u, 487979059u}, + LCW{1154048732u, 3209682333u, 3246563372u, 3789750511u, 1287153502u, + 3759561568u, 1092423314u, 339538635u, 4265577390u, 879551618u, + 4222824617u, 1774528854u, 1028254379u, 485918316u, 879142987u, 3619248543u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } TEST_F(TextGenerateNgramsTest, NgramsHashErrors)