Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add seed parameter to hash_character_ngrams #17643

Open
wants to merge 4 commits into
base: branch-25.02
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 0 additions & 39 deletions cpp/include/nvtext/detail/generate_ngrams.hpp

This file was deleted.

4 changes: 3 additions & 1 deletion cpp/include/nvtext/generate_ngrams.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -117,13 +117,15 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
*
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate. Default is 5.
* @param seed The seed value to use with the hash algorithm. Default is 0.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A lists column of hash values
*/
std::unique_ptr<cudf::column> hash_character_ngrams(
cudf::strings_column_view const& input,
cudf::size_type ngrams = 5,
uint32_t seed = 0,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

Expand Down
13 changes: 8 additions & 5 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,7 +32,7 @@
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <nvtext/detail/generate_ngrams.hpp>
#include <nvtext/generate_ngrams.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
Expand Down Expand Up @@ -315,6 +315,7 @@ namespace {
*/
CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings,
cudf::size_type ngrams,
uint32_t seed,
cudf::size_type const* d_ngram_offsets,
cudf::hash_value_type* d_results)
{
Expand All @@ -332,7 +333,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st
__shared__ cudf::hash_value_type hvs[block_size]; // temp store for hash values

auto const ngram_offset = d_ngram_offsets[str_idx];
auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{seed};

auto const end = d_str.data() + d_str.size_bytes();
auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
Expand Down Expand Up @@ -368,6 +369,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st

std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& input,
cudf::size_type ngrams,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
Expand Down Expand Up @@ -400,7 +402,7 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
auto d_hashes = hashes->mutable_view().data<cudf::hash_value_type>();

character_ngram_hash_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*d_strings, ngrams, d_offsets, d_hashes);
*d_strings, ngrams, seed, d_offsets, d_hashes);

return make_lists_column(
input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr);
Expand All @@ -419,11 +421,12 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie

/**
 * Public `hash_character_ngrams` entry point (the last definition before
 * namespace `nvtext` closes).
 *
 * Opens a `CUDF_FUNC_RANGE()` scope and then forwards its arguments to
 * `detail::hash_character_ngrams`, returning that call's result unchanged.
 * No validation or other work is done at this level.
 *
 * @param strings Strings column view, forwarded as the first argument
 * @param ngrams Forwarded unchanged to the detail implementation
 * @param seed Forwarded unchanged; the kernel in this same change uses it to
 *             construct the `MurmurHash3_x86_32<cudf::string_view>` hasher
 * @param stream CUDA stream, forwarded unchanged
 * @param mr Device memory resource, forwarded unchanged
 * @return The column returned by `detail::hash_character_ngrams`
 */
std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
// NOTE(review): this listing is a rendered diff without +/- markers, so both
// sides of the changed line appear. The first call (no `seed`) looks like the
// removed line and the second (with `seed`) its replacement -- confirm against
// the actual file before relying on either.
return detail::hash_character_ngrams(strings, ngrams, stream, mr);
return detail::hash_character_ngrams(strings, ngrams, seed, stream, mr);
}

} // namespace nvtext
4 changes: 2 additions & 2 deletions cpp/tests/streams/text/ngrams_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,7 +45,7 @@ TEST_F(TextNGramsTest, HashCharacterNgrams)
auto input =
cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
nvtext::hash_character_ngrams(
cudf::strings_column_view(input), 5, cudf::test::get_default_stream());
cudf::strings_column_view(input), 5, 5, cudf::test::get_default_stream());
}

TEST_F(TextNGramsTest, NgramsTokenize)
Expand Down
13 changes: 12 additions & 1 deletion cpp/tests/text/ngrams_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -159,6 +159,17 @@ TEST_F(TextGenerateNgramsTest, NgramsHash)
2319357747u}});
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

results = nvtext::hash_character_ngrams(view, 10, 10);
// clang-format off
LCW expected2({LCW{2818025299u, 4026424618u, 578054337u, 2107870805u, 3942221995u,
2802685757u, 2686450821u, 584898501u, 2206824201u, 487979059u},
LCW{1154048732u, 3209682333u, 3246563372u, 3789750511u, 1287153502u,
3759561568u, 1092423314u, 339538635u, 4265577390u, 879551618u,
4222824617u, 1774528854u, 1028254379u, 485918316u, 879142987u, 3619248543u}
});
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
}

TEST_F(TextGenerateNgramsTest, NgramsHashErrors)
Expand Down
Loading