From 5306eca611c7926fa59c581351c3cf7f0abf464d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Dec 2024 09:50:48 -0800 Subject: [PATCH 1/5] Use rapids-logger to generate the cudf logger (#17307) This PR replaces cudf's logger implementation with one generated using https://github.com/rapidsai/rapids-logger. This approach allows us to centralize the logger definition across different RAPIDS projects while allowing each project to vendor its own copy with a suitable set of macros and default logger objects. The common logger also takes care of handling the more complex packaging problems around ensuring that we fully isolate our spdlog dependency and do not leak any of its symbols, allowing our libraries to be safely installed in a much broader set of environments. Contributes to https://github.com/rapidsai/build-planning/issues/104. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - James Lamb (https://github.com/jameslamb) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17307 --- .../all_cuda-118_arch-x86_64.yaml | 1 - .../all_cuda-125_arch-x86_64.yaml | 1 - conda/recipes/libcudf/conda_build_config.yaml | 3 - conda/recipes/libcudf/meta.yaml | 1 - cpp/CMakeLists.txt | 22 ++--- cpp/benchmarks/io/cuio_common.cpp | 2 +- cpp/cmake/thirdparty/get_spdlog.cmake | 27 ------ .../developer_guide/DEVELOPER_GUIDE.md | 6 +- cpp/include/cudf/detail/utilities/logger.hpp | 27 ------ cpp/include/cudf/utilities/logger.hpp | 54 ------------ cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- cpp/src/io/csv/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- cpp/src/io/orc/stripe_enc.cu | 2 +- cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/src/io/utilities/base64_utilities.cpp | 2 +- cpp/src/io/utilities/data_sink.cpp | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- cpp/src/io/utilities/file_io_utilities.cpp | 3 +- cpp/src/io/utilities/getenv_or.hpp | 2 +- cpp/src/utilities/host_memory.cpp | 2 +- cpp/src/utilities/logger.cpp | 83 ------------------- cpp/src/utilities/stream_pool.cpp | 2 +- cpp/tests/utilities_tests/logger_tests.cpp | 48 +++++------ dependencies.yaml | 1 - 27 files changed, 53 insertions(+), 252 deletions(-) delete mode 100644 cpp/cmake/thirdparty/get_spdlog.cmake delete mode 100644 cpp/include/cudf/detail/utilities/logger.hpp delete mode 100644 cpp/include/cudf/utilities/logger.hpp delete mode 100644 cpp/src/utilities/logger.cpp diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bad508154aa..33fc2f651c6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -87,7 +87,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 969124a29ad..c290a83a37f 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -86,7 +86,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c78ca326005..00020fdf6b8 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -31,9 +31,6 @@ fmt_version: flatbuffers_version: - "=24.3.25" -spdlog_version: - - ">=1.14.1,<1.15" - nvcomp_version: - "=4.1.0.6" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 1c2e9e8dd98..b585aafc397 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -68,7 +68,6 @@ requirements: - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - flatbuffers {{ flatbuffers_version }} - - spdlog {{ spdlog_version }} - zlib {{ zlib_version }} outputs: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e54c71de4fa..3d77307ccde 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -273,6 +273,14 @@ endif() # add third party dependencies using CPM rapids_cpm_init() + +# Not using rapids-cmake since we never want to find, always download. +CPMAddPackage( + NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG + 14bb233d2420f7187a690f0bb528ec0420c70d48 +) +rapids_make_logger(cudf EXPORT_SET cudf-exports) + # find jitify include(cmake/thirdparty/get_jitify.cmake) # find NVTX @@ -299,8 +307,6 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake) include(cmake/thirdparty/get_kvikio.cmake) # find fmt include(cmake/thirdparty/get_fmt.cmake) -# find spdlog -include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) # find thread_pool @@ -772,7 +778,6 @@ add_library( src/utilities/default_stream.cpp src/utilities/host_memory.cpp src/utilities/linked_column.cpp - src/utilities/logger.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp @@ -910,11 +915,8 @@ if(CUDF_LARGE_STRINGS_DISABLED) target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED) endif() -# Define RMM logging level -target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL") - -# Define spdlog level -target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Define logging level +target_compile_definitions(cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL}") # Enable remote IO through KvikIO target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) @@ -938,8 +940,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ - spdlog::spdlog_header_only + PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ cudf_logger PRIVATE $ cuco::cuco ZLIB::ZLIB @@ -948,6 +949,7 @@ target_link_libraries( $ nanoarrow rmm::rmm_logger_impl + cudf_logger_impl ) # Add Conda library, and include paths if specified diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45b46005c47..38a21961735 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake deleted file mode 100644 index 90b0f4d8a8e..00000000000 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -# Use CPM to find or clone speedlog -function(find_and_configure_spdlog) - - include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog( - FMT_OPTION "EXTERNAL_FMT_HO" - INSTALL_EXPORT_SET cudf-exports - BUILD_EXPORT_SET cudf-exports - ) - -endfunction() - -find_and_configure_spdlog() diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 1c1052487f2..5032a073b58 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1082,15 +1082,15 @@ initialization. If this setting is higher than the compile-time CMake variable, in between the two settings will be excluded from the written log. The available levels are the same as for the CMake variable. * Global logger object exposed via `cudf::logger()` - sets the minimum logging level at runtime. -For example, calling `cudf::logger().set_level(spdlog::level::err)`, will exclude any messages that +For example, calling `cudf::default_logger().set_level(level_enum::err)`, will exclude any messages that are not errors or critical errors. This API should not be used within libcudf to manipulate logging, its purpose is to allow upstream users to configure libcudf logging to fit their application. By default, logging messages are output to stderr. Setting the environment variable `LIBCUDF_DEBUG_LOG_FILE` redirects the log to a file with the specified path (can be relative to the current directory). -Upstream users can also manipulate `cudf::logger().sinks()` to add sinks or divert the log to -standard output or even a custom spdlog sink. +Upstream users can also manipulate `cudf::default_logger().sinks()` to add sinks or divert the log to +standard output. # Data Types diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp deleted file mode 100644 index e7643eb44bd..00000000000 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp deleted file mode 100644 index 982554a23f5..00000000000 --- a/cpp/include/cudf/utilities/logger.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace CUDF_EXPORT cudf { - -namespace detail { -spdlog::logger& logger(); -} - -/** - * @brief Returns the global logger. - * - * This is a global instance of a spdlog logger. It can be used to configure logging behavior in - * libcudf. - * - * Examples: - * @code{.cpp} - * // Turn off logging at runtime - * cudf::logger().set_level(spdlog::level::off); - * // Add a stdout sink to the logger - * cudf::logger().sinks().push_back(std::make_shared()); - * // Replace the default sink - * cudf::logger().sinks() ={std::make_shared()}; - * @endcode - * - * Note: Changes to the sinks are not thread safe and should only be done during global - * initialization. - * - * @return spdlog::logger& The logger. - */ -[[deprecated( - "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& -logger(); - -} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 9d3cf75a13f..d45c02f374f 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -18,8 +18,8 @@ #include "nvcomp_adapter.cuh" -#include #include +#include #include #include diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 6c84b53db46..7f0b5e07b09 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -28,13 +28,13 @@ #include "io/utilities/parsing_utils.cuh" #include -#include #include #include #include #include #include #include +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index fcaee9c548e..726c79bd004 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index ed0b6969154..07172b6b7f7 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -23,10 +23,10 @@ #include #include #include -#include #include #include #include +#include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 76e5369ffd0..0906017ee61 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -29,9 +29,9 @@ #include #include #include -#include #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index bfd0cc992cf..0dd1aff41e9 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -23,7 +23,7 @@ #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" -#include +#include #include #include diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f865c9a7643..188e6a8c0d8 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -38,10 +38,10 @@ #include #include #include -#include #include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/base64_utilities.cpp b/cpp/src/io/utilities/base64_utilities.cpp index 2a2a07afc8d..00fc54f9883 100644 --- a/cpp/src/io/utilities/base64_utilities.cpp +++ b/cpp/src/io/utilities/base64_utilities.cpp @@ -60,7 +60,7 @@ #include "base64_utilities.hpp" -#include +#include #include diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index bed03869b34..dfa5d46cf48 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -16,9 +16,9 @@ #include "file_io_utilities.hpp" -#include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 62ef7c7a794..38dedcc2627 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -17,11 +17,11 @@ #include "file_io_utilities.hpp" #include "getenv_or.hpp" -#include #include #include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 9b17e7f6d55..28367c95430 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -19,10 +19,11 @@ #include "getenv_or.hpp" #include -#include #include +#include #include +#include #include #include diff --git a/cpp/src/io/utilities/getenv_or.hpp b/cpp/src/io/utilities/getenv_or.hpp index 3fd97a00b61..b9613428418 100644 --- a/cpp/src/io/utilities/getenv_or.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index e30806a5011..4196523d211 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include +#include #include #include #include diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp deleted file mode 100644 index e52fffbd8c6..00000000000 --- a/cpp/src/utilities/logger.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include - -#include - -namespace { - -/** - * @brief Creates a sink for libcudf logging. - * - * Returns a file sink if the file name has been specified, otherwise returns a stderr sink. - */ -[[nodiscard]] spdlog::sink_ptr make_libcudf_sink() -{ - if (auto filename = std::getenv("LIBCUDF_DEBUG_LOG_FILE"); filename != nullptr) { - return std::make_shared(filename, true); - } else { - return std::make_shared(); - } -} - -/** - * @brief Converts the level name into the `spdlog` level enum. - */ -[[nodiscard]] spdlog::level::level_enum libcudf_log_level() -{ - auto const env_level = std::getenv("LIBCUDF_LOGGING_LEVEL"); - if (env_level == nullptr) { return spdlog::level::warn; } - - auto const env_lvl_str = std::string(env_level); - if (env_lvl_str == "TRACE") return spdlog::level::trace; - if (env_lvl_str == "DEBUG") return spdlog::level::debug; - if (env_lvl_str == "INFO") return spdlog::level::info; - if (env_lvl_str == "WARN") return spdlog::level::warn; - if (env_lvl_str == "ERROR") return spdlog::level::err; - if (env_lvl_str == "CRITICAL") return spdlog::level::critical; - if (env_lvl_str == "OFF") return spdlog::level::off; - - CUDF_FAIL("Invalid value for LIBCUDF_LOGGING_LEVEL environment variable"); -} - -/** - * @brief Simple wrapper around a spdlog::logger that performs cuDF-specific initialization. - */ -struct logger_wrapper { - spdlog::logger logger_; - - logger_wrapper() : logger_{"CUDF", make_libcudf_sink()} - { - logger_.set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v"); - logger_.set_level(libcudf_log_level()); - logger_.flush_on(spdlog::level::warn); - } -}; - -} // namespace - -spdlog::logger& cudf::detail::logger() -{ - static logger_wrapper wrapped{}; - return wrapped.logger_; -} - -spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d1bebd1937..b0f2d8c0637 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include +#include #include #include diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index cfab570833b..58396115a54 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -16,29 +16,25 @@ #include -#include - -#include +#include #include class LoggerTest : public cudf::test::BaseFixture { std::ostringstream oss; - spdlog::level::level_enum prev_level; - std::vector prev_sinks; + cudf::level_enum prev_level; public: - LoggerTest() - : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} + LoggerTest() : prev_level{cudf::default_logger().level()} { - cudf::detail::logger().sinks() = {std::make_shared(oss)}; - cudf::detail::logger().set_formatter( - std::unique_ptr(new spdlog::pattern_formatter("%v"))); + cudf::default_logger().sinks().push_back(std::make_shared(oss)); + cudf::default_logger().set_pattern("%v"); } ~LoggerTest() override { - cudf::detail::logger().set_level(prev_level); - cudf::detail::logger().sinks() = prev_sinks; + cudf::default_logger().set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v"); + cudf::default_logger().set_level(prev_level); + cudf::default_logger().sinks().pop_back(); } void clear_sink() { oss.str(""); } @@ -47,32 +43,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::detail::logger().critical("crit msg"); + cudf::default_logger().critical("crit msg"); ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::detail::logger().trace("trace"); - cudf::detail::logger().debug("debug"); - cudf::detail::logger().info("info"); - cudf::detail::logger().warn("warn"); - cudf::detail::logger().error("error"); - cudf::detail::logger().critical("critical"); - ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); + cudf::default_logger().trace("trace"); + cudf::default_logger().debug("debug"); + cudf::default_logger().info("info"); + cudf::default_logger().warn("warn"); + cudf::default_logger().error("error"); + cudf::default_logger().critical("critical"); + ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::detail::logger().set_level(spdlog::level::warn); - cudf::detail::logger().info("info"); - cudf::detail::logger().warn("warn"); + cudf::default_logger().set_level(cudf::level_enum::warn); + cudf::default_logger().info("info"); + cudf::default_logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::detail::logger().set_level(spdlog::level::debug); - cudf::detail::logger().trace("trace"); - cudf::detail::logger().debug("debug"); + cudf::default_logger().set_level(cudf::level_enum::debug); + cudf::default_logger().trace("trace"); + cudf::default_logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); } diff --git a/dependencies.yaml b/dependencies.yaml index 3c55ce2c614..44767f1e9d3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -413,7 +413,6 @@ dependencies: - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 - - spdlog>=1.14.1,<1.15 depends_on_nvcomp: common: - output_types: conda From 657f50bae866d97a231d565f34a1941efd49c721 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 10 Dec 2024 10:16:11 -0800 Subject: [PATCH 2/5] Fix typos, rename types, and add null_probability benchmark axis for distinct (#17546) This PR addresses several minor issues discovered while working on #17467: - Corrected a typo where `RowHasher` should have been `RowEqual` - Renamed `hash_set_type` to `distinct_set_t` - Added a `null_probability` benchmark axis for the distinct benchmark, similar to other stream compaction benchmarks Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17546 --- cpp/benchmarks/stream_compaction/distinct.cpp | 4 +++- cpp/src/stream_compaction/distinct.cu | 4 ++-- cpp/src/stream_compaction/distinct_helpers.cu | 12 ++++++------ cpp/src/stream_compaction/distinct_helpers.hpp | 12 +++++++----- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index d7deebca89a..75d04bb4e8e 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -34,6 +34,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) cudf::size_type const num_rows = state.get_int64("NumRows"); auto const keep = get_keep(state.get_string("keep")); cudf::size_type const cardinality = state.get_int64("cardinality"); + auto const null_probability = state.get_float64("null_probability"); if (cardinality > num_rows) { state.skip("cardinality > num_rows"); @@ -42,7 +43,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) data_profile profile = data_profile_builder() .cardinality(cardinality) - .null_probability(0.01) + .null_probability(null_probability) .distribution(cudf::type_to_id(), distribution_id::UNIFORM, static_cast(0), @@ -65,6 +66,7 @@ using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.01}) .add_string_axis("keep", {"any", "first", "last", "none"}) .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 7d11b02d3e1..9ab8ed5938a 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -95,8 +95,8 @@ rmm::device_uvector distinct_indices(table_view const& input, auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input); auto const helper_func = [&](auto const& d_equal) { - using RowHasher = std::decay_t; - auto set = hash_set_type{ + using RowEqual = std::decay_t; + auto set = distinct_set_t{ num_rows, 0.5, // desired load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index c3a004b7f28..aadb438b019 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -21,8 +21,8 @@ namespace cudf::detail { -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, @@ -100,7 +100,7 @@ rmm::device_uvector reduce_by_row(hash_set_type& set, } template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -110,7 +110,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -120,7 +120,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -130,7 +130,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp index f15807c2434..4ca1cab937a 100644 --- a/cpp/src/stream_compaction/distinct_helpers.hpp +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -47,12 +47,12 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) } } -template -using hash_set_type = +template +using distinct_set_t = cuco::static_set, cuda::thread_scope_device, - RowHasher, + RowEqual, cuco::linear_probing<1, cudf::experimental::row::hash::device_row_hasher< cudf::hashing::detail::default_hash, @@ -79,6 +79,8 @@ using hash_set_type = * the `reduction_init_value()` function. Then, the reduction result for each row group is written * into the output array at the index of an unspecified row in the group. * + * @tparam RowEqual The type of row equality comparator + * * @param set The auxiliary set to perform reduction * @param set_size The number of elements in set * @param num_rows The number of all input rows @@ -87,8 +89,8 @@ using hash_set_type = * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the output indices */ -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, From be62ea60440a8357702eb292e19e69dd6be001e0 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:21:39 -0600 Subject: [PATCH 3/5] Update version references in workflow (#17568) Update version references in breaking-change trigger workflow --- .github/workflows/trigger-breaking-change-alert.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 3b972f31ca4..01dd2436beb 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} From 1e95864f6631a1dc90d78fc9418281c256fa9f59 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 10 Dec 2024 13:47:42 -0600 Subject: [PATCH 4/5] Fix Dask-cuDF `clip` APIs (#17509) Closes https://github.com/rapidsai/cudf/issues/17502 **Background Info**: The cudf and pandas `axis` defaults are different, and the upstream dask-expr `clip` APIs are consistent with the behavior of Pandas (not cudf). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17509 --- .../dask_cudf/dask_cudf/_expr/collection.py | 10 +++++++ python/dask_cudf/dask_cudf/tests/test_core.py | 26 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 2dc4031b876..5192e6b8171 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -163,6 +163,11 @@ def read_text(*args, **kwargs): return legacy_read_text(*args, **kwargs) + def clip(self, lower=None, upper=None, axis=1): + if axis not in (None, 1): + raise NotImplementedError("axis not yet supported in clip.") + return new_collection(self.expr.clip(lower, upper, 1)) + class Series(DXSeries, CudfFrameBase): def groupby(self, by, **kwargs): @@ -182,6 +187,11 @@ def struct(self): return StructMethods(self) + def clip(self, lower=None, upper=None, axis=1): + if axis not in (None, 1): + raise NotImplementedError("axis not yet supported in clip.") + return new_collection(self.expr.clip(lower, upper, 1)) + class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cda7e2d134d..7101fb7e00a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1019,3 +1019,29 @@ def test_rename_axis_after_join(): result = ddf1.join(ddf2, how="outer") expected = df1.join(df2, how="outer") dd.assert_eq(result, expected, check_index=False) + + +def test_clip_dataframe(): + df = cudf.DataFrame( + { + "id": ["a", "b", "c", "d"], + "score": [-1, 1, 4, 6], + } + ) + expect = df.clip(lower=["b", 1], upper=["d", 5], axis=1) + got = dd.from_pandas(df, npartitions=2).clip( + lower=["b", 1], upper=["d", 5], axis=1 + ) + dd.assert_eq(expect, got) + + +def test_clip_series(): + ser = cudf.Series([-0.5, 0.5, 4.5, 5.5]) + expect = ser.clip(lower=0, upper=5).round().astype(int) + got = ( + dd.from_pandas(ser, npartitions=2) + .clip(lower=0, upper=5) + .round() + .astype(int) + ) + dd.assert_eq(expect, got) From 0c5bd6627159fe44a49e56020f0c0842696bc397 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:10:45 -0500 Subject: [PATCH 5/5] Rework minhash APIs for deprecation cycle (#17421) Renames `minhash_permuted()` to `minhash()` and deprecates `minhash_permuted` Also removes the `word_minhash` APIs deprecated in 24.12. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17421 --- cpp/benchmarks/text/minhash.cpp | 5 +- cpp/include/nvtext/minhash.hpp | 194 +--------- cpp/src/text/minhash.cu | 341 +----------------- cpp/tests/text/minhash_tests.cpp | 79 ++-- docs/cudf/source/conf.py | 2 + python/cudf/cudf/_lib/nvtext/minhash.pyx | 46 +-- python/cudf/cudf/_lib/strings/__init__.py | 4 - python/cudf/cudf/core/column/string.py | 166 +-------- .../cudf/cudf/tests/text/test_text_methods.py | 72 +--- .../pylibcudf/libcudf/nvtext/minhash.pxd | 34 -- python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 12 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyi | 7 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 168 +-------- .../pylibcudf/tests/test_nvtext_minhash.py | 30 +- 14 files changed, 100 insertions(+), 1060 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index a80d0dcbdb8..8c86e8d4366 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -54,9 +54,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 - ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) - : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); + auto result = base64 ? nvtext::minhash64(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash(input, 0, parameters_a, parameters_b, hash_width); }); } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index b2c1a23f57e..f0d5d9ecb5d 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -31,69 +31,6 @@ namespace CUDF_EXPORT nvtext { * @file */ -/** - * @brief Returns the minhash value for each string - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string. - * - * Any null row entries result in corresponding null output rows. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if the width < 2 - * - * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Minhash values for each string in input - */ -[[deprecated]] std::unique_ptr minhash( - cudf::strings_column_view const& input, - cudf::numeric_scalar seed = 0, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Returns the minhash values for each string per seed - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string for each seed. - * Each row of the list column are seed results for the corresponding - * string. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - to be replaced in a future release - * - * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Strings column to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed - */ -[[deprecated]] std::unique_ptr minhash( - cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - /** * @brief Returns the minhash values for each string * @@ -132,7 +69,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash_permuted( +std::unique_ptr minhash( cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, @@ -142,67 +79,16 @@ std::unique_ptr minhash_permuted( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Returns the minhash value for each string - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string. - * - * Any null row entries result in corresponding null output rows. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm. - * The hash function returns 2 uint64 values but only the first value - * is used with the minhash calculation. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if the width < 2 - * - * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Minhash values as UINT64 for each string in input - */ -[[deprecated]] std::unique_ptr minhash64( - cudf::strings_column_view const& input, - cudf::numeric_scalar seed = 0, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Returns the minhash values for each string per seed - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string for each seed. - * Each row of the list column are seed results for the corresponding - * string. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm. + * @copydoc nvtext::minhash * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - to be replaced in a future release - * - * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Strings column to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed + * @deprecated Use nvtext::minhash() */ -[[deprecated]] std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash_permuted( cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width = 4, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -244,7 +130,7 @@ std::unique_ptr minhash_permuted( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64_permuted( +std::unique_ptr minhash64( cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -254,64 +140,18 @@ std::unique_ptr minhash64_permuted( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Returns the minhash values for each row of strings per seed - * - * Hash values are computed from each string in each row and the - * minimum hash value is returned for each row for each seed. - * Each row of the output list column are seed results for the corresponding - * input row. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * Any null row entries result in corresponding null output rows. + * @copydoc nvtext::minhash64 * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Lists column of strings to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed + * @deprecated Use nvtext::minhash64() */ -[[deprecated]] std::unique_ptr word_minhash( - cudf::lists_column_view const& input, - cudf::device_span seeds, +[[deprecated]] std::unique_ptr minhash64_permuted( + cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); -/** - * @brief Returns the minhash values for each row of strings per seed - * - * Hash values are computed from each string in each row and the - * minimum hash value is returned for each row for each seed. - * Each row of the output list column are seed results for the corresponding - * input row. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm though - * only the first 64-bits of the hash are used in computing the output. - * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Lists column of strings to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed - */ -[[deprecated]] std::unique_ptr word_minhash64( - cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index b7a719a2041..9a44d9477ab 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -52,118 +52,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Compute the minhash of each string for each seed - * - * This is a warp-per-string algorithm where parallel threads within a warp - * work on substrings of a single string row. - * - * @tparam HashFunction hash function to use on each substring - * - * @param d_strings Strings column to process - * @param seeds Seeds for hashing each string - * @param width Substring window size in characters - * @param d_hashes Minhash output values for each string - */ -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, - cudf::device_span seeds, - cudf::size_type width, - hash_value_type* d_hashes) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - - auto const str_idx = static_cast(idx / cudf::detail::warp_size); - if (str_idx >= d_strings.size()) { return; } - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - - if (d_strings.is_null(str_idx)) { return; } - - auto const d_str = d_strings.element(str_idx); - auto const d_output = d_hashes + (str_idx * seeds.size()); - - // initialize hashes output for this string - if (lane_idx == 0) { - auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - auto const begin = d_str.data() + lane_idx; - auto const end = d_str.data() + d_str.size_bytes(); - - // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { - if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } - auto const check_str = // used for counting 'width' characters - cudf::string_view(itr, static_cast(thrust::distance(itr, end))); - auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); - if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string - - auto const hash_str = cudf::string_view(itr, bytes); - // hashing with each seed on the same section of the string is 10x faster than - // computing the substrings for each seed - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = HashFunction(seeds[seed_idx]); - // hash substring and store the min value - if constexpr (std::is_same_v) { - auto const hvalue = hasher(hash_str); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); - } else { - // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as requested by the LLM team. - auto const hvalue = thrust::get<0>(hasher(hash_str)); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); - } - } - } -} - -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -std::unique_ptr minhash_fn(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS(width >= 2, - "Parameter width should be an integer value of 2 or greater", - std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); - - auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(seeds.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_hashes = hashes->mutable_view().data(); - - constexpr cudf::thread_index_type block_size = 256; - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; - minhash_kernel<<>>( - *d_strings, seeds, width, d_hashes); - - return hashes; -} - constexpr cudf::thread_index_type block_size = 256; // for potentially tuning minhash_seed_kernel independently from block_size constexpr cudf::thread_index_type tile_size = block_size; @@ -297,13 +185,13 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, * @param d_results Final results vector of calculate values */ template -CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, - cudf::device_span indices, - cudf::device_span parameter_a, - cudf::device_span parameter_b, - cudf::size_type width, - hash_value_type const* d_hashes, - hash_value_type* d_results) +CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, + cudf::device_span indices, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + hash_value_type const* d_hashes, + hash_value_type* d_results) { auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const idx = (tid / blocks_per_string) / block_size; @@ -478,7 +366,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto d_indices = cudf::device_span(indices.data(), threshold_index); cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, block_size}; - minhash_permuted_kernel + minhash_kernel <<>>( *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } @@ -489,7 +377,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto d_indices = cudf::device_span(indices.data() + threshold_index, count); cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; - minhash_permuted_kernel + minhash_kernel <<>>( *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } @@ -497,101 +385,6 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return results; } -/** - * @brief Compute the minhash of each list row of strings for each seed - * - * This is a warp-per-row algorithm where parallel threads within a warp - * work on strings in a single list row. - * - * @tparam HashFunction hash function to use on each string - * - * @param d_input List of strings to process - * @param seeds Seeds for hashing each string - * @param d_hashes Minhash output values (one per row) - */ -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, - cudf::device_span seeds, - hash_value_type* d_hashes) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const row_idx = idx / cudf::detail::warp_size; - - if (row_idx >= d_input.size()) { return; } - if (d_input.is_null(row_idx)) { return; } - - auto const d_row = cudf::list_device_view(d_input, row_idx); - auto const d_output = d_hashes + (row_idx * seeds.size()); - - // initialize hashes output for this row - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - if (lane_idx == 0) { - auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - // each lane hashes a string from the input row - for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { - auto const hash_str = - d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element(str_idx); - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = HashFunction(seeds[seed_idx]); - // hash string and store the min value - hash_value_type hv; - if constexpr (std::is_same_v) { - hv = hasher(hash_str); - } else { - // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as requested by the LLM team. - hv = thrust::get<0>(hasher(hash_str)); - } - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hv, cuda::std::memory_order_relaxed); - } - } -} - -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); - - auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } - - auto const d_input = cudf::column_device_view::create(input.parent(), stream); - - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(seeds.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_hashes = hashes->mutable_view().data(); - auto lcdv = cudf::detail::lists_column_device_view(*d_input); - - constexpr cudf::thread_index_type block_size = 256; - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; - minhash_word_kernel - <<>>(lcdv, seeds, d_hashes); - - return hashes; -} - std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, @@ -620,30 +413,6 @@ std::unique_ptr build_list_result(cudf::column_view const& input, } } // namespace -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar const& seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto const seeds = cudf::device_span{seed.data(), 1}; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); - return hashes; -} - -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - std::unique_ptr minhash(cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, @@ -658,30 +427,6 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::numeric_scalar const& seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto const seeds = cudf::device_span{seed.data(), 1}; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); - return hashes; -} - -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -696,45 +441,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } -std::unique_ptr word_minhash(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - -std::unique_ptr word_minhash64(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::minhash(input, seed, width, stream, mr); -} - -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seeds, width, stream, mr); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, @@ -750,23 +468,15 @@ std::unique_ptr minhash_permuted(cudf::strings_column_view const& } std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::minhash64(input, seed, width, stream, mr); -} - -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::device_span seeds, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash64(input, seeds, width, stream, mr); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, @@ -781,21 +491,4 @@ std::unique_ptr minhash64_permuted(cudf::strings_column_view const return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } -std::unique_ptr word_minhash(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::word_minhash(input, seeds, stream, mr); -} - -std::unique_ptr word_minhash64(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::word_minhash64(input, seeds, stream, mr); -} } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 042ac44621e..8bfb17e0efd 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -44,10 +44,9 @@ TEST_F(MinHashTest, Permuted) auto view = cudf::strings_column_view(input); - auto first = thrust::counting_iterator(10); - auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); using LCW32 = cudf::test::lists_column_wrapper; // clang-format off @@ -66,9 +65,9 @@ TEST_F(MinHashTest, Permuted) // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results64 = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -95,10 +94,9 @@ TEST_F(MinHashTest, PermutedWide) auto input = cudf::test::strings_column_wrapper({small, wide}); auto view = cudf::strings_column_view(input); - auto first = thrust::counting_iterator(20); - auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto first = thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); using LCW32 = cudf::test::lists_column_wrapper; // clang-format off @@ -109,9 +107,9 @@ TEST_F(MinHashTest, PermutedWide) // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results64 = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -132,9 +130,8 @@ TEST_F(MinHashTest, PermutedManyParameters) auto first = thrust::counting_iterator(20); // more than params_per_thread - auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); using LCW32 = cudf::test::lists_column_wrapper; // clang-format off @@ -152,9 +149,9 @@ TEST_F(MinHashTest, PermutedManyParameters) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); // more than params_per_thread - auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); - auto results64 = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -182,15 +179,13 @@ TEST_F(MinHashTest, PermutedManyParameters) TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); - results = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + results = nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -199,18 +194,16 @@ TEST_F(MinHashTest, ErrorsTest) auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); auto empty = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), - std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); auto empty64 = cudf::test::fixed_width_column_wrapper(); EXPECT_THROW( - nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), + nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), std::invalid_argument); std::vector h_input(50000, ""); @@ -219,18 +212,16 @@ TEST_F(MinHashTest, ErrorsTest) auto const zeroes = thrust::constant_iterator(0); auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), std::overflow_error); - auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), - std::overflow_error); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), std::invalid_argument); } diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index fbb9ca4b128..7aa8f9f4a1c 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -594,6 +594,8 @@ def on_missing_reference(app, env, node, contnode): # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), + ("py:class", "np.uint32"), + ("py:class", "np.uint64"), ] diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 25cfcf99ca6..9f2b3f92502 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,19 +10,9 @@ from pylibcudf import nvtext @acquire_spill_lock() -def minhash(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): +def minhash(Column input, uint32_t seed, Column a, Column b, int width): return Column.from_pylibcudf( - nvtext.minhash.minhash_permuted( + nvtext.minhash.minhash( input.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"), @@ -33,19 +23,9 @@ def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width) @acquire_spill_lock() -def minhash64(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): +def minhash64(Column input, uint64_t seed, Column a, Column b, int width): return Column.from_pylibcudf( - nvtext.minhash.minhash64_permuted( + nvtext.minhash.minhash64( input.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"), @@ -53,21 +33,3 @@ def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int widt width, ) ) - - -@acquire_spill_lock() -def word_minhash(Column input, Column seeds): - result = nvtext.minhash.word_minhash( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def word_minhash64(Column input, Column seeds): - result = nvtext.minhash.word_minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 341ba6d11c3..b9095a22a42 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,10 +9,6 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, - minhash64_permuted, - minhash_permuted, - word_minhash, - word_minhash64, ) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4a2483a80e3..06196717ce3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5461,49 +5461,6 @@ def edit_distance_matrix(self) -> SeriesOrIndex: ) def minhash( - self, seeds: ColumnLike | None = None, width: int = 4 - ) -> SeriesOrIndex: - """ - Compute the minhash of a strings column. - This uses the MurmurHash3_x86_32 algorithm for the hash function. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint32. - width : int - The width of the substring to hash. - Default is 4 characters. - - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> seeds = cudf.Series([0], dtype=np.uint32) - >>> str_series.str.minhash(seeds) - 0 [21141582] - 1 [962346254] - dtype: list - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - >>> str_series.str.minhash(seeds) - 0 [21141582, 403093213, 1258052021] - 1 [962346254, 677440381, 122618762] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint32: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.minhash(self._column, seeds_column, width) - ) - - def minhash_permuted( self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int ) -> SeriesOrIndex: """ @@ -5535,7 +5492,7 @@ def minhash_permuted( >>> s = cudf.Series(['this is my', 'favorite book']) >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) - >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + >>> s.str.minhash(0, a=a, b=b, width=5) 0 [1305480171, 462824409, 74608232] 1 [32665388, 65330773, 97996158] dtype: list @@ -5551,53 +5508,10 @@ def minhash_permuted( f"Expecting a Series with dtype uint32, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash_permuted( - self._column, seed, a_column, b_column, width - ) + libstrings.minhash(self._column, seed, a_column, b_column, width) ) def minhash64( - self, seeds: ColumnLike | None = None, width: int = 4 - ) -> SeriesOrIndex: - """ - Compute the minhash of a strings column. - - This uses the MurmurHash3_x64_128 algorithm for the hash function. - This function generates 2 uint64 values but only the first - uint64 value is used. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint64. - width : int - The width of the substring to hash. - Default is 4 characters. - - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - >>> str_series.str.minhash64(seeds) - 0 [3232308021562742685, 4445611509348165860, 586435843695903598] - 1 [23008204270530356, 1281229757012344693, 153762819128779913] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint64, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint64: - raise ValueError( - f"Expecting a Series with dtype uint64, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.minhash64(self._column, seeds_column, width) - ) - - def minhash64_permuted( self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int ) -> SeriesOrIndex: """ @@ -5628,7 +5542,7 @@ def minhash64_permuted( >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) >>> a = cudf.Series([2, 3], dtype=np.uint64) >>> b = cudf.Series([5, 6], dtype=np.uint64) - >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + >>> s.str.minhash64(0, a=a, b=b, width=5) 0 [172452388517576012, 316595762085180527] 1 [71427536958126239, 58787297728258215] 2 [423885828176437114, 1140588505926961370] @@ -5645,79 +5559,7 @@ def minhash64_permuted( f"Expecting a Series with dtype uint64, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash64_permuted( - self._column, seed, a_column, b_column, width - ) - ) - - def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: - """ - Compute the minhash of a list column of strings. - This uses the MurmurHash3_x86_32 algorithm for the hash function. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint32. - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - >>> ls.str.word_minhash(seeds=seeds) - 0 [21141582, 1232889953, 1268336794] - 1 [962346254, 2321233602, 1354839212] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint32: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.word_minhash(self._column, seeds_column) - ) - - def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: - """ - Compute the minhash of a list column of strings. - This uses the MurmurHash3_x64_128 algorithm for the hash function. - This function generates 2 uint64 values but only the first - uint64 value is used. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint64. - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - >>> ls.str.word_minhash64(seeds) - 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] - 1 [5240044617220523711, 5847101123925041457, 153762819128779913] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint64, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint64: - raise ValueError( - f"Expecting a Series with dtype uint64, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.word_minhash64(self._column, seeds_column) + libstrings.minhash64(self._column, seed, a_column, b_column, width) ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 3637ef075f2..9a62285403f 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,7 +882,7 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash_permuted(): +def test_minhash(): strings = cudf.Series(["this is my", "favorite book", None, ""]) params = cudf.Series([1, 2, 3], dtype=np.uint32) @@ -894,7 +894,7 @@ def test_minhash_permuted(): cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) + actual = strings.str.minhash(0, a=params, b=params, width=5) assert_eq(expected, actual) params = cudf.Series([1, 2, 3], dtype=np.uint64) @@ -912,78 +912,18 @@ def test_minhash_permuted(): cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) + actual = strings.str.minhash64(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash_permuted(1, a="a", b="b", width=7) + strings.str.minhash(1, a="a", b="b", width=7) with pytest.raises(ValueError): params = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash_permuted(1, a=params, b=params, width=6) + strings.str.minhash(1, a=params, b=params, width=6) with pytest.raises(ValueError): params = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64_permuted(1, a=params, b=params, width=8) - - -def test_word_minhash(): - ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - - expected = cudf.Series( - [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), - cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash(seeds=seeds) - assert_eq(expected, actual) - - expected = cudf.Series( - [ - cudf.Series([2603139454418834912], dtype=np.uint64), - cudf.Series([5240044617220523711], dtype=np.uint64), - ] - ) - actual = ls.str.word_minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - expected = cudf.Series( - [ - cudf.Series( - [ - 2603139454418834912, - 8644371945174847701, - 5541030711534384340, - ], - dtype=np.uint64, - ), - cudf.Series( - [5240044617220523711, 5847101123925041457, 153762819128779913], - dtype=np.uint64, - ), - ] - ) - actual = ls.str.word_minhash64(seeds=seeds) - assert_eq(expected, actual) - - # test wrong seed types - with pytest.raises(ValueError): - ls.str.word_minhash(seeds="a") - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - ls.str.word_minhash(seeds=seeds) - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - ls.str.word_minhash64(seeds=seeds) + strings.str.minhash64(1, a=params, b=params, width=8) def test_jaccard_index(): diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 8570531dfde..9d1e8cba425 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -11,18 +11,6 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash( - const column_view &strings, - const numeric_scalar[uint32_t] seed, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash( - const column_view &strings, - const column_view &seeds, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash_permuted( const column_view &strings, const uint32_t seed, const column_view &a, @@ -31,31 +19,9 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: ) except + cdef unique_ptr[column] minhash64( - const column_view &strings, - const column_view &seeds, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash64( - const column_view &strings, - const numeric_scalar[uint64_t] seed, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash64_permuted( const column_view &strings, const uint64_t seed, const column_view &a, const column_view &b, const size_type width, ) except + - - cdef unique_ptr[column] word_minhash( - const column_view &input, - const column_view &seeds - ) except +libcudf_exception_handler - - cdef unique_ptr[column] word_minhash64( - const column_view &input, - const column_view &seeds - ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 6b544282f44..0af53748cdc 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -9,9 +9,7 @@ ctypedef fused ColumnOrScalar: Column Scalar -cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) - -cpdef Column minhash_permuted( +cpdef Column minhash( Column input, uint32_t seed, Column a, @@ -19,16 +17,10 @@ cpdef Column minhash_permuted( size_type width ) -cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) - -cpdef Column minhash64_permuted( +cpdef Column minhash64( Column input, uint64_t seed, Column a, Column b, size_type width ) - -cpdef Column word_minhash(Column input, Column seeds) - -cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi index a2d9b6364f7..5d88cfbbea0 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -1,13 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.column import Column -from pylibcudf.scalar import Scalar def minhash( - input: Column, seeds: Column | Scalar, width: int = 4 + input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... def minhash64( - input: Column, seeds: Column | Scalar, width: int = 4 + input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... -def word_minhash(input: Column, seeds: Column) -> Column: ... -def word_minhash64(input: Column, seeds: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 5448cc6de9b..84811cda867 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -8,69 +8,15 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, - minhash64_permuted as cpp_minhash64_permuted, - minhash_permuted as cpp_minhash_permuted, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, ) -from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type -from pylibcudf.scalar cimport Scalar - -from cython.operator import dereference -import warnings __all__ = [ "minhash", "minhash64", - "word_minhash", - "word_minhash64", ] -cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): - """ - Returns the minhash values for each string per seed. - This function uses MurmurHash3_x86_32 for the hash algorithm. - - For details, see :cpp:func:`minhash`. - - Parameters - ---------- - input : Column - Strings column to compute minhash - seeds : Column or Scalar - Seed value(s) used for the hash algorithm. - width : size_type - Character width used for apply substrings; - Default is 4 characters. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - warnings.warn( - "Starting in version 25.02, the signature of this function will " - "be changed to match pylibcudf.nvtext.minhash_permuted.", - FutureWarning - ) - - cdef unique_ptr[column] c_result - - if not isinstance(seeds, (Column, Scalar)): - raise TypeError("Must pass a Column or Scalar") - - with nogil: - c_result = cpp_minhash( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column minhash_permuted( +cpdef Column minhash( Column input, uint32_t seed, Column a, @@ -81,7 +27,7 @@ cpdef Column minhash_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x86_32 for the hash algorithm. - For details, see :cpp:func:`minhash_permuted`. + For details, see :cpp:func:`minhash`. Parameters ---------- @@ -104,7 +50,7 @@ cpdef Column minhash_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = cpp_minhash_permuted( + c_result = cpp_minhash( input.view(), seed, a.view(), @@ -114,50 +60,7 @@ cpdef Column minhash_permuted( return Column.from_libcudf(move(c_result)) -cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): - """ - Returns the minhash values for each string per seed. - This function uses MurmurHash3_x64_128 for the hash algorithm. - - For details, see :cpp:func:`minhash64`. - - Parameters - ---------- - input : Column - Strings column to compute minhash - seeds : Column or Scalar - Seed value(s) used for the hash algorithm. - width : size_type - Character width used for apply substrings; - Default is 4 characters. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - warnings.warn( - "Starting in version 25.02, the signature of this function will " - "be changed to match pylibcudf.nvtext.minhash64_permuted.", - FutureWarning - ) - - cdef unique_ptr[column] c_result - - if not isinstance(seeds, (Column, Scalar)): - raise TypeError("Must pass a Column or Scalar") - - with nogil: - c_result = cpp_minhash64( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column minhash64_permuted( +cpdef Column minhash64( Column input, uint64_t seed, Column a, @@ -168,7 +71,7 @@ cpdef Column minhash64_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x64_128 for the hash algorithm. - For details, see :cpp:func:`minhash64_permuted`. + For details, see :cpp:func:`minhash64`. Parameters ---------- @@ -191,7 +94,7 @@ cpdef Column minhash64_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = cpp_minhash64_permuted( + c_result = cpp_minhash64( input.view(), seed, a.view(), @@ -200,62 +103,3 @@ cpdef Column minhash64_permuted( ) return Column.from_libcudf(move(c_result)) - -cpdef Column word_minhash(Column input, Column seeds): - """ - Returns the minhash values for each row of strings per seed. - This function uses MurmurHash3_x86_32 for the hash algorithm. - - For details, see :cpp:func:`word_minhash`. - - Parameters - ---------- - input : Column - Lists column of strings to compute minhash - seeds : Column or Scalar - Seed values used for the hash algorithm. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - cdef unique_ptr[column] c_result - - with nogil: - c_result = cpp_word_minhash( - input.view(), - seeds.view() - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column word_minhash64(Column input, Column seeds): - """ - Returns the minhash values for each row of strings per seed. - This function uses MurmurHash3_x64_128 for the hash algorithm though - only the first 64-bits of the hash are used in computing the output. - - For details, see :cpp:func:`word_minhash64`. - - Parameters - ---------- - input : Column - Lists column of strings to compute minhash - seeds : Column or Scalar - Seed values used for the hash algorithm. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - cdef unique_ptr[column] c_result - - with nogil: - c_result = cpp_word_minhash64( - input.view(), - seeds.view() - ) - - return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ec533e64307..ad7a6f7a762 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -13,20 +13,13 @@ def minhash_input_data(request): return input_arr, seeds, request.param -@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) -def word_minhash_input_data(request): - input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) - seeds = pa.array([2, 3, 4, 5], request.param) - return input_arr, seeds, request.param - - @pytest.mark.parametrize("width", [5, 12]) -def test_minhash_permuted(minhash_input_data, width): +def test_minhash(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash_permuted + plc.nvtext.minhash.minhash if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64_permuted + else plc.nvtext.minhash.minhash64 ) result = minhash_func( plc.interop.from_arrow(input_arr), @@ -40,20 +33,3 @@ def test_minhash_permuted(minhash_input_data, width): assert pa_result.type == pa.list_( pa.field("element", seed_type, nullable=False) ) - - -def test_word_minhash(word_minhash_input_data): - input_arr, seeds, seed_type = word_minhash_input_data - word_minhash_func = ( - plc.nvtext.minhash.word_minhash - if seed_type == pa.uint32() - else plc.nvtext.minhash.word_minhash64 - ) - result = word_minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) - ) - pa_result = plc.interop.to_arrow(result) - assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) - assert pa_result.type == pa.list_( - pa.field("element", seed_type, nullable=False) - )