diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 49ca5ca0fb9..9d79733703c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -13,6 +13,7 @@ jobs: # Please keep pr-builder as the top job here pr-builder: needs: + - check-nightly-ci - changed-files - checks - conda-cpp-build @@ -54,6 +55,18 @@ jobs: - name: Telemetry setup if: ${{ vars.TELEMETRY_ENABLED == 'true' }} uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main + check-nightly-ci: + # Switch to ubuntu-latest once it defaults to a version of Ubuntu that + # provides at least Python 3.11 (see + # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat) + runs-on: ubuntu-24.04 + env: + RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Check if nightly CI is passing + uses: rapidsai/shared-actions/check_nightly_success/dispatch@main + with: + repo: cudf changed-files: secrets: inherit needs: telemetry-setup @@ -328,16 +341,11 @@ jobs: run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" telemetry-summarize: - runs-on: ubuntu-latest + # This job must use a self-hosted runner to record telemetry traces. + runs-on: linux-amd64-cpu4 needs: pr-builder if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - - name: Load stashed telemetry env vars - uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main - with: - load_service_name: true - name: Telemetry summarize - uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main - with: - cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}" + uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 4290d013fe4..52d8f659611 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -35,6 +35,10 @@ rapids-mamba-retry install \ export RAPIDS_DOCS_DIR="$(mktemp -d)" +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + rapids-logger "Build CPP docs" pushd cpp/doxygen aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag" @@ -58,3 +62,5 @@ mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" popd RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs + +exit ${EXITCODE} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2f17b57b0a4..9cbacee8e8d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -276,7 +276,7 @@ rapids_cpm_init() # Not using rapids-cmake since we never want to find, always download. CPMAddPackage( - NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG + NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW FALSE GIT_TAG c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 ) rapids_make_logger(cudf EXPORT_SET cudf-exports) @@ -916,7 +916,9 @@ if(CUDF_LARGE_STRINGS_DISABLED) endif() # Define logging level -target_compile_definitions(cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL}") +target_compile_definitions( + cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=CUDF_LOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}" +) # Enable remote IO through KvikIO target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) @@ -1105,7 +1107,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" ) target_include_directories(${_tgt} PRIVATE "$") - target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm rmm::rmm_logger rmm::rmm_logger_impl) if(CUDF_BUILD_STACKTRACE_DEBUG) target_link_libraries(${_tgt} PRIVATE cudf_backtrace) endif() diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8e5ea900efa..749e1b628ee 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) endfunction() # ################################################################################################## -# * column benchmarks ----------------------------------------------------------------------------- -ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp) +# * copying benchmarks +# ----------------------------------------------------------------------------- +ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp) # ################################################################################################## # * gather benchmark ------------------------------------------------------------------------------ @@ -351,11 +352,18 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/subword.cpp) - ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH + text/edit_distance.cpp + text/hash_ngrams.cpp + text/jaccard.cpp + text/minhash.cpp + text/ngrams.cpp + text/normalize.cpp + text/replace.cpp + text/subword.cpp + text/tokenize.cpp + text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp deleted file mode 100644 index 51106c72137..00000000000 --- a/cpp/benchmarks/column/concatenate.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -class Concatenate : public cudf::benchmark {}; - -template -static void BM_concatenate(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - - auto input = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - auto input_columns = input->view(); - std::vector column_views(input_columns.begin(), input_columns.end()); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); -} - -#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_BENCHMARK_DEFINE(int64_t, false) -CONCAT_BENCHMARK_DEFINE(int64_t, true) - -template -static void BM_concatenate_tables(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - cudf::size_type const num_tables = state.range(2); - - std::vector> tables(num_tables); - std::generate_n(tables.begin(), num_tables, [&]() { - return create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - }); - - // Generate table views - std::vector table_views(num_tables); - std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable { - return table->view(); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(table_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); -} - -#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_tables(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) - -class ConcatenateStrings : public cudf::benchmark {}; - -template -static void BM_concatenate_strings(benchmark::State& state) -{ - using column_wrapper = cudf::test::strings_column_wrapper; - - auto const num_rows = state.range(0); - auto const num_chars = state.range(1); - auto const num_cols = state.range(2); - - std::string str(num_chars, 'a'); - - // Create owning columns - std::vector columns; - columns.reserve(num_cols); - std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() { - auto iter = thrust::make_constant_iterator(c_str); - if (Nullable) { - auto count_it = thrust::make_counting_iterator(0); - auto valid_iter = - thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; }); - return column_wrapper(iter, iter + num_rows, valid_iter); - } else { - return column_wrapper(iter, iter + num_rows); - } - }); - - // Generate column views - std::vector column_views; - column_views.reserve(columns.size()); - std::transform( - columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) { - return static_cast(col); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * - (sizeof(int32_t) + num_chars)); // offset + chars -} - -#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_strings(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_STRINGS_BENCHMARK_DEFINE(false) -CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp new file mode 100644 index 00000000000..586b479d0ad --- /dev/null +++ b/cpp/benchmarks/copying/concatenate.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +#include + +#include + +static void bench_concatenate(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const nulls = static_cast(state.get_float64("nulls")); + + auto input = create_sequence_table( + cycle_dtypes({cudf::type_to_id()}, num_cols), row_count{num_rows}, nulls); + auto input_columns = input->view(); + auto column_views = std::vector(input_columns.begin(), input_columns.end()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_reads(num_rows * num_cols); + state.add_global_memory_writes(num_rows * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate) + .set_name("concatenate") + .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144}) + .add_int64_axis("num_cols", {2, 8, 64, 512, 1024}) + .add_float64_axis("nulls", {0.0, 0.3}); + +static void bench_concatenate_strings(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const nulls = static_cast(state.get_float64("nulls")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .null_probability(nulls); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = column->view(); + + auto column_views = std::vector(num_cols, input); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const sv = cudf::strings_column_view(input); + state.add_global_memory_reads(sv.chars_size(stream) * num_cols); + state.add_global_memory_writes(sv.chars_size(stream) * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate_strings) + .set_name("concatenate_strings") + .add_int64_axis("num_rows", {256, 512, 4096, 16384}) + .add_int64_axis("num_cols", {2, 8, 64, 256}) + .add_int64_axis("row_width", {32, 128}) + .add_float64_axis("nulls", {0.0, 0.3}); diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 6ffa90edb8f..0ad1ae30f8c 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -27,15 +27,11 @@ static void bench_edit_distance(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input1(strings_table->view().column(0)); @@ -55,5 +51,6 @@ static void bench_edit_distance(nvbench::state& state) NVBENCH_BENCH(bench_edit_distance) .set_name("edit_distance") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144}); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp index 4e5daf83a3c..7577cf00c0f 100644 --- a/cpp/benchmarks/text/hash_ngrams.cpp +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -27,16 +27,12 @@ static void bench_hash_ngrams(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const ngrams = static_cast(state.get_int64("ngrams")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); @@ -55,6 +51,7 @@ static void bench_hash_ngrams(nvbench::state& state) NVBENCH_BENCH(bench_hash_ngrams) .set_name("hash_ngrams") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {16384, 32768, 262144}) .add_int64_axis("ngrams", {5, 10}); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d5b74da6773..5506501138b 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -28,17 +28,13 @@ static void bench_jaccard(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const substring_width = static_cast(state.get_int64("substring_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const input_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); @@ -59,6 +55,7 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 1024, 2048}) .add_int64_axis("num_rows", {32768, 131072, 262144}) - .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 71bccd80d39..594dc0de28a 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -28,16 +28,12 @@ static void bench_normalize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const normalize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -60,6 +56,7 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 767ebab3eee..24ca4e5dfd7 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -31,11 +31,6 @@ static void bench_replace(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - std::vector words{" ", "one ", "two ", "three ", "four ", "five ", "six ", "sevén ", "eight ", "nine ", "ten ", "eleven ", "twelve ", "thirteen ", "fourteen ", @@ -71,5 +66,5 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index dd8df695d3e..0b4e3bdefa5 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,6 @@ * limitations under the License. */ -#include -#include - #include #include @@ -24,6 +21,8 @@ #include +#include + #include #include #include @@ -54,40 +53,33 @@ static std::string create_hash_vocab_file() return hash_file; } -static void BM_subword_tokenizer(benchmark::State& state) +static void bench_subword_tokenizer(nvbench::state& state) { - auto const nrows = static_cast(state.range(0)); - std::vector h_strings(nrows, "This is a test "); + auto const num_rows = static_cast(state.get_int64("num_rows")); + + std::vector h_strings(num_rows, "This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); static std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; - uint32_t max_sequence_length = 64; - uint32_t stride = 48; - uint32_t do_truncate = 0; - uint32_t do_lower = 1; - // - auto vocab = nvtext::load_vocabulary_file(hash_file); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - *vocab, - max_sequence_length, - stride, - do_lower, - do_truncate); - } -} + uint32_t max_sequence = 64; + uint32_t stride = 48; + uint32_t do_truncate = 0; + uint32_t do_lower = 1; -class Subword : public cudf::benchmark {}; + auto input = cudf::strings_column_view{strings}; -#define SUBWORD_BM_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \ - BENCHMARK_REGISTER_F(Subword, name) \ - ->RangeMultiplier(2) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows * max_sequence); -SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer); + auto vocab = nvtext::load_vocabulary_file(hash_file); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = + nvtext::subword_tokenize(input, *vocab, max_sequence, stride, do_lower, do_truncate); + }); +} -// BENCHMARK_MAIN(); +NVBENCH_BENCH(bench_subword_tokenizer) + .set_name("subword_tokenize") + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index e83310e0343..b9590c5539f 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -31,17 +31,13 @@ static void bench_tokenize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const tokenize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -82,6 +78,7 @@ static void bench_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_tokenize) .set_name("tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"}); diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 523d277df18..0502f375d99 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -33,16 +33,12 @@ static void bench_vocab_tokenize(nvbench::state& state) { auto const stream = cudf::get_default_stream(); auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto const column = [num_rows, row_width] { + auto const column = [num_rows, min_width, max_width] { data_profile const profile = data_profile_builder().no_validity().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); return cudf::strings::filter_characters_of_type( cudf::strings_column_view(col->view()), @@ -85,5 +81,6 @@ static void bench_vocab_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_vocab_tokenize) .set_name("vocab_tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp deleted file mode 100644 index adc3dddc59c..00000000000 --- a/cpp/benchmarks/text/word_minhash.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -static void bench_word_minhash(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); - auto const base64 = state.get_int64("hash_type") == 64; - - data_profile const strings_profile = - data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); - auto strings_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); - - auto const num_offsets = (num_rows / row_width) + 1; - auto offsets = cudf::sequence(num_offsets, - cudf::numeric_scalar(0), - cudf::numeric_scalar(row_width)); - - auto source = cudf::make_lists_column(num_offsets - 1, - std::move(offsets), - std::move(strings_table->release().front()), - 0, - rmm::device_buffer{}); - - data_profile const seeds_profile = data_profile_builder().no_validity().distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, 256); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - - cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); - auto chars_size = input.chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); - state.add_global_memory_writes(num_rows); // output are hashes - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) - : nvtext::word_minhash(source->view(), seeds.view()); - }); -} - -NVBENCH_BENCH(bench_word_minhash) - .set_name("word_minhash") - .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) - .add_int64_axis("row_width", {10, 100, 1000}) - .add_int64_axis("seed_count", {2, 25}) - .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index c440643037b..b0c48e04710 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,11 +14,6 @@ # This function finds nanoarrow and sets any additional necessary environment variables. function(find_and_configure_nanoarrow) - include(${rapids-cmake-dir}/cpm/package_override.cmake) - - set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") - rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") - if(NOT BUILD_SHARED_LIBS) set(_exclude_from_all EXCLUDE_FROM_ALL FALSE) else() @@ -31,6 +26,9 @@ function(find_and_configure_nanoarrow) nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS + GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git + GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb + GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all} ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff deleted file mode 100644 index e9a36fcb567..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff +++ /dev/null @@ -1,38 +0,0 @@ -diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h -index caa6be4..70ec8a2 100644 ---- a/src/nanoarrow/common/inline_buffer.h -+++ b/src/nanoarrow/common/inline_buffer.h -@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { - } - - static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { -- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | -+ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT - ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | - ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | - ((values[7] + 0x7f) & 0x80)); -@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l - // set bits within a single byte - const uint8_t only_byte_mask = - i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); -- bits[bytes_begin] &= only_byte_mask; -+ bits[bytes_begin] &= only_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); - return; - } - - // set/clear trailing bits of first byte -- bits[bytes_begin] &= first_byte_mask; -+ bits[bytes_begin] &= first_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); - - if (bytes_end - bytes_begin > 2) { -@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte -- *out_cursor = 0x00; -+ *out_cursor = 0x00; // NOLINT - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); - } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json deleted file mode 100644 index d529787e7c8..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_override.json +++ /dev/null @@ -1,18 +0,0 @@ - -{ - "packages" : { - "nanoarrow" : { - "version" : "0.6.0.dev", - "git_url" : "https://github.com/apache/arrow-nanoarrow.git", - "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", - "git_shallow" : false, - "patches" : [ - { - "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", - "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", - "fixed_in" : "" - } - ] - } - } -} diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index db6d5255616..aacb5ccfede 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -33,11 +33,13 @@ #include #include +#include #include #include #include #include +#include /** * @file column_device_view.cuh @@ -56,8 +58,8 @@ namespace CUDF_EXPORT cudf { * */ struct nullate { - struct YES : std::bool_constant {}; - struct NO : std::bool_constant {}; + struct YES : cuda::std::bool_constant {}; + struct NO : cuda::std::bool_constant {}; /** * @brief `nullate::DYNAMIC` defers the determination of nullability to run time rather than * compile time. The calling code is responsible for specifying whether or not nulls are @@ -80,7 +82,7 @@ struct nullate { * @return `true` if nulls are expected in the operation in which this object is applied, * otherwise false */ - constexpr operator bool() const noexcept { return value; } + CUDF_HOST_DEVICE constexpr operator bool() const noexcept { return value; } bool value; ///< True if nulls are expected }; }; @@ -319,14 +321,14 @@ class alignas(16) column_device_view_base { } template - struct has_element_accessor_impl : std::false_type {}; + struct has_element_accessor_impl : cuda::std::false_type {}; template struct has_element_accessor_impl< C, T, - void_t().template element(std::declval()))>> - : std::true_type {}; + void_t().template element(cuda::std::declval()))>> + : cuda::std::true_type {}; }; // @cond // Forward declaration @@ -442,7 +444,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return string_view instance representing this element at this index */ template )> - __device__ [[nodiscard]] T element(size_type element_index) const noexcept + [[nodiscard]] __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset char const* d_strings = static_cast(_data); @@ -501,7 +503,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return dictionary32 instance representing this element at this index */ template )> - __device__ [[nodiscard]] T element(size_type element_index) const noexcept + [[nodiscard]] __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset auto const indices = d_children[0]; @@ -519,7 +521,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return numeric::fixed_point representing the element at this index */ template ())> - __device__ [[nodiscard]] T element(size_type element_index) const noexcept + [[nodiscard]] __device__ T element(size_type element_index) const noexcept { using namespace numeric; using rep = typename T::rep; @@ -534,7 +536,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return `true` if `column_device_view::element()` has a valid overload, `false` otherwise */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } @@ -1032,7 +1034,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return Reference to the element at the specified index */ template ())> - __device__ [[nodiscard]] T& element(size_type element_index) const noexcept + [[nodiscard]] __device__ T& element(size_type element_index) const noexcept { return data()[element_index]; } @@ -1044,7 +1046,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return `true` if `mutable_column_device_view::element()` has a valid overload, `false` */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } @@ -1425,13 +1427,13 @@ struct pair_rep_accessor { private: template , void>* = nullptr> - __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const + [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i); } template , void>* = nullptr> - __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const + [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i).value(); } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index de53e7586cd..59011f7b138 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -31,12 +32,11 @@ #include #include -#include namespace cudf { namespace detail { template -constexpr bool is_product_supported() +CUDF_HOST_DEVICE constexpr bool is_product_supported() { return is_numeric(); } @@ -216,12 +216,12 @@ struct identity_initializer { * @throw cudf::logic_error if column type is not fixed-width * * @param table The table of columns to initialize. - * @param aggs A vector of aggregation operations corresponding to the table + * @param aggs A span of aggregation operations corresponding to the table * columns. The aggregations determine the identity value for each column. * @param stream CUDA stream used for device memory operations and kernel launches. */ void initialize_with_identity(mutable_table_view& table, - std::vector const& aggs, + host_span aggs, rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp index 16ca06c6561..090dc8b62b6 100644 --- a/cpp/include/cudf/detail/device_scalar.hpp +++ b/cpp/include/cudf/detail/device_scalar.hpp @@ -78,7 +78,7 @@ class device_scalar : public rmm::device_scalar { [[nodiscard]] T value(rmm::cuda_stream_view stream) const { cuda_memcpy(bounce_buffer, device_span{this->data(), 1}, stream); - return bounce_buffer[0]; + return std::move(bounce_buffer[0]); } void set_value_async(T const& value, rmm::cuda_stream_view stream) diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 61a8e9f7ec3..72cdc3d8067 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -74,9 +74,10 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type global_thread_id(thread_index_type thread_id, - thread_index_type block_id, - thread_index_type num_threads_per_block) + __device__ static constexpr thread_index_type global_thread_id( + thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) { return thread_id + block_id * num_threads_per_block; } @@ -114,8 +115,8 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, - thread_index_type num_blocks_per_grid) + __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) { return num_threads_per_block * num_blocks_per_grid; } diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index d16be5e22dd..923cd04479d 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -29,6 +29,8 @@ #include #include +#include + #include namespace cudf { @@ -42,7 +44,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs) { - return std::min(lhs, rhs); + return cuda::std::min(lhs, rhs); } /** @@ -53,7 +55,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) { - return std::max(lhs, rhs); + return cuda::std::max(lhs, rhs); } } // namespace detail @@ -68,20 +70,20 @@ struct DeviceSum { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{typename T::duration{0}}; } template () && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{0}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); @@ -109,7 +111,7 @@ struct DeviceCount { } template - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{}; } @@ -129,7 +131,7 @@ struct DeviceMin { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::max() // https://eel.is/c++draft/numeric.limits.general#6 @@ -143,7 +145,7 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); @@ -161,7 +163,7 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::max_value()); } @@ -181,7 +183,7 @@ struct DeviceMax { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::min() // https://eel.is/c++draft/numeric.limits.general#6 @@ -195,7 +197,7 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); @@ -212,7 +214,7 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::lowest_value()); } @@ -229,13 +231,13 @@ struct DeviceProduct { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{1}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 957b6b70fe2..2e3d71815c0 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -86,7 +86,7 @@ constexpr S round_down_safe(S number_to_round, S modulus) noexcept * `modulus` is positive and does not check for overflow. */ template -constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -187,7 +187,7 @@ constexpr bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. */ template -constexpr auto absolute_value(T value) -> T +CUDF_HOST_DEVICE constexpr auto absolute_value(T value) -> T { if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; diff --git a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp index fce08b4a5c4..9e68bafb09a 100644 --- a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct floating_converter { * @param integer_rep The bit-casted floating value to extract the exponent from * @return The stored base-2 exponent and significand, shifted for denormals */ - CUDF_HOST_DEVICE inline static std::pair get_significand_and_pow2( + CUDF_HOST_DEVICE inline static cuda::std::pair get_significand_and_pow2( IntegralType integer_rep) { // Extract the significand @@ -1008,7 +1009,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** @@ -1075,7 +1076,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 0ec41a20ef1..fd3455e761d 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -18,7 +18,8 @@ #include -#include +#include +#include namespace cudf::hashing::detail { @@ -29,7 +30,7 @@ template T __device__ inline normalize_nans(T const& key) { if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + if (cuda::std::isnan(key)) { return cuda::std::numeric_limits::quiet_NaN(); } } return key; } diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index a978e54a1b9..7cb80081a95 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -82,7 +82,7 @@ std::unique_ptr xxhash_64(table_view const& input, * @param rhs The second hash value * @return Combined hash value */ -constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) +CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) { return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2)); } diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index 85349a421b1..84957ab9f1d 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -31,7 +31,7 @@ namespace strings::detail { * @param chr Any single byte from a valid UTF-8 character * @return true if this is not the first byte of the character */ -constexpr bool is_utf8_continuation_char(unsigned char chr) +CUDF_HOST_DEVICE constexpr bool is_utf8_continuation_char(unsigned char chr) { // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character. return (chr & 0xC0) == 0x80; @@ -43,7 +43,10 @@ constexpr bool is_utf8_continuation_char(unsigned char chr) * @param chr Any single byte from a valid UTF-8 character * @return true if this the first byte of the character */ -constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_continuation_char(chr); } +CUDF_HOST_DEVICE constexpr bool is_begin_utf8_char(unsigned char chr) +{ + return not is_utf8_continuation_char(chr); +} /** * @brief This will return true if the passed in byte could be the start of @@ -55,7 +58,7 @@ constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_contin * @param byte The byte to be tested * @return true if this can be the first byte of a character */ -constexpr bool is_valid_begin_utf8_char(uint8_t byte) +CUDF_HOST_DEVICE constexpr bool is_valid_begin_utf8_char(uint8_t byte) { // to be the first byte of a valid (up to 4 byte) UTF-8 char, byte must be one of: // 0b0vvvvvvv a 1 byte character @@ -72,7 +75,7 @@ constexpr bool is_valid_begin_utf8_char(uint8_t byte) * @param character Single character * @return Number of bytes */ -constexpr size_type bytes_in_char_utf8(char_utf8 character) +CUDF_HOST_DEVICE constexpr size_type bytes_in_char_utf8(char_utf8 character) { return 1 + static_cast((character & 0x0000'FF00u) > 0) + static_cast((character & 0x00FF'0000u) > 0) + @@ -89,7 +92,7 @@ constexpr size_type bytes_in_char_utf8(char_utf8 character) * @param byte Byte from an encoded character. * @return Number of bytes. */ -constexpr size_type bytes_in_utf8_byte(uint8_t byte) +CUDF_HOST_DEVICE constexpr size_type bytes_in_utf8_byte(uint8_t byte) { return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix @@ -104,7 +107,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -constexpr size_type to_char_utf8(char const* str, char_utf8& character) +CUDF_HOST_DEVICE constexpr size_type to_char_utf8(char const* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); @@ -131,7 +134,7 @@ constexpr size_type to_char_utf8(char const* str, char_utf8& character) * @param[out] str Output array. * @return The number of bytes in the character */ -constexpr inline size_type from_char_utf8(char_utf8 character, char* str) +CUDF_HOST_DEVICE constexpr inline size_type from_char_utf8(char_utf8 character, char* str) { size_type const chr_width = bytes_in_char_utf8(character); for (size_type idx = 0; idx < chr_width; ++idx) { @@ -148,7 +151,7 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str) * @param utf8_char Single UTF-8 character to convert. * @return Code-point for the UTF-8 character. */ -constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) +CUDF_HOST_DEVICE constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) { uint32_t unchr = 0; if (utf8_char < 0x0000'0080) // single-byte pass thru @@ -178,7 +181,7 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) * @param unchr Character code-point to convert. * @return Single UTF-8 character. */ -constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) +CUDF_HOST_DEVICE constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; if (unchr < 0x0000'0080) // single byte utf8 diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 1ae4c3703b2..f0040e069d8 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -31,6 +31,8 @@ #include #endif +#include + #include // This file should only include device code logic. @@ -75,8 +77,8 @@ __device__ inline size_type characters_in_string(char const* str, size_type byte * @param pos Character position to count to * @return The number of bytes and the left over non-counted position value */ -__device__ inline std::pair bytes_to_character_position(string_view d_str, - size_type pos) +__device__ inline cuda::std::pair bytes_to_character_position( + string_view d_str, size_type pos) { size_type bytes = 0; auto ptr = d_str.data(); @@ -303,7 +305,7 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const __device__ inline size_type string_view::byte_offset(size_type pos) const { if (length() == size_bytes()) return pos; - return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); + return cuda::std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } __device__ inline int string_view::compare(string_view const& in) const diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 504c31057ae..33f3176d2c6 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -54,7 +54,7 @@ class string_view { * * @return The number of characters in this string */ - __device__ [[nodiscard]] inline size_type length() const; + [[nodiscard]] __device__ inline size_type length() const; /** * @brief Return a pointer to the internal device array * @@ -119,13 +119,13 @@ class string_view { * * @return new iterator pointing to the beginning of this string */ - __device__ [[nodiscard]] inline const_iterator begin() const; + [[nodiscard]] __device__ inline const_iterator begin() const; /** * @brief Return new iterator pointing past the end of this string * * @return new iterator pointing past the end of this string */ - __device__ [[nodiscard]] inline const_iterator end() const; + [[nodiscard]] __device__ inline const_iterator end() const; /** * @brief Return single UTF-8 character at the given character position @@ -140,7 +140,7 @@ class string_view { * @param pos Character position * @return Byte offset from data() for a given character position */ - __device__ [[nodiscard]] inline size_type byte_offset(size_type pos) const; + [[nodiscard]] __device__ inline size_type byte_offset(size_type pos) const; /** * @brief Comparing target string with this string. Each character is compared @@ -155,7 +155,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - __device__ [[nodiscard]] inline int compare(string_view const& str) const; + [[nodiscard]] __device__ inline int compare(string_view const& str) const; /** * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. @@ -225,7 +225,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if str is not found in this string. */ - __device__ [[nodiscard]] inline size_type find(string_view const& str, + [[nodiscard]] __device__ inline size_type find(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -253,7 +253,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type find(char_utf8 character, + [[nodiscard]] __device__ inline size_type find(char_utf8 character, size_type pos = 0, size_type count = -1) const; /** @@ -266,7 +266,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type rfind(string_view const& str, + [[nodiscard]] __device__ inline size_type rfind(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -294,7 +294,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type rfind(char_utf8 character, + [[nodiscard]] __device__ inline size_type rfind(char_utf8 character, size_type pos = 0, size_type count = -1) const; @@ -306,7 +306,7 @@ class string_view { * @param length Number of characters from start to include in the sub-string. * @return New instance pointing to a subset of the characters within this instance. */ - __device__ [[nodiscard]] inline string_view substr(size_type start, size_type length) const; + [[nodiscard]] __device__ inline string_view substr(size_type start, size_type length) const; /** * @brief Return minimum value associated with the string type @@ -386,7 +386,7 @@ class string_view { * @param bytepos Byte position from start of _data. * @return The character position for the specified byte. */ - __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const; + [[nodiscard]] __device__ inline size_type character_offset(size_type bytepos) const; /** * @brief Common internal implementation for string_view::find and string_view::rfind. diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 3f33c70c29a..8214ea6e83b 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -33,6 +33,8 @@ #include #include +#include +#include #include #include #include @@ -48,11 +50,8 @@ #include #include -#include #include -#include #include -#include namespace CUDF_EXPORT cudf { @@ -287,15 +286,16 @@ class device_row_comparator { * `null_order::BEFORE` for all columns. * @param comparator Physical element relational comparison functor. */ - device_row_comparator(Nullate check_nulls, - table_device_view lhs, - table_device_view rhs, - device_span l_dremel_device_views, - device_span r_dremel_device_views, - std::optional> depth = std::nullopt, - std::optional> column_order = std::nullopt, - std::optional> null_precedence = std::nullopt, - PhysicalElementComparator comparator = {}) noexcept + device_row_comparator( + Nullate check_nulls, + table_device_view lhs, + table_device_view rhs, + device_span l_dremel_device_views, + device_span r_dremel_device_views, + cuda::std::optional> depth = cuda::std::nullopt, + cuda::std::optional> column_order = cuda::std::nullopt, + cuda::std::optional> null_precedence = cuda::std::nullopt, + PhysicalElementComparator comparator = {}) noexcept : _lhs{lhs}, _rhs{rhs}, _l_dremel(l_dremel_device_views), @@ -331,9 +331,9 @@ class device_row_comparator { Nullate check_nulls, table_device_view lhs, table_device_view rhs, - std::optional> column_order = std::nullopt, - std::optional> null_precedence = std::nullopt, - PhysicalElementComparator comparator = {}) noexcept + cuda::std::optional> column_order = cuda::std::nullopt, + cuda::std::optional> null_precedence = cuda::std::nullopt, + PhysicalElementComparator comparator = {}) noexcept : _lhs{lhs}, _rhs{rhs}, _l_dremel{}, @@ -410,7 +410,7 @@ class device_row_comparator { return cuda::std::pair(_comparator(_lhs.element(lhs_element_index), _rhs.element(rhs_element_index)), - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); } /** @@ -455,7 +455,7 @@ class device_row_comparator { } if (lcol.num_child_columns() == 0) { - return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits::max()); + return cuda::std::pair(weak_ordering::EQUIVALENT, cuda::std::numeric_limits::max()); } // Non-empty structs have been modified to only have 1 child when using this. @@ -607,7 +607,7 @@ class device_row_comparator { __device__ constexpr weak_ordering operator()(size_type const lhs_index, size_type const rhs_index) const noexcept { - int last_null_depth = std::numeric_limits::max(); + int last_null_depth = cuda::std::numeric_limits::max(); size_type list_column_index{-1}; for (size_type i = 0; i < _lhs.num_columns(); ++i) { if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; } @@ -626,9 +626,9 @@ class device_row_comparator { // here, otherwise the current code would be failing. auto const [l_dremel_i, r_dremel_i] = _lhs.column(i).type().id() == type_id::LIST - ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), - optional_dremel_view(_r_dremel[list_column_index])) - : std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); + ? cuda::std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), + optional_dremel_view(_r_dremel[list_column_index])) + : cuda::std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); auto element_comp = element_comparator{_check_nulls, _lhs.column(i), @@ -658,9 +658,9 @@ class device_row_comparator { device_span const _l_dremel; device_span const _r_dremel; Nullate const _check_nulls; - std::optional> const _depth; - std::optional> const _column_order; - std::optional> const _null_precedence; + cuda::std::optional> const _depth; + cuda::std::optional> const _column_order; + cuda::std::optional> const _null_precedence; PhysicalElementComparator const _comparator; }; // class device_row_comparator @@ -882,10 +882,10 @@ struct preprocessed_table { * @return Device array containing respective column orders. If no explicit column orders were * specified during the creation of this object then this will be `nullopt`. */ - [[nodiscard]] std::optional> column_order() const + [[nodiscard]] cuda::std::optional> column_order() const { - return _column_order.size() ? std::optional>(_column_order) - : std::nullopt; + return _column_order.size() ? cuda::std::optional>(_column_order) + : cuda::std::nullopt; } /** @@ -895,10 +895,11 @@ struct preprocessed_table { * @return Device array containing respective column null precedence. If no explicit column null * precedences were specified during the creation of this object then this will be `nullopt`. */ - [[nodiscard]] std::optional> null_precedence() const + [[nodiscard]] cuda::std::optional> null_precedence() const { - return _null_precedence.size() ? std::optional>(_null_precedence) - : std::nullopt; + return _null_precedence.size() + ? cuda::std::optional>(_null_precedence) + : cuda::std::nullopt; } /** @@ -909,9 +910,10 @@ struct preprocessed_table { * @return std::optional> Device array containing respective column depths. * If there are no nested columns in the table then this will be `nullopt`. */ - [[nodiscard]] std::optional> depths() const + [[nodiscard]] cuda::std::optional> depths() const { - return _depths.size() ? std::optional>(_depths) : std::nullopt; + return _depths.size() ? cuda::std::optional>(_depths) + : cuda::std::nullopt; } [[nodiscard]] device_span dremel_device_views() const @@ -940,8 +942,8 @@ struct preprocessed_table { rmm::device_uvector const _depths; // Dremel encoding of list columns used for the comparison algorithm - std::optional> _dremel_data; - std::optional> _dremel_device_views; + cuda::std::optional> _dremel_data; + cuda::std::optional> _dremel_device_views; // Intermediate columns generated from transforming nested children columns into // integers columns using `cudf::rank()`, need to be kept alive. @@ -1808,7 +1810,7 @@ class element_hasher { __device__ element_hasher( Nullate nulls, uint32_t seed = DEFAULT_HASH_SEED, - hash_value_type null_hash = std::numeric_limits::max()) noexcept + hash_value_type null_hash = cuda::std::numeric_limits::max()) noexcept : _check_nulls(nulls), _seed(seed), _null_hash(null_hash) { } @@ -1892,7 +1894,7 @@ class device_row_hasher { */ template