-
Notifications
You must be signed in to change notification settings - Fork 919
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add libcudf example with large strings (#15983)
Creating an example that shows reading large strings columns. This uses the 1 billion row challenge input data and provides three examples of loading this data: - `brc` uses the CSV reader to load the input file in one call and aggregates the results using `groupby` - `brc_chunks` uses the CSV reader to load the input file in chunks, aggregates each chunk, and computes the results - `brc_pipeline` same as `brc_chunks` but input chunks are processed in separate threads/streams. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Gregory Kimball (https://github.com/GregoryKimball) - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: #15983
- Loading branch information
1 parent
0e86f62
commit 715677e
Showing
9 changed files
with
674 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
# Build script for the libcudf "1 billion row challenge" examples.
cmake_minimum_required(VERSION 3.26.4)

include(../set_cuda_architecture.cmake)

# The CUDA architectures have to be initialized before the project() call.
rapids_cuda_init_architectures(billion_rows)
rapids_cuda_set_architectures(RAPIDS)

project(
  billion_rows
  VERSION 0.0.1
  LANGUAGES CXX CUDA
)

include(../fetch_dependencies.cmake)

list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)

# Aggregation helpers shared by every example; compiled once as an object library.
add_library(groupby_results OBJECT groupby_results.cpp)
target_link_libraries(groupby_results PRIVATE cudf::cudf)

# Each example is a single source file named after its executable.
set(BILLION_ROWS_EXAMPLES brc brc_chunks brc_pipeline)

foreach(example IN LISTS BILLION_ROWS_EXAMPLES)
  add_executable(${example} ${example}.cpp)
  target_link_libraries(
    ${example} PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>
  )
  install(TARGETS ${example} DESTINATION bin/examples/libcudf)
endforeach()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# libcudf C++ example for the 1 billion row challenge | ||
|
||
This C++ example demonstrates using libcudf APIs to read and process | ||
a table with 1 billion rows. The 1 billion row challenge is described here: | ||
https://github.com/gunnarmorling/1brc | ||
|
||
The examples load the 1 billion row text file using the CSV reader. | ||
The file contains around 400 unique city names (string type) along with | ||
random temperature values (float type). | ||
Once loaded, the examples perform groupby aggregations to find the | ||
minimum, maximum, and average temperature for each city. | ||
|
||
There are three examples included: | ||
1. `brc.cpp` | ||
Loads the file in one call to the CSV reader. | ||
This generally requires a large amount of available GPU memory. | ||
2. `brc_chunks.cpp` | ||
Loads and processes the file in chunks. | ||
The number of chunks to use is a parameter to the executable. | ||
3. `brc_pipeline.cpp` | ||
Loads and processes the file in chunks with separate threads/streams. | ||
The number of chunks and number of threads to use are parameters to the executable. | ||
|
||
An input file can be generated using the instructions from | ||
https://github.com/gunnarmorling/1brc. | ||
|
||
## Compile and execute | ||
|
||
```bash | ||
# Configure project | ||
cmake -S . -B build/ | ||
# Build | ||
cmake --build build/ --parallel $PARALLEL_LEVEL | ||
# Execute | ||
build/brc input.txt | ||
# Execute in chunked mode with 25 chunks (default) | ||
build/brc_chunks input.txt 25 | ||
# Execute in pipeline mode with 25 chunks and 2 threads (defaults) | ||
build/brc_pipeline input.txt 25 2 | ||
``` | ||
|
||
If your machine does not come with a pre-built libcudf binary, expect the | ||
first build to take some time, as it will build libcudf on the host machine. | ||
The build may be sped up by setting an appropriate `PARALLEL_LEVEL` value. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#include "common.hpp" | ||
#include "groupby_results.hpp" | ||
|
||
#include <cudf/column/column.hpp> | ||
#include <cudf/column/column_view.hpp> | ||
#include <cudf/io/csv.hpp> | ||
#include <cudf/io/types.hpp> | ||
#include <cudf/sorting.hpp> | ||
#include <cudf/table/table.hpp> | ||
#include <cudf/table/table_view.hpp> | ||
|
||
#include <rmm/mr/device/statistics_resource_adaptor.hpp> | ||
|
||
#include <chrono> | ||
#include <iostream> | ||
#include <memory> | ||
#include <string> | ||
|
||
using elapsed_t = std::chrono::duration<double>; | ||
|
||
int main(int argc, char const** argv) | ||
{ | ||
if (argc < 2) { | ||
std::cout << "required parameter: input-file-path\n"; | ||
return 1; | ||
} | ||
|
||
auto const input_file = std::string{argv[1]}; | ||
std::cout << "Input: " << input_file << std::endl; | ||
|
||
auto const mr_name = std::string("pool"); | ||
auto resource = create_memory_resource(mr_name); | ||
auto stats_mr = | ||
rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get()); | ||
rmm::mr::set_current_device_resource(&stats_mr); | ||
auto stream = cudf::get_default_stream(); | ||
|
||
auto start = std::chrono::steady_clock::now(); | ||
|
||
auto const csv_result = [input_file, stream] { | ||
cudf::io::csv_reader_options in_opts = | ||
cudf::io::csv_reader_options::builder(cudf::io::source_info{input_file}) | ||
.header(-1) | ||
.delimiter(';') | ||
.doublequote(false) | ||
.dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::STRING}, | ||
cudf::data_type{cudf::type_id::FLOAT32}}) | ||
.na_filter(false); | ||
return cudf::io::read_csv(in_opts, stream).tbl; | ||
}(); | ||
elapsed_t elapsed = std::chrono::steady_clock::now() - start; | ||
std::cout << "File load time: " << elapsed.count() << " seconds\n"; | ||
auto const csv_table = csv_result->view(); | ||
std::cout << "Input rows: " << csv_table.num_rows() << std::endl; | ||
|
||
auto const cities = csv_table.column(0); | ||
auto const temps = csv_table.column(1); | ||
|
||
std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations; | ||
aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>()); | ||
aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>()); | ||
aggregations.emplace_back(cudf::make_mean_aggregation<cudf::groupby_aggregation>()); | ||
|
||
auto result = compute_results(cities, temps, std::move(aggregations), stream); | ||
|
||
// The other 2 examples employ sorting for the sub-aggregates so enabling | ||
// the following line may be more comparable in performance with them. | ||
// | ||
// result = cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream); | ||
|
||
stream.synchronize(); | ||
|
||
elapsed = std::chrono::steady_clock::now() - start; | ||
std::cout << "Number of keys: " << result->num_rows() << std::endl; | ||
std::cout << "Process time: " << elapsed.count() << " seconds\n"; | ||
std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n"; | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#include "common.hpp" | ||
#include "groupby_results.hpp" | ||
|
||
#include <cudf/column/column.hpp> | ||
#include <cudf/column/column_view.hpp> | ||
#include <cudf/io/csv.hpp> | ||
#include <cudf/sorting.hpp> | ||
#include <cudf/table/table.hpp> | ||
#include <cudf/table/table_view.hpp> | ||
|
||
#include <rmm/mr/device/statistics_resource_adaptor.hpp> | ||
|
||
#include <chrono> | ||
#include <filesystem> | ||
#include <iostream> | ||
#include <memory> | ||
#include <string> | ||
|
||
using elapsed_t = std::chrono::duration<double>; | ||
|
||
std::unique_ptr<cudf::table> load_chunk(std::string const& input_file, | ||
std::size_t start, | ||
std::size_t size, | ||
rmm::cuda_stream_view stream) | ||
{ | ||
cudf::io::csv_reader_options in_opts = | ||
cudf::io::csv_reader_options::builder(cudf::io::source_info{input_file}) | ||
.header(-1) | ||
.delimiter(';') | ||
.doublequote(false) | ||
.byte_range_offset(start) | ||
.byte_range_size(size) | ||
.dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::STRING}, | ||
cudf::data_type{cudf::type_id::FLOAT32}}) | ||
.na_filter(false); | ||
return cudf::io::read_csv(in_opts, stream).tbl; | ||
} | ||
|
||
int main(int argc, char const** argv) | ||
{ | ||
if (argc < 2) { | ||
std::cout << "required parameter: input-file-path\n"; | ||
std::cout << "optional parameter: chunk-count\n"; | ||
return 1; | ||
} | ||
|
||
auto const input_file = std::string{argv[1]}; | ||
auto const divider = (argc < 3) ? 25 : std::stoi(std::string(argv[2])); | ||
|
||
std::cout << "Input: " << input_file << std::endl; | ||
std::cout << "Chunks: " << divider << std::endl; | ||
|
||
auto const mr_name = std::string("pool"); | ||
auto resource = create_memory_resource(mr_name); | ||
auto stats_mr = | ||
rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get()); | ||
rmm::mr::set_current_device_resource(&stats_mr); | ||
auto stream = cudf::get_default_stream(); | ||
|
||
std::filesystem::path p = input_file; | ||
auto const file_size = std::filesystem::file_size(p); | ||
|
||
auto start = std::chrono::steady_clock::now(); | ||
|
||
std::vector<std::unique_ptr<cudf::table>> agg_data; | ||
std::size_t chunk_size = file_size / divider + ((file_size % divider) != 0); | ||
std::size_t start_pos = 0; | ||
cudf::size_type total_rows = 0; | ||
do { | ||
auto const input_table = load_chunk(input_file, start_pos, chunk_size, stream); | ||
auto const read_rows = input_table->num_rows(); | ||
if (read_rows == 0) break; | ||
|
||
auto const cities = input_table->view().column(0); | ||
auto const temps = input_table->view().column(1); | ||
|
||
std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations; | ||
aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>()); | ||
aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>()); | ||
aggregations.emplace_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>()); | ||
aggregations.emplace_back(cudf::make_count_aggregation<cudf::groupby_aggregation>()); | ||
auto result = compute_results(cities, temps, std::move(aggregations), stream); | ||
|
||
agg_data.emplace_back( | ||
cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream)); | ||
start_pos += chunk_size; | ||
chunk_size = std::min(chunk_size, file_size - start_pos); | ||
total_rows += read_rows; | ||
} while (start_pos < file_size && chunk_size > 0); | ||
|
||
// now aggregate the aggregate results | ||
auto results = compute_final_aggregates(agg_data, stream); | ||
stream.synchronize(); | ||
|
||
elapsed_t elapsed = std::chrono::steady_clock::now() - start; | ||
std::cout << "Number of keys: " << results->num_rows() << std::endl; | ||
std::cout << "Process time: " << elapsed.count() << " seconds\n"; | ||
std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n"; | ||
|
||
return 0; | ||
} |
Oops, something went wrong.