Skip to content

Commit

Permalink
Add libcudf example with large strings (#15983)
Browse files Browse the repository at this point in the history
Creating an example that shows reading large strings columns. This uses the 1 billion row challenge input data and provides three examples of loading this data: 
- `brc` uses the CSV reader to load the input file in one call and aggregates the results using `groupby`
- `brc_chunks` uses the CSV reader to load the input file in chunks, aggregates each chunk, and computes the results
- `brc_pipeline` same as `brc_chunks` but input chunks are processed in separate threads/streams.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Gregory Kimball (https://github.com/GregoryKimball)
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #15983
  • Loading branch information
davidwendt authored Sep 5, 2024
1 parent 0e86f62 commit 715677e
Show file tree
Hide file tree
Showing 9 changed files with 674 additions and 0 deletions.
34 changes: 34 additions & 0 deletions cpp/examples/billion_rows/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Build script for the billion_rows libcudf examples. It defines one object
# library (groupby_results) and three executables (brc, brc_chunks,
# brc_pipeline) that each link against cudf::cudf and reuse that object library.

cmake_minimum_required(VERSION 3.26.4)

# NOTE(review): presumably provides the rapids_cuda_* commands used below -- confirm
include(../set_cuda_architecture.cmake)

# initialize cuda architecture
rapids_cuda_init_architectures(billion_rows)
rapids_cuda_set_architectures(RAPIDS)

# Both C++ and CUDA are enabled for this project
project(
billion_rows
VERSION 0.0.1
LANGUAGES CXX CUDA
)

# NOTE(review): presumably locates or fetches libcudf and defines the cudf::cudf
# target linked below -- confirm
include(../fetch_dependencies.cmake)

# Extra CUDA compiler flags appended to CUDF_CUDA_FLAGS.
# NOTE(review): no target in this file references CUDF_CUDA_FLAGS -- confirm
# where (or whether) the variable is consumed.
list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)

# Helper code shared by all three examples, compiled once as an object library
add_library(groupby_results OBJECT groupby_results.cpp)
target_link_libraries(groupby_results PRIVATE cudf::cudf)

# brc: built from brc.cpp plus the shared groupby_results objects
add_executable(brc brc.cpp)
target_link_libraries(brc PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>)
install(TARGETS brc DESTINATION bin/examples/libcudf)

# brc_chunks: built from brc_chunks.cpp plus the shared groupby_results objects
add_executable(brc_chunks brc_chunks.cpp)
target_link_libraries(brc_chunks PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>)
install(TARGETS brc_chunks DESTINATION bin/examples/libcudf)

# brc_pipeline: built from brc_pipeline.cpp plus the shared groupby_results objects
add_executable(brc_pipeline brc_pipeline.cpp)
target_link_libraries(brc_pipeline PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>)
install(TARGETS brc_pipeline DESTINATION bin/examples/libcudf)
44 changes: 44 additions & 0 deletions cpp/examples/billion_rows/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# libcudf C++ example for the 1 billion row challenge

This C++ example demonstrates using libcudf APIs to read and process
a table with 1 billion rows. The 1 billion row challenge is described here:
https://github.com/gunnarmorling/1brc

The examples load the 1 billion row text file using the CSV reader.
The file contains around 400 unique city names (string type) along with
random temperature values (float type).
Once loaded, the examples perform groupby aggregations to find the
minimum, maximum, and average temperature for each city.

There are three examples included:
1. `brc.cpp`
Loads the file in one call to the CSV reader.
This generally requires a large amount of available GPU memory.
2. `brc_chunks.cpp`
Loads and processes the file in chunks.
The number of chunks to use is a parameter to the executable.
3. `brc_pipeline.cpp`
Loads and processes the file in chunks with separate threads/streams.
The number of chunks and number of threads to use are parameters to the executable.

An input file can be generated using the instructions from
https://github.com/gunnarmorling/1brc.

## Compile and execute

```bash
# Configure project
cmake -S . -B build/
# Build
cmake --build build/ --parallel $PARALLEL_LEVEL
# Execute
build/brc input.txt
# Execute in chunked mode with 25 chunks (default)
build/brc_chunks input.txt 25
# Execute in pipeline mode with 25 chunks and 2 threads (defaults)
build/brc_pipeline input.txt 25 2
```

If your machine does not come with a pre-built libcudf binary, expect the
first build to take some time, as it will build libcudf on the host machine.
The build may be sped up by setting an appropriate `PARALLEL_LEVEL` value.
94 changes: 94 additions & 0 deletions cpp/examples/billion_rows/brc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common.hpp"
#include "groupby_results.hpp"

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/io/csv.hpp>
#include <cudf/io/types.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/mr/device/statistics_resource_adaptor.hpp>

#include <chrono>
#include <iostream>
#include <memory>
#include <string>

using elapsed_t = std::chrono::duration<double>;

int main(int argc, char const** argv)
{
if (argc < 2) {
std::cout << "required parameter: input-file-path\n";
return 1;
}

auto const input_file = std::string{argv[1]};
std::cout << "Input: " << input_file << std::endl;

auto const mr_name = std::string("pool");
auto resource = create_memory_resource(mr_name);
auto stats_mr =
rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get());
rmm::mr::set_current_device_resource(&stats_mr);
auto stream = cudf::get_default_stream();

auto start = std::chrono::steady_clock::now();

auto const csv_result = [input_file, stream] {
cudf::io::csv_reader_options in_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{input_file})
.header(-1)
.delimiter(';')
.doublequote(false)
.dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::STRING},
cudf::data_type{cudf::type_id::FLOAT32}})
.na_filter(false);
return cudf::io::read_csv(in_opts, stream).tbl;
}();
elapsed_t elapsed = std::chrono::steady_clock::now() - start;
std::cout << "File load time: " << elapsed.count() << " seconds\n";
auto const csv_table = csv_result->view();
std::cout << "Input rows: " << csv_table.num_rows() << std::endl;

auto const cities = csv_table.column(0);
auto const temps = csv_table.column(1);

std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
aggregations.emplace_back(cudf::make_mean_aggregation<cudf::groupby_aggregation>());

auto result = compute_results(cities, temps, std::move(aggregations), stream);

// The other 2 examples employ sorting for the sub-aggregates so enabling
// the following line may be more comparable in performance with them.
//
// result = cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream);

stream.synchronize();

elapsed = std::chrono::steady_clock::now() - start;
std::cout << "Number of keys: " << result->num_rows() << std::endl;
std::cout << "Process time: " << elapsed.count() << " seconds\n";
std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n";

return 0;
}
116 changes: 116 additions & 0 deletions cpp/examples/billion_rows/brc_chunks.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common.hpp"
#include "groupby_results.hpp"

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/io/csv.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/mr/device/statistics_resource_adaptor.hpp>

#include <algorithm>
#include <chrono>
#include <exception>
#include <filesystem>
#include <iostream>
#include <memory>
#include <string>
#include <system_error>
#include <vector>

using elapsed_t = std::chrono::duration<double>;

std::unique_ptr<cudf::table> load_chunk(std::string const& input_file,
std::size_t start,
std::size_t size,
rmm::cuda_stream_view stream)
{
cudf::io::csv_reader_options in_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{input_file})
.header(-1)
.delimiter(';')
.doublequote(false)
.byte_range_offset(start)
.byte_range_size(size)
.dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::STRING},
cudf::data_type{cudf::type_id::FLOAT32}})
.na_filter(false);
return cudf::io::read_csv(in_opts, stream).tbl;
}

/**
 * @brief Loads the input file in chunks, aggregates each chunk, then combines
 * the per-chunk aggregates into the final result.
 *
 * The file is split into `chunk-count` byte ranges of equal size (the last
 * one may be shorter). Each range is read with `load_chunk`, reduced with
 * min, max, sum and count aggregations via `compute_results`, and sorted by
 * key. The sorted sub-aggregates are then merged by
 * `compute_final_aggregates`. The number of result keys, the elapsed time and
 * the peak byte count reported by the statistics adaptor are printed to stdout.
 *
 * @param argc Number of command-line arguments
 * @param argv Command-line arguments; argv[1] is the input file path and the
 *             optional argv[2] is the chunk count (default 25)
 * @return 0 on success; 1 if the input path is missing, the chunk count is
 *         not a positive integer, the file size cannot be determined, or no
 *         rows were read from the input
 */
int main(int argc, char const** argv)
{
  if (argc < 2) {
    std::cout << "required parameter: input-file-path\n";
    std::cout << "optional parameter: chunk-count\n";
    return 1;
  }

  auto const input_file = std::string{argv[1]};

  // The chunk count is used as a divisor below, so zero, negative and
  // non-numeric values are rejected here instead of dividing by them.
  int divider = 25;
  if (argc >= 3) {
    try {
      divider = std::stoi(std::string(argv[2]));
    } catch (std::exception const&) {
      divider = 0;  // std::stoi throws on non-numeric or out-of-range text
    }
    if (divider < 1) {
      std::cout << "chunk-count must be a positive integer\n";
      return 1;
    }
  }

  std::cout << "Input: " << input_file << std::endl;
  std::cout << "Chunks: " << divider << std::endl;

  // The error_code overload reports a missing/unreadable file without throwing
  std::error_code size_error;
  auto const file_size = std::filesystem::file_size(std::filesystem::path{input_file}, size_error);
  if (size_error) {
    std::cout << "unable to determine size of " << input_file << ": " << size_error.message()
              << "\n";
    return 1;
  }

  // Wrap the "pool" resource in a statistics adaptor so peak usage can be reported
  auto const mr_name = std::string("pool");
  auto resource      = create_memory_resource(mr_name);
  auto stats_mr =
    rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get());
  rmm::mr::set_current_device_resource(&stats_mr);
  auto stream = cudf::get_default_stream();

  auto start = std::chrono::steady_clock::now();

  // Ceiling division: chunk_size is at least 1 whenever the file is non-empty,
  // so the loop below always advances.
  auto const chunk_count       = static_cast<std::size_t>(divider);
  std::size_t const chunk_size = file_size / chunk_count + ((file_size % chunk_count) != 0);

  std::vector<std::unique_ptr<cudf::table>> agg_data;
  for (std::size_t start_pos = 0; start_pos < file_size; start_pos += chunk_size) {
    // The final chunk may be shorter than the others
    auto const read_size   = std::min<std::size_t>(chunk_size, file_size - start_pos);
    auto const input_table = load_chunk(input_file, start_pos, read_size, stream);

    // A byte range can yield no rows (for example when it is shorter than a
    // single row); skip it rather than abandoning the rest of the file.
    if (input_table->num_rows() == 0) { continue; }

    auto const cities = input_table->view().column(0);
    auto const temps  = input_table->view().column(1);

    // Sum and count are collected per chunk (rather than mean) for the final merge
    std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
    aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
    aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
    aggregations.emplace_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
    aggregations.emplace_back(cudf::make_count_aggregation<cudf::groupby_aggregation>());
    auto result = compute_results(cities, temps, std::move(aggregations), stream);

    agg_data.emplace_back(
      cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream));
  }

  // compute_final_aggregates is defined elsewhere; avoid handing it an empty
  // list when the input produced no rows at all.
  if (agg_data.empty()) {
    std::cout << "no rows were read from " << input_file << "\n";
    return 1;
  }

  // now aggregate the aggregate results
  auto results = compute_final_aggregates(agg_data, stream);
  stream.synchronize();

  elapsed_t elapsed = std::chrono::steady_clock::now() - start;
  std::cout << "Number of keys: " << results->num_rows() << std::endl;
  std::cout << "Process time: " << elapsed.count() << " seconds\n";
  std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n";

  return 0;
}
Loading

0 comments on commit 715677e

Please sign in to comment.