From 305182e58c19add98a5abd6a5b00d9b266f41732 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 22 Nov 2024 08:45:32 -0600 Subject: [PATCH 01/23] Enable unified memory by default in `cudf_polars` (#17375) This PR enables Unified memory as the default memory resource for `cudf_polars` --------- Co-authored-by: Vyas Ramasubramani Co-authored-by: Vyas Ramasubramani Co-authored-by: Matthew Murray <41342305+Matt711@users.noreply.github.com> Co-authored-by: Lawrence Mitchell Co-authored-by: Matthew Murray --- .../cudf/source/cudf_polars/engine_options.md | 7 +++ docs/cudf/source/cudf_polars/index.rst | 6 ++ python/cudf_polars/cudf_polars/callback.py | 56 +++++++++++++++++-- python/cudf_polars/tests/test_config.py | 20 +++++++ 4 files changed, 84 insertions(+), 5 deletions(-) diff --git a/docs/cudf/source/cudf_polars/engine_options.md b/docs/cudf/source/cudf_polars/engine_options.md index 4c930c7392d..afb2bb6e8b9 100644 --- a/docs/cudf/source/cudf_polars/engine_options.md +++ b/docs/cudf/source/cudf_polars/engine_options.md @@ -23,3 +23,10 @@ engine = GPUEngine( result = query.collect(engine=engine) ``` Note that passing `chunked: False` disables chunked reading entirely, and thus `chunk_read_limit` and `pass_read_limit` will have no effect. + +## Disabling CUDA Managed Memory + +By default, `cudf_polars` uses [CUDA managed memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#unified-memory-introduction) with RMM's pool allocator. On systems that don't support managed memory, a non-managed asynchronous pool +allocator is used. +Managed memory can be turned off by setting `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` to `0`. System requirements for managed memory can be found [here]( +https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory). 
diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst index 6fd98a6b5da..a9b4bb2dff2 100644 --- a/docs/cudf/source/cudf_polars/index.rst +++ b/docs/cudf/source/cudf_polars/index.rst @@ -9,6 +9,12 @@ and run on the CPU. Benchmark --------- + +.. note:: + The following benchmarks were performed with the `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` environment variable set to `"0"`. + Using managed memory (the default) imposes a performance cost in order to avoid out of memory errors. + Peak performance can still be attained by setting the environment variable to 0, which disables managed memory. + We reproduced the `Polars Decision Support (PDS) `__ benchmark to compare Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results: .. figure:: ../_static/pds_benchmark_polars.png diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 7915c9e6b18..8dc5715195d 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -15,6 +15,7 @@ from polars.exceptions import ComputeError, PerformanceWarning +import pylibcudf import rmm from rmm._cuda import gpu @@ -32,8 +33,26 @@ __all__: list[str] = ["execute_with_cudf"] +_SUPPORTED_PREFETCHES = { + "column_view::get_data", + "mutable_column_view::get_data", + "gather", + "hash_join", +} + + +def _env_get_int(name, default): + try: + return int(os.getenv(name, default)) + except (ValueError, TypeError): # pragma: no cover + return default # pragma: no cover + + @cache -def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: +def default_memory_resource( + device: int, + cuda_managed_memory: bool, # noqa: FBT001 +) -> rmm.mr.DeviceMemoryResource: """ Return the default memory resource for cudf-polars. @@ -42,15 +61,35 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: device Disambiguating device id when selecting the device. 
Must be the active device when this function is called. + cuda_managed_memory + Whether to use managed memory or not. Returns ------- rmm.mr.DeviceMemoryResource The default memory resource that cudf-polars uses. Currently - an async pool resource. + a managed memory resource, if `cuda_managed_memory` is `True`. + else, an async pool resource is returned. """ try: - return rmm.mr.CudaAsyncMemoryResource() + if ( + cuda_managed_memory + and pylibcudf.utils._is_concurrent_managed_access_supported() + ): + # Allocating 80% of the available memory for the pool. + # Leaving a 20% headroom to avoid OOM errors. + free_memory, _ = rmm.mr.available_device_memory() + free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) + for key in _SUPPORTED_PREFETCHES: + pylibcudf.experimental.enable_prefetching(key) + mr = rmm.mr.PrefetchResourceAdaptor( + rmm.mr.PoolMemoryResource( + rmm.mr.ManagedMemoryResource(), + initial_pool_size=free_memory, + ) + ) + else: + mr = rmm.mr.CudaAsyncMemoryResource() except RuntimeError as e: # pragma: no cover msg, *_ = e.args if ( @@ -64,6 +103,8 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: ) from None else: raise + else: + return mr @contextlib.contextmanager @@ -89,10 +130,15 @@ def set_memory_resource( at entry. If a memory resource is provided, it must be valid to use with the currently active device. 
""" + previous = rmm.mr.get_current_device_resource() if mr is None: device: int = gpu.getDevice() - mr = default_memory_resource(device) - previous = rmm.mr.get_current_device_resource() + mr = default_memory_resource( + device=device, + cuda_managed_memory=bool( + _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0 + ), + ) rmm.mr.set_current_device_resource(mr) try: yield mr diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 25b71716eed..52c5c9894fe 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -10,6 +10,7 @@ import rmm +from cudf_polars.callback import default_memory_resource from cudf_polars.dsl.ir import DataFrameScan from cudf_polars.testing.asserts import ( assert_gpu_result_equal, @@ -58,6 +59,25 @@ def test_invalid_memory_resource_raises(mr): q.collect(engine=pl.GPUEngine(memory_resource=mr)) +@pytest.mark.parametrize("disable_managed_memory", ["1", "0"]) +def test_cudf_polars_enable_disable_managed_memory(monkeypatch, disable_managed_memory): + q = pl.LazyFrame({"a": [1, 2, 3]}) + + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv( + "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", disable_managed_memory + ) + result = q.collect(engine=pl.GPUEngine()) + mr = default_memory_resource(0, bool(disable_managed_memory == "1")) + if disable_managed_memory == "1": + assert isinstance(mr, rmm.mr.PrefetchResourceAdaptor) + assert isinstance(mr.upstream_mr, rmm.mr.PoolMemoryResource) + else: + assert isinstance(mr, rmm.mr.CudaAsyncMemoryResource) + monkeycontext.delenv("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY") + assert_frame_equal(q.collect(), result) + + def test_explicit_device_zero(): q = pl.LazyFrame({"a": [1, 2, 3]}) From 53e452539070fe86f68b543fa7237816d657a01a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 22 Nov 2024 11:01:36 -0800 Subject: [PATCH 02/23] Add 
write_parquet to pylibcudf (#17263) Broken off from https://github.com/rapidsai/cudf/pull/17252 since also replacing cudf Python's `write_parquet` usage would have made the PR fairly large. Authors: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Matthew Murray (https://github.com/Matt711) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17263 --- .../pylibcudf/pylibcudf/contiguous_split.pxd | 13 + .../pylibcudf/pylibcudf/contiguous_split.pyx | 13 +- python/pylibcudf/pylibcudf/io/parquet.pxd | 62 ++- python/pylibcudf/pylibcudf/io/parquet.pyi | 46 ++- python/pylibcudf/pylibcudf/io/parquet.pyx | 364 +++++++++++++++++- python/pylibcudf/pylibcudf/io/types.pxd | 39 ++ python/pylibcudf/pylibcudf/io/types.pyi | 19 + python/pylibcudf/pylibcudf/io/types.pyx | 227 +++++++++++ .../pylibcudf/libcudf/io/parquet.pxd | 6 +- .../pylibcudf/tests/io/test_parquet.py | 69 ++++ 10 files changed, 840 insertions(+), 18 deletions(-) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd index 3745e893c3e..14ad84709d5 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd @@ -1,12 +1,25 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from libc.stdint cimport uint8_t from libcpp.memory cimport unique_ptr +from libcpp.vector cimport vector from pylibcudf.libcudf.contiguous_split cimport packed_columns from .gpumemoryview cimport gpumemoryview from .table cimport Table +cdef class HostBuffer: + cdef unique_ptr[vector[uint8_t]] c_obj + cdef size_t nbytes + cdef Py_ssize_t[1] shape + cdef Py_ssize_t[1] strides + + @staticmethod + cdef HostBuffer from_unique_ptr( + unique_ptr[vector[uint8_t]] vec + ) + cdef class PackedColumns: cdef unique_ptr[packed_columns] c_obj diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 2a40d42e6e9..b24b7a21af1 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -29,18 +29,14 @@ __all__ = [ cdef class HostBuffer: """Owning host buffer that implements the buffer protocol""" - cdef unique_ptr[vector[uint8_t]] c_obj - cdef size_t nbytes - cdef Py_ssize_t[1] shape - cdef Py_ssize_t[1] strides - @staticmethod cdef HostBuffer from_unique_ptr( unique_ptr[vector[uint8_t]] vec ): - cdef HostBuffer out = HostBuffer() + cdef HostBuffer out = HostBuffer.__new__(HostBuffer) + # Allow construction from nullptr + out.nbytes = 0 if vec.get() == NULL else dereference(vec).size() out.c_obj = move(vec) - out.nbytes = dereference(out.c_obj).size() out.shape[0] = out.nbytes out.strides[0] = 1 return out @@ -48,7 +44,8 @@ cdef class HostBuffer: __hash__ = None def __getbuffer__(self, Py_buffer *buffer, int flags): - buffer.buf = dereference(self.c_obj).data() + # Empty vec produces empty buffer + buffer.buf = NULL if self.nbytes == 0 else dereference(self.c_obj).data() buffer.format = NULL # byte buffer.internal = NULL buffer.itemsize = 1 diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 9c476030ded..1a61c20d783 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ 
b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -1,14 +1,26 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from libc.stdint cimport int64_t +from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.vector cimport vector from pylibcudf.expressions cimport Expression -from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.io.types cimport ( + compression_type, + dictionary_policy, + statistics_freq, + SinkInfo, + SourceInfo, + TableInputMetadata, + TableWithMetadata, +) from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_reader as cpp_chunked_parquet_reader, + parquet_writer_options, + parquet_writer_options_builder, ) from pylibcudf.libcudf.types cimport size_type +from pylibcudf.table cimport Table from pylibcudf.types cimport DataType @@ -33,3 +45,49 @@ cpdef read_parquet( # ReaderColumnSchema reader_column_schema = *, # DataType timestamp_type = * ) + +cdef class ParquetWriterOptions: + cdef parquet_writer_options c_obj + cdef Table table_ref + cdef SinkInfo sink_ref + + cpdef void set_partitions(self, list partitions) + + cpdef void set_column_chunks_file_paths(self, list file_paths) + + cpdef void set_row_group_size_bytes(self, size_t size_bytes) + + cpdef void set_row_group_size_rows(self, size_type size_rows) + + cpdef void set_max_page_size_bytes(self, size_t size_bytes) + + cpdef void set_max_page_size_rows(self, size_type size_rows) + + cpdef void set_max_dictionary_size(self, size_t size_bytes) + +cdef class ParquetWriterOptionsBuilder: + cdef parquet_writer_options_builder c_obj + cdef Table table_ref + cdef SinkInfo sink_ref + + cpdef ParquetWriterOptionsBuilder metadata(self, TableInputMetadata metadata) + + cpdef ParquetWriterOptionsBuilder key_value_metadata(self, list metadata) + + cpdef ParquetWriterOptionsBuilder compression(self, compression_type compression) + + cpdef ParquetWriterOptionsBuilder stats_level(self, statistics_freq sf) + + cpdef 
ParquetWriterOptionsBuilder int96_timestamps(self, bool enabled) + + cpdef ParquetWriterOptionsBuilder write_v2_headers(self, bool enabled) + + cpdef ParquetWriterOptionsBuilder dictionary_policy(self, dictionary_policy val) + + cpdef ParquetWriterOptionsBuilder utc_timestamps(self, bool enabled) + + cpdef ParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled) + + cpdef ParquetWriterOptions build(self) + +cpdef memoryview write_parquet(ParquetWriterOptions options) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi index bcf1d1cce09..eb2ca68109b 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyi +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -1,7 +1,20 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from collections.abc import Mapping +from typing import Self + from pylibcudf.expressions import Expression -from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.io.types import ( + CompressionType, + DictionaryPolicy, + PartitionInfo, + SinkInfo, + SourceInfo, + StatisticsFreq, + TableInputMetadata, + TableWithMetadata, +) +from pylibcudf.table import Table class ChunkedParquetReader: def __init__( @@ -34,3 +47,34 @@ def read_parquet( # reader_column_schema: ReaderColumnSchema = *, # timestamp_type: DataType = * ) -> TableWithMetadata: ... + +class ParquetWriterOptions: + def __init__(self): ... + @staticmethod + def builder( + sink: SinkInfo, table: Table + ) -> ParquetWriterOptionsBuilder: ... + def set_partitions(self, partitions: list[PartitionInfo]) -> None: ... + def set_column_chunks_file_paths(self, file_paths: list[str]) -> None: ... + def set_row_group_size_bytes(self, size_bytes: int) -> None: ... + def set_row_group_size_rows(self, size_rows: int) -> None: ... + def set_max_page_size_bytes(self, size_bytes: int) -> None: ... + def set_max_page_size_rows(self, size_rows: int) -> None: ... + def set_max_dictionary_size(self, size_bytes: int) -> None: ... 
+ +class ParquetWriterOptionsBuilder: + def __init__(self): ... + def metadata(self, metadata: TableInputMetadata) -> Self: ... + def key_value_metadata( + self, metadata: list[Mapping[str, str]] + ) -> Self: ... + def compression(self, compression: CompressionType) -> Self: ... + def stats_level(self, sf: StatisticsFreq) -> Self: ... + def int96_timestamps(self, enabled: bool) -> Self: ... + def write_v2_headers(self, enabled: bool) -> Self: ... + def dictionary_policy(self, val: DictionaryPolicy) -> Self: ... + def utc_timestamps(self, enabled: bool) -> Self: ... + def write_arrow_schema(self, enabled: bool) -> Self: ... + def build(self) -> ParquetWriterOptions: ... + +def write_parquet(options: ParquetWriterOptions) -> memoryview: ... diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index b76a352d633..b95b1f39de1 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -1,22 +1,45 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from cython.operator cimport dereference -from libc.stdint cimport int64_t +from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool +from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector +from pylibcudf.contiguous_split cimport HostBuffer from pylibcudf.expressions cimport Expression -from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.io.types cimport ( + SinkInfo, + SourceInfo, + PartitionInfo, + TableInputMetadata, + TableWithMetadata +) from pylibcudf.libcudf.expressions cimport expression from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_reader as cpp_chunked_parquet_reader, parquet_reader_options, read_parquet as cpp_read_parquet, + write_parquet as cpp_write_parquet, + parquet_writer_options, +) +from pylibcudf.libcudf.io.types cimport ( + compression_type, + dictionary_policy as dictionary_policy_t, + partition_info, + statistics_freq, + table_with_metadata, ) -from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type +from pylibcudf.table cimport Table -__all__ = ["ChunkedParquetReader", "read_parquet"] +__all__ = [ + "ChunkedParquetReader", + "ParquetWriterOptions", + "ParquetWriterOptionsBuilder", + "read_parquet", + "write_parquet" +] cdef parquet_reader_options _setup_parquet_reader_options( @@ -221,3 +244,336 @@ cpdef read_parquet( c_result = move(cpp_read_parquet(opts)) return TableWithMetadata.from_libcudf(c_result) + + +cdef class ParquetWriterOptions: + + @staticmethod + def builder(SinkInfo sink, Table table): + """ + Create builder to create ParquetWriterOptionsBuilder. 
+ + Parameters + ---------- + sink : SinkInfo + The sink used for writer output + + table : Table + Table to be written to output + + Returns + ------- + ParquetWriterOptionsBuilder + """ + cdef ParquetWriterOptionsBuilder bldr = ParquetWriterOptionsBuilder.__new__( + ParquetWriterOptionsBuilder + ) + bldr.c_obj = parquet_writer_options.builder(sink.c_obj, table.view()) + bldr.table_ref = table + bldr.sink_ref = sink + return bldr + + cpdef void set_partitions(self, list partitions): + """ + Sets partitions. + + Parameters + ---------- + partitions : list[Partitions] + Partitions of input table in {start_row, num_rows} pairs. + + Returns + ------- + None + """ + cdef vector[partition_info] c_partions + cdef PartitionInfo partition + + c_partions.reserve(len(partitions)) + for partition in partitions: + c_partions.push_back(partition.c_obj) + + self.c_obj.set_partitions(c_partions) + + cpdef void set_column_chunks_file_paths(self, list file_paths): + """ + Sets column chunks file path to be set in the raw output metadata. + + Parameters + ---------- + file_paths : list[str] + Vector of strings which indicate file paths. + + Returns + ------- + None + """ + self.c_obj.set_column_chunks_file_paths([fp.encode() for fp in file_paths]) + + cpdef void set_row_group_size_bytes(self, size_t size_bytes): + """ + Sets the maximum row group size, in bytes. + + Parameters + ---------- + size_bytes : int + Maximum row group size, in bytes to set + + Returns + ------- + None + """ + self.c_obj.set_row_group_size_bytes(size_bytes) + + cpdef void set_row_group_size_rows(self, size_type size_rows): + """ + Sets the maximum row group size, in rows. + + Parameters + ---------- + size_rows : int + Maximum row group size, in rows to set + + Returns + ------- + None + """ + self.c_obj.set_row_group_size_rows(size_rows) + + cpdef void set_max_page_size_bytes(self, size_t size_bytes): + """ + Sets the maximum uncompressed page size, in bytes. 
+ + Parameters + ---------- + size_bytes : int + Maximum uncompressed page size, in bytes to set + + Returns + ------- + None + """ + self.c_obj.set_max_page_size_bytes(size_bytes) + + cpdef void set_max_page_size_rows(self, size_type size_rows): + """ + Sets the maximum page size, in rows. + + Parameters + ---------- + size_rows : int + Maximum page size, in rows to set. + + Returns + ------- + None + """ + self.c_obj.set_max_page_size_rows(size_rows) + + cpdef void set_max_dictionary_size(self, size_t size_bytes): + """ + Sets the maximum dictionary size, in bytes. + + Parameters + ---------- + size_bytes : int + Sets the maximum dictionary size, in bytes. + + Returns + ------- + None + """ + self.c_obj.set_max_dictionary_size(size_bytes) + + +cdef class ParquetWriterOptionsBuilder: + + cpdef ParquetWriterOptionsBuilder metadata(self, TableInputMetadata metadata): + """ + Sets metadata. + + Parameters + ---------- + metadata : TableInputMetadata + Associated metadata + + Returns + ------- + Self + """ + self.c_obj.metadata(metadata.c_obj) + return self + + cpdef ParquetWriterOptionsBuilder key_value_metadata(self, list metadata): + """ + Sets Key-Value footer metadata. + + Parameters + ---------- + metadata : list[dict[str, str]] + Key-Value footer metadata + + Returns + ------- + Self + """ + self.c_obj.key_value_metadata( + [ + {key.encode(): value.encode() for key, value in mapping.items()} + for mapping in metadata + ] + ) + return self + + cpdef ParquetWriterOptionsBuilder compression(self, compression_type compression): + """ + Sets compression type. + + Parameters + ---------- + compression : CompressionType + The compression type to use + + Returns + ------- + Self + """ + self.c_obj.compression(compression) + return self + + cpdef ParquetWriterOptionsBuilder stats_level(self, statistics_freq sf): + """ + Sets the level of statistics. 
+ + Parameters + ---------- + sf : StatisticsFreq + Level of statistics requested in the output file + + Returns + ------- + Self + """ + self.c_obj.stats_level(sf) + return self + + cpdef ParquetWriterOptionsBuilder int96_timestamps(self, bool enabled): + """ + Sets whether timestamps are written as int96 or timestamp micros. + + Parameters + ---------- + enabled : bool + Boolean value to enable/disable int96 timestamps + + Returns + ------- + Self + """ + self.c_obj.int96_timestamps(enabled) + return self + + cpdef ParquetWriterOptionsBuilder write_v2_headers(self, bool enabled): + """ + Set to true to write V2 page headers, otherwise false to write V1 page headers. + + Parameters + ---------- + enabled : bool + Boolean value to enable/disable writing of V2 page headers. + + Returns + ------- + Self + """ + self.c_obj.write_v2_headers(enabled) + return self + + cpdef ParquetWriterOptionsBuilder dictionary_policy(self, dictionary_policy_t val): + """ + Sets the policy for dictionary use. + + Parameters + ---------- + val : DictionaryPolicy + Policy for dictionary use. + + Returns + ------- + Self + """ + self.c_obj.dictionary_policy(val) + return self + + cpdef ParquetWriterOptionsBuilder utc_timestamps(self, bool enabled): + """ + Set to true if timestamps are to be written as UTC. + + Parameters + ---------- + enabled : bool + Boolean value to enable/disable writing of timestamps as UTC. + + Returns + ------- + Self + """ + self.c_obj.utc_timestamps(enabled) + return self + + cpdef ParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled): + """ + Set to true if arrow schema is to be written. + + Parameters + ---------- + enabled : bool + Boolean value to enable/disable writing of arrow schema. + + Returns + ------- + Self + """ + self.c_obj.write_arrow_schema(enabled) + return self + + cpdef ParquetWriterOptions build(self): + """ + Create a ParquetWriterOptions from the set options. 
+ + Returns + ------- + ParquetWriterOptions + """ + cdef ParquetWriterOptions parquet_options = ParquetWriterOptions.__new__( + ParquetWriterOptions + ) + parquet_options.c_obj = move(self.c_obj.build()) + parquet_options.table_ref = self.table_ref + parquet_options.sink_ref = self.sink_ref + return parquet_options + + +cpdef memoryview write_parquet(ParquetWriterOptions options): + """ + Writes a set of columns to parquet format. + + Parameters + ---------- + options : ParquetWriterOptions + Settings for controlling writing behavior + + Returns + ------- + memoryview + A blob that contains the file metadata + (parquet FileMetadata thrift message) if requested in + parquet_writer_options (empty blob otherwise). + """ + cdef parquet_writer_options c_options = options.c_obj + cdef unique_ptr[vector[uint8_t]] c_result + + with nogil: + c_result = cpp_write_parquet(c_options) + + return memoryview(HostBuffer.from_unique_ptr(move(c_result))) diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index 0ab28cb0973..90b43cf0ff5 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -1,4 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from libc.stdint cimport uint8_t, int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from pylibcudf.libcudf.io.data_sink cimport data_sink @@ -18,9 +20,46 @@ from pylibcudf.libcudf.io.types cimport ( table_metadata, table_with_metadata, ) +from pylibcudf.libcudf.types cimport size_type from pylibcudf.table cimport Table +cdef class PartitionInfo: + cdef partition_info c_obj + +cdef class ColumnInMetadata: + cdef column_in_metadata c_obj + + @staticmethod + cdef ColumnInMetadata from_metadata(column_in_metadata metadata) + + cpdef ColumnInMetadata set_name(self, str name) + + cpdef ColumnInMetadata set_name(self, str name) + + cpdef ColumnInMetadata set_nullability(self, bool nullable) + + cpdef ColumnInMetadata set_list_column_as_map(self) + + cpdef ColumnInMetadata set_int96_timestamps(self, bool req) + + cpdef ColumnInMetadata set_decimal_precision(self, uint8_t req) + + cpdef ColumnInMetadata child(self, size_type i) + + cpdef ColumnInMetadata set_output_as_binary(self, bool binary) + + cpdef ColumnInMetadata set_type_length(self, int32_t type_length) + + cpdef ColumnInMetadata set_skip_compression(self, bool skip) + + cpdef ColumnInMetadata set_encoding(self, column_encoding encoding) + + cpdef str get_name(self) + +cdef class TableInputMetadata: + cdef table_input_metadata c_obj + cdef class TableWithMetadata: cdef public Table tbl cdef table_metadata metadata diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi index a4f4fc13bdc..04f276cfeee 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyi +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -59,6 +59,25 @@ class QuoteStyle(IntEnum): ColumnNameSpec: TypeAlias = tuple[str, list[ColumnNameSpec]] ChildNameSpec: TypeAlias = Mapping[str, ChildNameSpec] +class PartitionInfo: + def __init__(self, start_row: int, num_rows: int): ... + +class TableInputMetadata: + def __init__(self, table: Table): ... 
+ +class ColumnInMetadata: + def set_name(self, name: str) -> ColumnInMetadata: ... + def set_nullability(self, nullable: bool) -> ColumnInMetadata: ... + def set_list_column_as_map(self) -> ColumnInMetadata: ... + def set_int96_timestamps(self, req: bool) -> ColumnInMetadata: ... + def set_decimal_precision(self, precision: int) -> ColumnInMetadata: ... + def child(self, i: int) -> ColumnInMetadata: ... + def set_output_as_binary(self, binary: bool) -> ColumnInMetadata: ... + def set_type_length(self, type_length: int) -> ColumnInMetadata: ... + def set_skip_compression(self, skip: bool) -> ColumnInMetadata: ... + def set_encoding(self, encoding: ColumnEncoding) -> ColumnInMetadata: ... + def get_name(self) -> str: ... + class TableWithMetadata: tbl: Table def __init__( diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 51d5bda75c7..460ab6844c3 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -2,6 +2,8 @@ from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory +from libc.stdint cimport uint8_t, int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -10,11 +12,16 @@ from pylibcudf.io.datasource cimport Datasource from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.datasource cimport datasource from pylibcudf.libcudf.io.types cimport ( + column_encoding, + column_in_metadata, column_name_info, host_buffer, + partition_info, source_info, + table_input_metadata, table_with_metadata, ) +from pylibcudf.libcudf.types cimport size_type import codecs import errno @@ -37,13 +44,233 @@ __all__ = [ "CompressionType", "DictionaryPolicy", "JSONRecoveryMode", + "PartitionInfo", "QuoteStyle", "SinkInfo", "SourceInfo", "StatisticsFreq", + "TableInputMetadata", "TableWithMetadata", ] +cdef class PartitionInfo: + """ + Information 
used while writing partitioned datasets. + + Parameters + ---------- + start_row : int + The start row of the partition. + + num_rows : int + The number of rows in the partition. + """ + def __init__(self, size_type start_row, size_type num_rows): + self.c_obj = partition_info(start_row, num_rows) + + +cdef class ColumnInMetadata: + """ + Metadata for a column + """ + + @staticmethod + cdef ColumnInMetadata from_metadata(column_in_metadata metadata): + """ + Construct a ColumnInMetadata. + + Parameters + ---------- + metadata : column_in_metadata + """ + cdef ColumnInMetadata col_metadata = ColumnInMetadata.__new__(ColumnInMetadata) + col_metadata.c_obj = metadata + return col_metadata + + cpdef ColumnInMetadata set_name(self, str name): + """ + Set the name of this column. + + Parameters + ---------- + name : str + Name of the column + + Returns + ------- + Self + """ + self.c_obj.set_name(name.encode()) + return self + + cpdef ColumnInMetadata set_nullability(self, bool nullable): + """ + Set the nullability of this column. + + Parameters + ---------- + nullable : bool + Whether this column is nullable + + Returns + ------- + Self + """ + self.c_obj.set_nullability(nullable) + return self + + cpdef ColumnInMetadata set_list_column_as_map(self): + """ + Specify that this list column should be encoded as a map in the + written file. + + Returns + ------- + Self + """ + self.c_obj.set_list_column_as_map() + return self + + cpdef ColumnInMetadata set_int96_timestamps(self, bool req): + """ + Specifies whether this timestamp column should be encoded using + the deprecated int96. + + Parameters + ---------- + req : bool + True = use int96 physical type. False = use int64 physical type. + + Returns + ------- + Self + """ + self.c_obj.set_int96_timestamps(req) + return self + + cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision): + """ + Set the decimal precision of this column. + Only valid if this column is a decimal (fixed-point) type. 
+ + Parameters + ---------- + precision : int + The integer precision to set for this decimal column + + Returns + ------- + Self + """ + self.c_obj.set_decimal_precision(precision) + return self + + cpdef ColumnInMetadata child(self, size_type i): + """ + Get reference to a child of this column. + + Parameters + ---------- + i : int + Index of the child to get. + + Returns + ------- + ColumnInMetadata + """ + return ColumnInMetadata.from_metadata(self.c_obj.child(i)) + + cpdef ColumnInMetadata set_output_as_binary(self, bool binary): + """ + Specifies whether this column should be written as binary or string data. + + Parameters + ---------- + binary : bool + True = use binary data type. False = use string data type + + Returns + ------- + Self + """ + self.c_obj.set_output_as_binary(binary) + return self + + cpdef ColumnInMetadata set_type_length(self, int32_t type_length): + """ + Sets the length of fixed length data. + + Parameters + ---------- + type_length : int + Size of the data type in bytes + + Returns + ------- + Self + """ + self.c_obj.set_type_length(type_length) + return self + + cpdef ColumnInMetadata set_skip_compression(self, bool skip): + """ + Specifies whether this column should not be compressed + regardless of the compression. + + Parameters + ---------- + skip : bool + If `true` do not compress this column + + Returns + ------- + Self + """ + self.c_obj.set_skip_compression(skip) + return self + + cpdef ColumnInMetadata set_encoding(self, column_encoding encoding): + """ + Sets the encoding to use when writing + this column. + + Parameters + ---------- + encoding : ColumnEncoding + The encoding to use + + Returns + ------- + Self + """ + self.c_obj.set_encoding(encoding) + return self + + cpdef str get_name(self): + """ + Get the name of this column. 
+ + Returns + ------- + str + The name of this column + """ + return self.c_obj.get_name().decode() + + +cdef class TableInputMetadata: + """ + Metadata for a table + + Parameters + ---------- + table : Table + The Table to construct metadata for + """ + def __init__(self, Table table): + self.c_obj = table_input_metadata(table.view()) + + cdef class TableWithMetadata: """A container holding a table and its associated metadata (e.g. column names) diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd index 110c9d4a0b9..e03fe7e921e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd @@ -156,7 +156,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: table_input_metadata m ) except +libcudf_exception_handler BuilderT& key_value_metadata( - vector[map[string, string]] kvm + vector[map[string, string]] metadata ) except +libcudf_exception_handler BuilderT& stats_level( statistics_freq sf @@ -189,7 +189,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t val ) except +libcudf_exception_handler BuilderT& write_v2_headers( - bool val + bool enabled ) except +libcudf_exception_handler BuilderT& dictionary_policy( dictionary_policy val @@ -212,7 +212,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: ) except +libcudf_exception_handler cdef unique_ptr[vector[uint8_t]] write_parquet( - parquet_writer_options args + parquet_writer_options options ) except +libcudf_exception_handler cdef cppclass chunked_parquet_writer_options(parquet_writer_options_base): diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index 41298601539..94524acbcc8 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -1,4 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+import io + import pyarrow as pa import pyarrow.compute as pc import pytest @@ -107,3 +109,70 @@ def test_read_parquet_filters( # ^^^ This one is not tested since it's not in pyarrow/pandas, deprecate? # bool convert_strings_to_categories = False, # bool use_pandas_metadata = True + + +@pytest.mark.parametrize("write_v2_headers", [True, False]) +@pytest.mark.parametrize("utc_timestamps", [True, False]) +@pytest.mark.parametrize("write_arrow_schema", [True, False]) +@pytest.mark.parametrize( + "partitions", + [None, [plc.io.types.PartitionInfo(0, 10)]], +) +@pytest.mark.parametrize("column_chunks_file_paths", [None, ["tmp.parquet"]]) +@pytest.mark.parametrize("row_group_size_bytes", [None, 1024]) +@pytest.mark.parametrize("row_group_size_rows", [None, 1]) +@pytest.mark.parametrize("max_page_size_bytes", [None, 1024]) +@pytest.mark.parametrize("max_page_size_rows", [None, 1]) +@pytest.mark.parametrize("max_dictionary_size", [None, 100]) +def test_write_parquet( + table_data, + write_v2_headers, + utc_timestamps, + write_arrow_schema, + partitions, + column_chunks_file_paths, + row_group_size_bytes, + row_group_size_rows, + max_page_size_bytes, + max_page_size_rows, + max_dictionary_size, +): + _, pa_table = table_data + if len(pa_table) == 0 and partitions is not None: + pytest.skip("https://github.com/rapidsai/cudf/issues/17361") + plc_table = plc.interop.from_arrow(pa_table) + table_meta = plc.io.types.TableInputMetadata(plc_table) + sink = plc.io.SinkInfo([io.BytesIO()]) + user_data = [{"foo": "{'bar': 'baz'}"}] + compression = plc.io.types.CompressionType.SNAPPY + stats_level = plc.io.types.StatisticsFreq.STATISTICS_COLUMN + dictionary_policy = plc.io.types.DictionaryPolicy.ADAPTIVE + options = ( + plc.io.parquet.ParquetWriterOptions.builder(sink, plc_table) + .metadata(table_meta) + .key_value_metadata(user_data) + .compression(compression) + .stats_level(stats_level) + .write_v2_headers(write_v2_headers) + .dictionary_policy(dictionary_policy) + 
.utc_timestamps(utc_timestamps) + .write_arrow_schema(write_arrow_schema) + .build() + ) + if partitions is not None: + options.set_partitions(partitions) + if column_chunks_file_paths is not None: + options.set_column_chunks_file_paths(column_chunks_file_paths) + if row_group_size_bytes is not None: + options.set_row_group_size_bytes(row_group_size_bytes) + if row_group_size_rows is not None: + options.set_row_group_size_rows(row_group_size_rows) + if max_page_size_bytes is not None: + options.set_max_page_size_bytes(max_page_size_bytes) + if max_page_size_rows is not None: + options.set_max_page_size_rows(max_page_size_rows) + if max_dictionary_size is not None: + options.set_max_dictionary_size(max_dictionary_size) + + result = plc.io.parquet.write_parquet(options) + assert isinstance(result, memoryview) From 5a89d0066b5cfbb38d5a392b425865d66b82a8b6 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 22 Nov 2024 13:26:13 -0600 Subject: [PATCH 03/23] Run clang-tidy checks in PR CI (#17407) We discussed clang-tidy during the cuDF brown bag session. We decided to enable clang-tidy in PR CI and follow up by enabling more checks later. 
Authors: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17407 --- .github/workflows/pr.yaml | 8 +++ ci/cpp_linters.sh | 7 ++- cpp/.clang-tidy | 2 +- cpp/CMakeLists.txt | 82 +++++++++++++++------------- cpp/include/cudf/ast/expressions.hpp | 6 +- cpp/src/io/json/parser_features.cpp | 2 +- 6 files changed, 64 insertions(+), 43 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a8c4e481683..a8afede4821 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -16,6 +16,7 @@ jobs: - changed-files - checks - conda-cpp-build + - cpp-linters - conda-cpp-checks - conda-cpp-tests - conda-python-build @@ -113,6 +114,13 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02 with: build_type: pull-request + cpp-linters: + secrets: inherit + needs: checks + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + with: + build_type: pull-request + run_script: "ci/cpp_linters.sh" conda-cpp-checks: needs: conda-cpp-build secrets: inherit diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh index 286c7bfbc66..4d5b62ba280 100755 --- a/ci/cpp_linters.sh +++ b/ci/cpp_linters.sh @@ -25,7 +25,12 @@ RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" source rapids-configure-sccache # Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled. 
-cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_STATIC_LINTERS=ON -GNinja + +iwyu_flag="" +if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then + iwyu_flag="-DCUDF_IWYU=ON" +fi +cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON ${iwyu_flag} -DBUILD_TESTS=OFF -GNinja cmake --build cpp/build 2>&1 | python cpp/scripts/parse_iwyu_output.py # Remove invalid components of the path for local usage. The path below is diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index 0e5699876fc..60c0b5d3ba7 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -39,7 +39,7 @@ Checks: -clang-analyzer-optin.core.EnumCastOutOfRange, -clang-analyzer-optin.cplusplus.UninitializedObject' -WarningsAsErrors: '' +WarningsAsErrors: '*' HeaderFilterRegex: '.*cudf/cpp/(src|include).*' ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 506f6c185f5..e4fa3b28383 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -91,7 +91,8 @@ option( ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL} ) mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) -option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF) +option(CUDF_CLANG_TIDY "Enable clang-tidy during compilation" OFF) +option(CUDF_IWYU "Enable IWYU during compilation" OFF) option( CUDF_KVIKIO_REMOTE_IO @@ -159,9 +160,7 @@ endif() # ################################################################################################## # * linter configuration --------------------------------------------------------------------------- -if(CUDF_STATIC_LINTERS) - # For simplicity, for now we assume that all linters can be installed into an environment where - # any linter is being run. We could relax this requirement if desired. 
+if(CUDF_CLANG_TIDY) find_program( CLANG_TIDY_EXE NAMES "clang-tidy" @@ -188,7 +187,9 @@ if(CUDF_STATIC_LINTERS) "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}" ) endif() +endif() +if(CUDF_IWYU) find_program(IWYU_EXE NAMES include-what-you-use iwyu REQUIRED) endif() @@ -201,38 +202,36 @@ function(enable_static_checkers target) _LINT "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} ) - if(CUDF_STATIC_LINTERS) - if(_LINT_CLANG_TIDY) - # clang will complain about unused link libraries on the compile line unless we specify - # -Qunused-arguments. - set_target_properties( - ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" - ) - endif() - if(_LINT_IWYU) - # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not - # relevant since they don't show up in any other build so it's better to suppress them until - # we can figure out the cause. Setting this as part of CXX_INCLUDE_WHAT_YOU_USE does not - # appear to be sufficient, we must also ensure that it is set to the underlying target's CXX - # compile flags. To do this completely cleanly we should modify the flags on the target rather - # than the global CUDF_CXX_FLAGS, but this solution is good enough for now since we never run - # the linters on real builds. - foreach(_flag -Wno-missing-braces -Wno-unneeded-internal-declaration) - list(FIND CUDF_CXX_FLAGS "${_flag}" _flag_index) - if(_flag_index EQUAL -1) - list(APPEND CUDF_CXX_FLAGS ${_flag}) - endif() - endforeach() - set(CUDF_CXX_FLAGS - "${CUDF_CXX_FLAGS}" - PARENT_SCOPE - ) - set_target_properties(${target} PROPERTIES CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}") - endif() - foreach(file IN LISTS _LINT_SKIPPED_FILES) - set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) + if(_LINT_CLANG_TIDY) + # clang will complain about unused link libraries on the compile line unless we specify + # -Qunused-arguments. 
+ set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + endif() + if(_LINT_IWYU) + # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not + # relevant since they don't show up in any other build so it's better to suppress them until we + # can figure out the cause. Setting this as part of CXX_INCLUDE_WHAT_YOU_USE does not appear to + # be sufficient, we must also ensure that it is set to the underlying target's CXX compile + # flags. To do this completely cleanly we should modify the flags on the target rather than the + # global CUDF_CXX_FLAGS, but this solution is good enough for now since we never run the linters + # on real builds. + foreach(_flag -Wno-missing-braces -Wno-unneeded-internal-declaration) + list(FIND CUDF_CXX_FLAGS "${_flag}" _flag_index) + if(_flag_index EQUAL -1) + list(APPEND CUDF_CXX_FLAGS ${_flag}) + endif() endforeach() + set(CUDF_CXX_FLAGS + "${CUDF_CXX_FLAGS}" + PARENT_SCOPE + ) + set_target_properties(${target} PROPERTIES CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}") endif() + foreach(file IN LISTS _LINT_SKIPPED_FILES) + set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) + endforeach() endfunction() # ################################################################################################## @@ -812,9 +811,18 @@ set_target_properties( # Note: This must come before the target_compile_options below so that the function can modify the # flags if necessary. 
-enable_static_checkers( - cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp CLANG_TIDY IWYU -) +if(CUDF_CLANG_TIDY OR CUDF_IWYU) + set(linters) + if(CUDF_CLANG_TIDY) + list(APPEND linters CLANG_TIDY) + endif() + if(CUDF_IWYU) + list(APPEND linters IWYU) + endif() + enable_static_checkers( + cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp ${linters} + ) +endif() target_compile_options( cudf PRIVATE "$<$:${CUDF_CXX_FLAGS}>" "$<$:${CUDF_CUDA_FLAGS}>" diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index bcc9ad1b391..85289a52831 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -612,19 +612,19 @@ class tree { * @brief get the first expression in the tree * @returns the first inserted expression into the tree */ - expression const& front() const { return *expressions.front(); } + [[nodiscard]] expression const& front() const { return *expressions.front(); } /** * @brief get the last expression in the tree * @returns the last inserted expression into the tree */ - expression const& back() const { return *expressions.back(); } + [[nodiscard]] expression const& back() const { return *expressions.back(); } /** * @brief get the number of expressions added to the tree * @returns the number of expressions added to the tree */ - size_t size() const { return expressions.size(); } + [[nodiscard]] size_t size() const { return expressions.size(); } /** * @brief get the expression at an index in the tree. Index is checked. 
diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 401a6e992de..e795e8e09d8 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -109,7 +109,7 @@ struct allnull_column_functor { rmm::device_async_resource_ref mr; private: - auto make_zeroed_offsets(size_type size) const + [[nodiscard]] auto make_zeroed_offsets(size_type size) const { auto offsets_buff = cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); From 881afd123572dac518e0baeb0537572a869e751c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 22 Nov 2024 12:18:25 -0800 Subject: [PATCH 04/23] Remove cudf._lib.transpose in favor of inlining pylibcudf (#17365) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17365 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/transpose.pyx | 18 ------------------ python/cudf/cudf/core/dataframe.py | 10 +++++++++- 4 files changed, 9 insertions(+), 21 deletions(-) delete mode 100644 python/cudf/cudf/_lib/transpose.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 2958c286d20..8a521f19350 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -44,7 +44,6 @@ set(cython_sources text.pyx timezone.pyx transform.pyx - transpose.pyx types.pyx utils.pyx ) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 19dc4488560..27bb486f55b 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -30,7 +30,6 @@ strings_udf, text, timezone, - transpose, ) MAX_COLUMN_SIZE = np.iinfo(np.int32).max diff --git 
a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx deleted file mode 100644 index 995d278cb88..00000000000 --- a/python/cudf/cudf/_lib/transpose.pyx +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -def transpose(list source_columns): - """Transpose m n-row columns into n m-row columns - """ - input_table = plc.table.Table( - [col.to_pylibcudf(mode="read") for col in source_columns] - ) - result_table = plc.transpose.transpose(input_table) - return [ - Column.from_pylibcudf(col, data_ptr_exposed=True) - for col in result_table.columns() - ] diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bd78d5dd9f1..728cc47a7c9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4113,7 +4113,15 @@ def transpose(self): if any(c.dtype != source_columns[0].dtype for c in source_columns): raise ValueError("Columns must all have the same dtype") - result_columns = libcudf.transpose.transpose(source_columns) + result_table = plc.transpose.transpose( + plc.table.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] + ) + ) + result_columns = [ + libcudf.column.Column.from_pylibcudf(col, data_ptr_exposed=True) + for col in result_table.columns() + ] if isinstance(source_dtype, cudf.CategoricalDtype): result_columns = [ From 092fdff63ec6d166b4feca1c59bd0582dde932f5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 22 Nov 2024 14:24:05 -0800 Subject: [PATCH 05/23] Update xxhash_64 to utilize the cuco equivalent implementation (#17393) This PR updates `xxhash_64` to leverage the cuco equivalent implementation and makes it available as a cudf detail utility, as requested by @mhaseeb123. Additionally, it fixes a minor issue in the `murmurhash3_x64_128.cuh` header by adding the missing `#pragma once` directive. 
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17393 --- .../hashing/detail/murmurhash3_x64_128.cuh | 4 +- cpp/include/cudf/hashing/detail/xxhash_64.cuh | 99 +++++++++ cpp/src/hash/xxhash_64.cu | 203 +----------------- 3 files changed, 103 insertions(+), 203 deletions(-) create mode 100644 cpp/include/cudf/hashing/detail/xxhash_64.cuh diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh index c986a908706..5e88b905023 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once + #include #include diff --git a/cpp/include/cudf/hashing/detail/xxhash_64.cuh b/cpp/include/cudf/hashing/detail/xxhash_64.cuh new file mode 100644 index 00000000000..b00e8297ac9 --- /dev/null +++ b/cpp/include/cudf/hashing/detail/xxhash_64.cuh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "hash_functions.cuh" + +#include +#include +#include + +#include +#include + +namespace cudf::hashing::detail { + +template +struct XXHash_64 : public cuco::xxhash_64 { + using result_type = typename cuco::xxhash_64::result_type; + + __device__ result_type operator()(Key const& key) const + { + return cuco::xxhash_64::operator()(key); + } + + template + __device__ result_type compute_hash(cuda::std::byte const* bytes, Extent size) const + { + return cuco::xxhash_64::compute_hash(bytes, size); + } +}; + +template <> +XXHash_64::result_type __device__ inline XXHash_64::operator()(bool const& key) const +{ + return this->compute_hash(reinterpret_cast(&key), sizeof(key)); +} + +template <> +XXHash_64::result_type __device__ inline XXHash_64::operator()(float const& key) const +{ + return cuco::xxhash_64::operator()(normalize_nans(key)); +} + +template <> +XXHash_64::result_type __device__ inline XXHash_64::operator()( + double const& key) const +{ + return cuco::xxhash_64::operator()(normalize_nans(key)); +} + +template <> +XXHash_64::result_type + __device__ inline XXHash_64::operator()(cudf::string_view const& key) const +{ + return this->compute_hash(reinterpret_cast(key.data()), key.size_bytes()); +} + +template <> +XXHash_64::result_type + __device__ inline XXHash_64::operator()(numeric::decimal32 const& key) const +{ + auto const val = key.value(); + auto const len = sizeof(val); + return this->compute_hash(reinterpret_cast(&val), len); +} + +template <> +XXHash_64::result_type + __device__ inline XXHash_64::operator()(numeric::decimal64 const& key) const +{ + auto const val = key.value(); + auto const len = sizeof(val); + return this->compute_hash(reinterpret_cast(&val), len); +} + +template <> +XXHash_64::result_type + __device__ inline XXHash_64::operator()(numeric::decimal128 const& key) const +{ + auto const val = 
key.value(); + auto const len = sizeof(val); + return this->compute_hash(reinterpret_cast(&val), len); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/src/hash/xxhash_64.cu b/cpp/src/hash/xxhash_64.cu index fad8383210b..bdbe13b1ffb 100644 --- a/cpp/src/hash/xxhash_64.cu +++ b/cpp/src/hash/xxhash_64.cu @@ -16,8 +16,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -35,207 +35,6 @@ namespace { using hash_value_type = uint64_t; -template -struct XXHash_64 { - using result_type = hash_value_type; - - constexpr XXHash_64() = default; - constexpr XXHash_64(hash_value_type seed) : m_seed(seed) {} - - __device__ inline uint32_t getblock32(std::byte const* data, std::size_t offset) const - { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); - } - - __device__ inline uint64_t getblock64(std::byte const* data, std::size_t offset) const - { - uint64_t result = getblock32(data, offset + 4); - result = result << 32; - return result | getblock32(data, offset); - } - - result_type __device__ inline operator()(Key const& key) const { return compute(key); } - - template - result_type __device__ inline compute(T const& key) const - { - auto data = device_span(reinterpret_cast(&key), sizeof(T)); - return compute_bytes(data); - } - - result_type __device__ inline compute_remaining_bytes(device_span& in, - std::size_t offset, - result_type h64) const - { - // remaining data can be processed in 8-byte chunks - if ((in.size() % 32) >= 8) { - for (; offset <= in.size() - 8; offset += 8) { - uint64_t k1 = getblock64(in.data(), offset) * prime2; - - k1 = rotate_bits_left(k1, 31) * prime1; - h64 ^= k1; - h64 = rotate_bits_left(h64, 27) * prime1 + prime4; - } - } - - // remaining data can be processed in 4-byte chunks - if ((in.size() % 8) >= 
4) { - for (; offset <= in.size() - 4; offset += 4) { - h64 ^= (getblock32(in.data(), offset) & 0xfffffffful) * prime1; - h64 = rotate_bits_left(h64, 23) * prime2 + prime3; - } - } - - // and the rest - if (in.size() % 4) { - while (offset < in.size()) { - h64 ^= (std::to_integer(in[offset]) & 0xff) * prime5; - h64 = rotate_bits_left(h64, 11) * prime1; - ++offset; - } - } - return h64; - } - - result_type __device__ compute_bytes(device_span& in) const - { - uint64_t offset = 0; - uint64_t h64; - // data can be processed in 32-byte chunks - if (in.size() >= 32) { - auto limit = in.size() - 32; - uint64_t v1 = m_seed + prime1 + prime2; - uint64_t v2 = m_seed + prime2; - uint64_t v3 = m_seed; - uint64_t v4 = m_seed - prime1; - - do { - // pipeline 4*8byte computations - v1 += getblock64(in.data(), offset) * prime2; - v1 = rotate_bits_left(v1, 31); - v1 *= prime1; - offset += 8; - v2 += getblock64(in.data(), offset) * prime2; - v2 = rotate_bits_left(v2, 31); - v2 *= prime1; - offset += 8; - v3 += getblock64(in.data(), offset) * prime2; - v3 = rotate_bits_left(v3, 31); - v3 *= prime1; - offset += 8; - v4 += getblock64(in.data(), offset) * prime2; - v4 = rotate_bits_left(v4, 31); - v4 *= prime1; - offset += 8; - } while (offset <= limit); - - h64 = rotate_bits_left(v1, 1) + rotate_bits_left(v2, 7) + rotate_bits_left(v3, 12) + - rotate_bits_left(v4, 18); - - v1 *= prime2; - v1 = rotate_bits_left(v1, 31); - v1 *= prime1; - h64 ^= v1; - h64 = h64 * prime1 + prime4; - - v2 *= prime2; - v2 = rotate_bits_left(v2, 31); - v2 *= prime1; - h64 ^= v2; - h64 = h64 * prime1 + prime4; - - v3 *= prime2; - v3 = rotate_bits_left(v3, 31); - v3 *= prime1; - h64 ^= v3; - h64 = h64 * prime1 + prime4; - - v4 *= prime2; - v4 = rotate_bits_left(v4, 31); - v4 *= prime1; - h64 ^= v4; - h64 = h64 * prime1 + prime4; - } else { - h64 = m_seed + prime5; - } - - h64 += in.size(); - - h64 = compute_remaining_bytes(in, offset, h64); - - return finalize(h64); - } - - constexpr __host__ __device__ 
std::uint64_t finalize(std::uint64_t h) const noexcept - { - h ^= h >> 33; - h *= prime2; - h ^= h >> 29; - h *= prime3; - h ^= h >> 32; - return h; - } - - private: - hash_value_type m_seed{}; - static constexpr uint64_t prime1 = 0x9e3779b185ebca87ul; - static constexpr uint64_t prime2 = 0xc2b2ae3d27d4eb4ful; - static constexpr uint64_t prime3 = 0x165667b19e3779f9ul; - static constexpr uint64_t prime4 = 0x85ebca77c2b2ae63ul; - static constexpr uint64_t prime5 = 0x27d4eb2f165667c5ul; -}; - -template <> -hash_value_type __device__ inline XXHash_64::operator()(bool const& key) const -{ - return compute(static_cast(key)); -} - -template <> -hash_value_type __device__ inline XXHash_64::operator()(float const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -hash_value_type __device__ inline XXHash_64::operator()(double const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -hash_value_type __device__ inline XXHash_64::operator()( - cudf::string_view const& key) const -{ - auto const len = key.size_bytes(); - auto data = device_span(reinterpret_cast(key.data()), len); - return compute_bytes(data); -} - -template <> -hash_value_type __device__ inline XXHash_64::operator()( - numeric::decimal32 const& key) const -{ - return compute(key.value()); -} - -template <> -hash_value_type __device__ inline XXHash_64::operator()( - numeric::decimal64 const& key) const -{ - return compute(key.value()); -} - -template <> -hash_value_type __device__ inline XXHash_64::operator()( - numeric::decimal128 const& key) const -{ - return compute(key.value()); -} - /** * @brief Computes the hash value of a row in the given table. 
* From 8b7127fe92aa28321b6422643a3d1d3ca2301ae1 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 22 Nov 2024 18:16:42 -0500 Subject: [PATCH 06/23] Fix Debug-mode failing Arrow test (#17405) Fixes #17153 by fixing the appropriate element nullability data types Authors: - Matt Topol (https://github.com/zeroshade) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17405 --- cpp/tests/interop/arrow_utils.hpp | 3 +-- cpp/tests/interop/from_arrow_test.cpp | 5 ++++- cpp/tests/interop/to_arrow_test.cpp | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index 70a9fe64d70..e785845394a 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -212,9 +212,8 @@ std::shared_ptr get_arrow_list_array(std::vector data, "Failed to append values to buffer builder"); CUDF_EXPECTS(buff_builder.Finish(&offset_buffer).ok(), "Failed to allocate buffer"); - auto nullable = std::accumulate(list_validity.begin(), list_validity.end(), 0) > 0; return std::make_shared( - arrow::list(arrow::field("", data_array->type(), nullable)), + arrow::list(arrow::field("element", data_array->type(), data_array->null_count() > 0)), offsets.size() - 1, offset_buffer, data_array, diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 18efae75cb1..62e38cbfd45 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -279,7 +279,10 @@ TEST_F(FromArrowTest, StructColumn) auto list_arr = get_arrow_list_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9}); std::vector offset{0, 3, 4, 6}; auto nested_list_arr = std::make_shared( - arrow::list(list(arrow::int64())), offset.size() - 1, arrow::Buffer::Wrap(offset), list_arr); + arrow::list(list(arrow::field("element", arrow::int64(), false))), + offset.size() - 1, 
+ arrow::Buffer::Wrap(offset), + list_arr); std::vector> child_arrays2({str2_array, int2_array}); auto fields2 = std::vector>{ diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 86295d8efb1..faa07ba3311 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -261,14 +261,14 @@ TEST_F(ToArrowTest, NestedList) std::vector offset{0, 0, 2}; auto mask_buffer = arrow::internal::BytesToBits({0, 1}).ValueOrDie(); auto nested_list_arr = std::make_shared( - arrow::list(arrow::field("a", arrow::list(arrow::int64()), false)), + arrow::list(arrow::field("element", arrow::list(arrow::int64()), false)), offset.size() - 1, arrow::Buffer::Wrap(offset), list_arr, mask_buffer); std::vector> schema_vector( - {arrow::field("a", nested_list_arr->type())}); + {arrow::field("a", nested_list_arr->type(), false)}); auto schema = std::make_shared(schema_vector); auto expected_arrow_table = arrow::Table::Make(schema, {nested_list_arr}); From d1d44209673f3b789bcbc8dc628231d4ace76bef Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:24:49 -0800 Subject: [PATCH 07/23] Move cudf._lib.search to cudf.core._internals (#17411) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17411 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/search.pyx | 68 ---------------------- python/cudf/cudf/core/_internals/search.py | 56 ++++++++++++++++++ python/cudf/cudf/core/column/column.py | 23 +++++++- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/numerical.py | 4 +- python/cudf/cudf/core/column/string.py | 10 +--- python/cudf/cudf/core/frame.py | 8 ++- python/cudf/cudf/core/index.py | 2 +- 10 files changed, 87 
insertions(+), 88 deletions(-) delete mode 100644 python/cudf/cudf/_lib/search.pyx create mode 100644 python/cudf/cudf/core/_internals/search.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8a521f19350..1c2b24d2391 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -36,7 +36,6 @@ set(cython_sources rolling.pyx round.pyx scalar.pyx - search.pyx sort.pyx stream_compaction.pyx string_casting.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 27bb486f55b..13d05033c11 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -22,7 +22,6 @@ reshape, rolling, round, - search, sort, stream_compaction, string_casting, diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx deleted file mode 100644 index 8108361052b..00000000000 --- a/python/cudf/cudf/_lib/search.pyx +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf - - -@acquire_spill_lock() -def search_sorted( - list source, list values, side, ascending=True, na_position="last" -): - """Find indices where elements should be inserted to maintain order - - Parameters - ---------- - source : list of columns - List of columns to search in - values : List of columns - List of value columns to search for - side : str {'left', 'right'} optional - If 'left', the index of the first suitable location is given. 
- If 'right', return the last such index - """ - # Note: We are ignoring index columns here - column_order = [ - pylibcudf.types.Order.ASCENDING - if ascending - else pylibcudf.types.Order.DESCENDING - ] * len(source) - null_precedence = [ - pylibcudf.types.NullOrder.AFTER - if na_position == "last" - else pylibcudf.types.NullOrder.BEFORE - ] * len(source) - - func = getattr( - pylibcudf.search, - "lower_bound" if side == "left" else "upper_bound", - ) - return Column.from_pylibcudf( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - column_order, - null_precedence, - ) - ) - - -@acquire_spill_lock() -def contains(Column haystack, Column needles): - """Check whether column contains multiple values - - Parameters - ---------- - column : NumericalColumn - Column to search in - needles : - A column of values to search for - """ - return Column.from_pylibcudf( - pylibcudf.search.contains( - haystack.to_pylibcudf(mode="read"), - needles.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/core/_internals/search.py b/python/cudf/cudf/core/_internals/search.py new file mode 100644 index 00000000000..a0ffe078de9 --- /dev/null +++ b/python/cudf/cudf/core/_internals/search.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +import pylibcudf as plc + +from cudf._lib.column import Column +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + + +@acquire_spill_lock() +def search_sorted( + source: list[ColumnBase], + values: list[ColumnBase], + side: Literal["left", "right"], + ascending: bool = True, + na_position: Literal["first", "last"] = "last", +) -> ColumnBase: + """Find indices where elements should be inserted to maintain order + + Parameters + ---------- + source : list of columns + List of columns to search in + values : List of columns + List of value columns to search for + side : str {'left', 'right'} optional + If 'left', the index of the first suitable location is given. + If 'right', return the last such index + """ + # Note: We are ignoring index columns here + column_order = [ + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING + ] * len(source) + null_precedence = [ + plc.types.NullOrder.AFTER + if na_position == "last" + else plc.types.NullOrder.BEFORE + ] * len(source) + + func = getattr( + plc.search, + "lower_bound" if side == "left" else "upper_bound", + ) + return Column.from_pylibcudf( + func( + plc.Table([col.to_pylibcudf(mode="read") for col in source]), + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + column_order, + null_precedence, + ) + ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f6eaea4b783..53946be1c49 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -757,7 +757,7 @@ def indices_of( raise ValueError("value must be a scalar") else: value = as_column(value, dtype=self.dtype, length=1) - mask = libcudf.search.contains(value, self) + mask = value.contains(self) return apply_boolean_mask( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask )[0] @@ -914,7 +914,7 @@ 
def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # self.isin(other) asks "which values of self are in other" # contains(haystack, needles) asks "which needles are in haystack" # hence this argument ordering. - result = libcudf.search.contains(rhs, self) + result = rhs.contains(self) if self.null_count > 0: # If one of the needles is null, then the result contains # nulls, these nulls should be replaced by whether or not the @@ -956,6 +956,23 @@ def is_monotonic_decreasing(self) -> bool: [self], [False], None ) + def contains(self, other: ColumnBase) -> ColumnBase: + """ + Check whether column contains multiple values. + + Parameters + ---------- + other : Column + A column of values to search for + """ + with acquire_spill_lock(): + return Column.from_pylibcudf( + plc.search.contains( + self.to_pylibcudf(mode="read"), + other.to_pylibcudf(mode="read"), + ) + ) + def sort_values( self: Self, ascending: bool = True, @@ -1190,7 +1207,7 @@ def searchsorted( raise ValueError( "Column searchsorted expects values to be column of same dtype" ) - return libcudf.search.search_sorted( + return cudf.core._internals.search.search_sorted( # type: ignore[return-value] [self], [value], side=side, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 16124cf0a7d..b40ea4eedd3 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,9 +18,9 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.search import search_sorted from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals import unary +from cudf.core._internals.search import search_sorted from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, get_compatible_timezone, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 36d1bdb45b6..a7538c1c947 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ 
b/python/cudf/cudf/core/column/numerical.py @@ -111,8 +111,8 @@ def __contains__(self, item: ScalarLike) -> bool: except (TypeError, ValueError): return False # TODO: Use `scalar`-based `contains` wrapper - return libcudf.search.contains( - self, column.as_column([search_item], dtype=self.dtype) + return self.contains( + column.as_column([search_item], dtype=self.dtype) ).any() def indices_of(self, value: ScalarLike) -> NumericalColumn: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 3d70b01b7e4..a9ab2d373fd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5857,14 +5857,8 @@ def sum( return result_col def __contains__(self, item: ScalarLike) -> bool: - if is_scalar(item): - return True in libcudf.search.contains( - self, column.as_column([item], dtype=self.dtype) - ) - else: - return True in libcudf.search.contains( - self, column.as_column(item, dtype=self.dtype) - ) + other = [item] if is_scalar(item) else item + return self.contains(column.as_column(other, dtype=self.dtype)).any() def as_numerical_column( self, dtype: Dtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 30868924bcd..838fde260df 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -8,8 +8,6 @@ from collections import abc from typing import TYPE_CHECKING, Any, Literal -# TODO: The `numpy` import is needed for typing purposes during doc builds -# only, need to figure out why the `np` alias is insufficient then remove. import cupy import numpy import numpy as np @@ -19,9 +17,13 @@ import pylibcudf as plc import cudf + +# TODO: The `numpy` import is needed for typing purposes during doc builds +# only, need to figure out why the `np` alias is insufficient then remove. 
from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals.search import search_sorted from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -1302,7 +1304,7 @@ def searchsorted( for val, common_dtype in zip(values, common_dtype_list) ] - outcol = libcudf.search.search_sorted( + outcol = search_sorted( sources, values, side, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1b90e9f9df0..0a2b15a16b9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.filling import sequence -from cudf._lib.search import search_sorted from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( @@ -32,6 +31,7 @@ ) from cudf.core._base_index import BaseIndex, _return_get_indexer_result from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals.search import search_sorted from cudf.core.column import ( CategoricalColumn, ColumnBase, From 44b2e798bd6c280985d052634c0c1e495f57a609 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 24 Nov 2024 15:11:32 -0800 Subject: [PATCH 08/23] Remove the unused detail `int_fastdiv.h` header (#17426) This PR removes the unused detail `int_fastdiv.h` header. 
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17426 --- .../cudf/detail/utilities/int_fastdiv.h | 175 ------------------ 1 file changed, 175 deletions(-) delete mode 100644 cpp/include/cudf/detail/utilities/int_fastdiv.h diff --git a/cpp/include/cudf/detail/utilities/int_fastdiv.h b/cpp/include/cudf/detail/utilities/int_fastdiv.h deleted file mode 100644 index ff442af5194..00000000000 --- a/cpp/include/cudf/detail/utilities/int_fastdiv.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Copyright 2014 Maxim Milakov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -class int_fastdiv { - public: - // divisor != 0 - __host__ __device__ __forceinline__ int_fastdiv(int divisor = 0) : d(divisor) - { - update_magic_numbers(); - } - - __host__ __device__ __forceinline__ int_fastdiv& operator=(int divisor) - { - this->d = divisor; - update_magic_numbers(); - return *this; - } - - __host__ __device__ __forceinline__ operator int() const { return d; } - - private: - int d; - int M; - int s; - int n_add_sign; - - // Hacker's Delight, Second Edition, Chapter 10, Integer Division By Constants - __host__ __device__ __forceinline__ void update_magic_numbers() - { - if (d == 1) { - M = 0; - s = -1; - n_add_sign = 1; - return; - } else if (d == -1) { - M = 0; - s = -1; - n_add_sign = -1; - return; - } - - int p; - unsigned int ad, anc, delta, q1, r1, q2, r2, t; - unsigned const two31 = 0x8000'0000u; - ad = (d == 0) ? 1 : abs(d); - t = two31 + ((unsigned int)d >> 31); - anc = t - 1 - t % ad; - p = 31; - q1 = two31 / anc; - r1 = two31 - q1 * anc; - q2 = two31 / ad; - r2 = two31 - q2 * ad; - do { - ++p; - q1 = 2 * q1; - r1 = 2 * r1; - if (r1 >= anc) { - ++q1; - r1 -= anc; - } - q2 = 2 * q2; - r2 = 2 * r2; - if (r2 >= ad) { - ++q2; - r2 -= ad; - } - delta = ad - r2; - } while (q1 < delta || (q1 == delta && r1 == 0)); - this->M = q2 + 1; - if (d < 0) this->M = -this->M; - this->s = p - 32; - - if ((d > 0) && (M < 0)) - n_add_sign = 1; - else if ((d < 0) && (M > 0)) - n_add_sign = -1; - else - n_add_sign = 0; - } - - __host__ __device__ __forceinline__ friend int operator/(int const divident, - int_fastdiv const& divisor); -}; - -__host__ __device__ __forceinline__ int operator/(int const n, int_fastdiv const& divisor) -{ - int q; -#ifdef __CUDA_ARCH__ - asm("mul.hi.s32 %0, %1, %2;" : "=r"(q) : "r"(divisor.M), "r"(n)); -#else - q = (((unsigned long long)((long long)divisor.M * (long long)n)) >> 32); -#endif - q += n * divisor.n_add_sign; - if (divisor.s >= 0) { - q >>= divisor.s; // we rely on this to be implemented as 
arithmetic shift - q += (((unsigned int)q) >> 31); - } - return q; -} - -__host__ __device__ __forceinline__ int operator%(int const n, int_fastdiv const& divisor) -{ - int quotient = n / divisor; - int remainder = n - quotient * divisor; - return remainder; -} - -__host__ __device__ __forceinline__ int operator/(unsigned int const n, int_fastdiv const& divisor) -{ - return ((int)n) / divisor; -} - -__host__ __device__ __forceinline__ int operator%(unsigned int const n, int_fastdiv const& divisor) -{ - return ((int)n) % divisor; -} - -__host__ __device__ __forceinline__ int operator/(short const n, int_fastdiv const& divisor) -{ - return ((int)n) / divisor; -} - -__host__ __device__ __forceinline__ int operator%(short const n, int_fastdiv const& divisor) -{ - return ((int)n) % divisor; -} - -__host__ __device__ __forceinline__ int operator/(unsigned short const n, - int_fastdiv const& divisor) -{ - return ((int)n) / divisor; -} - -__host__ __device__ __forceinline__ int operator%(unsigned short const n, - int_fastdiv const& divisor) -{ - return ((int)n) % divisor; -} - -__host__ __device__ __forceinline__ int operator/(char const n, int_fastdiv const& divisor) -{ - return ((int)n) / divisor; -} - -__host__ __device__ __forceinline__ int operator%(char const n, int_fastdiv const& divisor) -{ - return ((int)n) % divisor; -} - -__host__ __device__ __forceinline__ int operator/(unsigned char const n, int_fastdiv const& divisor) -{ - return ((int)n) / divisor; -} - -__host__ __device__ __forceinline__ int operator%(unsigned char const n, int_fastdiv const& divisor) -{ - return ((int)n) % divisor; -} From 8d8cd7818b310845f4c2ad3ffc1521a267df3973 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 25 Nov 2024 05:57:47 -0500 Subject: [PATCH 09/23] Expose stream-ordering to groupby APIs (#17324) Adds stream parameter to ``` cudf::groupby::scan cudf::groupby::aggregate cudf::groupby::shift cudf::groupby::get_groups cudf::groupby::replace_nulls ``` Added stream gtests 
to verify correct stream forwarding. Reference: https://github.com/rapidsai/cudf/issues/13744 Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17324 --- cpp/include/cudf/groupby.hpp | 20 +++++------ cpp/src/groupby/groupby.cu | 22 +++++------- cpp/tests/streams/groupby_test.cpp | 57 +++++++++++++++++++++++++++++- 3 files changed, 75 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index c9df02f167a..ca3c97880df 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -178,6 +178,7 @@ class groupby { * * @param requests The set of columns to aggregate and the aggregations to * perform + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table and columns' device memory * @return Pair containing the table with each group's unique key and * a vector of aggregation_results for each request in the same order as @@ -185,16 +186,7 @@ class groupby { */ std::pair, std::vector> aggregate( host_span requests, - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - - /** - * @copydoc aggregate(host_span, rmm::device_async_resource_ref) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - std::pair, std::vector> aggregate( - host_span requests, - rmm::cuda_stream_view stream, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** * @brief Performs grouped scans on the specified values. 
@@ -242,6 +234,7 @@ class groupby { * ``` * * @param requests The set of columns to scan and the scans to perform + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table and columns' device memory * @return Pair containing the table with each group's key and * a vector of aggregation_results for each request in the same order as @@ -249,6 +242,7 @@ class groupby { */ std::pair, std::vector> scan( host_span requests, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -295,6 +289,7 @@ class groupby { * @param values Table whose columns to be shifted * @param offsets The offsets by which to shift the input * @param fill_values Fill values for indeterminable outputs + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table and columns' device memory * @return Pair containing the tables with each group's key and the columns shifted * @@ -305,6 +300,7 @@ class groupby { table_view const& values, host_span offsets, std::vector> const& fill_values, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -329,11 +325,13 @@ class groupby { * and the `values` of the `groups` object will be `nullptr`. * * @param values Table representing values on which a groupby operation is to be performed + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned tables's device memory in the * returned groups * @return A `groups` object representing grouped keys and values */ groups get_groups(cudf::table_view values = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -367,6 +365,7 @@ class groupby { * @param[in] values A table whose column null values will be replaced * @param[in] replace_policies Specify the position of replacement values relative to null values, * one for each column + * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @return Pair that contains a table with the sorted keys and the result column @@ -374,6 +373,7 @@ class groupby { std::pair, std::unique_ptr> replace_nulls( table_view const& values, host_span replace_policies, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); private: diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 6eb82618e2a..c42038026e5 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -191,13 +191,6 @@ void verify_valid_requests(host_span requests) } // namespace -// Compute aggregation requests -std::pair, std::vector> groupby::aggregate( - host_span requests, rmm::device_async_resource_ref mr) -{ - return aggregate(requests, cudf::get_default_stream(), mr); -} - // Compute aggregation requests std::pair, std::vector> groupby::aggregate( host_span requests, @@ -220,7 +213,9 @@ std::pair, std::vector> groupby::aggr // Compute scan requests std::pair, std::vector> groupby::scan( - host_span requests, rmm::device_async_resource_ref mr) + host_span requests, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); 
CUDF_EXPECTS( @@ -233,13 +228,14 @@ std::pair, std::vector> groupby::scan if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); } - return sort_scan(requests, cudf::get_default_stream(), mr); + return sort_scan(requests, stream, mr); } -groupby::groups groupby::get_groups(table_view values, rmm::device_async_resource_ref mr) +groupby::groups groupby::get_groups(table_view values, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto const stream = cudf::get_default_stream(); auto grouped_keys = helper().sorted_keys(stream, mr); auto const& group_offsets = helper().group_offsets(stream); @@ -262,6 +258,7 @@ groupby::groups groupby::get_groups(table_view values, rmm::device_async_resourc std::pair, std::unique_ptr
> groupby::replace_nulls( table_view const& values, host_span replace_policies, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -271,7 +268,6 @@ std::pair, std::unique_ptr
> groupby::replace_nulls "Size mismatch between num_columns and replace_policies."); if (values.is_empty()) { return std::pair(empty_like(_keys), empty_like(values)); } - auto const stream = cudf::get_default_stream(); auto const& group_labels = helper().group_labels(stream); std::vector> results; @@ -306,6 +302,7 @@ std::pair, std::unique_ptr
> groupby::shift( table_view const& values, host_span offsets, std::vector> const& fill_values, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -320,7 +317,6 @@ std::pair, std::unique_ptr
> groupby::shift( }), "values and fill_value should have the same type.", cudf::data_type_error); - auto stream = cudf::get_default_stream(); std::vector> results; auto const& group_offsets = helper().group_offsets(stream); std::transform( diff --git a/cpp/tests/streams/groupby_test.cpp b/cpp/tests/streams/groupby_test.cpp index 03cabbc4de0..73d6d31b282 100644 --- a/cpp/tests/streams/groupby_test.cpp +++ b/cpp/tests/streams/groupby_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,9 @@ #include #include +#include #include +#include using K = int32_t; // Key type. @@ -65,3 +67,56 @@ TYPED_TEST(groupby_stream_test, test_count) this->test_groupby(make_count_agg(), force_use_sort_impl::YES); this->test_groupby(make_count_agg(cudf::null_policy::INCLUDE)); } + +struct GroupbyTest : public cudf::test::BaseFixture {}; + +TEST_F(GroupbyTest, Scan) +{ + using key_wrapper = cudf::test::fixed_width_column_wrapper; + using value_wrapper = cudf::test::fixed_width_column_wrapper; + + key_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4}); + + auto agg = cudf::make_min_aggregation(); + std::vector requests; + requests.emplace_back(); + requests[0].values = vals; + requests[0].aggregations.push_back(std::move(agg)); + + cudf::groupby::groupby gb_obj(cudf::table_view({keys})); + // cudf::groupby scan uses sort implementation + auto result = gb_obj.scan(requests, cudf::test::get_default_stream()); +} + +TEST_F(GroupbyTest, Shift) +{ + cudf::test::fixed_width_column_wrapper key{1, 2, 1, 2, 2, 1, 1}; + cudf::test::fixed_width_column_wrapper val{3, 4, 5, 6, 7, 8, 9}; + cudf::size_type offset = 2; + auto slr = cudf::make_default_constructed_scalar(cudf::column_view(val).type(), + cudf::test::get_default_stream()); + + 
cudf::groupby::groupby gb_obj(cudf::table_view({key})); + std::vector offsets{offset}; + auto got = + gb_obj.shift(cudf::table_view{{val}}, offsets, {*slr}, cudf::test::get_default_stream()); +} + +TEST_F(GroupbyTest, GetGroups) +{ + cudf::test::fixed_width_column_wrapper keys{1, 1, 2, 1, 2, 3}; + cudf::test::fixed_width_column_wrapper values({0, 0, 1, 1, 2, 2}); + cudf::groupby::groupby gb(cudf::table_view({keys})); + auto gb_groups = gb.get_groups(cudf::table_view({values}), cudf::test::get_default_stream()); +} + +TEST_F(GroupbyTest, ReplaceNullsTest) +{ + cudf::test::fixed_width_column_wrapper key{0, 1, 0, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper val({42, 7, 24, 10, 1, 1000}, {1, 1, 1, 0, 0, 0}); + cudf::groupby::groupby gb_obj(cudf::table_view({key})); + std::vector policies{cudf::replace_policy::PRECEDING}; + auto p = + gb_obj.replace_nulls(cudf::table_view({val}), policies, cudf::test::get_default_stream()); +} From d93e9c267ac7a1a8792d9fc77d2ba8ab7be2683c Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 25 Nov 2024 05:58:16 -0500 Subject: [PATCH 10/23] Expose stream-ordering to strings attribute APIs (#17398) Adds stream parameter to ``` cudf::strings::count_characters cudf::strings::count_bytes cudf::strings::code_points ``` Added stream gtests to verify correct stream forwarding. 
Reference: https://github.com/rapidsai/cudf/issues/13744 Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17398 --- cpp/include/cudf/strings/attributes.hpp | 6 ++ cpp/src/strings/attributes.cu | 9 ++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/strings/attributes_test.cpp | 59 +++++++++++++++++++ 4 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 cpp/tests/streams/strings/attributes_test.cpp diff --git a/cpp/include/cudf/strings/attributes.hpp b/cpp/include/cudf/strings/attributes.hpp index 5f2eda8fa5b..0de57b556ad 100644 --- a/cpp/include/cudf/strings/attributes.hpp +++ b/cpp/include/cudf/strings/attributes.hpp @@ -41,11 +41,13 @@ namespace strings { * Any null string will result in a null entry for that row in the output column. * * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with lengths for each string */ std::unique_ptr count_characters( strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -59,11 +61,13 @@ std::unique_ptr count_characters( * Any null string will result in a null entry for that row in the output column. 
* * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with the number of bytes for each string */ std::unique_ptr count_bytes( strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -79,11 +83,13 @@ std::unique_ptr count_bytes( * Any null string is ignored. No null entries will appear in the output column. * * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New INT32 column with code point integer values for each character */ std::unique_ptr code_points( strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of strings_apis group diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index c56d25fde2b..1c14cab4b1f 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -264,24 +264,27 @@ std::unique_ptr code_points(strings_column_view const& input, // external APIS std::unique_ptr count_characters(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::count_characters(input, cudf::get_default_stream(), mr); + return detail::count_characters(input, stream, mr); } std::unique_ptr count_bytes(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::count_bytes(input, cudf::get_default_stream(), mr); + return detail::count_bytes(input, 
stream, mr); } std::unique_ptr code_points(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::code_points(input, cudf::get_default_stream(), mr); + return detail::code_points(input, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 91c00d6af34..8928d27a871 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -724,6 +724,7 @@ ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST + streams/strings/attributes_test.cpp streams/strings/case_test.cpp streams/strings/combine_test.cpp streams/strings/contains_test.cpp diff --git a/cpp/tests/streams/strings/attributes_test.cpp b/cpp/tests/streams/strings/attributes_test.cpp new file mode 100644 index 00000000000..e15681eb8a7 --- /dev/null +++ b/cpp/tests/streams/strings/attributes_test.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +struct StringsAttributesTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsAttributesTest, CodePoints) +{ + std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; + cudf::test::strings_column_wrapper strings( + h_strings.begin(), + h_strings.end(), + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto strings_view = cudf::strings_column_view(strings); + + auto results = cudf::strings::code_points(strings_view, cudf::test::get_default_stream()); +} + +TEST_F(StringsAttributesTest, CountCharacters) +{ + std::vector h_strings( + 40000, "something a bit longer than 32 bytes ééé ééé ééé ééé ééé ééé ééé"); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); + auto strings_view = cudf::strings_column_view(strings); + + auto results = cudf::strings::count_characters(strings_view, cudf::test::get_default_stream()); +} + +TEST_F(StringsAttributesTest, CountBytes) +{ + std::vector h_strings{ + "eee", "bb", nullptr, "", "aa", "ééé", "something a bit longer than 32 bytes"}; + cudf::test::strings_column_wrapper strings( + h_strings.begin(), + h_strings.end(), + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto strings_view = cudf::strings_column_view(strings); + + auto results = cudf::strings::count_bytes(strings_view, cudf::test::get_default_stream()); +} From f05e89db8f6750232a452d072fa9f9ea988a6b34 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 25 Nov 2024 13:03:54 -0600 Subject: [PATCH 11/23] Single-partition Dask executor for cuDF-Polars (#17262) The goal here is to lay down the initial foundation for dask-based evaluation of `IR` graphs in cudf-polars. The first pass will only support single-partition workloads. This functionality could be achieved with much less-complicated changes to cudf-polars. 
However, we **do** want to build multi-partition support on top of this. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/17262 --- ci/run_cudf_polars_pytests.sh | 4 + python/cudf_polars/cudf_polars/callback.py | 18 +- python/cudf_polars/cudf_polars/dsl/ir.py | 25 +- .../cudf_polars/cudf_polars/dsl/translate.py | 3 +- .../cudf_polars/experimental/parallel.py | 236 ++++++++++++++++++ .../cudf_polars/testing/asserts.py | 11 +- python/cudf_polars/tests/conftest.py | 16 ++ .../tests/experimental/test_parallel.py | 21 ++ python/cudf_polars/tests/test_executors.py | 68 +++++ 9 files changed, 388 insertions(+), 14 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/experimental/parallel.py create mode 100644 python/cudf_polars/tests/experimental/test_parallel.py create mode 100644 python/cudf_polars/tests/test_executors.py diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh index c10612a065a..bf5a3ccee8e 100755 --- a/ci/run_cudf_polars_pytests.sh +++ b/ci/run_cudf_polars_pytests.sh @@ -8,4 +8,8 @@ set -euo pipefail # Support invoking run_cudf_polars_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ +# Test the default "cudf" executor python -m pytest --cache-clear "$@" tests + +# Test the "dask-experimental" executor +python -m pytest --cache-clear "$@" tests --executor dask-experimental diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 8dc5715195d..95527028aa9 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -9,7 +9,7 @@ import os import warnings from functools import cache, partial -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, 
Literal import nvtx @@ -181,6 +181,7 @@ def _callback( *, device: int | None, memory_resource: int | None, + executor: Literal["pylibcudf", "dask-experimental"] | None, ) -> pl.DataFrame: assert with_columns is None assert pyarrow_predicate is None @@ -191,7 +192,14 @@ def _callback( set_device(device), set_memory_resource(memory_resource), ): - return ir.evaluate(cache={}).to_polars() + if executor is None or executor == "pylibcudf": + return ir.evaluate(cache={}).to_polars() + elif executor == "dask-experimental": + from cudf_polars.experimental.parallel import evaluate_dask + + return evaluate_dask(ir).to_polars() + else: + raise ValueError(f"Unknown executor '{executor}'") def validate_config_options(config: dict) -> None: @@ -208,7 +216,9 @@ def validate_config_options(config: dict) -> None: ValueError If the configuration contains unsupported options. """ - if unsupported := (config.keys() - {"raise_on_fail", "parquet_options"}): + if unsupported := ( + config.keys() - {"raise_on_fail", "parquet_options", "executor"} + ): raise ValueError( f"Engine configuration contains unsupported settings: {unsupported}" ) @@ -243,6 +253,7 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None: device = config.device memory_resource = config.memory_resource raise_on_fail = config.config.get("raise_on_fail", False) + executor = config.config.get("executor", None) validate_config_options(config.config) with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): @@ -272,5 +283,6 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None: ir, device=device, memory_resource=memory_resource, + executor=executor, ) ) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 62a2da9dcea..6617b71be81 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1599,13 +1599,15 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR): # polars 
requires that all to-explode columns have the # same sub-shapes raise NotImplementedError("Explode with more than one column") + self.options = (tuple(to_explode),) elif self.name == "rename": - old, new, _ = self.options + old, new, strict = self.options # TODO: perhaps polars should validate renaming in the IR? if len(new) != len(set(new)) or ( set(new) & (set(df.schema.keys()) - set(old)) ): raise NotImplementedError("Duplicate new names in rename.") + self.options = (tuple(old), tuple(new), strict) elif self.name == "unpivot": indices, pivotees, variable_name, value_name = self.options value_name = "value" if value_name is None else value_name @@ -1623,13 +1625,15 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR): self.options = ( tuple(indices), tuple(pivotees), - (variable_name, schema[variable_name]), - (value_name, schema[value_name]), + variable_name, + value_name, ) - self._non_child_args = (name, self.options) + self._non_child_args = (schema, name, self.options) @classmethod - def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame: + def do_evaluate( + cls, schema: Schema, name: str, options: Any, df: DataFrame + ) -> DataFrame: """Evaluate and return a dataframe.""" if name == "rechunk": # No-op in our data model @@ -1651,8 +1655,8 @@ def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame: ( indices, pivotees, - (variable_name, variable_dtype), - (value_name, value_dtype), + variable_name, + value_name, ) = options npiv = len(pivotees) index_columns = [ @@ -1669,7 +1673,7 @@ def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame: plc.interop.from_arrow( pa.array( pivotees, - type=plc.interop.to_arrow(variable_dtype), + type=plc.interop.to_arrow(schema[variable_name]), ), ) ] @@ -1677,7 +1681,10 @@ def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame: df.num_rows, ).columns() value_column = plc.concatenate.concatenate( - 
[df.column_map[pivotee].astype(value_dtype).obj for pivotee in pivotees] + [ + df.column_map[pivotee].astype(schema[value_name]).obj + for pivotee in pivotees + ] ) return DataFrame( [ diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 12fc2a196cd..9480ce6e535 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -633,9 +633,10 @@ def _(node: pl_expr.Sort, translator: Translator, dtype: plc.DataType) -> expr.E @_translate_expr.register def _(node: pl_expr.SortBy, translator: Translator, dtype: plc.DataType) -> expr.Expr: + options = node.sort_options return expr.SortBy( dtype, - node.sort_options, + (options[0], tuple(options[1]), tuple(options[2])), translator.translate_expr(n=node.expr), *(translator.translate_expr(n=n) for n in node.by), ) diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py new file mode 100644 index 00000000000..6518dd60c7d --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -0,0 +1,236 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +"""Partitioned LogicalPlan nodes.""" + +from __future__ import annotations + +import operator +from functools import reduce, singledispatch +from typing import TYPE_CHECKING, Any + +from cudf_polars.dsl.ir import IR +from cudf_polars.dsl.traversal import traversal + +if TYPE_CHECKING: + from collections.abc import MutableMapping + from typing import TypeAlias + + from cudf_polars.containers import DataFrame + from cudf_polars.dsl.nodebase import Node + from cudf_polars.typing import GenericTransformer + + +class PartitionInfo: + """ + Partitioning information. + + This class only tracks the partition count (for now). 
+ """ + + __slots__ = ("count",) + + def __init__(self, count: int): + self.count = count + + +LowerIRTransformer: TypeAlias = ( + "GenericTransformer[IR, MutableMapping[IR, PartitionInfo]]" +) +"""Protocol for Lowering IR nodes.""" + + +def get_key_name(node: Node) -> str: + """Generate the key name for a Node.""" + return f"{type(node).__name__.lower()}-{hash(node)}" + + +@singledispatch +def lower_ir_node( + ir: IR, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + """ + Rewrite an IR node and extract partitioning information. + + Parameters + ---------- + ir + IR node to rewrite. + rec + Recursive LowerIRTransformer callable. + + Returns + ------- + new_ir, partition_info + The rewritten node, and a mapping from unique nodes in + the full IR graph to associated partitioning information. + + Notes + ----- + This function is used by `lower_ir_graph`. + + See Also + -------- + lower_ir_graph + """ + raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover + + +@lower_ir_node.register(IR) +def _(ir: IR, rec: LowerIRTransformer) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + if len(ir.children) == 0: + # Default leaf node has single partition + return ir, {ir: PartitionInfo(count=1)} + + # Lower children + children, _partition_info = zip(*(rec(c) for c in ir.children), strict=False) + partition_info = reduce(operator.or_, _partition_info) + + # Check that child partitioning is supported + count = max(partition_info[c].count for c in children) + if count > 1: + raise NotImplementedError( + f"Class {type(ir)} does not support multiple partitions." + ) # pragma: no cover + + # Return reconstructed node and partition-info dict + partition = PartitionInfo(count=1) + new_node = ir.reconstruct(children) + partition_info[new_node] = partition + return new_node, partition_info + + +def lower_ir_graph(ir: IR) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + """ + Rewrite an IR graph and extract partitioning information. 
+ + Parameters + ---------- + ir + Root of the graph to rewrite. + + Returns + ------- + new_ir, partition_info + The rewritten graph, and a mapping from unique nodes + in the new graph to associated partitioning information. + + Notes + ----- + This function traverses the unique nodes of the graph with + root `ir`, and applies :func:`lower_ir_node` to each node. + + See Also + -------- + lower_ir_node + """ + from cudf_polars.dsl.traversal import CachingVisitor + + mapper = CachingVisitor(lower_ir_node) + return mapper(ir) + + +@singledispatch +def generate_ir_tasks( + ir: IR, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + """ + Generate a task graph for evaluation of an IR node. + + Parameters + ---------- + ir + IR node to generate tasks for. + partition_info + Partitioning information, obtained from :func:`lower_ir_graph`. + + Returns + ------- + mapping + A (partial) dask task graph for the evaluation of an ir node. + + Notes + ----- + Task generation should only produce the tasks for the current node, + referring to child tasks by name. + + See Also + -------- + task_graph + """ + raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover + + +@generate_ir_tasks.register(IR) +def _( + ir: IR, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + # Single-partition default behavior. + # This is used by `generate_ir_tasks` for all unregistered IR sub-types. + if partition_info[ir].count > 1: + raise NotImplementedError( + f"Failed to generate multiple output tasks for {ir}." + ) # pragma: no cover + + child_names = [] + for child in ir.children: + child_names.append(get_key_name(child)) + if partition_info[child].count > 1: + raise NotImplementedError( + f"Failed to generate tasks for {ir} with child {child}." 
+ ) # pragma: no cover + + key_name = get_key_name(ir) + return { + (key_name, 0): ( + ir.do_evaluate, + *ir._non_child_args, + *((child_name, 0) for child_name in child_names), + ) + } + + +def task_graph( + ir: IR, partition_info: MutableMapping[IR, PartitionInfo] +) -> tuple[MutableMapping[Any, Any], str | tuple[str, int]]: + """ + Construct a task graph for evaluation of an IR graph. + + Parameters + ---------- + ir + Root of the graph to rewrite. + partition_info + A mapping from all unique IR nodes to the + associated partitioning information. + + Returns + ------- + graph + A Dask-compatible task graph for the entire + IR graph with root `ir`. + + Notes + ----- + This function traverses the unique nodes of the + graph with root `ir`, and extracts the tasks for + each node with :func:`generate_ir_tasks`. + + See Also + -------- + generate_ir_tasks + """ + graph = reduce( + operator.or_, + (generate_ir_tasks(node, partition_info) for node in traversal(ir)), + ) + return graph, (get_key_name(ir), 0) + + +def evaluate_dask(ir: IR) -> DataFrame: + """Evaluate an IR graph with Dask.""" + from dask import get + + ir, partition_info = lower_ir_graph(ir) + + graph, key = task_graph(ir, partition_info) + return get(graph, key) diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index ba0bb12a0fb..d986f150b2e 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -20,6 +20,11 @@ __all__: list[str] = ["assert_gpu_result_equal", "assert_ir_translation_raises"] +# Will be overridden by `conftest.py` with the value from the `--executor` +# command-line argument +Executor = None + + def assert_gpu_result_equal( lazydf: pl.LazyFrame, *, @@ -34,6 +39,7 @@ def assert_gpu_result_equal( rtol: float = 1e-05, atol: float = 1e-08, categorical_as_str: bool = False, + executor: str | None = None, ) -> None: """ Assert that collection of a lazyframe on GPU
produces correct results. @@ -71,6 +77,9 @@ def assert_gpu_result_equal( Absolute tolerance for float comparisons categorical_as_str Decat categoricals to strings before comparing + executor + The executor configuration to pass to `GPUEngine`. If not specified + uses the module level `Executor` attribute. Raises ------ @@ -80,7 +89,7 @@ def assert_gpu_result_equal( If GPU collection failed in some way. """ if engine is None: - engine = GPUEngine(raise_on_fail=True) + engine = GPUEngine(raise_on_fail=True, executor=executor or Executor) final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py index 9bbce6bc080..6338bf0cae1 100644 --- a/python/cudf_polars/tests/conftest.py +++ b/python/cudf_polars/tests/conftest.py @@ -8,3 +8,19 @@ @pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"], scope="session") def with_nulls(request): return request.param + + +def pytest_addoption(parser): + parser.addoption( + "--executor", + action="store", + default="pylibcudf", + choices=("pylibcudf", "dask-experimental"), + help="Executor to use for GPUEngine.", + ) + + +def pytest_configure(config): + import cudf_polars.testing.asserts + + cudf_polars.testing.asserts.Executor = config.getoption("--executor") diff --git a/python/cudf_polars/tests/experimental/test_parallel.py b/python/cudf_polars/tests/experimental/test_parallel.py new file mode 100644 index 00000000000..d46ab88eebf --- /dev/null +++ b/python/cudf_polars/tests/experimental/test_parallel.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import polars as pl +from polars import GPUEngine +from polars.testing import assert_frame_equal + + +def test_evaluate_dask(): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7], "d": [7, 9, 8]}) + q = df.select(pl.col("a") - (pl.col("b") + pl.col("c") * 2), pl.col("d")).sort("d") + + expected = q.collect(engine="cpu") + got_gpu = q.collect(engine=GPUEngine(raise_on_fail=True)) + got_dask = q.collect( + engine=GPUEngine(raise_on_fail=True, executor="dask-experimental") + ) + assert_frame_equal(expected, got_gpu) + assert_frame_equal(expected, got_dask) diff --git a/python/cudf_polars/tests/test_executors.py b/python/cudf_polars/tests/test_executors.py new file mode 100644 index 00000000000..3eaea2ec9ea --- /dev/null +++ b/python/cudf_polars/tests/test_executors.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("executor", [None, "pylibcudf", "dask-experimental"]) +def test_executor_basics(executor): + if executor == "dask-experimental": + pytest.importorskip("dask") + + df = pl.LazyFrame( + { + "a": pl.Series([[1, 2], [3]], dtype=pl.List(pl.Int8())), + "b": pl.Series([[1], [2]], dtype=pl.List(pl.UInt16())), + "c": pl.Series( + [ + [["1", "2", "3"], ["4", "567"]], + [["8", "9"], []], + ], + dtype=pl.List(pl.List(pl.String())), + ), + "d": pl.Series([[[1, 2]], []], dtype=pl.List(pl.List(pl.UInt16()))), + } + ) + + assert_gpu_result_equal(df, executor=executor) + + +def test_cudf_cache_evaluate(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a")) + query = pl.concat([ldf, ldf2], 
how="diagonal") + assert_gpu_result_equal(query, executor="pylibcudf") + + +def test_dask_experimental_map_function_get_hashable(): + df = pl.LazyFrame( + { + "a": pl.Series([11, 12, 13], dtype=pl.UInt16), + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot(index="d") + assert_gpu_result_equal(q, executor="dask-experimental") + + +def test_unknown_executor(): + df = pl.LazyFrame({}) + + with pytest.raises( + pl.exceptions.ComputeError, + match="ValueError: Unknown executor 'unknown-executor'", + ): + assert_gpu_result_equal(df, executor="unknown-executor") From 4e3afd26127ebd0c04b739032873d3fce01eb1b7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:00:11 -0500 Subject: [PATCH 12/23] Change binops for-each kernel to thrust::for_each_n (#17419) Replaces the custom `for_each_kernel` in `binary_ops.cuh` with `thrust::for_each_n` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17419 --- cpp/src/binaryop/compiled/binary_ops.cuh | 56 ++++------------------- cpp/src/binaryop/compiled/equality_ops.cu | 38 ++++++++------- 2 files changed, 31 insertions(+), 63 deletions(-) diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 06987139188..ec63504a414 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -244,44 +244,6 @@ struct binary_op_double_device_dispatcher { } }; -/** - * @brief Simplified for_each kernel - * - * @param size number of elements to process. - * @param f Functor object to call for each element. 
- */ -template -CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f) -{ - auto start = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - -#pragma unroll - for (auto i = start; i < size; i += stride) { - f(i); - } -} - -/** - * @brief Launches Simplified for_each kernel with maximum occupancy grid dimensions. - * - * @tparam Functor - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param size number of elements to process. - * @param f Functor object to call for each element. - */ -template -void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f) -{ - int block_size; - int min_grid_size; - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel)); - auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */); - for_each_kernel<<>>( - size, std::forward(f)); -} - template void apply_binary_op(mutable_column_view& out, column_view const& lhs, @@ -298,16 +260,18 @@ void apply_binary_op(mutable_column_view& out, // Create binop functor instance if (common_dtype) { // Execute it on every element - for_each(stream, - out.size(), - binary_op_device_dispatcher{ - *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_device_dispatcher{ + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else { // Execute it on every element - for_each(stream, - out.size(), - binary_op_double_device_dispatcher{ - *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_double_device_dispatcher{ + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu index 
041fca76494..d8c50683026 100644 --- a/cpp/src/binaryop/compiled/equality_ops.cu +++ b/cpp/src/binaryop/compiled/equality_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,27 +34,31 @@ void dispatch_equality_op(mutable_column_view& out, auto rhsd = column_device_view::create(rhs, stream); if (common_dtype) { if (op == binary_operator::EQUAL) { - for_each(stream, - out.size(), - binary_op_device_dispatcher{ - *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_device_dispatcher{ + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else if (op == binary_operator::NOT_EQUAL) { - for_each(stream, - out.size(), - binary_op_device_dispatcher{ - *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_device_dispatcher{ + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } else { if (op == binary_operator::EQUAL) { - for_each(stream, - out.size(), - binary_op_double_device_dispatcher{ - *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_double_device_dispatcher{ + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else if (op == binary_operator::NOT_EQUAL) { - for_each(stream, - out.size(), - binary_op_double_device_dispatcher{ - *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_double_device_dispatcher{ + *outd, *lhsd, *rhsd, is_lhs_scalar, 
is_rhs_scalar}); } } } From ccc8833170b8099bf0de56f885ad1fdad5c43ada Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:19:13 -0800 Subject: [PATCH 13/23] Remove cudf._lib.timezone in favor of inlining pylibcudf (#17366) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - James Lamb (https://github.com/jameslamb) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17366 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/timezone.pyx | 10 ---------- python/cudf/cudf/core/_internals/timezones.py | 11 ++++++++--- 4 files changed, 8 insertions(+), 15 deletions(-) delete mode 100644 python/cudf/cudf/_lib/timezone.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 1c2b24d2391..391c0eac858 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -41,7 +41,6 @@ set(cython_sources string_casting.pyx strings_udf.pyx text.pyx - timezone.pyx transform.pyx types.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 13d05033c11..8ce6d5bf5c9 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -28,7 +28,6 @@ strings, strings_udf, text, - timezone, ) MAX_COLUMN_SIZE = np.iinfo(np.int32).max diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx deleted file mode 100644 index 54624a5a2fd..00000000000 --- a/python/cudf/cudf/_lib/timezone.pyx +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -def make_timezone_transition_table(tzdir, tzname): - plc_table = plc.io.timezone.make_timezone_transition_table(tzdir, tzname) - return [Column.from_pylibcudf(col) for col in plc_table.columns()] diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index fd89904e766..4d001577581 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -10,8 +10,10 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf -from cudf._lib.timezone import make_timezone_transition_table +from cudf._lib.column import Column if TYPE_CHECKING: from cudf.core.column.datetime import DatetimeColumn @@ -109,11 +111,14 @@ def _find_and_read_tzfile_tzdata( def _read_tzfile_as_columns( - tzdir, zone_name: str + tzdir: str, zone_name: str ) -> tuple[DatetimeColumn, TimeDeltaColumn]: - transition_times_and_offsets = make_timezone_transition_table( + plc_table = plc.io.timezone.make_timezone_transition_table( tzdir, zone_name ) + transition_times_and_offsets = [ + Column.from_pylibcudf(col) for col in plc_table.columns() + ] if not transition_times_and_offsets: from cudf.core.column.column import as_column From df177400e2aa956651f3ee5343416b5fb3d9e86c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:21:48 -0800 Subject: [PATCH 14/23] Remove cudf._lib.partitioning in favor of inlining pylibcudf (#17369) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17369 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/partitioning.pyx | 53 -------------------------- python/cudf/cudf/core/dataframe.py | 43 
+++++++++++++++++++-- 4 files changed, 39 insertions(+), 59 deletions(-) delete mode 100644 python/cudf/cudf/_lib/partitioning.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 391c0eac858..b430d9f3d76 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -29,7 +29,6 @@ set(cython_sources null_mask.pyx orc.pyx parquet.pyx - partitioning.pyx reduce.pyx replace.pyx reshape.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 8ce6d5bf5c9..942e32747bc 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -16,7 +16,6 @@ nvtext, orc, parquet, - partitioning, reduce, replace, reshape, diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx deleted file mode 100644 index 13997da8403..00000000000 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - -from cudf._lib.reduce import minmax -from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count - - -@acquire_spill_lock() -def partition(list source_columns, Column partition_map, - object num_partitions): - """Partition source columns given a partitioning map - - Parameters - ---------- - source_columns: list[Column] - Columns to partition - partition_map: Column - Column of integer values that map each row in the input to a - partition - num_partitions: Optional[int] - Number of output partitions (deduced from unique values in - partition_map if None) - - Returns - ------- - Pair of reordered columns and partition offsets - - Raises - ------ - ValueError - If the partition map has invalid entries (not all in [0, - num_partitions)). 
- """ - - if num_partitions is None: - num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) - - if partition_map.size > 0: - lo, hi = minmax(partition_map) - if lo < 0 or hi >= num_partitions: - raise ValueError("Partition map has invalid values") - - plc_table, offsets = plc.partitioning.partition( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), - partition_map.to_pylibcudf(mode="read"), - num_partitions - ) - return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 728cc47a7c9..7d523d2c5ad 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2487,11 +2487,46 @@ def scatter_by_map( f"ERROR: map_size must be >= {count} (got {map_size})." ) - partitioned_columns, output_offsets = libcudf.partitioning.partition( - [*(self.index._columns if keep_index else ()), *self._columns], - map_index, - map_size, + source_columns = ( + itertools.chain(self.index._columns, self._columns) + if keep_index + else self._columns ) + + with acquire_spill_lock(): + if map_size is None: + map_size = plc.stream_compaction.distinct_count( + map_index.to_pylibcudf(mode="read"), + plc.types.NullPolicy.EXCLUDE, + plc.types.NanPolicy.NAN_IS_VALID, + ) + + if map_index.size > 0: + plc_lo, plc_hi = plc.reduce.minmax( + map_index.to_pylibcudf(mode="read") + ) + # TODO: Use pylibcudf Scalar once APIs are more developed + lo = libcudf.column.Column.from_pylibcudf( + plc.Column.from_scalar(plc_lo, 1) + ).element_indexing(0) + hi = libcudf.column.Column.from_pylibcudf( + plc.Column.from_scalar(plc_hi, 1) + ).element_indexing(0) + if lo < 0 or hi >= map_size: + raise ValueError("Partition map has invalid values") + + plc_table, output_offsets = plc.partitioning.partition( + plc.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] + ), + map_index.to_pylibcudf(mode="read"), + map_size, + ) + 
partitioned_columns = [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] + partitioned = self._from_columns_like_self( partitioned_columns, column_names=self._column_names, From d8277bffacc8fd91f4009be310b5347e9c8a6397 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:23:24 -0800 Subject: [PATCH 15/23] Remove cudf._lib.lists in favor of inlining pylibcudf (#17425) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17425 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/lists.pyx | 150 --------------------- python/cudf/cudf/core/column/lists.py | 180 ++++++++++++++++++++----- python/cudf/cudf/core/indexed_frame.py | 19 ++- 4 files changed, 159 insertions(+), 191 deletions(-) delete mode 100644 python/cudf/cudf/_lib/lists.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index b430d9f3d76..a0457120935 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -24,7 +24,6 @@ set(cython_sources interop.pyx join.pyx json.pyx - lists.pyx merge.pyx null_mask.pyx orc.pyx diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx deleted file mode 100644 index 90a137dd546..00000000000 --- a/python/cudf/cudf/_lib/lists.pyx +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf as plc - - -@acquire_spill_lock() -def count_elements(Column col): - return Column.from_pylibcudf( - plc.lists.count_elements( - col.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def explode_outer(list source_columns, int explode_column_idx): - return columns_from_pylibcudf_table( - plc.lists.explode_outer( - plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]), - explode_column_idx, - ) - ) - - -@acquire_spill_lock() -def distinct(Column col, bool nulls_equal, bool nans_all_equal): - return Column.from_pylibcudf( - plc.lists.distinct( - col.to_pylibcudf(mode="read"), - ( - plc.types.NullEquality.EQUAL - if nulls_equal - else plc.types.NullEquality.UNEQUAL - ), - ( - plc.types.NanEquality.ALL_EQUAL - if nans_all_equal - else plc.types.NanEquality.UNEQUAL - ), - ) - ) - - -@acquire_spill_lock() -def sort_lists(Column col, bool ascending, str na_position): - return Column.from_pylibcudf( - plc.lists.sort_lists( - col.to_pylibcudf(mode="read"), - plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING, - ( - plc.types.NullOrder.BEFORE - if na_position == "first" - else plc.types.NullOrder.AFTER - ), - False, - ) - ) - - -@acquire_spill_lock() -def extract_element_scalar(Column col, size_type index): - return Column.from_pylibcudf( - plc.lists.extract_list_element( - col.to_pylibcudf(mode="read"), - index, - ) - ) - - -@acquire_spill_lock() -def extract_element_column(Column col, Column index): - return Column.from_pylibcudf( - plc.lists.extract_list_element( - col.to_pylibcudf(mode="read"), - index.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def contains_scalar(Column col, py_search_key): - return Column.from_pylibcudf( - plc.lists.contains( - 
col.to_pylibcudf(mode="read"), - py_search_key.device_value.c_value, - ) - ) - - -@acquire_spill_lock() -def index_of_scalar(Column col, object py_search_key): - return Column.from_pylibcudf( - plc.lists.index_of( - col.to_pylibcudf(mode="read"), - py_search_key.device_value.c_value, - plc.lists.DuplicateFindOption.FIND_FIRST, - ) - ) - - -@acquire_spill_lock() -def index_of_column(Column col, Column search_keys): - return Column.from_pylibcudf( - plc.lists.index_of( - col.to_pylibcudf(mode="read"), - search_keys.to_pylibcudf(mode="read"), - plc.lists.DuplicateFindOption.FIND_FIRST, - ) - ) - - -@acquire_spill_lock() -def concatenate_rows(list source_columns): - return Column.from_pylibcudf( - plc.lists.concatenate_rows( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]) - ) - ) - - -@acquire_spill_lock() -def concatenate_list_elements(Column input_column, dropna=False): - return Column.from_pylibcudf( - plc.lists.concatenate_list_elements( - input_column.to_pylibcudf(mode="read"), - plc.lists.ConcatenateNullPolicy.IGNORE - if dropna - else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, - ) - ) - - -@acquire_spill_lock() -def segmented_gather(Column source_column, Column gather_map): - return Column.from_pylibcudf( - plc.lists.segmented_gather( - source_column.to_pylibcudf(mode="read"), - gather_map.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6b25e568f00..9962663e811 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -2,31 +2,21 @@ from __future__ import annotations +import itertools from functools import cached_property -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, cast -import numpy as np import pandas as pd import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf -from cudf._lib.lists import ( - concatenate_list_elements, - 
concatenate_rows, - contains_scalar, - count_elements, - distinct, - extract_element_column, - extract_element_scalar, - index_of_column, - index_of_scalar, - segmented_gather, - sort_lists, -) from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.column.numerical import NumericalColumn @@ -139,7 +129,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return NotImplemented if isinstance(other.dtype, ListDtype): if op == "__add__": - return concatenate_rows([self, other]) + return self.concatenate_rows([other]) # type: ignore[list-item] else: raise NotImplementedError( "Lists concatenation for this operation is not yet" @@ -326,6 +316,129 @@ def to_pandas( else: return pd.Index(self.to_arrow().tolist(), dtype="object") + @acquire_spill_lock() + def count_elements(self) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.count_elements(self.to_pylibcudf(mode="read")) + ) + + @acquire_spill_lock() + def distinct(self, nulls_equal: bool, nans_all_equal: bool) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.distinct( + self.to_pylibcudf(mode="read"), + ( + plc.types.NullEquality.EQUAL + if nulls_equal + else plc.types.NullEquality.UNEQUAL + ), + ( + plc.types.NanEquality.ALL_EQUAL + if nans_all_equal + else plc.types.NanEquality.UNEQUAL + ), + ) + ) + + @acquire_spill_lock() + def sort_lists( + self, ascending: bool, na_position: Literal["first", "last"] + ) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.sort_lists( + self.to_pylibcudf(mode="read"), + plc.types.Order.ASCENDING + if ascending + else plc.types.Order.DESCENDING, + ( + plc.types.NullOrder.BEFORE + if na_position == "first" + else 
plc.types.NullOrder.AFTER + ), + False, + ) + ) + + @acquire_spill_lock() + def extract_element_scalar(self, index: int) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.extract_list_element( + self.to_pylibcudf(mode="read"), + index, + ) + ) + + @acquire_spill_lock() + def extract_element_column(self, index: ColumnBase) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.extract_list_element( + self.to_pylibcudf(mode="read"), + index.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.contains( + self.to_pylibcudf(mode="read"), + search_key.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.index_of( + self.to_pylibcudf(mode="read"), + search_key.device_value.c_value, + plc.lists.DuplicateFindOption.FIND_FIRST, + ) + ) + + @acquire_spill_lock() + def index_of_column(self, search_keys: ColumnBase) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.index_of( + self.to_pylibcudf(mode="read"), + search_keys.to_pylibcudf(mode="read"), + plc.lists.DuplicateFindOption.FIND_FIRST, + ) + ) + + @acquire_spill_lock() + def concatenate_rows(self, other_columns: list[ColumnBase]) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.concatenate_rows( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain([self], other_columns) + ] + ) + ) + ) + + @acquire_spill_lock() + def concatenate_list_elements(self, dropna: bool) -> ColumnBase: + return type(self).from_pylibcudf( + plc.lists.concatenate_list_elements( + self.to_pylibcudf(mode="read"), + plc.lists.ConcatenateNullPolicy.IGNORE + if dropna + else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, + ) + ) + + @acquire_spill_lock() + def segmented_gather(self, gather_map: ColumnBase) -> ColumnBase: + return 
type(self).from_pylibcudf( + plc.lists.segmented_gather( + self.to_pylibcudf(mode="read"), + gather_map.to_pylibcudf(mode="read"), + ) + ) + class ListMethods(ColumnMethods): """ @@ -397,18 +510,16 @@ def get( 2 6 dtype: int64 """ - if is_scalar(index): - out = extract_element_scalar(self._column, cudf.Scalar(index)) + if isinstance(index, int): + out = self._column.extract_element_scalar(index) else: index = as_column(index) - out = extract_element_column(self._column, as_column(index)) + out = self._column.extract_element_column(index) if not (default is None or default is NA): # determine rows for which `index` is out-of-bounds - lengths = count_elements(self._column) - out_of_bounds_mask = (np.negative(index) > lengths) | ( - index >= lengths - ) + lengths = self._column.count_elements() + out_of_bounds_mask = ((-1 * index) > lengths) | (index >= lengths) # replace the value in those rows (should be NA) with `default` if out_of_bounds_mask.any(): @@ -445,7 +556,7 @@ def contains(self, search_key: ScalarLike) -> ParentType: dtype: bool """ return self._return_or_inplace( - contains_scalar(self._column, cudf.Scalar(search_key)) + self._column.contains_scalar(cudf.Scalar(search_key)) ) def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: @@ -494,13 +605,10 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: """ if is_scalar(search_key): - return self._return_or_inplace( - index_of_scalar(self._column, cudf.Scalar(search_key)) - ) + result = self._column.index_of_scalar(cudf.Scalar(search_key)) else: - return self._return_or_inplace( - index_of_column(self._column, as_column(search_key)) - ) + result = self._column.index_of_column(as_column(search_key)) + return self._return_or_inplace(result) @property def leaves(self) -> ParentType: @@ -550,7 +658,7 @@ def len(self) -> ParentType: 2 2 dtype: int32 """ - return self._return_or_inplace(count_elements(self._column)) + return self._return_or_inplace(self._column.count_elements()) def 
take(self, lists_indices: ColumnLike) -> ParentType: """ @@ -598,7 +706,7 @@ def take(self, lists_indices: ColumnLike) -> ParentType: ) return self._return_or_inplace( - segmented_gather(self._column, lists_indices_col) + self._column.segmented_gather(lists_indices_col) ) def unique(self) -> ParentType: @@ -631,7 +739,7 @@ def unique(self) -> ParentType: raise NotImplementedError("Nested lists unique is not supported.") return self._return_or_inplace( - distinct(self._column, nulls_equal=True, nans_all_equal=True) + self._column.distinct(nulls_equal=True, nans_all_equal=True) ) def sort_values( @@ -639,7 +747,7 @@ def sort_values( ascending: bool = True, inplace: bool = False, kind: str = "quicksort", - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ignore_index: bool = False, ) -> ParentType: """ @@ -692,7 +800,7 @@ def sort_values( raise NotImplementedError("Nested lists sort is not supported.") return self._return_or_inplace( - sort_lists(self._column, ascending, na_position), + self._column.sort_lists(ascending, na_position), retain_index=not ignore_index, ) @@ -742,7 +850,7 @@ def concat(self, dropna=True) -> ParentType: dtype: list """ return self._return_or_inplace( - concatenate_list_elements(self._column, dropna=dropna) + self._column.concatenate_list_elements(dropna) ) def astype(self, dtype): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9130779c3e9..4e839aaeb6a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import itertools import operator import textwrap import warnings @@ -5316,10 +5317,20 @@ def _explode(self, explode_column: Any, ignore_index: bool): else: idx_cols = () - exploded = libcudf.lists.explode_outer( - [*idx_cols, *self._columns], - column_index + len(idx_cols), - ) + with acquire_spill_lock(): + plc_table = plc.lists.explode_outer( + plc.Table( + [ + 
col.to_pylibcudf(mode="read") + for col in itertools.chain(idx_cols, self._columns) + ] + ), + column_index + len(idx_cols), + ) + exploded = [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] # We must copy inner datatype of the exploded list column to # maintain struct dtype key names element_type = cast( From 0bd95c93b44bb626ce26bd0788f9f2158008fe4d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:24:41 -0800 Subject: [PATCH 16/23] Remove cudf._lib.rolling in favor of inlining pylibcudf (#17423) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17423 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/rolling.pyx | 67 ------------------------- python/cudf/cudf/core/window/rolling.py | 44 ++++++++++++---- 4 files changed, 34 insertions(+), 79 deletions(-) delete mode 100644 python/cudf/cudf/_lib/rolling.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index a0457120935..8d3af5205fa 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -31,7 +31,6 @@ set(cython_sources reduce.pyx replace.pyx reshape.pyx - rolling.pyx round.pyx scalar.pyx sort.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 942e32747bc..1e0bf931c97 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -19,7 +19,6 @@ reduce, replace, reshape, - rolling, round, sort, stream_compaction, diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx deleted file mode 100644 index 687b261c2c7..00000000000 --- a/python/cudf/cudf/_lib/rolling.pyx +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 
2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf - -from cudf._lib.aggregation import make_aggregation - - -@acquire_spill_lock() -def rolling(Column source_column, - Column pre_column_window, - Column fwd_column_window, - window, - min_periods, - center, - op, - agg_params): - """ - Rolling on input executing operation within the given window for each row - - Parameters - ---------- - source_column : input column on which rolling operation is executed - pre_column_window : prior window for each element of source_column - fwd_column_window : forward window for each element of source_column - window : Size of the moving window, can be integer or None - min_periods : Minimum number of observations in window required to have - a value (otherwise result is null) - center : Set the labels at the center of the window - op : operation to be executed - agg_params : dict, parameter for the aggregation (e.g. 
ddof for VAR/STD) - - Returns - ------- - A Column with rolling calculations - """ - - if window is None: - if center: - # TODO: we can support this even though Pandas currently does not - raise NotImplementedError( - "center is not implemented for offset-based windows" - ) - pre = pre_column_window.to_pylibcudf(mode="read") - fwd = fwd_column_window.to_pylibcudf(mode="read") - else: - if center: - pre = (window // 2) + 1 - fwd = window - (pre) - else: - pre = window - fwd = 0 - - return Column.from_pylibcudf( - pylibcudf.rolling.rolling_window( - source_column.to_pylibcudf(mode="read"), - pre, - fwd, - min_periods, - make_aggregation( - op, {'dtype': source_column.dtype} if callable(op) else agg_params - ).c_obj, - ) - ) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 967edc2ab15..d2cb5e8c190 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -8,8 +8,11 @@ import pandas as pd from pandas.api.indexers import BaseIndexer +import pylibcudf as plc + import cudf from cudf import _lib as libcudf +from cudf._lib.aggregation import make_aggregation from cudf.api.types import is_integer, is_number from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import as_column @@ -284,16 +287,37 @@ def _apply_agg_column(self, source_column, agg_name): ) window = None - return libcudf.rolling.rolling( - source_column=source_column, - pre_column_window=preceding_window, - fwd_column_window=following_window, - window=window, - min_periods=min_periods, - center=self.center, - op=agg_name, - agg_params=self.agg_params, - ) + with acquire_spill_lock(): + if window is None: + if self.center: + # TODO: we can support this even though Pandas currently does not + raise NotImplementedError( + "center is not implemented for offset-based windows" + ) + pre = preceding_window.to_pylibcudf(mode="read") + fwd = following_window.to_pylibcudf(mode="read") + else: + if 
self.center: + pre = (window // 2) + 1 + fwd = window - (pre) + else: + pre = window + fwd = 0 + + return libcudf.column.Column.from_pylibcudf( + plc.rolling.rolling_window( + source_column.to_pylibcudf(mode="read"), + pre, + fwd, + min_periods, + make_aggregation( + agg_name, + {"dtype": source_column.dtype} + if callable(agg_name) + else self.agg_params, + ).c_obj, + ) + ) def _reduce( self, From ab36fc6f7ac91f63841666e59176229944f869b2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 19:56:46 -0800 Subject: [PATCH 17/23] Return categorical values in to_numpy/to_cupy (#17388) closes #17381 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17388 --- python/cudf/cudf/core/frame.py | 5 +++++ python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 838fde260df..0c0f271fe6f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -427,6 +427,8 @@ def to_array( ) -> cupy.ndarray | numpy.ndarray: if na_value is not None: col = col.fillna(na_value) + if isinstance(col.dtype, cudf.CategoricalDtype): + col = col._get_decategorized_column() # type: ignore[attr-defined] array = get_array(col) casted_array = module.asarray(array, dtype=dtype) if copy and casted_array is array: @@ -448,6 +450,9 @@ def to_array( else: dtype = find_common_type([dtype for _, dtype in self._dtypes]) + if isinstance(dtype, cudf.CategoricalDtype): + dtype = dtype.categories.dtype + if not isinstance(dtype, numpy.dtype): raise NotImplementedError( f"{dtype} cannot be exposed as an array" diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 0f2b41888fa..509ee0d65a5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ 
b/python/cudf/cudf/tests/test_dataframe.py @@ -1306,6 +1306,18 @@ def test_dataframe_to_cupy_null_values(): np.testing.assert_array_equal(refvalues[k], mat[:, i]) +@pytest.mark.parametrize("method", ["to_cupy", "to_numpy"]) +@pytest.mark.parametrize("value", [1, True, 1.5]) +@pytest.mark.parametrize("constructor", ["DataFrame", "Series"]) +def test_to_array_categorical(method, value, constructor): + data = [value] + expected = getattr(pd, constructor)(data, dtype="category").to_numpy() + result = getattr( + getattr(cudf, constructor)(data, dtype="category"), method + )() + assert_eq(result, expected) + + def test_dataframe_append_empty(): pdf = pd.DataFrame( { From 79a986067688c18b3d431c7a3acc23e2307fb668 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 22:02:06 -0800 Subject: [PATCH 18/23] Remove cudf._lib.reshape in favor of inlining pylibcudf (#17368) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17368 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/reshape.pyx | 35 -------------- python/cudf/cudf/core/dataframe.py | 61 +++++++++++++++++------- python/cudf/cudf/core/groupby/groupby.py | 15 +++++- python/cudf/cudf/core/indexed_frame.py | 22 +++++++-- 6 files changed, 75 insertions(+), 60 deletions(-) delete mode 100644 python/cudf/cudf/_lib/reshape.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8d3af5205fa..61d3bcbe24e 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -30,7 +30,6 @@ set(cython_sources parquet.pyx reduce.pyx replace.pyx - reshape.pyx round.pyx scalar.pyx sort.pyx diff --git a/python/cudf/cudf/_lib/__init__.py 
b/python/cudf/cudf/_lib/__init__.py index 1e0bf931c97..efa437eebb7 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -18,7 +18,6 @@ parquet, reduce, replace, - reshape, round, sort, stream_compaction, diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx deleted file mode 100644 index 6cebeb2bc16..00000000000 --- a/python/cudf/cudf/_lib/reshape.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf as plc - - -@acquire_spill_lock() -def interleave_columns(list source_columns): - return Column.from_pylibcudf( - plc.reshape.interleave_columns( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]) - ) - ) - - -@acquire_spill_lock() -def tile(list source_columns, size_type count): - cdef size_type c_count = count - - return columns_from_pylibcudf_table( - plc.reshape.tile( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]), - c_count - ) - ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7d523d2c5ad..73c0af45293 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7251,13 +7251,22 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): repeated_index = self.index.repeat(len(unique_named_levels)) # Each column name should tile itself by len(df) times - tiled_index = libcudf.reshape.tile( - [ - as_column(unique_named_levels.get_level_values(i)) - for i in range(unique_named_levels.nlevels) - ], - self.shape[0], - ) + with acquire_spill_lock(): + plc_table = plc.reshape.tile( + plc.Table( + [ + as_column( + unique_named_levels.get_level_values(i) + ).to_pylibcudf(mode="read") + for i in range(unique_named_levels.nlevels) + ] 
+ ), + self.shape[0], + ) + tiled_index = [ + libcudf.column.Column.from_pylibcudf(plc) + for plc in plc_table.columns() + ] # Assemble the final index new_index_columns = [*repeated_index._columns, *tiled_index] @@ -7271,7 +7280,6 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): data=range(self._num_columns), index=named_levels ) - column_indices: list[list[int]] = [] if has_unnamed_levels: unnamed_level_values = list( map(column_name_idx.get_level_values, unnamed_levels_indices) @@ -7307,13 +7315,11 @@ def unnamed_group_generator(): else: yield column_idx_df.sort_index().values - column_indices = list(unnamed_group_generator()) - # For each of the group constructed from the unnamed levels, # invoke `interleave_columns` to stack the values. stacked = [] - for column_idx in column_indices: + for column_idx in unnamed_group_generator(): # Collect columns based on indices, append None for -1 indices. columns = [ None if i == -1 else self._data.select_by_index(i).columns[0] @@ -7332,12 +7338,23 @@ def unnamed_group_generator(): ) # homogenize the dtypes of the columns - homogenized = [ + homogenized = ( col.astype(common_type) if col is not None else all_nulls() for col in columns - ] + ) - stacked.append(libcudf.reshape.interleave_columns(homogenized)) + with acquire_spill_lock(): + interleaved_col = libcudf.column.Column.from_pylibcudf( + plc.reshape.interleave_columns( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in homogenized + ] + ) + ) + ) + stacked.append(interleaved_col) # Construct the resulting dataframe / series if not has_unnamed_levels: @@ -7838,10 +7855,18 @@ def interleave_columns(self): raise ValueError( "interleave_columns does not support 'category' dtype." 
) - - return self._constructor_sliced._from_column( - libcudf.reshape.interleave_columns([*self._columns]) - ) + with acquire_spill_lock(): + result_col = libcudf.column.Column.from_pylibcudf( + plc.reshape.interleave_columns( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in self._columns + ] + ) + ) + ) + return self._constructor_sliced._from_column(result_col) @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e59b948aba9..b274bdea76d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -14,17 +14,19 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby from cudf._lib.null_mask import bitmask_or -from cudf._lib.reshape import interleave_columns from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable +from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, StructDtype, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import GatherMap @@ -2201,6 +2203,17 @@ def _cov_or_corr(self, func, method_name): # interleave: combines the correlation or covariance results for each # column-pair into a single column + + @acquire_spill_lock() + def interleave_columns(source_columns): + return libcudf.column.Column.from_pylibcudf( + plc.reshape.interleave_columns( + plc.Table( + [c.to_pylibcudf(mode="read") for c in source_columns] + ) + ) + ) + res = cudf.DataFrame._from_data( { x: interleave_columns([gb_cov_corr._data[y] for y in ys]) diff --git a/python/cudf/cudf/core/indexed_frame.py 
b/python/cudf/cudf/core/indexed_frame.py index 4e839aaeb6a..2f8c2587937 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5349,7 +5349,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): ) @_performance_tracking - def tile(self, count): + def tile(self, count: int): """Repeats the rows `count` times to form a new Frame. Parameters @@ -5373,10 +5373,24 @@ def tile(self, count): ------- The indexed frame containing the tiled "rows". """ + with acquire_spill_lock(): + plc_table = plc.reshape.tile( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + self.index._columns, self._columns + ) + ] + ), + count, + ) + tiled = [ + libcudf.column.Column.from_pylibcudf(plc) + for plc in plc_table.columns() + ] return self._from_columns_like_self( - libcudf.reshape.tile( - [*self.index._columns, *self._columns], count - ), + tiled, column_names=self._column_names, index_names=self._index_names, ) From d10eae79ec06ec002a34420a34380e077c4540f7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:01:00 -0500 Subject: [PATCH 19/23] Update strings/text source to use grid_1d for thread/block/stride calculations (#17404) Replaces `threadIdx.x + blockDim.x * blockIdx.x` logic with `grid_1d::global_thread_id()` and `blockDim.x * gridDim.x` with `grid_1d::grid_stride()` in libcudf strings and text source. 
Reference #10368 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17404 --- cpp/include/cudf/strings/detail/gather.cuh | 10 +++++----- cpp/src/strings/convert/convert_urls.cu | 6 ++++-- cpp/src/strings/copying/concatenate.cu | 10 +++++----- cpp/src/strings/regex/utilities.cuh | 4 ++-- cpp/src/strings/search/find.cu | 16 +++++++--------- cpp/src/text/minhash.cu | 9 +++------ cpp/src/text/subword/data_normalizer.cu | 5 ++--- cpp/src/text/subword/subword_tokenize.cu | 11 +++-------- cpp/src/text/subword/wordpiece_tokenizer.cu | 14 ++++---------- cpp/src/text/vocabulary_tokenize.cu | 9 +++------ 10 files changed, 38 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 4216523df97..58665fbf27e 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -85,15 +85,15 @@ CUDF_KERNEL void gather_chars_fn_string_parallel(StringIterator strings_begin, constexpr size_t out_datatype_size = sizeof(uint4); constexpr size_t in_datatype_size = sizeof(uint); - int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int global_warp_id = global_thread_id / cudf::detail::warp_size; - int warp_lane = global_thread_id % cudf::detail::warp_size; - int nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; + auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); + auto const global_warp_id = global_thread_id / cudf::detail::warp_size; + auto const warp_lane = global_thread_id % cudf::detail::warp_size; + auto const nwarps = cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size; auto const alignment_offset = reinterpret_cast(out_chars) % out_datatype_size; uint4* out_chars_aligned = reinterpret_cast(out_chars - 
alignment_offset); - for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { + for (auto istring = global_warp_id; istring < total_out_strings; istring += nwarps) { auto const out_start = out_offsets[istring]; auto const out_end = out_offsets[istring + 1]; diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 520f5897415..9d0186b7a51 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -207,7 +207,8 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings, auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); - auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); + auto const nwarps = + static_cast(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size); char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp. 
@@ -293,7 +294,8 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings, auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); - auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); + auto const nwarps = + static_cast(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size); char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 9e4ef47ff79..3712b0e7fc6 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -123,8 +123,8 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel( bitmask_type* output_mask, size_type* out_valid_count) { - cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; - size_type warp_valid_count = 0; + auto output_index = cudf::detail::grid_1d::global_thread_id(); + size_type warp_valid_count = 0; unsigned active_mask; if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); } @@ -156,7 +156,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel( warp_valid_count += __popc(new_word); } - output_index += blockDim.x * gridDim.x; + output_index += cudf::detail::grid_1d::grid_stride(); if (Nullable) { active_mask = __ballot_sync(active_mask, output_index < output_size); } } @@ -178,7 +178,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const* size_type const output_size, char* output_data) { - cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; + auto output_index = cudf::detail::grid_1d::global_thread_id(); while (output_index < output_size) { // Lookup input index by searching for 
output index in offsets @@ -198,7 +198,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const* auto const first_char = input_offsets_data[input_view.offset()]; output_data[output_index] = input_chars_data[offset_index + first_char]; - output_index += blockDim.x * gridDim.x; + output_index += cudf::detail::grid_1d::grid_stride(); } } diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh index 679907788bb..2594fd7b6da 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -47,7 +47,7 @@ CUDF_KERNEL void for_each_kernel(ForEachFunction fn, reprog_device const d_prog, __syncthreads(); auto const s_prog = reprog_device::load(d_prog, shmem); - auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + auto const thread_idx = cudf::detail::grid_1d::global_thread_id(); auto const stride = s_prog.thread_count(); if (thread_idx < stride) { for (auto idx = thread_idx; idx < size; idx += stride) { @@ -84,7 +84,7 @@ CUDF_KERNEL void transform_kernel(TransformFunction fn, __syncthreads(); auto const s_prog = reprog_device::load(d_prog, shmem); - auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + auto const thread_idx = cudf::detail::grid_1d::global_thread_id(); auto const stride = s_prog.thread_count(); if (thread_idx < stride) { for (auto idx = thread_idx; idx < size; idx += stride) { diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 9bd1abb5542..3cf4970d36e 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -121,11 +121,10 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings, size_type const stop, size_type* d_results) { - size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + auto const idx = cudf::detail::grid_1d::global_thread_id(); - if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } - - auto const str_idx = idx / 
cudf::detail::warp_size; + auto const str_idx = idx / cudf::detail::warp_size; + if (str_idx >= d_strings.size()) { return; } auto const lane_idx = idx % cudf::detail::warp_size; if (d_strings.is_null(str_idx)) { return; } @@ -350,13 +349,12 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, string_view const d_target, bool* d_results) { - size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - using warp_reduce = cub::WarpReduce; + auto const idx = cudf::detail::grid_1d::global_thread_id(); + using warp_reduce = cub::WarpReduce; __shared__ typename warp_reduce::TempStorage temp_storage; - if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } - - auto const str_idx = idx / cudf::detail::warp_size; + auto const str_idx = idx / cudf::detail::warp_size; + if (str_idx >= d_strings.size()) { return; } auto const lane_idx = idx % cudf::detail::warp_size; if (d_strings.is_null(str_idx)) { return; } // get the string for this warp diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index aee83ab35ed..b7a719a2041 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -74,13 +74,10 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, cudf::size_type width, hash_value_type* d_hashes) { - auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (idx >= (static_cast(d_strings.size()) * - static_cast(cudf::detail::warp_size))) { - return; - } + auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + if (str_idx >= d_strings.size()) { return; } auto const lane_idx = static_cast(idx % cudf::detail::warp_size); if (d_strings.is_null(str_idx)) { return; } diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index c662581b3f4..a3bed45e4bd 100644 --- 
a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -217,9 +217,8 @@ CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings, constexpr uint32_t init_val = (1 << FILTER_BIT); uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val}; - cudf::thread_index_type const char_for_thread = - threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x); - uint32_t num_new_chars = 0; + auto const char_for_thread = cudf::detail::grid_1d::global_thread_id(); + uint32_t num_new_chars = 0; if (char_for_thread < total_bytes) { auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread); diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index dee589d6daf..6302b478c14 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -73,15 +73,10 @@ CUDF_KERNEL void kernel_compute_tensor_metadata( uint32_t* attn_mask, uint32_t* metadata) { - cudf::thread_index_type const output_idx = - threadIdx.x + static_cast(blockIdx.x) * - static_cast(blockDim.x); - if (output_idx >= (static_cast(nrows_tensor_token_ids) * - static_cast(max_sequence_length))) { - return; - } + auto const output_idx = cudf::detail::grid_1d::global_thread_id(); - uint32_t const absolute_row_id = output_idx / max_sequence_length; + uint32_t const absolute_row_id = output_idx / max_sequence_length; + if (absolute_row_id >= nrows_tensor_token_ids) { return; } uint32_t const tensor_id = row2tensor[absolute_row_id]; uint32_t const row_within_tensor = row2row_within_tensor[absolute_row_id]; uint32_t const offset_token_ids_tensor = offsets[tensor_id]; diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index c094537ebc2..dd1e8ddb027 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -83,9 +83,7 
@@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi uint32_t* token_ids, uint8_t* tokens_per_word) { - cudf::thread_index_type char_for_thread = static_cast(blockDim.x) * - static_cast(blockIdx.x) + - threadIdx.x; + auto const char_for_thread = cudf::detail::grid_1d::global_thread_id(); // Deal with the start_word_indices array if (char_for_thread < num_code_points) { @@ -138,9 +136,7 @@ CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points, uint32_t* end_word_indices, uint32_t num_strings) { - cudf::thread_index_type idx = static_cast(blockDim.x) * - static_cast(blockIdx.x) + - threadIdx.x; + auto const idx = cudf::detail::grid_1d::global_thread_id(); // Ensure the starting character of each strings is written to the word start array. if (idx <= num_strings) { auto const offset = strings_offsets[idx]; @@ -335,11 +331,9 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points, uint32_t* token_ids, uint8_t* tokens_per_word) { - cudf::thread_index_type word_to_tokenize = static_cast(blockDim.x) * - static_cast(blockIdx.x) + - threadIdx.x; + auto const word_to_tokenize = cudf::detail::grid_1d::global_thread_id(); - if (word_to_tokenize >= total_words) return; + if (word_to_tokenize >= total_words) { return; } // Each thread gets the start code_point offset for each word and resets the token_id memory to // the default value. In a post processing step, all of these values will be removed. 
auto const token_start = word_starts[word_to_tokenize]; diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index a2297987732..caf2b1d8b30 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -222,12 +222,9 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings, int8_t* d_results) { // string per warp - auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (idx >= (static_cast(d_strings.size()) * - static_cast(cudf::detail::warp_size))) { - return; - } - auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + if (str_idx >= d_strings.size()) { return; } auto const lane_idx = static_cast(idx % cudf::detail::warp_size); if (d_strings.is_null(str_idx)) { From e7022fbc22eda538783e67f32d35ea8ea0798be8 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:01:53 -0500 Subject: [PATCH 20/23] Use thread_index_type in binary-ops jit kernel.cu (#17420) Follow on to #17354 to prevent overflow in jit kernel binary-ops. This uses the `thread_index_type` directly since the `detail/utilities/cuda.cuh` cannot be included in the jit'd kernel source. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17420 --- cpp/src/binaryop/jit/kernel.cu | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu index 985fc87521c..1133e9ac22e 100644 --- a/cpp/src/binaryop/jit/kernel.cu +++ b/cpp/src/binaryop/jit/kernel.cu @@ -51,15 +51,10 @@ CUDF_KERNEL void kernel_v_v(cudf::size_type size, TypeLhs* lhs_data, TypeRhs* rhs_data) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; + auto const start = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; + auto const step = static_cast(blockDim.x) * gridDim.x; - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i = start; i < size; i += step) { + for (auto i = start; i < size; i += step) { out_data[i] = TypeOpe::template operate(lhs_data[i], rhs_data[i]); } } @@ -75,15 +70,10 @@ CUDF_KERNEL void kernel_v_v_with_validity(cudf::size_type size, cudf::bitmask_type const* rhs_mask, cudf::size_type rhs_offset) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; + auto const start = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; + auto const step = static_cast(blockDim.x) * gridDim.x; - for (cudf::size_type i = start; i < size; i += step) { + for (auto i = start; i < size; i += step) { bool output_valid = false; out_data[i] = TypeOpe::template operate( lhs_data[i], From f5954a44ce86af82b7750f64e511d063e35e9625 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Nov 2024 12:04:36 -0500 Subject: [PATCH 21/23] Remove nvtx/ranges.hpp include from cuda.cuh (#17427) Removes unused 
header include `nvtx/ranges.hpp` from `cuda.cuh` and fixes up all the source files that were dependent on this include. Found while trying to include `cuda.cuh` in a jit'd kernel source. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17427 --- cpp/include/cudf/detail/utilities/cuda.cuh | 6 ++---- cpp/src/copying/contiguous_split.cu | 1 + cpp/src/join/conditional_join.cu | 1 + cpp/src/join/hash_join.cu | 1 + cpp/src/join/join.cu | 1 + cpp/src/join/mixed_join.cu | 1 + cpp/src/join/mixed_join_semi.cu | 1 + cpp/src/json/json_path.cu | 1 + cpp/src/labeling/label_bins.cu | 1 + cpp/src/lists/contains.cu | 1 + cpp/src/lists/copying/segmented_gather.cu | 1 + cpp/src/quantiles/tdigest/tdigest.cu | 1 + cpp/src/reductions/minmax.cu | 1 + cpp/src/rolling/detail/rolling_fixed_window.cu | 1 + cpp/src/rolling/detail/rolling_variable_window.cu | 1 + cpp/src/rolling/grouped_rolling.cu | 1 + cpp/src/strings/copying/concatenate.cu | 1 + 17 files changed, 18 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index d31ca3d92d1..61a8e9f7ec3 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include @@ -25,8 +24,7 @@ #include #include - -#include +#include namespace cudf { namespace detail { @@ -164,7 +162,7 @@ template __device__ T single_lane_block_sum_reduce(T lane_value) { static_assert(block_size <= 1024, "Invalid block size."); - static_assert(std::is_arithmetic_v, "Invalid non-arithmetic type."); + static_assert(cuda::std::is_arithmetic_v, "Invalid non-arithmetic type."); constexpr auto warps_per_block{block_size / warp_size}; auto const lane_id{threadIdx.x % warp_size}; auto const warp_id{threadIdx.x / warp_size}; diff --git 
a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 15aa31ff5ee..e9443980320 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 781fda215fd..81287e9a3fd 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index beeaabfdaab..05b85fed1a8 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 7b13c260364..bbafb159caf 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -16,6 +16,7 @@ #include "join_common_utils.hpp" #include +#include #include #include #include diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 90b0d0a45ad..56044bb1264 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 62ba558b0bd..6c37f801693 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index fb5cf66dd60..fd8629ed6f3 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu index 18a500069ad..40a48d919cd 100644 --- a/cpp/src/labeling/label_bins.cu +++ 
b/cpp/src/labeling/label_bins.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 9556ef23784..03fbd8e5e89 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index f6e48f141e1..9d11035cfdc 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index fb5aebb4b39..3a365477366 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 4f6eb23ce5b..98fd9f679c8 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu index 23424da13cd..0603f27852a 100644 --- a/cpp/src/rolling/detail/rolling_fixed_window.cu +++ b/cpp/src/rolling/detail/rolling_fixed_window.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include diff --git a/cpp/src/rolling/detail/rolling_variable_window.cu b/cpp/src/rolling/detail/rolling_variable_window.cu index c2324947ef6..d4851df740b 100644 --- a/cpp/src/rolling/detail/rolling_variable_window.cu +++ b/cpp/src/rolling/detail/rolling_variable_window.cu @@ -17,6 +17,7 @@ #include "rolling.cuh" #include +#include #include #include diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 
ac6c7b11ef5..3cf292f5abb 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -21,6 +21,7 @@ #include "detail/rolling_jit.hpp" #include +#include #include #include #include diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 3712b0e7fc6..ba96e2cb988 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include From b89728b7ccdb7f39b70087eccc2c8c36765742bd Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 26 Nov 2024 18:10:45 +0100 Subject: [PATCH 22/23] Abstract polars function expression nodes to ensure they are serializable (#17418) Use `Enum`s to define Python types as references to `polars.polars._expr_nodes.*Function` as to ensure `cudf_polars.dsl.expressions` specializations of `Expr` are serializable. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17418 --- .../cudf_polars/dsl/expressions/boolean.py | 77 +++++++--- .../cudf_polars/dsl/expressions/datetime.py | 98 ++++++++++--- .../cudf_polars/dsl/expressions/string.py | 135 +++++++++++++----- python/cudf_polars/cudf_polars/dsl/to_ast.py | 10 +- .../cudf_polars/cudf_polars/dsl/translate.py | 14 +- .../tests/dsl/test_serialization.py | 56 ++++++++ 6 files changed, 304 insertions(+), 86 deletions(-) create mode 100644 python/cudf_polars/tests/dsl/test_serialization.py diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 8db8172ebd1..1682e7a8a9c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -6,13 +6,12 @@ from __future__ import annotations +from enum import IntEnum, auto from functools import partial, reduce 
from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa -from polars.polars import _expr_nodes as pl_expr - import pylibcudf as plc from cudf_polars.containers import Column @@ -24,7 +23,10 @@ if TYPE_CHECKING: from collections.abc import Mapping + from typing_extensions import Self + import polars.type_aliases as pl_types + from polars.polars import _expr_nodes as pl_expr from cudf_polars.containers import DataFrame @@ -32,13 +34,46 @@ class BooleanFunction(Expr): + class Name(IntEnum): + """Internal and picklable representation of polars' `BooleanFunction`.""" + + All = auto() + AllHorizontal = auto() + Any = auto() + AnyHorizontal = auto() + IsBetween = auto() + IsDuplicated = auto() + IsFinite = auto() + IsFirstDistinct = auto() + IsIn = auto() + IsInfinite = auto() + IsLastDistinct = auto() + IsNan = auto() + IsNotNan = auto() + IsNotNull = auto() + IsNull = auto() + IsUnique = auto() + Not = auto() + + @classmethod + def from_polars(cls, obj: pl_expr.BooleanFunction) -> Self: + """Convert from polars' `BooleanFunction`.""" + try: + function, name = str(obj).split(".", maxsplit=1) + except ValueError: + # Failed to unpack string + function = None + if function != "BooleanFunction": + raise ValueError("BooleanFunction required") + return getattr(cls, name) + __slots__ = ("name", "options") _non_child = ("dtype", "name", "options") def __init__( self, dtype: plc.DataType, - name: pl_expr.BooleanFunction, + name: BooleanFunction.Name, options: tuple[Any, ...], *children: Expr, ) -> None: @@ -46,7 +81,7 @@ def __init__( self.options = options self.name = name self.children = children - if self.name == pl_expr.BooleanFunction.IsIn and not all( + if self.name is BooleanFunction.Name.IsIn and not all( c.dtype == self.children[0].dtype for c in self.children ): # TODO: If polars IR doesn't put the casts in, we need to @@ -110,12 +145,12 @@ def do_evaluate( ) -> Column: """Evaluate this expression given a dataframe for context.""" if self.name in ( - 
pl_expr.BooleanFunction.IsFinite, - pl_expr.BooleanFunction.IsInfinite, + BooleanFunction.Name.IsFinite, + BooleanFunction.Name.IsInfinite, ): # Avoid evaluating the child if the dtype tells us it's unnecessary. (child,) = self.children - is_finite = self.name == pl_expr.BooleanFunction.IsFinite + is_finite = self.name is BooleanFunction.Name.IsFinite if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): value = plc.interop.from_arrow( pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) @@ -142,10 +177,10 @@ def do_evaluate( ] # Kleene logic for Any (OR) and All (AND) if ignore_nulls is # False - if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + if self.name in (BooleanFunction.Name.Any, BooleanFunction.Name.All): (ignore_nulls,) = self.options (column,) = columns - is_any = self.name == pl_expr.BooleanFunction.Any + is_any = self.name is BooleanFunction.Name.Any agg = plc.aggregation.any() if is_any else plc.aggregation.all() result = plc.reduce.reduce(column.obj, agg, self.dtype) if not ignore_nulls and column.obj.null_count() > 0: @@ -165,27 +200,27 @@ def do_evaluate( # False || Null => Null True && Null => Null return Column(plc.Column.all_null_like(column.obj, 1)) return Column(plc.Column.from_scalar(result, 1)) - if self.name == pl_expr.BooleanFunction.IsNull: + if self.name is BooleanFunction.Name.IsNull: (column,) = columns return Column(plc.unary.is_null(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNotNull: + elif self.name is BooleanFunction.Name.IsNotNull: (column,) = columns return Column(plc.unary.is_valid(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNan: + elif self.name is BooleanFunction.Name.IsNan: (column,) = columns return Column( plc.unary.is_nan(column.obj).with_mask( column.obj.null_mask(), column.obj.null_count() ) ) - elif self.name == pl_expr.BooleanFunction.IsNotNan: + elif self.name is BooleanFunction.Name.IsNotNan: (column,) = columns return Column( 
plc.unary.is_not_nan(column.obj).with_mask( column.obj.null_mask(), column.obj.null_count() ) ) - elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + elif self.name is BooleanFunction.Name.IsFirstDistinct: (column,) = columns return self._distinct( column, @@ -197,7 +232,7 @@ def do_evaluate( pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) ), ) - elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + elif self.name is BooleanFunction.Name.IsLastDistinct: (column,) = columns return self._distinct( column, @@ -209,7 +244,7 @@ def do_evaluate( pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) ), ) - elif self.name == pl_expr.BooleanFunction.IsUnique: + elif self.name is BooleanFunction.Name.IsUnique: (column,) = columns return self._distinct( column, @@ -221,7 +256,7 @@ def do_evaluate( pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) ), ) - elif self.name == pl_expr.BooleanFunction.IsDuplicated: + elif self.name is BooleanFunction.Name.IsDuplicated: (column,) = columns return self._distinct( column, @@ -233,7 +268,7 @@ def do_evaluate( pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) ), ) - elif self.name == pl_expr.BooleanFunction.AllHorizontal: + elif self.name is BooleanFunction.Name.AllHorizontal: return Column( reduce( partial( @@ -244,7 +279,7 @@ def do_evaluate( (c.obj for c in columns), ) ) - elif self.name == pl_expr.BooleanFunction.AnyHorizontal: + elif self.name is BooleanFunction.Name.AnyHorizontal: return Column( reduce( partial( @@ -255,10 +290,10 @@ def do_evaluate( (c.obj for c in columns), ) ) - elif self.name == pl_expr.BooleanFunction.IsIn: + elif self.name is BooleanFunction.Name.IsIn: needles, haystack = columns return Column(plc.search.contains(haystack.obj, needles.obj)) - elif self.name == pl_expr.BooleanFunction.Not: + elif self.name is BooleanFunction.Name.Not: (column,) = columns return Column( plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) diff --git 
a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index cd8e5c6a4eb..c2dddfd9940 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -6,12 +6,11 @@ from __future__ import annotations +from enum import IntEnum, auto from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa -from polars.polars import _expr_nodes as pl_expr - import pylibcudf as plc from cudf_polars.containers import Column @@ -20,33 +19,94 @@ if TYPE_CHECKING: from collections.abc import Mapping + from typing_extensions import Self + + from polars.polars import _expr_nodes as pl_expr + from cudf_polars.containers import DataFrame __all__ = ["TemporalFunction"] class TemporalFunction(Expr): + class Name(IntEnum): + """Internal and picklable representation of polars' `TemporalFunction`.""" + + BaseUtcOffset = auto() + CastTimeUnit = auto() + Century = auto() + Combine = auto() + ConvertTimeZone = auto() + DSTOffset = auto() + Date = auto() + Datetime = auto() + DatetimeFunction = auto() + Day = auto() + Duration = auto() + Hour = auto() + IsLeapYear = auto() + IsoYear = auto() + Microsecond = auto() + Millennium = auto() + Millisecond = auto() + Minute = auto() + Month = auto() + MonthEnd = auto() + MonthStart = auto() + Nanosecond = auto() + OffsetBy = auto() + OrdinalDay = auto() + Quarter = auto() + ReplaceTimeZone = auto() + Round = auto() + Second = auto() + Time = auto() + TimeStamp = auto() + ToString = auto() + TotalDays = auto() + TotalHours = auto() + TotalMicroseconds = auto() + TotalMilliseconds = auto() + TotalMinutes = auto() + TotalNanoseconds = auto() + TotalSeconds = auto() + Truncate = auto() + Week = auto() + WeekDay = auto() + WithTimeUnit = auto() + Year = auto() + + @classmethod + def from_polars(cls, obj: pl_expr.TemporalFunction) -> Self: + """Convert from polars' `TemporalFunction`.""" + try: + function, name 
= str(obj).split(".", maxsplit=1) + except ValueError: + # Failed to unpack string + function = None + if function != "TemporalFunction": + raise ValueError("TemporalFunction required") + return getattr(cls, name) + __slots__ = ("name", "options") - _COMPONENT_MAP: ClassVar[ - dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent] - ] = { - pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, - pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, - pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, - pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, - pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, - pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, - pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, - pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, - pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, - pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, - } _non_child = ("dtype", "name", "options") + _COMPONENT_MAP: ClassVar[dict[Name, plc.datetime.DatetimeComponent]] = { + Name.Year: plc.datetime.DatetimeComponent.YEAR, + Name.Month: plc.datetime.DatetimeComponent.MONTH, + Name.Day: plc.datetime.DatetimeComponent.DAY, + Name.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, + Name.Hour: plc.datetime.DatetimeComponent.HOUR, + Name.Minute: plc.datetime.DatetimeComponent.MINUTE, + Name.Second: plc.datetime.DatetimeComponent.SECOND, + Name.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, + Name.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, + Name.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, + } def __init__( self, dtype: plc.DataType, - name: pl_expr.TemporalFunction, + name: TemporalFunction.Name, options: tuple[Any, ...], *children: Expr, ) -> None: @@ -70,7 +130,7 @@ def do_evaluate( for child in 
self.children ] (column,) = columns - if self.name == pl_expr.TemporalFunction.Microsecond: + if self.name is TemporalFunction.Name.Microsecond: millis = plc.datetime.extract_datetime_component( column.obj, plc.datetime.DatetimeComponent.MILLISECOND ) @@ -90,7 +150,7 @@ def do_evaluate( plc.types.DataType(plc.types.TypeId.INT32), ) return Column(total_micros) - elif self.name == pl_expr.TemporalFunction.Nanosecond: + elif self.name is TemporalFunction.Name.Nanosecond: millis = plc.datetime.extract_datetime_component( column.obj, plc.datetime.DatetimeComponent.MILLISECOND ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 8b66c9d4676..92c3c658c21 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -6,13 +6,13 @@ from __future__ import annotations +from enum import IntEnum, auto from typing import TYPE_CHECKING, Any import pyarrow as pa import pyarrow.compute as pc from polars.exceptions import InvalidOperationError -from polars.polars import _expr_nodes as pl_expr import pylibcudf as plc @@ -23,19 +23,82 @@ if TYPE_CHECKING: from collections.abc import Mapping + from typing_extensions import Self + + from polars.polars import _expr_nodes as pl_expr + from cudf_polars.containers import DataFrame __all__ = ["StringFunction"] class StringFunction(Expr): + class Name(IntEnum): + """Internal and picklable representation of polars' `StringFunction`.""" + + Base64Decode = auto() + Base64Encode = auto() + ConcatHorizontal = auto() + ConcatVertical = auto() + Contains = auto() + ContainsMany = auto() + CountMatches = auto() + EndsWith = auto() + EscapeRegex = auto() + Extract = auto() + ExtractAll = auto() + ExtractGroups = auto() + Find = auto() + Head = auto() + HexDecode = auto() + HexEncode = auto() + JsonDecode = auto() + JsonPathMatch = auto() + LenBytes = auto() + LenChars = auto() + Lowercase = 
auto() + PadEnd = auto() + PadStart = auto() + Replace = auto() + ReplaceMany = auto() + Reverse = auto() + Slice = auto() + Split = auto() + SplitExact = auto() + SplitN = auto() + StartsWith = auto() + StripChars = auto() + StripCharsEnd = auto() + StripCharsStart = auto() + StripPrefix = auto() + StripSuffix = auto() + Strptime = auto() + Tail = auto() + Titlecase = auto() + ToDecimal = auto() + ToInteger = auto() + Uppercase = auto() + ZFill = auto() + + @classmethod + def from_polars(cls, obj: pl_expr.StringFunction) -> Self: + """Convert from polars' `StringFunction`.""" + try: + function, name = str(obj).split(".", maxsplit=1) + except ValueError: + # Failed to unpack string + function = None + if function != "StringFunction": + raise ValueError("StringFunction required") + return getattr(cls, name) + __slots__ = ("name", "options", "_regex_program") _non_child = ("dtype", "name", "options") def __init__( self, dtype: plc.DataType, - name: pl_expr.StringFunction, + name: StringFunction.Name, options: tuple[Any, ...], *children: Expr, ) -> None: @@ -47,21 +110,21 @@ def __init__( def _validate_input(self): if self.name not in ( - pl_expr.StringFunction.Contains, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Replace, - pl_expr.StringFunction.ReplaceMany, - pl_expr.StringFunction.Slice, - pl_expr.StringFunction.Strptime, - pl_expr.StringFunction.StartsWith, - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - pl_expr.StringFunction.Uppercase, + StringFunction.Name.Contains, + StringFunction.Name.EndsWith, + StringFunction.Name.Lowercase, + StringFunction.Name.Replace, + StringFunction.Name.ReplaceMany, + StringFunction.Name.Slice, + StringFunction.Name.Strptime, + StringFunction.Name.StartsWith, + StringFunction.Name.StripChars, + StringFunction.Name.StripCharsStart, + StringFunction.Name.StripCharsEnd, + StringFunction.Name.Uppercase, ): raise 
NotImplementedError(f"String function {self.name}") - if self.name == pl_expr.StringFunction.Contains: + if self.name is StringFunction.Name.Contains: literal, strict = self.options if not literal: if not strict: @@ -82,7 +145,7 @@ def _validate_input(self): raise NotImplementedError( f"Unsupported regex {pattern} for GPU engine." ) from e - elif self.name == pl_expr.StringFunction.Replace: + elif self.name is StringFunction.Name.Replace: _, literal = self.options if not literal: raise NotImplementedError("literal=False is not supported for replace") @@ -93,7 +156,7 @@ def _validate_input(self): raise NotImplementedError( "libcudf replace does not support empty strings" ) - elif self.name == pl_expr.StringFunction.ReplaceMany: + elif self.name is StringFunction.Name.ReplaceMany: (ascii_case_insensitive,) = self.options if ascii_case_insensitive: raise NotImplementedError( @@ -109,12 +172,12 @@ def _validate_input(self): "libcudf replace_many is implemented differently from polars " "for empty strings" ) - elif self.name == pl_expr.StringFunction.Slice: + elif self.name is StringFunction.Name.Slice: if not all(isinstance(child, Literal) for child in self.children[1:]): raise NotImplementedError( "Slice only supports literal start and stop values" ) - elif self.name == pl_expr.StringFunction.Strptime: + elif self.name is StringFunction.Name.Strptime: format, _, exact, cache = self.options if cache: raise NotImplementedError("Strptime cache is a CPU feature") @@ -123,9 +186,9 @@ def _validate_input(self): if not exact: raise NotImplementedError("Strptime does not support exact=False") elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, + StringFunction.Name.StripChars, + StringFunction.Name.StripCharsStart, + StringFunction.Name.StripCharsEnd, }: if not isinstance(self.children[1], Literal): raise NotImplementedError( @@ -140,7 +203,7 @@ def do_evaluate( mapping: Mapping[Expr, 
Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - if self.name == pl_expr.StringFunction.Contains: + if self.name is StringFunction.Name.Contains: child, arg = self.children column = child.evaluate(df, context=context, mapping=mapping) @@ -157,7 +220,7 @@ def do_evaluate( return Column( plc.strings.contains.contains_re(column.obj, self._regex_program) ) - elif self.name == pl_expr.StringFunction.Slice: + elif self.name is StringFunction.Name.Slice: child, expr_offset, expr_length = self.children assert isinstance(expr_offset, Literal) assert isinstance(expr_length, Literal) @@ -188,16 +251,16 @@ def do_evaluate( ) ) elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, + StringFunction.Name.StripChars, + StringFunction.Name.StripCharsStart, + StringFunction.Name.StripCharsEnd, }: column, chars = ( c.evaluate(df, context=context, mapping=mapping) for c in self.children ) - if self.name == pl_expr.StringFunction.StripCharsStart: + if self.name is StringFunction.Name.StripCharsStart: side = plc.strings.SideType.LEFT - elif self.name == pl_expr.StringFunction.StripCharsEnd: + elif self.name is StringFunction.Name.StripCharsEnd: side = plc.strings.SideType.RIGHT else: side = plc.strings.SideType.BOTH @@ -207,13 +270,13 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.StringFunction.Lowercase: + if self.name is StringFunction.Name.Lowercase: (column,) = columns return Column(plc.strings.case.to_lower(column.obj)) - elif self.name == pl_expr.StringFunction.Uppercase: + elif self.name is StringFunction.Name.Uppercase: (column,) = columns return Column(plc.strings.case.to_upper(column.obj)) - elif self.name == pl_expr.StringFunction.EndsWith: + elif self.name is StringFunction.Name.EndsWith: column, suffix = columns return Column( plc.strings.find.ends_with( @@ 
-223,7 +286,7 @@ def do_evaluate( else suffix.obj, ) ) - elif self.name == pl_expr.StringFunction.StartsWith: + elif self.name is StringFunction.Name.StartsWith: column, prefix = columns return Column( plc.strings.find.starts_with( @@ -233,7 +296,7 @@ def do_evaluate( else prefix.obj, ) ) - elif self.name == pl_expr.StringFunction.Strptime: + elif self.name is StringFunction.Name.Strptime: # TODO: ignores ambiguous format, strict, exact, cache = self.options col = self.children[0].evaluate(df, context=context, mapping=mapping) @@ -265,7 +328,7 @@ def do_evaluate( res.columns()[0], self.dtype, format ) ) - elif self.name == pl_expr.StringFunction.Replace: + elif self.name is StringFunction.Name.Replace: column, target, repl = columns n, _ = self.options return Column( @@ -273,7 +336,7 @@ def do_evaluate( column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n ) ) - elif self.name == pl_expr.StringFunction.ReplaceMany: + elif self.name is StringFunction.Name.ReplaceMany: column, target, repl = columns return Column( plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py index acc4b3669af..c3febc833e2 100644 --- a/python/cudf_polars/cudf_polars/dsl/to_ast.py +++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py @@ -8,8 +8,6 @@ from functools import partial, reduce, singledispatch from typing import TYPE_CHECKING, TypeAlias -from polars.polars import _expr_nodes as pl_expr - import pylibcudf as plc from pylibcudf import expressions as plc_expr @@ -185,7 +183,7 @@ def _(node: expr.BinOp, self: Transformer) -> plc_expr.Expression: @_to_ast.register def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression: - if node.name == pl_expr.BooleanFunction.IsIn: + if node.name is expr.BooleanFunction.Name.IsIn: needles, haystack = node.children if isinstance(haystack, expr.LiteralColumn) and len(haystack.value) < 16: # 16 is an arbitrary limit 
@@ -204,14 +202,14 @@ def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression: raise NotImplementedError( f"Parquet filters don't support {node.name} on columns" ) - if node.name == pl_expr.BooleanFunction.IsNull: + if node.name is expr.BooleanFunction.Name.IsNull: return plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])) - elif node.name == pl_expr.BooleanFunction.IsNotNull: + elif node.name is expr.BooleanFunction.Name.IsNotNull: return plc_expr.Operation( plc_expr.ASTOperator.NOT, plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])), ) - elif node.name == pl_expr.BooleanFunction.Not: + elif node.name is expr.BooleanFunction.Name.Not: return plc_expr.Operation(plc_expr.ASTOperator.NOT, self(node.children[0])) raise NotImplementedError(f"AST conversion does not support {node.name}") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 9480ce6e535..b1e2de63ba6 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -531,10 +531,16 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex column.dtype, pa.scalar("", type=plc.interop.to_arrow(column.dtype)), ) - return expr.StringFunction(dtype, name, options, column, chars) + return expr.StringFunction( + dtype, + expr.StringFunction.Name.from_polars(name), + options, + column, + chars, + ) return expr.StringFunction( dtype, - name, + expr.StringFunction.Name.from_polars(name), options, *(translator.translate_expr(n=n) for n in node.input), ) @@ -551,7 +557,7 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex ) return expr.BooleanFunction( dtype, - name, + expr.BooleanFunction.Name.from_polars(name), options, *(translator.translate_expr(n=n) for n in node.input), ) @@ -571,7 +577,7 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex } 
result_expr = expr.TemporalFunction( dtype, - name, + expr.TemporalFunction.Name.from_polars(name), options, *(translator.translate_expr(n=n) for n in node.input), ) diff --git a/python/cudf_polars/tests/dsl/test_serialization.py b/python/cudf_polars/tests/dsl/test_serialization.py new file mode 100644 index 00000000000..7de8f959843 --- /dev/null +++ b/python/cudf_polars/tests/dsl/test_serialization.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pickle + +import pytest + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.dsl.expressions.boolean import BooleanFunction +from cudf_polars.dsl.expressions.datetime import TemporalFunction +from cudf_polars.dsl.expressions.string import StringFunction + + +@pytest.fixture(params=[BooleanFunction, StringFunction, TemporalFunction]) +def function(request): + return request.param + + +def test_function_name_serialization_all_values(function): + # Test serialization and deserialization for all values of function.Name + for name in function.Name: + serialized_name = pickle.dumps(name) + deserialized_name = pickle.loads(serialized_name) + assert deserialized_name is name + + +def test_function_name_invalid(function): + # Test invalid attribute name + with pytest.raises(AttributeError, match="InvalidAttribute"): + assert function.Name.InvalidAttribute is function.Name.InvalidAttribute + + +def test_from_polars_all_names(function): + # Test that all valid names of polars expressions are correctly converted + polars_function = getattr(pl_expr, function.__name__) + polars_names = [name for name in dir(polars_function) if not name.startswith("_")] + # Check names advertised by polars are the same as we advertise + assert set(polars_names) == set(function.Name.__members__) + for name in function.Name: + attr = getattr(polars_function, name.name) + assert 
function.Name.from_polars(attr) == name + + +def test_from_polars_invalid_attribute(function): + # Test converting from invalid attribute name + with pytest.raises(ValueError, match=f"{function.__name__} required"): + function.Name.from_polars("InvalidAttribute") + + +def test_from_polars_invalid_polars_attribute(function): + # Test converting from polars function with invalid attribute name + with pytest.raises(AttributeError, match="InvalidAttribute"): + function.Name.from_polars(f"{function.__name__}.InvalidAttribute") From 165d756f7f7cb558d1cab62a81a1c91368648d12 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:38:22 -0500 Subject: [PATCH 23/23] Migrate ORC Writer to pylibcudf (#17310) Apart of #15162. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17310 --- python/cudf/cudf/_lib/orc.pyx | 167 ++++--- python/pylibcudf/pylibcudf/io/orc.pxd | 65 ++- python/pylibcudf/pylibcudf/io/orc.pyi | 51 ++- python/pylibcudf/pylibcudf/io/orc.pyx | 413 +++++++++++++++++- python/pylibcudf/pylibcudf/io/types.pxd | 17 +- python/pylibcudf/pylibcudf/io/types.pyi | 22 +- python/pylibcudf/pylibcudf/io/types.pyx | 61 ++- .../pylibcudf/pylibcudf/tests/io/test_orc.py | 62 +++ .../pylibcudf/tests/io/test_types.py | 28 ++ 9 files changed, 762 insertions(+), 124 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_types.py diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 32a5e463916..c829cac6409 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -3,11 +3,9 @@ from libc.stdint cimport int64_t from libcpp cimport bool, int from libcpp.map cimport map -from libcpp.memory cimport unique_ptr from libcpp.string cimport string -from libcpp.utility cimport move from libcpp.vector cimport 
vector - +import itertools from collections import OrderedDict try: @@ -16,23 +14,10 @@ except ImportError: import json cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.orc cimport ( - chunked_orc_writer_options, - orc_chunked_writer, - orc_writer_options, - write_orc as libcudf_write_orc, -) -from pylibcudf.libcudf.io.types cimport ( - column_in_metadata, - sink_info, - table_input_metadata, -) -from pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport make_sink_info, update_col_struct_field_names -from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table +from cudf._lib.io.utils cimport update_col_struct_field_names +from cudf._lib.utils cimport data_from_pylibcudf_io import pylibcudf as plc @@ -40,7 +25,8 @@ import cudf from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf._lib.utils import _index_level_name, generate_pandas_metadata from cudf.core.buffer import acquire_spill_lock - +from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata +from pylibcudf.io.orc cimport OrcChunkedWriter # TODO: Consider inlining this function since it seems to only be used in one place. 
cpdef read_parsed_orc_statistics(filepath_or_buffer): @@ -246,36 +232,33 @@ def write_orc( -------- cudf.read_orc """ - cdef unique_ptr[data_sink] data_sink_c - cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - cdef table_input_metadata tbl_meta - cdef map[string, string] user_data - user_data[str.encode("pandas")] = str.encode(generate_pandas_metadata( - table, index) - ) - + user_data = {} + user_data["pandas"] = generate_pandas_metadata(table, index) if index is True or ( index is None and not isinstance(table._index, cudf.RangeIndex) ): - tv = table_view_from_table(table) - tbl_meta = table_input_metadata(tv) + columns = table._columns if table._index is None else [ + *table.index._columns, *table._columns + ] + plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) + tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): tbl_meta.column_metadata[level].set_name( - str.encode( - _index_level_name(idx_name, level, table._column_names) - ) + _index_level_name(idx_name, level, table._column_names) ) num_index_cols_meta = len(table._index.names) else: - tv = table_view_from_table(table, ignore_index=True) - tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = TableInputMetadata(plc_table) num_index_cols_meta = 0 if cols_as_map_type is not None: cols_as_map_type = set(cols_as_map_type) for i, name in enumerate(table._column_names, num_index_cols_meta): - tbl_meta.column_metadata[i].set_name(name.encode()) + tbl_meta.column_metadata[i].set_name(name) _set_col_children_metadata( table[name]._column, tbl_meta.column_metadata[i], @@ -283,24 +266,24 @@ def write_orc( and (name in cols_as_map_type), ) - cdef orc_writer_options c_orc_writer_options = move( - orc_writer_options.builder( - sink_info_c, tv - ).metadata(tbl_meta) - .key_value_metadata(move(user_data)) + options = ( + 
plc.io.orc.OrcWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc_table + ) + .metadata(tbl_meta) + .key_value_metadata(user_data) .compression(_get_comp_type(compression)) .enable_statistics(_get_orc_stat_freq(statistics)) .build() ) if stripe_size_bytes is not None: - c_orc_writer_options.set_stripe_size_bytes(stripe_size_bytes) + options.set_stripe_size_bytes(stripe_size_bytes) if stripe_size_rows is not None: - c_orc_writer_options.set_stripe_size_rows(stripe_size_rows) + options.set_stripe_size_rows(stripe_size_rows) if row_index_stride is not None: - c_orc_writer_options.set_row_index_stride(row_index_stride) + options.set_row_index_stride(row_index_stride) - with nogil: - libcudf_write_orc(c_orc_writer_options) + plc.io.orc.write_orc(options) cdef int64_t get_skiprows_arg(object arg) except*: @@ -326,13 +309,12 @@ cdef class ORCWriter: cudf.io.orc.to_orc """ cdef bool initialized - cdef unique_ptr[orc_chunked_writer] writer - cdef sink_info sink - cdef unique_ptr[data_sink] _data_sink + cdef OrcChunkedWriter writer + cdef SinkInfo sink cdef str statistics cdef object compression cdef object index - cdef table_input_metadata tbl_meta + cdef TableInputMetadata tbl_meta cdef object cols_as_map_type cdef object stripe_size_bytes cdef object stripe_size_rows @@ -347,8 +329,7 @@ cdef class ORCWriter: object stripe_size_bytes=None, object stripe_size_rows=None, object row_index_stride=None): - - self.sink = make_sink_info(path, self._data_sink) + self.sink = plc.io.SinkInfo([path]) self.statistics = statistics self.compression = compression self.index = index @@ -368,17 +349,21 @@ cdef class ORCWriter: table._index.name is not None or isinstance(table._index, cudf.core.multiindex.MultiIndex) ) - tv = table_view_from_table(table, not keep_index) + if keep_index: + columns = [ + col.to_pylibcudf(mode="read") + for col in itertools.chain(table.index._columns, table._columns) + ] + else: + columns = [col.to_pylibcudf(mode="read") for col in table._columns] - 
with nogil: - self.writer.get()[0].write(tv) + self.writer.write(plc.Table(columns)) def close(self): if not self.initialized: return - with nogil: - self.writer.get()[0].close() + self.writer.close() def __dealloc__(self): self.close() @@ -387,32 +372,47 @@ cdef class ORCWriter: """ Prepare all the values required to build the chunked_orc_writer_options anb creates a writer""" - cdef table_view tv num_index_cols_meta = 0 - self.tbl_meta = table_input_metadata( - table_view_from_table(table, ignore_index=True), + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in table._columns + ] ) + self.tbl_meta = TableInputMetadata(plc_table) if self.index is not False: if isinstance(table._index, cudf.core.multiindex.MultiIndex): - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain(table.index._columns, table._columns) + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): self.tbl_meta.column_metadata[level].set_name( - (str.encode(idx_name)) + idx_name ) num_index_cols_meta = len(table._index.names) else: if table._index.name is not None: - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) self.tbl_meta.column_metadata[0].set_name( - str.encode(table._index.name) + table._index.name ) num_index_cols_meta = 1 for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name.encode()) + self.tbl_meta.column_metadata[i].set_name(name) _set_col_children_metadata( table[name]._column, self.tbl_meta.column_metadata[i], @@ -420,38 +420,37 @@ cdef class ORCWriter: and (name in self.cols_as_map_type), ) - cdef map[string, string] 
user_data + user_data = {} pandas_metadata = generate_pandas_metadata(table, self.index) - user_data[str.encode("pandas")] = str.encode(pandas_metadata) - - cdef chunked_orc_writer_options c_opts = move( - chunked_orc_writer_options.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(move(user_data)) - .compression(_get_comp_type(self.compression)) - .enable_statistics(_get_orc_stat_freq(self.statistics)) - .build() - ) + user_data["pandas"] = pandas_metadata + + options = ( + plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(_get_comp_type(self.compression)) + .enable_statistics(_get_orc_stat_freq(self.statistics)) + .build() + ) if self.stripe_size_bytes is not None: - c_opts.set_stripe_size_bytes(self.stripe_size_bytes) + options.set_stripe_size_bytes(self.stripe_size_bytes) if self.stripe_size_rows is not None: - c_opts.set_stripe_size_rows(self.stripe_size_rows) + options.set_stripe_size_rows(self.stripe_size_rows) if self.row_index_stride is not None: - c_opts.set_row_index_stride(self.row_index_stride) + options.set_row_index_stride(self.row_index_stride) - with nogil: - self.writer.reset(new orc_chunked_writer(c_opts)) + self.writer = plc.io.orc.OrcChunkedWriter.from_options(options) self.initialized = True cdef _set_col_children_metadata(Column col, - column_in_metadata& col_meta, + ColumnInMetadata col_meta, list_column_as_map=False): if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): - col_meta.child(i).set_name(name.encode()) + col_meta.child(i).set_name(name) _set_col_children_metadata( child_col, col_meta.child(i), list_column_as_map ) diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd index b111d617b1b..671f0692444 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -4,15 +4,33 @@ from libcpp 
cimport bool from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector -from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from libcpp.memory cimport unique_ptr +from libcpp.map cimport map +from pylibcudf.io.types cimport ( + SourceInfo, + SinkInfo, + TableWithMetadata, + TableInputMetadata, +) from pylibcudf.libcudf.io.orc_metadata cimport ( column_statistics, parsed_orc_statistics, statistics_type, ) +from pylibcudf.libcudf.io.orc cimport ( + orc_chunked_writer, + orc_writer_options, + orc_writer_options_builder, + chunked_orc_writer_options, + chunked_orc_writer_options_builder, +) from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType - +from pylibcudf.table cimport Table +from pylibcudf.libcudf.io.types cimport ( + compression_type, + statistics_freq, +) cpdef TableWithMetadata read_orc( SourceInfo source_info, @@ -48,3 +66,46 @@ cdef class ParsedOrcStatistics: cpdef ParsedOrcStatistics read_parsed_orc_statistics( SourceInfo source_info ) + +cdef class OrcWriterOptions: + cdef orc_writer_options c_obj + cdef Table table + cdef SinkInfo sink + cpdef void set_stripe_size_bytes(self, size_t size_bytes) + cpdef void set_stripe_size_rows(self, size_type size_rows) + cpdef void set_row_index_stride(self, size_type stride) + +cdef class OrcWriterOptionsBuilder: + cdef orc_writer_options_builder c_obj + cdef Table table + cdef SinkInfo sink + cpdef OrcWriterOptionsBuilder compression(self, compression_type comp) + cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val) + cpdef OrcWriterOptionsBuilder key_value_metadata(self, dict kvm) + cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta) + cpdef OrcWriterOptions build(self) + +cpdef void write_orc(OrcWriterOptions options) + +cdef class OrcChunkedWriter: + cdef unique_ptr[orc_chunked_writer] c_obj + cpdef void close(self) + cpdef void write(self, Table table) + +cdef class 
ChunkedOrcWriterOptions: + cdef chunked_orc_writer_options c_obj + cdef SinkInfo sink + cpdef void set_stripe_size_bytes(self, size_t size_bytes) + cpdef void set_stripe_size_rows(self, size_type size_rows) + cpdef void set_row_index_stride(self, size_type stride) + +cdef class ChunkedOrcWriterOptionsBuilder: + cdef chunked_orc_writer_options_builder c_obj + cdef SinkInfo sink + cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp) + cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val) + cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata( + self, dict kvm + ) + cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta) + cpdef ChunkedOrcWriterOptions build(self) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi index 4cf87f1a832..516f97981e9 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyi +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -1,8 +1,16 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from typing import Any +from typing import Any, Self -from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.io.types import ( + CompressionType, + SinkInfo, + SourceInfo, + StatisticsFreq, + TableInputMetadata, + TableWithMetadata, +) +from pylibcudf.table import Table from pylibcudf.types import DataType def read_orc( @@ -39,3 +47,42 @@ class ParsedOrcStatistics: def read_parsed_orc_statistics( source_info: SourceInfo, ) -> ParsedOrcStatistics: ... + +class OrcWriterOptions: + def __init__(self): ... + def set_stripe_size_bytes(self, size_bytes: int) -> None: ... + def set_stripe_size_rows(self, size_rows: int) -> None: ... + def set_row_index_stride(self, stride: int) -> None: ... + @staticmethod + def builder(sink: SinkInfo, table: Table) -> OrcWriterOptionsBuilder: ... + +class OrcWriterOptionsBuilder: + def __init__(self): ... + def compression(self, comp: CompressionType) -> Self: ... 
+ def enable_statistics(self, val: StatisticsFreq) -> Self: ... + def key_value_metadata(self, kvm: dict[str, str]) -> Self: ... + def metadata(self, meta: TableWithMetadata) -> Self: ... + def build(self) -> OrcWriterOptions: ... + +def write_orc(options: OrcWriterOptions) -> None: ... + +class OrcChunkedWriter: + def __init__(self): ... + def close(self) -> None: ... + def write(self, table: Table) -> None: ... + +class ChunkedOrcWriterOptions: + def __init__(self): ... + def set_stripe_size_bytes(self, size_bytes: int) -> None: ... + def set_stripe_size_rows(self, size_rows: int) -> None: ... + def set_row_index_stride(self, stride: int) -> None: ... + @staticmethod + def builder(sink: SinkInfo) -> ChunkedOrcWriterOptionsBuilder: ... + +class ChunkedOrcWriterOptionsBuilder: + def __init__(self): ... + def compression(self, comp: CompressionType) -> Self: ... + def enable_statistics(self, val: StatisticsFreq) -> Self: ... + def key_value_metadata(self, kvm: dict[str, str]) -> Self: ... + def metadata(self, meta: TableInputMetadata) -> Self: ... + def build(self) -> ChunkedOrcWriterOptions: ... 
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 4270f5b4f95..63eab4a9634 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -6,10 +6,11 @@ from libcpp.vector cimport vector import datetime -from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata, SinkInfo from pylibcudf.libcudf.io.orc cimport ( orc_reader_options, read_orc as cpp_read_orc, + write_orc as cpp_write_orc, ) from pylibcudf.libcudf.io.orc_metadata cimport ( binary_statistics, @@ -29,12 +30,27 @@ from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType from pylibcudf.variant cimport get_if, holds_alternative +from pylibcudf.libcudf.io.types cimport ( + compression_type, + statistics_freq, +) +from pylibcudf.libcudf.io.orc cimport ( + orc_chunked_writer, + orc_writer_options, + chunked_orc_writer_options, +) __all__ = [ "OrcColumnStatistics", "ParsedOrcStatistics", "read_orc", "read_parsed_orc_statistics", + "write_orc", + "OrcWriterOptions", + "OrcWriterOptionsBuilder", + "OrcChunkedWriter", + "ChunkedOrcWriterOptions", + "ChunkedOrcWriterOptionsBuilder", ] cdef class OrcColumnStatistics: @@ -310,3 +326,398 @@ cpdef ParsedOrcStatistics read_parsed_orc_statistics( cpp_read_parsed_orc_statistics(source_info.c_obj) ) return ParsedOrcStatistics.from_libcudf(parsed) + + +cdef class OrcWriterOptions: + cpdef void set_stripe_size_bytes(self, size_t size_bytes): + """ + Sets the maximum stripe size, in bytes. + + For details, see :cpp:func:`cudf::io::orc_writer_options::set_stripe_size_bytes` + + Parameters + ---------- + size_bytes: size_t + Sets the maximum stripe size, in bytes. 
+ + Returns + ------- + None + """ + self.c_obj.set_stripe_size_bytes(size_bytes) + + cpdef void set_stripe_size_rows(self, size_type size_rows): + """ + Sets the maximum stripe size, in rows. + + If the stripe size is smaller than the row group size, + row group size will be reduced to match the stripe size. + + For details, see :cpp:func:`cudf::io::orc_writer_options::set_stripe_size_rows` + + Parameters + ---------- + size_rows: size_type + Maximum stripe size, in rows to be set + + Returns + ------- + None + """ + self.c_obj.set_stripe_size_rows(size_rows) + + cpdef void set_row_index_stride(self, size_type stride): + """ + Sets the row index stride. + + Rounded down to a multiple of 8. + + For details, see :cpp:func:`cudf::io::orc_writer_options::set_row_index_stride` + + Parameters + ---------- + stride: size_type + Row index stride to be set + + Returns + ------- + None + """ + self.c_obj.set_row_index_stride(stride) + + @staticmethod + def builder(SinkInfo sink, Table table): + """ + Create builder to create OrcWriterOptions. + + For details, see :cpp:func:`cudf::io::orc_writer_options::builder` + + Parameters + ---------- + sink: SinkInfo + The sink used for writer output + table: Table + Table to be written to output + + Returns + ------- + OrcWriterOptionsBuilder + """ + cdef OrcWriterOptionsBuilder orc_builder = OrcWriterOptionsBuilder.__new__( + OrcWriterOptionsBuilder + ) + orc_builder.c_obj = orc_writer_options.builder(sink.c_obj, table.view()) + orc_builder.table = table + orc_builder.sink = sink + return orc_builder + + +cdef class OrcWriterOptionsBuilder: + cpdef OrcWriterOptionsBuilder compression(self, compression_type comp): + """ + Sets compression type.
+ + For details, see :cpp:func:`cudf::io::orc_writer_options_builder::compression` + + Parameters + ---------- + comp: CompressionType + The compression type to use + + Returns + ------- + OrcWriterOptionsBuilder + """ + self.c_obj.compression(comp) + return self + + cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val): + """ + Choose granularity of column statistics to be written. + + For details, see :cpp:func:`cudf::io::orc_writer_options_builder::enable_statistics` + + Parameters + ---------- + val: StatisticsFreq + Level of statistics collection + + Returns + ------- + OrcWriterOptionsBuilder + """ + self.c_obj.enable_statistics(val) + return self + + cpdef OrcWriterOptionsBuilder key_value_metadata(self, dict kvm): + """ + Sets Key-Value footer metadata. + + Parameters + ---------- + kvm: dict + Key-Value footer metadata + + Returns + ------- + OrcWriterOptionsBuilder + """ + self.c_obj.key_value_metadata( + {key.encode(): value.encode() for key, value in kvm.items()} + ) + return self + + cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): + """ + Sets associated metadata. + + For details, see :cpp:func:`cudf::io::orc_writer_options_builder::metadata` + + Parameters + ---------- + meta: TableInputMetadata + Associated metadata + + Returns + ------- + OrcWriterOptionsBuilder + """ + self.c_obj.metadata(meta.c_obj) + return self + + cpdef OrcWriterOptions build(self): + """Moves the ORC writer options builder""" + cdef OrcWriterOptions orc_options = OrcWriterOptions.__new__( + OrcWriterOptions + ) + orc_options.c_obj = move(self.c_obj.build()) + orc_options.table = self.table + orc_options.sink = self.sink + return orc_options + + +cpdef void write_orc(OrcWriterOptions options): + """ + Write to ORC format. + + The table to write, output paths, and options are encapsulated + by the `options` object. + + For details, see :cpp:func:`cudf::io::write_orc`.
+ + Parameters + ---------- + options: OrcWriterOptions + Settings for controlling writing behavior + + Returns + ------- + None + """ + with nogil: + cpp_write_orc(move(options.c_obj)) + + +cdef class OrcChunkedWriter: + cpdef void close(self): + """ + Closes the chunked ORC writer. + + Returns + ------- + None + """ + with nogil: + self.c_obj.get()[0].close() + + cpdef void write(self, Table table): + """ + Writes table to output. + + Parameters + ---------- + table: Table + Table that needs to be written + + Returns + ------- + None + """ + with nogil: + self.c_obj.get()[0].write(table.view()) + + @staticmethod + def from_options(ChunkedOrcWriterOptions options): + """ + Creates a chunked ORC writer from options + + Parameters + ---------- + options: ChunkedOrcWriterOptions + Settings for controlling writing behavior + + Returns + ------- + OrcChunkedWriter + """ + cdef OrcChunkedWriter orc_writer = OrcChunkedWriter.__new__( + OrcChunkedWriter + ) + orc_writer.c_obj.reset(new orc_chunked_writer(options.c_obj)) + return orc_writer + + +cdef class ChunkedOrcWriterOptions: + cpdef void set_stripe_size_bytes(self, size_t size_bytes): + """ + Sets the maximum stripe size, in bytes. + + Parameters + ---------- + size_bytes: size_t + Sets the maximum stripe size, in bytes. + + Returns + ------- + None + """ + self.c_obj.set_stripe_size_bytes(size_bytes) + + cpdef void set_stripe_size_rows(self, size_type size_rows): + """ + Sets the maximum stripe size, in rows. + + If the stripe size is smaller than the row group size, + row group size will be reduced to match the stripe size. + + Parameters + ---------- + size_rows: size_type + Maximum stripe size, in rows to be set + + Returns + ------- + None + """ + self.c_obj.set_stripe_size_rows(size_rows) + + cpdef void set_row_index_stride(self, size_type stride): + """ + Sets the row index stride. + + Rounded down to a multiple of 8. 
+ + Parameters + ---------- + stride: size_type + Row index stride to be set + + Returns + ------- + None + """ + self.c_obj.set_row_index_stride(stride) + + @staticmethod + def builder(SinkInfo sink): + """ + Create builder to create ChunkedOrcWriterOptions. + + Parameters + ---------- + sink: SinkInfo + The sink used for writer output + table: Table + Table to be written to output + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + cdef ChunkedOrcWriterOptionsBuilder orc_builder = \ + ChunkedOrcWriterOptionsBuilder.__new__( + ChunkedOrcWriterOptionsBuilder + ) + orc_builder.c_obj = chunked_orc_writer_options.builder(sink.c_obj) + orc_builder.sink = sink + return orc_builder + + +cdef class ChunkedOrcWriterOptionsBuilder: + cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp): + """ + Sets compression type. + + Parameters + ---------- + comp: CompressionType + The compression type to use + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + self.c_obj.compression(comp) + return self + + cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val): + """ + Choose granularity of column statistics to be written. + + Parameters + ---------- + val: StatisticsFreq + Level of statistics collection + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + self.c_obj.enable_statistics(val) + return self + + cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata( + self, + dict kvm + ): + """ + Sets Key-Value footer metadata. + + Parameters + ---------- + kvm: dict + Key-Value footer metadata + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + self.c_obj.key_value_metadata( + {key.encode(): value.encode() for key, value in kvm.items()} + ) + return self + + cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): + """ + Sets associated metadata. 
+ + Parameters + ---------- + meta: TableInputMetadata + Associated metadata + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + self.c_obj.metadata(meta.c_obj) + return self + + cpdef ChunkedOrcWriterOptions build(self): + """Create a ChunkedOrcWriterOptions object""" + cdef ChunkedOrcWriterOptions orc_options = ChunkedOrcWriterOptions.__new__( + ChunkedOrcWriterOptions + ) + orc_options.c_obj = move(self.c_obj.build()) + orc_options.sink = self.sink + return orc_options diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index 90b43cf0ff5..a1f3b17936c 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -3,6 +3,7 @@ from libc.stdint cimport uint8_t, int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from libcpp cimport bool from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.types cimport ( column_encoding, @@ -22,16 +23,16 @@ from pylibcudf.libcudf.io.types cimport ( ) from pylibcudf.libcudf.types cimport size_type from pylibcudf.table cimport Table - +from pylibcudf.libcudf.types cimport size_type cdef class PartitionInfo: cdef partition_info c_obj cdef class ColumnInMetadata: - cdef column_in_metadata c_obj + cdef column_in_metadata* c_obj + cdef TableInputMetadata owner - @staticmethod - cdef ColumnInMetadata from_metadata(column_in_metadata metadata) + cdef TableInputMetadata table cpdef ColumnInMetadata set_name(self, str name) @@ -43,7 +44,7 @@ cdef class ColumnInMetadata: cpdef ColumnInMetadata set_int96_timestamps(self, bool req) - cpdef ColumnInMetadata set_decimal_precision(self, uint8_t req) + cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision) cpdef ColumnInMetadata child(self, size_type i) @@ -57,8 +58,14 @@ cdef class ColumnInMetadata: cpdef str get_name(self) + @staticmethod + cdef ColumnInMetadata from_libcudf( + column_in_metadata* metadata, 
TableInputMetadata owner + ) + cdef class TableInputMetadata: cdef table_input_metadata c_obj + cdef list column_metadata cdef class TableWithMetadata: cdef public Table tbl diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi index 04f276cfeee..a3a559219ff 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyi +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -3,7 +3,7 @@ import io import os from collections.abc import Mapping from enum import IntEnum -from typing import Any, Literal, TypeAlias, overload +from typing import Any, Literal, Self, TypeAlias, overload from pylibcudf.column import Column from pylibcudf.io.datasource import Datasource @@ -66,16 +66,16 @@ class TableInputMetadata: def __init__(self, table: Table): ... class ColumnInMetadata: - def set_name(self, name: str) -> ColumnInMetadata: ... - def set_nullability(self, nullable: bool) -> ColumnInMetadata: ... - def set_list_column_as_map(self) -> ColumnInMetadata: ... - def set_int96_timestamps(self, req: bool) -> ColumnInMetadata: ... - def set_decimal_precision(self, precision: int) -> ColumnInMetadata: ... - def child(self, i: int) -> ColumnInMetadata: ... - def set_output_as_binary(self, binary: bool) -> ColumnInMetadata: ... - def set_type_length(self, type_length: int) -> ColumnInMetadata: ... - def set_skip_compression(self, skip: bool) -> ColumnInMetadata: ... - def set_encoding(self, encoding: ColumnEncoding) -> ColumnInMetadata: ... + def set_name(self, name: str) -> Self: ... + def set_nullability(self, nullable: bool) -> Self: ... + def set_list_column_as_map(self) -> Self: ... + def set_int96_timestamps(self, req: bool) -> Self: ... + def set_decimal_precision(self, precision: int) -> Self: ... + def child(self, i: int) -> Self: ... + def set_output_as_binary(self, binary: bool) -> Self: ... + def set_type_length(self, type_length: int) -> Self: ... + def set_skip_compression(self, skip: bool) -> Self: ... 
+ def set_encoding(self, encoding: ColumnEncoding) -> Self: ... def get_name(self) -> str: ... class TableWithMetadata: diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 460ab6844c3..a2155829f2c 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -2,7 +2,6 @@ from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory -from libc.stdint cimport uint8_t, int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -20,6 +19,8 @@ from pylibcudf.libcudf.io.types cimport ( source_info, table_input_metadata, table_with_metadata, + column_in_metadata, + table_input_metadata, ) from pylibcudf.libcudf.types cimport size_type @@ -38,9 +39,14 @@ from pylibcudf.libcudf.io.types import ( quote_style as QuoteStyle, # no-cython-lint statistics_freq as StatisticsFreq, # no-cython-lint ) +from cython.operator cimport dereference +from pylibcudf.libcudf.types cimport size_type +from cython.operator cimport dereference +from pylibcudf.libcudf.types cimport size_type __all__ = [ "ColumnEncoding", + "ColumnInMetadata", "CompressionType", "DictionaryPolicy", "JSONRecoveryMode", @@ -74,18 +80,30 @@ cdef class ColumnInMetadata: Metadata for a column """ + def __init__(self): + raise ValueError( + "ColumnInMetadata should not be constructed directly. " + "Use one of the factories." + ) + @staticmethod - cdef ColumnInMetadata from_metadata(column_in_metadata metadata): + cdef ColumnInMetadata from_libcudf( + column_in_metadata* metadata, TableInputMetadata owner + ): """ - Construct a ColumnInMetadata. + A Python representation of `column_in_metadata`. Parameters ---------- - metadata : column_in_metadata - """ - cdef ColumnInMetadata col_metadata = ColumnInMetadata.__new__(ColumnInMetadata) - col_metadata.c_obj = metadata - return col_metadata + metadata : column_in_metadata* + Raw pointer to C++ metadata. 
+ owner : TableInputMetadata + Owning table input metadata that manages lifetime of the raw pointer. + """ + cdef ColumnInMetadata out = ColumnInMetadata.__new__(ColumnInMetadata) + out.c_obj = metadata + out.owner = owner + return out cpdef ColumnInMetadata set_name(self, str name): """ @@ -100,7 +118,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_name(name.encode()) + dereference(self.c_obj).set_name(name.encode()) return self cpdef ColumnInMetadata set_nullability(self, bool nullable): @@ -116,7 +134,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_nullability(nullable) + dereference(self.c_obj).set_nullability(nullable) return self cpdef ColumnInMetadata set_list_column_as_map(self): @@ -128,7 +146,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_list_column_as_map() + dereference(self.c_obj).set_list_column_as_map() return self cpdef ColumnInMetadata set_int96_timestamps(self, bool req): @@ -145,7 +163,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_int96_timestamps(req) + dereference(self.c_obj).set_int96_timestamps(req) return self cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision): @@ -162,7 +180,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_decimal_precision(precision) + dereference(self.c_obj).set_decimal_precision(precision) return self cpdef ColumnInMetadata child(self, size_type i): @@ -178,7 +196,8 @@ cdef class ColumnInMetadata: ------- ColumnInMetadata """ - return ColumnInMetadata.from_metadata(self.c_obj.child(i)) + cdef column_in_metadata* child_c_obj = &dereference(self.c_obj).child(i) + return ColumnInMetadata.from_libcudf(child_c_obj, self.owner) cpdef ColumnInMetadata set_output_as_binary(self, bool binary): """ @@ -193,7 +212,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_output_as_binary(binary) + dereference(self.c_obj).set_output_as_binary(binary) return self cpdef ColumnInMetadata 
set_type_length(self, int32_t type_length): @@ -209,7 +228,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_type_length(type_length) + dereference(self.c_obj).set_type_length(type_length) return self cpdef ColumnInMetadata set_skip_compression(self, bool skip): @@ -226,7 +245,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_skip_compression(skip) + dereference(self.c_obj).set_skip_compression(skip) return self cpdef ColumnInMetadata set_encoding(self, column_encoding encoding): @@ -243,7 +262,7 @@ cdef class ColumnInMetadata: ------- ColumnInMetadata """ - self.c_obj.set_encoding(encoding) + dereference(self.c_obj).set_encoding(encoding) return self cpdef str get_name(self): @@ -255,7 +274,7 @@ cdef class ColumnInMetadata: str The name of this column """ - return self.c_obj.get_name().decode() + return dereference(self.c_obj).get_name().decode() cdef class TableInputMetadata: @@ -269,6 +288,10 @@ cdef class TableInputMetadata: """ def __init__(self, Table table): self.c_obj = table_input_metadata(table.view()) + self.column_metadata = [ + ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i], self) + for i in range(self.c_obj.column_metadata.size()) + ] cdef class TableWithMetadata: diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 5ed660ba6cf..2557e40c935 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,4 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+ import pyarrow as pa import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source @@ -52,3 +53,64 @@ def test_read_orc_basic( ) assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + + +@pytest.mark.parametrize( + "compression", + [ + plc.io.types.CompressionType.NONE, + plc.io.types.CompressionType.SNAPPY, + ], +) +@pytest.mark.parametrize( + "statistics", + [ + plc.io.types.StatisticsFreq.STATISTICS_NONE, + plc.io.types.StatisticsFreq.STATISTICS_COLUMN, + ], +) +@pytest.mark.parametrize("stripe_size_bytes", [None, 65536]) +@pytest.mark.parametrize("stripe_size_rows", [None, 512]) +@pytest.mark.parametrize("row_index_stride", [None, 512]) +def test_roundtrip_pa_table( + compression, + statistics, + stripe_size_bytes, + stripe_size_rows, + row_index_stride, + tmp_path, +): + pa_table = pa.table({"a": [1.0, 2.0, None], "b": [True, None, False]}) + plc_table = plc.interop.from_arrow(pa_table) + + tmpfile_name = tmp_path / "test.orc" + + sink = plc.io.SinkInfo([str(tmpfile_name)]) + + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + user_data = {"a": "", "b": ""} + options = ( + plc.io.orc.OrcWriterOptions.builder(sink, plc_table) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(compression) + .enable_statistics(statistics) + .build() + ) + if stripe_size_bytes is not None: + options.set_stripe_size_bytes(stripe_size_bytes) + if stripe_size_rows is not None: + options.set_stripe_size_rows(stripe_size_rows) + if row_index_stride is not None: + options.set_row_index_stride(row_index_stride) + + plc.io.orc.write_orc(options) + + read_table = pa.orc.read_table(str(tmpfile_name)) + + res = plc.io.types.TableWithMetadata( + plc.interop.from_arrow(read_table), + [(name, []) for name in pa_table.schema.names], + ) + + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_types.py b/python/pylibcudf/pylibcudf/tests/io/test_types.py new 
file mode 100644 index 00000000000..a7642556bf2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_types.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import gc +import weakref + +import pyarrow as pa + +import pylibcudf as plc + + +def test_gc_with_table_and_column_input_metadata(): + class Foo(plc.io.types.TableInputMetadata): + def __del__(self): + pass + + pa_table = pa.table( + {"a": pa.array([1, 2, 3]), "b": pa.array(["a", "b", "c"])} + ) + plc_table = plc.interop.from_arrow(pa_table) + + tbl_meta = Foo(plc_table) + weak_tbl_meta = weakref.ref(tbl_meta) + + del tbl_meta + + gc.collect() + + assert weak_tbl_meta() is None