From b9760ac12b593521b7afb803f0d40d5e7996e01a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:01:45 -0800 Subject: [PATCH] Remove cudf._lib.interop in favor of inlining pylibcudf (#17555) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17555 --- python/cudf/cudf/_lib/CMakeLists.txt | 9 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/interop.pyx | 111 ----------------------- python/cudf/cudf/core/column/column.py | 48 ++++++---- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/decimal.py | 10 +- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/frame.py | 15 +-- python/cudf/cudf/io/dlpack.py | 27 +++--- 9 files changed, 62 insertions(+), 165 deletions(-) delete mode 100644 python/cudf/cudf/_lib/interop.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index bfbfbfed333..410fd57691e 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,9 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx groupby.pyx interop.pyx scalar.pyx strings_udf.pyx types.pyx - utils.pyx -) +set(cython_sources column.pyx groupby.pyx scalar.pyx strings_udf.pyx types.pyx utils.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -24,8 +22,3 @@ rapids_cython_create_modules( ) target_link_libraries(strings_udf PUBLIC cudf_strings_udf) -target_include_directories(interop PUBLIC "$") - -include(${rapids-cmake-dir}/export/find_package_root.cmake) -include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) -target_link_libraries(interop PUBLIC nanoarrow) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index e18e05cc43e..6b5a7814e48 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -3,7 +3,6 @@ from . import ( groupby, - interop, strings_udf, ) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx deleted file mode 100644 index 1c9d3a01b80..00000000000 --- a/python/cudf/cudf/_lib/interop.pyx +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import ListDtype, StructDtype - - -def from_dlpack(object dlpack_capsule): - """ - Converts a DLPack Tensor PyCapsule into a list of columns. - - DLPack Tensor PyCapsule is expected to have the name "dltensor". - """ - return columns_from_pylibcudf_table( - pylibcudf.interop.from_dlpack(dlpack_capsule) - ) - - -def to_dlpack(list source_columns): - """ - Converts a list of columns into a DLPack Tensor PyCapsule. - - DLPack Tensor PyCapsule will have the name "dltensor". - """ - return pylibcudf.interop.to_dlpack( - pylibcudf.Table( - [col.to_pylibcudf(mode="read") for col in source_columns] - ) - ) - - -def gather_metadata(object cols_dtypes): - """ - Generates a ColumnMetadata vector for each column. - - Parameters - ---------- - cols_dtypes : iterable - An iterable of ``(column_name, dtype)`` pairs. - """ - cpp_metadata = [] - if cols_dtypes is not None: - for idx, (col_name, col_dtype) in enumerate(cols_dtypes): - cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name)) - if isinstance(col_dtype, (ListDtype, StructDtype)): - _set_col_children_metadata(col_dtype, cpp_metadata[idx]) - else: - raise TypeError( - "An iterable of (column_name, dtype) pairs is required to " - "construct column_metadata" - ) - return cpp_metadata - - -def _set_col_children_metadata(dtype, col_meta): - if isinstance(dtype, StructDtype): - for name, value in dtype.fields.items(): - element_metadata = pylibcudf.interop.ColumnMetadata(name) - _set_col_children_metadata(value, element_metadata) - col_meta.children_meta.append(element_metadata) - elif isinstance(dtype, ListDtype): - # Offsets - child 0 - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - # Element column - child 1 - element_metadata = pylibcudf.interop.ColumnMetadata() - _set_col_children_metadata(dtype.element_type, element_metadata) - col_meta.children_meta.append(element_metadata) - else: - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - -@acquire_spill_lock() -def to_arrow(list source_columns, object column_dtypes): - """Convert a list of columns from - cudf Frame to a PyArrow Table. - - Parameters - ---------- - source_columns : a list of columns to convert - column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs - - Returns - ------- - pyarrow table - """ - cpp_metadata = gather_metadata(column_dtypes) - return pylibcudf.interop.to_arrow( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), - cpp_metadata, - ) - - -@acquire_spill_lock() -def from_arrow(object input_table): - """Convert from PyArrow Table to a list of columns. - - Parameters - ---------- - input_table : PyArrow table - - Returns - ------- - A list of columns to construct Frame object - """ - return columns_from_pylibcudf_table( - pylibcudf.interop.from_arrow(input_table) - ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2515157253c..cccafaeba88 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -279,6 +279,7 @@ def dropna(self) -> Self: else: return self.copy() + @acquire_spill_lock() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -295,9 +296,7 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow([self], [("None", self.dtype)])[ - "None" - ].chunk(0) + return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -334,26 +333,33 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: if isinstance(array.type, pa.DictionaryType): indices_table = pa.table( - { - "None": pa.chunked_array( - [chunk.indices for chunk in data["None"].chunks], + [ + pa.chunked_array( + [chunk.indices for chunk in data.column(0).chunks], type=array.type.index_type, ) - } + ], + [None], ) dictionaries_table = pa.table( - { - "None": pa.chunked_array( - [chunk.dictionary for chunk in data["None"].chunks], + [ + pa.chunked_array( + [chunk.dictionary for chunk in data.column(0).chunks], type=array.type.value_type, ) - } + ], + [None], ) - - codes = libcudf.interop.from_arrow(indices_table)[0] - categories = libcudf.interop.from_arrow(dictionaries_table)[0] + with acquire_spill_lock(): + codes = cls.from_pylibcudf( + plc.interop.from_arrow(indices_table).columns()[0] + ) + categories = cls.from_pylibcudf( + plc.interop.from_arrow(dictionaries_table).columns()[0] + ) codes = cudf.core.column.categorical.as_unsigned_codes( - len(categories), codes + len(categories), + codes, # type: ignore[arg-type] ) return cudf.core.column.CategoricalColumn( data=None, @@ -364,10 +370,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: mask=codes.base_mask, children=(codes,), ) - - result = libcudf.interop.from_arrow(data)[0] - - return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) + else: + result = cls.from_pylibcudf( + plc.interop.from_arrow(data).columns()[0] + ) + # TODO: cudf_dtype_from_pa_type may be less necessary for some types + return result._with_type_metadata( + cudf_dtype_from_pa_type(array.type) + ) @acquire_spill_lock() def _get_mask_as_column(self) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1a820da3c62..b6a4122ebb9 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1016,7 +1016,7 @@ def to_pandas( self.dtype.tz, ambiguous="NaT", nonexistent="NaT" ) - def to_arrow(self): + def to_arrow(self) -> pa.Array: return pa.compute.assume_timezone( self._local_time.to_arrow(), str(self.dtype.tz) ) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 9e6a73f1a9c..09941665ba2 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -269,8 +269,8 @@ def from_arrow(cls, data: pa.Array): mask=mask, ) - def to_arrow(self): - data_buf_32 = np.array(self.base_data.memoryview()).view("int32") + def to_arrow(self) -> pa.Array: + data_buf_32 = np.array(self.base_data.memoryview()).view("int32") # type: ignore[union-attr] data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32") # use striding to set the first 32 bits of each 128-bit chunk: @@ -337,7 +337,7 @@ def from_arrow(cls, data: pa.Array): result.dtype.precision = data.type.precision return result - def to_arrow(self): + def to_arrow(self) -> pa.Array: return super().to_arrow().cast(self.dtype.to_arrow()) def _with_type_metadata( @@ -396,8 +396,8 @@ def from_arrow(cls, data: pa.Array): mask=mask, ) - def to_arrow(self): - data_buf_64 = np.array(self.base_data.memoryview()).view("int64") + def to_arrow(self) -> pa.Array: + data_buf_64 = np.array(self.base_data.memoryview()).view("int64") # type: ignore[union-attr] data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") # use striding to set the first 64 bits of each 128-bit chunk: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ba98e28f6a2..3d9440cdf21 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -150,7 +150,7 @@ def offsets(self) -> NumericalColumn: """ return cast(NumericalColumn, self.children[0]) - def to_arrow(self): + def to_arrow(self) -> pa.Array: offsets = self.offsets.to_arrow() elements = ( pa.nulls(len(self.elements)) @@ -160,7 +160,7 @@ def to_arrow(self): pa_type = pa.list_(elements.type) if self.nullable: - nbuf = pa.py_buffer(self.mask.memoryview()) + nbuf = pa.py_buffer(self.mask.memoryview()) # type: ignore[union-attr] buffers = (nbuf, offsets.buffers()[1]) else: buffers = offsets.buffers() diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ba9b15667f1..9aadbf8f47a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -946,16 +946,17 @@ def from_arrow(cls, data: pa.Table) -> Self: if len(dict_indices): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - indices_columns = libcudf.interop.from_arrow(dict_indices_table) + plc_indices = plc.interop.from_arrow(dict_indices_table) # as dictionary size can vary, it can't be a single table cudf_dictionaries_columns = { name: ColumnBase.from_arrow(dict_dictionaries[name]) for name in dict_dictionaries.keys() } - for name, codes in zip( - dict_indices_table.column_names, indices_columns + for name, plc_codes in zip( + dict_indices_table.column_names, plc_indices.columns() ): + codes = libcudf.column.Column.from_pylibcudf(plc_codes) categories = cudf_dictionaries_columns[name] codes = as_unsigned_codes(len(categories), codes) cudf_category_frame[name] = CategoricalColumn( @@ -971,9 +972,9 @@ def from_arrow(cls, data: pa.Table) -> Self: # Handle non-dict arrays cudf_non_category_frame = { - name: col - for name, col in zip( - data.column_names, libcudf.interop.from_arrow(data) + name: libcudf.column.Column.from_pylibcudf(plc_col) + for name, plc_col in zip( + data.column_names, plc.interop.from_arrow(data).columns() ) } @@ -1032,7 +1033,7 @@ def from_arrow(cls, data: pa.Table) -> Self: return cls._from_data({name: result[name] for name in column_names}) @_performance_tracking - def to_arrow(self): + def to_arrow(self) -> pa.Table: """ Convert to arrow Table diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index fe8e446f9c0..3b3fd5f7c56 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,13 +1,14 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. +from __future__ import annotations +import pylibcudf as plc import cudf -from cudf._lib import interop as libdlpack from cudf.core.column import ColumnBase from cudf.utils import ioutils -def from_dlpack(pycapsule_obj): +def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame: """Converts from a DLPack tensor to a cuDF object. DLPack is an open-source memory tensor structure: @@ -33,18 +34,21 @@ def from_dlpack(pycapsule_obj): cuDF from_dlpack() assumes column-major (Fortran order) input. If the input tensor is row-major, transpose it before passing it to this function. """ + plc_table = plc.interop.from_dlpack(pycapsule_obj) + data = dict( + enumerate( + (ColumnBase.from_pylibcudf(col) for col in plc_table.columns()) + ) + ) - columns = libdlpack.from_dlpack(pycapsule_obj) - data = dict(enumerate(columns)) - - if len(columns) == 1: + if len(data) == 1: return cudf.Series._from_data(data) else: return cudf.DataFrame._from_data(data) @ioutils.doc_to_dlpack() -def to_dlpack(cudf_obj): +def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex): """Converts a cuDF object to a DLPack tensor. DLPack is an open-source memory tensor structure: @@ -80,13 +84,14 @@ def to_dlpack(cudf_obj): if any( not cudf.api.types._is_non_decimal_numeric_dtype(dtype) - for _, dtype in gdf._dtypes + for _, dtype in gdf._dtypes # type: ignore[union-attr] ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [dtype for _, dtype in gdf._dtypes] + [dtype for _, dtype in gdf._dtypes] # type: ignore[union-attr] ) gdf = gdf.astype(dtype) - - return libdlpack.to_dlpack([*gdf._columns]) + return plc.interop.to_dlpack( + plc.Table([col.to_pylibcudf(mode="read") for col in gdf._columns]) + )