From f42197f54eb988511bdc6f47c9d4cac13438709d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:36:33 -0800 Subject: [PATCH] Remove cudf._lib.utils in favor of python APIs --- python/cudf/cudf/_lib/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/__init__.pxd | 0 python/cudf/cudf/_lib/utils.pxd | 6 -- python/cudf/cudf/_lib/utils.pyx | 94 ---------------------------- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/io/avro.py | 13 +++- python/cudf/cudf/io/csv.py | 16 +++-- python/cudf/cudf/io/json.py | 26 ++++---- python/cudf/cudf/io/orc.py | 37 +++++++++-- python/cudf/cudf/io/parquet.py | 29 +++++---- 10 files changed, 85 insertions(+), 142 deletions(-) delete mode 100644 python/cudf/cudf/_lib/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/utils.pxd delete mode 100644 python/cudf/cudf/_lib/utils.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 410fd57691e..a9ba24006d6 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx groupby.pyx scalar.pyx strings_udf.pyx types.pyx utils.pyx) +set(cython_sources column.pyx groupby.pyx scalar.pyx strings_udf.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/__init__.pxd b/python/cudf/cudf/_lib/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd deleted file mode 100644 index 900be721c9a..00000000000 --- a/python/cudf/cudf/_lib/utils.pxd +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*) -cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) -cpdef columns_from_pylibcudf_table(tbl) -cpdef _data_from_columns(columns, column_names, index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx deleted file mode 100644 index 975c9eb741c..00000000000 --- a/python/cudf/cudf/_lib/utils.pyx +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -import cudf - -from cudf._lib.column cimport Column - - -cpdef columns_from_pylibcudf_table(tbl): - """Convert a pylibcudf table into list of columns. - - Parameters - ---------- - tbl : pylibcudf.Table - The pylibcudf table whose columns will be extracted - - Returns - ------- - list[Column] - A list of columns. - """ - return [Column.from_pylibcudf(plc) for plc in tbl.columns()] - - -cpdef _data_from_columns(columns, column_names, index_names=None): - """Convert a list of columns into a dict with an index. - - This method is intended to provide the bridge between the columns returned - from calls to libcudf or pylibcudf APIs and the cuDF Python Frame objects, which - require named columns and a separate index. - - Since cuDF Python has an independent representation of a table as a - collection of columns, this function simply returns a dict of columns - suitable for conversion into data to be passed to cuDF constructors. - This method returns the columns of the table in the order they are - stored in libcudf, but calling code is responsible for partitioning and - labeling them as needed. - - Parameters - ---------- - columns : list[Column] - The columns to be extracted - column_names : iterable - The keys associated with the columns in the output data. - index_names : iterable, optional - If provided, an iterable of strings that will be used to label the - corresponding first set of columns into a (Multi)Index. If this - argument is omitted, all columns are assumed to be part of the output - table and no index is constructed. - """ - # First construct the index, if any - index = ( - # TODO: For performance, the _from_data methods of Frame types assume - # that the passed index object is already an Index because cudf.Index - # and cudf.as_index are expensive. As a result, this function is - # currently somewhat inconsistent in returning a dict of columns for - # the data while actually constructing the Index object here (instead - # of just returning a dict for that as well). As we clean up the - # Frame factories we may want to look for a less dissonant approach - # that does not impose performance penalties. - cudf.core.index._index_from_data( - { - name: columns[i] - for i, name in enumerate(index_names) - } - ) - if index_names is not None - else None - ) - n_index_columns = len(index_names) if index_names is not None else 0 - data = { - name: columns[i + n_index_columns] - for i, name in enumerate(column_names) - } - return data, index - - -cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None): - return _data_from_columns( - columns_from_pylibcudf_table(tbl), - column_names, - index_names - ) - -cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None): - """ - Unpacks the TableWithMetadata from libcudf I/O - into a dict of columns and an Index (cuDF format) - """ - if column_names is None: - column_names = tbl_with_meta.column_names(include_children=False) - return _data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names=column_names, - index_names=index_names - ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9aadbf8f47a..8f45c6f0115 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -861,7 +861,9 @@ def _quantile_table( column_order, null_precedence, ) - columns = libcudf.utils.columns_from_pylibcudf_table(plc_table) + columns = [ + ColumnBase.from_pylibcudf(col) for col in plc_table.columns() + ] return self._from_columns_like_self( columns, column_names=self._column_names, diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index 4966cdb86e1..dcbdd4423fc 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -3,7 +3,7 @@ import pylibcudf as plc import cudf -from cudf._lib.utils import data_from_pylibcudf_io +from cudf._lib.column import Column from cudf.utils import ioutils @@ -46,5 +46,12 @@ def read_avro( options.set_columns(columns) plc_result = plc.io.avro.read_avro(options) - - return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result)) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip( + plc_result.column_names(include_children=False), + plc_result.columns, + strict=True, + ) + } + return cudf.DataFrame._from_data(data) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index da9a66f3874..6d617cbf38e 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -15,8 +15,8 @@ import pylibcudf as plc import cudf +from cudf._lib.column import Column from cudf._lib.types import dtype_to_pylibcudf_type -from cudf._lib.utils import data_from_pylibcudf_io from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils @@ -251,9 +251,17 @@ def read_csv( if na_values is not None: options.set_na_values([str(val) for val in na_values]) - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) - ) + table_w_meta = plc.io.csv.read_csv(options) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip( + table_w_meta.column_names(include_children=False), + table_w_meta.columns, + strict=True, + ) + } + + df = cudf.DataFrame._from_data(data) if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 39a85465deb..a31f01e5956 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -15,7 +15,6 @@ import cudf from cudf._lib.column import Column from cudf._lib.types import dtype_to_pylibcudf_type -from cudf._lib.utils import _data_from_columns, data_from_pylibcudf_io from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type @@ -172,13 +171,11 @@ def read_json( ) ) ) - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[Column.from_pylibcudf(col) for col in res_cols], - column_names=res_col_names, - index_names=None, - ) - ) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip(res_col_names, res_cols, strict=True) + } + df = cudf.DataFrame._from_data(data) ioutils._add_df_col_struct_names(df, res_child_names) return df else: @@ -201,10 +198,15 @@ def read_json( extra_parameters=kwargs, ) ) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(table_w_meta) - ) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip( + table_w_meta.column_names(include_children=False), + table_w_meta.columns, + strict=True, + ) + } + df = cudf.DataFrame._from_data(data) # Post-processing to add in struct column names ioutils._add_df_col_struct_names(df, table_w_meta.child_names) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 5103137bc77..f3124552fd1 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -10,10 +10,11 @@ import pylibcudf as plc import cudf +from cudf._lib.column import Column from cudf._lib.types import dtype_to_pylibcudf_type -from cudf._lib.utils import data_from_pylibcudf_io from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock +from cudf.core.index import _index_from_data from cudf.utils import ioutils try: @@ -323,11 +324,35 @@ def read_orc( actual_index_names = list(index_col_names.values()) col_names = names[len(actual_index_names) :] - data, index = data_from_pylibcudf_io( - tbl_w_meta, - col_names if columns is None else names, - actual_index_names, - ) + result_col_names = col_names if columns is None else names + if actual_index_names is None: + index = None + data = { + name: Column.from_pylibcudf(col) + for name, col in zip( + result_col_names, tbl_w_meta.columns, strict=True + ) + } + else: + result_columns = [ + Column.from_pylibcudf(col) for col in tbl_w_meta.columns + ] + index = _index_from_data( + dict( + zip( + actual_index_names, + result_columns[: len(actual_index_names)], + strict=True, + ) + ) + ) + data = dict( + zip( + result_col_names, + result_columns[len(actual_index_names) :], + strict=True, + ) + ) if is_range_index: index = range_idx diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index c13489630a3..feb6e12da8c 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -23,10 +23,6 @@ import cudf from cudf._lib.column import Column -from cudf._lib.utils import ( - _data_from_columns, - data_from_pylibcudf_io, -) from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.column import as_column, column_empty @@ -1238,16 +1234,11 @@ def _read_parquet( # Drop residual columns to save memory tbl._columns[i] = None - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[ - Column.from_pylibcudf(plc) - for plc in concatenated_columns - ], - column_names=column_names, - index_names=None, - ) - ) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip(column_names, concatenated_columns) + } + df = cudf.DataFrame._from_data(data) df = _process_metadata( df, column_names, @@ -1287,8 +1278,16 @@ def _read_parquet( options.set_filter(filters) tbl_w_meta = plc.io.parquet.read_parquet(options) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip( + tbl_w_meta.column_names(include_children=False), + tbl_w_meta.columns, + strict=True, + ) + } - df = cudf.DataFrame._from_data(*data_from_pylibcudf_io(tbl_w_meta)) + df = cudf.DataFrame._from_data(data) df = _process_metadata( df,