From e5753e3a0c2d161477de5edabe91b3f013246187 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:02:57 -0500 Subject: [PATCH] Add Avro Reader options classes to pylibcudf (#17599) Apart of #17565 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17599 --- python/cudf/cudf/io/avro.py | 17 +- python/pylibcudf/pylibcudf/io/avro.pxd | 25 ++- python/pylibcudf/pylibcudf/io/avro.pyi | 21 ++- python/pylibcudf/pylibcudf/io/avro.pyx | 156 ++++++++++++++---- .../pylibcudf/pylibcudf/tests/io/test_avro.py | 13 +- 5 files changed, 173 insertions(+), 59 deletions(-) diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index 11730e98c95..4966cdb86e1 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -33,11 +33,18 @@ def read_avro( if not isinstance(skip_rows, int) or skip_rows < 0: raise TypeError("skip_rows must be an int >= 0") - plc_result = plc.io.avro.read_avro( - plc.io.types.SourceInfo([filepath_or_buffer]), - columns, - skip_rows, - num_rows, + options = ( + plc.io.avro.AvroReaderOptions.builder( + plc.io.types.SourceInfo([filepath_or_buffer]) + ) + .skip_rows(skip_rows) + .num_rows(num_rows) + .build() ) + if columns is not None and len(columns) > 0: + options.set_columns(columns) + + plc_result = plc.io.avro.read_avro(options) + return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result)) diff --git a/python/pylibcudf/pylibcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/io/avro.pxd index 8696fcb3c15..a0fca95d459 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pxd +++ b/python/pylibcudf/pylibcudf/io/avro.pxd @@ -1,12 +1,23 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.io.types cimport SourceInfo, TableWithMetadata -from pylibcudf.libcudf.io.avro cimport avro_reader_options +from pylibcudf.libcudf.io.avro cimport avro_reader_options, avro_reader_options_builder from pylibcudf.libcudf.types cimport size_type -cpdef TableWithMetadata read_avro( - SourceInfo source_info, - list columns = *, - size_type skip_rows = *, - size_type num_rows = * -) +from pylibcudf.libcudf.types cimport size_type + +cdef class AvroReaderOptions: + cdef avro_reader_options c_obj + cdef SourceInfo source + cpdef void set_columns(self, list col_names) + + +cdef class AvroReaderOptionsBuilder: + cdef avro_reader_options_builder c_obj + cdef SourceInfo source + cpdef AvroReaderOptionsBuilder columns(self, list col_names) + cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows) + cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows) + cpdef AvroReaderOptions build(self) + +cpdef TableWithMetadata read_avro(AvroReaderOptions options) diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi index 49c2f083702..8cafc9a6573 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyi +++ b/python/pylibcudf/pylibcudf/io/avro.pyi @@ -1,11 +1,16 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.io.types import SourceInfo, TableWithMetadata -__all__ = ["read_avro"] - -def read_avro( - source_info: SourceInfo, - columns: list[str] | None = None, - skip_rows: int = 0, - num_rows: int = -1, -) -> TableWithMetadata: ... +__all__ = ["AvroReaderOptions", "AvroReaderOptionsBuilder", "read_avro"] + +class AvroReaderOptions: + @staticmethod + def builder(source: SourceInfo) -> AvroReaderOptionsBuilder: ... + +class AvroReaderOptionsBuilder: + def columns(col_names: list[str]) -> AvroReaderOptionsBuilder: ... + def skip_rows(skip_rows: int) -> AvroReaderOptionsBuilder: ... + def num_rows(num_rows: int) -> AvroReaderOptionsBuilder: ... + def build(self) -> AvroReaderOptions: ... + +def read_avro(options: AvroReaderOptions) -> TableWithMetadata: ... diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 4271333511a..c378fca0415 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -10,52 +10,138 @@ from pylibcudf.libcudf.io.avro cimport ( ) from pylibcudf.libcudf.types cimport size_type -__all__ = ["read_avro"] +__all__ = ["read_avro", "AvroReaderOptions", "AvroReaderOptionsBuilder"] + + +cdef class AvroReaderOptions: + """ + The settings to use for ``read_avro`` + For details, see :cpp:class:`cudf::io::avro_reader_options` + """ + @staticmethod + def builder(SourceInfo source): + """ + Create a AvroWriterOptionsBuilder object + + For details, see :cpp:func:`cudf::io::avro_reader_options::builder` + + Parameters + ---------- + sink : SourceInfo + The source to read the Avro file from. + + Returns + ------- + AvroReaderOptionsBuilder + Builder to build AvroReaderOptions + """ + cdef AvroReaderOptionsBuilder avro_builder = AvroReaderOptionsBuilder.__new__( + AvroReaderOptionsBuilder + ) + avro_builder.c_obj = avro_reader_options.builder(source.c_obj) + avro_builder.source = source + return avro_builder + + cpdef void set_columns(self, list col_names): + """ + Set names of the column to be read. + + Parameters + ---------- + col_names : list[str] + List of column names + + Returns + ------- + None + """ + cdef vector[string] vec + vec.reserve(len(col_names)) + for name in col_names: + vec.push_back(str(name).encode()) + self.c_obj.set_columns(vec) + + +cdef class AvroReaderOptionsBuilder: + cpdef AvroReaderOptionsBuilder columns(self, list col_names): + """ + Set names of the column to be read. + + Parameters + ---------- + col_names : list + List of column names + + Returns + ------- + AvroReaderOptionsBuilder + """ + cdef vector[string] vec + vec.reserve(len(col_names)) + for name in col_names: + vec.push_back(str(name).encode()) + self.c_obj.columns(vec) + return self + + cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows): + """ + Sets number of rows to skip. + + Parameters + ---------- + skip_rows : size_type + Number of rows to skip from start + + Returns + ------- + AvroReaderOptionsBuilder + """ + self.c_obj.skip_rows(skip_rows) + return self + + cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows): + """ + Sets number of rows to read. + + Parameters + ---------- + num_rows : size_type + Number of rows to read after skip + + Returns + ------- + AvroReaderOptionsBuilder + """ + self.c_obj.num_rows(num_rows) + return self + + cpdef AvroReaderOptions build(self): + """Create a AvroReaderOptions object""" + cdef AvroReaderOptions avro_options = AvroReaderOptions.__new__( + AvroReaderOptions + ) + avro_options.c_obj = move(self.c_obj.build()) + avro_options.source = self.source + return avro_options cpdef TableWithMetadata read_avro( - SourceInfo source_info, - list columns = None, - size_type skip_rows = 0, - size_type num_rows = -1 + AvroReaderOptions options ): """ - Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`. + Read from Avro format. + + The source to read from and options are encapsulated + by the `options` object. For details, see :cpp:func:`read_avro`. Parameters ---------- - source_info: SourceInfo - The SourceInfo object to read the avro dataset from. - columns: list, default None - Optional columns to read, if not provided, reads all columns in the file. - skip_rows: size_type, default 0 - The number of rows to skip. - num_rows: size_type, default -1 - The number of rows to read, after skipping rows. - If -1 is passed, all rows will be read. - - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. + options: AvroReaderOptions + Settings for controlling reading behavior """ - cdef vector[string] c_columns - if columns is not None and len(columns) > 0: - c_columns.reserve(len(columns)) - for col in columns: - c_columns.push_back(str(col).encode()) - - cdef avro_reader_options avro_opts = ( - avro_reader_options.builder(source_info.c_obj) - .columns(c_columns) - .skip_rows(skip_rows) - .num_rows(num_rows) - .build() - ) - with nogil: - c_result = move(cpp_read_avro(avro_opts)) + c_result = move(cpp_read_avro(options.c_obj)) return TableWithMetadata.from_libcudf(c_result) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py index 3d9d99ffa61..bda8921b62a 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py @@ -98,10 +98,15 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): buffer.seek(0) res = plc.io.avro.read_avro( - plc.io.types.SourceInfo([buffer]), - columns=columns, - skip_rows=skip_rows, - num_rows=num_rows, + ( + plc.io.avro.AvroReaderOptions.builder( + plc.io.types.SourceInfo([buffer]) + ) + .columns(columns) + .skip_rows(skip_rows) + .num_rows(num_rows) + .build() + ) ) expected = pa.Table.from_arrays(