Skip to content

Commit

Permalink
Add Avro Reader options classes to pylibcudf (#17599)
Browse files Browse the repository at this point in the history
Apart of #17565

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #17599
  • Loading branch information
Matt711 authored Dec 17, 2024
1 parent 0058b52 commit e5753e3
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 59 deletions.
17 changes: 12 additions & 5 deletions python/cudf/cudf/io/avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,18 @@ def read_avro(
if not isinstance(skip_rows, int) or skip_rows < 0:
raise TypeError("skip_rows must be an int >= 0")

plc_result = plc.io.avro.read_avro(
plc.io.types.SourceInfo([filepath_or_buffer]),
columns,
skip_rows,
num_rows,
options = (
plc.io.avro.AvroReaderOptions.builder(
plc.io.types.SourceInfo([filepath_or_buffer])
)
.skip_rows(skip_rows)
.num_rows(num_rows)
.build()
)

if columns is not None and len(columns) > 0:
options.set_columns(columns)

plc_result = plc.io.avro.read_avro(options)

return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result))
25 changes: 18 additions & 7 deletions python/pylibcudf/pylibcudf/io/avro.pxd
Original file line number Diff line number Diff line change
@@ -1,12 +1,23 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
from pylibcudf.libcudf.io.avro cimport avro_reader_options
from pylibcudf.libcudf.io.avro cimport avro_reader_options, avro_reader_options_builder
from pylibcudf.libcudf.types cimport size_type


cpdef TableWithMetadata read_avro(
SourceInfo source_info,
list columns = *,
size_type skip_rows = *,
size_type num_rows = *
)
from pylibcudf.libcudf.types cimport size_type

cdef class AvroReaderOptions:
cdef avro_reader_options c_obj
cdef SourceInfo source
cpdef void set_columns(self, list col_names)


cdef class AvroReaderOptionsBuilder:
cdef avro_reader_options_builder c_obj
cdef SourceInfo source
cpdef AvroReaderOptionsBuilder columns(self, list col_names)
cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows)
cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows)
cpdef AvroReaderOptions build(self)

cpdef TableWithMetadata read_avro(AvroReaderOptions options)
21 changes: 13 additions & 8 deletions python/pylibcudf/pylibcudf/io/avro.pyi
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from pylibcudf.io.types import SourceInfo, TableWithMetadata

__all__ = ["read_avro"]

def read_avro(
source_info: SourceInfo,
columns: list[str] | None = None,
skip_rows: int = 0,
num_rows: int = -1,
) -> TableWithMetadata: ...
__all__ = ["AvroReaderOptions", "AvroReaderOptionsBuilder", "read_avro"]

class AvroReaderOptions:
@staticmethod
def builder(source: SourceInfo) -> AvroReaderOptionsBuilder: ...

class AvroReaderOptionsBuilder:
def columns(col_names: list[str]) -> AvroReaderOptionsBuilder: ...
def skip_rows(skip_rows: int) -> AvroReaderOptionsBuilder: ...
def num_rows(num_rows: int) -> AvroReaderOptionsBuilder: ...
def build(self) -> AvroReaderOptions: ...

def read_avro(options: AvroReaderOptions) -> TableWithMetadata: ...
156 changes: 121 additions & 35 deletions python/pylibcudf/pylibcudf/io/avro.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,52 +10,138 @@ from pylibcudf.libcudf.io.avro cimport (
)
from pylibcudf.libcudf.types cimport size_type

__all__ = ["read_avro"]
__all__ = ["read_avro", "AvroReaderOptions", "AvroReaderOptionsBuilder"]


cdef class AvroReaderOptions:
"""
The settings to use for ``read_avro``
For details, see :cpp:class:`cudf::io::avro_reader_options`
"""
@staticmethod
def builder(SourceInfo source):
"""
Create a AvroWriterOptionsBuilder object
For details, see :cpp:func:`cudf::io::avro_reader_options::builder`
Parameters
----------
sink : SourceInfo
The source to read the Avro file from.
Returns
-------
AvroReaderOptionsBuilder
Builder to build AvroReaderOptions
"""
cdef AvroReaderOptionsBuilder avro_builder = AvroReaderOptionsBuilder.__new__(
AvroReaderOptionsBuilder
)
avro_builder.c_obj = avro_reader_options.builder(source.c_obj)
avro_builder.source = source
return avro_builder

cpdef void set_columns(self, list col_names):
"""
Set names of the column to be read.
Parameters
----------
col_names : list[str]
List of column names
Returns
-------
None
"""
cdef vector[string] vec
vec.reserve(len(col_names))
for name in col_names:
vec.push_back(str(name).encode())
self.c_obj.set_columns(vec)


cdef class AvroReaderOptionsBuilder:
cpdef AvroReaderOptionsBuilder columns(self, list col_names):
"""
Set names of the column to be read.
Parameters
----------
col_names : list
List of column names
Returns
-------
AvroReaderOptionsBuilder
"""
cdef vector[string] vec
vec.reserve(len(col_names))
for name in col_names:
vec.push_back(str(name).encode())
self.c_obj.columns(vec)
return self

cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows):
"""
Sets number of rows to skip.
Parameters
----------
skip_rows : size_type
Number of rows to skip from start
Returns
-------
AvroReaderOptionsBuilder
"""
self.c_obj.skip_rows(skip_rows)
return self

cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows):
"""
Sets number of rows to read.
Parameters
----------
num_rows : size_type
Number of rows to read after skip
Returns
-------
AvroReaderOptionsBuilder
"""
self.c_obj.num_rows(num_rows)
return self

cpdef AvroReaderOptions build(self):
"""Create a AvroReaderOptions object"""
cdef AvroReaderOptions avro_options = AvroReaderOptions.__new__(
AvroReaderOptions
)
avro_options.c_obj = move(self.c_obj.build())
avro_options.source = self.source
return avro_options


cpdef TableWithMetadata read_avro(
SourceInfo source_info,
list columns = None,
size_type skip_rows = 0,
size_type num_rows = -1
AvroReaderOptions options
):
"""
Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
Read from Avro format.
The source to read from and options are encapsulated
by the `options` object.
For details, see :cpp:func:`read_avro`.
Parameters
----------
source_info: SourceInfo
The SourceInfo object to read the avro dataset from.
columns: list, default None
Optional columns to read, if not provided, reads all columns in the file.
skip_rows: size_type, default 0
The number of rows to skip.
num_rows: size_type, default -1
The number of rows to read, after skipping rows.
If -1 is passed, all rows will be read.
Returns
-------
TableWithMetadata
The Table and its corresponding metadata (column names) that were read in.
options: AvroReaderOptions
Settings for controlling reading behavior
"""
cdef vector[string] c_columns
if columns is not None and len(columns) > 0:
c_columns.reserve(len(columns))
for col in columns:
c_columns.push_back(str(col).encode())

cdef avro_reader_options avro_opts = (
avro_reader_options.builder(source_info.c_obj)
.columns(c_columns)
.skip_rows(skip_rows)
.num_rows(num_rows)
.build()
)

with nogil:
c_result = move(cpp_read_avro(avro_opts))
c_result = move(cpp_read_avro(options.c_obj))

return TableWithMetadata.from_libcudf(c_result)
13 changes: 9 additions & 4 deletions python/pylibcudf/pylibcudf/tests/io/test_avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,15 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable):
buffer.seek(0)

res = plc.io.avro.read_avro(
plc.io.types.SourceInfo([buffer]),
columns=columns,
skip_rows=skip_rows,
num_rows=num_rows,
(
plc.io.avro.AvroReaderOptions.builder(
plc.io.types.SourceInfo([buffer])
)
.columns(columns)
.skip_rows(skip_rows)
.num_rows(num_rows)
.build()
)
)

expected = pa.Table.from_arrays(
Expand Down

0 comments on commit e5753e3

Please sign in to comment.