From 1f55d80ca6be173de1319679377fe0eff05cbc51 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 18 Dec 2024 10:19:49 -0500
Subject: [PATCH] Add ORC reader options structs to pylibcudf (#17601)

Part of #17565

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/17601
---
 python/cudf/cudf/io/orc.py                    |  28 +-
 python/pylibcudf/pylibcudf/io/orc.pxd         |  32 ++-
 python/pylibcudf/pylibcudf/io/orc.pyi         |  30 ++-
 python/pylibcudf/pylibcudf/io/orc.pyx         | 240 +++++++++++++-----
 .../pylibcudf/pylibcudf/tests/io/test_orc.py  |  17 +-
 5 files changed, 242 insertions(+), 105 deletions(-)

diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 5616413b7e4..5103137bc77 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -240,15 +240,27 @@ def read_orc(
         elif not isinstance(num_rows, int) or num_rows < -1:
             raise TypeError("num_rows must be an int >= -1")
 
-        tbl_w_meta = plc.io.orc.read_orc(
-            plc.io.SourceInfo(filepaths_or_buffers),
-            columns,
-            stripes,
-            skiprows,
-            num_rows,
-            use_index,
-            dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)),
+        options = (
+            plc.io.orc.OrcReaderOptions.builder(
+                plc.io.types.SourceInfo(filepaths_or_buffers)
+            )
+            .use_index(use_index)
+            .build()
         )
+        if num_rows >= 0:
+            options.set_num_rows(num_rows)
+        if skiprows >= 0:
+            options.set_skip_rows(skiprows)
+        if stripes is not None and len(stripes) > 0:
+            options.set_stripes(stripes)
+        if timestamp_type is not None:
+            options.set_timestamp_type(
+                dtype_to_pylibcudf_type(cudf.dtype(timestamp_type))
+            )
+        if columns is not None and len(columns) > 0:
+            options.set_columns(columns)
+
+        tbl_w_meta = plc.io.orc.read_orc(options)
 
         if isinstance(columns, list) and len(columns) == 0:
             # When `columns=[]`, index needs to be
diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd
index 671f0692444..7531608519c 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pxd
+++ b/python/pylibcudf/pylibcudf/io/orc.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint64_t, int64_t
 from libcpp cimport bool
 from libcpp.optional cimport optional
 from libcpp.string cimport string
@@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.orc_metadata cimport (
 )
 from pylibcudf.libcudf.io.orc cimport (
     orc_chunked_writer,
+    orc_reader_options,
+    orc_reader_options_builder,
     orc_writer_options,
     orc_writer_options_builder,
     chunked_orc_writer_options,
@@ -32,17 +34,23 @@ from pylibcudf.libcudf.io.types cimport (
     statistics_freq,
 )
 
-cpdef TableWithMetadata read_orc(
-    SourceInfo source_info,
-    list columns = *,
-    list stripes = *,
-    size_type skip_rows = *,
-    size_type nrows = *,
-    bool use_index = *,
-    bool use_np_dtypes = *,
-    DataType timestamp_type = *,
-    list decimal128_columns = *
-)
+cdef class OrcReaderOptions:
+    cdef orc_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_num_rows(self, int64_t nrows)
+    cpdef void set_skip_rows(self, int64_t skip_rows)
+    cpdef void set_stripes(self, list stripes)
+    cpdef void set_decimal128_columns(self, list val)
+    cpdef void set_timestamp_type(self, DataType type_)
+    cpdef void set_columns(self, list col_names)
+
+cdef class OrcReaderOptionsBuilder:
+    cdef orc_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef OrcReaderOptionsBuilder use_index(self, bool use)
+    cpdef OrcReaderOptions build(self)
+
+cpdef TableWithMetadata read_orc(OrcReaderOptions options)
 
 cdef class OrcColumnStatistics:
     cdef optional[uint64_t] number_of_values_c
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi
index 516f97981e9..c496b7a2152 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyi
+++ b/python/pylibcudf/pylibcudf/io/orc.pyi
@@ -1,6 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from typing import Any, Self
+from typing import Any
+
+from typing_extensions import Self
 
 from pylibcudf.io.types import (
     CompressionType,
@@ -11,19 +13,21 @@ from pylibcudf.io.types import (
     TableWithMetadata,
 )
 from pylibcudf.table import Table
-from pylibcudf.types import DataType
 
-def read_orc(
-    source_info: SourceInfo,
-    columns: list[str] | None = None,
-    stripes: list[list[int]] | None = None,
-    skip_rows: int = 0,
-    nrows: int = -1,
-    use_index: bool = True,
-    use_np_dtypes: bool = True,
-    timestamp_type: DataType | None = None,
-    decimal128_columns: list[str] | None = None,
-) -> TableWithMetadata: ...
+class OrcReaderOptions:
+    def set_num_rows(self, nrows: int) -> None: ...
+    def set_skip_rows(self, skip_rows: int) -> None: ...
+    def set_stripes(self, stripes: list[list[int]]) -> None: ...
+    def set_decimal128_columns(self, val: list[str]) -> None: ...
+    def set_columns(self, col_names: list[str]) -> None: ...
+    @staticmethod
+    def builder(source: SourceInfo) -> OrcReaderOptionsBuilder: ...
+
+class OrcReaderOptionsBuilder:
+    def use_index(self, use: bool) -> Self: ...
+    def build(self) -> OrcReaderOptions: ...
+
+def read_orc(options: OrcReaderOptions) -> TableWithMetadata: ...
 
 class OrcColumnStatistics:
     def __init__(self): ...
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx
index 63eab4a9634..c125d7e76fa 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyx
+++ b/python/pylibcudf/pylibcudf/io/orc.pyx
@@ -46,6 +46,8 @@ __all__ = [
     "read_orc",
     "read_parsed_orc_statistics",
     "write_orc",
+    "OrcReaderOptions",
+    "OrcReaderOptionsBuilder",
     "OrcWriterOptions",
     "OrcWriterOptionsBuilder",
     "OrcChunkedWriter",
@@ -237,84 +239,190 @@ cdef class ParsedOrcStatistics:
         return out
 
 
-cpdef TableWithMetadata read_orc(
-    SourceInfo source_info,
-    list columns = None,
-    list stripes = None,
-    size_type skip_rows = 0,
-    size_type nrows = -1,
-    bool use_index = True,
-    bool use_np_dtypes = True,
-    DataType timestamp_type = None,
-    list decimal128_columns = None,
-):
-    """Reads an ORC file into a :py:class:`~.types.TableWithMetadata`.
-
-    Parameters
-    ----------
-    source_info : SourceInfo
-        The SourceInfo object to read the Parquet file from.
-    columns : list, default None
-        The string names of the columns to be read.
-    stripes : list[list[size_type]], default None
-        List of stripes to be read.
-    skip_rows : int64_t, default 0
-        The number of rows to skip from the start of the file.
-    nrows : size_type, default -1
-        The number of rows to read. By default, read the entire file.
-    use_index : bool, default True
-        Whether to use the row index to speed up reading.
-    use_np_dtypes : bool, default True
-        Whether to use numpy compatible dtypes.
-    timestamp_type : DataType, default None
-        The timestamp type to use for the timestamp columns.
-    decimal128_columns : list, default None
-        List of column names to be read as 128-bit decimals.
+cdef class OrcReaderOptions:
+    """
+    The settings to use for ``read_orc``
 
-    Returns
-    -------
-    TableWithMetadata
-        The Table and its corresponding metadata (column names) that were read in.
+    For details, see :cpp:class:`cudf::io::orc_reader_options`
     """
-    cdef orc_reader_options opts
-    cdef vector[vector[size_type]] c_stripes
-    opts = (
-        orc_reader_options.builder(source_info.c_obj)
-        .use_index(use_index)
-        .build()
-    )
-    if nrows >= 0:
-        opts.set_num_rows(nrows)
-    if skip_rows >= 0:
-        opts.set_skip_rows(skip_rows)
-    if stripes is not None:
-        c_stripes = stripes
-        opts.set_stripes(c_stripes)
-    if timestamp_type is not None:
-        opts.set_timestamp_type(timestamp_type.c_obj)
-
-    cdef vector[string] c_decimal128_columns
-    if decimal128_columns is not None and len(decimal128_columns) > 0:
-        c_decimal128_columns.reserve(len(decimal128_columns))
-        for col in decimal128_columns:
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create an OrcReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::orc_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the ORC file from.
+
+        Returns
+        -------
+        OrcReaderOptionsBuilder
+            Builder to build OrcReaderOptions
+        """
+        cdef OrcReaderOptionsBuilder orc_builder = (
+            OrcReaderOptionsBuilder.__new__(OrcReaderOptionsBuilder)
+        )
+        orc_builder.c_obj = orc_reader_options.builder(source.c_obj)
+        orc_builder.source = source
+        return orc_builder
+
+    cpdef void set_num_rows(self, int64_t nrows):
+        """
+        Sets number of rows to read.
+
+        Parameters
+        ----------
+        nrows: int64_t
+            Number of rows
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_num_rows(nrows)
+
+    cpdef void set_skip_rows(self, int64_t skip_rows):
+        """
+        Sets number of rows to skip from the start.
+
+        Parameters
+        ----------
+        skip_rows: int64_t
+            Number of rows to skip
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_skip_rows(skip_rows)
+
+    cpdef void set_stripes(self, list stripes):
+        """
+        Sets list of stripes to read for each input source.
+
+        Parameters
+        ----------
+        stripes: list[list[size_type]]
+            List of lists, mapping stripes to read to input sources
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[vector[size_type]] c_stripes
+        cdef vector[size_type] vec
+        for sub_list in stripes:
+            for x in sub_list:
+                vec.push_back(x)
+            c_stripes.push_back(vec)
+            vec.clear()
+        self.c_obj.set_stripes(c_stripes)
+
+    cpdef void set_decimal128_columns(self, list val):
+        """
+        Set columns that should be read as 128-bit Decimal.
+
+        Parameters
+        ----------
+        val: list[str]
+            List of fully qualified column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] c_decimal128_columns
+        c_decimal128_columns.reserve(len(val))
+        for col in val:
             if not isinstance(col, str):
                 raise TypeError("Decimal 128 column names must be strings!")
             c_decimal128_columns.push_back(col.encode())
-        opts.set_decimal128_columns(c_decimal128_columns)
+        self.c_obj.set_decimal128_columns(c_decimal128_columns)
+
+    cpdef void set_timestamp_type(self, DataType type_):
+        """
+        Sets timestamp type to which timestamp column will be cast.
+
+        Parameters
+        ----------
+        type_: DataType
+            Type of timestamp
 
-    cdef vector[string] c_column_names
-    if columns is not None and len(columns) > 0:
-        c_column_names.reserve(len(columns))
-        for col in columns:
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_timestamp_type(type_.c_obj)
+
+    cpdef void set_columns(self, list col_names):
+        """
+        Sets names of the columns to read.
+
+        Parameters
+        ----------
+        col_names: list[str]
+            List of column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] c_column_names
+        c_column_names.reserve(len(col_names))
+        for col in col_names:
             if not isinstance(col, str):
                 raise TypeError("Column names must be strings!")
             c_column_names.push_back(col.encode())
-        opts.set_columns(c_column_names)
+        self.c_obj.set_columns(c_column_names)
+
+cdef class OrcReaderOptionsBuilder:
+    cpdef OrcReaderOptionsBuilder use_index(self, bool use):
+        """
+        Enable/Disable use of row index to speed-up reading.
+
+        Parameters
+        ----------
+        use : bool
+            Boolean value to enable/disable row index use
 
+        Returns
+        -------
+        OrcReaderOptionsBuilder
+        """
+        self.c_obj.use_index(use)
+        return self
+
+    cpdef OrcReaderOptions build(self):
+        """Create an OrcReaderOptions object"""
+        cdef OrcReaderOptions orc_options = OrcReaderOptions.__new__(
+            OrcReaderOptions
+        )
+        orc_options.c_obj = move(self.c_obj.build())
+        orc_options.source = self.source
+        return orc_options
+
+
+cpdef TableWithMetadata read_orc(OrcReaderOptions options):
+    """
+    Read from ORC format.
+
+    The source to read from and options are encapsulated
+    by the `options` object.
+
+    For details, see :cpp:func:`read_orc`.
+
+    Parameters
+    ----------
+    options: OrcReaderOptions
+        Settings for controlling reading behavior
+    """
     cdef table_with_metadata c_result
 
     with nogil:
-        c_result = move(cpp_read_orc(opts))
+        c_result = move(cpp_read_orc(options.c_obj))
 
     return TableWithMetadata.from_libcudf(c_result)
 
@@ -503,7 +611,7 @@ cpdef void write_orc(OrcWriterOptions options):
     The table to write, output paths, and options are encapsulated
     by the `options` object.
 
-    For details, see :cpp:func:`write_csv`.
+    For details, see :cpp:func:`write_orc`.
 
     Parameters
     ----------
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
index 2557e40c935..fe35255505c 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
@@ -37,12 +37,17 @@ def test_read_orc_basic(
         binary_source_or_sink, pa_table, **_COMMON_ORC_SOURCE_KWARGS
     )
 
-    res = plc.io.orc.read_orc(
-        plc.io.SourceInfo([source]),
-        nrows=nrows,
-        skip_rows=skiprows,
-        columns=columns,
-    )
+    options = plc.io.orc.OrcReaderOptions.builder(
+        plc.io.types.SourceInfo([source])
+    ).build()
+    if nrows >= 0:
+        options.set_num_rows(nrows)
+    if skiprows >= 0:
+        options.set_skip_rows(skiprows)
+    if columns is not None and len(columns) > 0:
+        options.set_columns(columns)
+
+    res = plc.io.orc.read_orc(options)
 
     if columns is not None:
         pa_table = pa_table.select(columns)