Skip to content

Commit

Permalink
Add ORC reader options structs to pylibcudf (#17601)
Browse files Browse the repository at this point in the history
Part of #17565

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #17601
  • Loading branch information
Matt711 authored Dec 18, 2024
1 parent a081a57 commit 1f55d80
Show file tree
Hide file tree
Showing 5 changed files with 242 additions and 105 deletions.
28 changes: 20 additions & 8 deletions python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,15 +240,27 @@ def read_orc(
elif not isinstance(num_rows, int) or num_rows < -1:
raise TypeError("num_rows must be an int >= -1")

tbl_w_meta = plc.io.orc.read_orc(
plc.io.SourceInfo(filepaths_or_buffers),
columns,
stripes,
skiprows,
num_rows,
use_index,
dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)),
options = (
plc.io.orc.OrcReaderOptions.builder(
plc.io.types.SourceInfo(filepaths_or_buffers)
)
.use_index(use_index)
.build()
)
if num_rows >= 0:
options.set_num_rows(num_rows)
if skiprows >= 0:
options.set_skip_rows(skiprows)
if stripes is not None and len(stripes) > 0:
options.set_stripes(stripes)
if timestamp_type is not None:
options.set_timestamp_type(
dtype_to_pylibcudf_type(cudf.dtype(timestamp_type))
)
if columns is not None and len(columns) > 0:
options.set_columns(columns)

tbl_w_meta = plc.io.orc.read_orc(options)

if isinstance(columns, list) and len(columns) == 0:
# When `columns=[]`, index needs to be
Expand Down
32 changes: 20 additions & 12 deletions python/pylibcudf/pylibcudf/io/orc.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libc.stdint cimport uint64_t
from libc.stdint cimport uint64_t, int64_t
from libcpp cimport bool
from libcpp.optional cimport optional
from libcpp.string cimport string
Expand All @@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.orc_metadata cimport (
)
from pylibcudf.libcudf.io.orc cimport (
orc_chunked_writer,
orc_reader_options,
orc_reader_options_builder,
orc_writer_options,
orc_writer_options_builder,
chunked_orc_writer_options,
Expand All @@ -32,17 +34,23 @@ from pylibcudf.libcudf.io.types cimport (
statistics_freq,
)

cpdef TableWithMetadata read_orc(
SourceInfo source_info,
list columns = *,
list stripes = *,
size_type skip_rows = *,
size_type nrows = *,
bool use_index = *,
bool use_np_dtypes = *,
DataType timestamp_type = *,
list decimal128_columns = *
)
# Extension type that owns a libcudf ``orc_reader_options`` value and exposes
# setters for it. An instance is what ``read_orc`` (declared below) accepts.
cdef class OrcReaderOptions:
# The wrapped C++ options struct (cimported from pylibcudf.libcudf.io.orc).
cdef orc_reader_options c_obj
# A SourceInfo stored alongside the options.
# NOTE(review): presumably kept so the source outlives c_obj -- confirm in orc.pyx.
cdef SourceInfo source
# Setters: each is cpdef (callable from both Cython and Python), takes the
# new value, and returns nothing. Row counts/offsets are 64-bit signed ints.
cpdef void set_num_rows(self, int64_t nrows)
cpdef void set_skip_rows(self, int64_t skip_rows)
cpdef void set_stripes(self, list stripes)
cpdef void set_decimal128_columns(self, list val)
cpdef void set_timestamp_type(self, DataType type_)
cpdef void set_columns(self, list col_names)

# Extension type that owns a libcudf ``orc_reader_options_builder`` and
# produces an OrcReaderOptions via build().
cdef class OrcReaderOptionsBuilder:
# The wrapped C++ builder (cimported from pylibcudf.libcudf.io.orc).
cdef orc_reader_options_builder c_obj
# A SourceInfo stored alongside the builder.
# NOTE(review): presumably handed on to the built OrcReaderOptions -- confirm in orc.pyx.
cdef SourceInfo source
# Returns an OrcReaderOptionsBuilder so calls can be chained before build().
cpdef OrcReaderOptionsBuilder use_index(self, bool use)
# Returns the finished OrcReaderOptions.
cpdef OrcReaderOptions build(self)

cpdef TableWithMetadata read_orc(OrcReaderOptions options)

cdef class OrcColumnStatistics:
cdef optional[uint64_t] number_of_values_c
Expand Down
30 changes: 17 additions & 13 deletions python/pylibcudf/pylibcudf/io/orc.pyi
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from typing import Any, Self
from typing import Any

from typing_extensions import Self

from pylibcudf.io.types import (
CompressionType,
Expand All @@ -11,19 +13,21 @@ from pylibcudf.io.types import (
TableWithMetadata,
)
from pylibcudf.table import Table
from pylibcudf.types import DataType

def read_orc(
source_info: SourceInfo,
columns: list[str] | None = None,
stripes: list[list[int]] | None = None,
skip_rows: int = 0,
nrows: int = -1,
use_index: bool = True,
use_np_dtypes: bool = True,
timestamp_type: DataType | None = None,
decimal128_columns: list[str] | None = None,
) -> TableWithMetadata: ...
# Typing stub for the ORC reader options object. Instances come from
# ``OrcReaderOptions.builder(source).build()`` and are passed to ``read_orc``.
class OrcReaderOptions:
    def set_num_rows(self, nrows: int) -> None: ...
    def set_skip_rows(self, skip_rows: int) -> None: ...
    def set_stripes(self, stripes: list[list[int]]) -> None: ...
    def set_decimal128_columns(self, val: list[str]) -> None: ...
    # Declared in orc.pxd and called from cudf's read_orc; it was missing
    # from this stub, so type checkers flagged a valid call as an error.
    def set_timestamp_type(self, type_: DataType) -> None: ...
    def set_columns(self, col_names: list[str]) -> None: ...
    @staticmethod
    def builder(source: SourceInfo) -> OrcReaderOptionsBuilder: ...

# Typing stub for the builder returned by OrcReaderOptions.builder().
class OrcReaderOptionsBuilder:
# Returns Self so option calls can be chained before build().
def use_index(self, use: bool) -> Self: ...
# Produces the OrcReaderOptions instance that read_orc() accepts.
def build(self) -> OrcReaderOptions: ...

def read_orc(options: OrcReaderOptions) -> TableWithMetadata: ...

class OrcColumnStatistics:
def __init__(self): ...
Expand Down
Loading

0 comments on commit 1f55d80

Please sign in to comment.