Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ORC reader options structs to pylibcudf #17601

Merged
merged 4 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,15 +240,27 @@ def read_orc(
elif not isinstance(num_rows, int) or num_rows < -1:
raise TypeError("num_rows must be an int >= -1")

tbl_w_meta = plc.io.orc.read_orc(
plc.io.SourceInfo(filepaths_or_buffers),
columns,
stripes,
skiprows,
num_rows,
use_index,
dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)),
options = (
plc.io.orc.OrcReaderOptions.builder(
plc.io.types.SourceInfo(filepaths_or_buffers)
)
.use_index(use_index)
.build()
)
if num_rows >= 0:
options.set_num_rows(num_rows)
if skiprows >= 0:
options.set_skip_rows(skiprows)
if stripes is not None and len(stripes) > 0:
options.set_stripes(stripes)
if timestamp_type is not None:
options.set_timestamp_type(
dtype_to_pylibcudf_type(cudf.dtype(timestamp_type))
)
if columns is not None and len(columns) > 0:
options.set_columns(columns)

tbl_w_meta = plc.io.orc.read_orc(options)

if isinstance(columns, list) and len(columns) == 0:
# When `columns=[]`, index needs to be
Expand Down
32 changes: 20 additions & 12 deletions python/pylibcudf/pylibcudf/io/orc.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libc.stdint cimport uint64_t
from libc.stdint cimport uint64_t, int64_t
from libcpp cimport bool
from libcpp.optional cimport optional
from libcpp.string cimport string
Expand All @@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.orc_metadata cimport (
)
from pylibcudf.libcudf.io.orc cimport (
orc_chunked_writer,
orc_reader_options,
orc_reader_options_builder,
orc_writer_options,
orc_writer_options_builder,
chunked_orc_writer_options,
Expand All @@ -32,17 +34,23 @@ from pylibcudf.libcudf.io.types cimport (
statistics_freq,
)

cpdef TableWithMetadata read_orc(
SourceInfo source_info,
list columns = *,
list stripes = *,
size_type skip_rows = *,
size_type nrows = *,
bool use_index = *,
bool use_np_dtypes = *,
DataType timestamp_type = *,
list decimal128_columns = *
)
# Extension type wrapping a libcudf `orc_reader_options` value so ORC read
# settings can be assembled from Python and handed to `read_orc`.
# `use_index` is not settable here; it is declared on OrcReaderOptionsBuilder.
cdef class OrcReaderOptions:
# Wrapped C++ options object (type cimported from pylibcudf.libcudf.io.orc).
cdef orc_reader_options c_obj
# SourceInfo associated with these options; presumably stored to keep the
# input sources alive alongside c_obj -- TODO confirm against orc.pyx.
cdef SourceInfo source
# Row-count and row-offset setters take 64-bit signed integers.
cpdef void set_num_rows(self, int64_t nrows)
cpdef void set_skip_rows(self, int64_t skip_rows)
# The .pyi stub types `stripes` as list[list[int]].
cpdef void set_stripes(self, list stripes)
# The .pyi stub types `val` as list[str] (column names).
cpdef void set_decimal128_columns(self, list val)
# Takes a pylibcudf DataType; cudf's read_orc passes the converted
# `timestamp_type` dtype here.
cpdef void set_timestamp_type(self, DataType type_)
# The .pyi stub types `col_names` as list[str].
cpdef void set_columns(self, list col_names)

# Extension type wrapping a libcudf `orc_reader_options_builder`; used to
# configure builder-only settings and then produce an OrcReaderOptions.
cdef class OrcReaderOptionsBuilder:
# Wrapped C++ builder object (type cimported from pylibcudf.libcudf.io.orc).
cdef orc_reader_options_builder c_obj
# SourceInfo the builder was created from; presumably handed on to the
# built OrcReaderOptions -- TODO confirm against orc.pyx.
cdef SourceInfo source
# Returns an OrcReaderOptionsBuilder so calls can be chained, e.g.
# `builder(...).use_index(flag).build()` as done in cudf/io/orc.py.
cpdef OrcReaderOptionsBuilder use_index(self, bool use)
# Finishes the builder and returns the resulting OrcReaderOptions.
cpdef OrcReaderOptions build(self)

# Read ORC data described by a prepared OrcReaderOptions object and return a
# TableWithMetadata. All read settings (columns, stripes, row limits,
# timestamp type, ...) travel inside `options` instead of being separate
# arguments.
cpdef TableWithMetadata read_orc(OrcReaderOptions options)

cdef class OrcColumnStatistics:
cdef optional[uint64_t] number_of_values_c
Expand Down
30 changes: 17 additions & 13 deletions python/pylibcudf/pylibcudf/io/orc.pyi
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from typing import Any, Self
from typing import Any

from typing_extensions import Self

from pylibcudf.io.types import (
CompressionType,
Expand All @@ -11,19 +13,21 @@ from pylibcudf.io.types import (
TableWithMetadata,
)
from pylibcudf.table import Table
from pylibcudf.types import DataType

def read_orc(
source_info: SourceInfo,
columns: list[str] | None = None,
stripes: list[list[int]] | None = None,
skip_rows: int = 0,
nrows: int = -1,
use_index: bool = True,
use_np_dtypes: bool = True,
timestamp_type: DataType | None = None,
decimal128_columns: list[str] | None = None,
) -> TableWithMetadata: ...
class OrcReaderOptions:
    """Type stub for the ORC reader options object.

    The setters mirror the ``cpdef`` methods declared for this class in
    ``orc.pxd``. Instances are obtained via ``OrcReaderOptions.builder(...)``
    followed by ``build()`` on the returned builder.
    """

    def set_num_rows(self, nrows: int) -> None: ...
    def set_skip_rows(self, skip_rows: int) -> None: ...
    def set_stripes(self, stripes: list[list[int]]) -> None: ...
    def set_decimal128_columns(self, val: list[str]) -> None: ...
    # Declared in orc.pxd and called by cudf.io.orc.read_orc; without this
    # entry type checkers reject a valid call.
    def set_timestamp_type(self, type_: DataType) -> None: ...
    def set_columns(self, col_names: list[str]) -> None: ...
    @staticmethod
    def builder(source: SourceInfo) -> OrcReaderOptionsBuilder: ...

# Type stub for the builder returned by OrcReaderOptions.builder(source).
class OrcReaderOptionsBuilder:
# Returns Self so the call can be chained straight into build().
def use_index(self, use: bool) -> Self: ...
# Produces the OrcReaderOptions that read_orc accepts.
def build(self) -> OrcReaderOptions: ...

# Type stub: read ORC data using a prepared OrcReaderOptions object; every
# read setting is carried by `options` rather than by separate arguments.
def read_orc(options: OrcReaderOptions) -> TableWithMetadata: ...

class OrcColumnStatistics:
def __init__(self): ...
Expand Down
Loading
Loading