diff --git a/python-package/lightgbm/arrow.py b/python-package/lightgbm/arrow.py deleted file mode 100644 index df38d0e496d5..000000000000 --- a/python-package/lightgbm/arrow.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding: utf-8 -"""Utilities for handling Arrow in LightGBM.""" -from contextlib import contextmanager -from typing import Iterator, List - -import pyarrow as pa -from pyarrow.cffi import ffi - -__all__: List[str] = [] - - -class _ArrowCArray: - """Simple wrapper around the C representation of an Arrow type.""" - - n_chunks: int - chunks: ffi.CData - schema: ffi.CData - - def __init__(self, n_chunks: int, chunks: ffi.CData, schema: ffi.CData): - self.n_chunks = n_chunks - self.chunks = chunks - self.schema = schema - - @property - def chunks_ptr(self) -> int: - """Returns the address of the pointer to the list of chunks making up the array.""" - return int(ffi.cast("uintptr_t", ffi.addressof(self.chunks[0]))) - - @property - def schema_ptr(self) -> int: - """Returns the address of the pointer to the schema of the array.""" - return int(ffi.cast("uintptr_t", self.schema)) - - -@contextmanager -def _export_arrow_to_c(data: pa.Table) -> Iterator[_ArrowCArray]: - """Export an Arrow type to its C representation.""" - # Obtain objects to export - if isinstance(data, pa.Table): - export_objects = data.to_batches() - else: - raise ValueError(f"data of type '{type(data)}' cannot be exported to Arrow") - - # Prepare export - chunks = ffi.new(f"struct ArrowArray[{len(export_objects)}]") - schema = ffi.new("struct ArrowSchema*") - - # Export all objects - for i, obj in enumerate(export_objects): - chunk_ptr = int(ffi.cast("uintptr_t", ffi.addressof(chunks[i]))) - if i == 0: - schema_ptr = int(ffi.cast("uintptr_t", schema)) - obj._export_to_c(chunk_ptr, schema_ptr) - else: - obj._export_to_c(chunk_ptr) - - yield _ArrowCArray(len(chunks), chunks, schema) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 84f5ec02bcb4..50d2abfec97b 
100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -6,6 +6,7 @@ import json import warnings from collections import OrderedDict +from contextlib import contextmanager from copy import deepcopy from enum import Enum from functools import wraps @@ -13,13 +14,14 @@ from os.path import getsize from pathlib import Path from tempfile import NamedTemporaryFile -from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union import numpy as np import scipy.sparse -from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_is_floating, arrow_is_integer, concat, dt_DataTable, - export_arrow_to_c, pa_Table, pd_CategoricalDtype, pd_DataFrame, pd_Series) +from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_ffi_addressof, arrow_ffi_cast, arrow_ffi_CData, + arrow_ffi_new, arrow_is_floating, arrow_is_integer, concat, dt_DataTable, pa_Table, + pd_CategoricalDtype, pd_DataFrame, pd_Series) from .libpath import find_lib_path if TYPE_CHECKING: @@ -337,6 +339,55 @@ def _is_pyarrow_table(data: Any) -> bool: return isinstance(data, pa_Table) +class _ArrowCArray: + """Simple wrapper around the C representation of an Arrow type.""" + + n_chunks: int + chunks: arrow_ffi_CData + schema: arrow_ffi_CData + + def __init__(self, n_chunks: int, chunks: arrow_ffi_CData, schema: arrow_ffi_CData): + self.n_chunks = n_chunks + self.chunks = chunks + self.schema = schema + + @property + def chunks_ptr(self) -> int: + """Returns the address of the pointer to the list of chunks making up the array.""" + return int(arrow_ffi_cast("uintptr_t", arrow_ffi_addressof(self.chunks[0]))) + + @property + def schema_ptr(self) -> int: + """Returns the address of the pointer to the schema of the array.""" + return int(arrow_ffi_cast("uintptr_t", self.schema)) + + +@contextmanager +def _export_arrow_to_c(data: pa_Table) 
-> Iterator[_ArrowCArray]: + """Export an Arrow type to its C representation.""" + # Obtain objects to export + if isinstance(data, pa_Table): + export_objects = data.to_batches() + else: + raise ValueError(f"data of type '{type(data)}' cannot be exported to Arrow") + + # Prepare export + chunks = arrow_ffi_new(f"struct ArrowArray[{len(export_objects)}]") + schema = arrow_ffi_new("struct ArrowSchema*") + + # Export all objects + for i, obj in enumerate(export_objects): + chunk_ptr = int(arrow_ffi_cast("uintptr_t", arrow_ffi_addressof(chunks[i]))) + if i == 0: + schema_ptr = int(arrow_ffi_cast("uintptr_t", schema)) + obj._export_to_c(chunk_ptr, schema_ptr) + else: + obj._export_to_c(chunk_ptr) + + yield _ArrowCArray(len(chunks), chunks, schema) + + + def _data_to_2d_numpy( data: Any, dtype: "np.typing.DTypeLike", @@ -2202,7 +2253,7 @@ def __init_from_pyarrow_table( raise ValueError("Arrow table may only have integer or floating point datatypes") # Export Arrow table to C - with export_arrow_to_c(table) as c_array: + with _export_arrow_to_c(table) as c_array: self._handle = ctypes.c_void_p() _safe_call(_LIB.LGBM_DatasetCreateFromArrow( ctypes.c_int64(c_array.n_chunks), diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index a639f479fbfb..4a323c8e1e68 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -188,10 +188,12 @@ def __init__(self, *args, **kwargs): """pyarrow""" try: from pyarrow import Table as pa_Table + from pyarrow.cffi import CData as arrow_ffi_CData + from pyarrow.cffi import addressof as arrow_ffi_addressof + from pyarrow.cffi import cast as arrow_ffi_cast + from pyarrow.cffi import new as arrow_ffi_new from pyarrow.types import is_floating as arrow_is_floating from pyarrow.types import is_integer as arrow_is_integer - - from .arrow import _export_arrow_to_c as export_arrow_to_c PYARROW_INSTALLED = True except ImportError: PYARROW_INSTALLED = False @@ -202,9 +204,17 @@ class 
pa_Table: # type: ignore def __init__(self, *args, **kwargs): pass + class arrow_ffi_CData: # type: ignore + """Dummy class for pyarrow.cffi.CData.""" + + def __init__(self, *args, **kwargs): + pass + arrow_is_integer = None arrow_is_floating = None - export_arrow_to_c = None + arrow_ffi_addressof = None + arrow_ffi_cast = None + arrow_ffi_new = None """cpu_count()""" try: