Merge branch 'master' into r/internal-functions
jameslamb authored Nov 13, 2023
2 parents 078a5fa + 501e6e6 commit 2e998e9
Showing 14 changed files with 399 additions and 65 deletions.
1 change: 0 additions & 1 deletion R-package/R/lgb.Booster.R
@@ -1462,7 +1462,6 @@ lgb.get.eval.result <- function(booster, data_name, eval_name, iters = NULL, is_
, toString(eval_names)
, "]"
))
stop("lgb.get.eval.result: wrong eval name")
}

result <- booster$record_evals[[data_name]][[eval_name]][[.EVAL_KEY()]]
17 changes: 17 additions & 0 deletions include/LightGBM/c_api.h
@@ -555,6 +555,23 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle,
int num_element,
int type);

/*!
* \brief Set vector to a content in info from Arrow data.
* \note
* - \a label converts input datatype into ``float32``.
* \param handle Handle of dataset
* \param field_name Field name, can be \a label
* \param n_chunks The number of Arrow arrays passed to this function
* \param chunks Pointer to the list of Arrow arrays
* \param schema Pointer to the schema of all Arrow arrays
* \return 0 when it succeeds, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetSetFieldFromArrow(DatasetHandle handle,
const char* field_name,
int64_t n_chunks,
const ArrowArray* chunks,
const ArrowSchema* schema);

/*!
* \brief Get info vector from dataset.
* \param handle Handle of dataset
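The new entry point mirrors `LGBM_DatasetSetField` but consumes Arrow data directly. A minimal sketch of how the declaration maps onto a ctypes call; the wrapper name and error handling here are hypothetical (the real binding lives in `basic.py` below), and `chunks_ptr` / `schema_ptr` are assumed to be addresses of `ArrowArray` / `ArrowSchema` structs produced via the Arrow C Data Interface:

```python
import ctypes

# Hypothetical standalone binding; in the Python package the loaded
# shared library is the module-level _LIB object instead.
_LIB = ctypes.CDLL("lib_lightgbm.so")

def set_field_from_arrow(handle: ctypes.c_void_p, field_name: str,
                         n_chunks: int, chunks_ptr: int, schema_ptr: int) -> None:
    """Call LGBM_DatasetSetFieldFromArrow and raise on a non-zero return."""
    ret = _LIB.LGBM_DatasetSetFieldFromArrow(
        handle,
        ctypes.c_char_p(field_name.encode("utf-8")),
        ctypes.c_int64(n_chunks),
        ctypes.c_void_p(chunks_ptr),
        ctypes.c_void_p(schema_ptr),
    )
    if ret != 0:
        raise RuntimeError(f"LGBM_DatasetSetFieldFromArrow failed for field '{field_name}'")
```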
6 changes: 6 additions & 0 deletions include/LightGBM/dataset.h
@@ -110,6 +110,7 @@ class Metadata {
const std::vector<data_size_t>& used_data_indices);

void SetLabel(const label_t* label, data_size_t len);
void SetLabel(const ArrowChunkedArray& array);

void SetWeights(const label_t* weights, data_size_t len);

@@ -334,6 +335,9 @@ class Metadata {
void CalculateQueryBoundaries();
/*! \brief Insert labels at the given index */
void InsertLabels(const label_t* labels, data_size_t start_index, data_size_t len);
/*! \brief Set labels from pointers to the first element and the end of an iterator. */
template <typename It>
void SetLabelsFromIterator(It first, It last);
/*! \brief Insert weights at the given index */
void InsertWeights(const label_t* weights, data_size_t start_index, data_size_t len);
/*! \brief Insert initial scores at the given index */
@@ -655,6 +659,8 @@ class Dataset {

LIGHTGBM_EXPORT void FinishLoad();

bool SetFieldFromArrow(const char* field_name, const ArrowChunkedArray& ca);

LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);

LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
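`SetLabel(const ArrowChunkedArray&)` funnels into the templated `SetLabelsFromIterator`, which walks all values of the chunked array as one flat sequence. A rough Python analogue of that flattening, purely to illustrate the iteration order (this is not the C++ implementation):

```python
import pyarrow as pa

# Two chunks are consumed as if they were one contiguous label vector.
chunked = pa.chunked_array([[1.0, 2.0], [3.0, 4.0, 5.0]])

# Iterate across chunk boundaries, casting each element on the way,
# which is the effect the begin()/end() iterator pair achieves in C++.
labels = [float(v.as_py()) for chunk in chunked.chunks for v in chunk]
assert labels == [1.0, 2.0, 3.0, 4.0, 5.0]
```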
113 changes: 82 additions & 31 deletions python-package/lightgbm/basic.py
@@ -19,7 +19,7 @@
import scipy.sparse

from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat,
dt_DataTable, pa_Table, pd_CategoricalDtype, pd_DataFrame, pd_Series)
dt_DataTable, pa_Array, pa_ChunkedArray, pa_Table, pd_CategoricalDtype, pd_DataFrame, pd_Series)
from .libpath import find_lib_path

if TYPE_CHECKING:
@@ -99,7 +99,9 @@
List[int],
np.ndarray,
pd_Series,
pd_DataFrame
pd_DataFrame,
pa_Array,
pa_ChunkedArray,
]
_LGBM_PredictDataType = Union[
str,
@@ -353,6 +355,11 @@ def _is_2d_collection(data: Any) -> bool:
)


def _is_pyarrow_array(data: Any) -> bool:
"""Check whether data is a PyArrow array."""
return isinstance(data, (pa_Array, pa_ChunkedArray))


def _is_pyarrow_table(data: Any) -> bool:
"""Check whether data is a PyArrow table."""
return isinstance(data, pa_Table)
@@ -384,7 +391,11 @@ def schema_ptr(self) -> int:
def _export_arrow_to_c(data: pa_Table) -> _ArrowCArray:
"""Export an Arrow type to its C representation."""
# Obtain objects to export
if isinstance(data, pa_Table):
if isinstance(data, pa_Array):
export_objects = [data]
elif isinstance(data, pa_ChunkedArray):
export_objects = data.chunks
elif isinstance(data, pa_Table):
export_objects = data.to_batches()
else:
raise ValueError(f"data of type '{type(data)}' cannot be exported to Arrow")
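Each export object is then handed to the Arrow C Data Interface to obtain the raw `ArrowArray` / `ArrowSchema` structs. A standalone sketch of that step, assuming pyarrow's cffi bindings (`ffi` plays the role of the module's `arrow_cffi`):

```python
import pyarrow as pa
from pyarrow.cffi import ffi

chunked = pa.chunked_array([[1.0, 2.0], [3.0]])
export_objects = chunked.chunks

# One ArrowArray struct per chunk, plus a single schema shared by all chunks.
chunks = ffi.new(f"struct ArrowArray[{len(export_objects)}]")
schema = ffi.new("struct ArrowSchema*")

for i, obj in enumerate(export_objects):
    chunk_ptr = int(ffi.cast("uintptr_t", ffi.addressof(chunks, i)))
    if i == 0:
        # Export the schema only once, alongside the first chunk.
        obj._export_to_c(chunk_ptr, int(ffi.cast("uintptr_t", schema)))
    else:
        obj._export_to_c(chunk_ptr)
```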
@@ -423,31 +434,31 @@ def _data_to_2d_numpy(
"It should be list of lists, numpy 2-D array or pandas DataFrame")


def _cfloat32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cfloat32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes float pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else:
raise RuntimeError('Expected float pointer')


def _cfloat64_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cfloat64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes double pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_double)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else:
raise RuntimeError('Expected double pointer')


def _cint32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cint32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes int pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else:
raise RuntimeError('Expected int32 pointer')


def _cint64_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cint64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes int pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int64)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
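Making `cptr` and `length` keyword-only means call sites can no longer silently transpose the pointer and the length. A small sketch of the effect; importing the private helper here is purely for illustration:

```python
import ctypes
import numpy as np
from lightgbm.basic import _cfloat32_array_to_numpy

buf = (ctypes.c_float * 3)(1.0, 2.0, 3.0)
ptr = ctypes.cast(buf, ctypes.POINTER(ctypes.c_float))

# A positional call such as _cfloat32_array_to_numpy(ptr, 3) now raises
# TypeError; the keywords make the pointer/length pairing explicit.
out = _cfloat32_array_to_numpy(cptr=ptr, length=3)
assert np.allclose(out, [1.0, 2.0, 3.0])
```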
@@ -1284,18 +1295,18 @@ def __create_sparse_native(
data_indices_len = out_shape[0]
indptr_len = out_shape[1]
if indptr_type == _C_API_DTYPE_INT32:
out_indptr = _cint32_array_to_numpy(out_ptr_indptr, indptr_len)
out_indptr = _cint32_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len)
elif indptr_type == _C_API_DTYPE_INT64:
out_indptr = _cint64_array_to_numpy(out_ptr_indptr, indptr_len)
out_indptr = _cint64_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len)
else:
raise TypeError("Expected int32 or int64 type for indptr")
if data_type == _C_API_DTYPE_FLOAT32:
out_data = _cfloat32_array_to_numpy(out_ptr_data, data_indices_len)
out_data = _cfloat32_array_to_numpy(cptr=out_ptr_data, length=data_indices_len)
elif data_type == _C_API_DTYPE_FLOAT64:
out_data = _cfloat64_array_to_numpy(out_ptr_data, data_indices_len)
out_data = _cfloat64_array_to_numpy(cptr=out_ptr_data, length=data_indices_len)
else:
raise TypeError("Expected float32 or float64 type for data")
out_indices = _cint32_array_to_numpy(out_ptr_indices, data_indices_len)
out_indices = _cint32_array_to_numpy(cptr=out_ptr_indices, length=data_indices_len)
# break up indptr based on number of rows (note more than one matrix in multiclass case)
per_class_indptr_shape = cs.indptr.shape[0]
# for CSC there is extra column added
@@ -1620,7 +1631,7 @@ def __init__(
data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence, list of numpy array or pyarrow Table
Data source of Dataset.
If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None)
Label of the data.
reference : Dataset or None, optional (default=None)
If this is Dataset for validation, training data should be used as reference.
@@ -2402,7 +2413,7 @@ def create_valid(
data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence or list of numpy array
Data source of Dataset.
If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None)
Label of the data.
weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
Weight for each instance. Weights should be non-negative.
@@ -2519,15 +2530,15 @@ def _reverse_update_params(self) -> "Dataset":
def set_field(
self,
field_name: str,
data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame]]
data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Array, pa_ChunkedArray]]
) -> "Dataset":
"""Set property into the Dataset.
Parameters
----------
field_name : str
The field name of the information.
data : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None
data : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray or None
The data to be set.
Returns
@@ -2546,6 +2557,20 @@ def set_field(
ctypes.c_int(0),
ctypes.c_int(_FIELD_TYPE_MAPPER[field_name])))
return self

# If the data is Arrow data, we can just pass it to C
if _is_pyarrow_array(data):
c_array = _export_arrow_to_c(data)
_safe_call(_LIB.LGBM_DatasetSetFieldFromArrow(
self._handle,
_c_str(field_name),
ctypes.c_int64(c_array.n_chunks),
ctypes.c_void_p(c_array.chunks_ptr),
ctypes.c_void_p(c_array.schema_ptr),
))
self.version += 1
return self

dtype: "np.typing.DTypeLike"
if field_name == 'init_score':
dtype = np.float64
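Because the Arrow branch returns before the numpy conversion below, pyarrow inputs reach the C library without an intermediate Python-side copy. A brief usage sketch, assuming a LightGBM build that includes this change:

```python
import numpy as np
import pyarrow as pa
import lightgbm as lgb

ds = lgb.Dataset(np.random.rand(100, 2)).construct()

# The chunks of the ChunkedArray are passed straight to
# LGBM_DatasetSetFieldFromArrow; the C++ side casts the values to float32.
ds.set_field("label", pa.chunked_array([np.random.rand(50), np.random.rand(50)]))
```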
@@ -2584,6 +2609,12 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]:
def get_field(self, field_name: str) -> Optional[np.ndarray]:
"""Get property from the Dataset.
Can only be run on a constructed Dataset.
Unlike ``get_group()``, ``get_init_score()``, ``get_label()``, ``get_position()``, and ``get_weight()``,
this method ignores any raw data passed into ``lgb.Dataset()`` on the Python side, and will only read
data from the constructed C++ ``Dataset`` object.
Parameters
----------
field_name : str
@@ -2610,11 +2641,20 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]:
if tmp_out_len.value == 0:
return None
if out_type.value == _C_API_DTYPE_INT32:
arr = _cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
arr = _cint32_array_to_numpy(
cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)),
length=tmp_out_len.value
)
elif out_type.value == _C_API_DTYPE_FLOAT32:
arr = _cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
arr = _cfloat32_array_to_numpy(
cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)),
length=tmp_out_len.value
)
elif out_type.value == _C_API_DTYPE_FLOAT64:
arr = _cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)
arr = _cfloat64_array_to_numpy(
cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)),
length=tmp_out_len.value
)
else:
raise TypeError("Unknown type")
if field_name == 'init_score':
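The docstring note above is the key behavioral contrast: before construction the getters return whatever raw object was supplied, while `get_field()` always reads back what the C++ `Dataset` stored. A short sketch of the distinction, assuming pyarrow labels:

```python
import numpy as np
import pyarrow as pa
import lightgbm as lgb

ds = lgb.Dataset(np.random.rand(100, 2), label=pa.array(np.random.rand(100)))

ds.get_label()         # before construction: the pyarrow Array passed in
ds.construct()
ds.get_field("label")  # always a numpy float32 array read back from C++
```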
@@ -2749,7 +2789,7 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset":
Parameters
----------
label : list, numpy 1-D array, pandas Series / one-column DataFrame or None
label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None
The label information to be set into Dataset.
Returns
@@ -2774,6 +2814,8 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset":
# data has nullable dtypes, but we can specify na_value argument and copy will be made
label = label.to_numpy(dtype=np.float32, na_value=np.nan)
label_array = np.ravel(label)
elif _is_pyarrow_array(label):
label_array = label
else:
label_array = _list_to_1d_numpy(label, dtype=np.float32, name='label')
self.set_field('label', label_array)
@@ -2851,6 +2893,10 @@ def set_group(
if self._handle is not None and group is not None:
group = _list_to_1d_numpy(group, dtype=np.int32, name='group')
self.set_field('group', group)
# original values can be modified on the C++ side
constructed_group = self.get_field('group')
if constructed_group is not None:
self.group = np.diff(constructed_group)
return self
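The C++ side stores `group` as cumulative query boundaries rather than per-group sizes, which is why the value read back through `get_field('group')` is run through `np.diff`. For example:

```python
import numpy as np

# Boundaries as stored by the C++ side for group sizes [10, 20, 40].
constructed_group = np.array([0, 10, 30, 70])

# np.diff recovers the original per-group sizes.
assert list(np.diff(constructed_group)) == [10, 20, 40]
```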

def set_position(
@@ -2914,37 +2960,40 @@ def get_feature_name(self) -> List[str]:
ptr_string_buffers))
return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)]

def get_label(self) -> Optional[np.ndarray]:
def get_label(self) -> Optional[_LGBM_LabelType]:
"""Get the label of the Dataset.
Returns
-------
label : numpy array or None
label : list, numpy 1-D array, pandas Series / one-column DataFrame or None
The label information from the Dataset.
For a constructed ``Dataset``, this will only return a numpy array.
"""
if self.label is None:
self.label = self.get_field('label')
return self.label

def get_weight(self) -> Optional[np.ndarray]:
def get_weight(self) -> Optional[_LGBM_WeightType]:
"""Get the weight of the Dataset.
Returns
-------
weight : numpy array or None
weight : list, numpy 1-D array, pandas Series or None
Weight for each data point from the Dataset. Weights should be non-negative.
For a constructed ``Dataset``, this will only return ``None`` or a numpy array.
"""
if self.weight is None:
self.weight = self.get_field('weight')
return self.weight

def get_init_score(self) -> Optional[np.ndarray]:
def get_init_score(self) -> Optional[_LGBM_InitScoreType]:
"""Get the initial score of the Dataset.
Returns
-------
init_score : numpy array or None
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None
Init score of Booster.
For a constructed ``Dataset``, this will only return ``None`` or a numpy array.
"""
if self.init_score is None:
self.init_score = self.get_field('init_score')
@@ -2982,17 +3031,18 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]:
"set free_raw_data=False when construct Dataset to avoid this.")
return self.data

def get_group(self) -> Optional[np.ndarray]:
def get_group(self) -> Optional[_LGBM_GroupType]:
"""Get the group of the Dataset.
Returns
-------
group : numpy array or None
group : list, numpy 1-D array, pandas Series or None
Group/query data.
Only used in the learning-to-rank task.
sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
For a constructed ``Dataset``, this will only return ``None`` or a numpy array.
"""
if self.group is None:
self.group = self.get_field('group')
@@ -3001,13 +3051,14 @@ def get_group(self) -> Optional[np.ndarray]:
self.group = np.diff(self.group)
return self.group

def get_position(self) -> Optional[np.ndarray]:
def get_position(self) -> Optional[_LGBM_PositionType]:
"""Get the position of the Dataset.
Returns
-------
position : numpy 1-D array or None
position : numpy 1-D array, pandas Series or None
Position of items used in unbiased learning-to-rank task.
For a constructed ``Dataset``, this will only return ``None`` or a numpy array.
"""
if self.position is None:
self.position = self.get_field('position')
@@ -4353,7 +4404,7 @@ def refit(
data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence or list of numpy array
Data source for refit.
If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
label : list, numpy 1-D array or pandas Series / one-column DataFrame
label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array or pyarrow ChunkedArray
Label for refit.
decay_rate : float, optional (default=0.9)
Decay rate of refit,
(diffs for the remaining 10 changed files not shown)
