howsoai · jackx111 · Oct 9, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024
@@ -40,6 +40,7 @@
     reshape_data,
     seconds_to_time,
     serialize_datetimes,
+    yield_dataframe_as_chunks,
     StopExecution,
     time_to_seconds,
     UserFriendlyExit,
@@ -83,6 +84,7 @@
     "serialize_cases",
     "serialize_datetimes",
     "SingleTableFeatureAttributes",
+    "yield_dataframe_as_chunks",
     "StopExecution",
     "time_to_seconds",
     "Timer",

@@ -169,7 +169,7 @@ def test_set_rate_delta_boundaries():
         id_feature_name=id_feature_name,
         datetime_feature_formats={time_feature_name: time_format},
         rate_boundaries=rate_boundaries,
-        delta_boundaries=delta_boundaries,
+        delta_boundaries=delta_boundaries
     )
 
     # Make sure that order 0 was overwritten for rate/delta min & max

@@ -18,7 +18,7 @@
 
 from .base import SingleTableFeatureAttributes
 from .pandas import InferFeatureAttributesDataFrame
-from ..utilities import date_to_epoch
+from ..utilities import date_to_epoch, yield_dataframe_as_chunks
 
 logger = logging.getLogger(__name__)
 
@@ -167,8 +167,8 @@ def _infer_delta_min_and_max(  # noqa: C901
                         futures: dict[Future, str] = dict()
 
                         with ProcessPoolExecutor(max_workers=max_workers, mp_context=mp_context) as pool:
-                            df_chunks = np.array_split(df_c, max_workers)
-                            for chunk in df_chunks:
+                            df_chunks_generator = yield_dataframe_as_chunks(df_c, max_workers)
+                            for chunk in df_chunks_generator:
                                 future = pool.submit(
                                     _apply_chunks_shard,
                                     df=chunk,

@@ -4,7 +4,7 @@
 import datetime as dt
 import inspect
 import locale as python_locale
-from math import isnan
+from math import ceil, isnan
 import re
 import sys
 import threading
@@ -1393,3 +1393,37 @@ def format_confusion_matrix(confusion_matrix: dict[str, dict[str, int]]) -> tupl
             confusion_matrix_array[i, j] = confusion_matrix.get(actual_label, {}).get(predicted_label, 0)
 
     return confusion_matrix_array, row_labels
+
+
def yield_dataframe_as_chunks(df: pd.DataFrame, num_chunks: int) -> t.Generator[pd.DataFrame, None, None]:
    """
    Yield consecutive row-wise chunks of a DataFrame using ``iloc``.

    Intended as a replacement for calling ``np.array_split`` on a DataFrame,
    which goes through the deprecated ``DataFrame.swapaxes`` in recent pandas
    releases.

    Parameters
    ----------
    df : DataFrame
        Pandas DataFrame to be split.
    num_chunks : int
        The maximum number of chunks to split the DataFrame into. Must be
        at least 1.

    Yields
    ------
    DataFrame
        A positional slice of ``df``. The slices are contiguous, preserve
        row order and together cover every row exactly once. Fewer than
        ``num_chunks`` slices are produced when the rounded-up chunk size
        already exhausts the rows; empty trailing slices are never yielded.
        If ``df`` has fewer rows than ``num_chunks`` (including an empty
        ``df``), ``df`` itself is yielded once and nothing else.

    Raises
    ------
    ValueError
        If ``num_chunks`` is less than 1. Because this is a generator, the
        error surfaces when iteration starts, not when the function is
        called.

    Warns
    -----
    UserWarning
        If ``num_chunks`` is greater than the number of rows in ``df``.
    """
    if num_chunks < 1:
        raise ValueError(
            f"Number of chunks requested: {num_chunks} must be at least 1."
        )

    total_rows = len(df)

    if num_chunks > total_rows:
        warnings.warn(
            f"Number of chunks requested: {num_chunks} is greater than "
            "the number of rows in the DataFrame. Yielding the original DataFrame."
        )
        yield df
        # Stop here: falling through would yield the same rows a second time.
        return

    # num_chunks <= total_rows at this point, so rows_per_chunk >= 1.
    rows_per_chunk = ceil(total_rows / num_chunks)

    # Stepping by the chunk size (instead of looping num_chunks times) means
    # no start offset ever lands past the last row, so no empty chunk is
    # produced; iloc clips the final slice at the end of the frame.
    for start in range(0, total_rows, rows_per_chunk):
        yield df.iloc[start:start + rows_per_chunk]