Add chunksize argument to Coniferest models

snad-space · Nov 5, 2024 · 79e801d · 79e801d
1 parent d126216
commit 79e801d
Show file tree

Hide file tree

Showing 6 changed files with 36 additions and 6 deletions.
diff --git a/src/coniferest/aadforest.py b/src/coniferest/aadforest.py
@@ -37,7 +37,7 @@ def score_samples(self, x, weights=None):
         if weights is None:
             weights = self.weights
 
-        return calc_paths_sum(self.selectors, self.indices, x, weights, num_threads=self.num_threads)
+        return calc_paths_sum(self.selectors, self.indices, x, weights, num_threads=self.num_threads, chunksize=self.chunksize)
 
     def loss(
         self,
@@ -111,6 +111,7 @@ def loss_gradient(
             self.leaf_count,
             sample_weights,
             num_threads=self.num_threads,
+            chunksize=self.chunksize,
         )
         delta_weights = weights - prior_weights
         grad += prior_influence * delta_weights
@@ -153,6 +154,9 @@ class AADForest(Coniferest):
     n_jobs : int or None, optional
         Number of threads to use for scoring. If None - all available CPUs are used.
 
+    chunksize : int, optional
+        Size of the chunk to use for multithreading calculations. If 0, then an automatic number is used.
+
     random_seed : int or None, optional
         Random seed to use for reproducibility. If None - random seed is used.
 
@@ -170,13 +174,15 @@ def __init__(
         C_a=1.0,
         prior_influence=1.0,
         n_jobs=None,
+        chunksize=None,
         random_seed=None,
     ):
         super().__init__(
             trees=[],
             n_subsamples=n_subsamples,
             max_depth=max_depth,
             n_jobs=n_jobs,
+            chunksize=chunksize,
             random_seed=random_seed,
         )
         self.n_trees = n_trees

diff --git a/src/coniferest/coniferest.py b/src/coniferest/coniferest.py
@@ -40,16 +40,20 @@ class Coniferest(ABC):
     n_jobs : int, optional
         Number of threads to use for scoring. If -1, then number of CPUs is used.
 
+    chunksize : int, optional
+        Size of the chunk to use for multithreading calculations. If 0, then an automatic number is used.
+
     random_seed : int or None, optional
         Seed for the reproducibility. If None, then random seed is used.
     """
 
-    def __init__(self, trees=None, n_subsamples=256, max_depth=None, n_jobs=-1, random_seed=None):
+    def __init__(self, trees=None, n_subsamples=256, max_depth=None, n_jobs=-1, chunksize=0, random_seed=None):
         self.trees = trees or []
         self.n_subsamples = n_subsamples
         self.max_depth = max_depth or int(np.log2(n_subsamples))
 
         self.n_jobs = n_jobs
+        self.chunksize = chunksize
 
         # For the better future with reproducible parallel tree building.
         # self.seedseq = np.random.SeedSequence(random_state)
@@ -225,6 +229,7 @@ def __init__(self, coniferest, map_value=None):
             indices=indices,
             leaf_count=leaf_count,
             num_threads=coniferest.n_jobs,
+            chunksize=coniferest.chunksize,
         )
 
     @classmethod

diff --git a/src/coniferest/evaluator.py b/src/coniferest/evaluator.py
@@ -18,7 +18,7 @@ class ForestEvaluator:
         ]
     )
 
-    def __init__(self, samples, selectors, indices, leaf_count, *, num_threads):
+    def __init__(self, samples, selectors, indices, leaf_count, *, num_threads, chunksize):
         """
         Base class for the forest evaluators. Does the trivial job:
         * runs calc_paths_sum written in cython,
@@ -42,6 +42,9 @@ def __init__(self, samples, selectors, indices, leaf_count, *, num_threads):
 
         num_threads : int or None
             Number of threads to use for calculations. If None then
+
+        chunksize : int or None
+            Size of the chunk to use for multithreading calculations.
         """
         self.samples = samples
 
@@ -57,6 +60,11 @@ def __init__(self, samples, selectors, indices, leaf_count, *, num_threads):
         else:
             self.num_threads = num_threads
 
+        if chunksize is not None:
+            self.chunksize = chunksize
+        else:
+            self.chunksize = 0
+
     @classmethod
     def combine_selectors(cls, selectors_list):
         """
@@ -113,7 +121,7 @@ def score_samples(self, x):
         return -(
             2
             ** (
-                -calc_paths_sum(self.selectors, self.indices, x, num_threads=self.num_threads)
+                -calc_paths_sum(self.selectors, self.indices, x, num_threads=self.num_threads, chunksize=self.chunksize)
                 / (self.average_path_length(self.samples) * trees)
             )
         )
@@ -122,7 +130,7 @@ def _feature_delta_sum(self, x):
         if not x.flags["C_CONTIGUOUS"]:
             x = np.ascontiguousarray(x)
 
-        return calc_feature_delta_sum(self.selectors, self.indices, x, num_threads=self.num_threads)
+        return calc_feature_delta_sum(self.selectors, self.indices, x, num_threads=self.num_threads, chunksize=self.chunksize)
 
     def feature_signature(self, x):
         delta_sum, hit_count = self._feature_delta_sum(x)
@@ -138,7 +146,7 @@ def apply(self, x):
         if not x.flags["C_CONTIGUOUS"]:
             x = np.ascontiguousarray(x)
 
-        return calc_apply(self.selectors, self.indices, x, num_threads=self.num_threads)
+        return calc_apply(self.selectors, self.indices, x, num_threads=self.num_threads, chunksize=self.chunksize)
 
     @classmethod
     def average_path_length(cls, n_nodes):

diff --git a/src/coniferest/isoforest.py b/src/coniferest/isoforest.py
@@ -25,6 +25,9 @@ class IsolationForest(Coniferest):
     n_jobs : int or None, optional
         Number of threads to use for evaluation. If None, use all available CPUs.
 
+    chunksize : int, optional
+        Size of the chunk to use for multithreading calculations. If 0, then an automatic number is used.
+
     random_seed : int or None, optional
         Seed for reproducibility. If None, random seed is used.
     """
@@ -35,13 +38,15 @@ def __init__(
         n_subsamples=256,
         max_depth=None,
         n_jobs=None,
+        chunksize=None,
         random_seed=None,
     ):
         super().__init__(
             trees=[],
             n_subsamples=n_subsamples,
             max_depth=max_depth,
             n_jobs=n_jobs,
+            chunksize=chunksize,
             random_seed=random_seed,
         )
         self.n_trees = n_trees

diff --git a/src/coniferest/pineforest.py b/src/coniferest/pineforest.py
@@ -49,6 +49,9 @@ class PineForest(Coniferest):
     n_jobs : int, optional
         Number of threads to use for scoring. If None - number of CPUs is used.
 
+    chunksize : int, optional
+        Size of the chunk to use for multithreading calculations. If 0, then an automatic number is used.
+
     random_seed : int or None, optional
         Random seed. If None - random seed is used.
     """
@@ -62,13 +65,15 @@ def __init__(
         regenerate_trees=False,
         weight_ratio=1.0,
         n_jobs=None,
+        chunksize=None,
         random_seed=None,
     ):
         super().__init__(
             trees=[],
             n_subsamples=n_subsamples,
             max_depth=max_depth,
             n_jobs=n_jobs,
+            chunksize=chunksize,
             random_seed=random_seed,
         )
         self.n_trees = n_trees

diff --git a/src/coniferest/sklearn/isoforest.py b/src/coniferest/sklearn/isoforest.py
@@ -22,6 +22,7 @@ def __init__(self, isoforest):
             indices=indices,
             leaf_count=leaf_count,
             num_threads=isoforest.n_jobs,
+            chunksize=None,
         )
 
     @classmethod