From 79e801dd580b2c94364ffe16762a018d0d6bc696 Mon Sep 17 00:00:00 2001 From: "Matwey V. Kornilov" Date: Thu, 31 Oct 2024 15:24:23 +0100 Subject: [PATCH] Add chunksize argument to Coniferest models --- src/coniferest/aadforest.py | 8 +++++++- src/coniferest/coniferest.py | 7 ++++++- src/coniferest/evaluator.py | 16 ++++++++++++---- src/coniferest/isoforest.py | 5 +++++ src/coniferest/pineforest.py | 5 +++++ src/coniferest/sklearn/isoforest.py | 1 + 6 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/coniferest/aadforest.py b/src/coniferest/aadforest.py index 1850af2..1db2185 100644 --- a/src/coniferest/aadforest.py +++ b/src/coniferest/aadforest.py @@ -37,7 +37,7 @@ def score_samples(self, x, weights=None): if weights is None: weights = self.weights - return calc_paths_sum(self.selectors, self.indices, x, weights, num_threads=self.num_threads) + return calc_paths_sum(self.selectors, self.indices, x, weights, num_threads=self.num_threads, chunksize=self.chunksize) def loss( self, @@ -111,6 +111,7 @@ def loss_gradient( self.leaf_count, sample_weights, num_threads=self.num_threads, + chunksize=self.chunksize, ) delta_weights = weights - prior_weights grad += prior_influence * delta_weights @@ -153,6 +154,9 @@ class AADForest(Coniferest): n_jobs : int or None, optional Number of threads to use for scoring. If None - all available CPUs are used. + chunksize : int, optional + Size of the chunk to use for multithreading calculations. If 0, then automatic number is used. + random_seed : int or None, optional Random seed to use for reproducibility. If None - random seed is used. 
@@ -170,6 +174,7 @@ def __init__( C_a=1.0, prior_influence=1.0, n_jobs=None, + chunksize=None, random_seed=None, ): super().__init__( @@ -177,6 +182,7 @@ def __init__( n_subsamples=n_subsamples, max_depth=max_depth, n_jobs=n_jobs, + chunksize=chunksize, random_seed=random_seed, ) self.n_trees = n_trees diff --git a/src/coniferest/coniferest.py b/src/coniferest/coniferest.py index 6cc9935..5e13e0d 100644 --- a/src/coniferest/coniferest.py +++ b/src/coniferest/coniferest.py @@ -40,16 +40,20 @@ class Coniferest(ABC): n_jobs : int, optional Number of threads to use for scoring. If -1, then number of CPUs is used. + chunksize : int, optional + Size of the chunk to use for multithreading calculations. If 0, then automatic number is used. + random_seed : int or None, optional Seed for the reproducibility. If None, then random seed is used. """ - def __init__(self, trees=None, n_subsamples=256, max_depth=None, n_jobs=-1, random_seed=None): + def __init__(self, trees=None, n_subsamples=256, max_depth=None, n_jobs=-1, chunksize=0, random_seed=None): self.trees = trees or [] self.n_subsamples = n_subsamples self.max_depth = max_depth or int(np.log2(n_subsamples)) self.n_jobs = n_jobs + self.chunksize = chunksize # For the better future with reproducible parallel tree building. # self.seedseq = np.random.SeedSequence(random_state) @@ -225,6 +229,7 @@ def __init__(self, coniferest, map_value=None): indices=indices, leaf_count=leaf_count, num_threads=coniferest.n_jobs, + chunksize=coniferest.chunksize, ) @classmethod diff --git a/src/coniferest/evaluator.py b/src/coniferest/evaluator.py index 876ab94..422cd3f 100644 --- a/src/coniferest/evaluator.py +++ b/src/coniferest/evaluator.py @@ -18,7 +18,7 @@ class ForestEvaluator: ] ) - def __init__(self, samples, selectors, indices, leaf_count, *, num_threads): + def __init__(self, samples, selectors, indices, leaf_count, *, num_threads, chunksize): """ Base class for the forest evaluators. 
Does the trivial job: * runs calc_paths_sum written in cython, @@ -42,6 +42,9 @@ def __init__(self, samples, selectors, indices, leaf_count, *, num_threads): num_threads : int or None Number of threads to use for calculations. If None then + + chunksize : int or None + Size of the chunk to use for multithreading calculations. """ self.samples = samples @@ -57,6 +60,11 @@ def __init__(self, samples, selectors, indices, leaf_count, *, num_threads): else: self.num_threads = num_threads + if chunksize is not None: + self.chunksize = chunksize + else: + self.chunksize = 0 + @classmethod def combine_selectors(cls, selectors_list): """ @@ -113,7 +121,7 @@ def score_samples(self, x): return -( 2 ** ( - -calc_paths_sum(self.selectors, self.indices, x, num_threads=self.num_threads) + -calc_paths_sum(self.selectors, self.indices, x, num_threads=self.num_threads, chunksize=self.chunksize) / (self.average_path_length(self.samples) * trees) ) ) @@ -122,7 +130,7 @@ def _feature_delta_sum(self, x): if not x.flags["C_CONTIGUOUS"]: x = np.ascontiguousarray(x) - return calc_feature_delta_sum(self.selectors, self.indices, x, num_threads=self.num_threads) + return calc_feature_delta_sum(self.selectors, self.indices, x, num_threads=self.num_threads, chunksize=self.chunksize) def feature_signature(self, x): delta_sum, hit_count = self._feature_delta_sum(x) @@ -138,7 +146,7 @@ def apply(self, x): if not x.flags["C_CONTIGUOUS"]: x = np.ascontiguousarray(x) - return calc_apply(self.selectors, self.indices, x, num_threads=self.num_threads) + return calc_apply(self.selectors, self.indices, x, num_threads=self.num_threads, chunksize=self.chunksize) @classmethod def average_path_length(cls, n_nodes): diff --git a/src/coniferest/isoforest.py b/src/coniferest/isoforest.py index 9d8a4cf..2d08857 100644 --- a/src/coniferest/isoforest.py +++ b/src/coniferest/isoforest.py @@ -25,6 +25,9 @@ class IsolationForest(Coniferest): n_jobs : int or None, optional Number of threads to use for evaluation. 
If None, use all available CPUs. + chunksize : int, optional + Size of the chunk to use for multithreading calculations. If 0, then automatic number is used. + random_seed : int or None, optional Seed for reproducibility. If None, random seed is used. """ @@ -35,6 +38,7 @@ def __init__( n_subsamples=256, max_depth=None, n_jobs=None, + chunksize=None, random_seed=None, ): super().__init__( @@ -42,6 +46,7 @@ def __init__( n_subsamples=n_subsamples, max_depth=max_depth, n_jobs=n_jobs, + chunksize=chunksize, random_seed=random_seed, ) self.n_trees = n_trees diff --git a/src/coniferest/pineforest.py b/src/coniferest/pineforest.py index 15c8290..bf96da2 100644 --- a/src/coniferest/pineforest.py +++ b/src/coniferest/pineforest.py @@ -49,6 +49,9 @@ class PineForest(Coniferest): n_jobs : int, optional Number of threads to use for scoring. If None - number of CPUs is used. + chunksize : int, optional + Size of the chunk to use for multithreading calculations. If 0, then automatic number is used. + random_seed : int or None, optional Random seed. If None - random seed is used. """ @@ -62,6 +65,7 @@ def __init__( regenerate_trees=False, weight_ratio=1.0, n_jobs=None, + chunksize=None, random_seed=None, ): super().__init__( @@ -69,6 +73,7 @@ def __init__( n_subsamples=n_subsamples, max_depth=max_depth, n_jobs=n_jobs, + chunksize=chunksize, random_seed=random_seed, ) self.n_trees = n_trees diff --git a/src/coniferest/sklearn/isoforest.py b/src/coniferest/sklearn/isoforest.py index c33dba2..4a30bb6 100644 --- a/src/coniferest/sklearn/isoforest.py +++ b/src/coniferest/sklearn/isoforest.py @@ -22,6 +22,7 @@ def __init__(self, isoforest): indices=indices, leaf_count=leaf_count, num_threads=isoforest.n_jobs, + chunksize=None, ) @classmethod