Skip to content

Commit

Permalink
Add chunksize argument to Coniferest models
Browse files Browse the repository at this point in the history
  • Loading branch information
matwey committed Nov 5, 2024
1 parent d126216 commit 79e801d
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 6 deletions.
8 changes: 7 additions & 1 deletion src/coniferest/aadforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def score_samples(self, x, weights=None):
if weights is None:
weights = self.weights

return calc_paths_sum(self.selectors, self.indices, x, weights, num_threads=self.num_threads)
return calc_paths_sum(self.selectors, self.indices, x, weights, num_threads=self.num_threads, chunksize=self.chunksize)

def loss(
self,
Expand Down Expand Up @@ -111,6 +111,7 @@ def loss_gradient(
self.leaf_count,
sample_weights,
num_threads=self.num_threads,
chunksize=self.chunksize,
)
delta_weights = weights - prior_weights
grad += prior_influence * delta_weights
Expand Down Expand Up @@ -153,6 +154,9 @@ class AADForest(Coniferest):
n_jobs : int or None, optional
Number of threads to use for scoring. If None - all available CPUs are used.
chunksize : int, optional
Size of the chunk to use for multithreading calculations. If 0, then the chunk size is chosen automatically.
random_seed : int or None, optional
Random seed to use for reproducibility. If None - random seed is used.
Expand All @@ -170,13 +174,15 @@ def __init__(
C_a=1.0,
prior_influence=1.0,
n_jobs=None,
chunksize=None,
random_seed=None,
):
super().__init__(
trees=[],
n_subsamples=n_subsamples,
max_depth=max_depth,
n_jobs=n_jobs,
chunksize=chunksize,
random_seed=random_seed,
)
self.n_trees = n_trees
Expand Down
7 changes: 6 additions & 1 deletion src/coniferest/coniferest.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,20 @@ class Coniferest(ABC):
n_jobs : int, optional
Number of threads to use for scoring. If -1, then number of CPUs is used.
chunksize : int, optional
Size of the chunk to use for multithreading calculations. If 0, then the chunk size is chosen automatically.
random_seed : int or None, optional
Seed for the reproducibility. If None, then random seed is used.
"""

def __init__(self, trees=None, n_subsamples=256, max_depth=None, n_jobs=-1, random_seed=None):
def __init__(self, trees=None, n_subsamples=256, max_depth=None, n_jobs=-1, chunksize=0, random_seed=None):
self.trees = trees or []
self.n_subsamples = n_subsamples
self.max_depth = max_depth or int(np.log2(n_subsamples))

self.n_jobs = n_jobs
self.chunksize = chunksize

# For the better future with reproducible parallel tree building.
# self.seedseq = np.random.SeedSequence(random_state)
Expand Down Expand Up @@ -225,6 +229,7 @@ def __init__(self, coniferest, map_value=None):
indices=indices,
leaf_count=leaf_count,
num_threads=coniferest.n_jobs,
chunksize=coniferest.chunksize,
)

@classmethod
Expand Down
16 changes: 12 additions & 4 deletions src/coniferest/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class ForestEvaluator:
]
)

def __init__(self, samples, selectors, indices, leaf_count, *, num_threads):
def __init__(self, samples, selectors, indices, leaf_count, *, num_threads, chunksize):
"""
Base class for the forest evaluators. Does the trivial job:
* runs calc_paths_sum written in cython,
Expand All @@ -42,6 +42,9 @@ def __init__(self, samples, selectors, indices, leaf_count, *, num_threads):
num_threads : int or None
Number of threads to use for calculations. If None then
chunksize : int or None
Size of the chunk to use for multithreading calculations.
"""
self.samples = samples

Expand All @@ -57,6 +60,11 @@ def __init__(self, samples, selectors, indices, leaf_count, *, num_threads):
else:
self.num_threads = num_threads

if chunksize is not None:
self.chunksize = chunksize
else:
self.chunksize = 0

@classmethod
def combine_selectors(cls, selectors_list):
"""
Expand Down Expand Up @@ -113,7 +121,7 @@ def score_samples(self, x):
return -(
2
** (
-calc_paths_sum(self.selectors, self.indices, x, num_threads=self.num_threads)
-calc_paths_sum(self.selectors, self.indices, x, num_threads=self.num_threads, chunksize=self.chunksize)
/ (self.average_path_length(self.samples) * trees)
)
)
Expand All @@ -122,7 +130,7 @@ def _feature_delta_sum(self, x):
if not x.flags["C_CONTIGUOUS"]:
x = np.ascontiguousarray(x)

return calc_feature_delta_sum(self.selectors, self.indices, x, num_threads=self.num_threads)
return calc_feature_delta_sum(self.selectors, self.indices, x, num_threads=self.num_threads, chunksize=self.chunksize)

def feature_signature(self, x):
delta_sum, hit_count = self._feature_delta_sum(x)
Expand All @@ -138,7 +146,7 @@ def apply(self, x):
if not x.flags["C_CONTIGUOUS"]:
x = np.ascontiguousarray(x)

return calc_apply(self.selectors, self.indices, x, num_threads=self.num_threads)
return calc_apply(self.selectors, self.indices, x, num_threads=self.num_threads, chunksize=self.chunksize)

@classmethod
def average_path_length(cls, n_nodes):
Expand Down
5 changes: 5 additions & 0 deletions src/coniferest/isoforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ class IsolationForest(Coniferest):
n_jobs : int or None, optional
Number of threads to use for evaluation. If None, use all available CPUs.
chunksize : int, optional
Size of the chunk to use for multithreading calculations. If 0, then the chunk size is chosen automatically.
random_seed : int or None, optional
Seed for reproducibility. If None, random seed is used.
"""
Expand All @@ -35,13 +38,15 @@ def __init__(
n_subsamples=256,
max_depth=None,
n_jobs=None,
chunksize=None,
random_seed=None,
):
super().__init__(
trees=[],
n_subsamples=n_subsamples,
max_depth=max_depth,
n_jobs=n_jobs,
chunksize=chunksize,
random_seed=random_seed,
)
self.n_trees = n_trees
Expand Down
5 changes: 5 additions & 0 deletions src/coniferest/pineforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ class PineForest(Coniferest):
n_jobs : int, optional
Number of threads to use for scoring. If None - number of CPUs is used.
chunksize : int, optional
Size of the chunk to use for multithreading calculations. If 0, then the chunk size is chosen automatically.
random_seed : int or None, optional
Random seed. If None - random seed is used.
"""
Expand All @@ -62,13 +65,15 @@ def __init__(
regenerate_trees=False,
weight_ratio=1.0,
n_jobs=None,
chunksize=None,
random_seed=None,
):
super().__init__(
trees=[],
n_subsamples=n_subsamples,
max_depth=max_depth,
n_jobs=n_jobs,
chunksize=chunksize,
random_seed=random_seed,
)
self.n_trees = n_trees
Expand Down
1 change: 1 addition & 0 deletions src/coniferest/sklearn/isoforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def __init__(self, isoforest):
indices=indices,
leaf_count=leaf_count,
num_threads=isoforest.n_jobs,
chunksize=None,
)

@classmethod
Expand Down

0 comments on commit 79e801d

Please sign in to comment.