Skip to content

Commit

Permalink
Initial implementation for apply
Browse files Browse the repository at this point in the history
Partially fix #134
  • Loading branch information
matwey committed Sep 5, 2024
1 parent f40bfba commit ac48754
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 1 deletion.
17 changes: 17 additions & 0 deletions src/coniferest/aadforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,3 +338,20 @@ def feature_signature(self, x):

def feature_importance(self, x):
    """
    Feature importance is not provided by this forest.

    Parameters
    ----------
    x
        Ignored; present only to keep the signature uniform.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError

def apply(self, x):
    """
    Route every sample through every tree and report the leaf it reaches.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_features)
        2-d array with features.

    Returns
    -------
    leafs : ndarray of shape (n_samples, n_estimators)
        For each sample in `x` and for each tree in the forest, the index
        of the leaf the sample ends up in, as reported by the evaluator.
    """
    evaluator = self.evaluator
    return evaluator.apply(x)
6 changes: 6 additions & 0 deletions src/coniferest/calc_paths_sum.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,9 @@ cdef void _feature_delta_sum(selector_t [::1] selectors,
np.float64_t [:, ::1] delta_sum,
np.int64_t [:, ::1] hit_count,
int num_threads=*)

# Leaf-lookup kernel (the implementation lives in calc_paths_sum.pyx).
# For every row of `data` and every tree delimited by `indices`, it writes
# into `leafs` the position in `selectors` at which the descent stops.
# `num_threads=*` declares an optional argument whose default value is
# supplied by the implementation.
cdef void _apply(selector_t [::1] selectors,
np.int64_t [::1] indices,
floating [:, ::1] data,
np.int64_t [:, ::1] leafs,
int num_threads=*)
49 changes: 49 additions & 0 deletions src/coniferest/calc_paths_sum.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ def calc_feature_delta_sum(selector_t [::1] selectors,
_feature_delta_sum(selectors, indices, data, delta_sum_view, hit_count_view, num_threads)
return delta_sum, hit_count

def calc_apply(selector_t [::1] selectors, np.int64_t [::1] indices, floating [:, ::1] data, int num_threads=1):
    """
    Find, for every sample and every tree, the selector where the sample lands.

    Parameters
    ----------
    selectors : contiguous 1-d array of selector_t
        Nodes of all trees stored one tree after another.
    indices : contiguous 1-d array of int64
        Offsets into `selectors`; tree number ``k`` starts at ``indices[k]``.
        The array holds one more element than there are trees.
    data : C-contiguous 2-d array of floats, shape (n_samples, n_features)
        Samples to route through the trees.
    num_threads : int, optional
        Number of threads handed to the parallel kernel. Default is 1.

    Returns
    -------
    leafs : ndarray of int64, shape (n_samples, len(indices) - 1)
        ``leafs[s, k]`` is the position in `selectors` of the leaf reached
        by sample ``s`` in tree ``k``.

    Raises
    ------
    ValueError
        If `indices` is empty, is not a non-decreasing sequence, or points
        outside of `selectors`.
    """
    cdef np.ndarray [np.int64_t, ndim=2] leafs
    cdef np.int64_t [:, ::1] leafs_view
    cdef Py_ssize_t sellen = selectors.shape[0]
    cdef Py_ssize_t n_offsets = indices.shape[0]

    # Validate before allocating the output: an empty `indices` would
    # otherwise only fail on a negative dimension passed to np.zeros,
    # with a message unrelated to the real problem.
    if n_offsets == 0:
        raise ValueError('indices should contain at least one offset')

    if np.any(np.diff(indices) < 0):
        raise ValueError('indices should be an increasing sequence')

    # Together with the monotonicity check above, bounding the first and the
    # last offsets bounds all of them. The kernel runs without bounds
    # checking, so a negative offset must be rejected here. The last element
    # is addressed explicitly so the check does not rely on negative indexing.
    if indices[0] < 0 or indices[n_offsets - 1] > sellen:
        raise ValueError('indices are out of range of the selectors')

    leafs = np.zeros([data.shape[0], n_offsets - 1], dtype=np.int64)
    leafs_view = leafs

    _apply(selectors, indices, data, leafs_view, num_threads)
    return leafs



Expand Down Expand Up @@ -187,3 +200,39 @@ cdef void _feature_delta_sum(selector_t [::1] selectors,

delta_sum[x_index, feature] += 1.0 + 2.0 * (child_selector.log_n_node_samples - selector.log_n_node_samples)
hit_count[x_index, feature] += 1


# Leaf-lookup kernel: for every sample (row of `data`) and every tree, walk
# from the tree's first selector down to a leaf and store the leaf's position
# in `selectors` into `leafs[sample, tree]`.
#
# selectors   -- nodes of all trees stored back to back
# indices     -- indices[k] is the offset of tree k's first selector; the
#                number of trees is taken as indices.shape[0] - 1
# data        -- samples, one per row
# leafs       -- output, written in place; one row per sample, one column
#                per tree
# num_threads -- passed to cython.parallel.parallel()
#
# Bounds checking and negative-index wraparound are disabled, so nothing is
# validated here; the caller has to pass consistent arrays.
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _apply(selector_t [::1] selectors,
                 np.int64_t [::1] indices,
                 floating [:, ::1] data,
                 np.int64_t [:, ::1] leafs,
                 int num_threads=1):

    cdef Py_ssize_t trees
    cdef Py_ssize_t tree_index
    cdef Py_ssize_t x_index
    cdef selector_t selector
    cdef Py_ssize_t tree_offset
    # NOTE(review): `i` receives selector.left / selector.right; this assumes
    # those fields fit into 32 bits -- confirm against the selector_t definition.
    cdef np.int32_t feature, i

    with nogil, parallel(num_threads=num_threads):
        # Assigned inside the parallel block on purpose: Cython infers
        # variables assigned here as thread-private, so do not hoist it.
        trees = indices.shape[0] - 1

        # Samples are split between the threads; each (sample, tree) cell of
        # `leafs` is written exactly once.
        for x_index in prange(data.shape[0], schedule='static'):
            for tree_index in range(trees):
                tree_offset = indices[tree_index]
                # `i` is the node position relative to the start of the tree;
                # the descent starts at the tree's first selector.
                i = 0
                while True:
                    selector = selectors[tree_offset + i]
                    feature = selector.feature
                    # A negative feature index stops the descent: this
                    # selector is reported as the leaf.
                    if feature < 0:
                        break

                    # Split rule: values less than or equal to the threshold
                    # go to the left child, the others go to the right one.
                    if data[x_index, feature] <= selector.value:
                        i = selector.left
                    else:
                        i = selector.right

                # Store the global position in `selectors`, not the
                # tree-relative one.
                leafs[x_index, tree_index] = tree_offset + i
8 changes: 7 additions & 1 deletion src/coniferest/evaluator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import joblib
import numpy as np

from .calc_paths_sum import calc_feature_delta_sum, calc_paths_sum # noqa
from .calc_paths_sum import calc_apply, calc_feature_delta_sum, calc_paths_sum # noqa
from .utils import average_path_length

__all__ = ["ForestEvaluator"]
Expand Down Expand Up @@ -134,6 +134,12 @@ def feature_importance(self, x):

return np.sum(delta_sum, axis=0) / np.sum(hit_count, axis=0) / self.average_path_length(self.samples)

def apply(self, x):
    """
    Find the leaf reached by every sample in every tree.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_features)
        2-d array with features. A copy in C-contiguous layout is made
        when the array is not C-contiguous already.

    Returns
    -------
    ndarray
        The result of `calc_apply` for this evaluator's selectors and
        indices.
    """
    # The compiled routine accepts C-contiguous input only.
    contiguous = x if x.flags["C_CONTIGUOUS"] else np.ascontiguousarray(x)
    return calc_apply(self.selectors, self.indices, contiguous, num_threads=self.num_threads)

@classmethod
def average_path_length(cls, n_nodes):
"""
Expand Down
17 changes: 17 additions & 0 deletions src/coniferest/isoforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,20 @@ def feature_signature(self, x):

def feature_importance(self, x):
    """
    Compute feature importance for `x` by delegating to the evaluator.

    Parameters
    ----------
    x
        Data passed unchanged to the evaluator's `feature_importance`.

    Returns
    -------
    Whatever the evaluator's `feature_importance` returns.
    """
    evaluator = self.evaluator
    return evaluator.feature_importance(x)

def apply(self, x):
    """
    Route every sample through every tree and report the leaf it reaches.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_features)
        2-d array with features.

    Returns
    -------
    leafs : ndarray of shape (n_samples, n_estimators)
        For each sample in `x` and for each tree in the forest, the index
        of the leaf the sample ends up in, as reported by the evaluator.
    """
    evaluator = self.evaluator
    return evaluator.apply(x)
17 changes: 17 additions & 0 deletions src/coniferest/pineforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,20 @@ def feature_signature(self, x):

def feature_importance(self, x):
    """
    Compute feature importance for `x` by delegating to the evaluator.

    Parameters
    ----------
    x
        Data passed unchanged to the evaluator's `feature_importance`.

    Returns
    -------
    Whatever the evaluator's `feature_importance` returns.
    """
    evaluator = self.evaluator
    return evaluator.feature_importance(x)

def apply(self, x):
    """
    Route every sample through every tree and report the leaf it reaches.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_features)
        2-d array with features.

    Returns
    -------
    leafs : ndarray of shape (n_samples, n_estimators)
        For each sample in `x` and for each tree in the forest, the index
        of the leaf the sample ends up in, as reported by the evaluator.
    """
    evaluator = self.evaluator
    return evaluator.apply(x)
21 changes: 21 additions & 0 deletions tests/test_isoforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,27 @@ def test_reproducibility():
forest2 = build_forest(n_features, random_seed)
assert_forest_scores(forest1, forest2, n_features=n_features)

def test_apply():
    """Scores rebuilt from the leaves returned by `apply` match `score_samples`."""
    n_features, n_trees, n_subsamples = 16, 100, 256

    random_seed = np.random.randint(1 << 16)
    rng = np.random.default_rng(random_seed)
    data = rng.standard_normal((n_trees * n_subsamples, n_features))

    forest = IsolationForest(
        n_trees=n_trees,
        n_subsamples=n_subsamples,
        max_depth=None,
        random_seed=random_seed,
    )
    forest.fit(data)

    # Look up the selector of every (sample, tree) leaf and sum its "value"
    # field over the trees -- presumably the per-leaf path length; this test
    # verifies exactly that against score_samples.
    evaluator = forest.evaluator
    leafs = forest.apply(data)
    leaf_values = evaluator.selectors[leafs]["value"]
    path_sums = np.sum(leaf_values, axis=1)

    normalizer = evaluator.average_path_length(n_subsamples) * n_trees
    expected = -(2 ** (-path_sums / normalizer))
    assert_allclose(forest.score_samples(data), expected)

@pytest.mark.regression
def test_regression(regression_data):
Expand Down

0 comments on commit ac48754

Please sign in to comment.