-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoutliers.py
62 lines (41 loc) · 2.12 KB
/
outliers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""Dealing with outliers in ML pipelines."""
# ----------------------------------------------------------------------------------------------------------------------------
# LOGGING
# ----------------------------------------------------------------------------------------------------------------------------
import logging
# Module-level logger named after this module; used for the optional progress message of reject_outliers.
logger = logging.getLogger(__name__)
# ----------------------------------------------------------------------------------------------------------------------------
# Packages
# ----------------------------------------------------------------------------------------------------------------------------
from pyutilz.pythonlib import ensure_installed
# NOTE(review): executed at import time, before the sklearn/imblearn imports below; presumably verifies
# (or installs) the listed distributions - confirm against pyutilz.
ensure_installed("imbalanced-learn scikit-learn")
from sklearn.ensemble import IsolationForest
# ----------------------------------------------------------------------------------------------------------------------------
# Normal Imports
# ----------------------------------------------------------------------------------------------------------------------------
from typing import *
# FunctionSampler is only referenced in the usage example of the reject_outliers docstring.
from imblearn import FunctionSampler
# Pipeline and SimpleImputer build the default outlier detector of reject_outliers.
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# ----------------------------------------------------------------------------------------------------------------------------
# Core
# ----------------------------------------------------------------------------------------------------------------------------
def reject_outliers(
    X: object,
    y: object,
    model: object = None,
    verbose: bool = True,
):
    """Resample a dataset by dropping the samples flagged as outliers.

    Should be a part of an imblearn Pipeline:

        from imblearn import FunctionSampler
        from imblearn.pipeline import Pipeline
        pipe = Pipeline([("out", FunctionSampler(func=reject_outliers, validate=False)), ("est", clf)])

    Args:
        X: Feature container. Must be accepted by ``model`` and support
            boolean-mask indexing (``X[mask]``).
        y: Targets aligned with ``X``; filtered with the same mask. May be
            ``None``, in which case ``None`` is returned in its place.
        model: Outlier detector exposing ``fit(X)`` and ``predict(X)``, where a
            prediction equal to ``1`` marks a sample to keep. It is fitted in
            place on ``X``. When ``None``, a ``SimpleImputer`` followed by an
            ``IsolationForest`` (both with default settings) is used.
        verbose: When true, log the received and kept sample counts at INFO level.

    Returns:
        A tuple ``(X_kept, y_kept)`` holding only the samples predicted as ``1``.
    """
    if model is None:
        model = Pipeline([("imp", SimpleImputer()), ("est", IsolationForest())])

    # The detector is fitted and applied on the very same data.
    model.fit(X)
    y_pred = model.predict(X)
    idx = y_pred == 1

    if verbose:
        # Count samples via the mask rather than len(X): scipy sparse matrices raise TypeError on len().
        logger.info("Outlier rejection: received %s samples, kept %s", len(idx), idx.sum())

    # y is optional in unsupervised use; indexing None would raise TypeError.
    y_kept = None if y is None else y[idx]
    return X[idx], y_kept