Skip to content

Commit

Permalink
Create two abstract classes for outliers
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Nov 25, 2019
1 parent 36ec821 commit 2622fad
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from optimus.helpers.filters import dict_filter


class AbstractOutlier(ABC):
class AbstractOutlierBounds(ABC):
"""
This is a template class to expand the outliers methods
Also you need to add the function to outliers.py
Expand Down Expand Up @@ -86,10 +86,11 @@ def non_outliers_count(self):
:return:
"""
col_name = self.col_name
return self.df.rows.select((F.col(col_name) <= self.upper_bound) | (F.col(col_name) >= self.lower_bound)).count()
return self.df.rows.select(
(F.col(col_name) <= self.upper_bound) | (F.col(col_name) >= self.lower_bound)).count()

@abstractmethod
def info(self):
def info(self, output: str = "dict"):
"""
Get whiskers, iqrs and outliers and non outliers count
:return:
Expand Down
90 changes: 90 additions & 0 deletions optimus/outliers/abstract_outliers_threshold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from abc import ABC, abstractmethod

from pyspark.sql import functions as F

from optimus.helpers.check import is_dataframe
from optimus.helpers.columns import parse_columns
from optimus.helpers.converter import one_list_to_val
from optimus.helpers.filters import dict_filter


class AbstractOutlierThreshold(ABC):
    """
    This is a template class to expand the threshold based outliers methods
    Also you need to add the function to outliers.py

    Subclasses must implement ``info``.

    NOTE(review): ``select`` and ``drop`` call ``self.whiskers()`` while ``count`` and
    ``non_outliers_count`` read ``self.upper_bound`` / ``self.lower_bound``. None of them
    is defined in this class, so a subclass has to provide them or override the
    methods that rely on them.
    """

    def __init__(self, df, col_name):
        """
        :param df: Spark Dataframe
        :param col_name: column name; it is resolved with parse_columns and stored as a single value
        :raises TypeError: if df does not pass the is_dataframe check
        """
        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        self.df = df
        self.col_name = one_list_to_val(parse_columns(df, col_name))

    def select(self):
        """
        Select outliers rows using the selected column
        :return: the rows strictly above the upper bound or strictly below the lower bound
        """
        col_name = self.col_name
        upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])

        return self.df.rows.select((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))

    def drop(self):
        """
        Drop outliers rows using the selected column
        :return: the dataframe without the rows strictly above the upper bound or strictly below the lower bound
        """
        col_name = self.col_name
        upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])

        return self.df.rows.drop((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))

    def count_lower_bound(self, bound):
        """
        Count outlier in the lower bound
        :param bound: rows with a value strictly below this bound are counted
        :return: number of rows below the bound
        """
        col_name = self.col_name
        return self.df.rows.select(self.df[col_name] < bound).count()

    def count_upper_bound(self, bound):
        """
        Count outliers in the upper bound
        :param bound: rows with a value strictly above this bound are counted
        :return: number of rows above the bound
        """
        col_name = self.col_name
        return self.df.rows.select(self.df[col_name] > bound).count()

    def count(self):
        """
        Count the outliers rows using the selected column
        :return: number of rows strictly outside [self.lower_bound, self.upper_bound]
        """
        col_name = self.col_name
        return self.df.rows.select(
            (F.col(col_name) > self.upper_bound) | (F.col(col_name) < self.lower_bound)).count()

    def non_outliers_count(self):
        """
        Count non outliers rows using the selected column
        :return: number of rows inside [self.lower_bound, self.upper_bound]
        """
        col_name = self.col_name
        # A non outlier must satisfy BOTH limits. Joining the two checks with "|" would
        # match every non null value, because any number is <= upper or >= lower.
        return self.df.rows.select(
            (F.col(col_name) <= self.upper_bound) & (F.col(col_name) >= self.lower_bound)).count()

    @abstractmethod
    def info(self, output: str = "dict"):
        """
        Get whiskers, iqrs and outliers and non outliers count
        :param output: output format, "dict" by default
        :return:
        """
16 changes: 10 additions & 6 deletions optimus/outliers/mad.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from optimus.helpers.constants import RELATIVE_ERROR
from optimus.helpers.filters import dict_filter
from optimus.outliers.abstract_outliers import AbstractOutlier
from optimus.helpers.json import dump_json
from optimus.outliers.abstract_outliers_bounds import AbstractOutlierBounds


class MAD(AbstractOutlier):
class MAD(AbstractOutlierBounds):
"""
Handle outliers using mad
"""
Expand Down Expand Up @@ -35,14 +36,17 @@ def whiskers(self):

return {"lower_bound": lower_bound, "upper_bound": upper_bound}

def info(self):
def info(self, output: str = "dict"):
"""
Get whiskers, iqrs and outliers and non outliers count
:return:
"""
upper_bound, lower_bound, = dict_filter(self.whiskers(),
["upper_bound", "lower_bound"])

return {"count_outliers": self.count(), "count_non_outliers": self.non_outliers_count(),
"lower_bound": lower_bound, "lower_bound_count": self.count_lower_bound(lower_bound),
"upper_bound": upper_bound, "upper_bound_count": self.count_upper_bound(upper_bound)}
result = {"count_outliers": self.count(), "count_non_outliers": self.non_outliers_count(),
"lower_bound": lower_bound, "lower_bound_count": self.count_lower_bound(lower_bound),
"upper_bound": upper_bound, "upper_bound_count": self.count_upper_bound(upper_bound)}
if output == "json":
result = dump_json(result)
return result
8 changes: 4 additions & 4 deletions optimus/outliers/modified_z_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from optimus.helpers.columns import parse_columns, name_col
from optimus.helpers.constants import RELATIVE_ERROR
from optimus.helpers.converter import one_list_to_val
from optimus.outliers.abstract_outliers import AbstractOutlier
from optimus.outliers.abstract_outliers_threshold import AbstractOutlierThreshold


class ModifiedZScore(AbstractOutlier):
class ModifiedZScore(AbstractOutlierThreshold):
"""
Handle outliers from a DataFrame using modified z score
Reference: http://colingorrie.github.io/outlier-detection.html#modified-z-score-method
Expand All @@ -21,7 +21,6 @@ def __init__(self, df, col_name, threshold, relative_error=RELATIVE_ERROR):
:param col_name:
:param threshold:
"""
super().__init__(df, col_name)
if not is_dataframe(df):
raise TypeError("Spark Dataframe expected")

Expand All @@ -34,9 +33,10 @@ def __init__(self, df, col_name, threshold, relative_error=RELATIVE_ERROR):
self.df = df
self.threshold = threshold
self.relative_error = relative_error

self.col_name = one_list_to_val(parse_columns(df, col_name))

super().__init__(df, col_name)

def _m_z_score(self):
df = self.df
col_name = self.col_name
Expand Down
19 changes: 12 additions & 7 deletions optimus/outliers/tukey.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from optimus.helpers.filters import dict_filter
from optimus.outliers.abstract_outliers import AbstractOutlier
from optimus.helpers.json import dump_json
from optimus.outliers.abstract_outliers_bounds import AbstractOutlierBounds


class Tukey(AbstractOutlier):
class Tukey(AbstractOutlierBounds):
"""
Handle outliers using inter quartile range
"""
Expand Down Expand Up @@ -31,7 +32,7 @@ def whiskers(self):

return {"lower_bound": lower_bound, "upper_bound": upper_bound, "iqr1": iqr["q1"], "iqr3": iqr["q3"]}

def info(self):
def info(self, output: str = "dict"):
"""
Get whiskers, iqrs and outliers and non outliers count
:return:
Expand All @@ -41,7 +42,11 @@ def info(self):
iqr1 = self.iqr1
iqr3 = self.iqr3

return {"count_outliers": self.count(), "count_non_outliers": self.non_outliers_count(),
"lower_bound": lower_bound, "lower_bound_count": self.count_lower_bound(lower_bound),
"upper_bound": upper_bound, "upper_bound_count": self.count_upper_bound(upper_bound),
"iqr1": iqr1, "iqr3": iqr3}
result = {"count_outliers": self.count(), "count_non_outliers": self.non_outliers_count(),
"lower_bound": lower_bound, "lower_bound_count": self.count_lower_bound(lower_bound),
"upper_bound": upper_bound, "upper_bound_count": self.count_upper_bound(upper_bound),
"iqr1": iqr1, "iqr3": iqr3}

if output == "json":
result = dump_json(result)
return result
31 changes: 29 additions & 2 deletions optimus/outliers/z_score.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from pyspark.sql import functions as F

from optimus.helpers.check import is_dataframe, is_numeric
from optimus.helpers.columns import parse_columns, name_col
from optimus.helpers.converter import one_list_to_val
from optimus.outliers.abstract_outliers import AbstractOutlier
from optimus.outliers.abstract_outliers_threshold import AbstractOutlierThreshold


class ZScore(AbstractOutlier):
class ZScore(AbstractOutlierThreshold):
"""
Handle outliers using z Score
"""
Expand All @@ -27,6 +29,31 @@ def __init__(self, df, col_name, threshold):

self.col_name = one_list_to_val(parse_columns(df, col_name))

super().__init__(df, col_name)

def drop(self):
    """
    Drop the rows whose z score column value is greater than the threshold
    :return: the resulting dataframe, with the helper z score column removed
    """
    source_col = self.col_name
    score_col = name_col(source_col, "z_score")
    limit = self.threshold

    # presumably cols.z_score writes the score of source_col into score_col -- confirm
    scored = self.df.cols.z_score(source_col, score_col)
    kept = scored.rows.drop(F.col(score_col) > limit)
    return kept.cols.drop(score_col)

def select(self):
    """
    Select the rows whose z score column value is greater than the threshold
    :return: the selected rows, with the helper z score column removed
    """
    source_col = self.col_name
    score_col = name_col(source_col, "z_score")

    # presumably cols.z_score writes the score of source_col into score_col -- confirm
    scored = self.df.cols.z_score(source_col, score_col)
    outliers = scored.rows.select(F.col(score_col) > self.threshold)
    return outliers.cols.drop(score_col)

def non_outliers_count(self):
    """
    Count the rows that remain after dropping the outliers
    :return: result of count() on the dataframe returned by drop()
    """
    remaining_rows = self.drop()
    return remaining_rows.count()

def count(self):
    """
    Count the outliers rows
    :return: result of count() on the dataframe returned by select()
    """
    outlier_rows = self.select()
    return outlier_rows.count()

def info(self):
col_name = self.col_name
z_col_name = name_col(col_name, "z_score")
Expand Down

0 comments on commit 2622fad

Please sign in to comment.