-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create two abstract class for outliers
- Loading branch information
1 parent
36ec821
commit 2622fad
Showing
6 changed files
with
149 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
from pyspark.sql import functions as F | ||
|
||
from optimus.helpers.check import is_dataframe | ||
from optimus.helpers.columns import parse_columns | ||
from optimus.helpers.converter import one_list_to_val | ||
from optimus.helpers.filters import dict_filter | ||
|
||
|
||
class AbstractOutlierThreshold(ABC): | ||
""" | ||
This is a template class to expand the outliers methods | ||
Also you need to add the function to outliers.py | ||
""" | ||
|
||
def __init__(self, df, col_name): | ||
""" | ||
:param df: Spark Dataframe | ||
:param col_name: column name | ||
""" | ||
if not is_dataframe(df): | ||
raise TypeError("Spark Dataframe expected") | ||
|
||
self.df = df | ||
self.col_name = one_list_to_val(parse_columns(df, col_name)) | ||
|
||
def select(self): | ||
""" | ||
Select outliers rows using the selected column | ||
:return: | ||
""" | ||
|
||
col_name = self.col_name | ||
upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"]) | ||
|
||
return self.df.rows.select((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound)) | ||
|
||
def drop(self): | ||
""" | ||
Drop outliers rows using the selected column | ||
:return: | ||
""" | ||
|
||
col_name = self.col_name | ||
upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"]) | ||
print(upper_bound, lower_bound) | ||
return self.df.rows.drop((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound)) | ||
|
||
def count_lower_bound(self, bound): | ||
""" | ||
Count outlier in the lower bound | ||
:return: | ||
""" | ||
col_name = self.col_name | ||
return self.df.rows.select(self.df[col_name] < bound).count() | ||
|
||
def count_upper_bound(self, bound): | ||
""" | ||
Count outliers in the upper bound | ||
:return: | ||
""" | ||
col_name = self.col_name | ||
return self.df.rows.select(self.df[col_name] > bound).count() | ||
|
||
def count(self): | ||
""" | ||
Count the outliers rows using the selected column | ||
:return: | ||
""" | ||
col_name = self.col_name | ||
return self.df.rows.select((F.col(col_name) > self.upper_bound) | (F.col(col_name) < self.lower_bound)).count() | ||
|
||
def non_outliers_count(self): | ||
""" | ||
Count non outliers rows using the selected column | ||
:return: | ||
""" | ||
col_name = self.col_name | ||
return self.df.rows.select( | ||
(F.col(col_name) <= self.upper_bound) | (F.col(col_name) >= self.lower_bound)).count() | ||
|
||
@abstractmethod | ||
def info(self, output: str = "dict"): | ||
""" | ||
Get whiskers, iqrs and outliers and non outliers count | ||
:return: | ||
""" | ||
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters