Skip to content

Commit

Permalink
Implemented select lower or upper dataframe
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Nov 26, 2019
1 parent 8e512c4 commit 6fbeda8
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 4 deletions.
23 changes: 20 additions & 3 deletions optimus/outliers/abstract_outliers_bounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from optimus.helpers.columns import parse_columns
from optimus.helpers.converter import one_list_to_val
from optimus.helpers.filters import dict_filter
from optimus.helpers.json import dump_json


class AbstractOutlierBounds(ABC):
Expand All @@ -26,6 +27,9 @@ def __init__(self, df, col_name):
self.df = df
self.col_name = one_list_to_val(parse_columns(df, col_name))

self.lower_bound = None
self.upper_bound = None

@abstractmethod
def whiskers(self):
"""
Expand All @@ -42,19 +46,32 @@ def select(self):

col_name = self.col_name
upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
print(upper_bound, lower_bound)

return self.df.rows.select((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))

def select_lower_bound(self):
    """
    Build a sample of the rows whose value in the selected column is below self.lower_bound
    :return: Output of dump_json for a dict with two keys: "columns", one {"title": name} entry
    per name returned by df.cols.names(), and "value", the matching rows after .limit(100)
    converted with rows.to_list("*")
    """

    df = self.df

    # Column titles are collected first, then the filtered rows, mirroring the payload layout
    # {"columns": [...], "value": [...]}.
    titles = [{"title": name} for name in df.cols.names()]

    below_lower = df[self.col_name] < self.lower_bound
    matching_rows = df.rows.select(below_lower).limit(100).rows.to_list("*")

    return dump_json({"columns": titles, "value": matching_rows})

def select_upper_bound(self):
    """
    Build a sample of the rows whose value in the selected column is above self.upper_bound
    :return: Output of dump_json for a dict with two keys: "columns", one {"title": name} entry
    per name returned by df.cols.names(), and "value", the matching rows after .limit(100)
    converted with rows.to_list("*")
    """

    df = self.df

    # Column titles are collected first, then the filtered rows, mirroring the payload layout
    # {"columns": [...], "value": [...]}.
    titles = [{"title": name} for name in df.cols.names()]

    above_upper = df[self.col_name] > self.upper_bound
    matching_rows = df.rows.select(above_upper).limit(100).rows.to_list("*")

    return dump_json({"columns": titles, "value": matching_rows})

def drop(self):
    """
    Drop outliers rows using the selected column
    :return: Result of df.rows.drop for the rows whose value in the selected column is above the
    upper bound or below the lower bound reported by whiskers()
    """

    col_name = self.col_name

    # Bounds are taken from whiskers(), the same source select() uses, so both methods agree on
    # which rows count as outliers.
    upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])

    column = F.col(col_name)
    return self.df.rows.drop((column > upper_bound) | (column < lower_bound))

def count_lower_bound(self, bound):
"""
Expand Down
2 changes: 1 addition & 1 deletion optimus/profiler/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def dataset(self, df, columns="*", buckets=10, infer=False, relative_error=RELAT
assign(output_columns, "summary.missing_count", total_count_na, dict)
assign(output_columns, "summary.p_missing", round(total_count_na / self.rows_count * 100, 2))

# TODO: drop, rename and move operation must affect the sample
# TODO: drop, rename and move operation must affect the sample
sample = {"columns": [{"title": cols} for cols in df.cols.names()],
"value": df.sample_n(sample).rows.to_list(columns)}

Expand Down

0 comments on commit 6fbeda8

Please sign in to comment.