From 6fbeda8b9b74dcd1bdf7d1aee52d3a5bcee7672b Mon Sep 17 00:00:00 2001 From: Argenis Leon <argenisleon@gmail.com> Date: Tue, 26 Nov 2019 08:18:54 -0600 Subject: [PATCH] Implemented select lower or upper dataframe --- optimus/outliers/abstract_outliers_bounds.py | 23 +++++++++++++++++--- optimus/profiler/profiler.py | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/optimus/outliers/abstract_outliers_bounds.py b/optimus/outliers/abstract_outliers_bounds.py index f6c4bcd4..5395da69 100644 --- a/optimus/outliers/abstract_outliers_bounds.py +++ b/optimus/outliers/abstract_outliers_bounds.py @@ -6,6 +6,7 @@ from optimus.helpers.columns import parse_columns from optimus.helpers.converter import one_list_to_val from optimus.helpers.filters import dict_filter +from optimus.helpers.json import dump_json class AbstractOutlierBounds(ABC): @@ -26,6 +27,9 @@ def __init__(self, df, col_name): self.df = df self.col_name = one_list_to_val(parse_columns(df, col_name)) + self.lower_bound = None + self.upper_bound = None + @abstractmethod def whiskers(self): """ @@ -42,9 +46,22 @@ def select(self): col_name = self.col_name upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"]) + print(upper_bound, lower_bound) return self.df.rows.select((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound)) + def select_lower_bound(self): + col_name = self.col_name + sample = {"columns": [{"title": cols} for cols in self.df.cols.names()], + "value": self.df.rows.select(self.df[col_name] < self.lower_bound).limit(100).rows.to_list("*")} + return dump_json(sample) + + def select_upper_bound(self): + col_name = self.col_name + sample = {"columns": [{"title": cols} for cols in self.df.cols.names()], + "value": self.df.rows.select(self.df[col_name] > self.upper_bound).limit(100).rows.to_list("*")} + return dump_json(sample) + def drop(self): """ Drop outliers rows using the selected column @@ -52,9 +69,9 @@ def drop(self): """ col_name = self.col_name - upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"]) - print(upper_bound, lower_bound) - return self.df.rows.drop((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound)) + # upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"]) + # print(upper_bound, lower_bound) + return self.df.rows.drop((F.col(col_name) > self.upper_bound) | (F.col(col_name) < self.lower_bound)) def count_lower_bound(self, bound): """ diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index f8d6386e..226ac245 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -348,7 +348,7 @@ def dataset(self, df, columns="*", buckets=10, infer=False, relative_error=RELAT assign(output_columns, "summary.missing_count", total_count_na, dict) assign(output_columns, "summary.p_missing", round(total_count_na / self.rows_count * 100, 2)) - # TODO: drop, rename and move operation must affect the sample + # TODO: drop, rename and move operation must affect the sample sample = {"columns": [{"title": cols} for cols in df.cols.names()], "value": df.sample_n(sample).rows.to_list(columns)}