From 6fbeda8b9b74dcd1bdf7d1aee52d3a5bcee7672b Mon Sep 17 00:00:00 2001
From: Argenis Leon <argenisleon@gmail.com>
Date: Tue, 26 Nov 2019 08:18:54 -0600
Subject: [PATCH] Implemented selecting rows below the lower or above the upper outlier bound

---
 optimus/outliers/abstract_outliers_bounds.py | 23 +++++++++++++++++---
 optimus/profiler/profiler.py                 |  2 +-
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/optimus/outliers/abstract_outliers_bounds.py b/optimus/outliers/abstract_outliers_bounds.py
index f6c4bcd4..5395da69 100644
--- a/optimus/outliers/abstract_outliers_bounds.py
+++ b/optimus/outliers/abstract_outliers_bounds.py
@@ -6,6 +6,7 @@
 from optimus.helpers.columns import parse_columns
 from optimus.helpers.converter import one_list_to_val
 from optimus.helpers.filters import dict_filter
+from optimus.helpers.json import dump_json
 
 
 class AbstractOutlierBounds(ABC):
@@ -26,6 +27,9 @@ def __init__(self, df, col_name):
         self.df = df
         self.col_name = one_list_to_val(parse_columns(df, col_name))
 
+        self.lower_bound = None
+        self.upper_bound = None
+
     @abstractmethod
     def whiskers(self):
         """
@@ -42,9 +46,22 @@ def select(self):
 
         col_name = self.col_name
         upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
+        print(upper_bound, lower_bound)
 
         return self.df.rows.select((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))
 
+    def select_lower_bound(self):
+        col_name = self.col_name
+        sample = {"columns": [{"title": cols} for cols in self.df.cols.names()],
+                  "value": self.df.rows.select(self.df[col_name] < self.lower_bound).limit(100).rows.to_list("*")}
+        return dump_json(sample)
+
+    def select_upper_bound(self):
+        col_name = self.col_name
+        sample = {"columns": [{"title": cols} for cols in self.df.cols.names()],
+                  "value": self.df.rows.select(self.df[col_name] > self.upper_bound).limit(100).rows.to_list("*")}
+        return dump_json(sample)
+
     def drop(self):
         """
         Drop outliers rows using the selected column
@@ -52,9 +69,9 @@ def drop(self):
         """
 
         col_name = self.col_name
-        upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
-        print(upper_bound, lower_bound)
-        return self.df.rows.drop((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))
+        # upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
+        # print(upper_bound, lower_bound)
+        return self.df.rows.drop((F.col(col_name) > self.upper_bound) | (F.col(col_name) < self.lower_bound))
 
     def count_lower_bound(self, bound):
         """
diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py
index f8d6386e..226ac245 100644
--- a/optimus/profiler/profiler.py
+++ b/optimus/profiler/profiler.py
@@ -348,7 +348,7 @@ def dataset(self, df, columns="*", buckets=10, infer=False, relative_error=RELAT
                 assign(output_columns, "summary.missing_count", total_count_na, dict)
                 assign(output_columns, "summary.p_missing", round(total_count_na / self.rows_count * 100, 2))
 
-            # TODO: drop, rename and move operation must affect  the sample
+            # TODO: drop, rename and move operations must affect the sample
             sample = {"columns": [{"title": cols} for cols in df.cols.names()],
                       "value": df.sample_n(sample).rows.to_list(columns)}