Implemented selection of the dataframe rows below the lower bound or above the upper bound

eapframework · Nov 26, 2019 · 6fbeda8 · 6fbeda8
1 parent 8e512c4
commit 6fbeda8
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 4 deletions.
diff --git a/optimus/outliers/abstract_outliers_bounds.py b/optimus/outliers/abstract_outliers_bounds.py
@@ -6,6 +6,7 @@
 from optimus.helpers.columns import parse_columns
 from optimus.helpers.converter import one_list_to_val
 from optimus.helpers.filters import dict_filter
+from optimus.helpers.json import dump_json
 
 
 class AbstractOutlierBounds(ABC):
@@ -26,6 +27,9 @@ def __init__(self, df, col_name):
         self.df = df
         self.col_name = one_list_to_val(parse_columns(df, col_name))
 
+        self.lower_bound = None
+        self.upper_bound = None
+
     @abstractmethod
     def whiskers(self):
         """
@@ -42,19 +46,32 @@ def select(self):
 
         col_name = self.col_name
         upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
+        print(upper_bound, lower_bound)
 
         return self.df.rows.select((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))
 
+    def select_lower_bound(self):
+        """
+        Build a JSON sample of the rows whose value in the selected column is below the lower bound
+        :return: Result of dump_json over a dict holding the column titles and the matching rows,
+        truncated with limit(100)
+        """
+
+        col_name = self.col_name
+        lower_bound = self.lower_bound
+        if lower_bound is None:
+            # The bound starts as None in __init__; compute it from the whiskers instead of
+            # comparing the column against None
+            _, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
+
+        sample = {"columns": [{"title": cols} for cols in self.df.cols.names()],
+                  "value": self.df.rows.select(self.df[col_name] < lower_bound).limit(100).rows.to_list("*")}
+        return dump_json(sample)
+
+    def select_upper_bound(self):
+        """
+        Build a JSON sample of the rows whose value in the selected column is above the upper bound
+        :return: Result of dump_json over a dict holding the column titles and the matching rows,
+        truncated with limit(100)
+        """
+
+        col_name = self.col_name
+        upper_bound = self.upper_bound
+        if upper_bound is None:
+            # The bound starts as None in __init__; compute it from the whiskers instead of
+            # comparing the column against None
+            upper_bound, _ = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
+
+        sample = {"columns": [{"title": cols} for cols in self.df.cols.names()],
+                  "value": self.df.rows.select(self.df[col_name] > upper_bound).limit(100).rows.to_list("*")}
+        return dump_json(sample)
+
     def drop(self):
         """
         Drop outliers rows using the selected column
         :return:
         """
 
         col_name = self.col_name
-        upper_bound, lower_bound = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
-        print(upper_bound, lower_bound)
-        return self.df.rows.drop((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))
+        # Prefer the bounds stored on the instance. They start as None in __init__, so any bound
+        # still unset is taken from the whiskers rather than compared against None
+        upper_bound, lower_bound = self.upper_bound, self.lower_bound
+        if upper_bound is None or lower_bound is None:
+            whisker_upper, whisker_lower = dict_filter(self.whiskers(), ["upper_bound", "lower_bound"])
+            if upper_bound is None:
+                upper_bound = whisker_upper
+            if lower_bound is None:
+                lower_bound = whisker_lower
+        return self.df.rows.drop((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))
 
     def count_lower_bound(self, bound):
         """

diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py
@@ -348,7 +348,7 @@ def dataset(self, df, columns="*", buckets=10, infer=False, relative_error=RELAT
                 assign(output_columns, "summary.missing_count", total_count_na, dict)
                 assign(output_columns, "summary.p_missing", round(total_count_na / self.rows_count * 100, 2))
 
-            # TODO: drop, rename and move operation must affect  the sample
+            # TODO: drop, rename and move operation must affect the sample
             sample = {"columns": [{"title": cols} for cols in df.cols.names()],
                       "value": df.sample_n(sample).rows.to_list(columns)}