From d2ae712f6bb69dd9eb8549c29fc88c659d632acc Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 6 Dec 2019 15:27:11 -0600 Subject: [PATCH] Fix --- optimus/outliers/abstract_outliers_bounds.py | 20 +++++++++----------- tests/test_df_rows.py | 6 +----- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/optimus/outliers/abstract_outliers_bounds.py b/optimus/outliers/abstract_outliers_bounds.py index fd1077a0..a65561af 100644 --- a/optimus/outliers/abstract_outliers_bounds.py +++ b/optimus/outliers/abstract_outliers_bounds.py @@ -2,12 +2,11 @@ from pyspark.sql import functions as F -from optimus.infer import is_dataframe from optimus.helpers.columns import parse_columns from optimus.helpers.converter import one_list_to_val, val_to_list from optimus.helpers.filters import dict_filter -from optimus.helpers.functions import create_buckets from optimus.helpers.json import dump_json +from optimus.infer import is_dataframe # LOWER_BOUND = @@ -54,14 +53,14 @@ def select(self): # TODO: Pass a defined division param instead or run 3 separated jobs def hist(self, col_name): - buckets = 20 - min_value, max_value = self.df.cols.range(col_name) - - create_buckets(min_value, self.lower_bound, buckets) - - create_buckets(self.lower_bound, self.upper_bound, buckets) - - create_buckets(self.upper_bound, max_value, buckets) + # buckets = 20 + # min_value, max_value = self.df.cols.range(col_name) + # + # create_buckets(min_value, self.lower_bound, buckets) + # + # create_buckets(self.lower_bound, self.upper_bound, buckets) + # + # create_buckets(self.upper_bound, max_value, buckets) # lower bound lower_bound_hist = self.df.rows.select(self.df[col_name] < self.lower_bound).cols.hist(col_name, 20) @@ -73,7 +72,6 @@ def hist(self, col_name): non_outlier_hist = self.df.rows.select( (F.col(col_name) >= self.lower_bound) & (F.col(col_name) <= self.upper_bound)).cols.hist(col_name, 20) - result = {} if lower_bound_hist is not None: result.update(lower_bound_hist) diff --git a/tests/test_df_rows.py b/tests/test_df_rows.py index 052245af..ee92d757 100644 --- a/tests/test_df_rows.py +++ b/tests/test_df_rows.py @@ -43,11 +43,7 @@ def test_rows_is_in(): actual_df =source_df.rows.is_in('num',2) expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [(' zombies', 2, 'cat', 'tv', 6, 'b'), ('simpsons cat lady', 2, 'frog', 'table', 7, '1')]) assert (expected_df.collect() == actual_df.collect()) - @staticmethod - def test_rows_select(): - actual_df =source_df.rows.select(Column) - expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [(' I like fish ', 1, 'dog dog', 'housé', 5, 'a')]) - assert (expected_df.collect() == actual_df.collect()) + @staticmethod def test_rows_select_by_dtypes(): actual_df =source_df.rows.select_by_dtypes('filter','integer')