diff --git a/tests/test_df_outliers.py b/tests/test_df_outliers.py index 3497cb74..1bb3b7c1 100644 --- a/tests/test_df_outliers.py +++ b/tests/test_df_outliers.py @@ -27,13 +27,13 @@ def test_outliers_mad_drop(): def test_outliers_mad_info(): actual_df =source_df.outliers.mad('height(ft)',0.5,10000).info() actual_df =json_enconding(actual_df) - expected_value =json_enconding({'count_outliers': 3, 'count_non_outliers': 5, 'lower_bound': 12.5, 'lower_bound_count': 1, 'upper_bound': 21.5, 'upper_bound_count': 2}) + expected_value =json_enconding({'count_outliers': 3, 'count_non_outliers': 2, 'lower_bound': 12.5, 'lower_bound_count': 1, 'upper_bound': 21.5, 'upper_bound_count': 2}) assert(expected_value == actual_df) @staticmethod def test_outliers_mad_non_outliers_count(): actual_df =source_df.outliers.mad('height(ft)',0.5,10000).non_outliers_count() actual_df =json_enconding(actual_df) - expected_value =json_enconding(5) + expected_value =json_enconding(2) assert(expected_value == actual_df) @staticmethod def test_outliers_mad_select(): @@ -72,20 +72,24 @@ def test_outliers_modified_z_score_select(): def test_outliers_tukey_count(): actual_df =source_df.outliers.tukey('height(ft)').count() actual_df =json_enconding(actual_df) - expected_value =json_enconding(5) + expected_value =json_enconding(2) assert(expected_value == actual_df) - + @staticmethod + def test_outliers_tukey_drop(): + actual_df =source_df.outliers.tukey('height(ft)').drop() + expected_df = op.create.df([('names', StringType(), True),('height(ft)', ShortType(), True),('function', StringType(), True),('rank', ByteType(), True),('age', IntegerType(), True),('weight(t)', FloatType(), True),('japanese name', ArrayType(StringType(),True), True),('last position seen', StringType(), True),('date arrival', StringType(), True),('last date seen', StringType(), True),('attributes', ArrayType(FloatType(),True), True),('Date Type', DateType(), True),('timestamp', TimestampType(), True),('Cybertronian', BooleanType(), True),('function(binary)', BinaryType(), True),('NullType', NullType(), True)], []) + assert (expected_df.collect() == actual_df.collect()) @staticmethod def test_outliers_tukey_info(): actual_df =source_df.outliers.tukey('height(ft)').info() actual_df =json_enconding(actual_df) - expected_value =json_enconding({'count_outliers': 5, 'count_non_outliers': 2, 'lower_bound': 45.5, 'lower_bound_count': 4, 'upper_bound': -6.5, 'upper_bound_count': 4, 'q1': 13, 'median': 17, 'q3': 26, 'iqr': 13}) + expected_value =json_enconding({'count_outliers': 2, 'count_non_outliers': 3, 'lower_bound': -6.5, 'lower_bound_count': 1, 'upper_bound': 45.5, 'upper_bound_count': 1, 'q1': 13, 'median': 17, 'q3': 26, 'iqr': 13}) assert(expected_value == actual_df) @staticmethod def test_outliers_tukey_non_outliers_count(): actual_df =source_df.outliers.tukey('height(ft)').non_outliers_count() actual_df =json_enconding(actual_df) - expected_value =json_enconding(2) + expected_value =json_enconding(3) assert(expected_value == actual_df) @staticmethod def test_outliers_tukey_select(): diff --git a/tests/test_df_rows.py b/tests/test_df_rows.py index c94b2f08..e76d28b5 100644 --- a/tests/test_df_rows.py +++ b/tests/test_df_rows.py @@ -33,16 +33,7 @@ def test_rows_between_invert_equal(): actual_df =source_df.rows.between('second',6,8,invert=True,equal=True) expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [(' I like fish ', 1, 'dog dog', 'housé', 5, 'a'), (' zombies', 2, 'cat', 'tv', 6, 'b'), (None, 3, 'eagle', 'glass', 8, 'c')]) assert (expected_df.collect() == actual_df.collect()) - @staticmethod - def test_rows_drop(): - actual_df =source_df.rows.drop(fil) - expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [(None, 3, 'eagle', 'glass', 8, 'c')]) - assert (expected_df.collect() == actual_df.collect()) - @staticmethod - def test_rows_drop_audf(): - actual_df =source_df.rows.drop(a) - expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [(' I like fish ', 1, 'dog dog', 'housé', 5, 'a')]) - assert (expected_df.collect() == actual_df.collect()) + @staticmethod def test_rows_drop_by_dtypes(): actual_df =source_df.rows.drop_by_dtypes('filter','integer') @@ -54,11 +45,6 @@ def test_rows_is_in(): expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [(' zombies', 2, 'cat', 'tv', 6, 'b'), ('simpsons cat lady', 2, 'frog', 'table', 7, '1')]) assert (expected_df.collect() == actual_df.collect()) @staticmethod - def test_rows_select(): - actual_df =source_df.rows.select(fil) - expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [(' I like fish ', 1, 'dog dog', 'housé', 5, 'a')]) - assert (expected_df.collect() == actual_df.collect()) - @staticmethod def test_rows_select_by_dtypes(): actual_df =source_df.rows.select_by_dtypes('filter','integer') expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [('simpsons cat lady', 2, 'frog', 'table', 7, '1')])