Skip to content

Commit

Permalink
Fix
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Dec 6, 2019
1 parent 69fc5e9 commit d2ae712
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 16 deletions.
20 changes: 9 additions & 11 deletions optimus/outliers/abstract_outliers_bounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@

from pyspark.sql import functions as F

from optimus.infer import is_dataframe
from optimus.helpers.columns import parse_columns
from optimus.helpers.converter import one_list_to_val, val_to_list
from optimus.helpers.filters import dict_filter
from optimus.helpers.functions import create_buckets
from optimus.helpers.json import dump_json
from optimus.infer import is_dataframe


# LOWER_BOUND =
Expand Down Expand Up @@ -54,14 +53,14 @@ def select(self):

# TODO: Pass a defined division param instead of running 3 separate jobs
def hist(self, col_name):
buckets = 20
min_value, max_value = self.df.cols.range(col_name)

create_buckets(min_value, self.lower_bound, buckets)

create_buckets(self.lower_bound, self.upper_bound, buckets)

create_buckets(self.upper_bound, max_value, buckets)
# buckets = 20
# min_value, max_value = self.df.cols.range(col_name)
#
# create_buckets(min_value, self.lower_bound, buckets)
#
# create_buckets(self.lower_bound, self.upper_bound, buckets)
#
# create_buckets(self.upper_bound, max_value, buckets)

# lower bound
lower_bound_hist = self.df.rows.select(self.df[col_name] < self.lower_bound).cols.hist(col_name, 20)
Expand All @@ -73,7 +72,6 @@ def hist(self, col_name):
non_outlier_hist = self.df.rows.select(
(F.col(col_name) >= self.lower_bound) & (F.col(col_name) <= self.upper_bound)).cols.hist(col_name, 20)


result = {}
if lower_bound_hist is not None:
result.update(lower_bound_hist)
Expand Down
6 changes: 1 addition & 5 deletions tests/test_df_rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,7 @@ def test_rows_is_in():
actual_df =source_df.rows.is_in('num',2)
expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [(' zombies', 2, 'cat', 'tv', 6, 'b'), ('simpsons cat lady', 2, 'frog', 'table', 7, '1')])
assert (expected_df.collect() == actual_df.collect())
@staticmethod
def test_rows_select():
    """Check that ``rows.select`` keeps only the rows where ``num`` equals 1."""
    # The filter must be a real column expression. The previous argument was the
    # printed repr of a Column object (Column<b'(num = 1)'>), which is not valid
    # Python and made the whole test module fail to parse.
    actual_df = source_df.rows.select(source_df['num'] == 1)
    expected_df = op.create.df([('words', StringType(), True),('num', IntegerType(), True),('animals', StringType(), True),('thing', StringType(), True),('second', IntegerType(), True),('filter', StringType(), True)], [(' I like fish ', 1, 'dog dog', 'housé', 5, 'a')])
    assert (expected_df.collect() == actual_df.collect())

@staticmethod
def test_rows_select_by_dtypes():
actual_df =source_df.rows.select_by_dtypes('filter','integer')
Expand Down

0 comments on commit d2ae712

Please sign in to comment.