From bf970ef8bab0c77bd9e4536d75c4315a4072f6ff Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 24 Nov 2019 23:20:14 -0600 Subject: [PATCH] Fix replace when handling different column dtypes --- optimus/dataframe/columns.py | 17 ++++++++++------- optimus/dataframe/rows.py | 4 +++- optimus/helpers/columns.py | 2 +- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index 9115abd8..ac70dbb0 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -1103,20 +1103,23 @@ def func_numeric(_df, _input_col, _output_col, _search, _replace_by): func = func_full elif search_by == "chars" or search_by == "words": func = func_chars_words + elif search_by == "numeric": + func = func_numeric else: - RaiseIt.value_error(search_by, ["chars", "words", "full"]) + RaiseIt.value_error(search_by, ["chars", "words", "full", "numeric"]) + + if search_by in ["chars", "words", "full"]: + filter_dtype = [PYSPARK_STRING_TYPES] + elif search_by == "numeric": + filter_dtype = [PYSPARK_NUMERIC_TYPES] + + input_cols = parse_columns(self, input_cols, filter_by_column_dtypes=filter_dtype) - input_cols = parse_columns(self, input_cols, - filter_by_column_dtypes=[PYSPARK_STRING_TYPES, PYSPARK_NUMERIC_TYPES]) check_column_numbers(input_cols, "*") output_cols = get_output_cols(input_cols, output_cols) df = self for input_col, output_col in zip(input_cols, output_cols): - if is_column_a(df, input_col, "int"): - func = func_numeric - # df = df.cols.cast(input_col, "str", output_col) - df = func(df, input_col, output_col, search, replace_by) df = df.preserve_meta(self, Actions.REPLACE.value, output_col) diff --git a/optimus/dataframe/rows.py b/optimus/dataframe/rows.py index 5e9a4f0b..b197d486 100644 --- a/optimus/dataframe/rows.py +++ b/optimus/dataframe/rows.py @@ -55,6 +55,7 @@ def append(rows): return df_result + @add_attr(rows) def select_by_dtypes(input_cols, data_type=None): """ @@ -84,7 +85,8 @@ 
def select(*args, **kwargs): :param kwargs: :return: Spark DataFrame """ - return self.filter(*args, **kwargs) + df = self + return df.filter(*args, **kwargs) @add_attr(rows) def to_list(input_cols): diff --git a/optimus/helpers/columns.py b/optimus/helpers/columns.py index 90f76173..d3e7f61c 100644 --- a/optimus/helpers/columns.py +++ b/optimus/helpers/columns.py @@ -119,6 +119,7 @@ def parse_columns(df, cols_args, get_args=False, is_regex=None, filter_by_column elif cols_args == "*" or cols_args is None: cols = df.columns + # In case we have a list of tuples we use the first element of the tuple is taken as the column name # and the rest as params. We can use the param in a custom function as follow # def func(attrs): attrs return (1,2) and (3,4) @@ -163,7 +164,6 @@ def parse_columns(df, cols_args, get_args=False, is_regex=None, filter_by_column final_columns = cols cols_params = [] - if invert: final_columns = list(OrderedSet(df.cols.names()) - OrderedSet(final_columns))