diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index effcf35e..bd4bcd21 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -2163,16 +2163,8 @@ def string_to_index(input_cols=None, output_cols=None, columns=None): :param columns: :return: """ - df = self - if columns is None: - input_cols = parse_columns(df, input_cols) - else: - input_cols, output_cols = zip(*columns) - - df = ml_string_to_index(df, input_cols, output_cols) - - return df + return ml_string_to_index(df, input_cols, output_cols, columns) @add_attr(cols) def bucketizer(input_cols, splits, output_cols=None): diff --git a/optimus/ml/encoding.py b/optimus/ml/encoding.py index 7c24fb09..92b4249b 100644 --- a/optimus/ml/encoding.py +++ b/optimus/ml/encoding.py @@ -1,9 +1,9 @@ from pyspark.ml import feature, Pipeline from pyspark.ml.feature import StringIndexer, IndexToString, OneHotEncoder, VectorAssembler, Normalizer -from optimus.helpers.constants import Actions from optimus.helpers.check import is_dataframe, is_, is_str -from optimus.helpers.columns import parse_columns, name_col +from optimus.helpers.columns import parse_columns, name_col, get_output_cols +from optimus.helpers.constants import Actions from optimus.helpers.raiseit import RaiseIt @@ -29,7 +29,7 @@ def n_gram(df, input_col, n=2): return df_model, tfidf_model -def string_to_index(df, input_cols, output_cols=None, **kargs): +def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs): """ Maps a string column of labels to an ML column of label indices. If the input column is numeric, we cast it to string and index the string values. @@ -38,17 +38,23 @@ def string_to_index(df, input_cols, output_cols=None, **kargs): :param output_cols:Column where the ouput is going to be saved :return: Dataframe with indexed columns. """ - df_old = df - input_cols = parse_columns(df, input_cols) - if output_cols is None: - output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols] + df_actual = df + + if columns is None: + input_cols = parse_columns(df, input_cols) + if output_cols is None: + output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols] + output_cols = get_output_cols(input_cols, output_cols) + else: + input_cols, output_cols = zip(*columns) indexers = [StringIndexer(inputCol=input_col, outputCol=output_col, **kargs).fit(df) for input_col, output_col in zip(list(set(input_cols)), list(set(output_cols)))] pipeline = Pipeline(stages=indexers) df = pipeline.fit(df).transform(df) - df = df.preserve_meta(df_old, Actions.STRING_TO_INDEX.value, output_cols) + + df = df.preserve_meta(df_actual, Actions.STRING_TO_INDEX.value, output_cols) return df