def string_to_index(df, input_cols, output_cols=None, **kargs):
    """
    Maps string columns of labels to ML columns of label indices.
    If the input column is numeric, we cast it to string and index the string values.

    :param df: Dataframe to be transformed
    :param input_cols: Column name, or list/tuple of column names, to be indexed.
    :param output_cols: Column name(s) where the output is going to be saved, matched to
        input_cols by position. When None, one name per input column is generated with
        name_col(input_col, "index_to_string").
    :param kargs: Extra keyword arguments forwarded to every StringIndexer.
    :return: Dataframe with indexed columns.
    :raises ValueError: If input_cols and output_cols do not have the same length.
    """
    # Accept a bare column name as well as a sequence of names; iterating a str
    # directly would treat every character as a separate column.
    if isinstance(input_cols, str):
        input_cols = [input_cols]
    else:
        input_cols = list(input_cols)

    if output_cols is None:
        # The "index_to_string" suffix is kept unchanged so that generated column
        # names stay the same for existing callers.
        output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols]
    elif isinstance(output_cols, str):
        output_cols = [output_cols]
    else:
        output_cols = list(output_cols)

    if len(input_cols) != len(output_cols):
        raise ValueError(
            "input_cols and output_cols must have the same length, got {} and {}".format(
                len(input_cols), len(output_cols)))

    # Drop repeated (input, output) pairs while preserving order. De-duplicating each
    # list through set() independently would lose the positional pairing, because set
    # iteration order is arbitrary.
    seen_pairs = set()
    column_pairs = []
    for pair in zip(input_cols, output_cols):
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            column_pairs.append(pair)

    indexers = [StringIndexer(inputCol=input_col, outputCol=output_col, **kargs).fit(df)
                for input_col, output_col in column_pairs]

    pipeline = Pipeline(stages=indexers)
    return pipeline.fit(df).transform(df)