Skip to content

Commit

Permalink
string_to_index now accepts multiple columns
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Nov 6, 2019
1 parent 3878a12 commit a7255d0
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 11 deletions.
9 changes: 6 additions & 3 deletions optimus/dataframe/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -2117,17 +2117,20 @@ def join_all(_dfs):
return join_all(combined)

@add_attr(cols)
def string_to_index(input_cols, output_cols=None):
def string_to_index(input_cols=None, output_cols=None, columns=None):
"""
Encodes a string column of labels to a column of label indices
:param input_cols:
:param output_cols:
:param columns:
:return:
"""
df = self

input_cols = parse_columns(df, input_cols)
# output_cols = get_output_cols(input_cols, output_cols)
if columns is None:
input_cols = parse_columns(df, input_cols)
else:
input_cols, output_cols = zip(*columns)

df = ml_string_to_index(df, input_cols, output_cols)

Expand Down
18 changes: 10 additions & 8 deletions optimus/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,28 @@ def n_gram(df, input_col, n=2):
return df_model, tfidf_model


def string_to_index(df, input_cols, output_cols=None, **kargs):
    """
    Maps string columns of labels to ML columns of label indices. If an input column is
    numeric, we cast it to string and index the string values.
    :param df: Dataframe to be transformed
    :param input_cols: Column name, or list/tuple of column names, to be indexed.
    :param output_cols: Column name, or list/tuple of column names, where the output is going
    to be saved. Paired by position with input_cols. When None, every output name is built
    with name_col(input_col, "index_to_string").
    :param kargs: Extra keyword arguments forwarded to every StringIndexer.
    :return: Dataframe with indexed columns.
    :raises ValueError: If input_cols and output_cols do not have the same length.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(input_cols, str):
        input_cols = [input_cols]
    else:
        input_cols = list(input_cols)

    if output_cols is None:
        # NOTE(review): the "index_to_string" suffix looks copied from the inverse
        # operation; it is kept as-is because callers may rely on the generated names.
        output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols]
    elif isinstance(output_cols, str):
        output_cols = [output_cols]
    else:
        output_cols = list(output_cols)

    if len(input_cols) != len(output_cols):
        # zip() would silently drop the unmatched columns.
        raise ValueError(
            "input_cols and output_cols must have the same length, got {} and {}".format(
                len(input_cols), len(output_cols)))

    # Drop repeated input columns while keeping each one tied to its own output column.
    # De-duplicating both lists through separate set() calls loses the positional
    # pairing, because sets are unordered.
    paired_cols = {}
    for input_col, output_col in zip(input_cols, output_cols):
        paired_cols.setdefault(input_col, output_col)

    indexers = [StringIndexer(inputCol=input_col, outputCol=output_col, **kargs).fit(df)
                for input_col, output_col in paired_cols.items()]

    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)

    return df


Expand Down

0 comments on commit a7255d0

Please sign in to comment.