Skip to content

Commit

Permalink
Improve string_to_index function decoupling
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Nov 14, 2019
1 parent ed81ded commit b5c86c3
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 17 deletions.
10 changes: 1 addition & 9 deletions optimus/dataframe/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -2163,16 +2163,8 @@ def string_to_index(input_cols=None, output_cols=None, columns=None):
:param columns:
:return:
"""
df = self

if columns is None:
input_cols = parse_columns(df, input_cols)
else:
input_cols, output_cols = zip(*columns)

df = ml_string_to_index(df, input_cols, output_cols)

return df
return ml_string_to_index(df, input_cols, output_cols, columns)

@add_attr(cols)
def bucketizer(input_cols, splits, output_cols=None):
Expand Down
22 changes: 14 additions & 8 deletions optimus/ml/encoding.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from pyspark.ml import feature, Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, OneHotEncoder, VectorAssembler, Normalizer

from optimus.helpers.constants import Actions
from optimus.helpers.check import is_dataframe, is_, is_str
from optimus.helpers.columns import parse_columns, name_col
from optimus.helpers.columns import parse_columns, name_col, get_output_cols
from optimus.helpers.constants import Actions
from optimus.helpers.raiseit import RaiseIt


Expand All @@ -29,7 +29,7 @@ def n_gram(df, input_col, n=2):
return df_model, tfidf_model


def string_to_index(df, input_cols, output_cols=None, **kargs):
def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs):
"""
Maps a string column of labels to an ML column of label indices. If the input column is
numeric, we cast it to string and index the string values.
Expand All @@ -38,17 +38,23 @@ def string_to_index(df, input_cols, output_cols=None, **kargs):
:param output_cols: Column where the output is going to be saved
:return: Dataframe with indexed columns.
"""
df_old = df
input_cols = parse_columns(df, input_cols)
if output_cols is None:
output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols]
df_actual = df

if columns is None:
input_cols = parse_columns(df, input_cols)
if output_cols is None:
output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols]
output_cols = get_output_cols(input_cols, output_cols)
else:
input_cols, output_cols = zip(*columns)

indexers = [StringIndexer(inputCol=input_col, outputCol=output_col, **kargs).fit(df) for input_col, output_col
in zip(list(set(input_cols)), list(set(output_cols)))]

pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)
df = df.preserve_meta(df_old, Actions.STRING_TO_INDEX.value, output_cols)

df = df.preserve_meta(df_actual, Actions.STRING_TO_INDEX.value, output_cols)

return df

Expand Down

0 comments on commit b5c86c3

Please sign in to comment.