Skip to content

Commit

Permalink
Added index to string
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Dec 13, 2019
1 parent 7768433 commit 5d54993
Show file tree
Hide file tree
Showing 6 changed files with 8,544 additions and 7,043 deletions.
16 changes: 16 additions & 0 deletions optimus/dataframe/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from optimus.helpers.parser import compress_list, compress_dict, parse_python_dtypes, parse_col_names_funcs_to_keys
from optimus.helpers.raiseit import RaiseIt
from optimus.ml.encoding import string_to_index as ml_string_to_index
from optimus.ml.encoding import index_to_string as ml_index_to_string
from optimus.profiler.functions import fill_missing_var_types, parse_profiler_dtypes

from optimus import ROOT_DIR
Expand Down Expand Up @@ -2092,6 +2093,21 @@ def string_to_index(input_cols=None, output_cols=None, columns=None):

return df

@add_attr(cols)
def index_to_string(input_cols=None, output_cols=None, columns=None):
    """
    Map columns of label indices back to columns of the corresponding string labels.
    This is the inverse of string_to_index; the work is delegated to
    optimus.ml.encoding.index_to_string.
    :param input_cols: Columns holding the indices to be converted.
    :param output_cols: Columns where the resulting strings are going to be saved.
    :param columns: Alternative way of giving the columns; forwarded unchanged to the ML helper
    (presumably (input, output) pairs -- confirm against optimus.ml.encoding).
    :return: The value returned by the ML helper (the transformed dataframe).
    """
    # `self` is the dataframe captured from the enclosing accessor scope.
    return ml_index_to_string(self, input_cols, output_cols, columns)

@add_attr(cols)
def bucketizer(input_cols, splits, output_cols=None):
"""
Expand Down
1 change: 1 addition & 0 deletions optimus/helpers/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Actions(Enum):
VALUES_TO_COLS = "values_to_cols"
SET = "set"
STRING_TO_INDEX = "string_to_index"
INDEX_TO_STRING = "index_to_string"
MIN_MAX_SCALER = "min_max_scaler"
MAX_ABS_SCALER = "max_abs_scaler"
# ROWS
Expand Down
30 changes: 19 additions & 11 deletions optimus/ml/encoding.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from pyspark.ml import feature, Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, OneHotEncoder, VectorAssembler, Normalizer
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Normalizer, IndexToString

from optimus.infer import is_, is_str, is_dataframe
from optimus.helpers.columns import parse_columns, name_col, get_output_cols
from optimus.helpers.constants import Actions
from optimus.helpers.raiseit import RaiseIt
from optimus.infer import is_, is_str, is_dataframe


def n_gram(df, input_col, n=2):
Expand Down Expand Up @@ -36,14 +36,15 @@ def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs):
:param df: Dataframe to be transformed
:param input_cols: Columns to be indexed.
:param output_cols:Column where the ouput is going to be saved
:param columns:
:return: Dataframe with indexed columns.
"""
df_actual = df

if columns is None:
input_cols = parse_columns(df, input_cols)
if output_cols is None:
output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols]
output_cols = [name_col(input_col, "string_to_index") for input_col in input_cols]
output_cols = get_output_cols(input_cols, output_cols)
else:
input_cols, output_cols = zip(*columns)
Expand All @@ -59,27 +60,34 @@ def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs):
return df


def index_to_string(df, input_cols, output_cols=None, columns=None, **kargs):
    """
    Maps a column of indices back to a new column of corresponding string values. The index-string mapping is
    either from the ML attributes of the input column, or from user-supplied labels (which take precedence over
    ML attributes).
    :param df: Dataframe to be transformed.
    :param input_cols: Columns holding the indices to be converted. Ignored when `columns` is given.
    :param output_cols: Columns where the output is going to be saved. When None, a name is derived from
    each input column with name_col(input_col, "index_to_string"). Ignored when `columns` is given.
    :param columns: Iterable of (input_col, output_col) pairs. When given it replaces input_cols/output_cols.
    :param kargs: Extra keyword arguments forwarded to every IndexToString stage.
    :return: Dataframe with the string columns added.
    """
    df_actual = df

    if columns is None:
        input_cols = parse_columns(df, input_cols)
        if output_cols is None:
            output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols]
        output_cols = get_output_cols(input_cols, output_cols)
    else:
        input_cols, output_cols = zip(*columns)

    # Pair inputs with outputs BEFORE removing duplicates. De-duplicating each side
    # through its own set() loses the positional correspondence (set order is arbitrary)
    # and can silently drop pairs when the two sets end up with different sizes.
    # dict.fromkeys keeps the first occurrence of every pair in its original order.
    col_pairs = list(dict.fromkeys(zip(input_cols, output_cols)))

    indexers = [IndexToString(inputCol=input_col, outputCol=output_col, **kargs)
                for input_col, output_col in col_pairs]
    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)

    df = df.preserve_meta(df_actual, Actions.INDEX_TO_STRING.value, output_cols)

    return df


Expand Down
Loading

0 comments on commit 5d54993

Please sign in to comment.