Skip to content

Commit

Permalink
Add metadata to string to index
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Nov 14, 2019
1 parent 2cab7a1 commit 38c9d96
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
3 changes: 2 additions & 1 deletion optimus/helpers/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
# Profiler
PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "null", "array", "binary"}
PYTHON_TO_PROFILER = {"string": "categorical", "boolean": "categorical", "int": "numeric", "decimal": "numeric",
"date": "date", "array": "array", "binaty": "binary", "null": "null"}
"date": "date", "array": "array", "binaty": "binary", "null": "null"}

SPARK_DTYPES_TO_PROFILER = {"int": ["smallint", "tinyint", "bigint", "int"], "decimal": ["float", "double"],
"string": "string", "date": {"date", "timestamp"}, "boolean": "boolean", "binary": "binary",
Expand Down Expand Up @@ -107,6 +107,7 @@ class Actions(Enum):
DROP_ROW = "drop_row"
VALUES_TO_COLS = "values_to_cols"
SET = "set"
STRING_TO_INDEX = "string_to_index"

@staticmethod
def list():
Expand Down
6 changes: 4 additions & 2 deletions optimus/ml/encoding.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pyspark.ml import feature, Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString, OneHotEncoder, VectorAssembler, Normalizer

from optimus.helpers.constants import Actions
from optimus.helpers.check import is_dataframe, is_, is_str
from optimus.helpers.columns import parse_columns, name_col
from optimus.helpers.raiseit import RaiseIt
Expand Down Expand Up @@ -37,7 +38,7 @@ def string_to_index(df, input_cols, output_cols=None, **kargs):
:param output_cols: Column where the output is going to be saved
:return: Dataframe with indexed columns.
"""

df_old = df
input_cols = parse_columns(df, input_cols)
if output_cols is None:
output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols]
Expand All @@ -47,7 +48,8 @@ def string_to_index(df, input_cols, output_cols=None, **kargs):

pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)

df = df.preserve_meta(df_old, Actions.STRING_TO_INDEX.value, output_cols)

return df


Expand Down

0 comments on commit 38c9d96

Please sign in to comment.