From 38c9d96230ebe853e9d2ddb6e8cf6cc3224c740e Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Thu, 14 Nov 2019 16:52:51 -0600 Subject: [PATCH] Add metadata to string to index --- optimus/helpers/constants.py | 3 ++- optimus/ml/encoding.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/optimus/helpers/constants.py b/optimus/helpers/constants.py index 92d147e1..fe894750 100644 --- a/optimus/helpers/constants.py +++ b/optimus/helpers/constants.py @@ -76,7 +76,7 @@ # Profiler PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "null", "array", "binary"} PYTHON_TO_PROFILER = {"string": "categorical", "boolean": "categorical", "int": "numeric", "decimal": "numeric", - "date": "date", "array": "array", "binaty": "binary", "null": "null"} + "date": "date", "array": "array", "binaty": "binary", "null": "null"} SPARK_DTYPES_TO_PROFILER = {"int": ["smallint", "tinyint", "bigint", "int"], "decimal": ["float", "double"], "string": "string", "date": {"date", "timestamp"}, "boolean": "boolean", "binary": "binary", @@ -107,6 +107,7 @@ class Actions(Enum): DROP_ROW = "drop_row" VALUES_TO_COLS = "values_to_cols" SET = "set" + STRING_TO_INDEX = "string_to_index" @staticmethod def list(): diff --git a/optimus/ml/encoding.py b/optimus/ml/encoding.py index c8586867..7c24fb09 100644 --- a/optimus/ml/encoding.py +++ b/optimus/ml/encoding.py @@ -1,6 +1,7 @@ from pyspark.ml import feature, Pipeline from pyspark.ml.feature import StringIndexer, IndexToString, OneHotEncoder, VectorAssembler, Normalizer +from optimus.helpers.constants import Actions from optimus.helpers.check import is_dataframe, is_, is_str from optimus.helpers.columns import parse_columns, name_col from optimus.helpers.raiseit import RaiseIt @@ -37,7 +38,7 @@ def string_to_index(df, input_cols, output_cols=None, **kargs): :param output_cols:Column where the ouput is going to be saved :return: Dataframe with indexed columns. """ - + df_old = df input_cols = parse_columns(df, input_cols) if output_cols is None: output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols] @@ -47,7 +48,8 @@ def string_to_index(df, input_cols, output_cols=None, **kargs): pipeline = Pipeline(stages=indexers) df = pipeline.fit(df).transform(df) - + df = df.preserve_meta(df_old, Actions.STRING_TO_INDEX.value, output_cols) + return df