From cd00cbea3af80555eaa020cc3382929b6d254ed9 Mon Sep 17 00:00:00 2001
From: Argenis Leon
Date: Fri, 13 Dec 2019 11:52:23 -0600
Subject: [PATCH] Handling string to index name as constant

---
 optimus/ml/contants.py | 3 ++-
 optimus/ml/encoding.py | 3 ++-
 optimus/ml/models.py   | 9 +++++----
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/optimus/ml/contants.py b/optimus/ml/contants.py
index d834429f..e3e60ba9 100644
--- a/optimus/ml/contants.py
+++ b/optimus/ml/contants.py
@@ -8,4 +8,5 @@
 
 FINGERPRINT_COL = "fingerprint"
 NGRAM_FINGERPRINT_COL = "ngram_fingerprint"
-LEVENSHTEIN_DISTANCE = "LEVENSHTEIN_DISTANCE"
\ No newline at end of file
+LEVENSHTEIN_DISTANCE = "LEVENSHTEIN_DISTANCE"
+STRING_TO_INDEX = "string_to_index"
\ No newline at end of file
diff --git a/optimus/ml/encoding.py b/optimus/ml/encoding.py
index ae257b22..2ba9a82c 100644
--- a/optimus/ml/encoding.py
+++ b/optimus/ml/encoding.py
@@ -5,6 +5,7 @@
 from optimus.helpers.constants import Actions
 from optimus.helpers.raiseit import RaiseIt
 from optimus.infer import is_, is_str, is_dataframe
+from optimus.ml.contants import STRING_TO_INDEX
 
 
 def n_gram(df, input_col, n=2):
@@ -44,7 +45,7 @@ def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs):
     if columns is None:
         input_cols = parse_columns(df, input_cols)
         if output_cols is None:
-            output_cols = [name_col(input_col, "string_to_index") for input_col in input_cols]
+            output_cols = [name_col(input_col, STRING_TO_INDEX) for input_col in input_cols]
         output_cols = get_output_cols(input_cols, output_cols)
     else:
         input_cols, output_cols = zip(*columns)
diff --git a/optimus/ml/models.py b/optimus/ml/models.py
index 39694161..dc28834d 100644
--- a/optimus/ml/models.py
+++ b/optimus/ml/models.py
@@ -6,6 +6,7 @@
 from optimus.infer import is_str, is_dataframe
 from optimus.helpers.columns import parse_columns, name_col
+from optimus.ml.contants import STRING_TO_INDEX
 from optimus.ml.encoding import string_to_index, vector_assembler
 from optimus.spark import Spark
 
 
@@ -50,7 +51,7 @@ def random_forest(df, columns, input_col, **kwargs):
 
     model = RandomForestClassifier(**kwargs)
     df.table()
-    df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
+    df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")
 
     rf_model = model.fit(df)
     df_model = rf_model.transform(df)
@@ -83,7 +84,7 @@ def decision_tree(df, columns, input_col, **kwargs):
 
     model = DecisionTreeClassifier(**kwargs)
 
-    df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
+    df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")
 
     dt_model = model.fit(df)
     df_model = dt_model.transform(df)
@@ -116,7 +117,7 @@ def gbt(df, columns, input_col, **kwargs):
 
     model = GBTClassifier(**kwargs)
 
-    df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
+    df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")
 
     gbt_model = model.fit(df)
     df_model = gbt_model.transform(df)
@@ -133,7 +134,7 @@ def h2o_automl(df, label, columns, **kwargs):
                        maxRuntimeSecs=60,  # 1 minutes
                        seed=1,
                        maxModels=3,
-                       labelCol=name_col(label, "index_to_string"),
+                       labelCol=name_col(label, STRING_TO_INDEX),
                        **kwargs)
 
     model = automl.fit(df_va)