From cd00cbea3af80555eaa020cc3382929b6d254ed9 Mon Sep 17 00:00:00 2001
From: Argenis Leon
Date: Fri, 13 Dec 2019 11:52:23 -0600
Subject: [PATCH] Handling string to index name as constant

---
 optimus/ml/contants.py | 3 ++-
 optimus/ml/encoding.py | 3 ++-
 optimus/ml/models.py   | 9 +++++----
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/optimus/ml/contants.py b/optimus/ml/contants.py
index d834429f..e3e60ba9 100644
--- a/optimus/ml/contants.py
+++ b/optimus/ml/contants.py
@@ -8,4 +8,5 @@
 
 FINGERPRINT_COL = "fingerprint"
 NGRAM_FINGERPRINT_COL = "ngram_fingerprint"
-LEVENSHTEIN_DISTANCE = "LEVENSHTEIN_DISTANCE"
\ No newline at end of file
+LEVENSHTEIN_DISTANCE = "LEVENSHTEIN_DISTANCE"
+STRING_TO_INDEX = "string_to_index"
\ No newline at end of file
diff --git a/optimus/ml/encoding.py b/optimus/ml/encoding.py
index ae257b22..2ba9a82c 100644
--- a/optimus/ml/encoding.py
+++ b/optimus/ml/encoding.py
@@ -5,6 +5,7 @@
 from optimus.helpers.constants import Actions
 from optimus.helpers.raiseit import RaiseIt
 from optimus.infer import is_, is_str, is_dataframe
+from optimus.ml.contants import STRING_TO_INDEX
 
 
 def n_gram(df, input_col, n=2):
@@ -44,7 +45,7 @@ def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs):
     if columns is None:
         input_cols = parse_columns(df, input_cols)
         if output_cols is None:
-            output_cols = [name_col(input_col, "string_to_index") for input_col in input_cols]
+            output_cols = [name_col(input_col, STRING_TO_INDEX) for input_col in input_cols]
         output_cols = get_output_cols(input_cols, output_cols)
     else:
         input_cols, output_cols = zip(*columns)
diff --git a/optimus/ml/models.py b/optimus/ml/models.py
index 39694161..dc28834d 100644
--- a/optimus/ml/models.py
+++ b/optimus/ml/models.py
@@ -6,6 +6,7 @@
 from optimus.infer import is_str, is_dataframe
 from optimus.helpers.columns import parse_columns, name_col
+from optimus.ml.contants import STRING_TO_INDEX
 from optimus.ml.encoding import string_to_index, vector_assembler
 from optimus.spark import Spark
 
 
@@ -50,7 +51,7 @@ def random_forest(df, columns, input_col, **kwargs):
 
     model = RandomForestClassifier(**kwargs)
     df.table()
-    df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
+    df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")
 
     rf_model = model.fit(df)
     df_model = rf_model.transform(df)
@@ -83,7 +84,7 @@ def decision_tree(df, columns, input_col, **kwargs):
 
     model = DecisionTreeClassifier(**kwargs)
 
-    df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
+    df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")
 
     dt_model = model.fit(df)
     df_model = dt_model.transform(df)
@@ -116,7 +117,7 @@ def gbt(df, columns, input_col, **kwargs):
 
     model = GBTClassifier(**kwargs)
 
-    df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
+    df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")
 
     gbt_model = model.fit(df)
     df_model = gbt_model.transform(df)
@@ -133,7 +134,7 @@ def h2o_automl(df, label, columns, **kwargs):
                        maxRuntimeSecs=60,  # 1 minutes
                        seed=1,
                        maxModels=3,
-                       labelCol=name_col(label, "index_to_string"),
+                       labelCol=name_col(label, STRING_TO_INDEX),
                        **kwargs)
 
     model = automl.fit(df_va)