Skip to content

Commit

Permalink
Handling string to index name as constant
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Dec 13, 2019
1 parent ec85106 commit cd00cbe
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 6 deletions.
3 changes: 2 additions & 1 deletion optimus/ml/contants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@
FINGERPRINT_COL = "fingerprint"
NGRAM_FINGERPRINT_COL = "ngram_fingerprint"

LEVENSHTEIN_DISTANCE = "LEVENSHTEIN_DISTANCE"
LEVENSHTEIN_DISTANCE = "LEVENSHTEIN_DISTANCE"
STRING_TO_INDEX = "string_to_index"
3 changes: 2 additions & 1 deletion optimus/ml/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from optimus.helpers.constants import Actions
from optimus.helpers.raiseit import RaiseIt
from optimus.infer import is_, is_str, is_dataframe
from optimus.ml.contants import STRING_TO_INDEX


def n_gram(df, input_col, n=2):
Expand Down Expand Up @@ -44,7 +45,7 @@ def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs):
if columns is None:
input_cols = parse_columns(df, input_cols)
if output_cols is None:
output_cols = [name_col(input_col, "string_to_index") for input_col in input_cols]
output_cols = [name_col(input_col, STRING_TO_INDEX) for input_col in input_cols]
output_cols = get_output_cols(input_cols, output_cols)
else:
input_cols, output_cols = zip(*columns)
Expand Down
9 changes: 5 additions & 4 deletions optimus/ml/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from optimus.infer import is_str, is_dataframe
from optimus.helpers.columns import parse_columns, name_col
from optimus.ml.contants import STRING_TO_INDEX
from optimus.ml.encoding import string_to_index, vector_assembler
from optimus.spark import Spark

Expand Down Expand Up @@ -50,7 +51,7 @@ def random_forest(df, columns, input_col, **kwargs):

model = RandomForestClassifier(**kwargs)
df.table()
df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")

rf_model = model.fit(df)
df_model = rf_model.transform(df)
Expand Down Expand Up @@ -83,7 +84,7 @@ def decision_tree(df, columns, input_col, **kwargs):

model = DecisionTreeClassifier(**kwargs)

df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")

dt_model = model.fit(df)
df_model = dt_model.transform(df)
Expand Down Expand Up @@ -116,7 +117,7 @@ def gbt(df, columns, input_col, **kwargs):

model = GBTClassifier(**kwargs)

df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
df = df.cols.rename(name_col(input_col, STRING_TO_INDEX), "label")

gbt_model = model.fit(df)
df_model = gbt_model.transform(df)
Expand All @@ -133,7 +134,7 @@ def h2o_automl(df, label, columns, **kwargs):
maxRuntimeSecs=60, # 1 minute
seed=1,
maxModels=3,
labelCol=name_col(label, "index_to_string"),
labelCol=name_col(label, STRING_TO_INDEX),
**kwargs)

model = automl.fit(df_va)
Expand Down

0 comments on commit cd00cbe

Please sign in to comment.