Skip to content

Commit

Permalink
Quality improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Nov 9, 2019
1 parent 00b6ba2 commit 956727f
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 10 deletions.
10 changes: 8 additions & 2 deletions optimus/dataframe/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -1620,11 +1620,17 @@ def hist(columns, buckets=20):
result = agg_exprs(columns, hist_agg, self, buckets)
# TODO: for some reason, casting to int in the exprs does not work. Casting here. A Spark bug?
# Example
# Column < b'array(map(count, CAST(sum(CASE WHEN ((rank >= 7) AND (rank < 7.75)) THEN 1 ELSE 0 END) AS INT), lower, 7, upper, 7.75) AS `hist_agg_rank_0`, map(count, CAST(sum(CASE WHEN ((rank >= 7.75) AND (rank < 8.5)) THEN 1 ELSE 0 END) AS INT), lower, 7.75, upper, 8.5) AS `hist_agg_rank_1`, map(count, CAST(sum(CASE WHEN ((rank >= 8.5) AND (rank < 9.25)) THEN 1 ELSE 0 END) AS INT), lower, 8.5, upper, 9.25) AS `hist_agg_rank_2`, map(count, CAST(sum(CASE WHEN ((rank >= 9.25) AND (rank < 10)) THEN 1 ELSE 0 END) AS INT), lower, 9.25, upper, 10) AS `hist_agg_rank_3`) AS `histrank`' >
# Column < b'array(map(count, CAST(sum(CASE WHEN ((rank >= 7) AND (rank < 7.75)) THEN 1 ELSE 0 END) AS INT),
# lower, 7, upper, 7.75) AS `hist_agg_rank_0`, map(count, CAST(sum(CASE WHEN ((rank >= 7.75) AND (rank < 8.5))
# THEN 1 ELSE 0 END) AS INT), lower, 7.75, upper, 8.5) AS `hist_agg_rank_1`, map(count,
# CAST(sum(CASE WHEN ((rank >= 8.5) AND (rank < 9.25)) THEN 1 ELSE 0 END) AS INT), lower, 8.5, upper, 9.25)
# AS `hist_agg_rank_2`, map(count, CAST(sum(CASE WHEN ((rank >= 9.25) AND (rank < 10))
# THEN 1 ELSE 0 END) AS INT), lower, 9.25, upper, 10) AS `hist_agg_rank_3`) AS `histrank`' >

return result

# TODO: In tests this code run faster than using agg_exprs when run over all the columns. Not when running over columns individually
# TODO: In tests this code runs faster than using agg_exprs when run over all the columns,
# but not when running over columns individually.
# columns = parse_columns(self, columns)
# df = self
# for col_name in columns:
Expand Down
2 changes: 1 addition & 1 deletion optimus/dataframe/rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from optimus.helpers.columns import parse_columns, validate_columns_names
from optimus.helpers.constants import Actions
from optimus.helpers.converter import one_list_to_val
from optimus.helpers.decorators import *
from optimus.helpers.decorators import add_attr
from optimus.helpers.functions import append as append_df
from optimus.helpers.raiseit import RaiseIt

Expand Down
3 changes: 2 additions & 1 deletion optimus/helpers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def parse_col_names_funcs_to_keys(data):
_col_name = k[len(temp_func_name):]
if is_nan(v):
logger.print(
"'{FUNCTION}' function in '{COL_NAME}' column is returning 'nan'. Is that what you expected?. Seems that '{COL_NAME}' has 'nan' values".format(
"'{FUNCTION}' function in '{COL_NAME}' column is returning 'nan'. Is that what you expected?. "
"Seems that '{COL_NAME}' has 'nan' values".format(
FUNCTION=f,
COL_NAME=_col_name))
# If the value is numeric only get 5 decimals
Expand Down
10 changes: 5 additions & 5 deletions optimus/ml/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pyspark.ml import feature, classification
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
from pyspark.sql.functions import *
from pyspark.sql.functions import F
from pysparkling import *
from pysparkling.ml import H2OAutoML, H2ODeepLearning, H2OXGBoost, H2OGBM

Expand Down Expand Up @@ -139,7 +139,7 @@ def h2o_automl(df, label, columns, **kwargs):
model = automl.fit(df_va)
df_raw = model.transform(df_va)

df_pred = df_raw.withColumn("prediction", when(df_raw.prediction_output["value"] > 0.5, 1.0).otherwise(0.0))
df_pred = df_raw.withColumn("prediction", F.when(df_raw.prediction_output["value"] > 0.5, 1.0).otherwise(0.0))

return df_pred, model

Expand All @@ -161,7 +161,7 @@ def h2o_deeplearning(df, label, columns, **kwargs):
model = h2o_deeplearning.fit(df_va)
df_raw = model.transform(df_va)

df_pred = df_raw.withColumn("prediction", when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))
df_pred = df_raw.withColumn("prediction", F.when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

return df_pred, model

Expand All @@ -179,7 +179,7 @@ def h2o_xgboost(df, label, columns, **kwargs):
model = h2o_xgboost.fit(df_va)
df_raw = model.transform(df_va)

df_pred = df_raw.withColumn("prediction", when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))
df_pred = df_raw.withColumn("prediction", F.when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

return df_pred, model

Expand All @@ -198,6 +198,6 @@ def h2o_gbm(df, label, columns, **kwargs):
model = h2o_gbm.fit(df_va)
df_raw = model.transform(df_va)

df_pred = df_raw.withColumn("prediction", when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))
df_pred = df_raw.withColumn("prediction", F.when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

return df_pred, model
3 changes: 2 additions & 1 deletion optimus/profiler/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import math

from optimus.helpers.constants import *
from optimus.helpers.constants import SPARK_DTYPES_TO_PROFILER, ProfilerDataTypes, PROFILER_COLUMN_TYPES, \
CONFIDENCE_LEVEL_CONSTANT
from optimus.helpers.json import json_converter


Expand Down

0 comments on commit 956727f

Please sign in to comment.