added restrictions to the use of pynisher to avoid stalling executions
fmohr committed May 13, 2024
1 parent 8ef7f44 commit a6e5c3b
Showing 3 changed files with 43 additions and 6 deletions.
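
In short, the change makes the pynisher timeout opt-in per candidate pipeline: a timeout is only set up for pipelines that are expected to run long (SVC/LinearSVC learners, or kernel- and polynomial-based feature preprocessors), and pipelines that would need a timeout but whose learner is on the new prohibited list (LDA, the MLPs, histogram gradient boosting) are not evaluated at all, just like pipelines rejected by is_pipeline_forbidden. A minimal sketch of the resulting decision, where selector is a hypothetical stand-in for the algorithm selector from sklearn.py:

def choose_timeout(selector, pl, remaining_time, deadline):
    """Sketch only; mirrors the gating added to run(), not the verbatim implementation."""
    needs_timeout = selector.is_timeout_required(pl)
    # A pipeline that would need a guard but whose learner is prohibited for
    # timeouts is skipped entirely ("avoided") instead of being evaluated.
    if needs_timeout and selector.is_pl_prohibited_for_timeout(pl):
        return "avoided", None
    if needs_timeout and selector.task.timeout_candidate is not None:
        # Cap the per-candidate timeout by the time left until the overall deadline.
        return "evaluate", min(selector.task.timeout_candidate,
                               remaining_time if deadline is not None else 10 ** 10)
    # Everything else runs without a pynisher guard.
    return "evaluate", None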
python/naiveautoml/algorithm_selection/_sklearn_factory.py (3 changes: 2 additions & 1 deletion)
@@ -917,7 +917,8 @@ def is_pipeline_forbidden(task, pl):
             sklearn.preprocessing.RobustScaler,
             sklearn.preprocessing.StandardScaler,
             sklearn.preprocessing.MinMaxScaler,
-            sklearn.preprocessing.QuantileTransformer
+            sklearn.preprocessing.QuantileTransformer,
+            sklearn.preprocessing.PowerTransformer
         ]:
             return True  # scaling has no effect on tree-based classifiers

python/naiveautoml/algorithm_selection/sklearn.py (40 changes: 36 additions & 4 deletions)
@@ -1,3 +1,5 @@
+import sklearn.svm
+
 from .._interfaces import AlgorithmSelector, SupervisedTask

 # core stuff
@@ -174,16 +176,19 @@ def run(self, deadline=None):
                 pl = self.get_pipeline_for_decision_in_step(step_name, comp, self.task.X, self.task.y, decisions)

                 eval_start_time = time.time()
-                if is_pipeline_forbidden(self.task, pl):
+                if (
+                        is_pipeline_forbidden(self.task, pl) or
+                        (self.is_timeout_required(pl) and self.is_pl_prohibited_for_timeout(pl))
+                ):
                     self.logger.info(f"Preventing evaluation of forbidden pipeline {pl}")
                     scores = {scoring["name"]: np.nan for scoring in [self.task.scoring] + self.task.passive_scorings}
                     self.evaluator.tellEvaluation(pl, [scores[self.task.scoring["name"]]], None, time.time())
                     status, scores, evaluation_report, exception = "avoided", scores, None, None
                 else:
-                    if self.task.timeout_candidate is None:
-                        timeout = None
-                    else:
+                    if self.is_timeout_required(pl) and self.task.timeout_candidate is not None:
                         timeout = min(self.task.timeout_candidate, remaining_time if deadline is not None else 10 ** 10)
+                    else:
+                        timeout = None
                     status, scores, evaluation_report, exception = self.evaluator.evaluate(pl, timeout)

                 runtime = time.time() - eval_start_time
@@ -412,6 +417,33 @@ def get_mandatory_preprocessing(self, X=None, y=None, categorical_features=None)
         except Exception:
             raise

+    def is_pl_prohibited_for_timeout(self, pl):
+        learner = pl["learner"]
+        if (
+                isinstance(learner, sklearn.discriminant_analysis.LinearDiscriminantAnalysis) or
+                isinstance(learner, sklearn.neural_network.MLPClassifier) or
+                isinstance(learner, sklearn.neural_network.MLPRegressor) or
+                isinstance(learner, sklearn.ensemble.HistGradientBoostingRegressor) or
+                isinstance(learner, sklearn.ensemble.HistGradientBoostingClassifier)
+        ):
+            return True
+        return False
+
+    def is_timeout_required(self, pl):
+        learner = pl["learner"]
+        if isinstance(learner, sklearn.svm.SVC) or isinstance(learner, sklearn.svm.LinearSVC):
+            return True
+        if "feature-pre-processor" in [e[0] for e in pl.steps]:
+            feature_pp = [e[1] for e in pl.steps if e[0] == "feature-pre-processor"][0]
+            if (
+                    isinstance(feature_pp, sklearn.decomposition.KernelPCA) or
+                    isinstance(feature_pp, sklearn.kernel_approximation.RBFSampler) or
+                    isinstance(feature_pp, sklearn.kernel_approximation.Nystroem) or
+                    isinstance(feature_pp, sklearn.preprocessing.PolynomialFeatures)
+            ):
+                return True
+        return False
+
     def get_standard_learner_instance(self, X, y):
         return self.standard_classifier() if self.inferred_task_type == "classification" else self.standard_regressor()

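For illustration only (not part of the commit), here is how the two new predicates classify a couple of concrete pipelines, assuming the step names "feature-pre-processor" and "learner" used throughout this module:

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem

svc_pl = Pipeline([("learner", SVC())])
mlp_kernel_pl = Pipeline([("feature-pre-processor", Nystroem()), ("learner", MLPClassifier())])

# is_timeout_required(svc_pl)                 -> True  (SVC can run very long, so a timeout is set up)
# is_pl_prohibited_for_timeout(svc_pl)        -> False (evaluation proceeds under that timeout)
# is_timeout_required(mlp_kernel_pl)          -> True  (kernel approximation in the preprocessing step)
# is_pl_prohibited_for_timeout(mlp_kernel_pl) -> True  (MLP learner, so run() avoids this pipeline altogether)
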
python/test/test_naiveautoml.py (6 changes: 5 additions & 1 deletion)
@@ -10,6 +10,7 @@
 import pandas as pd

 from typing import Callable
+import gc


 def get_dataset(openmlid, as_numpy = True):
@@ -311,7 +312,7 @@ def test_constant_algorithms_in_hpo_phase(self):

     @parameterized.expand([
         (61, 30, 10, 0.9), # on a fast machine, iris can be executed in 10s, but on slow machines it takes longer
-        (6, 300, 30, 0.96), # letter
+        (6, 300, 20, 0.96), # letter
         (188, 60, 10, 0.5), # eucalyptus. Very important because has both missing values and categorical attributes
         #(1485, 240, 0.82),
         #(1515, 240, 0.85),
@@ -356,6 +357,9 @@ def test_naml_results_classification(self, openmlid, exp_runtime_per_seed, timeo
             score = sklearn.metrics.accuracy_score(y_test, y_hat)
             scores.append(score)
             self.logger.info(f"finished test on seed {seed}. Test score for this run is {score}")
+
+            del naml
+            gc.collect()

         # check conditions
         runtime_mean = int(np.round(np.mean(runtimes)))
