From a6e5c3b25745bed2cacf5e884d947987e0497946 Mon Sep 17 00:00:00 2001 From: Felix Date: Mon, 13 May 2024 06:19:02 +0200 Subject: [PATCH] added restrictions to the use of pynisher and avoid stalling executions --- .../algorithm_selection/_sklearn_factory.py | 3 +- .../algorithm_selection/sklearn.py | 40 +++++++++++++++++-- python/test/test_naiveautoml.py | 6 ++- 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/python/naiveautoml/algorithm_selection/_sklearn_factory.py b/python/naiveautoml/algorithm_selection/_sklearn_factory.py index 905d193..019e0ab 100644 --- a/python/naiveautoml/algorithm_selection/_sklearn_factory.py +++ b/python/naiveautoml/algorithm_selection/_sklearn_factory.py @@ -917,7 +917,8 @@ def is_pipeline_forbidden(task, pl): sklearn.preprocessing.RobustScaler, sklearn.preprocessing.StandardScaler, sklearn.preprocessing.MinMaxScaler, - sklearn.preprocessing.QuantileTransformer + sklearn.preprocessing.QuantileTransformer, + sklearn.preprocessing.PowerTransformer ]: return True # scaling has no effect onf tree-based classifiers diff --git a/python/naiveautoml/algorithm_selection/sklearn.py b/python/naiveautoml/algorithm_selection/sklearn.py index 06b8ca4..afaf640 100644 --- a/python/naiveautoml/algorithm_selection/sklearn.py +++ b/python/naiveautoml/algorithm_selection/sklearn.py @@ -1,3 +1,5 @@ +import sklearn.svm + from .._interfaces import AlgorithmSelector, SupervisedTask # core stuff @@ -174,16 +176,19 @@ def run(self, deadline=None): pl = self.get_pipeline_for_decision_in_step(step_name, comp, self.task.X, self.task.y, decisions) eval_start_time = time.time() - if is_pipeline_forbidden(self.task, pl): + if ( + is_pipeline_forbidden(self.task, pl) or + (self.is_timeout_required(pl) and self.is_pl_prohibited_for_timeout(pl)) + ): self.logger.info(f"Preventing evaluation of forbidden pipeline {pl}") scores = {scoring["name"]: np.nan for scoring in [self.task.scoring] + self.task.passive_scorings} self.evaluator.tellEvaluation(pl, [scores[self.task.scoring["name"]]], None, time.time()) status, scores, evaluation_report, exception = "avoided", scores, None, None else: - if self.task.timeout_candidate is None: - timeout = None - else: + if self.is_timeout_required(pl) and self.task.timeout_candidate is not None: timeout = min(self.task.timeout_candidate, remaining_time if deadline is not None else 10 ** 10) + else: + timeout = None status, scores, evaluation_report, exception = self.evaluator.evaluate(pl, timeout) runtime = time.time() - eval_start_time @@ -412,6 +417,33 @@ def get_mandatory_preprocessing(self, X=None, y=None, categorical_features=None) except Exception: raise + def is_pl_prohibited_for_timeout(self, pl): + learner = pl["learner"] + if ( + isinstance(learner, sklearn.discriminant_analysis.LinearDiscriminantAnalysis) or + isinstance(learner, sklearn.neural_network.MLPClassifier) or + isinstance(learner, sklearn.neural_network.MLPRegressor) or + isinstance(learner, sklearn.ensemble.HistGradientBoostingRegressor) or + isinstance(learner, sklearn.ensemble.HistGradientBoostingClassifier) + ): + return True + return False + + def is_timeout_required(self, pl): + learner = pl["learner"] + if isinstance(learner, sklearn.svm.SVC) or isinstance(learner, sklearn.svm.LinearSVC): + return True + if "feature-pre-processor" in [e[0] for e in pl.steps]: + feature_pp = [e[1] for e in pl.steps if e[0] == "feature-pre-processor"][0] + if ( + isinstance(feature_pp, sklearn.decomposition.KernelPCA) or + isinstance(feature_pp, sklearn.kernel_approximation.RBFSampler) or + isinstance(feature_pp, sklearn.kernel_approximation.Nystroem) or + isinstance(feature_pp, sklearn.preprocessing.PolynomialFeatures) + ): + return True + return False + def get_standard_learner_instance(self, X, y): return self.standard_classifier() if self.inferred_task_type == "classification" else self.standard_regressor() diff --git a/python/test/test_naiveautoml.py b/python/test/test_naiveautoml.py index 48a462f..c8410ee 100644 --- a/python/test/test_naiveautoml.py +++ b/python/test/test_naiveautoml.py @@ -10,6 +10,7 @@ import pandas as pd from typing import Callable +import gc def get_dataset(openmlid, as_numpy = True): @@ -311,7 +312,7 @@ def test_constant_algorithms_in_hpo_phase(self): @parameterized.expand([ (61, 30, 10, 0.9), # on a fast machine, iris can be executed in 10s, but on slow machines it takes longer - (6, 300, 30, 0.96), # letter + (6, 300, 20, 0.96), # letter (188, 60, 10, 0.5), # eucalyptus. Very important because has both missing values and categorical attributes #(1485, 240, 0.82), #(1515, 240, 0.85), @@ -356,6 +357,9 @@ def test_naml_results_classification(self, openmlid, exp_runtime_per_seed, timeo score = sklearn.metrics.accuracy_score(y_test, y_hat) scores.append(score) self.logger.info(f"finished test on seed {seed}. Test score for this run is {score}") + + del naml + gc.collect() # check conditions runtime_mean = int(np.round(np.mean(runtimes)))