From a6e5c3b25745bed2cacf5e884d947987e0497946 Mon Sep 17 00:00:00 2001
From: Felix <fmohr@mail.upb.de>
Date: Mon, 13 May 2024 06:19:02 +0200
Subject: [PATCH] Restrict the use of pynisher and avoid stalling
 executions

---
 .../algorithm_selection/_sklearn_factory.py   |  3 +-
 .../algorithm_selection/sklearn.py            | 40 +++++++++++++++++--
 python/test/test_naiveautoml.py               |  6 ++-
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/python/naiveautoml/algorithm_selection/_sklearn_factory.py b/python/naiveautoml/algorithm_selection/_sklearn_factory.py
index 905d193..019e0ab 100644
--- a/python/naiveautoml/algorithm_selection/_sklearn_factory.py
+++ b/python/naiveautoml/algorithm_selection/_sklearn_factory.py
@@ -917,7 +917,8 @@ def is_pipeline_forbidden(task, pl):
             sklearn.preprocessing.RobustScaler,
             sklearn.preprocessing.StandardScaler,
             sklearn.preprocessing.MinMaxScaler,
-            sklearn.preprocessing.QuantileTransformer
+            sklearn.preprocessing.QuantileTransformer,
+            sklearn.preprocessing.PowerTransformer
         ]:
             return True  # scaling has no effect onf tree-based classifiers
 
diff --git a/python/naiveautoml/algorithm_selection/sklearn.py b/python/naiveautoml/algorithm_selection/sklearn.py
index 06b8ca4..afaf640 100644
--- a/python/naiveautoml/algorithm_selection/sklearn.py
+++ b/python/naiveautoml/algorithm_selection/sklearn.py
@@ -1,3 +1,5 @@
+import sklearn.svm
+
 from .._interfaces import AlgorithmSelector, SupervisedTask
 
 # core stuff
@@ -174,16 +176,19 @@ def run(self, deadline=None):
                 pl = self.get_pipeline_for_decision_in_step(step_name, comp, self.task.X, self.task.y, decisions)
 
                 eval_start_time = time.time()
-                if is_pipeline_forbidden(self.task, pl):
+                if (
+                        is_pipeline_forbidden(self.task, pl) or
+                        (self.is_timeout_required(pl) and self.is_pl_prohibited_for_timeout(pl))
+                ):
                     self.logger.info(f"Preventing evaluation of forbidden pipeline {pl}")
                     scores = {scoring["name"]: np.nan for scoring in [self.task.scoring] + self.task.passive_scorings}
                     self.evaluator.tellEvaluation(pl, [scores[self.task.scoring["name"]]], None, time.time())
                     status, scores, evaluation_report, exception = "avoided", scores, None, None
                 else:
-                    if self.task.timeout_candidate is None:
-                        timeout = None
-                    else:
+                    if self.is_timeout_required(pl) and self.task.timeout_candidate is not None:
                         timeout = min(self.task.timeout_candidate, remaining_time if deadline is not None else 10 ** 10)
+                    else:
+                        timeout = None
                     status, scores, evaluation_report, exception = self.evaluator.evaluate(pl, timeout)
 
                 runtime = time.time() - eval_start_time
@@ -412,6 +417,33 @@ def get_mandatory_preprocessing(self, X=None, y=None, categorical_features=None)
         except Exception:
             raise
 
+    def is_pl_prohibited_for_timeout(self, pl):
+        """Return True iff the "learner" step of `pl` is an LDA, MLP, or HistGradientBoosting estimator."""
+        estimator = pl["learner"]
+        # the chain short-circuits on the first matching estimator type
+        return (
+            isinstance(estimator, sklearn.discriminant_analysis.LinearDiscriminantAnalysis) or
+            isinstance(estimator, sklearn.neural_network.MLPClassifier) or
+            isinstance(estimator, sklearn.neural_network.MLPRegressor) or
+            isinstance(estimator, sklearn.ensemble.HistGradientBoostingRegressor) or
+            isinstance(estimator, sklearn.ensemble.HistGradientBoostingClassifier)
+        )
+
+    def is_timeout_required(self, pl):
+        """Check for an SVC/LinearSVC learner or a KernelPCA/RBFSampler/Nystroem/PolynomialFeatures pre-processor."""
+        if isinstance(pl["learner"], (sklearn.svm.SVC, sklearn.svm.LinearSVC)):
+            return True
+        # first step registered under the name "feature-pre-processor" (None if the pipeline has no such step)
+        feature_pp = next((est for name, est in pl.steps if name == "feature-pre-processor"), None)
+        if feature_pp is None:
+            return False
+        return (
+            isinstance(feature_pp, sklearn.decomposition.KernelPCA) or
+            isinstance(feature_pp, sklearn.kernel_approximation.RBFSampler) or
+            isinstance(feature_pp, sklearn.kernel_approximation.Nystroem) or
+            isinstance(feature_pp, sklearn.preprocessing.PolynomialFeatures)
+        )
+
     def get_standard_learner_instance(self, X, y):
         return self.standard_classifier() if self.inferred_task_type == "classification" else self.standard_regressor()
 
diff --git a/python/test/test_naiveautoml.py b/python/test/test_naiveautoml.py
index 48a462f..c8410ee 100644
--- a/python/test/test_naiveautoml.py
+++ b/python/test/test_naiveautoml.py
@@ -10,6 +10,7 @@
 import pandas as pd
 
 from typing import Callable
+import gc
 
 
 def get_dataset(openmlid, as_numpy = True):
@@ -311,7 +312,7 @@ def test_constant_algorithms_in_hpo_phase(self):
 
     @parameterized.expand([
             (61, 30, 10, 0.9),  # on a fast machine, iris can be executed in 10s, but on slow machines it takes longer
-            (6, 300, 30, 0.96),  # letter
+            (6, 300, 20, 0.96),  # letter
             (188, 60, 10, 0.5),  # eucalyptus. Very important because has both missing values and categorical attributes
             #(1485, 240, 0.82),
             #(1515, 240, 0.85),
@@ -356,6 +357,9 @@ def test_naml_results_classification(self, openmlid, exp_runtime_per_seed, timeo
             score = sklearn.metrics.accuracy_score(y_test, y_hat)
             scores.append(score)
             self.logger.info(f"finished test on seed {seed}. Test score for this run is {score}")
+
+            del naml
+            gc.collect()
             
         # check conditions
         runtime_mean = int(np.round(np.mean(runtimes)))