enabled custom evaluators with states via update function #52

Merged · 3 commits · Mar 20, 2024
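
For context, this is roughly how the new stateful-evaluator hook is meant to be used: evaluate() in commons.py now calls an optional update(pl, scores) method on the evaluator after every evaluation, and reports all-NaN scores to that method when the evaluation raises. The sketch below is modelled on the Evaluator class added to the test suite further down; the class name and the commented-out fit call are illustrative, not part of the library API.

import numpy as np
import sklearn.model_selection
import naiveautoml


class HistoryEvaluator:
    """Stateful evaluator that remembers every (pipeline, scores) pair it is told about."""

    def __init__(self):
        self.history = []

    def __call__(self, pl, X, y, scoring_functions):
        # cross-validate the candidate pipeline for every requested scoring function
        return {
            s: np.mean(sklearn.model_selection.cross_validate(pl, X, y, scoring=s)["test_score"])
            for s in scoring_functions
        }

    def update(self, pl, results):
        # called back by naiveautoml after each evaluation; failed evaluations arrive with NaN scores
        self.history.append([pl, results])


evaluator = HistoryEvaluator()
naml = naiveautoml.NaiveAutoML(evaluation_fun=evaluator, max_hpo_iterations=10)
# naml.fit(X, y)  # afterwards, evaluator.history parallels naml.history
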
122 changes: 66 additions & 56 deletions python/naiveautoml/commons.py
@@ -220,66 +220,76 @@ def evaluate(self, pl, timeout=None):
warnings.filterwarnings('ignore', module='sklearn')
warnings.filterwarnings('ignore', module='numpy')

if is_pipeline_forbidden(pl):
self.logger.info(f"Preventing evaluation of forbidden pipeline {pl}")
return {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
try:

process = psutil.Process(os.getpid())
mem = int(process.memory_info().rss / 1024 / 1024)
self.logger.info(f"Initializing evaluation of {pl}. Current memory consumption {mem}MB. Now awaiting results.")
if is_pipeline_forbidden(pl):
self.logger.info(f"Preventing evaluation of forbidden pipeline {pl}")
return {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}

start_outer = time.time()
spl = str(pl)
if self.use_caching and spl in self.cache:
out = {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
out[get_scoring_name(self.scoring)] = np.round(np.mean(self.cache[spl][1]), 4)
return out
timestamp = time.time()
if timeout is not None:
if timeout > 1:
with pynisher.limit(self.evaluation_fun, wall_time=timeout) as limited_evaluation:
if hasattr(self.evaluation_fun, "errors"):
scores = limited_evaluation(
pl,
self.X,
self.y,
[self.scoring] + self.side_scores,
errors="ignore"
)
else:
scores = limited_evaluation(
pl,
self.X,
self.y,
[self.scoring] + self.side_scores
)
else: # no time left
scores = None
else:
scores = self.evaluation_fun(pl, self.X, self.y, [self.scoring] + self.side_scores)
if scores is None:
return {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
runtime = time.time() - start_outer

# if scores is a 2-tuple, it is assumed that the evaluator object returned itself (in an altered version)
if isinstance(scores, tuple):
if not isinstance(scores[1], type(self.evaluation_fun)):
raise ValueError(
"If an evaluation function returns an object in its second output,"
"the type must coincide to the previous one!"
)
self.evaluation_fun = scores[1]
scores = scores[0]
process = psutil.Process(os.getpid())
mem = int(process.memory_info().rss / 1024 / 1024)
self.logger.info(
f"Initializing evaluation of {pl}. Current memory consumption {mem}MB. Now awaiting results."
)

if not isinstance(scores, dict):
raise TypeError(f"""
scores is of type {type(scores)} but must be a dictionary with entries for {get_scoring_name(self.scoring)}.
Probably you inserted an evaluation_fun argument that does not return a proper dictionary."""
start_outer = time.time()
spl = str(pl)
if self.use_caching and spl in self.cache:
out = {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
out[get_scoring_name(self.scoring)] = np.round(np.mean(self.cache[spl][1]), 4)
return out
timestamp = time.time()
if timeout is not None:
if timeout > 1:
with pynisher.limit(self.evaluation_fun, wall_time=timeout) as limited_evaluation:
if hasattr(self.evaluation_fun, "errors"):
scores = limited_evaluation(
pl,
self.X,
self.y,
[self.scoring] + self.side_scores,
errors="ignore"
)

self.logger.info(f"Completed evaluation of {spl} after {runtime}s. Scores are {scores}")
self.tellEvaluation(pl, scores[get_scoring_name(self.scoring)], timestamp)
return {scoring: np.round(np.mean(scores[scoring]), 4) for scoring in scores}
else:
scores = limited_evaluation(
pl,
self.X,
self.y,
[self.scoring] + self.side_scores
)
else: # no time left
scores = None
else:
scores = self.evaluation_fun(pl, self.X, self.y, [self.scoring] + self.side_scores)

# here we give the evaluator the chance to update itself
# this looks funny, but it is done because the evaluation could have been done with a copy of the evaluator
if hasattr(self.evaluation_fun, "update"):
self.evaluation_fun.update(pl, scores)

# if no score was observed, return results here
if scores is None:
return {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
runtime = time.time() - start_outer

if not isinstance(scores, dict):
raise TypeError(f"""
scores is of type {type(scores)} but must be a dictionary
with entries for {get_scoring_name(self.scoring)}. Probably you inserted an
evaluation_fun argument that does not return a proper dictionary."""
)

self.logger.info(f"Completed evaluation of {spl} after {runtime}s. Scores are {scores}")
self.tellEvaluation(pl, scores[get_scoring_name(self.scoring)], timestamp)
return {scoring: np.round(np.mean(scores[scoring]), 4) for scoring in scores}

# if there was an exception, then tell the evaluator function about a nan
except Exception:
if hasattr(self.evaluation_fun, "update"):
self.evaluation_fun.update(pl, {
get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores
})
raise


def fullname(o):
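
Besides the new update hook, the rewritten evaluate() keeps the existing probe hasattr(self.evaluation_fun, "errors"): if the attribute is present, the time-limited call additionally passes errors="ignore". Below is a minimal sketch of a function-style evaluator that opts into this, assuming the attribute-based wiring shown in the hunk above; the error_score mapping is an illustration, not library-mandated behaviour.

import numpy as np
import sklearn.model_selection


def lenient_evaluation(pl, X, y, scoring_functions, errors="raise"):
    # aggregate cross-validation scores; with errors="ignore", failing folds score NaN instead of raising
    error_score = np.nan if errors == "ignore" else "raise"
    return {
        s: np.mean(
            sklearn.model_selection.cross_validate(pl, X, y, scoring=s, error_score=error_score)["test_score"]
        )
        for s in scoring_functions
    }


# a plain function can expose the attribute that hasattr(evaluation_fun, "errors") checks for
lenient_evaluation.errors = "ignore"
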
1 change: 0 additions & 1 deletion python/naiveautoml/naiveautoml.py
@@ -714,7 +714,6 @@ def eval_history(self, X, y):
return scores

def predict(self, X):
print(self.pl)
return self.pl.predict(X)

def predict_proba(self, X):
2 changes: 1 addition & 1 deletion python/naiveautoml/searchspace-classification.json
@@ -767,7 +767,7 @@
{
"name": "loss",
"type": "constant",
"value": "auto"
"value": "log_loss"
},
{
"name": "max_bins",
2 changes: 1 addition & 1 deletion python/setup.py
@@ -13,7 +13,7 @@
install_requires=[
'numpy',
'pandas',
'scikit-learn',
'scikit-learn>=1.4',
'configspace<0.7.1',
'scipy',
'pynisher',
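
The two small changes above belong together: newer scikit-learn releases no longer accept the deprecated loss="auto" for histogram gradient boosting, so the search-space constant becomes "log_loss" and the dependency floor is raised to scikit-learn>=1.4. Assuming the search-space entry is passed to sklearn's estimator unchanged, it amounts to the following (illustrative) construction.

from sklearn.ensemble import HistGradientBoostingClassifier

# "log_loss" is the accepted value in current scikit-learn; "auto" was deprecated and later removed
clf = HistGradientBoostingClassifier(loss="log_loss")
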
102 changes: 96 additions & 6 deletions python/test/test_naiveautoml.py
@@ -16,6 +16,8 @@
import openml
import pandas as pd

from typing import Callable


def get_dataset(openmlid, as_numpy = True):
ds = openml.datasets.get_dataset(openmlid)
@@ -32,10 +34,11 @@ def get_dataset(openmlid, as_numpy = True):
y = y.values
print(f"Data is of shape {X.shape}.")
return X, y


class TestNaiveAutoML(unittest.TestCase):

@staticmethod
def setUpClass():
# setup logger for this test suite
logger = logging.getLogger('naml_test')
@@ -225,7 +228,7 @@ def test_naml_results_classification(self, openmlid, exp_runtime, exp_result):
@parameterized.expand([
(41021, 120, 650), # moneyball
#(183, 260, 15), # abalone
(212, 120, 15) # diabetes, has decimal targets
(212, 120, 15) # diabetes, has decimal targets

])
def test_naml_results_regression(self, openmlid, exp_runtime, exp_result):
@@ -245,7 +248,7 @@ def test_naml_results_regression(self, openmlid, exp_runtime, exp_result):
start = time.time()
naml = naiveautoml.NaiveAutoML(
logger_name="naml",
timeout=120,
timeout=75,
max_hpo_iterations=10,
show_progress=True,
task_type="regression",
@@ -321,7 +324,13 @@ def test_individual_scoring(self, openmlid, exp_runtime, exp_result):

# run naml
start = time.time()
naml = naiveautoml.NaiveAutoML(logger_name="naml", max_hpo_iterations=10, show_progress=True, scoring = scoring1, side_scores=[scoring2])
naml = naiveautoml.NaiveAutoML(
logger_name="naml",
max_hpo_iterations=10,
show_progress=True,
scoring = scoring1,
side_scores=[scoring2]
)
naml.fit(X_train, y_train)
end = time.time()
runtime = end - start
@@ -394,4 +403,85 @@ def evaluation(pl, X, y, scoring_functions):
score_mean = np.round(np.mean(scores), 2)
self.assertTrue(runtime_mean <= exp_runtime, msg=f"Permitted runtime exceeded. Expected was {exp_runtime}s but true runtime was {runtime_mean}")
self.assertTrue(score_mean >= exp_result, msg=f"Returned solution was bad. Expected was at least {exp_result} but true avg score was {score_mean}")
self.logger.info(f"Test on dataset {openmlid} finished. Mean runtimes was {runtime_mean}s and avg accuracy was {score_mean}")
self.logger.info(f"Test on dataset {openmlid} finished. Mean runtimes was {runtime_mean}s and avg accuracy was {score_mean}")


@parameterized.expand([
(61, 30, 0.9),
# (188, 60, 0.5), # eucalyptus. Very important because has both missing values and categorical attributes
# (1485, 240, 0.82),
# (1515, 240, 0.85),
# (1468, 120, 0.94),
# (1489, 180, 0.89),
# (23512, 600, 0.65),
# (23517, 600, 0.5),
# (4534, 180, 0.92),
# (4538, 400, 0.66),
# (4134, 400, 0.79),

])
def test_individual_stateful_evaluation(self, openmlid, exp_runtime, exp_result):
X, y = get_dataset(openmlid)
self.logger.info(f"Start result test for NaiveAutoML on classification dataset {openmlid}")

class Evaluator(Callable):

def __init__(self):
self.history = []

def reset(self):
self.history = []

def __call__(self, pl, X, y, scoring_functions):
results = {
s: np.mean(sklearn.model_selection.cross_validate(pl, X, y, scoring=s)["test_score"])
for s in scoring_functions
}
return results

def update(self, pl, results):
self.history.append([pl, results])

scorer = sklearn.metrics.get_scorer("accuracy")
evaluation = Evaluator()

# run naml
scores = []
runtimes = []
for seed in range(1, self.num_seeds + 1):

evaluation.reset()

# create split
self.logger.debug(f"Running test on seed {seed}/{self.num_seeds}")
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)

# run naml
start = time.time()
naml = naiveautoml.NaiveAutoML(logger_name="naml", max_hpo_iterations=10, show_progress=True,
evaluation_fun=evaluation)
naml.fit(X_train, y_train)
end = time.time()
runtime = end - start
runtimes.append(runtime)

# compute test performance
self.logger.debug(
f"finished training on seed {seed} after {int(np.round(runtime))}s. Now computing performance of solution.")
score = scorer(naml, X_test, y_test)
scores.append(score)
self.logger.debug(f"finished test on seed {seed}. Test score for this run is {score}")

self.assertEquals(len(naml.history), len(evaluation.history), "History lengths don't match!")

# check conditions
runtime_mean = int(np.round(np.mean(runtimes)))
score_mean = np.round(np.mean(scores), 2)
self.assertTrue(runtime_mean <= exp_runtime,
msg=f"Permitted runtime exceeded. Expected was {exp_runtime}s but true runtime was {runtime_mean}")

# we also check the score, because the result here *should* be good. if not, the values might not be used
self.assertTrue(score_mean >= exp_result,
msg=f"Returned solution was bad. Expected was at least {exp_result} but true avg score was {score_mean}")
self.logger.info(
f"Test on dataset {openmlid} finished. Mean runtimes was {runtime_mean}s and avg accuracy was {score_mean}")