solved hyperparameter issues
felix committed Oct 7, 2024
1 parent 4efe050 commit c97d8d6
Showing 5 changed files with 42 additions and 47 deletions.
python/naiveautoml/algorithm_selection/_sklearn_factory.py: 2 changes (1 addition, 1 deletion)
@@ -663,7 +663,7 @@ def score_func(X, y):
         return sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(**params)

     if clazz == sklearn.linear_model.LogisticRegression:
-        params["dual"] = check_for_bool(params["dual"])
+        #params["dual"] = check_for_bool(params["dual"])  # disabled now
         return sklearn.linear_model.LogisticRegression(**params)

     if clazz == sklearn.neural_network.MLPClassifier:
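Note on the disabled line: check_for_bool is presumably the small helper that turns the string-encoded choices from the search-space JSON ("True"/"False") into real Python booleans before the estimator is constructed. A minimal sketch of such a helper, written here only for illustration (the actual implementation in naiveautoml may differ):

    def check_for_bool(value):
        # Illustrative only: coerce the string choices used in the search-space
        # JSON to Python booleans; pass real booleans through unchanged.
        if isinstance(value, bool):
            return value
        if value == "True":
            return True
        if value == "False":
            return False
        raise ValueError(f"Cannot interpret {value!r} as a boolean")

Since the dual hyperparameter is removed from the classification search space in this same commit (see the JSON change below), params no longer carries a "dual" key, so the old line would raise a KeyError, which is presumably why it is commented out here.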
python/naiveautoml/naiveautoml.py: 7 changes (5 additions, 2 deletions)
@@ -234,8 +234,11 @@ def fit(self, X, y, categorical_features=None):

     # get candidate descriptor
     as_result_for_best_candidate = relevant_history.sort_values(self.task.scoring["name"]).iloc[-1]
+    config_space = self.algorithm_selector.get_config_space(as_result_for_best_candidate)

-    if (
+    if len(config_space) == 0:
+        self.logger.info(f"The selected algorithms {as_result_for_best_candidate} have no hyperparameters.")
+    elif (
         deadline is None or
         deadline is not None and deadline - time.time() >= as_result_for_best_candidate["runtime"] + 5
     ):
@@ -244,7 +247,7 @@ def fit(self, X, y, categorical_features=None):
         self.hp_optimizer.reset(
             task=self.task,
             runtime_of_default_config=as_result_for_best_candidate["runtime"],
-            config_space=self.algorithm_selector.get_config_space(as_result_for_best_candidate),
+            config_space=config_space,
             history_descriptor_creation_fun=lambda hp_config: self.algorithm_selector.create_history_descriptor(
                 as_result_for_best_candidate,
                 hp_config
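Note on the new branch: hyperparameter optimization is now skipped when the selected pipeline exposes no tunable hyperparameters, and otherwise only started if enough budget remains. A compact sketch of that control flow, with the naiveautoml internals replaced by placeholders (run_hpo and logger are illustrative names, not part of the library):

    import time

    def maybe_run_hpo(config_space, deadline, runtime_of_default_config, run_hpo, logger):
        # Skip HPO if there is nothing to optimize.
        if len(config_space) == 0:
            logger.info("Selected algorithms have no hyperparameters; skipping HPO.")
            return
        # Otherwise require that at least one more evaluation of roughly the
        # default configuration's runtime (plus a 5s margin) fits into the budget.
        if deadline is None or deadline - time.time() >= runtime_of_default_config + 5:
            run_hpo(config_space)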
python/naiveautoml/searchspace-classification.json: 20 changes (0 additions, 20 deletions)
@@ -1304,26 +1304,6 @@
     "class": "sklearn.linear_model.LogisticRegression",
     "params": {
         "hyperparameters": [
-            {
-                "name": "penalty",
-                "type": "categorical",
-                "choices": [
-                    "l1",
-                    "l2",
-                    "elasticnet",
-                    "None"
-                ],
-                "default_value": "l2"
-            },
-            {
-                "name": "dual",
-                "type": "categorical",
-                "choices": [
-                    "True",
-                    "False"
-                ],
-                "default_value": "False"
-            },
             {
                 "name": "C",
                 "type": "uniform_float",
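Note on the removed entries: they encode None, True and False as strings, which recent scikit-learn versions reject during parameter validation; the literal string "None" is not a valid penalty, whereas Python's None is. A quick way to see the difference, assuming scikit-learn >= 1.2:

    from sklearn.linear_model import LogisticRegression

    # Accepted: no regularization at all.
    LogisticRegression(penalty=None).fit([[0.0], [1.0]], [0, 1])

    # Rejected at fit time on recent versions: "None" is just an unknown string.
    # LogisticRegression(penalty="None").fit([[0.0], [1.0]], [0, 1])  # raises InvalidParameterError

Dropping penalty and dual from the search space leaves LogisticRegression with its defaults for both, which also makes the check_for_bool coercion shown above unnecessary.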
python/naiveautoml/searchspace-regression.json: 2 changes (1 addition, 1 deletion)
@@ -906,7 +906,7 @@
             "default_value": 1e-06
         },
         {
-            "name": "n_iter",
+            "name": "max_iter",
             "type": "constant",
             "value": 300
         },
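Note on the rename: this block (the 1e-06 default above and the constant value 300 suggest one of the Bayesian linear models, e.g. BayesianRidge) tracks a scikit-learn change in which n_iter was deprecated in 1.3 and removed in 1.5 in favour of max_iter, so the old key would now be rejected as an unknown constructor argument. A quick check, assuming BayesianRidge is indeed the estimator configured here:

    from sklearn.linear_model import BayesianRidge

    BayesianRidge(max_iter=300)   # accepted on current scikit-learn releases
    # BayesianRidge(n_iter=300)   # TypeError on scikit-learn >= 1.5: unexpected keyword argument 'n_iter'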
python/test/test_naiveautoml.py: 58 changes (35 additions, 23 deletions)
@@ -272,37 +272,39 @@ def test_constant_algorithms_in_hpo_phase(self):
     X, y = get_dataset(61)

     # run naml
-    np.random.seed(round(time.time()))
+    np.random.seed(0)  # round(time.time()))
     naml = naiveautoml.NaiveAutoML(
         logger_name="naml",
         timeout_overall=60,
         max_hpo_iterations=10,
         show_progress=True,
-        evaluation_fun=evaluate_randomly
+        evaluation_fun=evaluate_randomly,
+        random_state=0
     )
     naml.fit(X, y)
     print(naml.history[["learner_class", "neg_log_loss"]])

     # check that there is only one combination of algorithms in the HPO phase
     history = naml.history.iloc[naml.steps_after_which_algorithm_selection_was_completed:]
-    self.assertTrue(len(pd.unique(history["learner_class"])) == 1)
-    self.assertTrue(len(pd.unique(history["data-pre-processor_class"])) == 1)
-    self.assertTrue(len(pd.unique(history["feature-pre-processor_class"])) == 1)
-
-    # get best solution from phase 1
-    phase_1_solutions = naml.history.iloc[:naml.steps_after_which_algorithm_selection_was_completed]
-    phase_1_solutions = phase_1_solutions[phase_1_solutions[naml.task.scoring["name"]].notna()]
-    best_solution_in_phase_1 = phase_1_solutions.sort_values(naml.task.scoring["name"]).iloc[-1]
-
-    for step in ["data-pre-processor", "feature-pre-processor", "learner"]:
-        field = f"{step}_class"
-        class_in_phase1 = best_solution_in_phase_1[field]
-        class_in_phase2 = pd.unique(history[field])[0]
-        self.assertEqual(
-            class_in_phase1,
-            class_in_phase2,
-            f"Choice for {step} should coincide but is {class_in_phase1} in AS phase and {class_in_phase2} in HPO."
-        )
+    if len(history) > 0:
+        self.assertTrue(len(pd.unique(history["learner_class"])) == 1)
+        self.assertTrue(len(pd.unique(history["data-pre-processor_class"])) == 1)
+        self.assertTrue(len(pd.unique(history["feature-pre-processor_class"])) == 1)
+
+        # get best solution from phase 1
+        phase_1_solutions = naml.history.iloc[:naml.steps_after_which_algorithm_selection_was_completed]
+        phase_1_solutions = phase_1_solutions[phase_1_solutions[naml.task.scoring["name"]].notna()]
+        best_solution_in_phase_1 = phase_1_solutions.sort_values(naml.task.scoring["name"]).iloc[-1]
+
+        for step in ["data-pre-processor", "feature-pre-processor", "learner"]:
+            field = f"{step}_class"
+            class_in_phase1 = best_solution_in_phase_1[field]
+            class_in_phase2 = pd.unique(history[field])[0]
+            self.assertEqual(
+                class_in_phase1,
+                class_in_phase2,
+                f"Choice for {step} should coincide but is {class_in_phase1} in AS phase and {class_in_phase2} in HPO."
+            )


"""
@@ -654,7 +656,7 @@ def update(self, pl, results):
 def test_searchspaces(self):

     for openmlid, task_type in {
-        #61: "classification", # iris
+        61: "classification", # iris
         531: "regression" # boston housing
     }.items():

@@ -701,10 +703,15 @@ def test_searchspaces(self):
         })

         # get HPO process for supposed selection
+        config_space = helper.get_config_space_for_selected_algorithms(selection)
+        if len(config_space) == 0:
+            self.logger.info("Config space is empty, nothing to check.")
+            continue
+
         hp_optimizer.reset(
             task=task,
             runtime_of_default_config=0,
-            config_space=helper.get_config_space_for_selected_algorithms(selection),
+            config_space=config_space,
             history_descriptor_creation_fun=lambda hp_config: naml.algorithm_selector.create_history_descriptor(faked_as_info, hp_config),
             evaluator=naml.evaluator,
             is_pipeline_forbidden=naml.algorithm_selector.is_pipeline_forbidden,
@@ -745,7 +752,12 @@ def test_process_leak(self, openmlid):
     X, y = get_dataset(openmlid)
     self.logger.info(f"Start test of individual stateful evaluation function on dataset {openmlid}.")

-    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.8)
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+        X,
+        y,
+        train_size=10,
+        test_size=10
+    )
     for i in range(1, 21):
         self.logger.info(f"Run {i}-th instance")
         automl = naiveautoml.NaiveAutoML(
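Note on the split change: train_test_split interprets integer sizes as absolute sample counts, so the new call uses 10 training and 10 test rows regardless of dataset size instead of an 80/20 split, presumably to keep this process-leak stress test cheap across its 20 iterations. A minimal illustration of the two size semantics on toy data:

    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.arange(100).reshape(50, 2)
    y = np.arange(50) % 2

    # Float: a fraction of the data -> 40 train / 10 test rows here.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=0.8)
    print(len(X_tr), len(X_te))   # 40 10

    # Int: absolute row counts, independent of the dataset size.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=10, test_size=10)
    print(len(X_tr), len(X_te))   # 10 10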
