From 6d8e5ebd9105b7fc732ac8acdb072b427f7648e6 Mon Sep 17 00:00:00 2001
From: Angel Gomez
Date: Wed, 18 Sep 2024 13:44:42 -0500
Subject: [PATCH] fixed bug with dictionary as default parameter

---
 python/naiveautoml/commons.py     | 17 +++++++-----
 python/naiveautoml/evaluators.py  | 44 +++++++++++++++++++------------
 python/naiveautoml/naiveautoml.py | 17 +++++++++---
 3 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/python/naiveautoml/commons.py b/python/naiveautoml/commons.py
index 78fd4c1..3c20902 100644
--- a/python/naiveautoml/commons.py
+++ b/python/naiveautoml/commons.py
@@ -24,7 +24,7 @@ def __init__(self,
                  logger_name=None,
                  use_caching=True,
                  error_treatment="info",
-                 kwargs_evaluation_fun={},
+                 kwargs_evaluation_fun=None,
                  random_state=None
                  ):
 
@@ -83,11 +83,10 @@ def get_evaluation_fun(self, evaluation_fun, kwargs_evaluation_fun):
             self.logger.info("Choosing mccv as default evaluation function.")
             evaluation_fun = "mccv"
 
-        if evaluation_fun in ["lccv", "mccv"]:
+        if evaluation_fun in ["kfold", "mccv"]:
             is_small_dataset = task.X.shape[0] < 2000
             is_medium_dataset = not is_small_dataset and task.X.shape[0] < 20000
             is_large_dataset = not (is_small_dataset or is_medium_dataset)
-
             if not kwargs_evaluation_fun:
                 if is_small_dataset:
                     self.logger.info("This is a small dataset, choosing 5 splits for evaluation")
@@ -103,12 +102,18 @@ def get_evaluation_fun(self, evaluation_fun, kwargs_evaluation_fun):
                     "Invalid case for dataset size!! This should never happen. Please report this as a bug.")
 
         if evaluation_fun == "mccv":
-            return MccvEvaluator(task.inferred_task_type, random_state=self.random_state, **kwargs_evaluation_fun)
+            return MccvEvaluator(task_type=task.inferred_task_type,
+                                 random_state=self.random_state,
+                                 kwargs_evaluation_fun=kwargs_evaluation_fun)
         elif evaluation_fun == "kfold":
-            return KFoldEvaluator(task.inferred_task_type, random_state=self.random_state, **kwargs_evaluation_fun)
+            return KFoldEvaluator(task_type=task.inferred_task_type,
+                                  random_state=self.random_state,
+                                  kwargs_evaluation_fun=kwargs_evaluation_fun)
         elif evaluation_fun == "lccv":
-            return LccvEvaluator(task.inferred_task_type, random_state=self.random_state, **kwargs_evaluation_fun)
+            return LccvEvaluator(task_type=task.inferred_task_type,
+                                 random_state=self.random_state,
+                                 kwargs_evaluation_fun=kwargs_evaluation_fun)
         else:
             return evaluation_fun
 
diff --git a/python/naiveautoml/evaluators.py b/python/naiveautoml/evaluators.py
index 9e57095..7682598 100644
--- a/python/naiveautoml/evaluators.py
+++ b/python/naiveautoml/evaluators.py
@@ -11,18 +11,21 @@ class LccvEvaluator:
 
     def __init__(self,
                  task_type,
-                 train_size=0.8,
                  logger_name="naml.evaluator",
-                 repetitions_per_anchor=5,
-                 random_state=None):
+                 random_state=None,
+                 kwargs_evaluation_fun=None):
+        self.kwargs_lccv = kwargs_evaluation_fun
         self.task_type = task_type
         self.r = -np.inf
-        self.train_size = train_size
-        self.repetitions_per_anchor = repetitions_per_anchor
         self.random_state = random_state
         self.logger = logging.getLogger(logger_name)
 
+        if "target_anchor" not in self.kwargs_lccv:
+            self.kwargs_lccv["target_anchor"] = 0.8
+        if "max_evaluations" not in self.kwargs_lccv:
+            self.kwargs_lccv["max_evaluations"] = 5
+
     def __call__(self, pl, X, y, scorings, error_treatment="raise"):
         warnings.filterwarnings('ignore', module='sklearn')
         warnings.filterwarnings('ignore', module='numpy')
@@ -38,9 +41,8 @@ def __call__(self, pl, X, y, scorings, error_treatment="raise"):
                     r=self.r,
                     base_scoring=scorings[0]["name"],
                     additional_scorings=[s["name"] for s in scorings[1:]],
-                    target_anchor=self.train_size,
-                    max_evaluations=self.repetitions_per_anchor,
-                    seed=self.random_state
+                    seed=self.random_state,
+                    **self.kwargs_lccv
                 )
                 if not np.isnan(score) and score > self.r:
                     self.r = score
@@ -200,17 +202,21 @@ def evaluate_split(self, pl, X, y, train_index, test_index, scorings, error_trea
 
 
 class KFoldEvaluator(SplitBasedEvaluator):
 
-    def __init__(self, task_type, n_splits, random_state=None, logger_name="naml.evaluator"):
+    def __init__(self,
+                 task_type,
+                 random_state=None,
+                 logger_name="naml.evaluator",
+                 kwargs_evaluation_fun=None):
 
         # define splitter
         if task_type in ["classification"]:
             splitter = sklearn.model_selection.StratifiedKFold(
-                n_splits=n_splits, random_state=random_state,
-                shuffle=True
+                shuffle=True,
+                **kwargs_evaluation_fun
             )
         elif task_type in ["regression", "multilabel-indicator"]:
-            splitter = sklearn.model_selection.KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
+            splitter = sklearn.model_selection.KFold(random_state=random_state, shuffle=True, **kwargs_evaluation_fun)
         else:
             raise ValueError(f"Unsupported task type {task_type}")
 
@@ -219,19 +225,23 @@ def __init__(self, task_type, n_splits, random_state=None, logger_name="naml.eva
 
 
 class MccvEvaluator(SplitBasedEvaluator):
 
-    def __init__(self, task_type, n_splits, random_state=None, logger_name="naml.evaluator"):
+    def __init__(self,
+                 task_type,
+                 random_state=None,
+                 logger_name="naml.evaluator",
+                 kwargs_evaluation_fun=None):
 
         if task_type in ["classification"]:
             splitter = sklearn.model_selection.StratifiedShuffleSplit(
-                n_splits=n_splits, train_size=0.8,
-                random_state=random_state
+                random_state=random_state,
+                **kwargs_evaluation_fun
             )
         elif task_type in ["regression", "multilabel-indicator"]:
             splitter = sklearn.model_selection.ShuffleSplit(
-                n_splits=n_splits, train_size=0.8,
-                random_state=random_state
+                random_state=random_state,
+                **kwargs_evaluation_fun
             )
         else:
             raise ValueError(f"Unsupported task type {task_type}")
 
diff --git a/python/naiveautoml/naiveautoml.py b/python/naiveautoml/naiveautoml.py
index c0a97e5..913f9bf 100644
--- a/python/naiveautoml/naiveautoml.py
+++ b/python/naiveautoml/naiveautoml.py
@@ -29,9 +29,9 @@ def __init__(self,
                  max_hpo_iterations=100,
                  max_hpo_iterations_without_imp=100,
                  max_hpo_time_without_imp=1800,
-                 kwargs_as={},
-                 kwargs_hpo={},
-                 kwargs_evaluation_fun={},
+                 kwargs_as=None,
+                 kwargs_hpo=None,
+                 kwargs_evaluation_fun=None,
                  logger_name=None,
                  random_state: int = None,
                  strictly_naive: bool = False,
@@ -59,6 +59,12 @@ def __init__(self,
         self.logger_name = logger_name
         self.logger = logging.getLogger('naiveautoml' if logger_name is None else logger_name)
 
+        if kwargs_as is None:
+            kwargs_as = {}
+
+        if kwargs_hpo is None:
+            kwargs_hpo = {}
+
         # configure algorithm selector
         if isinstance(algorithm_selector, str):
             accepted_selectors = ["sklearn"]
@@ -100,7 +106,10 @@ def __init__(self,
 
         # configure evaluation function
         self.evaluation_fun = evaluation_fun
-        self.kwargs_evaluation_fun = kwargs_evaluation_fun
+        if kwargs_evaluation_fun is None:
+            self.kwargs_evaluation_fun = {}
+        else:
+            self.kwargs_evaluation_fun = kwargs_evaluation_fun
 
         # memorize scorings
         self.scoring = None
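Background on the bug the patch addresses (not part of the patch itself): a default value such as kwargs_evaluation_fun={} is evaluated once, when the function is defined, so every call that falls back on the default shares a single dictionary, and any mutation of it (for example the self.kwargs_lccv["target_anchor"] = 0.8 writes introduced above) leaks across calls and instances. The commit switches to the None-sentinel idiom and creates a fresh dict inside the function. The sketch below illustrates the difference with made-up class names (BadEvaluator, GoodEvaluator); it is not code from the repository.

class BadEvaluator:
    """Anti-pattern: the default dict is built once and shared by all instances."""
    def __init__(self, kwargs_evaluation_fun={}):
        self.kwargs = kwargs_evaluation_fun
        self.kwargs.setdefault("n_splits", 5)   # mutates the shared default object


class GoodEvaluator:
    """Idiom used in the patch: None sentinel, fresh dict per call."""
    def __init__(self, kwargs_evaluation_fun=None):
        self.kwargs = {} if kwargs_evaluation_fun is None else kwargs_evaluation_fun
        self.kwargs.setdefault("n_splits", 5)    # only affects this instance's dict


print(BadEvaluator().kwargs is BadEvaluator().kwargs)    # True: both instances hold the same dict
print(GoodEvaluator().kwargs is GoodEvaluator().kwargs)  # False: each instance gets its own dict

The same guard appears in NaiveAutoML.__init__ above for kwargs_as, kwargs_hpo, and kwargs_evaluation_fun, and the evaluator classes now receive the already-materialized dict through an explicit kwargs_evaluation_fun parameter rather than via **kwargs unpacking.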