[Bug] Fix random halt problems on traditional pipelines (automl#147)

* [feat] Fix random halt problems on traditional pipelines
* Documentation update
* Fix flake
* Flake due to kernel pca errors
dengdifan · Mar 29, 2021 · 68fc77f · 68fc77f
1 parent 7bcde56
commit 68fc77f
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 2 deletions.
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -54,6 +54,23 @@
 
 
 class MyTraditionalTabularClassificationPipeline(BaseEstimator):
+    """
+    A wrapper class that holds a pipeline for traditional classification.
+    Estimators like CatBoost and Random Forest are considered traditional machine
+    learning models and are fitted before neural architecture search.
+
+    This class is an interface to fit a pipeline containing a traditional machine
+    learning model, and is the final object that is stored for inference.
+
+    Attributes:
+        dataset_properties (Dict[str, Any]):
+            A dictionary containing dataset specific information
+        random_state (Optional[Union[int, np.random.RandomState]]):
+            Object that contains a seed and allows for reproducible results
+        init_params (Optional[Dict]):
+            An optional dictionary that is passed to the pipeline's steps. It serves
+            a similar function to kwargs
+    """
     def __init__(self, config: str,
                  dataset_properties: Dict[str, Any],
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
@@ -98,6 +115,21 @@ def get_default_pipeline_options() -> Dict[str, Any]:
 
 
 class DummyClassificationPipeline(DummyClassifier):
+    """
+    A wrapper class that holds a pipeline for dummy classification.
+
+    A wrapper over DummyClassifier of scikit-learn. This estimator is considered the
+    worst-performing model. In case of failure, at least this model will be fitted.
+
+    Attributes:
+        dataset_properties (Dict[str, Any]):
+            A dictionary containing dataset specific information
+        random_state (Optional[Union[int, np.random.RandomState]]):
+            Object that contains a seed and allows for reproducible results
+        init_params (Optional[Dict]):
+            An optional dictionary that is passed to the pipeline's steps. It serves
+            a similar function to kwargs
+    """
     def __init__(self, config: Configuration,
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
                  init_params: Optional[Dict] = None
@@ -148,6 +180,21 @@ def get_default_pipeline_options() -> Dict[str, Any]:
 
 
 class DummyRegressionPipeline(DummyRegressor):
+    """
+    A wrapper class that holds a pipeline for dummy regression.
+
+    A wrapper over DummyRegressor of scikit-learn. This estimator is considered the
+    worst-performing model. In case of failure, at least this model will be fitted.
+
+    Attributes:
+        dataset_properties (Dict[str, Any]):
+            A dictionary containing dataset specific information
+        random_state (Optional[Union[int, np.random.RandomState]]):
+            Object that contains a seed and allows for reproducible results
+        init_params (Optional[Dict]):
+            An optional dictionary that is passed to the pipeline's steps. It serves
+            a similar function to kwargs
+    """
     def __init__(self, config: Configuration,
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
                  init_params: Optional[Dict] = None) -> None:
@@ -351,7 +398,7 @@ def _get_pipeline(self) -> BaseEstimator:
         if isinstance(self.configuration, int):
             pipeline = self.pipeline_class(config=self.configuration,
                                            random_state=np.random.RandomState(self.seed),
-                                           init_params=self.fit_dictionary)
+                                           init_params=self._init_params)
         elif isinstance(self.configuration, Configuration):
             pipeline = self.pipeline_class(config=self.configuration,
                                            dataset_properties=self.dataset_properties,
@@ -364,7 +411,7 @@ def _get_pipeline(self) -> BaseEstimator:
             pipeline = self.pipeline_class(config=self.configuration,
                                            dataset_properties=self.dataset_properties,
                                            random_state=np.random.RandomState(self.seed),
-                                           init_params=self.fit_dictionary)
+                                           init_params=self._init_params)
         else:
             raise ValueError("Invalid configuration entered")
         return pipeline

diff --git a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py
@@ -1,3 +1,5 @@
+import flaky
+
 import numpy as np
 
 import pytest
@@ -51,6 +53,7 @@ def test_feature_preprocessor(self, fit_dictionary_tabular, preprocessor):
         transformed = column_transformer.transform(X['X_train'])
         assert isinstance(transformed, np.ndarray)
 
+    @flaky.flaky(max_runs=3)
     def test_pipeline_fit_include(self, fit_dictionary_tabular, preprocessor):
         """
         This test ensures that a tabular classification