diff --git a/MANIFEST.in b/MANIFEST.in index d42ab8a2b..2f6b9ae8b 100755 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,10 +1,11 @@ include requirements.txt include autoPyTorch/utils/logging.yaml include autoPyTorch/configs/default_pipeline_options.json -include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/catboost.json -include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/rotation_forest.json -include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/random_forest.json -include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/knn.json -include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/svm.json -include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/extra_trees.json -include autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/lgb.json +include autoPyTorch/configs/greedy_portfolio.json +include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json +include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json +include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json +include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json +include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json +include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json +include autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index e1e1ffedc..7f4cf523e 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -44,7 +44,7 @@ from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline -from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import get_available_classifiers +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score, get_metrics from autoPyTorch.utils.common import FitRequirement, replace_string_bool_to_bool @@ -590,7 +590,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: memory_limit = self._memory_limit if memory_limit is not None: memory_limit = int(math.ceil(memory_limit)) - available_classifiers = get_available_classifiers() + available_classifiers = get_available_traditional_learners() dask_futures = [] total_number_classifiers = len(available_classifiers) @@ -892,21 +892,18 @@ def _search( # ============> Run traditional ml if enable_traditional_pipeline: - if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS: - self._logger.warning("Traditional Pipeline is not enabled for regression. 
Skipping...") - else: - traditional_task_name = 'runTraditional' - self._stopwatch.start_task(traditional_task_name) - elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) - # We want time for at least 1 Neural network in SMAC - time_for_traditional = int( - self._time_for_task - elapsed_time - func_eval_time_limit_secs - ) - self._do_traditional_prediction( - func_eval_time_limit_secs=func_eval_time_limit_secs, - time_left=time_for_traditional, - ) - self._stopwatch.stop_task(traditional_task_name) + traditional_task_name = 'runTraditional' + self._stopwatch.start_task(traditional_task_name) + elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) + # We want time for at least 1 Neural network in SMAC + time_for_traditional = int( + self._time_for_task - elapsed_time - func_eval_time_limit_secs + ) + self._do_traditional_prediction( + func_eval_time_limit_secs=func_eval_time_limit_secs, + time_left=time_for_traditional, + ) + self._stopwatch.stop_task(traditional_task_name) # ============> Starting ensemble elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index c2dc0eb86..599856ce8 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -106,7 +106,7 @@ def search( budget: Optional[float] = None, total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, - enable_traditional_pipeline: bool = False, + enable_traditional_pipeline: bool = True, memory_limit: Optional[int] = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, @@ -151,7 +151,7 @@ def search( total_walltime_limit // 2 to allow enough time to fit at least 2 individual machine learning algorithms. Set to np.inf in case no time limit is desired. - enable_traditional_pipeline (bool), (default=False): + enable_traditional_pipeline (bool), (default=True): Not enabled for regression. This flag is here to comply with the API. memory_limit (Optional[int]), (default=4096): Memory @@ -187,7 +187,11 @@ def search( configurations, similar to (...herepathtogreedy...). Additionally, the keyword 'greedy' is supported, which would use the default portfolio from - `AutoPyTorch Tabular ` + `AutoPyTorch Tabular `. + Although portfolio selection is supported for tabular + regression, the portfolio has been built using + classification datasets. We will update a portfolio + to cover tabular regression datasets. 
Returns: self diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index b5ad43206..19ed70aeb 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -20,6 +20,7 @@ import autoPyTorch.pipeline.tabular_classification import autoPyTorch.pipeline.tabular_regression import autoPyTorch.pipeline.traditional_tabular_classification +import autoPyTorch.pipeline.traditional_tabular_regression from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( CLASSIFICATION_TASKS, @@ -64,7 +65,7 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator): Attributes: dataset_properties (Dict[str, Any]): A dictionary containing dataset specific information - random_state (Optional[Union[int, np.random.RandomState]]): + random_state (Optional[np.random.RandomState]): Object that contains a seed and allows for reproducible results init_params (Optional[Dict]): An optional dictionary that is passed to the pipeline's steps. It complies @@ -73,18 +74,18 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator): def __init__(self, config: str, dataset_properties: Dict[str, Any], - random_state: Optional[Union[int, np.random.RandomState]] = None, + random_state: Optional[np.random.RandomState] = None, init_params: Optional[Dict] = None): self.config = config self.dataset_properties = dataset_properties self.random_state = random_state self.init_params = init_params - self.pipeline = autoPyTorch.pipeline.traditional_tabular_classification.\ + self.pipeline = autoPyTorch.pipeline.traditional_tabular_classification. \ TraditionalTabularClassificationPipeline(dataset_properties=dataset_properties, random_state=self.random_state) configuration_space = self.pipeline.get_hyperparameter_search_space() default_configuration = configuration_space.get_default_configuration().get_dictionary() - default_configuration['model_trainer:tabular_classifier:classifier'] = config + default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config self.configuration = Configuration(configuration_space, default_configuration) self.pipeline.set_hyperparameters(self.configuration) @@ -100,10 +101,7 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.array: return self.pipeline.predict(X, batch_size=batch_size) - def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201 - return False - - def get_additional_run_info(self) -> Dict[str, Any]: # pylint: disable=R0201 + def get_additional_run_info(self) -> Dict[str, Any]: """ Can be used to return additional info for the run. Returns: @@ -111,7 +109,7 @@ def get_additional_run_info(self) -> Dict[str, Any]: # pylint: disable=R0201 Currently contains 1. pipeline_configuration: the configuration of the pipeline, i.e, the traditional model used 2. trainer_configuration: the parameters for the traditional model used. 
- Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs + Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs """ return {'pipeline_configuration': self.configuration, 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(), @@ -126,6 +124,71 @@ def get_default_pipeline_options() -> Dict[str, Any]: TraditionalTabularClassificationPipeline.get_default_pipeline_options() +class MyTraditionalTabularRegressionPipeline(BaseEstimator): + """ + A wrapper class that holds a pipeline for traditional regression. + Estimators like CatBoost and Random Forest are considered traditional machine + learning models and are fitted before neural architecture search. + + This class is an interface to fit a pipeline containing a traditional machine + learning model, and is the final object that is stored for inference. + + Attributes: + dataset_properties (Dict[str, Any]): + A dictionary containing dataset specific information + random_state (Optional[np.random.RandomState]): + Object that contains a seed and allows for reproducible results + init_params (Optional[Dict]): + An optional dictionary that is passed to the pipeline's steps. It fulfils + a similar function to the kwargs + """ + def __init__(self, config: str, + dataset_properties: Dict[str, Any], + random_state: Optional[np.random.RandomState] = None, + init_params: Optional[Dict] = None): + self.config = config + self.dataset_properties = dataset_properties + self.random_state = random_state + self.init_params = init_params + self.pipeline = autoPyTorch.pipeline.traditional_tabular_regression. \ + TraditionalTabularRegressionPipeline(dataset_properties=dataset_properties, + random_state=self.random_state) + configuration_space = self.pipeline.get_hyperparameter_search_space() + default_configuration = configuration_space.get_default_configuration().get_dictionary() + default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config + self.configuration = Configuration(configuration_space, default_configuration) + self.pipeline.set_hyperparameters(self.configuration) + + def fit(self, X: Dict[str, Any], y: Any, + sample_weight: Optional[np.ndarray] = None) -> object: + return self.pipeline.fit(X, y) + + def predict(self, X: Union[np.ndarray, pd.DataFrame], + batch_size: int = 1000) -> np.array: + return self.pipeline.predict(X, batch_size=batch_size) + + def get_additional_run_info(self) -> Dict[str, Any]: + """ + Can be used to return additional info for the run. + Returns: + Dict[str, Any]: + Currently contains + 1. pipeline_configuration: the configuration of the pipeline, i.e., the traditional model used + 2. trainer_configuration: the parameters for the traditional model used. + Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs + """ + return {'pipeline_configuration': self.configuration, + 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config()} + + def get_pipeline_representation(self) -> Dict[str, str]: + return self.pipeline.get_pipeline_representation() + + @staticmethod + def get_default_pipeline_options() -> Dict[str, Any]: + return autoPyTorch.pipeline.traditional_tabular_regression.\ + TraditionalTabularRegressionPipeline.get_default_pipeline_options() + + class DummyClassificationPipeline(DummyClassifier): """ A wrapper class that holds a pipeline for dummy classification.
@@ -175,9 +238,6 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame], new_X = np.ones((X.shape[0], 1)) return super(DummyClassificationPipeline, self).predict(new_X).astype(np.float32) - def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201 - return False - def get_additional_run_info(self) -> Dict: # pylint: disable=R0201 return {'configuration_origin': 'DUMMY'} @@ -234,12 +294,15 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame], new_X = np.ones((X.shape[0], 1)) return super(DummyRegressionPipeline, self).predict(new_X).astype(np.float32) - def estimator_supports_iterative_fit(self) -> bool: # pylint: disable=R0201 - return False - def get_additional_run_info(self) -> Dict: # pylint: disable=R0201 return {'configuration_origin': 'DUMMY'} + def get_pipeline_representation(self) -> Dict[str, str]: + return { + 'Preprocessing': 'None', + 'Estimator': 'Dummy', + } + @staticmethod def get_default_pipeline_options() -> Dict[str, Any]: return {'budget_type': 'epochs', @@ -401,8 +464,7 @@ def __init__(self, backend: Backend, if isinstance(self.configuration, int): self.pipeline_class = DummyRegressionPipeline elif isinstance(self.configuration, str): - raise ValueError("Only tabular classifications tasks " - "are currently supported with traditional methods") + self.pipeline_class = MyTraditionalTabularRegressionPipeline elif isinstance(self.configuration, Configuration): self.pipeline_class = autoPyTorch.pipeline.tabular_regression.TabularRegressionPipeline else: @@ -415,8 +477,7 @@ def __init__(self, backend: Backend, if self.task_type in TABULAR_TASKS: self.pipeline_class = MyTraditionalTabularClassificationPipeline else: - raise ValueError("Only tabular classifications tasks " - "are currently supported with traditional methods") + raise ValueError("Only tabular tasks are currently supported with traditional methods") elif isinstance(self.configuration, Configuration): if self.task_type in TABULAR_TASKS: self.pipeline_class = autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline @@ -446,6 +507,7 @@ def __init__(self, backend: Backend, 'y_test': self.y_test, 'backend': self.backend, 'logger_port': logger_port, + 'optimize_metric': self.metric.name }) assert self.pipeline_class is not None, "Could not infer pipeline class" pipeline_config = pipeline_config if pipeline_config is not None \ diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py b/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py index fba374a34..7d26c5481 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py @@ -1,16 +1,20 @@ +import logging.handlers import os import sys from abc import abstractmethod -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import pandas as pd +from sklearn.utils import check_random_state + import torch from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent -from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models.base_classifier import BaseClassifier +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner from autoPyTorch.utils.common import FitRequirement @@ -26,21 +30,24 @@ def enablePrint() -> None: class BaseModelComponent(autoPyTorchSetupComponent): """ - Provide an abstract interface for 
traditional classification methods + Provide an abstract interface for traditional learner methods in Auto-Pytorch """ def __init__( self, random_state: Optional[np.random.RandomState] = None, - model: Optional[BaseClassifier] = None, + model: Optional[BaseTraditionalLearner] = None, device: Optional[torch.device] = None ) -> None: super(BaseModelComponent, self).__init__() - self.random_state = random_state + if random_state is None: + self.random_state = check_random_state(1) + else: + self.random_state = check_random_state(random_state) self.fit_output: Dict[str, Any] = dict() - self.model: Optional[BaseClassifier] = model + self.model: Optional[BaseTraditionalLearner] = model self.add_fit_requirements([ FitRequirement('X_train', (np.ndarray, list, pd.DataFrame), user_defined=False, dataset_property=False), @@ -57,7 +64,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent: y (Any): not used. To comply with sklearn API Returns: - A instance of self + An instance of self """ # Make sure that input dictionary X has the required # information to fit this stage @@ -74,8 +81,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent: # instantiate model self.model = self.build_model(input_shape=input_shape, - logger_port=X['logger_port'], - output_shape=output_shape) + logger_port=X['logger_port'] if 'logger_port' in X else + logging.handlers.DEFAULT_TCP_LOGGING_PORT, + output_shape=output_shape, + task_type=X['dataset_properties']['task_type'], + output_type=X['dataset_properties']['output_type'], + optimize_metric=X['optimize_metric'] if 'optimize_metric' in X else None) # train model blockPrint() @@ -87,41 +98,53 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent: if 'X_test' in X.keys() and X['X_test'] is not None: if isinstance(X['X_test'], pd.DataFrame): X['X_test'] = X['X_test'].to_numpy() - test_preds = self.model.predict(X_test=X['X_test'], predict_proba=True) + test_preds = self.model.predict(X_test=X['X_test'], predict_proba=self.model.is_classification) self.fit_output["test_preds"] = test_preds return self @abstractmethod - def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], - logger_port: int) -> BaseClassifier: + def build_model( + self, + input_shape: Tuple[int, ...], + output_shape: Tuple[int, ...], + logger_port: int, + task_type: str, + output_type: str, + optimize_metric: Optional[str] = None + ) -> BaseTraditionalLearner: """ - This method returns a pytorch model, that is dynamically built using - a self.config that is model specific, and contains the additional - configuration hyperparameters to build a domain specific model + This method returns a traditional learner that is dynamically + built based on the provided configuration.
""" raise NotImplementedError() def predict(self, X_test: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: - assert self.model is not None, "Cant predict without fitting first" + assert self.model is not None, "Can't predict without fitting first" if isinstance(X_test, pd.DataFrame): X_test = X_test.to_numpy() return self.model.predict(X_test=X_test).reshape((-1, 1)) def predict_proba(self, X_test: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: - assert self.model is not None, "Cant predict without fitting first" + assert self.model is not None, "Can't predict without fitting first" if isinstance(X_test, pd.DataFrame): X_test = X_test.to_numpy() return self.model.predict(X_test, predict_proba=True) + def score(self, X_test: Union[pd.DataFrame, np.ndarray], y_test: Union[pd.Series, np.ndarray, List]) -> float: + assert self.model is not None, "Can't score without fitting first" + if isinstance(X_test, pd.DataFrame): + X_test = X_test.to_numpy() + return self.model.score(X_test, y_test) + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ - The transform function updates the model in the X dictionary. + The transform function updates the model and the results in the fit dictionary. """ X.update({'model': self.model}) X.update({'results': self.fit_output}) return X - def get_model(self) -> BaseClassifier: + def get_model(self) -> BaseTraditionalLearner: """ Return the underlying model object. Returns: @@ -132,7 +155,7 @@ def get_model(self) -> BaseClassifier: def check_requirements(self, X: Dict[str, Any], y: Any = None) -> None: """ - This common utility makes sure that the input dictionary X, + This common utility makes sure that the input fit dictionary, used to fit a given component class, contains the minimum information to fit the given component, and it's parents """ diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/catboost.json b/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/catboost.json deleted file mode 100644 index 487a62bc9..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/catboost.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "iterations" : 10000, - "learning_rate" : 0.1, - "eval_metric" : "Accuracy" -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/base_classifier.py b/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/base_classifier.py deleted file mode 100644 index 63d2508c6..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/base_classifier.py +++ /dev/null @@ -1,83 +0,0 @@ -import json -import logging.handlers -import os as os -from abc import abstractmethod -from typing import Any, Dict, List, Optional - -import numpy as np - -from sklearn.utils import check_random_state - -from autoPyTorch.metrics import accuracy -from autoPyTorch.utils.logging_ import get_named_client_logger - - -class BaseClassifier: - """ - Base class for classifiers. 
- """ - - def __init__(self, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None, name: str = ''): - - self.name = name - self.logger_port = logger_port - self.logger = get_named_client_logger( - name=name, - host='localhost', - port=logger_port, - ) - - if random_state is None: - self.random_state = check_random_state(1) - else: - self.random_state = check_random_state(random_state) - self.config = self.get_config() - - self.categoricals: np.ndarray = np.array(()) - self.all_nan: np.ndarray = np.array(()) - self.encode_dicts: List = [] - self.num_classes: Optional[int] = None - - self.metric = accuracy - - def get_config(self) -> Dict[str, Any]: - """ - Load the parameters for the classifier model from ../classifier_configs/modelname.json. - """ - dirname = os.path.dirname(os.path.abspath(__file__)) - config_path = os.path.join(dirname, "../classifier_configs", self.name + ".json") - with open(config_path, "r") as f: - config = json.load(f) - for k, v in config.items(): - if v == "True": - config[k] = True - if v == "False": - config[k] = False - return config - - @abstractmethod - def fit(self, - X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> Dict[str, Any]: - """ - Fit the model (possible using the validation set for early stopping) and - return the results on the training and validation set. - """ - raise NotImplementedError - - @abstractmethod - def score(self, X_test: np.ndarray, y_test: np.ndarray) -> float: - """ - Score the model performance on a test set. - """ - raise NotImplementedError - - @abstractmethod - def predict(self, X_test: np.ndarray, predict_proba: bool = False) -> np.ndarray: - """ - predict the model performance on a test set. 
- """ - raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/classifiers.py b/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/classifiers.py deleted file mode 100644 index af3622dfa..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/classifiers.py +++ /dev/null @@ -1,449 +0,0 @@ -import logging.handlers -import tempfile -from typing import Any, Dict, List, Optional, Union - -from catboost import CatBoostClassifier, Pool - -from lightgbm import LGBMClassifier - -import numpy as np - -import pandas as pd - -from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier -from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC - -from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models.base_classifier import BaseClassifier - - -def encode_categoricals(X_train: np.ndarray, - X_val: Optional[np.ndarray] = None, - encode_dicts: Optional[List] = None - ) -> Union[np.ndarray, Optional[np.ndarray], Optional[List]]: - if encode_dicts is None: - encode_dicts = [] - got_encoded_dicts = False - else: - got_encoded_dicts = True - - for ind in range(X_train.shape[1]): - if isinstance(X_train[0, ind], str): - uniques = np.unique(X_train[0, :]) - - if got_encoded_dicts: - cat_to_int_dict = encode_dicts[ind] - else: - cat_to_int_dict = {val: ind for ind, val in enumerate(uniques)} - - converted_column_train = [cat_to_int_dict[v] for v in X_train[0, :]] - X_train[0, :] = converted_column_train - - if X_val is not None: - converted_column_val = [cat_to_int_dict[v] for v in X_val[0, :]] - X_val[0, :] = converted_column_val - - if not got_encoded_dicts: - encode_dicts.append(cat_to_int_dict) - return X_train, X_val, encode_dicts - - -class LGBModel(BaseClassifier): - - def __init__(self, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None): - super(LGBModel, self).__init__(name="lgb", - logger_port=logger_port, - random_state=random_state) - - def fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray, - categoricals: np.ndarray = np.array(())) -> Dict[str, Any]: - - results = dict() - - self.num_classes = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 # this fixes a bug - self.config["num_class"] = self.num_classes - - early_stopping = 150 if X_train.shape[0] > 10000 else max(round(150 * 10000 / X_train.shape[0]), 10) - self.config["early_stopping_rounds"] = early_stopping - - self.all_nan = np.all(pd.isnull(X_train), axis=0) - X_train = X_train[:, ~self.all_nan] - X_val = X_val[:, ~self.all_nan] - - X_train = np.nan_to_num(X_train) - X_val = np.nan_to_num(X_val) - - self.model = LGBMClassifier(**self.config, random_state=self.random_state) - self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)]) - - pred_train = self.model.predict_proba(X_train) - pred_val = self.model.predict_proba(X_val) - - results["val_preds"] = pred_val.tolist() - results["labels"] = y_val.tolist() - - pred_train = np.argmax(pred_train, axis=1) - pred_val = np.argmax(pred_val, axis=1) - - results["train_score"] = self.metric(y_train, pred_train) - results["val_score"] = self.metric(y_val, pred_val) - - return results - - def score(self, X_test: np.ndarray, y_test: Union[np.ndarray, List]) -> float: - y_pred = self.predict(X_test) - return self.metric(y_test, y_pred) - - def predict(self, X_test: np.ndarray, predict_proba: bool = False) -> 
np.ndarray: - X_test = X_test[:, ~self.all_nan] - X_test = np.nan_to_num(X_test) - if predict_proba: - y_pred_proba = self.model.predict_proba(X_test) - if self.num_classes == 2: - y_pred_proba = y_pred_proba.transpose()[0:len(X_test)] - return y_pred_proba - - y_pred = self.model.predict(X_test) - return y_pred - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]: - return { - 'shortname': 'LGBMClassifier', - 'name': 'Light Gradient Boosting Machine Classifier', - } - - -class CatboostModel(BaseClassifier): - - def __init__(self, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None): - super(CatboostModel, self).__init__(name="catboost", - logger_port=logger_port, - random_state=random_state) - self.config["train_dir"] = tempfile.gettempdir() - - def fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray, - categoricals: np.ndarray = np.array(())) -> Dict[str, Any]: - - results = dict() - - categoricals = [ind for ind in range(X_train.shape[1]) if isinstance(X_train[0, ind], str)] - - self.all_nan = np.all(pd.isnull(X_train), axis=0) - X_train = X_train[:, ~self.all_nan] - X_val = X_val[:, ~self.all_nan] - - X_train = np.nan_to_num(X_train) - X_val = np.nan_to_num(X_val) - - early_stopping = 150 if X_train.shape[0] > 10000 else max(round(150 * 10000 / X_train.shape[0]), 10) - - X_train_pooled = Pool(data=X_train, label=y_train, cat_features=categoricals) - X_val_pooled = Pool(data=X_val, label=y_val, cat_features=categoricals) - - # CatBoost Cannot handle a random state object, just the seed - self.model = CatBoostClassifier(**self.config, random_state=self.random_state.get_state()[1][0]) - self.model.fit(X_train_pooled, eval_set=X_val_pooled, use_best_model=True, early_stopping_rounds=early_stopping) - - pred_train = self.model.predict_proba(X_train) - pred_val = self.model.predict_proba(X_val) - - results["val_preds"] = pred_val.tolist() - results["labels"] = y_val.tolist() - - try: - pred_train = np.argmax(pred_train, axis=1) - pred_val = np.argmax(pred_val, axis=1) - except ValueError: - self.logger.info("==> No probabilities provided in predictions") - - results["train_score"] = self.metric(y_train, pred_train) - results["val_score"] = self.metric(y_val, pred_val) - - return results - - def score(self, X_test: np.ndarray, y_test: Union[np.ndarray, List]) -> float: - y_pred = self.predict(X_test) - return self.metric(y_test, y_pred) - - def predict(self, X_test: np.ndarray, predict_proba: bool = False) -> np.ndarray: - X_test = X_test[:, ~self.all_nan] - X_test = np.nan_to_num(X_test) - if predict_proba: - return self.model.predict_proba(X_test) - y_pred = self.model.predict(X_test) - return y_pred - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]: - return { - 'shortname': 'CatBoostClassifier', - 'name': 'Categorical Boosting Classifier', - } - - -class RFModel(BaseClassifier): - - def __init__(self, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None): - super(RFModel, self).__init__(name="random_forest", - logger_port=logger_port, - random_state=random_state) - - def fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> Dict[str, Any]: - - results = dict() - - self.all_nan = np.all(pd.isnull(X_train), axis=0) - X_train = X_train[:, ~self.all_nan] - X_val = 
X_val[:, ~self.all_nan] - - X_train = np.nan_to_num(X_train) - X_val = np.nan_to_num(X_val) - - self.config["warm_start"] = False - self.num_classes = len(np.unique(y_train)) - if self.num_classes > 2: - self.logger.info("==> Using warmstarting for multiclass") - final_n_estimators = self.config["n_estimators"] - self.config["n_estimators"] = 8 - self.config["warm_start"] = True - - self.model = RandomForestClassifier(**self.config, random_state=self.random_state) - - self.model.fit(X_train, y_train) - if self.config["warm_start"]: - self.model.n_estimators = final_n_estimators - self.model.fit(X_train, y_train) - - pred_val_probas = self.model.predict_proba(X_val) - - pred_train = self.model.predict(X_train) - pred_val = self.model.predict(X_val) - - results["train_score"] = self.metric(y_train, pred_train) - results["val_score"] = self.metric(y_val, pred_val) - results["val_preds"] = pred_val_probas.tolist() - results["labels"] = y_val.tolist() - - return results - - def score(self, X_test: np.ndarray, y_test: Union[np.ndarray, List]) -> float: - y_pred = self.predict(X_test) - return self.metric(y_test, y_pred) - - def predict(self, X_test: np.ndarray, predict_proba: bool = False) -> np.ndarray: - X_test = X_test[:, ~self.all_nan] - X_test = np.nan_to_num(X_test) - if predict_proba: - return self.model.predict_proba(X_test) - y_pred = self.model.predict(X_test) - return y_pred - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]: - return { - 'shortname': 'RFClassifier', - 'name': 'Random Forest Classifier', - } - - -class ExtraTreesModel(BaseClassifier): - - def __init__(self, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None): - super(ExtraTreesModel, self).__init__(name="extra_trees", - logger_port=logger_port, - random_state=random_state) - - def fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> Dict[str, Any]: - - results = dict() - - self.all_nan = np.all(pd.isnull(X_train), axis=0) - X_train = X_train[:, ~self.all_nan] - X_val = X_val[:, ~self.all_nan] - - X_train = np.nan_to_num(X_train) - X_val = np.nan_to_num(X_val) - - self.config["warm_start"] = False - self.num_classes = len(np.unique(y_train)) - if self.num_classes > 2: - self.logger.info("==> Using warmstarting for multiclass") - final_n_estimators = self.config["n_estimators"] - self.config["n_estimators"] = 8 - self.config["warm_start"] = True - - self.model = ExtraTreesClassifier(**self.config, random_state=self.random_state) - - self.model.fit(X_train, y_train) - if self.config["warm_start"]: - self.model.n_estimators = final_n_estimators - self.model.fit(X_train, y_train) - - pred_val_probas = self.model.predict_proba(X_val) - - pred_train = self.model.predict(X_train) - pred_val = self.model.predict(X_val) - - results["train_score"] = self.metric(y_train, pred_train) - results["val_score"] = self.metric(y_val, pred_val) - results["val_preds"] = pred_val_probas.tolist() - results["labels"] = y_val.tolist() - - return results - - def score(self, X_test: np.ndarray, y_test: Union[np.ndarray, List]) -> float: - y_pred = self.predict(X_test) - return self.metric(y_test, y_pred) - - def predict(self, X_test: np.ndarray, predict_proba: bool = False) -> np.ndarray: - X_test = X_test[:, ~self.all_nan] - X_test = np.nan_to_num(X_test) - if predict_proba: - return self.model.predict_proba(X_test) - y_pred = self.model.predict(X_test) - return y_pred - - 
@staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]: - return { - 'shortname': 'ExtraTreesClassifier', - 'name': 'ExtraTreesClassifier', - } - - -class KNNModel(BaseClassifier): - - def __init__(self, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None): - super(KNNModel, self).__init__(name="knn", - logger_port=logger_port, - random_state=random_state) - - def fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> Dict[str, Any]: - results = dict() - - self.all_nan = np.all(pd.isnull(X_train), axis=0) - X_train = X_train[:, ~self.all_nan] - X_val = X_val[:, ~self.all_nan] - - X_train = np.nan_to_num(X_train) - X_val = np.nan_to_num(X_val) - - self.categoricals = np.array([isinstance(X_train[0, ind], str) for ind in range(X_train.shape[1])]) - X_train = X_train[:, ~self.categoricals] if self.categoricals is not None else X_train - X_val = X_val[:, ~self.categoricals] if self.categoricals is not None else X_val - - self.num_classes = len(np.unique(y_train)) - - # KNN is deterministic, no random seed needed - self.model = KNeighborsClassifier(**self.config) - self.model.fit(X_train, y_train) - - pred_val_probas = self.model.predict_proba(X_val) - - pred_train = self.model.predict(X_train) - pred_val = self.model.predict(X_val) - - results["train_score"] = self.metric(y_train, pred_train) - results["val_score"] = self.metric(y_val, pred_val) - results["val_preds"] = pred_val_probas.tolist() - results["labels"] = y_val.tolist() - - return results - - def score(self, X_test: np.ndarray, y_test: Union[np.ndarray, List]) -> float: - y_pred = self.predict(X_test) - return self.metric(y_test, y_pred) - - def predict(self, X_test: np.ndarray, predict_proba: bool = False) -> np.ndarray: - X_test = X_test[:, ~self.all_nan] - X_test = np.nan_to_num(X_test) - X_test = X_test[:, ~self.categoricals] if self.categoricals is not None else X_test - if predict_proba: - return self.model.predict_proba(X_test) - y_pred = self.model.predict(X_test) - return y_pred - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]: - return { - 'shortname': 'KNNClassifier', - 'name': 'K Nearest Neighbors Classifier', - } - - -class SVMModel(BaseClassifier): - - def __init__(self, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None): - super(SVMModel, self).__init__(name="svm", - logger_port=logger_port, - random_state=random_state) - - def fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> Dict[str, Any]: - results = dict() - - self.all_nan = np.all(pd.isnull(X_train), axis=0) - X_train = X_train[:, ~self.all_nan] - X_val = X_val[:, ~self.all_nan] - - X_train = np.nan_to_num(X_train) - X_val = np.nan_to_num(X_val) - - self.model = SVC(**self.config, probability=True, random_state=self.random_state) - - self.model.fit(X_train, y_train) - - pred_val_probas = self.model.predict_proba(X_val) - - pred_train = self.model.predict(X_train) - pred_val = self.model.predict(X_val) - - results["train_score"] = self.metric(y_train, pred_train) - results["val_score"] = self.metric(y_val, pred_val) - results["val_preds"] = pred_val_probas.tolist() - results["labels"] = y_val.tolist() - - return results - - def score(self, X_test: np.ndarray, y_test: Union[np.ndarray, List]) -> float: - y_pred = self.predict(X_test) 
- return self.metric(y_test, y_pred) - - def predict(self, X_test: np.ndarray, predict_proba: bool = False) -> np.ndarray: - X_test = X_test[:, ~self.all_nan] - X_test = np.nan_to_num(X_test) - if predict_proba: - return self.model.predict_proba(X_test) - y_pred = self.model.predict(X_test) - return y_pred - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]: - return { - 'shortname': 'SVC', - 'name': 'Support Vector Classification', - } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json new file mode 100644 index 000000000..c65a311fe --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json @@ -0,0 +1,4 @@ +{ + "iterations" : 10000, + "learning_rate" : 0.1 +} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/extra_trees.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json similarity index 100% rename from autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/extra_trees.json rename to autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/knn.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json similarity index 100% rename from autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/knn.json rename to autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/lgb.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json similarity index 100% rename from autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/lgb.json rename to autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/random_forest.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json similarity index 100% rename from autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/random_forest.json rename to autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/rotation_forest.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json similarity index 100% rename from autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/rotation_forest.json rename to autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/svm.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json similarity index 100% rename from autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs/svm.json rename to autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_classifier.py b/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_classifier.py deleted file mode 100644 index 07422f229..000000000 --- 
a/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_classifier.py +++ /dev/null @@ -1,67 +0,0 @@ -from typing import Any, Dict, Optional, Tuple, Type - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter -) - -import numpy as np - -from autoPyTorch.pipeline.components.setup.traditional_ml.base_model import BaseModelComponent -from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import ( - BaseClassifier, get_available_classifiers) - - -class TabularClassifier(BaseModelComponent): - """ - Implementation of a dynamic model, that consists of a classifier and a head - """ - - def __init__( - self, - random_state: Optional[np.random.RandomState] = None, - **kwargs: Any - ): - super().__init__( - random_state=random_state, - ) - self.config = kwargs - self._classifiers = get_available_classifiers() - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: - return { - "shortname": "TabularClassifier", - "name": "TabularClassifier", - } - - @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[Dict[str, str]] = None, - **kwargs: Any) -> ConfigurationSpace: - cs = ConfigurationSpace() - classifiers: Dict[str, Type[BaseClassifier]] = get_available_classifiers() - # Remove knn classifier if data is all categorical - if dataset_properties is not None and len(dataset_properties['numerical_columns']) == 0: - del classifiers['knn_classifier'] - classifier_hp = CategoricalHyperparameter("classifier", choices=classifiers.keys()) - cs.add_hyperparameters([classifier_hp]) - - return cs - - def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], - logger_port: int) -> BaseClassifier: - """ - This method returns a classifier, that is dynamically built using - a self.config that is model specific, and contains the additional - configuration hyperparameters to build a domain specific model - """ - classifier_name = self.config["classifier"] - Classifier = self._classifiers[classifier_name] - - classifier = Classifier(random_state=self.random_state, logger_port=logger_port) - - return classifier - - def __str__(self) -> str: - """ Allow a nice understanding of what components where used """ - return f"TabularClassifier: {self.model.name if self.model is not None else None}" diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py b/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py new file mode 100644 index 000000000..7b705f750 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py @@ -0,0 +1,70 @@ +from typing import Any, Dict, Optional, Tuple, Type + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter +) + +import numpy as np + +from autoPyTorch.pipeline.components.setup.traditional_ml.base_model import BaseModelComponent +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import ( + BaseTraditionalLearner, get_available_traditional_learners) + + +class TabularTraditionalModel(BaseModelComponent): + """ + Implementation of a dynamic model, that consists of a learner and a head + """ + + def __init__( + self, + random_state: Optional[np.random.RandomState] = None, + **kwargs: Any + ): + super().__init__( + random_state=random_state, + ) + self.config = kwargs + 
self._traditional_learners = get_available_traditional_learners() + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + return { + "shortname": "TabularTraditionalModel", + "name": "Tabular Traditional Model", + } + + @staticmethod + def get_hyperparameter_search_space(dataset_properties: Optional[Dict[str, str]] = None, + **kwargs: Any) -> ConfigurationSpace: + cs = ConfigurationSpace() + traditional_learners: Dict[str, Type[BaseTraditionalLearner]] = get_available_traditional_learners() + # Remove knn if data is all categorical + if dataset_properties is not None and len(dataset_properties['numerical_columns']) == 0: + del traditional_learners['knn'] + learner_hp = CategoricalHyperparameter("traditional_learner", choices=traditional_learners.keys()) + cs.add_hyperparameters([learner_hp]) + + return cs + + def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], + logger_port: int, task_type: str, output_type: str, optimize_metric: Optional[str] = None + ) -> BaseTraditionalLearner: + """ + This method returns a traditional learner that is dynamically + built using a self.config that is model specific, and contains + the additional configuration hyperparameters to build a domain + specific model + """ + learner_name = self.config["traditional_learner"] + Learner = self._traditional_learners[learner_name] + + learner = Learner(random_state=self.random_state, logger_port=logger_port, + task_type=task_type, output_type=output_type, optimize_metric=optimize_metric) + + return learner + + def __str__(self) -> str: + """ Allow a nice understanding of what components were used """ + return f"TabularTraditionalModel: {self.model.name if self.model is not None else None}" diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py similarity index 53% rename from autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/__init__.py rename to autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py index 0187df10d..f4a7b98de 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/__init__.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py @@ -3,8 +3,9 @@ from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, ) -from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models.base_classifier import BaseClassifier -from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models.classifiers import ( +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.learners import ( CatboostModel, ExtraTreesModel, KNNModel, @@ -12,7 +13,7 @@ RFModel, SVMModel) -_classifiers = { +_traditional_learners = { # Sort by more robust models # Depending on the allocated time budget, only the # top models from this dict are to be fitted.
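The renamed registry keeps the ThirdPartyComponents hook, now exposed as add_traditional_learner. As a minimal sketch of how a custom learner could be registered (the DecisionTreeModel class below is hypothetical; get_config() is overridden because only the bundled learners ship a JSON file under estimator_configs, and note that get_available_traditional_learners() as defined below returns only the built-in dictionary):

    from typing import Any, Dict, Optional

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import (
        BaseTraditionalLearner,
        add_traditional_learner,
    )

    class DecisionTreeModel(BaseTraditionalLearner):

        def get_config(self) -> Dict[str, Any]:
            # Third-party learners have no JSON file under ../estimator_configs,
            # so the configuration is supplied inline instead
            return {"max_depth": 5}

        def _prepare_model(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
            self.model = DecisionTreeClassifier(**self.config, random_state=self.random_state)

        def _fit(self, X_train: np.ndarray, y_train: np.ndarray,
                 X_val: np.ndarray, y_val: np.ndarray) -> None:
            # Decision trees have no early stopping, so the validation
            # split is only used for the reported val_score
            self.model.fit(X_train, y_train)

        @staticmethod
        def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]:
            return {'shortname': 'DTLearner', 'name': 'Decision Tree Learner'}

    add_traditional_learner(DecisionTreeModel)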
@@ -27,17 +28,17 @@ 'catboost': CatboostModel, 'random_forest': RFModel, 'extra_trees': ExtraTreesModel, - 'svm_classifier': SVMModel, - 'knn_classifier': KNNModel, + 'svm': SVMModel, + 'knn': KNNModel, } -_addons = ThirdPartyComponents(BaseClassifier) +_addons = ThirdPartyComponents(BaseTraditionalLearner) -def add_classifier(classifier: BaseClassifier) -> None: - _addons.add_component(classifier) +def add_traditional_learner(traditional_learner: BaseTraditionalLearner) -> None: + _addons.add_component(traditional_learner) -def get_available_classifiers() -> Dict[str, Union[Type[BaseClassifier], Any]]: - classifiers = dict() - classifiers.update(_classifiers) - return classifiers +def get_available_traditional_learners() -> Dict[str, Union[Type[BaseTraditionalLearner], Any]]: + traditional_learners = dict() + traditional_learners.update(_traditional_learners) + return traditional_learners diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py new file mode 100644 index 000000000..cd17b3b21 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py @@ -0,0 +1,266 @@ +import json +import logging.handlers +import os as os +from abc import abstractmethod +from typing import Any, Dict, List, Optional, Union + +from catboost import CatBoost + +import numpy as np + +import pandas as pd + +from sklearn.base import BaseEstimator +from sklearn.utils import check_random_state + +from autoPyTorch.constants import REGRESSION_TASKS, STRING_TO_TASK_TYPES +from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics +from autoPyTorch.utils.logging_ import get_named_client_logger + + +class BaseTraditionalLearner: + """ + Base wrapper class for Traditional Learners. + + Args: + task_type (str): + Type of the current task. Currently only tabular + tasks are supported. For more info on the tasks + available in AutoPyTorch, see + `autoPyTorch/constants.py` + output_type (str): + Type of output. The string depends on the output of + sklearn's type_of_target. 
`see + ` + optimize_metric (Optional[str], default=None): + Name of the metric the learner is optimized for + logger_port (int) (default=logging.handlers.DEFAULT_TCP_LOGGING_PORT) + random_state (Optional[np.random.RandomState], default=None): + name (Optional[str], default=None): + Name of the learner; when not specified, + the name of the class is used + """ + + def __init__(self, + task_type: str, + output_type: str, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + name: Optional[str] = None): + + self.model: Optional[Union[CatBoost, BaseEstimator]] = None + + self.name = name if name is not None else self.__class__.__name__ + self.logger_port = logger_port + self.logger = get_named_client_logger( + name=self.name, + host='localhost', + port=logger_port, + ) + + if random_state is None: + self.random_state = check_random_state(1) + else: + self.random_state = check_random_state(random_state) + self.config = self.get_config() + + self.all_nan: Optional[np.ndarray] = None + self.num_classes: Optional[int] = None + + self.is_classification = STRING_TO_TASK_TYPES[task_type] not in REGRESSION_TASKS + + self.metric = get_metrics(dataset_properties={'task_type': task_type, + 'output_type': output_type}, + names=[optimize_metric] if optimize_metric is not None else None)[0] + + def get_config(self) -> Dict[str, Any]: + """ + Load the parameters for the learner from ../estimator_configs/modelname.json. + """ + dirname = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(dirname, "../estimator_configs", self.name + ".json") + with open(config_path, "r") as f: + config = json.load(f) + for k, v in config.items(): + if v == "True": + config[k] = True + if v == "False": + config[k] = False + return config + + def _preprocess(self, + X: np.ndarray + ) -> np.ndarray: + """ + Preprocess the input set: currently drops all-NaN columns and + replaces the remaining NaNs with zeros. Can be extended with + more preprocessing functionality + Args: + X (np.ndarray): + input data + Returns: + (np.ndarray): + Output data + """ + if self.all_nan is None: + self.all_nan = np.all(pd.isnull(X), axis=0) + + X = X[:, ~self.all_nan] + X = np.nan_to_num(X, copy=False) + + return X + + @abstractmethod + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + """ + Abstract method to prepare the model. Depending on the + learner, this function initialises the underlying + estimator and the objects needed to do so + + Args: + X_train (np.ndarray): + Input training data + y_train (np.ndarray): + Target training data + Returns: + + """ + raise NotImplementedError + + @abstractmethod + def _fit(self, + X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> None: + """ + Method that fits the underlying estimator + Args: + X_train (np.ndarray): + Input training data + y_train (np.ndarray): + Target training data + X_val (np.ndarray): + Input validation data + y_val (np.ndarray): + Output validation data + Returns: + None + """ + raise NotImplementedError + + def fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> Dict[str, Any]: + """ + Fit the model (possibly using the validation set for early stopping) and + return the results on the training and validation set. + + Args: + X_train (np.ndarray): + Input training data + y_train (np.ndarray): + Target training data + X_val (np.ndarray): + Input validation data + y_val (np.ndarray): + Output validation data + Returns: + Dict[str, Any]: + Dictionary containing the results.
see _get_results() + """ + X_train = self._preprocess(X_train) + X_val = self._preprocess(X_val) + + self._prepare_model(X_train, y_train) + + self._fit(X_train, y_train, X_val, y_val) + + results = self._get_results(X_train, y_train, X_val, y_val) + + return results + + def score(self, X_test: np.ndarray, y_test: Union[np.ndarray, List]) -> float: + """ + Score the model performance on a test set. + + Args: + X_test (np.ndarray): + Input data + y_test (Union[np.ndarray, List]): + Target data + Returns: + float: score on the selected metric + """ + y_pred = self.predict(X_test, predict_proba=self.is_classification) + return self.metric(y_test, y_pred) + + def predict(self, X_test: np.ndarray, + predict_proba: bool = False, + preprocess: bool = True) -> np.ndarray: + """ + Predict with the fitted model on a test set. + + Args: + X_test (np.ndarray): + Input data + predict_proba (bool, default=False): + if task is a classification task, + predict the class probabilities + preprocess (bool, default=True): + Whether to preprocess data or not + Returns: + np.ndarray: the predictions of the model + """ + assert self.model is not None, "No model found. Can't " \ + "predict before fitting. " \ + "Call fit before predicting" + if preprocess: + X_test = self._preprocess(X_test) + if predict_proba: + if not self.is_classification: + raise ValueError("Can't predict probabilities for a regressor") + return self.model.predict_proba(X_test) + y_pred = self.model.predict(X_test) + return y_pred + + def _get_results(self, + X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> Dict[str, Any]: + """ + Gather results of the training. + The following results are calculated: + 1. val_preds: validation predictions + 2. labels: the validation targets + 3. val_score: score on the validation set + 4.
train_score: score on the training set + + Args: + X_train (np.ndarray): + Input training data + y_train (np.ndarray): + Target training data + X_val (np.ndarray): + Input validation data + y_val (np.ndarray): + Output validation data + Returns: + Dict[str, Any]: + Dictionary containing the results + """ + pred_train = self.predict(X_train, predict_proba=self.is_classification, preprocess=False) + pred_val = self.predict(X_val, predict_proba=self.is_classification, preprocess=False) + + results = dict() + + results["val_preds"] = pred_val.tolist() + results["labels"] = y_val.tolist() + + results["train_score"] = self.metric(y_train, pred_train) + results["val_score"] = self.metric(y_val, pred_val) + + return results diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py new file mode 100644 index 000000000..e97574b29 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py @@ -0,0 +1,344 @@ +import logging.handlers +import tempfile +from typing import Any, Dict, Optional + +from catboost import CatBoostClassifier, CatBoostRegressor, Pool + +from lightgbm import LGBMClassifier, LGBMRegressor + +import numpy as np + +from sklearn.ensemble import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor +) +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.svm import SVC, SVR + +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.utils import ( + AutoPyTorchToCatboostMetrics +) + + +class LGBModel(BaseTraditionalLearner): + + def __init__(self, + task_type: str, + output_type: str, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None + ): + super(LGBModel, self).__init__(name="lgb", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric) + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + early_stopping = 150 if X_train.shape[0] > 10000 else max(round(150 * 10000 / X_train.shape[0]), 10) + self.config["early_stopping_rounds"] = early_stopping + if not self.is_classification: + self.model = LGBMRegressor(**self.config, random_state=self.random_state) + else: + self.num_classes = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 # this fixes a bug + self.config["num_class"] = self.num_classes + + self.model = LGBMClassifier(**self.config, random_state=self.random_state) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray + ) -> None: + assert self.model is not None, "No model found. Can't fit without preparing the model" + self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)]) + + def predict(self, X_test: np.ndarray, + predict_proba: bool = False, + preprocess: bool = True) -> np.ndarray: + assert self.model is not None, "No model found. Can't " \ + "predict before fitting. 
" \ + "Call fit before predicting" + if preprocess: + X_test = self._preprocess(X_test) + + if predict_proba: + if not self.is_classification: + raise ValueError("Can't predict probabilities for a regressor") + y_pred_proba = self.model.predict_proba(X_test) + if self.num_classes == 2: + y_pred_proba = y_pred_proba.transpose()[0:len(X_test)] + return y_pred_proba + + y_pred = self.model.predict(X_test) + return y_pred + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]: + return { + 'shortname': 'LGBMLearner', + 'name': 'Light Gradient Boosting Machine Learner', + } + + +class CatboostModel(BaseTraditionalLearner): + + def __init__(self, + task_type: str, + output_type: str, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None + ): + super(CatboostModel, self).__init__(name="catboost", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric) + self.config["train_dir"] = tempfile.gettempdir() + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + if not self.is_classification: + self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value + # CatBoost Cannot handle a random state object, just the seed + self.model = CatBoostRegressor(**self.config, random_state=self.random_state.get_state()[1][0]) + else: + self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value + # CatBoost Cannot handle a random state object, just the seed + self.model = CatBoostClassifier(**self.config, random_state=self.random_state.get_state()[1][0]) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> None: + + assert self.model is not None, "No model found. Can't fit without preparing the model" + early_stopping = 150 if X_train.shape[0] > 10000 else max(round(150 * 10000 / X_train.shape[0]), 10) + categoricals = [ind for ind in range(X_train.shape[1]) if isinstance(X_train[0, ind], str)] + + X_train_pooled = Pool(data=X_train, label=y_train, cat_features=categoricals) + X_val_pooled = Pool(data=X_val, label=y_val, cat_features=categoricals) + + self.model.fit(X_train_pooled, eval_set=X_val_pooled, use_best_model=True, early_stopping_rounds=early_stopping) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]: + return { + 'shortname': 'CBLearner', + 'name': 'Categorical Boosting Learner', + } + + +class RFModel(BaseTraditionalLearner): + + def __init__(self, + task_type: str, + output_type: str, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None + ): + super(RFModel, self).__init__(name="random_forest", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric) + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + + self.config["warm_start"] = False + # TODO: Check if we need to warmstart for regression. 
+
+
+class RFModel(BaseTraditionalLearner):
+
+    def __init__(self,
+                 task_type: str,
+                 output_type: str,
+                 optimize_metric: Optional[str] = None,
+                 logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
+                 random_state: Optional[np.random.RandomState] = None
+                 ):
+        super(RFModel, self).__init__(name="random_forest",
+                                      logger_port=logger_port,
+                                      random_state=random_state,
+                                      task_type=task_type,
+                                      output_type=output_type,
+                                      optimize_metric=optimize_metric)
+
+    def _prepare_model(self,
+                       X_train: np.ndarray,
+                       y_train: np.ndarray
+                       ) -> None:
+
+        self.config["warm_start"] = False
+        # TODO: Check if we need to warmstart for regression.
+        # In autogluon, they warm start when using the daal backend, see
+        # ('https://github.com/awslabs/autogluon/blob/master/tabular/src/autogluon/tabular/models/rf/rf_model.py#L35')
+        if not self.is_classification:
+            self.model = RandomForestRegressor(**self.config, random_state=self.random_state)
+        else:
+            self.num_classes = len(np.unique(y_train))
+            if self.num_classes > 2:
+                self.logger.info("==> Using warmstarting for multiclass")
+                self.final_n_estimators = self.config["n_estimators"]
+                self.config["n_estimators"] = 8
+                self.config["warm_start"] = True
+            self.model = RandomForestClassifier(**self.config, random_state=self.random_state)
+
+    def _fit(self, X_train: np.ndarray,
+             y_train: np.ndarray,
+             X_val: np.ndarray,
+             y_val: np.ndarray) -> None:
+        assert self.model is not None, "No model found. Can't fit without preparing the model"
+
+        self.model.fit(X_train, y_train)
+        if self.config["warm_start"]:
+            self.model.n_estimators = self.final_n_estimators
+            self.model.fit(X_train, y_train)
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]:
+        return {
+            'shortname': 'RFLearner',
+            'name': 'Random Forest Learner',
+        }
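The multiclass path above fits a small 8-tree forest first, then grows it to the configured size with warm_start, so the second fit only trains the missing trees. A standalone sklearn sketch of the same pattern (toy data, illustrative sizes):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=200, n_classes=3, n_informative=4, random_state=0)
    clf = RandomForestClassifier(n_estimators=8, warm_start=True, random_state=0)
    clf.fit(X, y)            # cheap initial forest
    clf.n_estimators = 100   # grow in place instead of refitting from scratch
    clf.fit(X, y)            # trains only the 92 additional trees
    print(len(clf.estimators_))  # 100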
+
+
+class ExtraTreesModel(BaseTraditionalLearner):
+
+    def __init__(self,
+                 task_type: str,
+                 output_type: str,
+                 optimize_metric: Optional[str] = None,
+                 logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
+                 random_state: Optional[np.random.RandomState] = None
+                 ):
+        super(ExtraTreesModel, self).__init__(name="extra_trees",
+                                              logger_port=logger_port,
+                                              random_state=random_state,
+                                              task_type=task_type,
+                                              output_type=output_type,
+                                              optimize_metric=optimize_metric)
+
+    def _prepare_model(self,
+                       X_train: np.ndarray,
+                       y_train: np.ndarray
+                       ) -> None:
+        self.config["warm_start"] = False
+
+        if not self.is_classification:
+            self.model = ExtraTreesRegressor(**self.config, random_state=self.random_state)
+        else:
+            self.num_classes = len(np.unique(y_train))
+            if self.num_classes > 2:
+                self.logger.info("==> Using warmstarting for multiclass")
+                self.final_n_estimators = self.config["n_estimators"]
+                self.config["n_estimators"] = 8
+                self.config["warm_start"] = True
+
+            self.model = ExtraTreesClassifier(**self.config, random_state=self.random_state)
+
+    def _fit(self, X_train: np.ndarray,
+             y_train: np.ndarray,
+             X_val: np.ndarray,
+             y_val: np.ndarray) -> None:
+        assert self.model is not None, "No model found. Can't fit without preparing the model"
+        self.model.fit(X_train, y_train)
+        if self.config["warm_start"]:
+            self.model.n_estimators = self.final_n_estimators
+            self.model.fit(X_train, y_train)
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]:
+        return {
+            'shortname': 'ETLearner',
+            'name': 'ExtraTreesLearner',
+        }
+
+
+class KNNModel(BaseTraditionalLearner):
+
+    def __init__(self,
+                 task_type: str,
+                 output_type: str,
+                 optimize_metric: Optional[str] = None,
+                 logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
+                 random_state: Optional[np.random.RandomState] = None
+                 ):
+        super(KNNModel, self).__init__(name="knn",
+                                       logger_port=logger_port,
+                                       random_state=random_state,
+                                       task_type=task_type,
+                                       output_type=output_type,
+                                       optimize_metric=optimize_metric)
+        self.categoricals: Optional[np.ndarray] = None  # boolean mask over columns
+
+    def _preprocess(self,
+                    X: np.ndarray
+                    ) -> np.ndarray:
+
+        super(KNNModel, self)._preprocess(X)
+        if self.categoricals is None:
+            self.categoricals = np.array([isinstance(X[0, ind], str) for ind in range(X.shape[1])])
+        X = X[:, ~self.categoricals] if self.categoricals is not None else X
+
+        return X
+
+    def _prepare_model(self,
+                       X_train: np.ndarray,
+                       y_train: np.ndarray
+                       ) -> None:
+        if not self.is_classification:
+            self.model = KNeighborsRegressor(**self.config)
+        else:
+            self.num_classes = len(np.unique(y_train))
+            # KNN is deterministic, no random seed needed
+            self.model = KNeighborsClassifier(**self.config)
+
+    def _fit(self, X_train: np.ndarray,
+             y_train: np.ndarray,
+             X_val: np.ndarray,
+             y_val: np.ndarray) -> None:
+        assert self.model is not None, "No model found. Can't fit without preparing the model"
+        self.model.fit(X_train, y_train)
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]:
+        return {
+            'shortname': 'KNNLearner',
+            'name': 'K Nearest Neighbors Learner',
+        }
+
+
+class SVMModel(BaseTraditionalLearner):
+
+    def __init__(self,
+                 task_type: str,
+                 output_type: str,
+                 optimize_metric: Optional[str] = None,
+                 logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
+                 random_state: Optional[np.random.RandomState] = None
+                 ):
+        super(SVMModel, self).__init__(name="svm",
+                                       logger_port=logger_port,
+                                       random_state=random_state,
+                                       task_type=task_type,
+                                       output_type=output_type,
+                                       optimize_metric=optimize_metric)
+
+    def _prepare_model(self,
+                       X_train: np.ndarray,
+                       y_train: np.ndarray
+                       ) -> None:
+        if not self.is_classification:
+            # SVR does not take a random state.
+            self.model = SVR(**self.config)
+        else:
+            self.model = SVC(**self.config, probability=True, random_state=self.random_state)
+
+    def _fit(self, X_train: np.ndarray,
+             y_train: np.ndarray,
+             X_val: np.ndarray,
+             y_val: np.ndarray) -> None:
+        assert self.model is not None, "No model found. Can't fit without preparing the model"
+        self.model.fit(X_train, y_train)
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, str]:
+        return {
+            'shortname': 'SVMLearner',
+            'name': 'Support Vector Machine Learner',
+        }
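KNNModel._preprocess above drops string-typed columns with a boolean mask, since sklearn's k-NN has no native categorical support. The masking idea in isolation (toy object-dtype array of the kind the tabular pipeline produces):

    import numpy as np

    X = np.array([[1.0, 'red', 3.0],
                  [2.0, 'blue', 1.0]], dtype=object)
    categoricals = np.array([isinstance(X[0, ind], str) for ind in range(X.shape[1])])
    print(X[:, ~categoricals])  # keeps only the numerical columns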
diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py
new file mode 100644
index 000000000..b45161aa9
--- /dev/null
+++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py
@@ -0,0 +1,15 @@
+from enum import Enum
+
+
+class AutoPyTorchToCatboostMetrics(Enum):
+    mean_absolute_error = "MAE"
+    root_mean_squared_error = "RMSE"
+    mean_squared_log_error = "MSLE"
+    r2 = "R2"
+    accuracy = "Accuracy"
+    balanced_accuracy = "BalancedAccuracy"
+    f1 = "F1"
+    roc_auc = "AUC"
+    precision = "Precision"
+    recall = "Recall"
+    log_loss = "Logloss"
diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py
index 248d8085b..caacd7d24 100755
--- a/autoPyTorch/pipeline/components/training/trainer/__init__.py
+++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py
@@ -249,6 +249,9 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
 
         # Support additional user metrics
         additional_metrics = X['additional_metrics'] if 'additional_metrics' in X else None
+        if 'optimize_metric' in X:
+            additional_metrics = additional_metrics + [X['optimize_metric']] if additional_metrics is not None \
+                else [X['optimize_metric']]
         additional_losses = X['additional_losses'] if 'additional_losses' in X else None
         self.choice.prepare(
             model=X['network'],
diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
index fd607bf70..9f1ad027e 100644
--- a/autoPyTorch/pipeline/tabular_classification.py
+++ b/autoPyTorch/pipeline/tabular_classification.py
@@ -12,6 +12,7 @@
 
 import torch
 
+from autoPyTorch.constants import STRING_TO_TASK_TYPES
 from autoPyTorch.pipeline.base_pipeline import BasePipeline
 from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
 from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent
@@ -160,6 +161,33 @@ def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.n
 
         return y
 
+    def score(self, X: np.ndarray, y: np.ndarray,
+              batch_size: Optional[int] = None,
+              metric_name: str = 'accuracy') -> float:
+        """Scores the fitted estimator on (X, y)
+
+        Args:
+            X (np.ndarray):
+                input to the pipeline, from which to guess targets
+            batch_size (Optional[int]):
+                batch_size controls whether the pipeline
+                will be called on small chunks of the data.
+                Useful when calling the predict method on
+                the whole array X results in a MemoryError.
+            y (np.ndarray):
+                Ground Truth labels
+            metric_name (str, default = 'accuracy'):
+                name of the metric to be calculated
+        Returns:
+            float: score based on the metric name
+        """
+        from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score
+        metrics = get_metrics(self.dataset_properties, [metric_name])
+        y_pred = self.predict(X, batch_size=batch_size)
+        score = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[self.dataset_properties['task_type']],
+                                metrics=metrics)[metric_name]
+        return score
+
     def _get_hyperparameter_search_space(self,
                                          dataset_properties: Dict[str, Any],
                                          include: Optional[Dict[str, Any]] = None,
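Assumed usage of the new score(..., metric_name=...) API, where pipeline stands for an already fitted TabularClassificationPipeline and X_test/y_test for held-out data; 'balanced_accuracy' is illustrative and must be a name resolvable by get_metrics for the task:

    acc = pipeline.score(X_test, y_test)  # defaults to accuracy
    bal_acc = pipeline.score(X_test, y_test, metric_name='balanced_accuracy')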
diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py
index 27a3ae314..24c4c0521 100644
--- a/autoPyTorch/pipeline/tabular_regression.py
+++ b/autoPyTorch/pipeline/tabular_regression.py
@@ -9,6 +9,8 @@
 
 from sklearn.base import RegressorMixin
 
+import torch
+
 from autoPyTorch.constants import STRING_TO_TASK_TYPES
 from autoPyTorch.pipeline.base_pipeline import BasePipeline
 from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
@@ -88,23 +90,37 @@ def __init__(self,
             config, steps, dataset_properties, include, exclude,
             random_state, init_params, search_space_updates)
 
-    def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray:
+        # Because a pipeline is passed to a worker, we need to honor the random seed
+        # in this context. A tabular regression pipeline will implement a torch
+        # model, so we comply with https://pytorch.org/docs/stable/notes/randomness.html
+        torch.manual_seed(self.random_state.get_state()[1][0])
+
+    def score(self, X: np.ndarray, y: np.ndarray,
+              batch_size: Optional[int] = None,
+              metric_name: str = 'r2') -> float:
         """Scores the fitted estimator on (X, y)
 
         Args:
-            X (np.ndarray): input to the pipeline, from which to guess targets
-            batch_size (Optional[int]): batch_size controls whether the pipeline
-                will be called on small chunks of the data. Useful when calling the
-                predict method on the whole array X results in a MemoryError.
+            X (np.ndarray):
+                input to the pipeline, from which to guess targets
+            batch_size (Optional[int]):
+                batch_size controls whether the pipeline will be
+                called on small chunks of the data. Useful when
+                calling the predict method on the whole array X
+                results in a MemoryError.
+            y (np.ndarray):
+                Ground Truth labels
+            metric_name (str, default = 'r2'):
+                name of the metric to be calculated
         Returns:
-            np.ndarray: coefficient of determination R^2 of the prediction
+            float: score based on the metric name
         """
         from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score
-        metrics = get_metrics(self.dataset_properties, ['r2'])
+        metrics = get_metrics(self.dataset_properties, [metric_name])
         y_pred = self.predict(X, batch_size=batch_size)
-        r2 = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[self.dataset_properties['task_type']],
-                             metrics=metrics)['r2']
-        return r2
+        score = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[self.dataset_properties['task_type']],
+                                metrics=metrics)[metric_name]
+        return score
 
     def _get_hyperparameter_search_space(self,
                                          dataset_properties: Dict[str, Any],
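The seeding idiom added to __init__ above, shown standalone. Deriving an integer from the pipeline's RandomState (same trick as the CatBoost learner) keeps torch weight initialisation reproducible across workers:

    import numpy as np
    import torch

    random_state = np.random.RandomState(1)
    torch.manual_seed(int(random_state.get_state()[1][0]))
    print(torch.rand(2))  # identical on every run for the same RandomState seed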
diff --git a/autoPyTorch/pipeline/traditional_tabular_classification.py b/autoPyTorch/pipeline/traditional_tabular_classification.py
index 5b0471e87..e44e9940a 100644
--- a/autoPyTorch/pipeline/traditional_tabular_classification.py
+++ b/autoPyTorch/pipeline/traditional_tabular_classification.py
@@ -211,7 +211,7 @@ def _get_estimator_hyperparameter_name(self) -> str:
         Returns:
             str: name of the pipeline type
         """
-        return "tabular_classifier"
+        return "traditional_tabular_learner"
 
     def get_pipeline_representation(self) -> Dict[str, str]:
         """
diff --git a/autoPyTorch/pipeline/traditional_tabular_regression.py b/autoPyTorch/pipeline/traditional_tabular_regression.py
new file mode 100644
index 000000000..a6d750aff
--- /dev/null
+++ b/autoPyTorch/pipeline/traditional_tabular_regression.py
@@ -0,0 +1,185 @@
+import warnings
+from typing import Any, Dict, List, Optional, Tuple, cast
+
+from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
+
+import numpy as np
+
+from sklearn.base import RegressorMixin
+
+from autoPyTorch.pipeline.base_pipeline import BasePipeline
+from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
+from autoPyTorch.pipeline.components.setup.traditional_ml import ModelChoice
+
+
+class TraditionalTabularRegressionPipeline(RegressorMixin, BasePipeline):
+    """
+    A pipeline that contains steps to fit traditional ML methods for tabular regression.
+
+    Args:
+        config (Configuration)
+            The configuration to evaluate.
+        random_state (Optional[RandomState]): random_state is the random number generator
+
+    Attributes:
+    """
+
+    def __init__(
+        self,
+        config: Optional[Configuration] = None,
+        steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None,
+        dataset_properties: Optional[Dict[str, Any]] = None,
+        include: Optional[Dict[str, Any]] = None,
+        exclude: Optional[Dict[str, Any]] = None,
+        random_state: Optional[np.random.RandomState] = None,
+        init_params: Optional[Dict[str, Any]] = None
+    ):
+        super().__init__(
+            config, steps, dataset_properties, include, exclude,
+            random_state, init_params)
+
+    def predict(self, X: np.ndarray, batch_size: Optional[int] = None
+                ) -> np.ndarray:
+        """Predict the output using the selected model.
+
+        Args:
+            X (np.ndarray): input data to the array
+            batch_size (Optional[int]): batch_size controls whether the pipeline will be
+                called on small chunks of the data. Useful when calling the
+                predict method on the whole array X results in a MemoryError.
+
+        Returns:
+            np.ndarray: the predicted values given input X
+        """
+
+        if batch_size is None:
+            return self.named_steps['model_trainer'].predict(X)
+
+        else:
+            if not isinstance(batch_size, int):
+                raise ValueError("Argument 'batch_size' must be of type int, "
+                                 "but is '%s'" % type(batch_size))
+            if batch_size <= 0:
+                raise ValueError("Argument 'batch_size' must be positive, "
+                                 "but is %d" % batch_size)
+
+            else:
+                # Probe for the target array dimensions
+                target = self.predict(X[0:2].copy())
+                if len(target.shape) == 1:
+                    target = target.reshape((-1, 1))
+                y = np.zeros((X.shape[0], target.shape[1]),
+                             dtype=np.float32)
+
+                for k in range(max(1, int(np.ceil(float(X.shape[0]) / batch_size)))):
+                    batch_from = k * batch_size
+                    batch_to = min([(k + 1) * batch_size, X.shape[0]])
+                    pred_prob = self.predict(X[batch_from:batch_to], batch_size=None)
+                    y[batch_from:batch_to] = pred_prob.astype(np.float32)
+
+                return y
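The chunked predict above probes the output shape with two rows, preallocates a float32 array, and fills it batch by batch. Assumed usage, where pipeline stands for an already fitted TraditionalTabularRegressionPipeline and X_test for a large input array; both paths should agree element-wise up to the float32 cast:

    preds_full = pipeline.predict(X_test)                     # one shot
    preds_batched = pipeline.predict(X_test, batch_size=512)  # bounded memory
    assert np.allclose(preds_full.squeeze(), preds_batched.squeeze(), atol=1e-6)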
+
+    def _get_hyperparameter_search_space(self,
+                                         dataset_properties: Dict[str, Any],
+                                         include: Optional[Dict[str, Any]] = None,
+                                         exclude: Optional[Dict[str, Any]] = None,
+                                         ) -> ConfigurationSpace:
+        """Create the hyperparameter configuration space.
+
+        For the given steps, and the Choices within that steps,
+        this procedure returns a configuration space object to
+        explore.
+
+        Args:
+            include (Optional[Dict[str, Any]]): what hyper-parameter configurations
+                to honor when creating the configuration space
+            exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations
+                to remove from the configuration space
+            dataset_properties (Optional[Dict[str, Union[str, int]]]): Characteristics
+                of the dataset to guide the pipeline choices of components
+
+        Returns:
+            cs (Configuration): The configuration space describing the TabularRegressionPipeline.
+        """
+        cs = ConfigurationSpace()
+
+        if dataset_properties is None or not isinstance(dataset_properties, dict):
+            if not isinstance(dataset_properties, dict):
+                warnings.warn('The given dataset_properties argument contains an illegal value. '
+                              'Proceeding with the default value')
+            dataset_properties = dict()
+
+        if 'target_type' not in dataset_properties:
+            dataset_properties['target_type'] = 'tabular_regression'
+        if dataset_properties['target_type'] != 'tabular_regression':
+            warnings.warn('Tabular regression is being used, however the target_type '
+                          'is not given as "tabular_regression". Overriding it.')
+            dataset_properties['target_type'] = 'tabular_regression'
+        # get the base search space given this
+        # dataset properties. Then overwrite with custom
+        # regression requirements
+        cs = self._get_base_search_space(
+            cs=cs, dataset_properties=dataset_properties,
+            exclude=exclude, include=include, pipeline=self.steps)
+
+        # Here we could add custom code, e.g. forbidden clauses for
+        # combinations that are not a valid configuration
+
+        self.configuration_space = cs
+        self.dataset_properties = dataset_properties
+        return cs
+
+    def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> List[Tuple[str, autoPyTorchChoice]]:
+        """
+        Defines what steps a pipeline should follow.
+        The step itself has choices given via autoPyTorchChoice.
+
+        Returns:
+            List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised
+                by the pipeline.
+        """
+        steps = []  # type: List[Tuple[str, autoPyTorchChoice]]
+
+        default_dataset_properties = {'target_type': 'tabular_regression'}
+        if dataset_properties is not None:
+            default_dataset_properties.update(dataset_properties)
+
+        steps.extend([
+            ("model_trainer", ModelChoice(default_dataset_properties,
+                                          random_state=self.random_state)),
+        ])
+        return steps
+
+    def get_pipeline_representation(self) -> Dict[str, str]:
+        """
+        Returns a representation of the pipeline, so that it can be
+        consumed and formatted by the API.
+
+        It should be a representation that follows:
+        [{'PreProcessing': <>, 'Estimator': <>}]
+
+        Returns:
+            Dict: contains the pipeline representation in a short format
+        """
+        estimator_name = 'TraditionalTabularRegression'
+        if self.steps[0][1].choice is not None:
+            if self.steps[0][1].choice.model is None:
+                estimator_name = self.steps[0][1].choice.__class__.__name__
+            else:
+                estimator_name = cast(
+                    str,
+                    self.steps[0][1].choice.model.get_properties()['shortname']
+                )
+        return {
+            'Preprocessing': 'None',
+            'Estimator': estimator_name,
+        }
+
+    def _get_estimator_hyperparameter_name(self) -> str:
+        """
+        Returns the name of the current estimator.
+
+        Returns:
+            str: name of the pipeline type
+        """
+        return "traditional_tabular_regressor"
diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py
index 98bd20a68..bfeee3af5 100644
--- a/autoPyTorch/utils/common.py
+++ b/autoPyTorch/utils/common.py
@@ -1,4 +1,3 @@
-import hashlib
 from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union
 
 from ConfigSpace.configuration_space import ConfigurationSpace
@@ -76,26 +75,6 @@ def __str__(self) -> str:
             self.hyperparameter, self.value_range, self.default_value, self.log)
 
 
-def replace_prefix_in_config_dict(config: Dict[str, Any], prefix: str, replace: str = "") -> Dict[str, Any]:
-    """
-    Replace the prefix in all keys with the specified replacement string (the empty string by
-    default to remove the prefix from the key). The functions makes sure that the prefix is a proper config
-    prefix by checking if it ends with ":", if not it appends ":" to the prefix.
-
-    :param config: config dictionary where the prefixed of the keys should be replaced
-    :param prefix: prefix to be replaced in each key
-    :param replace: the string to replace the prefix with
-    :return: updated config dictionary
-    """
-    # make sure that prefix ends with the config separator ":"
-    if not prefix.endswith(":"):
-        prefix = prefix + ":"
-    # only replace first occurrence of the prefix
-    return {k.replace(prefix, replace, 1): v
-            for k, v in config.items() if
-            k.startswith(prefix)}
-
-
 def custom_collate_fn(batch: List) -> List[Optional[torch.tensor]]:
     """
     In the case of not providing a y tensor, in a
@@ -142,40 +121,6 @@ def replace_string_bool_to_bool(dictionary: Dict[str, Any]) -> Dict[str, Any]:
     return dictionary
 
 
-def hash_array_or_matrix(X: Union[np.ndarray, pd.DataFrame]) -> str:
-    """
-    Creates a hash for a given array.
-    Used for dataset name in case none is specified
-    Args:
-        X: (Union[np.ndarray, pd.DataFrame])
-            data
-
-    Returns:
-        (str): hash of the data as string
-    """
-    m = hashlib.md5()
-
-    if hasattr(X, "iloc"):
-        X = X.to_numpy()
-
-    if scipy.sparse.issparse(X):
-        m.update(X.indices)
-        m.update(X.indptr)
-        m.update(X.data)
-        m.update(str(X.shape).encode('utf8'))
-    else:
-        if X.flags['C_CONTIGUOUS']:
-            m.update(X.data)
-            m.update(str(X.shape).encode('utf8'))
-        else:
-            X_tmp = np.ascontiguousarray(X.T)
-            m.update(X_tmp.data)
-            m.update(str(X_tmp.shape).encode('utf8'))
-
-    hash = m.hexdigest()
-    return hash
-
-
 def get_device_from_fit_dictionary(X: Dict[str, Any]) -> torch.device:
     """
     Get a torch device object by checking if the fit dictionary specifies a device. If not, or if no GPU is available
diff --git a/docs/api.rst b/docs/api.rst
index 199e85ccc..01a46245c 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -16,3 +16,11 @@ Classification
 .. autoclass:: autoPyTorch.api.tabular_classification.TabularClassificationTask
     :members:
     :inherited-members: search, refit, predict, score
+
+~~~~~~~~~~~~~~
+Regression
+~~~~~~~~~~~~~~
+
+.. autoclass:: autoPyTorch.api.tabular_regression.TabularRegressionTask
+    :members:
+    :inherited-members: search, refit, predict, score
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index 411e366ad..675cfc1d9 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -62,10 +62,10 @@
 # Sphinx-gallery configuration.
 
 # get current branch
-binder_branch = 'refactor_development'
+binder_branch = 'development'
 import autoPyTorch
 if "dev" in autoPyTorch.__version__:
-    binder_branch = "refactor_development"
+    binder_branch = "development"
 
 sphinx_gallery_conf = {
     # path to the examples
diff --git a/docs/installation.rst b/docs/installation.rst
index eac2d5e5f..3cac7b7c5 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -11,7 +11,7 @@ System requirements
 
 Auto-PyTorch has the following system requirements:
 
-* Linux operating system (for example Ubuntu) `(get Linux here) `_,
+* Linux operating system (for example Ubuntu) `(get Linux here) `_ or Mac OS X,
 * Python (>=3.6) `(get Python here) `_.
 * C++ compiler (with C++11 supports) `(get GCC here) `_ and
 * SWIG (version 3.0.* is required; >=4.0.0 is not supported) `(get SWIG here) `_.
@@ -23,7 +23,11 @@ Installing Auto-Pytorch
 
     conda create -n autopytorch python=3.8
     conda activate autopytorch
-    conda install gxx_linux-64 gcc_linux-64 swig
+    For Linux:
+        conda install gxx_linux-64 gcc_linux-64 swig
+    For Mac:
+        conda install -c conda-forge clang_osx-64 clangxx_osx-64
+        conda install -c anaconda swig
     cat requirements.txt | xargs -n 1 -L 1 pip install
     python setup.py install
diff --git a/examples/20_basics/example_tabular_regression.py b/examples/20_basics/example_tabular_regression.py
index 836d4d6d6..9b4e876e9 100644
--- a/examples/20_basics/example_tabular_regression.py
+++ b/examples/20_basics/example_tabular_regression.py
@@ -50,7 +50,6 @@
     optimize_metric='r2',
     total_walltime_limit=300,
    func_eval_time_limit_secs=50,
-    enable_traditional_pipeline=False,
 )
 
 ############################################################################
diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py
index a0752db25..d605a5b37 100644
--- a/test/test_api/test_api.py
+++ b/test/test_api/test_api.py
@@ -33,7 +33,8 @@
 )
 from autoPyTorch.datasets.tabular_dataset import TabularDataset
 from autoPyTorch.optimizer.smbo import AutoMLSMBO
-from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import _classifiers
+from autoPyTorch.pipeline.base_pipeline import BasePipeline
+from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import _traditional_learners
 from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy
 
 
@@ -228,6 +229,9 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl
     # Make sure that a configuration space is stored in the estimator
     assert isinstance(estimator.get_search_space(), CS.ConfigurationSpace)
 
+    # test fit on dummy data
+    assert isinstance(estimator.fit(dataset=backend.load_datamanager()), BasePipeline)
+
 
 @pytest.mark.parametrize('openml_name', ("boston", ))
 @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function',
@@ -412,6 +416,18 @@ def test_tabular_regression(openml_name, resampling_strategy, backend, resamplin
         restored_estimator = pickle.load(f)
     restored_estimator.predict(X_test)
 
+    # Test refit on dummy data
+    estimator.refit(dataset=backend.load_datamanager())
+
+    # Make sure that a configuration space is stored in the estimator
+    assert isinstance(estimator.get_search_space(), CS.ConfigurationSpace)
+
+    representation = estimator.show_models()
+    assert isinstance(representation, str)
+    assert 'Weight' in representation
+    assert 'Preprocessing' in representation
+    assert 'Estimator' in representation
+
 
 @pytest.mark.parametrize('openml_id', (
     1590,  # Adult to test NaN in categorical columns
@@ -707,7 +723,7 @@ def test_do_traditional_pipeline(fit_dictionary_tabular):
         with open(model_path, 'rb') as model_handler:
             model = pickle.load(model_handler)
         clone(model)
-        assert model.config == list(_classifiers.keys())[i - 2]
+        assert model.config == list(_traditional_learners.keys())[i - 2]
         at_least_one_model_checked = True
     if not at_least_one_model_checked:
         pytest.fail("Not even one single traditional pipeline was fitted")
diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py
index 95ab6a0e4..8949f9f28 100644
--- a/test/test_api/test_base_api.py
+++ b/test/test_api/test_base_api.py
@@ -87,3 +87,15 @@ def test_show_models(fit_dictionary_tabular):
     expected = (r"0\s+|\s+SimpleImputer,OneHotEncoder,NoScaler,NoFeaturePreprocessing\s+"
                 r"|\s+no embedding,ShapedMLPBackbone,FullyConnectedHead,nn.Sequential\s+|\s+1")
     assert re.search(expected, api.show_models()) is not None
+
+
+def test_set_pipeline_config():
+    # checks if we can correctly change the pipeline options
+    estimator = BaseTask()
+    pipeline_options = {"device": "cuda",
+                        "budget_type": "epochs",
+                        "min_epochs": 10,
+                        "epochs": 51,
+                        "runtime": 360}
+    estimator.set_pipeline_config(**pipeline_options)
+    assert pipeline_options.items() <= estimator.get_pipeline_options().items()
diff --git a/test/test_pipeline/components/setup/test_setup_traditional_classification.py b/test/test_pipeline/components/setup/test_setup_traditional_classification.py
deleted file mode 100644
index 90c7f18f6..000000000
--- a/test/test_pipeline/components/setup/test_setup_traditional_classification.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import copy
-import os
-import sys
-
-import numpy as np
-
-import pytest
-
-from autoPyTorch.pipeline.components.setup.traditional_ml import ModelChoice
-from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models.classifiers import (
-    CatboostModel,
-    ExtraTreesModel,
-    KNNModel,
-    LGBModel,
-    RFModel,
-    SVMModel
-)
-
-
-# Disable
-def blockPrint():
-    sys.stdout = open(os.devnull, 'w')
-
-
-# Restore
-def enablePrint():
-    sys.stdout = sys.__stdout__
-
-
-@pytest.fixture(params=[LGBModel(), CatboostModel(), SVMModel(),
-                        RFModel(), ExtraTreesModel(), KNNModel()])
-def classifier(request):
-    return request.param
-
-
-@pytest.fixture
-def dataset_properties(request):
-    return request.getfixturevalue(request.param)
-
-
-@pytest.fixture
-def dataset_properties_num_only():
-    return {'numerical_columns': list(range(5))}
-
-
-@pytest.fixture
-def dataset_properties_categorical_only():
-    return {'numerical_columns': list(range(0))}
-
-
-@pytest.mark.parametrize("dataset_properties", ['dataset_properties_num_only',
-                                                'dataset_properties_categorical_only'], indirect=True)
-class TestModelChoice:
-    def test_get_set_config_space(self, dataset_properties):
-        """Make sure that we can setup a valid choice in the encoder
-        choice"""
-        model_choice = ModelChoice(dataset_properties)
-        cs = model_choice.get_hyperparameter_search_space(dataset_properties=dataset_properties)
-
-        # Make sure that all hyperparameters are part of the search space
-        assert sorted(cs.get_hyperparameter('__choice__').choices) == sorted(list(model_choice.get_components().keys()))
-
-        # Make sure we can properly set some random configs
-        # Whereas just one iteration will make sure the algorithm works,
-        # doing five iterations increase the confidence. We will be able to
-        # catch component specific crashes
-        for i in range(5):
-            config = cs.sample_configuration()
-            config_dict = copy.deepcopy(config.get_dictionary())
-            model_choice.set_hyperparameters(config)
-
-            assert model_choice.choice.__class__ == model_choice.get_components()[config_dict['__choice__']]
-
-            # Then check the choice configuration
-            selected_choice = config_dict.pop('__choice__', None)
-            for key, value in config_dict.items():
-                # Remove the selected_choice string from the parameter
-                # so we can query in the object for it
-                key = key.replace(selected_choice + ':', '')
-                assert key in vars(model_choice.choice)['config']
-                assert value == model_choice.choice.__dict__['config'][key]
-
-
-@pytest.mark.parametrize("dataset", ['dataset_traditional_classifier_num_only',
-                                     'dataset_traditional_classifier_categorical_only',
-                                     'dataset_traditional_classifier_num_categorical'], indirect=True)
-class TestTraditionalClassifiers:
-    def test_classifier_fit_predict(self, classifier, dataset):
-        X, y = dataset
-
-        blockPrint()
-        try:
-            results = classifier.fit(X_train=X, X_val=X, y_train=y, y_val=y)
-        except ValueError as e:
-            assert isinstance(classifier, KNNModel)
-            assert 'Found array with 0 feature' in e.args[0]
-            # KNN classifier works only on numerical data
-            pytest.skip()
-
-        enablePrint()
-
-        assert isinstance(results, dict)
-        assert 'val_preds' in results.keys()
-        assert isinstance(results['val_preds'], list)
-        assert len(results['val_preds']) == y.shape[0]
-        assert len(results['val_preds'][0]) == len(np.unique(y))
-        assert len(np.argwhere(0 > np.array(results['val_preds']).all() > 1)) == 0
-        assert 'labels' in results.keys()
-        assert len(results['labels']) == y.shape[0]
-        assert 'train_score' in results.keys()
-        assert isinstance(results['train_score'], float)
-        assert 'val_score' in results.keys()
-        assert isinstance(results['val_score'], float)
-
-        # Test if classifier can predict on val set and
-        # if the result is same as the one in results
-        y_pred = classifier.predict(X, predict_proba=True)
-        assert np.allclose(y_pred, results['val_preds'], atol=1e-04)
-        assert y_pred.shape[0] == y.shape[0]
-        # Test if classifier can score and
-        # the result is same as in results
-        score = classifier.score(X, y)
-        assert score == results['val_score']
-        # Test if score is greater than 0.8
-        assert score >= 0.8
diff --git a/test/test_pipeline/components/setup/test_setup_traditional_models.py b/test/test_pipeline/components/setup/test_setup_traditional_models.py
new file mode 100644
index 000000000..c7066729f
--- /dev/null
+++ b/test/test_pipeline/components/setup/test_setup_traditional_models.py
@@ -0,0 +1,137 @@
+import copy
+import os
+import pickle
+import sys
+
+import numpy as np
+
+import pytest
+
+from autoPyTorch.pipeline.components.setup.traditional_ml import ModelChoice
+from autoPyTorch.pipeline.components.setup.traditional_ml.tabular_traditional_model import TabularTraditionalModel
+
+
+# Disable
+def blockPrint():
+    sys.stdout = open(os.devnull, 'w')
+
+
+# Restore
+def enablePrint():
+    sys.stdout = sys.__stdout__
+
+
+@pytest.fixture(params=['lgb', 'catboost',
+                        'random_forest',
+                        'extra_trees', 'svm', 'knn'])
+def traditional_learner(request):
+    return request.param
+
+
+@pytest.fixture
+def dataset_properties(request):
+    return request.getfixturevalue(request.param)
+
+
+@pytest.fixture
+def dataset_properties_num_only():
+    return {'numerical_columns': list(range(5))}
+
+
+@pytest.fixture
+def dataset_properties_categorical_only():
+    return {'numerical_columns': list(range(0))}
+
+
+@pytest.mark.parametrize("dataset_properties", ['dataset_properties_num_only',
+                                                'dataset_properties_categorical_only'], indirect=True)
+class TestModelChoice:
+    def test_get_set_config_space(self, dataset_properties):
+        """Make sure that we can setup a valid choice in the encoder
+        choice"""
+        model_choice = ModelChoice(dataset_properties)
+        cs = model_choice.get_hyperparameter_search_space(dataset_properties=dataset_properties)
+
+        # Make sure that all hyperparameters are part of the search space
+        assert sorted(cs.get_hyperparameter('__choice__').choices) == sorted(list(model_choice.get_components().keys()))
+
+        # Make sure we can properly set some random configs
+        # Whereas just one iteration will make sure the algorithm works,
+        # doing five iterations increase the confidence. We will be able to
+        # catch component specific crashes
+        for i in range(5):
+            config = cs.sample_configuration()
+            config_dict = copy.deepcopy(config.get_dictionary())
+            model_choice.set_hyperparameters(config)
+
+            assert model_choice.choice.__class__ == model_choice.get_components()[config_dict['__choice__']]
+
+            # Then check the choice configuration
+            selected_choice = config_dict.pop('__choice__', None)
+            for key, value in config_dict.items():
+                # Remove the selected_choice string from the parameter
+                # so we can query in the object for it
+                key = key.replace(selected_choice + ':', '')
+                assert key in vars(model_choice.choice)['config']
+                assert value == model_choice.choice.__dict__['config'][key]
+
+
+@pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only',
+                                                    'classification_numerical_only',
+                                                    'classification_numerical_and_categorical',
+                                                    "regression_numerical_only",
+                                                    "regression_categorical_only",
+                                                    "regression_numerical_and_categorical"
+                                                    ], indirect=True)
+def test_model_fit_predict_score(traditional_learner, fit_dictionary_tabular):
+
+    if len(fit_dictionary_tabular['dataset_properties']['numerical_columns']) == 0 and traditional_learner == 'knn':
+        pytest.skip("knn can not work with categorical only data")
+
+    model = TabularTraditionalModel(traditional_learner=traditional_learner)
+
+    assert isinstance(model.get_properties(), dict)
+
+    blockPrint()
+    model.fit(X=fit_dictionary_tabular)
+    enablePrint()
+
+    assert isinstance(model.fit_output, dict)
+    assert 'val_preds' in model.fit_output.keys()
+    assert isinstance(model.fit_output['val_preds'], list)
+    assert len(model.fit_output['val_preds']) == len(fit_dictionary_tabular['val_indices'])
+    if model.model.is_classification:
+        assert len(model.fit_output['val_preds'][0]) == len(np.unique(fit_dictionary_tabular['y_train']))
+        assert len(np.argwhere(0 > np.array(model.fit_output['val_preds']).all() > 1)) == 0
+    assert 'labels' in model.fit_output.keys()
+    assert len(model.fit_output['labels']) == len(fit_dictionary_tabular['val_indices'])
+    assert 'train_score' in model.fit_output.keys()
+    assert isinstance(model.fit_output['train_score'], float)
+    assert 'val_score' in model.fit_output.keys()
+    assert isinstance(model.fit_output['val_score'], float)
+
+    # Test if traditional model can predict on val set
+    if model.model.is_classification:
+        y_pred = model.predict_proba(fit_dictionary_tabular['X_train'][fit_dictionary_tabular['val_indices']])
+    else:
+        y_pred = model.predict(fit_dictionary_tabular['X_train'][fit_dictionary_tabular['val_indices']])
+        with pytest.raises(ValueError, match="Can't predict probabilities for a regressor"):
+            model.predict_proba(fit_dictionary_tabular['X_train'][fit_dictionary_tabular['val_indices']])
+
+    assert np.allclose(y_pred.squeeze(), model.fit_output['val_preds'], atol=1e-04)
+    assert y_pred.shape[0] == len(fit_dictionary_tabular['val_indices'])
+    # Test if the model can score and
+    # the result is same as in results
+    score = model.score(fit_dictionary_tabular['X_train'][fit_dictionary_tabular['val_indices']],
+                        fit_dictionary_tabular['y_train'][fit_dictionary_tabular['val_indices']])
+    assert np.allclose(score, model.fit_output['val_score'], atol=1e-6)
+
+    if sys.version_info >= (3, 7):
+        dump_file = os.path.join(fit_dictionary_tabular['backend'].temporary_directory, 'dump.pkl')
+
+        with open(dump_file, 'wb') as f:
+            pickle.dump(model, f)
+
+        with open(dump_file, 'rb') as f:
+            restored_estimator = pickle.load(f)
+        restored_estimator.predict(fit_dictionary_tabular['X_train'])
diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py
index c90eb2a04..0184d84f3 100644
--- a/test/test_pipeline/test_tabular_classification.py
+++ b/test/test_pipeline/test_tabular_classification.py
@@ -14,7 +14,6 @@
 
 import torch
 
-from autoPyTorch import metrics
 from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms
 from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
 from autoPyTorch.utils.common import FitRequirement
@@ -447,15 +446,7 @@ def test_pipeline_score(fit_dictionary_tabular_dummy):
     # Ensure that the network is an instance of torch Module
     assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module)
 
-    # we expect the output to have the same batch size as the test input,
-    # and number of outputs per batch sample equal to the number of classes ("num_classes" in dataset_properties)
-    expected_output_shape = (X.shape[0],
-                             fit_dictionary_tabular_dummy["dataset_properties"]["output_shape"])
-
-    prediction = pipeline.predict(X)
-    assert isinstance(prediction, np.ndarray)
-    assert prediction.shape == expected_output_shape
+    accuracy = pipeline.score(X, y)
 
     # we should be able to get a decent score on this dummy data
-    accuracy = metrics.accuracy(y, prediction.squeeze())
     assert accuracy >= 0.8, f"Pipeline:{pipeline} Config:{config} FitDict: {fit_dictionary_tabular_dummy}"
diff --git a/test/test_pipeline/test_tabular_regression.py b/test/test_pipeline/test_tabular_regression.py
index 3df3c6c41..ab9e0aba4 100644
--- a/test/test_pipeline/test_tabular_regression.py
+++ b/test/test_pipeline/test_tabular_regression.py
@@ -14,7 +14,6 @@
 
 import torch
 
-from autoPyTorch import metrics
 from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms
 from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline
 from autoPyTorch.utils.common import FitRequirement
@@ -300,6 +299,7 @@ def test_pipeline_score(fit_dictionary_tabular_dummy):
 
     pipeline = TabularRegressionPipeline(
         dataset_properties=fit_dictionary_tabular_dummy['dataset_properties'],
+        random_state=1
     )
 
     cs = pipeline.get_hyperparameter_search_space()
@@ -311,15 +311,7 @@ def test_pipeline_score(fit_dictionary_tabular_dummy):
     # Ensure that the network is an instance of torch Module
     assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module)
 
-    # we expect the output to have the same batch size as the test input,
-    # and number of outputs per batch sample equal to the number of targets ("output_shape" in dataset_properties)
-    expected_output_shape = (X.shape[0],
-                             fit_dictionary_tabular_dummy["dataset_properties"]["output_shape"])
-
-    prediction = pipeline.predict(X)
-    assert isinstance(prediction, np.ndarray)
-    assert prediction.shape == expected_output_shape
+    r2_score = pipeline.score(X, y)
 
     # we should be able to get a decent score on this dummy data
-    r2_score = metrics.r2(y, prediction)
-    assert r2_score >= 0.5, f"Pipeline:{pipeline} Config:{config} FitDict: {fit_dictionary_tabular_dummy}"
+    assert r2_score >= 0.8, f"Pipeline:{pipeline} Config:{config} FitDict: {fit_dictionary_tabular_dummy}"
diff --git a/test/test_pipeline/test_traditional_pipeline.py b/test/test_pipeline/test_traditional_pipeline.py
index 96b41302a..c36d933e2 100644
--- a/test/test_pipeline/test_traditional_pipeline.py
+++ b/test/test_pipeline/test_traditional_pipeline.py
@@ -4,26 +4,27 @@
 
 import pytest
 
-from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import _classifiers
+from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import _traditional_learners
 from autoPyTorch.pipeline.traditional_tabular_classification import (
     TraditionalTabularClassificationPipeline,
 )
 
 
 @pytest.mark.parametrize("fit_dictionary_tabular",
-                         ['classification_numerical_and_categorical'], indirect=True)
+                         ['classification_numerical_and_categorical',
+                          'regression_numerical_and_categorical'], indirect=True)
 def test_traditional_tabular_pipeline(fit_dictionary_tabular):
     pipeline = TraditionalTabularClassificationPipeline(
         dataset_properties=fit_dictionary_tabular['dataset_properties']
     )
-    assert pipeline._get_estimator_hyperparameter_name() == "tabular_classifier"
+    assert pipeline._get_estimator_hyperparameter_name() == "traditional_tabular_learner"
    cs = pipeline.get_hyperparameter_search_space()
     assert isinstance(cs, CS.ConfigurationSpace)
     config = cs.sample_configuration()
-    assert config['model_trainer:tabular_classifier:classifier'] in _classifiers
+    assert config['model_trainer:tabular_traditional_model:traditional_learner'] in _traditional_learners
     assert pipeline.get_pipeline_representation() == {
         'Preprocessing': 'None',
-        'Estimator': 'TabularClassifier',
+        'Estimator': 'TabularTraditionalModel',
     }
 
 
@@ -33,7 +34,7 @@ def test_traditional_tabular_pipeline_predict(fit_dictionary_tabular):
     pipeline = TraditionalTabularClassificationPipeline(
         dataset_properties=fit_dictionary_tabular['dataset_properties']
    )
-    assert pipeline._get_estimator_hyperparameter_name() == "tabular_classifier"
+    assert pipeline._get_estimator_hyperparameter_name() == "traditional_tabular_learner"
     config = pipeline.get_hyperparameter_search_space().get_default_configuration()
     pipeline.set_hyperparameters(config)
     pipeline.fit(fit_dictionary_tabular)