diff --git a/Tutorial/3_Genetic_Feature_Set_Selectors.ipynb b/Tutorial/3_Genetic_Feature_Set_Selectors.ipynb index 03cc272e..becaf73f 100644 --- a/Tutorial/3_Genetic_Feature_Set_Selectors.ipynb +++ b/Tutorial/3_Genetic_Feature_Set_Selectors.ipynb @@ -12,7 +12,8 @@ " If X is a dataframe, items in sel_subset list must correspond to column names\n", " If X is a numpy array, items in sel_subset list must correspond to column indexes\n", " int: index of a single column\n", - "```\n" + "```\n", + "\n" ] }, { @@ -1113,8 +1114,11 @@ } ], "metadata": { + "interpreter": { + "hash": "57aedbec84c390a3287b44649e400696ed2b6dcd408c8519583e8e995dbe6e9b" + }, "kernelspec": { - "display_name": "tpot_dev", + "display_name": "Python 3.10.12 ('tpot2env2')", "language": "python", "name": "python3" }, @@ -1128,7 +1132,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.10.12" }, "orig_nbformat": 4, "vscode": { diff --git a/Tutorial/7_dask_parallelization.ipynb b/Tutorial/7_dask_parallelization.ipynb index 709bd7cb..cbbfc28d 100644 --- a/Tutorial/7_dask_parallelization.ipynb +++ b/Tutorial/7_dask_parallelization.ipynb @@ -7,11 +7,9 @@ "source": [ "# Parallelization\n", "\n", - "TPOT2 uses the Dask package for parallelization either locally (dask.destributed.LocalCluster) or multi-node via a job schedule (dask-jobqueue). \n", + "This tutorial covers advanced setups for parallelizing TPOT2 with Dask. If you just want to parallelize TPOT2 on a single computer with multiple processes, set the n_jobs parameter to the number of threads you want to use and skip this tutorial. \n", "\n", - "To parallelize TPOT2 all you need to do is set the n_jobs parameter to the number of cores you want to use. Alternatively, users can create a custom Dask client and pass it in to TPOT2.\n", "\n", - "This is supported the same in all of the different estimators (TPOTEstimator, TPOTEstimatorSteadyState, TPOTClassifier, or TPOTRegressor)" + "TPOT2 uses Dask for parallelization and defaults to using a dask.distributed.LocalCluster for local parallelization. A user can pass in a custom Dask client or cluster for advanced usage. For example, multi-node parallelization is possible using the dask-jobqueue package." ] }, { @@ -19,9 +17,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Best Practices\n", + "### TPOT2 with Python Scripts\n", + "\n", + "When running TPOT2 from a .py script, it is important to protect code with `if __name__==\"__main__\":`\n", "\n", - "When running tpot from an .py script, it is important to protect code with `if __name__==\"__main__\":`\n" + "This is due to how parallelization is handled in Python. In short, when Python spawns new processes, each new process re-imports code from the relevant .py files, including re-running top-level code. The block under `if __name__==\"__main__\":` ensures that the code inside it is executed only by the main process, and only once. More info [here](https://docs.dask.org/en/stable/scheduling.html#standalone-python-scripts)."
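A rough sketch of the script layout the new markdown cell describes (the digits dataset, scorer, and parameter values are illustrative choices borrowed from the tutorial's other examples; it assumes tpot2 and scikit-learn are installed):

import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import tpot2

if __name__ == "__main__":
    # Everything below runs only in the main process; Dask worker processes
    # that re-import this script will not start their own TPOT2 search.
    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, train_size=0.75, test_size=0.25)

    # n_jobs controls how many local Dask workers TPOT2 starts for pipeline evaluation.
    est = tpot2.TPOTClassifier(population_size=8, generations=5, n_jobs=4,
                               memory_limit="4GB", verbose=1)
    est.fit(X_train, y_train)

    scorer = sklearn.metrics.get_scorer('roc_auc_ovr')
    print(scorer(est, X_test, y_test))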
] }, { @@ -33,14 +33,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Evaluations: : 232it [02:01, 1.90it/s]\n" + "Evaluations: : 242it [02:01, 1.99it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "0.9998431179414371\n" + "0.9995194086144522\n" ] } ], @@ -85,14 +85,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Evaluations: : 231it [02:00, 1.92it/s]\n" + "Evaluations: : 224it [02:00, 1.86it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "0.9998143035770981\n" + "0.9996005895289903\n" ] } ], @@ -193,22 +193,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "Evaluations: : 142it [02:00, 1.18it/s]\n" + "Evaluations: : 119it [02:01, 1.02s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "0.999735780838626\n" + "0.9988827327847432\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2023-08-16 16:10:04,735 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n", - "2023-08-16 16:10:04,735 - distributed.nanny - WARNING - Worker process still alive after 3.1999995422363288 seconds, killing\n" + "2023-08-23 13:49:06,747 - distributed.nanny - WARNING - Worker process still alive after 3.1999992370605472 seconds, killing\n", + "2023-08-23 13:49:06,748 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n", + "2023-08-23 13:49:06,748 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n" ] } ], @@ -243,23 +244,22 @@ "name": "stderr", "output_type": "stream", "text": [ - "Evaluations: : 131it [02:02, 1.07it/s]\n" + "Evaluations: : 132it [02:00, 1.10it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "0.9999114413297068\n" + "0.999973663151898\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2023-08-16 16:12:11,659 - distributed.nanny - WARNING - Worker process still alive after 3.1999995422363288 seconds, killing\n", - "2023-08-16 16:12:11,659 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n", - "2023-08-16 16:12:11,660 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n" + "2023-08-23 13:51:14,527 - distributed.nanny - WARNING - Worker process still alive after 3.199999694824219 seconds, killing\n", + "2023-08-23 13:51:14,528 - distributed.nanny - WARNING - Worker process still alive after 3.19999984741211 seconds, killing\n" ] } ], @@ -293,20 +293,31 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Dask multi node parallelization\n", + "## Dask multi node parallelization on HPC\n", + "\n", + "Dask can parallelize across multiple nodes via job queueing systems. This is done using the Dask-Jobqueue package. More information can be found in the official [documentation here.]( https://jobqueue.dask.org/en/latest/)\n", + "\n", + "To parallelize TPOT2 with Dask-Jobqueue, simply pass in a client based on a Jobqueue cluster with desired settings into the client parameter. Each job will evaluate a single pipeline.\n", "\n", - "Dask can parallelize across multiple nodes via job queueing systems. This is done using the dask-jobqueue package. More information can be found in the official [documentation here.]( https://jobqueue.dask.org/en/latest/)\n", + "Note that TPOT will ignore n_jobs and memory_limit as these should be set inside the Dask cluster. 
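Before the Sun Grid Engine example below, here is a minimal sketch of the general pattern of passing a custom client to TPOT2 (shown with a LocalCluster for brevity; the same pattern applies to the Jobqueue clusters discussed above, and worker counts and memory values are illustrative):

from dask.distributed import Client, LocalCluster
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import tpot2

if __name__ == "__main__":
    # Resources are configured on the cluster; TPOT2 ignores n_jobs and
    # memory_limit when an explicit client is supplied.
    cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit="4GB")
    client = Client(cluster)

    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, train_size=0.75, test_size=0.25)

    est = tpot2.TPOTEstimatorSteadyState(client=client, classification=True,
                                         max_eval_time_seconds=60, max_time_seconds=120,
                                         scorers=['roc_auc_ovr'], scorers_weights=[1],
                                         verbose=1)
    est.fit(X_train, y_train)

    scorer = sklearn.metrics.get_scorer('roc_auc_ovr')
    print(scorer(est, X_test, y_test))

    # Close the client and cluster when finished.
    client.close()
    cluster.close()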
\n", "\n", - "To parallelize TPOT2 with dask-jobqueue, simply pass in a client based on a jobqueue cluster with desired settings into the client parameter. Each job will evaluate a single pipeline.\n", "\n", - "Note that TPOT will ignore n_jobs and memory_limit as these should be set inside the dask cluster. " + "The following example is specific to the Sun Grid Engine. Other supported clusters can be found in the [Dask-Jobqueue documentation here](https://jobqueue.dask.org/en/latest/examples.html)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sun Grid Engine is not installed. This example requires Sun Grid Engine to be installed.\n" + ] + } + ], "source": [ "from dask.distributed import Client, LocalCluster\n", "import sklearn\n", @@ -314,32 +325,39 @@ "import sklearn.metrics\n", "import sklearn.model_selection\n", "import tpot2\n", - "\n", "from dask_jobqueue import SGECluster # or SLURMCluster, PBSCluster, etc. Replace SGE with your scheduler.\n", - "cluster = SGECluster(\n", - " queue='all.q',\n", - " cores=2,\n", - " memory=\"50 GB\"\n", + "import os\n", "\n", - ")\n", + "if os.system(\"which qsub\") != 0:\n", + " print(\"Sun Grid Engine is not installed. This example requires Sun Grid Engine to be installed.\")\n", + "else:\n", + " print(\"Sun Grid Engine is installed.\")\n", "\n", - "cluster.adapt(minimum_jobs=10, maximum_jobs=100) # auto-scale between 10 and 100 jobs\n", + " \n", + " cluster = SGECluster(\n", + " queue='all.q',\n", + " cores=2,\n", + " memory=\"50 GB\"\n", "\n", - "client = Client(cluster)\n", + " )\n", "\n", - "scorer = sklearn.metrics.get_scorer('roc_auc_ovr')\n", - "X, y = sklearn.datasets.load_digits(return_X_y=True)\n", - "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n", + " cluster.adapt(minimum_jobs=10, maximum_jobs=100) # auto-scale between 10 and 100 jobs\n", "\n", - "est = tpot2.TPOTEstimatorSteadyState( client=client, classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n", - "# this is equivalent to: \n", - "# est = tpot2.TPOTClassifier(population_size= 8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n", - "est.fit(X_train, y_train)\n", - "print(scorer(est, X_test, y_test))\n", + " client = Client(cluster)\n", "\n", - "#It is good to close the client and cluster when you are done with them\n", - "client.close()\n", - "cluster.close()" + " scorer = sklearn.metrics.get_scorer('roc_auc_ovr')\n", + " X, y = sklearn.datasets.load_digits(return_X_y=True)\n", + " X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n", + "\n", + " est = tpot2.TPOTEstimatorSteadyState( client=client, classification=True, max_eval_time_seconds=60, max_time_seconds=120, scorers=['roc_auc_ovr'], scorers_weights=[1], verbose=1)\n", + " # this is equivalent to: \n", + " # est = tpot2.TPOTClassifier(population_size= 8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n", + " est.fit(X_train, y_train)\n", + " print(scorer(est, X_test, y_test))\n", + "\n", + " #It is good to close the client and cluster when you are done with them\n", + " client.close()\n", + " cluster.close()" ] } ], diff --git a/tpot2/config/classifiers.py b/tpot2/config/classifiers.py index 5d5dd8e1..d11dc396 100644 --- 
a/tpot2/config/classifiers.py +++ b/tpot2/config/classifiers.py @@ -183,7 +183,7 @@ def params_SGDClassifier(trial, name=None): 'penalty': 'elasticnet', 'alpha': trial.suggest_float(f'alpha_{name}', 1e-5, 0.01, log=True), 'learning_rate': trial.suggest_categorical(f'learning_rate_{name}', ['invscaling', 'constant']), - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept': True, 'l1_ratio': trial.suggest_float(f'l1_ratio_{name}', 0.0, 1.0), 'eta0': trial.suggest_float(f'eta0_{name}', 0.01, 1.0), 'power_t': trial.suggest_float(f'power_t_{name}', 1e-5, 100.0, log=True), diff --git a/tpot2/config/regressors.py b/tpot2/config/regressors.py index 66785b05..930e0e7e 100644 --- a/tpot2/config/regressors.py +++ b/tpot2/config/regressors.py @@ -42,11 +42,11 @@ def params_RandomForestRegressor(trial, name=None): # SGDRegressor parameters def params_SGDRegressor(trial, name=None): params = { - 'loss': trial.suggest_categorical(f'loss_{name}', ['squared_loss', 'huber', 'epsilon_insensitive']), + 'loss': trial.suggest_categorical(f'loss_{name}', ['huber', 'squared_error', 'epsilon_insensitive', 'squared_epsilon_insensitive']), 'penalty': 'elasticnet', 'alpha': trial.suggest_float(f'alpha_{name}', 1e-5, 0.01, log=True), 'learning_rate': trial.suggest_categorical(f'learning_rate_{name}', ['invscaling', 'constant']), - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept':True, 'l1_ratio': trial.suggest_float(f'l1_ratio_{name}', 0.0, 1.0), 'eta0': trial.suggest_float(f'eta0_{name}', 0.01, 1.0), 'power_t': trial.suggest_float(f'power_t_{name}', 1e-5, 100.0, log=True) @@ -58,7 +58,7 @@ def params_SGDRegressor(trial, name=None): def params_Ridge(trial, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept': True, #'max_iter': trial.suggest_int(f'max_iter_{name}', 100, 1000), @@ -72,7 +72,7 @@ def params_Ridge(trial, name=None): def params_Lasso(trial, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept': True, # 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), 'precompute': trial.suggest_categorical(f'precompute_{name}', [True, False, 'auto']), @@ -95,7 +95,7 @@ def params_ElasticNet(trial, name=None): # Lars parameters def params_Lars(trial, name=None): params = { - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept': True, 'verbose': trial.suggest_categorical(f'verbose_{name}', [True, False]), 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), @@ -113,7 +113,7 @@ def params_OrthogonalMatchingPursuit(trial, name=None): params = { 'n_nonzero_coefs': trial.suggest_int(f'n_nonzero_coefs_{name}', 1, 100), 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept': True, 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), 'precompute': trial.suggest_categorical(f'precompute_{name}', ['auto', True, False]), } @@ -129,7 +129,7 @@ def params_BayesianRidge(trial, name=None): 'lambda_1': trial.suggest_float(f'lambda_1_{name}', 1e-6, 1e-1, log=True), 'lambda_2': trial.suggest_float(f'lambda_2_{name}', 1e-6, 1e-1, log=True), 
'compute_score': trial.suggest_categorical(f'compute_score_{name}', [True, False]), - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept': True, 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), } @@ -139,7 +139,7 @@ def params_BayesianRidge(trial, name=None): def params_LassoLars(trial, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), - # 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + # 'fit_intercept': True, # 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), # 'precompute': trial.suggest_categorical(f'precompute_{name}', ['auto_{name}', True, False]), #'max_iter': trial.suggest_int(f'max_iter_{name}', 100, 1000), @@ -178,7 +178,7 @@ def params_ARDRegression(trial, name=None): 'lambda_2': trial.suggest_float(f'lambda_2_{name}', 1e-6, 1e-1, log=True), 'compute_score': trial.suggest_categorical(f'compute_score_{name}', [True, False]), 'threshold_lambda': trial.suggest_int(f'threshold_lambda_{name}', 100, 1000), - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept': True, 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), } @@ -191,7 +191,7 @@ def params_TheilSenRegressor(trial, name=None): params = { 'n_subsamples': trial.suggest_int(f'n_subsamples_{name}', 10, 100), 'max_subpopulation': trial.suggest_int(f'max_subpopulation_{name}', 100, 1000), - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept': True, 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), 'verbose': trial.suggest_categorical(f'verbose_{name}', [True, False]), } @@ -215,7 +215,7 @@ def params_Perceptron(trial, name=None): 'penalty': trial.suggest_categorical(f'penalty_{name}', [None, 'l2', 'l1', 'elasticnet']), 'alpha': trial.suggest_float(f'alpha_{name}', 1e-5, 1e-1, log=True), 'l1_ratio': trial.suggest_float(f'l1_ratio_{name}', 0.0, 1.0), - 'fit_intercept': trial.suggest_categorical(f'fit_intercept_{name}', [True, False]), + 'fit_intercept': True, #'max_iter': trial.suggest_int(f'max_iter_{name}', 100, 1000), 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), 'shuffle': trial.suggest_categorical(f'shuffle_{name}', [True, False]), @@ -244,10 +244,6 @@ def params_MLPRegressor(trial, name=None): def params_GradientBoostingRegressor(trial, name=None): loss = trial.suggest_categorical(f'loss_{name}', ['ls', 'lad', 'huber', 'quantile']) - if loss == 'quantile' or loss == 'huber': - alpha = trial.suggest_float(f'alpha_{name}', 0.05, 0.95) - else: - alpha = None params = { 'n_estimators': 100, @@ -258,9 +254,13 @@ def params_GradientBoostingRegressor(trial, name=None): 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), 'subsample': 1-trial.suggest_float(f'subsample_{name}', 0.05, 1.00, log=True), 'max_features': 1-trial.suggest_float(f'max_features_{name}', 0.05, 1.00, log=True), - 'alpha': alpha, } + + if loss == 'quantile' or loss == 'huber': + alpha = trial.suggest_float(f'alpha_{name}', 0.05, 0.95) + params['alpha'] = alpha + return params diff --git a/tpot2/graphsklearn.py b/tpot2/graphsklearn.py index 61621784..113c70aa 100644 --- a/tpot2/graphsklearn.py +++ b/tpot2/graphsklearn.py @@ -9,6 +9,7 @@ from sklearn.utils.metaestimators import 
_BaseComposition from sklearn.utils.validation import check_memory +from sklearn.preprocessing import LabelEncoder #labels - str #attributes - "instance" -> instance of the type @@ -254,9 +255,42 @@ def __init__( memory=None, #TODO memory caching like sklearn.pipeline subset_column = None, drop_subset_column = True, + use_label_encoder=False, **kwargs, ): super().__init__(**kwargs) + ''' + An sklearn baseestimator that uses genetic programming to optimize a pipeline. + + Parameters + ---------- + + graph: networkx.DiGraph + A directed graph where the nodes are sklearn estimators and the edges are the inputs to those estimators. + + cross_val_predict_cv: int, cross-validation generator or an iterable, optional + Determines the cross-validation splitting strategy used in inner classifiers or regressors + + method: str, optional + The prediction method to use for the inner classifiers or regressors. If 'auto', it will try to use predict_proba, decision_function, or predict in that order. + + memory: str or object with the joblib.Memory interface, optional + Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. + + subset_column: int, optional + The column of X that contains the subset values. If None, all rows of X are used. If not None, only the rows of X where X[:,subset_column] is in subset_values are used. + Used to evolve pipelines where recursive graphs use different subsets of rows. + + drop_subset_column: bool, optional + If True, the subset_column is dropped from X before being passed to the pipeline. If False, the subset_column is kept in X. + + use_label_encoder: bool, optional + If True, the label encoder is used to encode the labels to be 0 to N. If False, the label encoder is not used. + Mainly useful for classifiers (XGBoost) that require labels to be ints from 0 to N. + + Can also be a sklearn.preprocessing.LabelEncoder object. If so, that label encoder is used. + + ''' self.graph = graph self.cross_val_predict_cv = cross_val_predict_cv @@ -264,6 +298,7 @@ def __init__( self.memory = memory self.subset_column = subset_column self.drop_subset_column = drop_subset_column + self.use_label_encoder = use_label_encoder setup_ordered_successors(graph) @@ -272,6 +307,11 @@ def __init__( self.root = self.topo_sorted_nodes[-1] + if self.use_label_encoder: + if type(self.use_label_encoder) == LabelEncoder: + self.label_encoder = self.use_label_encoder + else: + self.label_encoder = LabelEncoder() #TODO clean this up @@ -299,6 +339,12 @@ def fit(self, X, y, subset_col = None): # X = X[indeces_to_keep] # y = y[indeces_to_keep] + if self.use_label_encoder: + if type(self.use_label_encoder) == LabelEncoder: + y = self.label_encoder.transform(y) + else: + y = self.label_encoder.fit_transform(y) + if self.subset_column is not None: subset_col = X[:,self.subset_column] @@ -347,7 +393,12 @@ def predict(self, X, **predict_params): topo_sort = self.topo_sorted_nodes, ) - return self.graph.nodes[self.root]["instance"].predict(this_X, **predict_params) + preds = self.graph.nodes[self.root]["instance"].predict(this_X, **predict_params) + + if self.use_label_encoder: + preds = self.label_encoder.inverse_transform(preds) + + return preds @available_if(_estimator_has('predict_proba')) def predict_proba(self, X, **predict_params): @@ -394,7 +445,11 @@ def transform(self, X, **predict_params): @property def classes_(self): """The classes labels. 
Only exist if the last step is a classifier.""" - return self.graph.nodes[self.root]["instance"].classes_ + + if self.use_label_encoder: + return self.label_encoder.classes_ + else: + return self.graph.nodes[self.root]["instance"].classes_ @property def _estimator_type(self): diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index 0d72d7cc..1bbe64f0 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -12,6 +12,8 @@ import tpot2 from dask.distributed import Client from dask.distributed import LocalCluster +from sklearn.preprocessing import LabelEncoder +import warnings import math from .estimator_utils import * @@ -54,6 +56,7 @@ def __init__(self, scorers, max_eval_time_seconds=60*10, validation_strategy = "none", validation_fraction = .2, + disable_label_encoder = False, #early stopping parameters early_stop = None, @@ -275,6 +278,10 @@ def __init__(self, scorers, validation_fraction : float, default=0.2 EXPERIMENTAL The fraction of the dataset to use for the validation set when validation_strategy is 'split'. Must be between 0 and 1. + disable_label_encoder : bool, default=False + If True, TPOT will check if the target needs to be relabeled to be sequential ints from 0 to N. This is necessary for XGBoost compatibility. If the labels need to be encoded, TPOT2 will use sklearn.preprocessing.LabelEncoder to encode the labels. The encoder can be accessed via the self.label_encoder_ attribute. + If False, no additional label encoders will be used. + early_stop : int, default=None Number of generations without improvement before early stopping. All objectives must have converged within the tolerance for this to be triggered. @@ -432,6 +439,7 @@ def __init__(self, scorers, self.preprocessing = preprocessing self.validation_strategy = validation_strategy self.validation_fraction = validation_fraction + self.disable_label_encoder = disable_label_encoder self.population_size = population_size self.initial_population_size = initial_population_size self.population_scaling = population_scaling @@ -523,6 +531,9 @@ def __init__(self, scorers, self.evaluated_individuals = None + self.label_encoder_ = None + + set_dask_settings() @@ -544,6 +555,10 @@ def fit(self, X, y): memory_limit=self.memory_limit) _client = Client(cluster) + if self.classification and not self.disable_label_encoder and not check_if_y_is_encoded(y): + warnings.warn("Labels are not encoded as ints from 0 to N. For compatibility with some classifiers such as sklearn, TPOT has encoded y with the sklearn LabelEncoder. When using pipelines outside the main TPOT estimator class, you can encode the labels with est.label_encoder_") + self.label_encoder_ = LabelEncoder() + y = self.label_encoder_.fit_transform(y) self.evaluated_individuals = None #determine validation strategy @@ -901,7 +916,12 @@ def _estimator_has(attr): def predict(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) - return self.fitted_pipeline_.predict(X,**predict_params) + + preds = self.fitted_pipeline_.predict(X,**predict_params) + if self.classification and self.label_encoder_: + preds = self.label_encoder_.inverse_transform(preds) + + return preds @available_if(_estimator_has('predict_proba')) def predict_proba(self, X, **predict_params): @@ -924,7 +944,10 @@ def transform(self, X, **predict_params): @property def classes_(self): """The classes labels. 
Only exist if the last step is a classifier.""" - return self.fitted_pipeline_.classes_ + if self.label_encoder_: + return self.label_encoder_.classes_ + else: + return self.fitted_pipeline_.classes_ @property diff --git a/tpot2/tpot_estimator/estimator_utils.py b/tpot2/tpot_estimator/estimator_utils.py index 000ba028..fe7a61a7 100644 --- a/tpot2/tpot_estimator/estimator_utils.py +++ b/tpot2/tpot_estimator/estimator_utils.py @@ -168,7 +168,12 @@ def convert_to_float(x): - +def check_if_y_is_encoded(y): + ''' + checks if the target y is composed of sequential ints from 0 to N + ''' + y = sorted(set(y)) + return all(i == j for i, j in enumerate(y)) diff --git a/tpot2/tpot_estimator/steady_state_estimator.py b/tpot2/tpot_estimator/steady_state_estimator.py index 46836165..3b75cb68 100644 --- a/tpot2/tpot_estimator/steady_state_estimator.py +++ b/tpot2/tpot_estimator/steady_state_estimator.py @@ -17,6 +17,7 @@ from dask import config as cfg from .estimator_utils import * +import warnings def set_dask_settings(): cfg.set({'distributed.scheduler.worker-ttl': None}) @@ -46,6 +47,7 @@ def __init__(self, scorers= [], preprocessing = False, validation_strategy = "none", validation_fraction = .2, + disable_label_encoder = False, initial_population_size = 50, population_size = 50, @@ -240,6 +242,10 @@ def __init__(self, scorers= [], validation_fraction : float, default=0.2 EXPERIMENTAL The fraction of the dataset to use for the validation set when validation_strategy is 'split'. Must be between 0 and 1. + disable_label_encoder : bool, default=False + If True, TPOT will check if the target needs to be relabeled to be sequential ints from 0 to N. This is necessary for XGBoost compatibility. If the labels need to be encoded, TPOT2 will use sklearn.preprocessing.LabelEncoder to encode the labels. The encoder can be accessed via the self.label_encoder_ attribute. + If False, no additional label encoders will be used. + population_size : int, default=50 Size of the population @@ -426,6 +432,7 @@ def __init__(self, scorers= [], self.preprocessing = preprocessing self.validation_strategy = validation_strategy self.validation_fraction = validation_fraction + self.disable_label_encoder = disable_label_encoder self.population_size = population_size self.initial_population_size = initial_population_size @@ -517,6 +524,7 @@ def __init__(self, scorers= [], self._evolver_instance = None self.evaluated_individuals = None + self.label_encoder_ = None set_dask_settings() @@ -540,6 +548,11 @@ def fit(self, X, y): _client = Client(cluster) + if self.classification and not self.disable_label_encoder and not check_if_y_is_encoded(y): + warnings.warn("Labels are not encoded as ints from 0 to N. For compatibility with some classifiers such as sklearn, TPOT has encoded y with the sklearn LabelEncoder. 
When using pipelines outside the main TPOT estimator class, you can encode the labels with est.label_encoder_") + self.label_encoder_ = LabelEncoder() + y = self.label_encoder_.fit_transform(y) + self.evaluated_individuals = None #determine validation strategy if self.validation_strategy == 'auto': @@ -891,7 +904,11 @@ def _estimator_has(attr): def predict(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) - return self.fitted_pipeline_.predict(X,**predict_params) + preds = self.fitted_pipeline_.predict(X,**predict_params) + if self.classification and self.label_encoder_: + preds = self.label_encoder_.inverse_transform(preds) + + return preds @available_if(_estimator_has('predict_proba')) def predict_proba(self, X, **predict_params): @@ -914,7 +931,11 @@ def transform(self, X, **predict_params): @property def classes_(self): """The classes labels. Only exist if the last step is a classifier.""" - return self.fitted_pipeline_.classes_ + + if self.label_encoder_: + return self.label_encoder_.classes_ + else: + return self.fitted_pipeline_.classes_ @property def _estimator_type(self): diff --git a/tpot2/tpot_estimator/tests/test_estimator_utils.py b/tpot2/tpot_estimator/tests/test_estimator_utils.py index f79422c9..f371c9b7 100644 --- a/tpot2/tpot_estimator/tests/test_estimator_utils.py +++ b/tpot2/tpot_estimator/tests/test_estimator_utils.py @@ -1,6 +1,7 @@ import pytest import numpy as np import pandas as pd +from ..estimator_utils import * def test_remove_underrepresented_classes(): x = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) @@ -31,4 +32,12 @@ def test_remove_underrepresented_classes(): pd.testing.assert_frame_equal(x_result, pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6], 'd': [7, 8]}).T) pd.testing.assert_series_equal(y_result, pd.Series([0, 1, 0, 1])) -test_remove_underrepresented_classes() \ No newline at end of file + +def test_check_if_y_is_encoded(): + assert check_if_y_is_encoded([0, 1, 2, 3]) == True + assert check_if_y_is_encoded([0, 1, 3, 4]) == False + assert check_if_y_is_encoded([0, 2, 3]) == False + assert check_if_y_is_encoded([0]) == True + assert check_if_y_is_encoded([0,0,0,0,1,1,1,1]) == True + assert check_if_y_is_encoded([0,0,0,0,1,1,1,1,3]) == False + assert check_if_y_is_encoded([1,1,1,1,2,2,2,2]) == False
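To make the new label-encoding path concrete, a small standalone sketch (scikit-learn only; the helper mirrors check_if_y_is_encoded above, and the fit/predict steps stand in for the estimators' internal encode-on-fit / inverse-transform-on-predict flow):

import numpy as np
from sklearn.preprocessing import LabelEncoder

def check_if_y_is_encoded(y):
    # True only when the labels are already the sequential ints 0..N.
    y = sorted(set(y))
    return all(i == j for i, j in enumerate(y))

y = np.array(["setosa", "versicolor", "versicolor", "setosa", "virginica"])

if not check_if_y_is_encoded(y):
    # fit(): encode the labels before training, as the estimators now do.
    label_encoder_ = LabelEncoder()
    y_encoded = label_encoder_.fit_transform(y)   # [0 1 1 0 2]
    # ... train the pipeline on y_encoded ...

    # predict(): map encoded predictions back to the original labels.
    preds_encoded = y_encoded                      # stand-in for pipeline output
    preds = label_encoder_.inverse_transform(preds_encoded)
    print(preds)   # ['setosa' 'versicolor' 'versicolor' 'setosa' 'virginica']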