diff --git a/Tutorial/8_SH_and_cv_early_pruning.ipynb b/Tutorial/8_SH_and_cv_early_pruning.ipynb index 160db0be..50ad9585 100644 --- a/Tutorial/8_SH_and_cv_early_pruning.ipynb +++ b/Tutorial/8_SH_and_cv_early_pruning.ipynb @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -107,348 +107,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/perib/Projects/common/Projects/TPOT_Dev/tpot2/tpot2/tpot_estimator/estimator.py:423: UserWarning: Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.\n", - " warnings.warn(\"Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.\")\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 2%|▏ | 1/50 [00:25<20:56, 25.64s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 1\n", - "Best roc_auc_score score: 0.9909340659340659\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 4%|▍ | 2/50 [00:47<18:41, 23.37s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 2\n", - "Best roc_auc_score score: 0.9918914418914418\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 6%|▌ | 3/50 [01:25<23:39, 30.21s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 3\n", - "Best roc_auc_score score: 0.9925990675990676\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 8%|▊ | 4/50 [02:19<30:20, 39.58s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 4\n", - "Best roc_auc_score score: 0.9925990675990676\n" - ] - }, - { - "name": "stderr", - "output_type": 
"stream", - "text": [ - "Generation: 10%|█ | 5/50 [02:56<28:55, 38.56s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 5\n", - "Best roc_auc_score score: 0.9933816183816184\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 12%|█▏ | 6/50 [03:46<31:15, 42.63s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 6\n", - "Best roc_auc_score score: 0.9933816183816184\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 14%|█▍ | 7/50 [04:41<33:15, 46.42s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 7\n", - "Best roc_auc_score score: 0.9934065934065934\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 16%|█▌ | 8/50 [05:19<30:39, 43.79s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 8\n", - "Best roc_auc_score score: 0.9948468198468199\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 18%|█▊ | 9/50 [05:49<27:01, 39.55s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 9\n", - "Best roc_auc_score score: 0.9948468198468199\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 20%|██ | 10/50 [07:01<33:03, 49.59s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 10\n", - "Best roc_auc_score score: 0.9948468198468199\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 22%|██▏ | 11/50 [07:43<30:38, 47.14s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 11\n", - "Best roc_auc_score score: 0.9972905525846703\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 24%|██▍ | 12/50 [08:35<30:52, 48.75s/it]" - ] - }, - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 12\n", - "Best roc_auc_score score: 0.9976114081996436\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 26%|██▌ | 13/50 [09:10<27:31, 44.64s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 13\n", - "Best roc_auc_score score: 0.9976114081996436\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 28%|██▊ | 14/50 [10:06<28:49, 48.04s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 14\n", - "Best roc_auc_score score: 0.9976114081996436\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 30%|███ | 15/50 [11:08<30:21, 52.05s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 15\n", - "Best roc_auc_score score: 0.9979411764705883\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 32%|███▏ | 16/50 [12:28<34:16, 60.49s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 16\n", - "Best roc_auc_score score: 0.9985249554367203\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 34%|███▍ | 17/50 [14:09<40:01, 72.78s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 17\n", - "Best roc_auc_score score: 0.999108734402852\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 36%|███▌ | 18/50 [14:49<33:37, 63.03s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 18\n", - "Best roc_auc_score score: 0.999108734402852\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 38%|███▊ | 19/50 [15:12<26:14, 50.78s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 19\n", 
- "Best roc_auc_score score: 0.999108734402852\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 40%|████ | 20/50 [15:55<24:16, 48.53s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generation: 20\n", - "Best roc_auc_score score: 0.999108734402852\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 40%|████ | 20/50 [16:15<24:23, 48.79s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "KeyboardInterrupt\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "/home/perib/miniconda3/envs/myenv/lib/python3.10/site-packages/sklearn/linear_model/_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total time: 983.1856114864349\n" - ] - } - ], + "outputs": [], "source": [ "# A Graph pipeline starting with at least one selector as a leaf, potentially followed by a series\n", "# of stacking classifiers or transformers, and ending with a classifier. The graph will have at most 15 nodes and a max depth of 6.\n", @@ -529,12 +190,14 @@ "\n", "In practice, the values of these parameters will depend on the specific problem and the available computational resources. \n", "\n", - "In the following sections, we will show you how to set and manipulate these parameters using Python code in a Jupyter Notebook. We will also provide examples of how these parameters can affect the performance of the algorithm." + "In the following sections, we will show you how to set and manipulate these parameters using Python code in a Jupyter Notebook. 
We will also provide examples of how these parameters can affect the performance of the algorithm.\n", + "\n", + "(Note that in these small test cases, you may not notice much, if any, performance improvement; these methods are more likely to be beneficial in real-world scenarios with larger datasets and slower-evaluating pipelines.)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -574,23 +237,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/perib/Projects/common/Projects/TPOT_Dev/tpot2/tpot2/tpot_estimator/estimator.py:423: UserWarning: Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.\n", - " warnings.warn(\"Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.\")\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generation: 20%|██ | 1/5 [00:32<02:11, 32.94s/it]" - ] - } - ], + "outputs": [], "source": [ "X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n", "\n", @@ -601,16 +248,15 @@ " scorers=['roc_auc_ovr'],\n", " scorers_weights=[1],\n", " classification=True,\n", - " search_space = 'linear-light',\n", + " search_space = 'linear',\n", " n_jobs=32,\n", " cv=cv,\n", - " \n", - " # budget_range = [.3,1],\n", - " # generations_until_end_budget=4,\n", + " early_stop=5,\n", + " verbose=3,\n", "\n", " threshold_evaluation_pruning = threshold_evaluation_pruning,\n", " threshold_evaluation_scaling = threshold_evaluation_scaling,\n", - " verbose=2)\n", + " )\n", "\n", "\n", "start = time.time()\n", @@ -620,12 +266,12 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -640,7 +286,6 @@ "\n", "selection_evaluation_pruning = [.1, 1]\n", "selection_evaluation_scaling = .5\n", - "cv = 5\n", "\n", "#Population and budget use stepwise\n", "fig, ax1 = plt.subplots()\n", @@ -655,21 +300,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'time' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[4], line 16\u001b[0m\n\u001b[1;32m 1\u001b[0m est \u001b[38;5;241m=\u001b[39m tpot2\u001b[38;5;241m.\u001b[39mTPOTEstimator( \n\u001b[1;32m 2\u001b[0m generations\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m,\n\u001b[1;32m 3\u001b[0m scorers\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mroc_auc_ovr\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \n\u001b[1;32m 13\u001b[0m verbose\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m---> 16\u001b[0m start \u001b[38;5;241m=\u001b[39m \u001b[43mtime\u001b[49m\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m 17\u001b[0m est\u001b[38;5;241m.\u001b[39mfit(X, y)\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal time: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtime\u001b[38;5;241m.\u001b[39mtime()\u001b[38;5;241m-\u001b[39mstart\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'time' is not defined" - ] - } - ], + "outputs": [], "source": [ "\n", "\n", @@ -677,16 +310,18 @@ "est = tpot2.TPOTEstimator( \n", " generations=50,\n", " max_time_mins=None,\n", + " scorers=['roc_auc_ovr'],\n", " scorers_weights=[1],\n", " classification=True,\n", - " search_space = 
\"linear-light\",\n", + " search_space = 'linear',\n", " n_jobs=32,\n", " cv=cv,\n", + " early_stop=5,\n", + " verbose=3,\n", "\n", " selection_evaluation_pruning = selection_evaluation_pruning,\n", " selection_evaluation_scaling = selection_evaluation_scaling,\n", - "\n", - " verbose=0)\n", + " )\n", "\n", "\n", "start = time.time()\n", @@ -695,59 +330,44 @@ ] }, { - "attachments": {}, - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "All of the above methods can be used independently or simultaneously as done below:" + "# no pruning\n", + "est = tpot2.TPOTEstimator( \n", + " generations=50,\n", + " max_time_mins=None,\n", + " scorers=['roc_auc_ovr'],\n", + " scorers_weights=[1],\n", + " classification=True,\n", + " search_space = 'linear',\n", + " n_jobs=32,\n", + " cv=cv,\n", + " early_stop=5,\n", + " verbose=3,\n", + " )\n", + "\n", + "\n", + "start = time.time()\n", + "est.fit(X, y)\n", + "print(f\"total time: {time.time()-start}\")" ] }, { - "cell_type": "code", - "execution_count": 9, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1.2, 3.4, 1. ])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "import math\n", - "np.array([1.2,3.4,1])" + "All of the above methods can be used independently or simultaneously as done below:" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/perib/Projects/common/Projects/TPOT_Dev/tpot2/tpot2/tpot_estimator/estimator.py:423: UserWarning: Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.\n", - " warnings.warn(\"Both generations and max_time_mins are set. 
TPOT will terminate when the first condition is met.\")\n", - "/home/perib/miniconda3/envs/myenv/lib/python3.10/site-packages/sklearn/preprocessing/_data.py:2785: UserWarning: n_quantiles (802) is greater than the total number of samples (150). n_quantiles is set to n_samples.\n", - " warnings.warn(\n", - "/home/perib/miniconda3/envs/myenv/lib/python3.10/site-packages/sklearn/linear_model/_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total time: 98.23117566108704\n" - ] - } - ], + "outputs": [], "source": [ "est = tpot2.TPOTEstimator( \n", " generations=50,\n", @@ -755,9 +375,11 @@ " scorers=['roc_auc_ovr'],\n", " scorers_weights=[1],\n", " classification=True,\n", - " search_space = 'linear-light',\n", + " search_space = 'linear',\n", " n_jobs=32,\n", " cv=cv,\n", + " early_stop=5,\n", + " verbose=3,\n", "\n", " population_size=population_size,\n", " initial_population_size=initial_population_size,\n", @@ -772,8 +394,7 @@ "\n", " selection_evaluation_pruning = selection_evaluation_pruning,\n", " selection_evaluation_scaling = selection_evaluation_scaling,\n", - "\n", - " verbose=0)\n", + " )\n", "\n", "\n", "start = time.time()\n",