diff --git a/Projects-ML/Reg-models/Supervised-ML-project.ipynb b/Projects-ML/Reg-models/Supervised-ML-project.ipynb index 1ef5cef..45d6ef7 100644 --- a/Projects-ML/Reg-models/Supervised-ML-project.ipynb +++ b/Projects-ML/Reg-models/Supervised-ML-project.ipynb @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -994,12 +994,575 @@ "print(f'Lasso when large number of alpha {lasso_regressor1.best_score_}')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Now let's use train test method:**" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "from sklearn.exceptions import ConvergenceWarning\n", + "\n", + "# Suppress convergence warnings\n", + "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Linear regression = -25.18787473928514\n", + "Ridge regression = {'alpha': 0.01}\n", + "Ridge regression = -25.18689936738697\n", + "Lasso regression = {'alpha': 1e-15}\n", + "Lasso regression = -25.18787473928503\n" + ] + } + ], + "source": [ + "lin_rag2 = LinearRegression() \n", + "mse = cross_val_score(lin_rag, X_train,y_train,scoring='neg_mean_squared_error', cv=5)\n", + "mean_mse = np.mean(mse)\n", + "print(f'Linear regression = {mean_mse}')\n", + "\n", + "ridge2 =Ridge()\n", + "params = {'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,35,40,50,55,60,80,90,100]}\n", + "ridge_regressor2 = GridSearchCV(ridge2, params, scoring='neg_mean_squared_error', cv=5)\n", + "ridge_regressor2.fit(X_train,y_train)\n", + "\n", + "print(f'Ridge regression = {ridge_regressor2.best_params_}')\n", + "print(f'Ridge regression = {ridge_regressor2.best_score_}')\n", + "\n", + "lasso2 =Lasso()\n", + "params = {'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,35,40,50,55,60,80,90,100]}\n", + "lasso_regressor2 = GridSearchCV(lasso2, params, scoring='neg_mean_squared_error', cv=5)\n", + "lasso_regressor2.fit(X_train,y_train)\n", + "\n", + "print(f'Lasso regression = {lasso_regressor2.best_params_}')\n", + "print(f'Lasso regression = {lasso_regressor2.best_score_}')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6832260852266521\n" + ] + } + ], + "source": [ + "y_pred = lin_rag.predict(X_test)\n", + "from sklearn.metrics import r2_score\n", + "\n", + "r2_score_lin = r2_score(y_pred, y_test)\n", + "\n", + "print(r2_score_lin)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6708743257533069\n" + ] + } + ], + "source": [ + "y_pred = ridge_regressor2.predict(X_test)\n", + "from sklearn.metrics import r2_score\n", + "\n", + "r2_score_ridge = r2_score(y_pred, y_test)\n", + "\n", + "print(r2_score_ridge)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"0.670955897674443\n" + ] + } + ], + "source": [ + "y_pred = lasso_regressor2.predict(X_test)\n", + "from sklearn.metrics import r2_score\n", + "\n", + "r2_score_lasso = r2_score(y_pred, y_test)\n", + "\n", + "print(r2_score_lasso)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **Logistic regression**\n", + "\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n", + "\n", + "`class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression \n", + "from sklearn.datasets import load_breast_cancer\n", + "\n", + "df = load_breast_cancer()\n", + "\n", + "# Independent features\n", + "X = pd.DataFrame(df['data'], columns=df['feature_names'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
017.9910.38122.801001.00.118400.277600.30010.147100.24190.07871...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
120.5717.77132.901326.00.084740.078640.08690.070170.18120.05667...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
219.6921.25130.001203.00.109600.159900.19740.127900.20690.05999...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
311.4220.3877.58386.10.142500.283900.24140.105200.25970.09744...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
420.2914.34135.101297.00.100300.132800.19800.104300.18090.05883...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", + "

5 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " mean radius mean texture mean perimeter mean area mean smoothness \\\n", + "0 17.99 10.38 122.80 1001.0 0.11840 \n", + "1 20.57 17.77 132.90 1326.0 0.08474 \n", + "2 19.69 21.25 130.00 1203.0 0.10960 \n", + "3 11.42 20.38 77.58 386.1 0.14250 \n", + "4 20.29 14.34 135.10 1297.0 0.10030 \n", + "\n", + " mean compactness mean concavity mean concave points mean symmetry \\\n", + "0 0.27760 0.3001 0.14710 0.2419 \n", + "1 0.07864 0.0869 0.07017 0.1812 \n", + "2 0.15990 0.1974 0.12790 0.2069 \n", + "3 0.28390 0.2414 0.10520 0.2597 \n", + "4 0.13280 0.1980 0.10430 0.1809 \n", + "\n", + " mean fractal dimension ... worst radius worst texture worst perimeter \\\n", + "0 0.07871 ... 25.38 17.33 184.60 \n", + "1 0.05667 ... 24.99 23.41 158.80 \n", + "2 0.05999 ... 23.57 25.53 152.50 \n", + "3 0.09744 ... 14.91 26.50 98.87 \n", + "4 0.05883 ... 22.54 16.67 152.20 \n", + "\n", + " worst area worst smoothness worst compactness worst concavity \\\n", + "0 2019.0 0.1622 0.6656 0.7119 \n", + "1 1956.0 0.1238 0.1866 0.2416 \n", + "2 1709.0 0.1444 0.4245 0.4504 \n", + "3 567.7 0.2098 0.8663 0.6869 \n", + "4 1575.0 0.1374 0.2050 0.4000 \n", + "\n", + " worst concave points worst symmetry worst fractal dimension \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + "\n", + "[5 rows x 30 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Depednent feature \n", + "y = pd.DataFrame(df['target'], columns=[\"Target\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Target
00
10
20
30
40
......
5640
5650
5660
5670
5681
\n", + "

569 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Target\n", + "0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + ".. ...\n", + "564 0\n", + "565 0\n", + "566 0\n", + "567 0\n", + "568 1\n", + "\n", + "[569 rows x 1 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Target\n", + "1 357\n", + "0 212\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# lets check the dataset is imbalanced \n", + "y['Target'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "probably it is a balanced dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "params = [{'C': [1,5,10]}, {'max_iter': [100,150]}]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "model1 = LogisticRegression(C=100, max_iter=100)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "GridSearchCV(model1, param_grid=params, scoring='f1')" + ] } ], "metadata": {