From 6286767e4ac6cf07d00f8a72c71b825e55031720 Mon Sep 17 00:00:00 2001 From: Arun Kumar Pandey Date: Thu, 7 Mar 2024 21:35:22 +0100 Subject: [PATCH] Here I discussed the ASM for the decision tree --- .../Project-2.4-Decision-tree.ipynb | 244 ++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 Supervised-learning/Project-2.4-Decision-tree.ipynb diff --git a/Supervised-learning/Project-2.4-Decision-tree.ipynb b/Supervised-learning/Project-2.4-Decision-tree.ipynb new file mode 100644 index 0000000..a4af255 --- /dev/null +++ b/Supervised-learning/Project-2.4-Decision-tree.ipynb @@ -0,0 +1,244 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Tree" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " User ID Gender Age EstimatedSalary Purchased\n", + "0 15624510 Male 19 19000 0\n", + "1 15810944 Male 35 20000 0\n", + "2 15668575 Female 26 43000 0\n", + "3 15603246 Female 27 57000 0\n", + "4 15804002 Male 19 76000 0\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import random \n", + "import matplotlib.pyplot as plt \n", + "\n", + "df_user = pd.read_csv('User_Data.csv')\n", + "\n", + "# Preview the first five rows of the loaded data\n", + "print(df_user.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Features (Age, EstimatedSalary) and target (Purchased), selected by column name\n", + "X = df_user[['Age', 'EstimatedSalary']].values\n", + "y = df_user['Purchased'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Hold out 25% of the rows as a test set; the fixed seed makes the split reproducible\n", + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, +
"outputs": [], + "source": [ + "# Standardize the features: fit the scaler on the training set only, then reuse it for the test set\n", + "from sklearn.preprocessing import StandardScaler\n", + "sc = StandardScaler()\n", + "X_train = sc.fit_transform(X_train)\n", + "X_test = sc.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
DecisionTreeClassifier(criterion='entropy', random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "DecisionTreeClassifier(criterion='entropy', random_state=0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Fitting Decision Tree classifier to the training set \n", + "\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)\n", + "classifier.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the code above, we created a classifier object and passed it two main parameters:\n", + "\n", + "- `criterion='entropy'`: the function used to measure the quality of a split; here it is the information gain computed from entropy.\n", + "- `random_state=0`: seeds the estimator's random number generator so that the fitted tree is reproducible." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#Predicting the test set result \n", + "y_pred= classifier.predict(X_test) " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "#Creating the Confusion matrix \n", + "from sklearn.metrics import confusion_matrix \n", + "cm= confusion_matrix(y_test, y_pred) " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\hp\\AppData\\Local\\Temp\\ipykernel_18076\\1231252719.py:18: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. 
Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n", + " axes[0].scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],\n", + "C:\\Users\\hp\\AppData\\Local\\Temp\\ipykernel_18076\\1231252719.py:36: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n", + " axes[1].scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Importing libraries\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.colors import ListedColormap\n", + "\n", + "# Set up the figure with two subplots in one row and two columns\n", + "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n", + "\n", + "# Visualizing the training set result: predict on a dense grid to draw the decision regions\n", + "x_set, y_set = X_train, y_train\n", + "X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),\n", + "                     np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))\n", + "axes[0].contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),\n", + "                 alpha=0.75, cmap=ListedColormap(['#87CEEB', '#90EE90']))\n", + "axes[0].set_xlim(X1.min(), X1.max())\n", + "axes[0].set_ylim(X2.min(), X2.max())\n", + "# Overlay the samples, one colour per true class (color= takes a single RGBA value without a warning)\n", + "for i, j in enumerate(np.unique(y_set)):\n", + "    axes[0].scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],\n", + "                    color=ListedColormap(['#0000FF', '#2ca02c'])(i), label=j)\n", + "\n", + "axes[0].set_title('Decision Tree (Training set)')\n", + "axes[0].set_xlabel('Age (standardized)')\n", + "axes[0].set_ylabel('Estimated Salary (standardized)')\n", + "axes[0].legend()\n", + "\n", + "# Visualizing the test set result: same decision regions, evaluated over the test set's range\n", + "x_set, y_set = X_test, y_test\n", + "X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),\n", + "                     np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))\n", + "axes[1].contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),\n", + "                 alpha=0.75, cmap=ListedColormap(['#87CEEB', '#90EE90']))\n", + "axes[1].set_xlim(X1.min(), X1.max())\n", + "axes[1].set_ylim(X2.min(), X2.max())\n", + "# Overlay the held-out samples, one colour per true class\n", + "for i, j in enumerate(np.unique(y_set)):\n", + "    axes[1].scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],\n", + "                    color=ListedColormap(['#0000FF', '#2ca02c'])(i), label=j)\n", + "\n", +
"axes[1].set_title('Decision Tree (Test set)')\n", + "axes[1].set_xlabel('Age (standardized)')\n", + "axes[1].set_ylabel('Estimated Salary (standardized)')\n", + "axes[1].legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the plots above show, some green data points fall inside the blue region and some blue data points fall inside the green region. These are the misclassified samples; in the test-set plot they correspond to the off-diagonal entries of the confusion matrix `cm` computed earlier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}