-
Notifications
You must be signed in to change notification settings - Fork 0
/
heart_disease_classification_and_clustering
1 lines (1 loc) · 41.5 KB
/
heart_disease_classification_and_clustering
1
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":9485965,"sourceType":"datasetVersion","datasetId":5770734}],"dockerImageVersionId":30776,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## Import Needed Libraries","metadata":{}},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport plotly_express as px\n\nfrom sklearn.model_selection import train_test_split \nfrom sklearn.preprocessing import LabelEncoder,MinMaxScaler ,OneHotEncoder\n\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.svm import SVC\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier\nfrom sklearn.feature_selection import SelectKBest, f_classif\nfrom sklearn.cluster import KMeans , AgglomerativeClustering\nimport scipy.cluster.hierarchy as shc\nfrom imblearn.over_sampling import SMOTE\n\nfrom sklearn.metrics import silhouette_score , accuracy_score , classification_report ,confusion_matrix,ConfusionMatrixDisplay\n\nprint('modules loaded')","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2024-09-26T17:27:57.392773Z","iopub.execute_input":"2024-09-26T17:27:57.393143Z","iopub.status.idle":"2024-09-26T17:28:00.728751Z","shell.execute_reply.started":"2024-09-26T17:27:57.393106Z","shell.execute_reply":"2024-09-26T17:28:00.72776Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## 
EDA","metadata":{}},{"cell_type":"markdown","source":"## Read data","metadata":{}},{"cell_type":"code","source":"df=pd.read_csv('/kaggle/input/heart-disease/heart_disease.csv')","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:28:08.865859Z","iopub.execute_input":"2024-09-26T17:28:08.866425Z","iopub.status.idle":"2024-09-26T17:28:10.148039Z","shell.execute_reply.started":"2024-09-26T17:28:08.866387Z","shell.execute_reply":"2024-09-26T17:28:10.147202Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.head()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:22:54.547274Z","iopub.execute_input":"2024-09-26T17:22:54.547589Z","iopub.status.idle":"2024-09-26T17:22:54.576514Z","shell.execute_reply.started":"2024-09-26T17:22:54.547557Z","shell.execute_reply":"2024-09-26T17:22:54.57566Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.info()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:22:54.57867Z","iopub.execute_input":"2024-09-26T17:22:54.578959Z","iopub.status.idle":"2024-09-26T17:22:54.999051Z","shell.execute_reply.started":"2024-09-26T17:22:54.578928Z","shell.execute_reply":"2024-09-26T17:22:54.998098Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.describe(include='all')","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:22:55.000369Z","iopub.execute_input":"2024-09-26T17:22:55.000777Z","iopub.status.idle":"2024-09-26T17:22:56.127558Z","shell.execute_reply.started":"2024-09-26T17:22:55.000732Z","shell.execute_reply":"2024-09-26T17:22:56.126679Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"for i in df.columns:\n if df[i].dtype == 'object':\n print(f\"{i} : 
{df[i].unique()}\")","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:22:56.128874Z","iopub.execute_input":"2024-09-26T17:22:56.129283Z","iopub.status.idle":"2024-09-26T17:22:56.456806Z","shell.execute_reply.started":"2024-09-26T17:22:56.129234Z","shell.execute_reply":"2024-09-26T17:22:56.455871Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Visualization","metadata":{}},{"cell_type":"code","source":"count_HeartDisease=px.pie(df,names='HeartDisease',title='count of HeartDisease')\ncount_HeartDisease.update_traces(textposition='inside', textinfo='value+percent')\ncount_HeartDisease.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:22:56.457953Z","iopub.execute_input":"2024-09-26T17:22:56.458275Z","iopub.status.idle":"2024-09-26T17:22:58.948665Z","shell.execute_reply.started":"2024-09-26T17:22:56.458241Z","shell.execute_reply":"2024-09-26T17:22:58.947284Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_Smoking=px.pie(df,names='Smoking',title='count of Smoking')\ncount_Smoking.update_traces(textposition='inside', textinfo='value+percent')\ncount_Smoking.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:22:58.950148Z","iopub.execute_input":"2024-09-26T17:22:58.950477Z","iopub.status.idle":"2024-09-26T17:22:59.821843Z","shell.execute_reply.started":"2024-09-26T17:22:58.950442Z","shell.execute_reply":"2024-09-26T17:22:59.819339Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_AlcoholDrinking=px.pie(df,names='AlcoholDrinking',title='count of AlcoholDrinking')\ncount_AlcoholDrinking.update_traces(textposition='inside', 
textinfo='value+percent')\ncount_AlcoholDrinking.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:22:59.823095Z","iopub.execute_input":"2024-09-26T17:22:59.823414Z","iopub.status.idle":"2024-09-26T17:23:00.677218Z","shell.execute_reply.started":"2024-09-26T17:22:59.823381Z","shell.execute_reply":"2024-09-26T17:23:00.675904Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_Stroke=px.pie(df,names='Stroke',title='count of Stroke')\ncount_Stroke.update_traces(textposition='inside', textinfo='value+percent')\ncount_Stroke.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:00.680694Z","iopub.execute_input":"2024-09-26T17:23:00.680994Z","iopub.status.idle":"2024-09-26T17:23:01.534987Z","shell.execute_reply.started":"2024-09-26T17:23:00.680963Z","shell.execute_reply":"2024-09-26T17:23:01.533768Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_DiffWalking=px.pie(df,names='DiffWalking',title='count of DiffWalking')\ncount_DiffWalking.update_traces(textposition='inside', textinfo='value+percent')\ncount_DiffWalking.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:01.536448Z","iopub.execute_input":"2024-09-26T17:23:01.536824Z","iopub.status.idle":"2024-09-26T17:23:02.393177Z","shell.execute_reply.started":"2024-09-26T17:23:01.536783Z","shell.execute_reply":"2024-09-26T17:23:02.392043Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_Sex=px.pie(df,names='Sex',title='count of Sex',color_discrete_sequence=['pink', 'lightblue'])\ncount_Sex.update_traces(textposition='inside', 
textinfo='value+percent')\ncount_Sex.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:02.394485Z","iopub.execute_input":"2024-09-26T17:23:02.394864Z","iopub.status.idle":"2024-09-26T17:23:03.26778Z","shell.execute_reply.started":"2024-09-26T17:23:02.394823Z","shell.execute_reply":"2024-09-26T17:23:03.265029Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_PhysicalActivity=px.pie(df,names='PhysicalActivity',title='count of PhysicalActivity')\ncount_PhysicalActivity.update_traces(textposition='inside', textinfo='value+percent')\ncount_PhysicalActivity.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:03.269357Z","iopub.execute_input":"2024-09-26T17:23:03.269719Z","iopub.status.idle":"2024-09-26T17:23:04.127575Z","shell.execute_reply.started":"2024-09-26T17:23:03.269681Z","shell.execute_reply":"2024-09-26T17:23:04.126691Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_Asthma=px.pie(df,names='Asthma',title='count of Asthma')\ncount_Asthma.update_traces(textposition='inside', textinfo='value+percent')\ncount_Asthma.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:04.128812Z","iopub.execute_input":"2024-09-26T17:23:04.129157Z","iopub.status.idle":"2024-09-26T17:23:04.980893Z","shell.execute_reply.started":"2024-09-26T17:23:04.12912Z","shell.execute_reply":"2024-09-26T17:23:04.979792Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_KidneyDisease=px.pie(df,names='KidneyDisease',title='count of KidneyDisease')\ncount_KidneyDisease.update_traces(textposition='inside', 
textinfo='value+percent')\ncount_KidneyDisease.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:04.981966Z","iopub.execute_input":"2024-09-26T17:23:04.98225Z","iopub.status.idle":"2024-09-26T17:23:05.832719Z","shell.execute_reply.started":"2024-09-26T17:23:04.982219Z","shell.execute_reply":"2024-09-26T17:23:05.831584Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_SkinCancer=px.pie(df,names='SkinCancer',title='count of SkinCancer')\ncount_SkinCancer.update_traces(textposition='inside', textinfo='value+percent')\ncount_SkinCancer.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:05.833948Z","iopub.execute_input":"2024-09-26T17:23:05.834294Z","iopub.status.idle":"2024-09-26T17:23:06.709875Z","shell.execute_reply.started":"2024-09-26T17:23:05.834256Z","shell.execute_reply":"2024-09-26T17:23:06.708871Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df['AgeCategory'].value_counts()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:06.711214Z","iopub.execute_input":"2024-09-26T17:23:06.71154Z","iopub.status.idle":"2024-09-26T17:23:06.767008Z","shell.execute_reply.started":"2024-09-26T17:23:06.711504Z","shell.execute_reply":"2024-09-26T17:23:06.76611Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_AgeCategory=df['AgeCategory'].value_counts().reset_index()\ncount_AgeCategory.columns=['AgeCategory','counts']\nfig_age=px.bar(count_AgeCategory,x='AgeCategory',y='counts',title='count of 
AgeCategory')\nfig_age.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:06.768071Z","iopub.execute_input":"2024-09-26T17:23:06.768383Z","iopub.status.idle":"2024-09-26T17:23:06.914931Z","shell.execute_reply.started":"2024-09-26T17:23:06.768329Z","shell.execute_reply":"2024-09-26T17:23:06.914033Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_Race=df['Race'].value_counts().reset_index()\ncount_Race.columns=['Race','counts']\nfig_Race=px.bar(count_Race,x='Race',y='counts',title='count of Race')\nfig_Race.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:06.91634Z","iopub.execute_input":"2024-09-26T17:23:06.91666Z","iopub.status.idle":"2024-09-26T17:23:07.027501Z","shell.execute_reply.started":"2024-09-26T17:23:06.916592Z","shell.execute_reply":"2024-09-26T17:23:07.026604Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_Diabetic=df['Diabetic'].value_counts().reset_index()\ncount_Diabetic.columns=['Diabetic','counts']\nfig_Diabetic=px.bar(count_Diabetic,x='Diabetic',y='counts',title='count of Diabetic')\nfig_Diabetic.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:07.028666Z","iopub.execute_input":"2024-09-26T17:23:07.02898Z","iopub.status.idle":"2024-09-26T17:23:07.138743Z","shell.execute_reply.started":"2024-09-26T17:23:07.028947Z","shell.execute_reply":"2024-09-26T17:23:07.137848Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"count_GenHealth=df['GenHealth'].value_counts().reset_index()\ncount_GenHealth.columns=['GenHealth','counts']\nfig_GenHealth=px.bar(count_GenHealth,x='GenHealth',y='counts',title='count of 
GenHealth')\nfig_GenHealth.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:07.139986Z","iopub.execute_input":"2024-09-26T17:23:07.140363Z","iopub.status.idle":"2024-09-26T17:23:07.251793Z","shell.execute_reply.started":"2024-09-26T17:23:07.140306Z","shell.execute_reply":"2024-09-26T17:23:07.250873Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig_PhysicalHealth=px.histogram(df,x='PhysicalHealth',title='distribution of PhysicalHealth')\nfig_PhysicalHealth.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:07.253101Z","iopub.execute_input":"2024-09-26T17:23:07.2538Z","iopub.status.idle":"2024-09-26T17:23:07.347301Z","shell.execute_reply.started":"2024-09-26T17:23:07.253752Z","shell.execute_reply":"2024-09-26T17:23:07.346345Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig_SleepTime=px.histogram(df,x='SleepTime',title='distribution of SleepTime')\nfig_SleepTime.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:07.348645Z","iopub.execute_input":"2024-09-26T17:23:07.349061Z","iopub.status.idle":"2024-09-26T17:23:07.42338Z","shell.execute_reply.started":"2024-09-26T17:23:07.349014Z","shell.execute_reply":"2024-09-26T17:23:07.422534Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig_BMI=px.histogram(df,x='BMI',title='distribution of BMI')\nfig_BMI.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:07.424685Z","iopub.execute_input":"2024-09-26T17:23:07.425025Z","iopub.status.idle":"2024-09-26T17:23:07.530052Z","shell.execute_reply.started":"2024-09-26T17:23:07.424986Z","shell.execute_reply":"2024-09-26T17:23:07.529052Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig_SleepTime_HeartDisease=px.histogram(df,x='SleepTime',color='HeartDisease',title='distribution of SleepTime by 
HeartDisease')\nfig_SleepTime_HeartDisease.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:07.531318Z","iopub.execute_input":"2024-09-26T17:23:07.531762Z","iopub.status.idle":"2024-09-26T17:23:07.683858Z","shell.execute_reply.started":"2024-09-26T17:23:07.531721Z","shell.execute_reply":"2024-09-26T17:23:07.682946Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig=px.histogram(df,x='GenHealth',color='HeartDisease',title='distribution of GenHealth by HeartDisease')\nfig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:07.685142Z","iopub.execute_input":"2024-09-26T17:23:07.685601Z","iopub.status.idle":"2024-09-26T17:23:08.644757Z","shell.execute_reply.started":"2024-09-26T17:23:07.685546Z","shell.execute_reply":"2024-09-26T17:23:08.643289Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig=px.histogram(df,x='AgeCategory',color='HeartDisease',title='distribution of AgeCategory by HeartDisease')\nfig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:08.646188Z","iopub.execute_input":"2024-09-26T17:23:08.646661Z","iopub.status.idle":"2024-09-26T17:23:09.600252Z","shell.execute_reply.started":"2024-09-26T17:23:08.646601Z","shell.execute_reply":"2024-09-26T17:23:09.599125Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig=px.histogram(df,x='Diabetic',color='HeartDisease',title='distribution of Diabetic by HeartDisease')\nfig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:09.607623Z","iopub.execute_input":"2024-09-26T17:23:09.607959Z","iopub.status.idle":"2024-09-26T17:23:10.559035Z","shell.execute_reply.started":"2024-09-26T17:23:09.607924Z","shell.execute_reply":"2024-09-26T17:23:10.55791Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig=px.histogram(df,x='Race',color='HeartDisease',title='distribution of Race by 
HeartDisease')\nfig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:10.560171Z","iopub.execute_input":"2024-09-26T17:23:10.560461Z","iopub.status.idle":"2024-09-26T17:23:11.503999Z","shell.execute_reply.started":"2024-09-26T17:23:10.56043Z","shell.execute_reply":"2024-09-26T17:23:11.502559Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig=px.histogram(df,x='Sex',color='HeartDisease',title='distribution of Sex by HeartDisease')\nfig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:11.505092Z","iopub.execute_input":"2024-09-26T17:23:11.505388Z","iopub.status.idle":"2024-09-26T17:23:12.45323Z","shell.execute_reply.started":"2024-09-26T17:23:11.505354Z","shell.execute_reply":"2024-09-26T17:23:12.452153Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig=px.histogram(df,x='DiffWalking',color='HeartDisease',title='distribution of DiffWalking by HeartDisease')\nfig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:12.454949Z","iopub.execute_input":"2024-09-26T17:23:12.455316Z","iopub.status.idle":"2024-09-26T17:23:13.388647Z","shell.execute_reply.started":"2024-09-26T17:23:12.455274Z","shell.execute_reply":"2024-09-26T17:23:13.387642Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig=px.histogram(df,x='AlcoholDrinking',color='HeartDisease',title='distribution of AlcoholDrinking by HeartDisease')\nfig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:13.389927Z","iopub.execute_input":"2024-09-26T17:23:13.390245Z","iopub.status.idle":"2024-09-26T17:23:14.32703Z","shell.execute_reply.started":"2024-09-26T17:23:13.390211Z","shell.execute_reply":"2024-09-26T17:23:14.326049Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig=px.histogram(df,x='Stroke',color='HeartDisease',title='distribution of Stroke by 
HeartDisease')\nfig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:14.328382Z","iopub.execute_input":"2024-09-26T17:23:14.328738Z","iopub.status.idle":"2024-09-26T17:23:15.254843Z","shell.execute_reply.started":"2024-09-26T17:23:14.328704Z","shell.execute_reply":"2024-09-26T17:23:15.253527Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig=px.histogram(df,x='Smoking',color='HeartDisease',title='distribution of Smoking by HeartDisease')\nfig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:15.255922Z","iopub.execute_input":"2024-09-26T17:23:15.256217Z","iopub.status.idle":"2024-09-26T17:23:16.208929Z","shell.execute_reply.started":"2024-09-26T17:23:15.256184Z","shell.execute_reply":"2024-09-26T17:23:16.207953Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"for i in df.columns:\n if i in ['BMI','PhysicalHealth','MentalHealth','SleepTime']:\n print(i)\n fig = px.box(df, x=i)\n fig.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:16.210172Z","iopub.execute_input":"2024-09-26T17:23:16.21052Z","iopub.status.idle":"2024-09-26T17:23:16.557373Z","shell.execute_reply.started":"2024-09-26T17:23:16.210485Z","shell.execute_reply":"2024-09-26T17:23:16.556597Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## 
**preprocessing**","metadata":{}},{"cell_type":"code","source":"df.duplicated().sum()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:16.558495Z","iopub.execute_input":"2024-09-26T17:23:16.558886Z","iopub.status.idle":"2024-09-26T17:23:16.95101Z","shell.execute_reply.started":"2024-09-26T17:23:16.558841Z","shell.execute_reply":"2024-09-26T17:23:16.950057Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# drop_duplicates() returns a new frame, so the result must be assigned back;\n# otherwise the duplicates counted above are never actually removed.\n# reset_index keeps a contiguous RangeIndex: the one-hot frame built in the\n# Encoder cell is concatenated by index and would misalign (NaN rows) against\n# an index with gaps.\ndf = df.drop_duplicates().reset_index(drop=True)\ndf.shape","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:16.952073Z","iopub.execute_input":"2024-09-26T17:23:16.952348Z","iopub.status.idle":"2024-09-26T17:23:17.402102Z","shell.execute_reply.started":"2024-09-26T17:23:16.952318Z","shell.execute_reply":"2024-09-26T17:23:17.401155Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.isnull().sum()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:17.403464Z","iopub.execute_input":"2024-09-26T17:23:17.40388Z","iopub.status.idle":"2024-09-26T17:23:17.807785Z","shell.execute_reply.started":"2024-09-26T17:23:17.403833Z","shell.execute_reply":"2024-09-26T17:23:17.806658Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df['HeartDisease'].value_counts()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:17.80881Z","iopub.execute_input":"2024-09-26T17:23:17.8091Z","iopub.status.idle":"2024-09-26T17:23:17.864217Z","shell.execute_reply.started":"2024-09-26T17:23:17.809069Z","shell.execute_reply":"2024-09-26T17:23:17.863242Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Encoder","metadata":{}},{"cell_type":"code","source":"lb =LabelEncoder()\ncolumns_lb=['SkinCancer','KidneyDisease','Asthma','PhysicalActivity','Sex','DiffWalking'\n ,'Stroke','AlcoholDrinking','Smoking','HeartDisease']\nfor i in columns_lb:\n df[i]=lb.fit_transform(df[i])\n 
\ncolumns_onehot=['AgeCategory','Race','Diabetic','GenHealth']\nencoder = OneHotEncoder(sparse_output=False)\n\none_hot_encoded = encoder.fit_transform(df[columns_onehot])\n\none_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(columns_onehot))\n\ndf = pd.concat([df, one_hot_df], axis=1)\n\ndf = df.drop(columns_onehot, axis=1)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:17.865518Z","iopub.execute_input":"2024-09-26T17:23:17.865961Z","iopub.status.idle":"2024-09-26T17:23:19.357043Z","shell.execute_reply.started":"2024-09-26T17:23:17.86592Z","shell.execute_reply":"2024-09-26T17:23:19.356005Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:19.35837Z","iopub.execute_input":"2024-09-26T17:23:19.358697Z","iopub.status.idle":"2024-09-26T17:23:19.470679Z","shell.execute_reply.started":"2024-09-26T17:23:19.358663Z","shell.execute_reply":"2024-09-26T17:23:19.469549Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.shape","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:19.47208Z","iopub.execute_input":"2024-09-26T17:23:19.472417Z","iopub.status.idle":"2024-09-26T17:23:19.478886Z","shell.execute_reply.started":"2024-09-26T17:23:19.472385Z","shell.execute_reply":"2024-09-26T17:23:19.477662Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import seaborn as sns\nplt.figure(figsize=(15 ,10))\nsns.heatmap(df.corr(), annot=True)\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:19.481046Z","iopub.execute_input":"2024-09-26T17:23:19.481657Z","iopub.status.idle":"2024-09-26T17:23:25.510837Z","shell.execute_reply.started":"2024-09-26T17:23:19.481577Z","shell.execute_reply":"2024-09-26T17:23:25.509246Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## check 
outliers","metadata":{}},{"cell_type":"code","source":"def count_outliers_in_dataframe(df):\n outlier_summary = {}\n \n for column in df.select_dtypes(include=['number']).columns: # Iterate over numeric columns\n Q1 = df[column].quantile(0.25)\n Q3 = df[column].quantile(0.75)\n IQR = Q3 - Q1\n \n # Define lower and upper bounds\n lower_bound = Q1 - 1.5 * IQR\n upper_bound = Q3 + 1.5 * IQR\n \n # Identify outliers\n outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n \n # Store results\n outlier_summary[column] = {\n 'num_outliers': len(outliers),\n 'outlier_values': outliers[column].tolist() # Convert to list for better readability\n }\n \n return outlier_summary","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:25.513072Z","iopub.execute_input":"2024-09-26T17:23:25.513404Z","iopub.status.idle":"2024-09-26T17:23:25.520818Z","shell.execute_reply.started":"2024-09-26T17:23:25.51337Z","shell.execute_reply":"2024-09-26T17:23:25.519471Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"outliers_result = count_outliers_in_dataframe(df)\n\n# Print outlier summary\nfor column, result in outliers_result.items():\n print(f\"Column '{column}':\")\n print(f\" Number of outliers: {result['num_outliers']}\")","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:25.522139Z","iopub.execute_input":"2024-09-26T17:23:25.522429Z","iopub.status.idle":"2024-09-26T17:23:26.251376Z","shell.execute_reply.started":"2024-09-26T17:23:25.522398Z","shell.execute_reply":"2024-09-26T17:23:26.250389Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"out_list = []\n\nfor i in ['BMI','PhysicalHealth','MentalHealth','SleepTime']:\n Q1 = df[i].quantile(0.25)\n Q3 = df[i].quantile(0.75)\n IQR = Q3 - Q1\n up = Q3 + 1.5 * IQR\n low = Q1 - 1.5 * IQR\n\n if (df.loc[(df[i] > up) | (df[i] < low)]).any(axis=None):\n print(i, \"yes\")\n out_list.append(i) # Add the column name to out_list\n 
else:\n print(i, \"no\")\n\nprint(\"Columns with outliers:\", out_list)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:26.252817Z","iopub.execute_input":"2024-09-26T17:23:26.253227Z","iopub.status.idle":"2024-09-26T17:23:26.363629Z","shell.execute_reply.started":"2024-09-26T17:23:26.25318Z","shell.execute_reply":"2024-09-26T17:23:26.362653Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Remove outliers","metadata":{}},{"cell_type":"code","source":"def remove_outliers_from_dataframe(df):\n \n for column in df.columns:\n if column in ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']:\n Q1 = df[column].quantile(0.25)\n Q3 = df[column].quantile(0.75)\n IQR = Q3 - Q1\n \n lower_bound = Q1 - 1.5 * IQR\n upper_bound = Q3 + 1.5 * IQR\n \n df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]\n return df","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:26.364942Z","iopub.execute_input":"2024-09-26T17:23:26.36524Z","iopub.status.idle":"2024-09-26T17:23:26.371409Z","shell.execute_reply.started":"2024-09-26T17:23:26.365208Z","shell.execute_reply":"2024-09-26T17:23:26.370399Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df=remove_outliers_from_dataframe(df)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:26.372986Z","iopub.execute_input":"2024-09-26T17:23:26.373389Z","iopub.status.idle":"2024-09-26T17:23:26.540445Z","shell.execute_reply.started":"2024-09-26T17:23:26.373337Z","shell.execute_reply":"2024-09-26T17:23:26.539664Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:26.541678Z","iopub.execute_input":"2024-09-26T17:23:26.542054Z","iopub.status.idle":"2024-09-26T17:23:26.609576Z","shell.execute_reply.started":"2024-09-26T17:23:26.542012Z","shell.execute_reply":"2024-09-26T17:23:26.60854Z"},"trusted":true},"execution_count":n
ull,"outputs":[]},{"cell_type":"code","source":"x=df.drop('HeartDisease',axis=1)\ny=df[['HeartDisease']]","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:26.61085Z","iopub.execute_input":"2024-09-26T17:23:26.611178Z","iopub.status.idle":"2024-09-26T17:23:26.629111Z","shell.execute_reply.started":"2024-09-26T17:23:26.611143Z","shell.execute_reply":"2024-09-26T17:23:26.628384Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Fixed seed (same value the train/test split uses) so the synthetic minority\n# rows, and every metric computed from them, are identical on each run.\n# NOTE(review): oversampling here happens before train_test_split, so synthetic\n# rows interpolated from future test rows can end up in the training set;\n# consider resampling only the training fold.\noversample = SMOTE(random_state=42)\n\nX_resampled, y_resampled = oversample.fit_resample(x, y)\nx_sample=pd.DataFrame(X_resampled)\ny_sample=pd.DataFrame(y_resampled)\n\ndatanew = pd.concat([x_sample,y_sample], axis=1)\ndf = datanew\nprint(f\"The dataset now has {df.shape[0]} rows.\")","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:26.630157Z","iopub.execute_input":"2024-09-26T17:23:26.630434Z","iopub.status.idle":"2024-09-26T17:23:27.985047Z","shell.execute_reply.started":"2024-09-26T17:23:26.630403Z","shell.execute_reply":"2024-09-26T17:23:27.983982Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df['HeartDisease'].value_counts()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:27.986266Z","iopub.execute_input":"2024-09-26T17:23:27.986628Z","iopub.status.idle":"2024-09-26T17:23:27.99892Z","shell.execute_reply.started":"2024-09-26T17:23:27.986574Z","shell.execute_reply":"2024-09-26T17:23:27.997958Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"x=df.drop('HeartDisease',axis=1)\ny=df['HeartDisease']","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:28.000031Z","iopub.execute_input":"2024-09-26T17:23:28.000395Z","iopub.status.idle":"2024-09-26T17:23:28.050583Z","shell.execute_reply.started":"2024-09-26T17:23:28.00034Z","shell.execute_reply":"2024-09-26T17:23:28.049534Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.decomposition import 
PCA\npca=PCA(0.95)\nX_pca=pca.fit_transform(x)\nX_pca.shape","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:28.051863Z","iopub.execute_input":"2024-09-26T17:23:28.052234Z","iopub.status.idle":"2024-09-26T17:23:28.93477Z","shell.execute_reply.started":"2024-09-26T17:23:28.052181Z","shell.execute_reply":"2024-09-26T17:23:28.933795Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## scaling data","metadata":{}},{"cell_type":"code","source":"scaler=MinMaxScaler()\nx=scaler.fit_transform(X_pca)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:28.938519Z","iopub.execute_input":"2024-09-26T17:23:28.940804Z","iopub.status.idle":"2024-09-26T17:23:28.970575Z","shell.execute_reply.started":"2024-09-26T17:23:28.940759Z","shell.execute_reply":"2024-09-26T17:23:28.969694Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## split data","metadata":{}},{"cell_type":"code","source":"X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:28.975035Z","iopub.execute_input":"2024-09-26T17:23:28.977225Z","iopub.status.idle":"2024-09-26T17:23:29.165125Z","shell.execute_reply.started":"2024-09-26T17:23:28.977179Z","shell.execute_reply":"2024-09-26T17:23:29.164295Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## supervised models","metadata":{}},{"cell_type":"markdown","source":"## logistic 
Regression","metadata":{}},{"cell_type":"code","source":"lg=LogisticRegression()\nlg.fit(X_train,y_train)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:29.16646Z","iopub.execute_input":"2024-09-26T17:23:29.167335Z","iopub.status.idle":"2024-09-26T17:23:30.035354Z","shell.execute_reply.started":"2024-09-26T17:23:29.167284Z","shell.execute_reply":"2024-09-26T17:23:30.034381Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"lg.score(X_train,y_train)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:30.036636Z","iopub.execute_input":"2024-09-26T17:23:30.037468Z","iopub.status.idle":"2024-09-26T17:23:30.065549Z","shell.execute_reply.started":"2024-09-26T17:23:30.03742Z","shell.execute_reply":"2024-09-26T17:23:30.064558Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"y_pred_lg=lg.predict(X_test)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:30.067131Z","iopub.execute_input":"2024-09-26T17:23:30.067769Z","iopub.status.idle":"2024-09-26T17:23:30.076408Z","shell.execute_reply.started":"2024-09-26T17:23:30.067722Z","shell.execute_reply":"2024-09-26T17:23:30.074837Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(f\"accuracy : {accuracy_score(y_test,y_pred_lg)}\")","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:30.078325Z","iopub.execute_input":"2024-09-26T17:23:30.079197Z","iopub.status.idle":"2024-09-26T17:23:30.090183Z","shell.execute_reply.started":"2024-09-26T17:23:30.079152Z","shell.execute_reply":"2024-09-26T17:23:30.088658Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"ConfusionMatrixDisplay.from_predictions(y_test, y_pred_lg)\nprint(confusion_matrix(y_test, 
y_pred_lg))","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:30.091702Z","iopub.execute_input":"2024-09-26T17:23:30.092275Z","iopub.status.idle":"2024-09-26T17:23:30.448558Z","shell.execute_reply.started":"2024-09-26T17:23:30.09223Z","shell.execute_reply":"2024-09-26T17:23:30.447683Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(classification_report(y_test, y_pred_lg))","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:30.450019Z","iopub.execute_input":"2024-09-26T17:23:30.450707Z","iopub.status.idle":"2024-09-26T17:23:30.493101Z","shell.execute_reply.started":"2024-09-26T17:23:30.450657Z","shell.execute_reply":"2024-09-26T17:23:30.492246Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Decision tree","metadata":{}},{"cell_type":"code","source":"# Fix the seed so the fitted tree, and every score reported below, is reproducible across runs.\ndt=DecisionTreeClassifier(random_state=42)\ndt.fit(X_train,y_train)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:30.494334Z","iopub.execute_input":"2024-09-26T17:23:30.494738Z","iopub.status.idle":"2024-09-26T17:23:39.073063Z","shell.execute_reply.started":"2024-09-26T17:23:30.494692Z","shell.execute_reply":"2024-09-26T17:23:39.072058Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Score on the training split, for comparison with the test accuracy below.\ndt.score(X_train,y_train)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:39.074214Z","iopub.execute_input":"2024-09-26T17:23:39.074533Z","iopub.status.idle":"2024-09-26T17:23:39.193478Z","shell.execute_reply.started":"2024-09-26T17:23:39.0745Z","shell.execute_reply":"2024-09-26T17:23:39.192581Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"y_pred_dt=dt.predict(X_test)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:39.194796Z","iopub.execute_input":"2024-09-26T17:23:39.195138Z","iopub.status.idle":"2024-09-26T17:23:39.226909Z","shell.execute_reply.started":"2024-09-26T17:23:39.195103Z","shell.execute_reply":"2024-09-26T17:23:39.226025Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"# Same labelled format as the logistic-regression accuracy cell.\nprint(f\"accuracy : {accuracy_score(y_test,y_pred_dt)}\")","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:39.228078Z","iopub.execute_input":"2024-09-26T17:23:39.228381Z","iopub.status.idle":"2024-09-26T17:23:39.237005Z","shell.execute_reply.started":"2024-09-26T17:23:39.22835Z","shell.execute_reply":"2024-09-26T17:23:39.236125Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"ConfusionMatrixDisplay.from_predictions(y_test, y_pred_dt)\nprint(confusion_matrix(y_test, y_pred_dt))","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:39.238002Z","iopub.execute_input":"2024-09-26T17:23:39.238343Z","iopub.status.idle":"2024-09-26T17:23:39.455676Z","shell.execute_reply.started":"2024-09-26T17:23:39.238296Z","shell.execute_reply":"2024-09-26T17:23:39.454667Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(classification_report(y_test, y_pred_dt))","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:39.456777Z","iopub.execute_input":"2024-09-26T17:23:39.457049Z","iopub.status.idle":"2024-09-26T17:23:39.497889Z","shell.execute_reply.started":"2024-09-26T17:23:39.457019Z","shell.execute_reply":"2024-09-26T17:23:39.497071Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Random 
forest","metadata":{}},{"cell_type":"code","source":"# random_state makes the bootstrap samples and feature draws reproducible;\n# n_jobs=-1 builds the trees on all available cores (same model, shorter wall time).\nrf=RandomForestClassifier(random_state=42,n_jobs=-1)\nrf.fit(X_train,y_train)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:23:39.498961Z","iopub.execute_input":"2024-09-26T17:23:39.499253Z","iopub.status.idle":"2024-09-26T17:26:31.698239Z","shell.execute_reply.started":"2024-09-26T17:23:39.499221Z","shell.execute_reply":"2024-09-26T17:26:31.697356Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Score on the training split, for comparison with the test accuracy below.\nrf.score(X_train,y_train)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:31.699504Z","iopub.execute_input":"2024-09-26T17:26:31.699837Z","iopub.status.idle":"2024-09-26T17:26:41.72687Z","shell.execute_reply.started":"2024-09-26T17:26:31.699803Z","shell.execute_reply":"2024-09-26T17:26:41.725954Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"y_pred_rf=rf.predict(X_test)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:41.728311Z","iopub.execute_input":"2024-09-26T17:26:41.728762Z","iopub.status.idle":"2024-09-26T17:26:44.303369Z","shell.execute_reply.started":"2024-09-26T17:26:41.728714Z","shell.execute_reply":"2024-09-26T17:26:44.302499Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Same labelled format as the logistic-regression accuracy cell.\nprint(f\"accuracy : {accuracy_score(y_test,y_pred_rf)}\")","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:44.304701Z","iopub.execute_input":"2024-09-26T17:26:44.305381Z","iopub.status.idle":"2024-09-26T17:26:44.313163Z","shell.execute_reply.started":"2024-09-26T17:26:44.305329Z","shell.execute_reply":"2024-09-26T17:26:44.312198Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"ConfusionMatrixDisplay.from_predictions(y_test, y_pred_rf)\nprint(confusion_matrix(y_test, 
y_pred_rf))","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:44.315645Z","iopub.execute_input":"2024-09-26T17:26:44.315925Z","iopub.status.idle":"2024-09-26T17:26:44.609206Z","shell.execute_reply.started":"2024-09-26T17:26:44.315894Z","shell.execute_reply":"2024-09-26T17:26:44.608308Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(classification_report(y_test, y_pred_rf))","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:44.610563Z","iopub.execute_input":"2024-09-26T17:26:44.610967Z","iopub.status.idle":"2024-09-26T17:26:44.652332Z","shell.execute_reply.started":"2024-09-26T17:26:44.610921Z","shell.execute_reply":"2024-09-26T17:26:44.651374Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Unsupervised models","metadata":{}},{"cell_type":"markdown","source":"## Kmeans","metadata":{}},{"cell_type":"code","source":"# Cluster on two distinct columns. The previous selection listed 'PhysicalHealth' twice,\n# which puts every point on the diagonal and makes the clustering one-dimensional.\n# NOTE(review): 'MentalHealth' is presumed to be the intended second feature - confirm it exists in df.\nX=df[['PhysicalHealth','MentalHealth']]","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:28:22.933291Z","iopub.execute_input":"2024-09-26T17:28:22.934216Z","iopub.status.idle":"2024-09-26T17:28:22.950245Z","shell.execute_reply.started":"2024-09-26T17:28:22.934156Z","shell.execute_reply":"2024-09-26T17:28:22.949159Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"# Use a dedicated scaler so the one fitted for the classifiers is not overwritten.\nkm_scaler=MinMaxScaler()\nX=km_scaler.fit_transform(X)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:44.662003Z","iopub.execute_input":"2024-09-26T17:26:44.662339Z","iopub.status.idle":"2024-09-26T17:26:44.676343Z","shell.execute_reply.started":"2024-09-26T17:26:44.662294Z","shell.execute_reply":"2024-09-26T17:26:44.675527Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Elbow method: record the inertia of a KMeans fit for k = 1..9.\n# The loop model has its own name (km_k) so it cannot be mistaken for the final model km.\ncluster=[]\nfor i in range(1,10):\n    km_k=KMeans(n_clusters=i,n_init='auto',init='k-means++',random_state=42)\n    km_k.fit(X)\n    cluster.append(km_k.inertia_)\ncluster","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:44.677428Z","iopub.execute_input":"2024-09-26T17:26:44.67781Z","iopub.status.idle":"2024-09-26T17:26:47.048239Z","shell.execute_reply.started":"2024-09-26T17:26:44.677766Z","shell.execute_reply":"2024-09-26T17:26:47.047327Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"plt.plot(range(1,10),cluster,marker='.')\nplt.xlabel('number of clusters (k)')\nplt.ylabel('inertia')\nplt.title('KMeans elbow curve')\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:47.049886Z","iopub.execute_input":"2024-09-26T17:26:47.050277Z","iopub.status.idle":"2024-09-26T17:26:47.236662Z","shell.execute_reply.started":"2024-09-26T17:26:47.050232Z","shell.execute_reply":"2024-09-26T17:26:47.235722Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"km=KMeans(n_clusters=2,n_init='auto' ,init='k-means++', random_state= 42)\nkm.fit(X)","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:47.237855Z","iopub.execute_input":"2024-09-26T17:26:47.238166Z","iopub.status.idle":"2024-09-26T17:26:47.461849Z","shell.execute_reply.started":"2024-09-26T17:26:47.238133Z","shell.execute_reply":"2024-09-26T17:26:47.460949Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"# km was fitted on X in the previous cell; reuse its labels instead of refitting with fit_predict.\npred=km.labels_","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:47.463029Z","iopub.execute_input":"2024-09-26T17:26:47.463349Z","iopub.status.idle":"2024-09-26T17:26:47.635293Z","shell.execute_reply.started":"2024-09-26T17:26:47.46331Z","shell.execute_reply":"2024-09-26T17:26:47.633946Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# One scatter per cluster label, then the two centroids on top.\nplt.scatter(X[pred == 0, 0], X[pred == 0, 1], s = 100, c = 'blue', label = 'Cluster 1')\nplt.scatter(X[pred == 1, 0], X[pred == 1, 1], s = 100, c = 'green', label = 'Cluster 2')\nplt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroid')\nplt.xlabel('PhysicalHealth (scaled)')\nplt.ylabel('MentalHealth (scaled)')\nplt.legend()\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2024-09-26T17:26:47.637088Z","iopub.execute_input":"2024-09-26T17:26:47.637969Z","iopub.status.idle":"2024-09-26T17:26:53.345047Z","shell.execute_reply.started":"2024-09-26T17:26:47.637905Z","shell.execute_reply":"2024-09-26T17:26:53.344052Z"},"trusted":true},"execution_count":null,"outputs":[]}]}