-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlogistic_regression.py
104 lines (80 loc) · 4.42 KB
/
logistic_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Logistic regression
# Logistic regression aims to solve classification problems.
# It does this by predicting categorical outcomes, unlike linear regression that predicts a continuous outcome.
# Standard operational package imports.
import numpy as np
import pandas as pd
# Important imports for preprocessing, modeling, and evaluation.
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
# Visualization package imports.
import matplotlib.pyplot as plt
import seaborn as sns
# Load the airline survey data.
# A raw string keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences (deprecated, and a SyntaxWarning on 3.12+).
df_original = pd.read_csv(r"F:\Learning_Work\Vs_Work\DM_Project\Invistico_Airline.csv")
# Show the first 10 rows.
print(df_original.head(n=10))
print('\n ||||--------------------------------------------------------------------------|||| \n')
print('Type of data in the dataset...\n')
# Data type of every column.
print(df_original.dtypes)
# Count the rows in each 'satisfaction' category (missing values included).
print('\n ||||--------------------------------------------------------------------------|||| \n')
print('Check the number of satisfied customers in the dataset...\n')
print(df_original['satisfaction'].value_counts(dropna = False))
# Count the missing values in each column.
print('\n ||||--------------------------------------------------------------------------|||| \n')
print('Check the number of missing values in each column...\n')
print(df_original.isnull().sum())
# Drop every row that contains a missing value and renumber the index from 0.
print('\n ||||--------------------------------------------------------------------------|||| \n')
print('Drop the rows with missing values...\n')
df_subset = df_original.dropna(axis=0).reset_index(drop = True)
print(df_subset)
# Prepare the data
# astype returns a new DataFrame, so the result has to be assigned back;
# otherwise the conversion to float is silently discarded.
df_subset = df_subset.astype({"Inflight entertainment": float})
# Convert the categorical column 'satisfaction' into a numeric 0/1 column.
# drop='first' removes the first category, leaving one indicator column
# (assuming 'satisfaction' holds exactly two categories - TODO confirm).
# ravel() flattens the (n, 1) encoder output so it is stored as a plain 1-D column.
satisfaction_encoded = OneHotEncoder(drop='first').fit_transform(df_subset[['satisfaction']]).toarray()
df_subset['satisfaction'] = satisfaction_encoded.ravel()
# Show the data after preparation.
print('\n ||||--------------------------------------------------------------------------|||| \n')
print('Printing the whole dataset after preparation...\n')
print(df_subset.head(10))
# Create the training and testing data.
# 70% of the rows go into the training set and the remaining 30% into the testing set;
# random_state fixes the shuffle so the split is reproducible.
X = df_subset[["Inflight entertainment"]]
y = df_subset["satisfaction"]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)
# Model building
# Fit a LogisticRegression model on the training split.
clf = LogisticRegression().fit(X_train,y_train)
# Show the fitted parameter estimates. In a script a bare `clf.coef_`
# expression is evaluated and thrown away, so print the values explicitly.
print('Coefficient:', clf.coef_)
print('Intercept:', clf.intercept_)
# Create a plot of the model.
# Plots satisfaction against the inflight entertainment rating with a fitted
# logistic curve (ci=None turns off the confidence band).
sns.regplot(x="Inflight entertainment", y="satisfaction", data=df_subset, logistic=True, ci=None)
plt.show()
# Results and evaluation
# Save the predicted class labels for the test features.
print('\n ||||--------------------------------------------------------------------------|||| \n')
print('Printing the pridiction of test dataset...\n')
y_pred = clf.predict(X_test)
print(y_pred)
# Use predict_proba on X_test to output a probability for each class.
print('\n ||||--------------------------------------------------------------------------|||| \n')
print('Printing the probability of the test dataset...\n')
print(clf.predict_proba(X_test))
# Result Analysis
# Compute accuracy once so the metric line and the written summary below
# always report the same number.
accuracy = metrics.accuracy_score(y_test, y_pred)
print('\n ||||--------------------------------------------------------------------------|||| \n')
print('Result Analysis...\n')
print('Printing the overall accuracy of the algorithm...\n')
print("Accuracy:", "%.6f" % accuracy)
print("Precision:", "%.6f" % metrics.precision_score(y_test, y_pred))
print("Recall:", "%.6f" % metrics.recall_score(y_test, y_pred))
print("F1 Score:", "%.6f" % metrics.f1_score(y_test, y_pred))
print('\n ||||--------------------------------------------------------------------------|||| \n')
# Report the measured accuracy as a percentage instead of a hard-coded figure.
accuracy_percent = "%.1f" % (accuracy * 100)
print('Logistic regression accurately predicted satisfaction ""%s"" percent of the time.\n' % accuracy_percent)
print('Customers who rated in-flight entertainment highly were more likely to be satisfied.Improving in-flight entertainment should lead to better customer satisfaction.\n')
# NOTE(review): the "improvement" claim below is not checked against the data here - confirm.
print('The model is %s percent accurate. This is an improvement over the datasets customer satisfaction rate.\n' % accuracy_percent)