forked from anderf2706/ML-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknn_classifer.py
196 lines (136 loc) · 7.38 KB
/
knn_classifer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
######### Imports ############
import pickle
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data.sampler import *
from torch.utils.data import Dataset
from torchvision import transforms, datasets
import torchvision
import numpy as np
import torchvision.models as models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
import matplotlib.pyplot as plt
from pyTorch.utils import ROOT_DIR
####################################
# Root folder of the chest X-ray image set; dataset() appends the split folder
# names ('train', 'train_normalized', 'test', 'val') directly to this string.
# NOTE(review): the separators are hard-coded backslashes, so this looks Windows-only - confirm before running elsewhere.
dataset_root = ROOT_DIR + '\\chest_xray\\' # folder with images
new_size = (224,224) # size every image is resized to before it is flattened
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # torch device: the GPU when CUDA is available, otherwise the CPU
labels = ['NORMAL', 'PNEUMONIA'] # labels for the images
def _load_split(folder, transform):
    """Load every image below ``dataset_root + folder`` as one feature matrix.

    Args:
        folder: Name of the split folder, appended to ``dataset_root``.
        transform: torchvision transform applied to each image.

    Returns:
        ``(features, targets)`` as NumPy arrays: one flattened row per image
        and the class index ImageFolder assigned to it.
    """
    split = torchvision.datasets.ImageFolder(root=dataset_root + folder, transform=transform)
    # batch_size=len(split) makes the loader hand back the whole split as a
    # single (shuffled) batch, so one next() call reads everything.
    loader = torch.utils.data.DataLoader(split, batch_size=len(split), shuffle=True)
    images, targets = next(iter(loader))
    # Flatten everything except the batch dimension. Unlike squeeze(), this
    # keeps a 2-D result even when the split holds a single image.
    features = images.reshape(images.shape[0], -1)
    # The loader yields CPU tensors, so they convert to NumPy directly; there
    # is no need to copy them to the GPU and back first.
    return features.numpy(), targets.numpy()


def dataset(Normalized):
    """Read the train, test and validation image folders into NumPy arrays.

    Every image is resized to ``new_size``, converted to grey-scale and
    flattened into one feature row, which is the layout scikit-learn expects.

    Args:
        Normalized: When truthy the training images are read from the
            'train_normalized' folder, otherwise from 'train'.

    Returns:
        ``(x_train, y_train, x_test, y_test, x_val, y_val)``: three feature
        arrays and their three label arrays.
    """
    transform = transforms.Compose([
        transforms.Resize(new_size),  # resize the images
        transforms.Grayscale(),       # the x-rays are black/white
        transforms.ToTensor(),        # image -> tensor
    ])
    train_folder = 'train_normalized' if Normalized else 'train'
    x_train, y_train = _load_split(train_folder, transform)
    x_test, y_test = _load_split('test', transform)
    x_val, y_val = _load_split('val', transform)
    return x_train, y_train, x_test, y_test, x_val, y_val  # 3 feature arrays and 3 label arrays
def KNN_model(x_train, y_train, x_test, neighbours, device, log_boolean, log = 100):
    """Predict a label for every row of ``x_test`` with brute-force k-nearest-neighbours.

    For each test row the Euclidean distance to every training row is
    computed, the ``neighbours`` closest training rows are selected and the
    most frequent label among them (``torch.mode``) becomes the prediction.

    Args:
        x_train: Training features, one row per sample (tensor or NumPy array).
        y_train: Training labels, one entry per training row.
        x_test: Test features with the same number of columns as ``x_train``.
        neighbours: Number of nearest training rows that vote.
        device: Torch device the computation runs on.
        log_boolean: When true, print the number of test rows and a progress
            line every ``log`` predictions.
        log: Progress interval, only used when ``log_boolean`` is true.

    Returns:
        A float tensor on ``device`` holding one predicted label per test row.
    """
    # Accept NumPy arrays as well as tensors and make sure all operands live
    # on the same device; as_tensor avoids a copy when nothing has to change.
    x_train = torch.as_tensor(x_train, device=device)
    y_train = torch.as_tensor(y_train, device=device)
    x_test = torch.as_tensor(x_test, device=device)

    amount_of_img = x_test.shape[0]
    if log_boolean:
        print(amount_of_img)
    y_test = torch.zeros(amount_of_img, device=device, dtype=torch.float)
    for test_idx, test_img in enumerate(x_test):
        # Distance from this test row to every training row.
        distances_for_test = torch.norm(x_train - test_img, dim=1)
        # largest=False -> indices of the smallest distances.
        indexes = torch.topk(distances_for_test, neighbours, largest=False)[1]
        classes = y_train[indexes]
        y_test[test_idx] = int(torch.mode(classes)[0])
        if log_boolean and test_idx % log == 0:
            print("predicting on index = %d" % test_idx)
    return y_test
def train():
    """Evaluate the torch k-NN implementation on the test split for several k.

    Loads the (unnormalized) data with ``dataset(False)``, runs ``KNN_model``
    once per candidate k and prints the number of correct predictions and the
    accuracy for each of them.
    """
    x_train, y_train, x_test, y_test, _, _ = dataset(False)
    print("train and test sizes are: %s, %s" % (str(x_train.shape), str(x_test.shape)))

    # dataset() returns NumPy arrays, while KNN_model and the comparison below
    # work on torch tensors, so convert once and place everything on `device`.
    x_train = torch.as_tensor(x_train, device=device)
    y_train = torch.as_tensor(y_train, device=device)
    x_test = torch.as_tensor(x_test, device=device)
    y_test = torch.as_tensor(y_test, device=device)

    train_count = x_train.shape[0]
    test_count = y_test.shape[0]
    k_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 25, 37, 49, int(np.floor(np.sqrt(train_count))), 200]
    for k in k_values:
        if not 1 <= k <= train_count:
            # topk cannot select more neighbours than there are training rows.
            continue
        pred = KNN_model(x_train, y_train, x_test, neighbours=k, device=device, log_boolean=False)
        correct = int(pred.eq(y_test.view_as(pred)).sum())
        print("K = %d, Correct: %d, Accuracy: %.2f" % (k, correct, 100. * correct / test_count))
# Load all three splits once at import time (training images from the
# unnormalized folder). plot_best_K() and the __main__ block below read these
# module-level arrays.
x_train, y_train, x_test, y_test, x_val, y_val = dataset(False)
def save_model(classifer, filename='finalized_model.sav'):
    """Pickle a fitted model to disk so it does not have to be fitted again.

    Args:
        classifer: The object to serialize.
        filename: Path of the file to write; defaults to 'finalized_model.sav'.
    """
    # The with-block closes the file even if pickling fails, so the data is
    # flushed to disk before the function returns.
    with open(filename, 'wb') as model_file:
        pickle.dump(classifer, model_file)
def load_model(filename='finalized_model.sav'):
    """Load a previously pickled model back for use.

    Args:
        filename: Path of the pickle file to read. The default is
            'finalized_model.sav', the file name save_model() writes.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling executes code stored in the file; only load model files
    # that come from a trusted source.
    with open(filename, 'rb') as model_file:
        return pickle.load(model_file)
def plot_best_K(): # A method used for showcasing the best K-value for the KNN-model.
    """Plot the mean prediction error of scikit-learn k-NN models for K = 1..19.

    Reads the module-level ``x_val``, ``y_val``, ``x_test`` and ``y_test``
    arrays. For each K a classifier is fitted on the validation split and
    scored on the test split; the error rates are then shown in a line chart.
    NOTE(review): fitting on the validation split and scoring on the test
    split is what the code does - confirm that this is the intended protocol.
    """
    k_range = range(1, 20)  # candidate K values: 1 up to and including 19
    mean_errors = []
    for k in k_range:
        print(k)
        model = KNeighborsClassifier(n_neighbors=k)  # classifier with K=k
        model.fit(x_val, y_val)
        predictions = model.predict(x_test)
        # Fraction of test samples whose predicted label is wrong.
        mean_errors.append(np.mean(predictions != y_test))

    # One point per K; a low error means a good K-value.
    line_style = dict(color='red', linestyle='dashed', marker='o',
                      markerfacecolor='blue', markersize=10)
    plt.figure(figsize=(12, 6))
    plt.plot(k_range, mean_errors, **line_style)
    plt.title('Error Rate K Value')
    plt.xlabel('K Value')
    plt.ylabel('Mean Error')
    plt.show()
if __name__ == '__main__': # main func
    # Fit a 12-nearest-neighbour classifier on the training features and
    # labels, then predict the labels of the test features.
    classifier = KNeighborsClassifier(n_neighbors=12)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Print, in this order: the accuracy, the classification report and the
    # confusion matrix of the predictions against the true test labels.
    for metric in (accuracy_score, classification_report, confusion_matrix):
        print(metric(y_test, y_pred))