# -*- coding: utf-8 -*-
"""Single Perceptron: Comparison between SGD and ULR
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1s0f349dI97iC0rbpAQhuKBmbQhGTJQGq
"""
# Dataset source: "Titanic dataset"
# The titanic and titanic2 data frames describe the survival status of individual
# passengers on the Titanic. The titanic data frame does not contain information
# from the crew, but it does contain actual ages of half of the passengers. The
# principal source for data about Titanic passengers is the Encyclopedia
# Titanica. The datasets used here were begun by a variety of researchers. One of
# the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy,
# Patrick Stephens Ltd, which includes a passenger list created by many
# researchers and edited by Michael A. Findlay.
#
# References:
# Princeton University COS 495, Instructor: Yingyu Liang
# (for the difference between ULR and SGD)
#
# Import dependencies: numpy and pandas for data handling, train_test_split for
# the train/test split, and io for reading the uploaded data file.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import io
# Function to split the dataset into training and test sets (80/20).
# A fixed integer is passed as random_state so repeated calls reproduce the
# same split.
def splitData(dataset):
    X_train, X_test = train_test_split(dataset, test_size=0.2, random_state=3)
    return X_train, X_test
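# Quick sanity check (an illustrative sketch, not part of the original script):
# with test_size=0.2, a toy array of 10 rows should split into 8 training rows
# and 2 test rows.
_toy = np.arange(20).reshape(10, 2)
_toy_train, _toy_test = splitData(_toy)
assert _toy_train.shape == (8, 2) and _toy_test.shape == (2, 2)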
# Function to pass through all training examples once.
# Prediction uses a step function: a positive weighted sum predicts 1, a zero
# or negative sum predicts 0.
# The weight vector and bias are updated at every training example.
# learning_rate = 1 gives the ULR update; a rate below 1 gives SGD.
def train_one_epoch(dataset, weights, bias, labels, learning_rate):
    for i in range(dataset.shape[0]):
        # All columns except the last are features; the last column is the label.
        summation = np.dot(dataset[i, :-1], weights) + bias
        predict = 1 if summation > 0 else 0
        # The update term is zero when the prediction is correct, so only
        # misclassified examples move the weights and bias.
        weights += learning_rate * (labels[i] - predict) * dataset[i, :-1]
        bias += learning_rate * (labels[i] - predict)
    return weights, bias
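# Worked example (a sketch with made-up numbers): one pass over a single
# example x = [1, 2] with label 1, starting from zero weights and bias -0.5.
# The weighted sum is 0*1 + 0*2 - 0.5 = -0.5, so the prediction is 0, and the
# ULR update (learning_rate = 1) adds (1 - 0) * x to the weights and 1 to the
# bias, giving weights [1., 2.] and bias 0.5.
_x = np.array([[1.0, 2.0, 1.0]])  # last column is the label
_w, _b = train_one_epoch(_x, np.zeros(2), -0.5, np.array([1.0]), 1)
assert np.allclose(_w, [1.0, 2.0]) and _b == 0.5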
# Initialize the weights and bias, then train the perceptron by passing
# through all training examples `epochs` times. The learning rate decides
# which update rule (ULR or SGD) is used.
def train_perceptron(dataset, epochs, learning_rate):
    weights = np.zeros(dataset.shape[1] - 1)
    bias = -0.5
    labels = dataset[:, -1]
    for _ in range(epochs):
        weights, bias = train_one_epoch(dataset, weights, bias, labels, learning_rate)
    return weights, bias, labels
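# Sketch (illustrative toy data, not from the original script): train on four
# linearly separable points; the perceptron should fit them within a few epochs.
_toy_set = np.array([[0.0, 0.0, 0.0],
                     [0.0, 1.0, 0.0],
                     [1.0, 0.0, 1.0],
                     [1.0, 1.0, 1.0]])  # label is 1 iff the first feature is 1
_tw, _tb, _tl = train_perceptron(_toy_set, epochs=10, learning_rate=1)
# After training, all four toy points are classified correctly.
assert all((np.dot(_tw, r[:-1]) + _tb > 0) == bool(r[-1]) for r in _toy_set)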
# Calculate accuracy (% of test examples classified correctly).
def accuracy(train_input, test_input, epochs, learning_rate):
    weights, bias, _ = train_perceptron(train_input, epochs, learning_rate)
    labels = test_input[:, -1]
    correct_predictions = 0
    for i in range(test_input.shape[0]):
        mul = np.dot(weights, test_input[i, :-1]) + bias
        # Match the training step function: a sum of exactly zero predicts 0,
        # which is correct when the label is 0 (the original `mul < 0` test
        # silently dropped that case).
        if (mul > 0 and labels[i] == 1) or (mul <= 0 and labels[i] == 0):
            correct_predictions += 1
    return (correct_predictions * 100) / test_input.shape[0]
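# Sketch: on the separable toy set above, training and testing on the same four
# points should give 100% accuracy once the perceptron has converged.
assert accuracy(_toy_set, _toy_set, 10, 1) == 100.0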
# Calculate precision (% of positive predictions that are true positives).
def precision(train_input, test_input, epochs, learning_rate):
    weights, bias, _ = train_perceptron(train_input, epochs, learning_rate)
    labels = test_input[:, -1]
    tp = 0
    fp = 0
    for i in range(test_input.shape[0]):
        mul = np.dot(weights, test_input[i, :-1]) + bias
        if mul > 0:
            if labels[i] == 1:
                tp += 1
            else:
                fp += 1
    # Guard against division by zero when the model makes no positive predictions.
    if tp + fp == 0:
        return 0.0
    return (tp * 100) / (tp + fp)
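# Sketch: on the same toy set, every positive prediction is a true positive,
# so precision should also come out at 100%.
assert precision(_toy_set, _toy_set, 10, 1) == 100.0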
# Upload the Titanic CSV (expected as data.csv) from the local machine in Colab.
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))
data = pd.read_csv(io.BytesIO(uploaded['data.csv']))
# Turn whitespace-only cells into NaN (the raw string avoids an invalid escape
# sequence warning), then drop any row with missing values.
data = data.replace(to_replace=r'^\s+$', value=np.nan, regex=True)
data = data.dropna(how='any')
titanic = data.to_numpy()
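# Illustrative sketch (a hypothetical mini-frame, not the real dataset): the
# regex above maps whitespace-only cells to NaN so dropna can remove the row.
_demo = pd.DataFrame({'age': ['22', '   ', '35'], 'fare': [7.25, 8.05, 53.1]})
_demo = _demo.replace(to_replace=r'^\s+$', value=np.nan, regex=True).dropna(how='any')
assert len(_demo) == 2  # the whitespace-only row is gone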
X_train, X_test = splitData(titanic)
# Evaluate on the held-out test set; in a plain script the results must be
# printed explicitly (a bare expression only displays in a notebook cell).
# The commented-out calls evaluate on the training set instead.
# print(accuracy(X_train, X_train, 500, 0.1))
print('SGD accuracy (lr=0.1):', accuracy(X_train, X_test, 500, 0.1))
# print(precision(X_train, X_train, 500, 1))
print('ULR precision (lr=1):', precision(X_train, X_test, 500, 1))
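# Optional extension (a sketch, assuming the same split and epoch budget): put
# the two update rules side by side on both metrics, which is the comparison
# the script's title promises.
for _lr, _name in [(1.0, 'ULR'), (0.1, 'SGD')]:
    print('{}: accuracy = {:.1f}%, precision = {:.1f}%'.format(
        _name,
        accuracy(X_train, X_test, 500, _lr),
        precision(X_train, X_test, 500, _lr)))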