# -*- coding: utf-8 -*-
"""Single Perceptron: Comparison between SGD and ULR
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1s0f349dI97iC0rbpAQhuKBmbQhGTJQGq
"""
# Dataset source: "Titanic dataset"
# The titanic and titanic2 data frames describe the survival status of individual
# passengers on the Titanic. The titanic data frame does not contain information
# from the crew, but it does contain actual ages of half of the passengers. The
# principal source for data about Titanic passengers is the Encyclopedia
# Titanica. The datasets used here were begun by a variety of researchers. One of
# the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy,
# Patrick Stephens Ltd, which includes a passenger list created by many
# researchers and edited by Michael A. Findlay.
#
# References:
# Princeton University COS 495, Instructor: Yingyu Liang
# (for the difference between ULR and SGD)
#
# Import dependencies: numpy and pandas for data handling, train_test_split for
# the train/test split, and io for reading the uploaded data file.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import io
# Function to split the dataset into training and test sets (80/20).
# A fixed integer is passed as random_state so repeated calls reproduce the
# same split.
def splitData(dataset):
    X_train, X_test = train_test_split(dataset, test_size=0.2, random_state=3)
    return X_train, X_test
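# Quick sanity check (an illustrative sketch, not part of the original script):
# with test_size=0.2, a toy array of 10 rows should split into 8 training rows
# and 2 test rows.
_toy = np.arange(20).reshape(10, 2)
_toy_train, _toy_test = splitData(_toy)
assert _toy_train.shape == (8, 2) and _toy_test.shape == (2, 2)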
# Function to pass through all training examples once.
# Prediction uses a step function: a positive weighted sum predicts 1, a zero
# or negative sum predicts 0.
# The weight vector and bias are updated at every training example.
# learning_rate = 1 gives the ULR update; a rate below 1 gives SGD.
def train_one_epoch(dataset, weights, bias, labels, learning_rate):
    for i in range(dataset.shape[0]):
        # All columns except the last are features; the last column is the label.
        summation = np.dot(dataset[i, :-1], weights) + bias
        predict = 1 if summation > 0 else 0
        # The update term is zero when the prediction is correct, so only
        # misclassified examples move the weights and bias.
        weights += learning_rate * (labels[i] - predict) * dataset[i, :-1]
        bias += learning_rate * (labels[i] - predict)
    return weights, bias
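# Worked example (a sketch with made-up numbers): one pass over a single
# example x = [1, 2] with label 1, starting from zero weights and bias -0.5.
# The weighted sum is 0*1 + 0*2 - 0.5 = -0.5, so the prediction is 0, and the
# ULR update (learning_rate = 1) adds (1 - 0) * x to the weights and 1 to the
# bias, giving weights [1., 2.] and bias 0.5.
_x = np.array([[1.0, 2.0, 1.0]])  # last column is the label
_w, _b = train_one_epoch(_x, np.zeros(2), -0.5, np.array([1.0]), 1)
assert np.allclose(_w, [1.0, 2.0]) and _b == 0.5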
# Initialize the weights and bias, then train the perceptron by passing
# through all training examples `epochs` times. The learning rate decides
# which update rule (ULR or SGD) is used.
def train_perceptron(dataset, epochs, learning_rate):
    weights = np.zeros(dataset.shape[1] - 1)
    bias = -0.5
    labels = dataset[:, -1]
    for _ in range(epochs):
        weights, bias = train_one_epoch(dataset, weights, bias, labels, learning_rate)
    return weights, bias, labels
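# Sketch (illustrative toy data, not from the original script): train on four
# linearly separable points; the perceptron should fit them within a few epochs.
_toy_set = np.array([[0.0, 0.0, 0.0],
                     [0.0, 1.0, 0.0],
                     [1.0, 0.0, 1.0],
                     [1.0, 1.0, 1.0]])  # label is 1 iff the first feature is 1
_tw, _tb, _tl = train_perceptron(_toy_set, epochs=10, learning_rate=1)
# After training, all four toy points are classified correctly.
assert all((np.dot(_tw, r[:-1]) + _tb > 0) == bool(r[-1]) for r in _toy_set)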
# Calculate accuracy (% of test examples classified correctly).
def accuracy(train_input, test_input, epochs, learning_rate):
    weights, bias, _ = train_perceptron(train_input, epochs, learning_rate)
    labels = test_input[:, -1]
    correct_predictions = 0
    for i in range(test_input.shape[0]):
        mul = np.dot(weights, test_input[i, :-1]) + bias
        # Match the training step function: a sum of exactly zero predicts 0,
        # which is correct when the label is 0 (the original `mul < 0` test
        # silently dropped that case).
        if (mul > 0 and labels[i] == 1) or (mul <= 0 and labels[i] == 0):
            correct_predictions += 1
    return (correct_predictions * 100) / test_input.shape[0]
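# Sketch: on the separable toy set above, training and testing on the same four
# points should give 100% accuracy once the perceptron has converged.
assert accuracy(_toy_set, _toy_set, 10, 1) == 100.0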
# Calculate precision (% of positive predictions that are true positives).
def precision(train_input, test_input, epochs, learning_rate):
    weights, bias, _ = train_perceptron(train_input, epochs, learning_rate)
    labels = test_input[:, -1]
    tp = 0
    fp = 0
    for i in range(test_input.shape[0]):
        mul = np.dot(weights, test_input[i, :-1]) + bias
        if mul > 0:
            if labels[i] == 1:
                tp += 1
            else:
                fp += 1
    # Guard against division by zero when the model makes no positive predictions.
    if tp + fp == 0:
        return 0.0
    return (tp * 100) / (tp + fp)
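# Sketch: on the same toy set, every positive prediction is a true positive,
# so precision should also come out at 100%.
assert precision(_toy_set, _toy_set, 10, 1) == 100.0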
# Upload the Titanic CSV (expected as data.csv) from the local machine in Colab.
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))
data = pd.read_csv(io.BytesIO(uploaded['data.csv']))
# Turn whitespace-only cells into NaN (the raw string avoids an invalid escape
# sequence warning), then drop any row with missing values.
data = data.replace(to_replace=r'^\s+$', value=np.nan, regex=True)
data = data.dropna(how='any')
titanic = data.to_numpy()
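# Illustrative sketch (a hypothetical mini-frame, not the real dataset): the
# regex above maps whitespace-only cells to NaN so dropna can remove the row.
_demo = pd.DataFrame({'age': ['22', '   ', '35'], 'fare': [7.25, 8.05, 53.1]})
_demo = _demo.replace(to_replace=r'^\s+$', value=np.nan, regex=True).dropna(how='any')
assert len(_demo) == 2  # the whitespace-only row is gone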
X_train, X_test = splitData(titanic)
# Evaluate on the held-out test set; in a plain script the results must be
# printed explicitly (a bare expression only displays in a notebook cell).
# The commented-out calls evaluate on the training set instead.
# print(accuracy(X_train, X_train, 500, 0.1))
print('SGD accuracy (lr=0.1):', accuracy(X_train, X_test, 500, 0.1))
# print(precision(X_train, X_train, 500, 1))
print('ULR precision (lr=1):', precision(X_train, X_test, 500, 1))
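# Optional extension (a sketch, assuming the same split and epoch budget): put
# the two update rules side by side on both metrics, which is the comparison
# the script's title promises.
for _lr, _name in [(1.0, 'ULR'), (0.1, 'SGD')]:
    print('{}: accuracy = {:.1f}%, precision = {:.1f}%'.format(
        _name,
        accuracy(X_train, X_test, 500, _lr),
        precision(X_train, X_test, 500, _lr)))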