forked from wolfecameron/MachineLearning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLinearRegression.py
119 lines (101 loc) · 4.08 KB
/
LinearRegression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# this file implements a simple linear regression algorithm
# utilizes gradient descent and two features to yield the optimal linear
# function for a given randomly selected set of data
# uses matplotlib plotting library to visualize data set and function that is fitted to it
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
import numpy as np
from random import random
# Learning rate: scales every gradient-descent step computed by
# getThetaZeroCost / getThetaOneCost.
# Convergence threshold: mainLoop keeps iterating while either parameter
# update is larger in magnitude than this value.
ALPHA = .01
CONVERGENCE_THRESHOLD = .00001
# DATA_SET_SIZE controls how many points getDataSet generates.
# MAXIMUM_RANDOM_VALUE is meant as the maximum value in the data set.
# NOTE(review): MAXIMUM_RANDOM_VALUE is not referenced by any function
# visible in this file - confirm whether it is still needed.
DATA_SET_SIZE = 20
MAXIMUM_RANDOM_VALUE = 10
# Intended number of iterations for the main regression loop.
ITERATIONS = 100000
# The data set the line is fitted to: x_values[i] pairs with y_values[i].
# Both lists start empty and are filled by getDataSet.
x_values = []
y_values = []
def getDataSet():
    """Populate the module-level ``x_values`` and ``y_values`` lists.

    Generates exactly ``DATA_SET_SIZE`` points with x = 1, 2, ...,
    DATA_SET_SIZE.  Each y is ``x + x*(random() - .5)``, i.e. x scaled by a
    random factor in [0.5, 1.5), so the points scatter around the line y = x.

    Any previous contents of the two lists are discarded, so calling this
    function more than once does not accumulate duplicate points.

    Returns:
        None. The lists are modified in place.
    """
    # Clear in place (rather than rebinding) so every function that reads
    # the module-level lists keeps seeing the same list objects.
    del x_values[:]
    del y_values[:]
    # The upper bound is inclusive so that DATA_SET_SIZE points are produced.
    for x in range(1, DATA_SET_SIZE + 1):
        x_values.append(x)
        y_values.append(x + x * (random() - .5))
def obtainOutput(theta0, theta1, x):
    """Evaluate the linear hypothesis ``h(x) = theta0 + theta1*x``.

    Args:
        theta0: intercept of the line.
        theta1: slope of the line.
        x: input value at which the hypothesis is evaluated.

    Returns:
        The predicted output for ``x``.
    """
    slope_term = theta1 * x
    prediction = theta0 + slope_term
    return prediction
def getThetaZeroCost(theta0, theta1):
    """Return the gradient-descent step for ``theta0``.

    Despite the name, this is not the cost itself: it is the amount that is
    subtracted from ``theta0`` on each iteration, namely
    ``ALPHA * sum(h(x_i) - y_i) / len(x_values)`` over the module-level
    data set.

    Args:
        theta0: current intercept.
        theta1: current slope.

    Returns:
        The update value to subtract from ``theta0``.

    Raises:
        ZeroDivisionError: if ``x_values`` is empty.
    """
    # Sum of (predicted - actual) over every point; x_values and y_values
    # are filled pairwise, so zip walks them in lockstep.
    residual_total = sum(
        obtainOutput(theta0, theta1, x) - y
        for x, y in zip(x_values, y_values)
    )
    # Scale by the learning rate and average over the data set.
    return (ALPHA * residual_total) / float(len(x_values))
def getThetaOneCost(theta0, theta1):
    """Return the gradient-descent step for ``theta1``.

    Despite the name, this is not the cost itself: it is the amount that is
    subtracted from ``theta1`` on each iteration, namely
    ``ALPHA * sum((h(x_i) - y_i) * x_i) / len(x_values)`` over the
    module-level data set.

    Args:
        theta0: current intercept.
        theta1: current slope.

    Returns:
        The update value to subtract from ``theta1``.

    Raises:
        ZeroDivisionError: if ``x_values`` is empty.
    """
    # Each (predicted - actual) difference is weighted by its own x value;
    # x_values and y_values are filled pairwise, so zip walks them together.
    weighted_residual_total = sum(
        (obtainOutput(theta0, theta1, x) - y) * x
        for x, y in zip(x_values, y_values)
    )
    # Scale by the learning rate and average over the data set.
    return (ALPHA * weighted_residual_total) / float(len(x_values))
def mainLoop():
    """Fit ``h(x) = theta0 + theta1*x`` to the module-level data set.

    Runs batch gradient descent starting from ``theta0 = theta1 = 0``.  On
    every iteration both update values are computed from the *current*
    parameters before either parameter is changed (simultaneous update).

    Iteration stops as soon as both updates are no larger in magnitude than
    ``CONVERGENCE_THRESHOLD``, or after ``ITERATIONS`` iterations, whichever
    comes first; the cap guarantees termination if the descent does not
    converge.

    Returns:
        A ``(theta0, theta1)`` tuple holding the fitted intercept and slope.
    """
    theta0 = 0
    theta1 = 0
    for _ in range(ITERATIONS):
        # Both steps must be calculated before either parameter is updated.
        theta0_update = getThetaZeroCost(theta0, theta1)
        theta1_update = getThetaOneCost(theta0, theta1)
        theta0 -= theta0_update
        theta1 -= theta1_update
        # Converged once neither parameter moved by more than the threshold.
        if (np.fabs(theta0_update) <= CONVERGENCE_THRESHOLD
                and np.fabs(theta1_update) <= CONVERGENCE_THRESHOLD):
            break
    return (theta0, theta1)
def main():
    """Interactive driver: show the data, fit a line, show the fit.

    Generates the data set, displays it as a scatter plot, waits for the
    user, runs the regression, then displays the same scatter plot with the
    fitted line drawn over it and waits for the user again.
    """

    def _scatter_data(title):
        # Shared setup for both figures: heading, axis labels, raw points.
        plt.title(title)
        plt.xlabel("Input Values")
        plt.ylabel("Output Values")
        plt.scatter(x_values, y_values)

    # Build the random data set and let the user inspect it first.
    getDataSet()
    _scatter_data("Data Set")
    plt.show()
    input("Enter anything to run linear regression and obtain result.")

    # Fit the line and evaluate it at every input of the data set.
    intercept, slope = mainLoop()
    fitted_y = [obtainOutput(intercept, slope, point) for point in x_values]

    # Same scatter plot again, now with the fitted line on top.
    _scatter_data("Linear Regression Result")
    plt.plot(x_values, fitted_y)
    plt.show()
    input("Enter anything to end program.")
# Launch the interactive demo only when this file is executed as a script;
# importing it as a module does not trigger main().
if __name__ == "__main__":
    # run main function
    main()