#!/usr/bin/env python
'''
Machine learning for the Iris dataset
- Uses TensorFlow to create a logistic regression classifier
- Full ML pipeline with a TensorFlow backend
Imports raw data from csv, randomizes and preprocesses it, splits it into
train/validation/test sets, trains and tests the model, and saves it.
Uses TensorBoard to visualize the results.
'''
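
# Example invocations (a sketch; these flags match the argparse options defined
# at the bottom of this file, and main() expects the dataset at data/iris.csv):
#   python iris.py                       # train a new model with defaults
#   python iris.py --visual              # show a seaborn pairplot before training
#   python iris.py --learning_rate 0.1 --stddev 0.05
#   python iris.py --load                # restore a previously saved model and test it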
import sys
import argparse

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

__version__ = '0.2.1'
__author__ = 'Jacob Manning'
__email__ = '[email protected]'

def load_iris(filename):
    col_names = ['sepal_length', 'sepal_width', 'petal_length',
                 'petal_width', 'label']
    # return dataframe labeled with col_names
    return pd.read_csv(filename, names=col_names)

def train_valid_test_split(df):
    # separate the data frame by label
    setosa = df[df.label == 'Iris-setosa']
    versicolor = df[df.label == 'Iris-versicolor']
    virginica = df[df.label == 'Iris-virginica']

    # split each per-class dataframe into training, validation, and test
    s_tr, s_v, s_te = _split(setosa)
    ve_tr, ve_v, ve_te = _split(versicolor)
    vi_tr, vi_v, vi_te = _split(virginica)

    # concatenate the dataframes from each class into the three sets
    train_set = pd.concat([s_tr, ve_tr, vi_tr], ignore_index=True)
    valid_set = pd.concat([s_v, ve_v, vi_v], ignore_index=True)
    test_set = pd.concat([s_te, ve_te, vi_te], ignore_index=True)

    return (train_set, valid_set, test_set)

def _split(df):
    # seed the random num generator for reproducible train, valid, test sets
    np.random.seed(12345)

    # randomize the data frame
    randomized_df = df.iloc[np.random.permutation(len(df))]

    # calculate indices to split data: 60% train, 20% valid, 20% test
    train_idx = int(len(randomized_df) * 0.6)
    offset = int(len(randomized_df) * 0.2)

    # split randomized data into the sets
    train_set = randomized_df.iloc[:train_idx]
    valid_set = randomized_df.iloc[train_idx:train_idx+offset]
    test_set = randomized_df.iloc[train_idx+offset:]

    return (train_set, valid_set, test_set)
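
# For the standard Iris csv (50 rows per class), _split yields 30 training,
# 10 validation, and 10 test examples per class, so the concatenated sets built
# by train_valid_test_split hold 90/30/30 examples respectively.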

def label_split(df):
    # get the labels for each example
    label = df['label']

    # one-hot-encode the labels into vectors
    one_hot_label = one_hot(label)

    # get all columns but the label for each example
    x = np.array(df.iloc[:, :-1], dtype=np.float32)

    return (x, one_hot_label)

def one_hot(arr):
    # represent each class string with an integer
    class_translations = {
        'Iris-setosa': 0,
        'Iris-versicolor': 1,
        'Iris-virginica': 2
    }

    # zero 2-d array to hold the new one-hot labels
    encoded = np.zeros((len(arr), len(class_translations)))

    # set the value corresponding to each label to 1
    for i, label in enumerate(arr):
        encoded[i][class_translations[label]] = 1

    return encoded
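
# Illustrative example of the encoding above:
#   one_hot(['Iris-setosa', 'Iris-virginica'])
#   -> array([[1., 0., 0.],
#             [0., 0., 1.]])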

def initial_plots(df):
    # plot all of the features in the dataframe against each other
    sns.pairplot(df, hue='label', size=2)
    plt.show()

def variable_summaries(var):
    # helper function to create summaries for tensorboard
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

def train(X_train, y_train, X_valid, y_valid, X_test, y_test,
          learning_rate=0.05, load=False, filename='iris_model', stddev=0.1):
    # parameters for size of W, b, x, y
    n_samples, n_features = X_train.shape
    # comma here necessary to unpack tuple (?,)
    # n_classes, = y_train[0].shape
    _, n_classes = y_train.shape

    logdir = 'tensorboard/' + filename
    savedir = logdir + '/' + filename

    # placeholders for the input -> the features and labels
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, n_features], name='x')
        y_labels = tf.placeholder(tf.float32, [None, n_classes], name='y_labels')

    # variable for the weights
    with tf.name_scope('weights'):
        W = tf.Variable(tf.random_normal([n_features, n_classes], stddev=stddev),
                        name='weights')
        variable_summaries(W)

    # variable for the biases
    with tf.name_scope('biases'):
        b = tf.Variable(tf.random_normal([n_classes], stddev=stddev),
                        name='biases')
        variable_summaries(b)

    # nodes for the logits x*W + b and their softmax activations
    with tf.name_scope('softmax_Wx_plus_b'):
        logits = tf.matmul(x, W) + b
        y = tf.nn.softmax(logits)
        tf.summary.histogram('activations', y)

    with tf.name_scope('train'):
        with tf.name_scope('cross_entropy'):
            # tf's cross entropy op applies softmax internally, so feed it the
            # raw logits (not the softmax activations) and reduce the mean
            cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=y_labels,
                                                        logits=logits))
            tf.summary.scalar('cross_entropy', cross_entropy)
        with tf.name_scope('optimizer'):
            # use gradient descent optimizer with given learning rate
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        with tf.name_scope('objective'):
            # the session will run objective -> minimize cross_entropy with g.d.
            objective = optimizer.minimize(cross_entropy)

    with tf.name_scope('evaluate'):
        with tf.name_scope('correct_prediction'):
            # determine if max softmax value is equal to max label value
            correct_prediction = tf.equal(tf.argmax(y, axis=1),
                                          tf.argmax(y_labels, axis=1))
        with tf.name_scope('accuracy'):
            # calculate the model accuracy on the set
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            tf.summary.scalar('accuracy', accuracy)

    with tf.name_scope('saver'):
        # saver node to save/restore weights/biases
        saver = tf.train.Saver()

    # merge all tf.summaries for tensorboard
    merged = tf.summary.merge_all()

    # necessary node -> variable initializer
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        # initialize the global vars
        sess.run(init)

        if not load:
            # writer for tensorboard
            writer = tf.summary.FileWriter(logdir)
            writer.add_graph(sess.graph)

            # train a new model
            for i in range(1000):
                if i % 50 == 0:
                    summary, acc = sess.run([merged, accuracy],
                        feed_dict={x: X_valid, y_labels: y_valid})
                    writer.add_summary(summary, i)
                    print('Validation set accuracy at step {}: {}'.format(i, acc))
                else:
                    summary, _ = sess.run([merged, objective],
                        feed_dict={x: X_train, y_labels: y_train})
                    writer.add_summary(summary, i)

            print('\nView tensorboard with:')
            print('tensorboard --logdir=' + logdir, end='\n\n')
            writer.close()

            # save the model
            save_path = saver.save(sess, savedir)
            print('Model saved in', save_path, end='\n\n')
        else:
            # import the model
            saver = tf.train.import_meta_graph(savedir + '.meta')
            saver.restore(sess, savedir)
            print('Model loaded successfully!', end='\n\n')

        # print the model parameters
        print('Weights')
        print(sess.run(W), end='\n\n')
        print('Biases')
        print(sess.run(b), end='\n\n')

        # test the model
        print('Validation accuracy:', end=' ')
        print(sess.run(accuracy, feed_dict={x: X_valid, y_labels: y_valid}))
        print('Test accuracy:', end=' ')
        print(sess.run(accuracy, feed_dict={x: X_test, y_labels: y_test}))
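
# Note: with TF 1.x's default checkpoint format, saver.save() writes the model
# files (<filename>.meta, .index, .data-*) into the same tensorboard/<filename>/
# directory as the TensorBoard event files, which is why the --load path above
# restores from savedir + '.meta'.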

def main(load=False, visual=False, learning_rate=0.05,
         filename='iris_model', stddev=0.1):
    # load and split the data
    df = load_iris('data/iris.csv')
    train_set, valid_set, test_set = train_valid_test_split(df)

    # visualize the data
    if visual:
        initial_plots(train_set)

    # split each set into the X (features) and y (labels)
    X_train, y_train = label_split(train_set)
    X_valid, y_valid = label_split(valid_set)
    X_test, y_test = label_split(test_set)

    # delete previous tensorboard files
    if not load:
        logdir = 'tensorboard/' + filename
        if tf.gfile.Exists(logdir):
            tf.gfile.DeleteRecursively(logdir)

    # run the training/testing
    train(X_train, y_train, X_valid, y_valid, X_test, y_test, load=load,
          learning_rate=learning_rate, filename=filename, stddev=stddev)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='iris.py',
        description='Train/test iris dataset using logistic regression')
    parser.add_argument('--load', dest='load', action='store_true',
        default=False, help='load model rather than train')
    parser.add_argument('--visual', dest='visual', action='store_true',
        default=False, help='plot data and features prior to load/test')
    parser.add_argument('--learning_rate', dest='learning_rate', type=float,
        default=0.05, help='learning rate for GradientDescentOptimizer')
    parser.add_argument('--filename', dest='filename', type=str,
        default='iris_model', help='file to store/load model to/from')
    parser.add_argument('--stddev', dest='stddev', type=float,
        default=0.1, help='standard deviation for random_normal init values')

    args = parser.parse_args()
    main(load=args.load, visual=args.visual, learning_rate=args.learning_rate,
         filename=args.filename, stddev=args.stddev)