-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmultigpu_cnn_execute.py
165 lines (136 loc) · 6.96 KB
/
multigpu_cnn_execute.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
from __future__ import print_function
import numpy as np
import tensorflow as tf
import time
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Parameters
num_gpus = 2
num_steps = 200
learning_rate = 0.001
batch_size = 1024
display_step = 10
# Network Parameters
num_input = 784 # MNIST data input (img shape: 28*28)
num_classes = 10 # MNIST total classes (0-9 digits)
dropout = 0.75 # Dropout, probability to keep units
# Build a convolutional neural network
def conv_net(x, n_classes, dropout, reuse, is_training):
# Define a scope for reusing the variables
with tf.variable_scope('ConvNet', reuse=reuse):
# MNIST data input is a 1-D vector of 784 features (28*28 pixels)
# Reshape to match picture format [Height x Width x Channel]
# Tensor input become 4-D: [Batch Size, Height, Width, Channel]
x = tf.reshape(x, shape=[-1, 28, 28, 1])
# Convolution Layer with 64 filters and a kernel size of 5
x = tf.layers.conv2d(x, 64, 5, activation=tf.nn.relu)
# Max Pooling (down-sampling) with strides of 2 and kernel size of 2
x = tf.layers.max_pooling2d(x, 2, 2)
# Convolution Layer with 256 filters and a kernel size of 5
x = tf.layers.conv2d(x, 256, 3, activation=tf.nn.relu)
# Convolution Layer with 512 filters and a kernel size of 5
x = tf.layers.conv2d(x, 512, 3, activation=tf.nn.relu)
# Max Pooling (down-sampling) with strides of 2 and kernel size of 2
x = tf.layers.max_pooling2d(x, 2, 2)
# Flatten the data to a 1-D vector for the fully connected layer
x = tf.contrib.layers.flatten(x)
# Fully connected layer (in contrib folder for now)
x = tf.layers.dense(x, 2048)
# Apply Dropout (if is_training is False, dropout is not applied)
x = tf.layers.dropout(x, rate=dropout, training=is_training)
# Fully connected layer (in contrib folder for now)
x = tf.layers.dense(x, 1024)
# Apply Dropout (if is_training is False, dropout is not applied)
x = tf.layers.dropout(x, rate=dropout, training=is_training)
# Output layer, class prediction
out = tf.layers.dense(x, n_classes)
# Because 'softmax_cross_entropy_with_logits' loss already apply
# softmax, we only apply softmax to testing network
out = tf.nn.softmax(out) if not is_training else out
return out
# Build the function to average the gradients
def average_gradients(tower_grads):
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(grads, 0)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
if __name__ == "__main__":
# Place all ops on CPU by default
with tf.device('/cpu:0'):
tower_grads = []
reuse_vars = False
# tf Graph input
X = tf.placeholder(tf.float32, [None, num_input])
Y = tf.placeholder(tf.float32, [None, num_classes])
# Loop over all GPUs and construct their own computation graph
for i in range(num_gpus):
with tf.device('/gpu:%d' % i):
# Split data between GPUs
_x = X[i * batch_size: (i+1) * batch_size]
_y = Y[i * batch_size: (i+1) * batch_size]
# Because Dropout have different behavior at training and prediction time, we
# need to create 2 distinct computation graphs that share the same weights.
# Create a graph for training
logits_train = conv_net(_x, num_classes, dropout,
reuse=reuse_vars, is_training=True)
# Create another graph for testing that reuse the same weights
logits_test = conv_net(_x, num_classes, dropout,
reuse=True, is_training=False)
# Define loss and optimizer (with train logits, for dropout to take effect)
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
logits=logits_train, labels=_y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
grads = optimizer.compute_gradients(loss_op)
# Only first GPU compute accuracy
if i == 0:
# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(logits_test, 1), tf.argmax(_y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
reuse_vars = True
tower_grads.append(grads)
tower_grads = average_gradients(tower_grads)
train_op = optimizer.apply_gradients(tower_grads)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
step = 1
# Keep training until reach max iterations
for step in range(1, num_steps + 1):
# Get a batch for each GPU
batch_x, batch_y = mnist.train.next_batch(batch_size * num_gpus)
# Run optimization op (backprop)
ts = time.time()
sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
te = time.time() - ts
if step % display_step == 0 or step == 1:
# Calculate batch loss and accuracy
loss, acc = sess.run([loss_op, accuracy], feed_dict={X: batch_x,
Y: batch_y})
print("Step " + str(step) + ": Minibatch Loss= " + \
"{:.4f}".format(loss) + ", Training Accuracy= " + \
"{:.3f}".format(acc) + ", %i Examples/sec" % int(len(batch_x)/te))
step += 1
print("Optimization Finished!")
# Calculate accuracy for 1000 mnist test images
print("Testing Accuracy:", \
np.mean([sess.run(accuracy, feed_dict={X: mnist.test.images[i:i+batch_size],
Y: mnist.test.labels[i:i+batch_size]}) for i in range(0, len(mnist.test.images), batch_size)]))