forked from schaul/nnsandbox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataSet.py
200 lines (174 loc) · 7.47 KB
/
DataSet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
from numpy import *
import BigMat as bm
import cPickle as cp
import zlib
from Util import *
from time import time as now
class BatchSet(object):
    '''
    Iterates over a dataset in contiguous mini-batches.

    Each batch is yielded as a DataFold wrapping (X,Y[,S]) row slices.
    uint8 inputs are copied into a lazily-allocated float buffer and scaled
    by 1/255 on the fly, so the full dataset can stay compactly stored as
    bytes in [0,255].
    '''
    def __init__(self,X,Y,S,batchsize):
        self._size = m = X.shape[0]
        self._X = X
        self._Y = Y
        self._S = S           # optional auxiliary matrix; may be None
        self._index = 0
        self._Xbuf = None     # float scratch buffer, allocated on first uint8 batch
        self._batchsize = batchsize
        self._blocksize = batchsize * max(1,2048//batchsize)
        # One (start,stop) row-index pair per batch; the last stop is clamped
        # to m so the final (possibly short) batch does not run past the data.
        self._batches_all = vstack([arange(0,m,batchsize),
                                    arange(batchsize,m+batchsize,batchsize)]).transpose()
        self._batches_all[-1,-1] = m
        self._batches = self._batches_all.copy()

    def __iter__(self):
        self._index = 0
        return self

    def resize(self,newsize):
        # Restrict iteration to the first 'newsize' batches.
        self._batches = self._batches_all[:newsize,:].copy()

    def next(self):
        if self._index >= len(self._batches):
            raise StopIteration
        s = slice(*(self._batches[self._index]))
        Xbuf = self._X[s,:]
        if Xbuf.dtype == 'uint8':
            # BUGFIX: was 'self._Xbuf == None', which becomes an elementwise
            # comparison once the buffer is an array and cannot be used as a
            # truth value; an identity test is the correct check.
            if self._Xbuf is None:
                self._Xbuf = bm.empty((self._batchsize,self._X.shape[1]))
            self._Xbuf[:s.stop-s.start,:] = Xbuf # copy
            Xbuf = self._Xbuf[:s.stop-s.start,:] # point to copy
            bm.imul(Xbuf,1./255)                 # rescale bytes into [0,1]
        # BUGFIX: 'self._S != None' had the same elementwise-comparison problem.
        batch = DataFold(Xbuf,self._Y[s,:],self._S[s,:] if self._S is not None else None)
        self._index += 1
        return batch

    __next__ = next   # Python 3 iterator-protocol compatibility (backward-compatible addition)

    def __len__(self):
        return len(self._batches)

    def shuffle(self):
        # Randomize batch order in place; each batch's contents stay contiguous.
        random.shuffle(self._batches)
class DataFold(object):
    '''
    Holds one fold of data: an input matrix X, the matching target matrix Y,
    and an optional auxiliary matrix S (rows of all three are aligned).
    '''
    def __init__(self,X,Y,S=None):
        assert(X.shape[0] == Y.shape[0])
        self.X = X
        self.Y = Y
        self.S = S
        self.size = X.shape[0]

    def __iter__(self):
        # Support tuple-style unpacking:  X,Y = fold
        return iter([self.X,self.Y])

    def make_batches(self,batchsize):
        # Never request a batch larger than the number of rows available.
        rows = self.X.shape[0]
        return BatchSet(self.X,self.Y,self.S,min(rows,batchsize))
class DataSet(object):
    '''
    A simple structure containing three DataFold instances:
    a 'train', 'valid', and 'test' fold.

    X,Y may be plain matrices or dicts with 'train'/'test' keys.  Data is
    moved onto the BigMat backend (possibly a GPU via gnumpy) at
    construction time; split() later carves train/valid folds out of the
    same in-memory rows, and rescale() remaps value ranges in place.
    '''
    def __init__(self,X,Y,Xshape=None,Yshape=None,Xrange=None,Yrange=None,shuffle=True,max_batchsize=1):
        if isinstance(X,dict):
            # Separate train/test data supplied explicitly.
            Xtest = X['test']
            Ytest = Y['test']
            X = X['train']
            Y = Y['train']
        else:
            # No test data supplied: use row 0 as a tiny placeholder fold.
            Xtest = X[0,:] # empty
            Ytest = Y[0,:] # empty
        if shuffle:
            # Shuffle rows once up front; X and Y are permuted identically.
            perm = random.permutation(X.shape[0])
            X = take(X,perm,axis=0)
            if not (X is Y):   # X is Y for autoencoder-style datasets; permute only once
                Y = take(Y,perm,axis=0)
        t0 = now()
        # Transfer data to the compute backend; _Y/_Ytest alias _X/_Xtest
        # when inputs double as targets, to avoid a duplicate copy.
        self._X = bm.asarray(X)
        self._Y = bm.asarray(Y) if not (X is Y) else self._X
        self._Xtest = bm.asarray(Xtest)
        self._Ytest = bm.asarray(Ytest) if not (X is Y) else self._Xtest
        if bm.backend_name == "gnumpy":
            print "Host->Device transfer of dataset took %.3fs" % (now()-t0)
        self._size = X.shape[0]
        self._Xrescale = (1.,0.) #scale,bias
        self.max_batchsize = max_batchsize
        # Logical input/output shapes; default to flat row vectors.
        self.Xshape = Xshape or (1,X.shape[1])
        self.Yshape = Yshape or (1,Y.shape[1])
        self.Xdim = X.shape[1]
        self.Ydim = Y.shape[1]
        # Value ranges, computed per-column from the data when not given.
        self.Xrange = Xrange or (X.min(axis=0),X.max(axis=0))
        self.Yrange = Yrange or (Y.min(axis=0),Y.max(axis=0))
        if not isscalar(self.Xrange[0]):
            # Per-column ranges become (1,dim) backend arrays for broadcasting.
            self.Xrange = (bm.asarray(self.Xrange[0]).reshape((1,-1)),bm.asarray(self.Xrange[1]).reshape((1,-1)))
        if not isscalar(self.Yrange[0]):
            self.Yrange = (bm.asarray(self.Yrange[0]).reshape((1,-1)),bm.asarray(self.Yrange[1]).reshape((1,-1)))
        # Initially every row belongs to 'train'; call split() to carve out
        # a validation fold.  The folds are views/slices of _X and _Y.
        rs = self._rowslice(0,self._size)
        self.train = DataFold(self._X[rs,:],self._Y[rs,:])
        self.valid = DataFold(self._X[0:0,:],self._Y[0:0,:])
        self.test = DataFold(self._Xtest,self._Ytest)

    # Dict-like access to the three folds.
    def keys(self): return ['train','valid','test']
    def values(self): return [self.train,self.valid,self.test]
    def items(self): return zip(self.keys(),self.values())

    def _rowslice(self,a,b):
        # round endpoint down so the fold size is a whole multiple of max_batchsize
        b = a + self.max_batchsize *((b-a) // self.max_batchsize)
        return slice(a,b)

    def __getitem__(self,key):
        # Allow data['train'] style access alongside attribute access.
        if key == 'train': return self.train
        elif key == 'valid': return self.valid
        elif key == 'test': return self.test
        raise KeyError("invalid key for DataSet fold")

    def split(self,trainsplit,validsplit=0,testsplit=0):
        # Re-partition rows into train/valid folds by percentage.
        # NOTE(review): testsplit is accepted but unused -- the test fold
        # always comes from the separately supplied test data.
        assert(trainsplit + validsplit <= 100)
        trainsize = int(trainsplit * self._size // 100)
        validsize = int(validsplit * self._size // 100)
        trs = self._rowslice(0,trainsize)
        vas = self._rowslice(trainsize,trainsize+validsize)
        self.train.X = self._X[trs,:]
        self.train.Y = self._Y[trs,:]
        self.train.size = self.train.X.shape[0]
        self.valid.X = self._X[vas,:]
        self.valid.Y = self._Y[vas,:]
        self.valid.size = self.valid.X.shape[0]

    def rescale(self,Xrange,Yrange):
        '''
        Rescales the entire dataset so that all inputs X lie within (Xrange[0],Xrange[1])
        and all targets Y lie within (Yrange[0],Yrange[1]).
        The same scaling factor is applied to all folds.
        '''
        # uint8 inputs are skipped: BatchSet rescales them per-batch instead.
        if Xrange != self.Xrange and self._X.dtype != 'uint8':
            Xscale = self.Xrange[1]-self.Xrange[0]
            if isscalar(Xscale):
                # 1e-5 floor guards against division by a zero-width range.
                Xscale = (Xrange[1]-Xrange[0]) / maximum(1e-5,Xscale)
            else:
                # Per-column scaling, computed in place on the backend.
                bm.maximum(Xscale,1e-5,out=Xscale)
                bm.reciprocal(Xscale,out=Xscale)
                bm.multiply(Xscale,Xrange[1]-Xrange[0],out=Xscale)
            # Shift to zero, scale, then shift to the new minimum -- in place,
            # so all folds (which alias _X) see the rescaled values.
            bm.isub(self._X,self.Xrange[0])
            bm.imul(self._X,Xscale)
            bm.iadd(self._X,Xrange[0])
        # Skip Y when it aliases X (already rescaled above).
        if Yrange != self.Yrange and not (self._X is self._Y):
            Yscale = self.Yrange[1]-self.Yrange[0]
            if isscalar(Yscale):
                Yscale = (Yrange[1]-Yrange[0]) / maximum(1e-5,Yscale)
            else:
                bm.maximum(Yscale,1e-5,out=Yscale)
                bm.reciprocal(Yscale,out=Yscale)
                bm.multiply(Yscale,Yrange[1]-Yrange[0],out=Yscale)
            bm.isub(self._Y,self.Yrange[0])
            bm.imul(self._Y,Yscale)
            bm.iadd(self._Y,Yrange[0])
        self.Xrange = Xrange
        self.Yrange = Yrange
################################################
def load_mnist(digits=range(10)):
    '''
    Load the MNIST digit dataset from per-digit zlib-compressed pickle files.

    digits: digit classes (0-9) to load; the target matrix is restricted to
            those columns so training on digit subsets is faster.
    Returns a DataSet with 'train' and 'test' folds, where X rows are 28x28
    uint8 images flattened to 784 values in [0,255] and Y rows are one-hot
    float32 targets in [0,1].
    '''
    X,Y = {},{}
    for fold in ('train','test'):
        X[fold],Y[fold] = [],[] # start with empty lists
        for d in digits:
            # Load all N instances of digit 'd' as an Nx784 input matrix and
            # an Nx10 one-hot target matrix, each stored as a compressed blob.
            Xd,Yd = quickload("data/mnist/mnist_%s_%i.pkl" % (fold,d))
            Xd = zlib.decompress(Xd) # decompress byte string
            Yd = zlib.decompress(Yd) # decompress byte string
            # BUGFIX: floor division. Plain '/' yields a float under true
            # division (Python 3 or 'from __future__ import division'), which
            # ndarray's shape argument rejects; '//' is identical for Python 2
            # ints and robust everywhere.
            n = len(Xd)//(28*28)
            Xd = ndarray(shape=(n,28*28),buffer=Xd,dtype='uint8') # convert back to numpy array
            Yd = ndarray(shape=(n,10)   ,buffer=Yd,dtype='uint8') # convert back to numpy array
            X[fold].append(Xd)
            # Keep only the requested target columns, as float32, so the
            # output dimensionality matches the number of actual targets.
            Y[fold].append(asarray(Yd[:,digits],dtype='float32'))
        X[fold] = vstack(X[fold])
        Y[fold] = vstack(Y[fold])
    return DataSet(X,Y,Xshape=(28,28,1),Xrange=[0.0,255.0],Yrange=[0.0,1.0],shuffle=True)