-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathDeepNeuralNetPredict.py
259 lines (219 loc) · 12.4 KB
/
DeepNeuralNetPredict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
"""
Multi-task Deep Neural Network (DNN) Prediction Program
Inputs:
- data: a path to data folder that stores the test datasets (.npz file) [Required]
if the data folder only contains raw '_test.csv' files, then pre-process will automatically be done.
- model: a FullFilePath, in which a file named DeepNeuralNetParameters.npz stores the trained DNN model for the prediction [Required]
- output destination: a FullFilePath to specify the output directory [Optional]
- BatchSize: size of the batch of records to predict at once. [Optional]
- label: 0 or 1, indicates whether the input data have true labels or not. [Required]
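- dropout: the probability of the nodes to be dropped randomly during predicting; either a single value used for every layer or one value per layer joined by '_' (each value at most 0.5). If not positive, the dropout rates saved with the model are used. [Optional]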
- rep: an integer, number of repetitions for random drop-out prediction. [Optional]
- seed: random seed useful for dropout predictions. [Optional]
Output: in the specified output directory,
- a file named DeepNeuralNetPredict_log.txt that stores the log information of the testing process,
- files named DeepNeuralNetPredict_datName.csv, where 'datName' is one of the QSAR tasks, that store the predicted results of the inputted test data using the saved DNN model and related parameters.
- files named rsq_datName_test.csv, where 'datName' is one of the QSAR tasks, that store the R-squared between the true activity of the test set and the prediction given by each of the output nodes.
If use dropout predictions:
- folders named DeepNeuralNetPredict_datName_dropout, where 'datName' is one of the QSAR tasks, that store the predicted results for each dropout prediction round.
Usage:
- to get help:
python DeepNeuralNetPredict.py -h
- to run the program:
python DeepNeuralNetPredict.py [parameters] --data=FullpathToTestDataFile --model=FullPathToSavedDNNModel --result=FullPathToPredictionOutput
Usage Example:
(Detailed explanations in README)
python DeepNeuralNetPredict.py --label=1 --data=data_sparse --model=models/multi_sparse
python DeepNeuralNetPredict.py --label=1 --seed=0 --rep=10 --data=data_sparse --model=models/multi_sparse_2 --result=predictions/multi_sparse_2
-------------
Prepared By Junshui Ma (Based on George Dahl's Kaggle Code)
06/12/2014
Last modified by Yuting Xu on Aug.07, 2017
"""
import argparse
import glob
import itertools
import os
import sys

import numpy as num
import scipy.sparse as sp
import gnumpy as gnp

from activationFunctions import *
import dnn
from DNNSharedFunc import *
from processData_sparse import *
import processData_dense
from DeepNeuralNetTrain import *
def collectPredictions(predStream):
    """
    Drain 'predStream' and stack everything it yields row-wise into one array.
    Inputs:
    - predStream: an iterable of per-minibatch prediction arrays
    Output: the result of num.vstack over all yielded items, in order
    """
    return num.vstack([batch for batch in predStream])
def getTestPreds(net, VariableParaDict, datasets, inpPreproFunct, useDropout = False, dense = False, batchSize = 512):
    """
    Return a list of the test set predictions for each test data in 'datasets'.
    Inputs:
    - net: the network object; net.predictions(stream, True, useDropout) is called once per data set
    - VariableParaDict: mapping that provides 'targMean' and 'targStd', used to scale
      the network outputs back to the original target space (pred * std + mean)
    - datasets: iterable of data-set objects exposing an 'inps' matrix (one row per record)
    - inpPreproFunct: callable applied to every minibatch of inputs before prediction
    - useDropout: forwarded unchanged to net.predictions
    - dense: if True, 'targMean'/'targStd' are tiled as stored; otherwise a leading
      axis is added first (i.e. they are treated as 1-D vectors)
    - batchSize: number of records per minibatch handed to allMinibatches (default 512,
      the value that used to be hard-coded)
    Output: a list with one prediction array per data set, in the order of 'datasets'
    """
    predsOrigSpace = []
    for ds in datasets:
        # lazily pre-process one minibatch at a time
        testInpMbStream = (inpPreproFunct(x) for x in allMinibatches(batchSize, ds.inps))
        nRecords = ds.inps.shape[0]
        if dense:
            avgMat = num.repeat(VariableParaDict['targMean'], nRecords, 0)
            stdMat = num.repeat(VariableParaDict['targStd'], nRecords, 0)
        else:
            avgMat = num.repeat(VariableParaDict['targMean'][num.newaxis,:], nRecords, 0)
            stdMat = num.repeat(VariableParaDict['targStd'][num.newaxis,:], nRecords, 0)
        # undo the target standardisation so predictions are in the original units
        testPreds = collectPredictions(net.predictions(testInpMbStream, True, useDropout)) * stdMat + avgMat
        predsOrigSpace.append(testPreds)
    return predsOrigSpace
def parseArgs():
    """
    Parse and validate the command-line arguments of the prediction program.

    Besides the raw options, the returned namespace carries these derived fields:
    - dropouts: list of floats parsed from the '_'-separated --dropout string
    - PredictResultPath / savePrefix: output folder (defaults to the model folder)
    - modelPath: <model>/DeepNeuralNetParameters.npz
    - allTestDat: sorted list of '*test.npz' files found in the data folder
    - BatchSize: kept as a positive int

    Invalid input (non-positive batch size, malformed or > 0.5 dropout values,
    missing model file, missing data folder) is reported through parser.error,
    which prints the usage message and exits with status 2.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", action="store", dest='dataPath', type=str, default = None,
                        help = 'Path to the folder which contains test data sets, which are all pre-processed and stored in npz format.')
    parser.add_argument("--model", action="store", dest='model', type=str, default = '',
                        help = 'FullFilePath of the model folder which contains a file DeepNeuralNetParameters.npz, which is in NPZ format and stores the trained Deep Neural Network. (default = (current directory))')
    parser.add_argument("--BatchSize", action="store", type=int, dest="BatchSize", default=256,
                        help = "size of the batch of records to predict at once.(default=256)")
    parser.add_argument("--label", action="store", type=int, dest='label', default=0,
                        help = "Whether the data have true label or not. No=0, YES=1.")
    parser.add_argument("--dropout", "--dropouts", action="store", dest='dropoutStr', type=str, default = '-1',
                        help = "layer-specific drop-out probability")
    parser.add_argument("--rep", action="store", type=int, dest='rep', default=0,
                        help = "Number of repetitions for drop-out prediction. if == 0, means don't use dropout prediction")
    parser.add_argument("--seed", action="store", type=int, default=8,
                        help = "Seed the random number generator.")
    parser.add_argument("--dense", action = "store_true", default=False,
                        help = "Whether use for dense QSAR tasks (default = False)")
    parser.add_argument("--result", type=str, dest='PredictResultPath', default = None,
                        help = "Full filepath, in which the Predict result is saved")
    args = parser.parse_args()

    # argparse already delivers an int; keep it an int so it can be used as a size
    if args.BatchSize <= 0:
        parser.error("--BatchSize must be a positive integer (got %d)" % args.BatchSize)

    # one probability per layer, joined by '_' (a single value applies to every layer)
    try:
        args.dropouts = [float(i) for i in args.dropoutStr.split("_")]
    except ValueError:
        parser.error("--dropout must be numbers separated by '_' (got %r)" % args.dropoutStr)
    if not all(d <= 0.5 for d in args.dropouts):
        parser.error("every --dropout value must be <= 0.5 (got %r)" % args.dropoutStr)

    # results go next to the model unless an explicit output folder is given
    if args.PredictResultPath is None:
        args.PredictResultPath = args.model
    args.savePrefix = args.PredictResultPath

    args.modelPath = os.path.join(args.model, 'DeepNeuralNetParameters.npz')
    if not os.path.exists(args.modelPath):
        parser.error("trained model file not found: %s" % args.modelPath)
    if args.dataPath is None or not os.path.exists(args.dataPath):
        parser.error("--data must point to an existing folder (got %r)" % args.dataPath)

    args.allTestDat = sorted(glob.glob(args.dataPath+"/*test.npz"))
    return args
def main():
    """
    Entry point: parse the command line, open the log file and run the prediction.

    The log file DeepNeuralNetPredict_log.txt is created in the output folder and
    is closed even when the prediction fails part-way through.
    """
    # get inputs
    args = parseArgs()

    # set random seed (only when dropout predictions are requested)
    if args.rep > 0:
        num.random.seed(args.seed)

    # create a folder to save prediction results if it doesn't exist
    if not os.path.exists(args.savePrefix):
        os.makedirs(args.savePrefix)

    # begin log; the with-block guarantees the file handle is released on errors too
    logPath = os.path.join(args.savePrefix, "DeepNeuralNetPredict_log.txt")
    with open(logPath, 'w') as _logFile:
        log = Tee(sys.stdout, _logFile)
        print >>log, " ".join(sys.argv)
        print >>log, "Start time: %s " % (tstamp())
        _runPrediction(args, log)
        print >>log, "Finish time: %s " % (tstamp())


def _runPrediction(args, log):
    """
    Load the saved network, predict every test set and write all result files.
    Inputs:
    - args: the namespace returned by parseArgs()
    - log: file-like object that receives the progress messages
    """
    # load trained model
    NNet, VariableParaDict = dnn.loadSavedNeuralNet(args.modelPath, args.dense)
    if len(args.dropouts)==1 :
        if (args.dropouts[0] > 0):
            # a single positive value is applied to every layer
            NNet.dropouts = [args.dropouts[0] for i in range(len(NNet.weights))]
        else:
            # otherwise use same dropouts as in training
            NNet.dropouts = VariableParaDict['dropouts']
            NNet.dropouts = NNet.dropouts.tolist()
    else:
        # layer-specific values: need exactly one per weight matrix
        assert(len(NNet.weights)==len(args.dropouts))
        NNet.dropouts = args.dropouts

    # generate featTable from featNames in trained model
    featNames = VariableParaDict['featNames']
    featTable = dict((feat,j) for j,feat in enumerate(featNames))
    if args.dense:
        outputNames = VariableParaDict['outputNames']
        outputTable = dict((output,j) for j,output in enumerate(outputNames))

    # prepare test datasets
    if len(args.allTestDat)==0:
        # need to preprocess raw test datasets from .csv files
        if not args.dense:
            preprocess_test(args.dataPath,featTable)
        else:
            processData_dense.preprocess_test(args.dataPath,featTable,outputTable,args.label)
        # pick up the .npz files the pre-processing step is expected to have produced
        args.allTestDat = glob.glob(args.dataPath+"/*test.npz")
        args.allTestDat = sorted(args.allTestDat)

    # QSAR task name of each test set (derived from the file name)
    args.TestdatNames = []
    for p in args.allTestDat:
        if not args.dense:
            args.TestdatNames.append(os.path.basename(p).split("_")[0])
        else:
            args.TestdatNames.append(os.path.basename(p).split(".")[0])

    # load all test datasets from dataPath
    datasets = []
    for p in args.allTestDat:
        print >>log, "loading %s " % (p)
        datasets.append(Dataset(p, VariableParaDict['OutsSize'], None, -1))

    # z-score inputs are standardised up front, per data set, with the saved training statistics
    if VariableParaDict['transform'] == 'zscore':
        prior = 0.01
        inpsMean = VariableParaDict['trainInpsMean']
        inpsStddev = VariableParaDict['trainInpsStddev']
        for i, ds in enumerate(datasets):
            m = inpsMean[i]
            s = inpsStddev[i]
            # builtin float is the dtype the removed num.float alias referred to
            ds.inps = (ds.inps.toarray().astype(dtype=float) - m[num.newaxis,:])/(prior + s[num.newaxis,:])

    # default per-minibatch pre-processing: just densify the minibatch
    preproInps = lambda xx: xx.toarray()
    # NOTE(review): '!= None' is kept as-is; the value presumably comes from the saved
    # .npz and may be a NumPy object rather than a plain Python value - confirm before changing
    if VariableParaDict['transform'] != None:
        if VariableParaDict['transform'] == 'zscore':
            print >>log, "Removing the saved train-data mean from each dimension and divide it by saved train-data standard deviation (plus %f)." % (prior)
            # inputs were already standardised (and densified) above
            preproInps = lambda xx: xx
        if VariableParaDict['transform'] == 'sqrt':
            print >>log, "Transforming inputs by taking the square root"
            preproInps = lambda xx: num.sqrt(xx.toarray())
        if VariableParaDict['transform'] == 'binarize':
            print >>log, "Transforming inputs by binarizing the input values"
            preproInps = lambda xx: (xx.toarray()>0.0)+0.0
        if VariableParaDict['transform'] == 'log':
            print >>log, "Transforming inputs by taking the logarithm after adding 1"
            preproInps = lambda xx: num.log(1.0 + xx.toarray())
        if VariableParaDict['transform'] == 'asinh':
            print >>log, "Transforming inputs by taking the inverse hyperbolic sine function"
            preproInps = lambda xx: num.arcsinh(xx.toarray())

    print >>log, "Start predict (no dropout)..."
    testPreds = getTestPreds(NNet, VariableParaDict, datasets, preproInps, False, args.dense)
    print >>log, "Finish prediction. Save results to folder %s " % (args.savePrefix)

    predictPath = []
    for datName in args.TestdatNames:
        predictPath.append(os.path.join(args.savePrefix,'DeepNeuralNetPredict_%s.csv' % (datName)))

    for i,testPred in enumerate(testPreds):
        print >>log, "Save prediction results for %s test set." % (args.TestdatNames[i])
        writePreds2(predictPath[i], datasets[i].molIds, testPred, header = VariableParaDict['datNames'],multicol=(VariableParaDict['OutsSize']>1))

    # If use dropout prediction: one sub-folder per test set, one csv file per round
    if args.rep > 0:
        predictPath_dropout = []
        for datName in args.TestdatNames:
            folderToAdd = os.path.join(args.savePrefix,'DeepNeuralNetPredict_%s_dropout' % (datName))
            predictPath_dropout.append(folderToAdd)
            if not os.path.exists(folderToAdd):
                os.makedirs(folderToAdd)
        print >>log, "Start predict (with dropout)..."
        for r in range(args.rep):
            # progress message goes to the console only, not to the log file
            print >>sys.stdout, "Dropout prediction round: %d" % (r)
            testPreds_dropout = getTestPreds(NNet, VariableParaDict, datasets, preproInps, True, args.dense)
            for i,testPred in enumerate(testPreds_dropout):
                savePathForDropout = os.path.join(predictPath_dropout[i],'DeepNeuralNetPredict_%s_dropout_%d.csv' % (args.TestdatNames[i],r+1))
                writePreds2(savePathForDropout, datasets[i].molIds, testPred, header = VariableParaDict['datNames'],multicol=(VariableParaDict['OutsSize']>1))

    # If test set has true label: R-squared of every output node against the true activity
    if args.label != 0:
        for i,testPred in enumerate(testPreds):
            RSQs = num.ndarray(shape = (1, VariableParaDict['OutsSize']))
            for j in range(VariableParaDict['OutsSize']):
                if not args.dense:
                    RSQs[0,j] = rSq(testPred[:,j],datasets[i].origTargs)
                else:
                    RSQs[0,j] = rSq(testPred[:,j],datasets[i].origTargs[:,j])
            print >>log, "Save R-squared of prediction results for %s test set." % (args.TestdatNames[i])
            writeMat(os.path.join(args.savePrefix, 'rsq_%s_test.csv' % (args.TestdatNames[i])), RSQs, header = VariableParaDict['datNames'])
# Run the prediction program only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()