-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbuildDNN.py
201 lines (158 loc) · 11.3 KB
/
buildDNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
from Functions import *

### Setting a seed for reproducibility
tf.random.set_seed(1234)

savePlot = True
NN = 'DNN'
batchSize = 2048

### Reading the command line
jetCollection, analysis, channel, preselectionCuts, background, trainingFraction, signal, numberOfNodes, numberOfLayers, numberOfEpochs, validationFraction, dropout, testMass = ReadArgParser()

### Reading the configuration file
dfPath, InputFeatures = ReadConfig(analysis, jetCollection)
dfPath += analysis + '/' + channel + '/' + str(signal) + '/' + background

### Loading input data
data_train, data_test, X_train_unscaled, m_train_unscaled, m_test_unscaled = LoadData(dfPath, jetCollection, str(signal), analysis, channel, background, trainingFraction, preselectionCuts, InputFeatures)

### Removing 'mass' from the list of variables that will be given as input to the DNN and from the unscaled train sample (otherwise files saved with "SaveModel" will have the wrong format)
InputFeatures.remove('mass')
X_train_unscaled = X_train_unscaled[InputFeatures]

### Dividing signal from background
data_test_signal = data_test[data_test['isSignal'] == 1]
data_test_bkg = data_test[data_test['isSignal'] == 0]
m_test_unscaled_signal = m_test_unscaled[data_test['isSignal'] == 1]
data_train_signal = data_train[data_train['isSignal'] == 1]
data_train_bkg = data_train[data_train['isSignal'] == 0]
m_train_unscaled_signal = m_train_unscaled[data_train['isSignal'] == 1]

### Saving unscaled train signal masses; dict.fromkeys deduplicates while preserving first-occurrence order
unscaledTrainMassPointsList = list(dict.fromkeys(list(m_train_unscaled_signal)))

### Extracting scaled train signal masses. Both lists are built from the same rows in the same
### order, so index i of the scaled list corresponds to index i of the unscaled list.
m_train_signal = data_train_signal['mass']
scaledTrainMassPointsList = list(dict.fromkeys(list(m_train_signal)))

### 'all' means: evaluate every signal mass present in the (unscaled) test sample.
### NOTE(review): the original also had a dead `testMass = []` here, immediately overwritten.
if testMass == ['all']:
    testMass = list(str(int(item)) for item in set(list(m_test_unscaled_signal)))
def _write_scores(fileName, scores):
    ### Writing one DNN output score per line; the context manager guarantees the file is closed
    with open(fileName, 'w') as scoresFile:
        for score in scores:
            scoresFile.write(str(score) + '\n')

### Training, evaluating and saving one DNN for each requested signal mass hypothesis
for unscaledMass in testMass:
    unscaledMass = int(unscaledMass)

    ### Checking whether there are train events with the selected mass
    if unscaledMass not in unscaledTrainMassPointsList:
        print(Fore.RED + 'No train signal with mass ' + str(unscaledMass))
        continue

    ### Associating the unscaled mass to the scaled one (the two lists are index-aligned)
    mass = scaledTrainMassPointsList[unscaledTrainMassPointsList.index(unscaledMass)]

    ### Creating the output directory
    outputDir = dfPath + '/' + NN + '/' + str(int(unscaledMass))
    print(format('Output directory: ' + Fore.GREEN + outputDir), checkCreateDir(outputDir))

    ### Creating the logFile
    logFileName = outputDir + '/logFile.txt'
    logFile = open(logFileName, 'w')
    logString = WriteLogFile(numberOfNodes, numberOfLayers, numberOfEpochs, validationFraction, dropout, InputFeatures, dfPath)
    logFile.write(logString)
    logFile.write('\nNumber of train events: ' + str(len(data_train)) + ' (' + str(len(data_train_signal)) + ' signal and ' + str(len(data_train_bkg)) + ' background)' + '\nNumber of test events: ' + str(len(data_test)) + ' (' + str(len(data_test_signal)) + ' signal and ' + str(len(data_test_bkg)) + ' background)')

    ### Selecting signal events with the same mass
    data_train_signal_mass = data_train_signal[data_train_signal['mass'] == mass]
    data_test_signal_mass = data_test_signal[data_test_signal['mass'] == mass]
    logFile.write('\nNumber of train signal events with mass ' + str(unscaledMass) + ': ' + str(len(data_train_signal_mass)) + '\nNumber of test signal events with mass ' + str(unscaledMass) + ': ' + str(len(data_test_signal_mass)))

    ### Putting signal and background events back together
    data_train_mass = pd.concat([data_train_signal_mass, data_train_bkg], ignore_index = True)
    data_test_mass = pd.concat([data_test_signal_mass, data_test_bkg], ignore_index = True)

    ### Shuffling data
    data_train_mass = ShufflingData(data_train_mass)
    data_test_mass = ShufflingData(data_test_mass)

    ### Extracting y_mass and origin_mass as numpy arrays
    y_train_mass = np.asarray(data_train_mass['isSignal'].values).astype(np.float32)
    y_test_mass = np.asarray(data_test_mass['isSignal'].values).astype(np.float32)
    origin_train_mass = np.array(data_train_mass['origin'].values)
    #origin_test_mass = np.array(data_test_mass['origin'].values)

    ### Selecting only the DNN input variables and converting to float32 numpy arrays
    X_train_mass = np.asarray(data_train_mass[InputFeatures].values).astype(np.float32)
    X_test_mass = np.asarray(data_test_mass[InputFeatures].values).astype(np.float32)

    ### Weighting train events
    w_train_mass, origins_list, DictNumbers, DictWeights = weightEvents(origin_train_mass, str(signal))
    logFile.write('\nOrigin list: ' + str(origins_list) + '\nOrigins numbers: ' + str(DictNumbers) + '\nOrigins weights: ' + str(DictWeights))
    ### NOTE(review): the original wrote logString into the logFile a second time here;
    ### removed as an accidental duplicate (it is already written right after opening the file)

    ### Building the model, compiling and training
    model, Loss, Metrics, learningRate, Optimizer = BuildDNN(len(InputFeatures), numberOfNodes, numberOfLayers, dropout)
    model.compile(loss = Loss, optimizer = Optimizer, weighted_metrics = Metrics)
    logFile.write('\nLoss: ' + Loss + '\nLearning rate: ' + str(learningRate) + '\nOptimizer: ' + str(Optimizer) + '\nweighted_metrics: ' + str(Metrics))
    print(Fore.BLUE + 'Training the DNN on train events with mass ' + str(int(unscaledMass)))
    ### callbacks is a list, as documented for keras Model.fit (the original passed a bare callback)
    modelMetricsHistory = model.fit(X_train_mass, y_train_mass, sample_weight = w_train_mass, epochs = numberOfEpochs, batch_size = batchSize, validation_split = validationFraction, verbose = True, callbacks = [EarlyStopping(verbose = True, patience = 10, monitor = 'val_loss', restore_best_weights = True)])

    ### Saving to files
    SaveModel(model, X_train_unscaled, outputDir)

    ### Evaluating the performance of the DNN and writing results to the log file
    print(Fore.BLUE + 'Evaluating the performance of the DNN on test events with mass ' + str(int(unscaledMass)))
    testLoss, testAccuracy = EvaluatePerformance(model, X_test_mass, y_test_mass, batchSize)
    logFile.write('\nTest loss: ' + str(testLoss) + '\nTest accuracy: ' + str(testAccuracy))

    ### Drawing accuracy and loss
    if savePlot:
        DrawLoss(modelMetricsHistory, testLoss, outputDir, NN, jetCollection, analysis, channel, preselectionCuts, signal, background, unscaledMass)
        DrawAccuracy(modelMetricsHistory, testAccuracy, outputDir, NN, jetCollection, analysis, channel, preselectionCuts, signal, background, unscaledMass)

    ### Prediction on signal and background
    yhat_test, yhat_train = PredictionTrainTest(model, X_test_mass, X_train_mass, batchSize)
    yhat_train_signal = yhat_train[y_train_mass == 1]
    yhat_train_bkg = yhat_train[y_train_mass == 0]
    yhat_test_signal = yhat_test[y_test_mass == 1]
    yhat_test_bkg = yhat_test[y_test_mass == 0]

    ### Saving the DNN scores, one file per sample
    _write_scores(outputDir + '/Scores_train_signal.txt', yhat_train_signal)
    _write_scores(outputDir + '/Scores_train_bkg.txt', yhat_train_bkg)
    _write_scores(outputDir + '/Scores_test_signal.txt', yhat_test_signal)
    _write_scores(outputDir + '/Scores_test_bkg.txt', yhat_test_bkg)

    ### Drawing confusion matrix
    CMvalues = DrawCM(yhat_test, y_test_mass, True, outputDir, unscaledMass, background, savePlot)
    logFile.write('\nTrue bkg, predicted bkg: ' + str(CMvalues[0]) + '\nTrue bkg, predicted signal: ' + str(CMvalues[1]) + '\nTrue signal, predicted bkg: ' + str(CMvalues[2]) + '\nTrue signal, predicted signal: ' + str(CMvalues[3]))

    ### Drawing scores, ROC and background rejection
    AUC, WP, WP_rej = DrawEfficiency(yhat_train_signal, yhat_test_signal, yhat_train_bkg, yhat_test_bkg, outputDir, NN, unscaledMass, jetCollection, analysis, channel, preselectionCuts, signal, background, savePlot)
    print(Fore.BLUE + 'AUC (Area Under ROC Curve): ' + str(AUC))
    logFile.write('\nAUC: ' + str(AUC) + '\nWorking points: ' + str(WP) + '\nBackground rejection at each working point: ' + str(WP_rej))

    ### Disabled per-origin evaluation, kept for reference as inert string literals.
    ### NOTE(review): it references origin_test_mass, which is also commented out above —
    ### re-enable both together if this is ever restored.
    '''
    ### Dividing sample by origin
    originsBkgTest = list(background.split('_'))
    if len(originsBkgTest) > 1:
        for origin in originsBkgTest:
            print(Fore.BLUE + 'Evaluating the performance of the DNN on events with mass ' + str(unscaledMass) + ' and origin = \'' + signal + '\' or \'' + origin + '\'')
            logFile.write('\nOrigin: ' + origin)
            originsTest = origin_test_mass.copy()
            ### Selecting events with origin equal to signal or the background considered
            originsTest = np.where(originsTest == signal, 1, originsTest)
            originsTest = np.where(originsTest == origin, 1, originsTest)
            X_test_mass_origin = X_test_mass[originsTest == 1]
            y_test_mass_origin = y_test_mass[originsTest == 1]
            ### Prediction on the whole test sample and confusion matrix
            yhat_test_origin = model.predict(X_test_mass_origin, batch_size = batchSize)
            if savePlot:
                DrawCM(yhat_test_origin, y_test_mass_origin, True, outputDir, unscaledMass, origin)
            ### Prediction on signal and background separately
            X_train_bkg_origin = X_train_mass[origin_train_mass == origin]
            X_test_bkg_origin = X_test_mass[origin_test_mass == origin]
            logFile.write('\nNumber of background train events with origin ' + origin + ': ' + str(len(X_train_bkg_origin)) + '\nNumber of background test events with origin ' + origin + ': ' + str(len(X_test_bkg_origin)))
            logFile.write('\nNumber of test events with origin ' + origin + ': ' + str(len(X_test_mass_origin)))
            yhat_train_signal_origin, yhat_train_bkg_origin, yhat_test_signal_origin, yhat_test_bkg_origin = PredictionSigBkg(model, X_train_signal_mass, X_train_bkg_origin, X_test_signal_mass, X_test_bkg_origin) ### quelle sul segnale non sono cambiate!
            ### Drawing scores, ROC and background rejection
            AUC, WP, WP_rej = DrawEfficiency(yhat_train_signal_origin, yhat_test_signal_origin, yhat_train_bkg_origin, yhat_test_bkg_origin, outputDir, NN, unscaledMass, jetCollection, analysis, channel, preselectionCuts, signal, origin, savePlot, useWeights, cutTrainEvents)
            print(Fore.BLUE + 'AUC (Area Under ROC Curve): ' + str(AUC))
            logFile.write('\nAUC: ' + str(AUC) + '\nWorking points: ' + str(WP) + '\nBackground rejection at each working point: ' + str(WP_rej))
    '''
    '''
    with open(outputDir + '/BkgRejection_' + origin + '_last.txt', 'a') as BkgRejectionFile:
        BkgRejectionFile.write(str(WP_rej[0]) + '\n')
    np.savetxt('data_train_signal_mass.csv', data_train_signal_mass, delimiter = ',', fmt = '%s')
    np.savetxt('data_train_bkg.csv', data_train_bkg, delimiter = ',', fmt = '%s')
    np.savetxt('data_test_signal_mass.csv', data_test_signal_mass, delimiter = ',', fmt = '%s')
    np.savetxt('data_test_bkg.csv', data_test_bkg, delimiter = ',', fmt = '%s')
    '''

    ### Closing the logFile
    logFile.close()
    print(Fore.GREEN + 'Saved ' + logFileName)