-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep1.py
217 lines (179 loc) · 10.2 KB
/
step1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import numpy as np
import glob
import math
import io
import os
import sys
import fnmatch
import pickle
import pandas as pd
import autogluon as ag
from autogluon.tabular import TabularDataset, TabularPredictor
import time
import multiprocess as mp
import myparams
# This code takes all the available data and retrain the ML model [THE FILTER] according to the iterative training procedure
if __name__ == "__main__":
In_file = myparams.In_file
In_label = In_file.split('/')[-1].split('.')[0]
print('\n*** Requested to train the classifier from {}'.format(In_file))
ndecimals=myparams.ij_decimals
if ndecimals>0:
rounding_error=10**(-1*(ndecimals+1))
model_path='MLmodel/classification-{}'.format(In_label)
calc_dirname = 'exact_calculations/{}'.format(In_label)
if not os.path.exists(calc_dirname):
os.makedirs(calc_dirname, exist_ok=True)
# *************
# First I load the data that the classifier has already used for its training
try:
used_data = pd.read_feather('MLmodel/data-used-by-classifier-{}.feather'.format(In_label))
except:
print('First time training the classifier')
used_data = pd.DataFrame()
# Then I check the exact calculations to see the if new data are available
calculation_dir='./exact_calculations/{}'.format(In_label)
if not os.path.isfile('{}/{}'.format(calculation_dir,myparams.calculations_classifier)):
print('\n*(!)* Notice that there are no classification data\n')
use_new_calculations = False
class_0_pairs = pd.DataFrame()
class_1_pairs = pd.DataFrame()
else:
use_new_calculations = True
class_0_pairs = pd.read_csv('{}/{}'.format(calculation_dir,myparams.calculations_classifier), index_col=0)
class_1_pairs = pd.read_csv('{}/{}'.format(calculation_dir,myparams.calculations_predictor), index_col=0)[['conf','i','j']]
print('From the calculation results we have {} class-0 and {} class-1'.format(len(class_0_pairs),len(class_1_pairs)))
# I also have to include the pre-training data, which I now load to see if overall there are more data than the previous iteration (as expected)
if os.path.isfile('MLmodel/{}'.format(myparams.pretraining_classifier)):
pretrain_df = pd.read_feather('MLmodel/{}'.format(myparams.pretraining_classifier))
else:
print('\nNotice that no pretraining is available')
pretrain_df = pd.DataFrame()
#************
# * Check if the model needs to be retrained (you should have collected new data)
if(len(pretrain_df)+len(class_0_pairs)+len(class_1_pairs))>len(used_data):
print('\n*****\nThe model was trained using {} data and now we could use:\n\t{} from pretraining (both classes)\n\t{} calculated class-0\n\t{} calculated class-1'.format(len(used_data),len(pretrain_df),len(class_0_pairs),len(class_1_pairs)))
else:
print('All the data available have been already used to train the model')
sys.exit()
# If the program did not terminate, it means that there are new data and it is worth to retrain the model
if use_new_calculations:
# For the new target_feature information, we look for the corresponding input pair
# so we need to load the input features for all of them
try:
all_pairs_df = pd.read_feather('In_data/{}'.format(myparams.In_file))
if ndecimals>0:
all_pairs_df.i = all_pairs_df.i.round(ndecimals)
all_pairs_df.j = all_pairs_df.j.round(ndecimals)
except:
print('Error: there are no data prepared')
sys.exit()
# split this task between parallel workers
# Notice that we are first processing the bad pairs (class 0)
elements_per_worker=20
chunks=[class_0_pairs.iloc[i:i + elements_per_worker] for i in range(0, len(class_0_pairs), elements_per_worker)]
n_chunks = len(chunks)
print('\nWe are going to submit {} chunks for the bad pairs'.format(n_chunks))
def process_chunk(chunk):
worker_df=pd.DataFrame()
# I search for the given configuration
for index, row in chunk.iterrows():
conf = row['conf']
i = row['i']
j = row['j']
# do we have it ?
if ndecimals>0:
a = all_pairs_df[(all_pairs_df['conf']==conf)&(all_pairs_df['i'].between(i-rounding_error,i+rounding_error))&(all_pairs_df['j'].between(j-rounding_error,j+rounding_error))]
else:
a = all_pairs_df[(all_pairs_df['conf']==conf)&(all_pairs_df['i']==i)&(all_pairs_df['j']==j)]
if len(a)>1:
print('Error! multiple correspondences for {}'.format(element))
sys.exit()
elif len(a)==1:
worker_df = pd.concat([worker_df,a])
return worker_df
print('collecting info for bad pairs')
# Initialize the pool
pool = mp.Pool(mp.cpu_count())
# *** RUN THE PARALLEL FUNCTION
results = pool.map(process_chunk, [chunk for chunk in chunks] )
pool.close()
# and add all the new df to the final one
badpairs_df=pd.DataFrame()
for df_chunk in results:
badpairs_df = pd.concat([badpairs_df,df_chunk])
badpairs_df['class']=0
print('Constructed the database of {} bad pairs, from the new data'.format(len(badpairs_df)))
# *** now get the good pairs using the same routine
chunks=[class_1_pairs.iloc[i:i + elements_per_worker] for i in range(0, len(class_1_pairs), elements_per_worker)]
n_chunks = len(chunks)
print('\nWe are going to submit {} chunks for the good data'.format(n_chunks))
pool = mp.Pool(mp.cpu_count())
results = pool.map(process_chunk, [chunk for chunk in chunks] )
pool.close()
goodpairs_df=pd.DataFrame()
for df_chunk in results:
goodpairs_df = pd.concat([goodpairs_df,df_chunk])
goodpairs_df['class']=1
print('Constructed the database of {} good pairs, from the new data'.format(len(goodpairs_df)))
else:
print('(We are not using data from calculations, but only the pretraining set)')
goodpairs_df = pd.DataFrame()
badpairs_df = pd.DataFrame()
# *******
# add the pretrained data (if any)
if len(pretrain_df)>0:
goodpairs_df = pd.concat([goodpairs_df,pretrain_df[pretrain_df['class']>0]])
goodpairs_df=goodpairs_df.drop_duplicates()
badpairs_df = pd.concat([badpairs_df,pretrain_df[pretrain_df['class']<1]])
badpairs_df=badpairs_df.drop_duplicates()
qs_df = pd.concat([goodpairs_df,badpairs_df])
# set class column as binary
qs_df['class']=qs_df['class'].astype(bool)
goodpairs_df['class']=goodpairs_df['class'].astype(bool)
badpairs_df['class']=badpairs_df['class'].astype(bool)
# This is the new training df that will be stored at the end
new_training_df = qs_df.copy()
if len(new_training_df)<=len(used_data):
print('\n(!) After removing the duplicates it appears that the number of data has not increased since the last time')
if os.path.isfile('{}/predictor.pkl'.format(model_path)):
print('and since the model is already trained, I stop')
sys.exit()
else:
print('but the model is not in {}, so I train anyway'.format(model_path),flush=True)
# Create a balanced subset with same number of good and bad pairs
N = min(len(goodpairs_df),len(badpairs_df))
print('Having {} good and {} bad pairs, we select only {} of each, to balance the classifier'.format(len(goodpairs_df),len(badpairs_df),N))
# and split a part of data for validation
Nval = int(myparams.validation_split*N)
Ntrain = N -Nval
# shuffle
goodpairs_df=goodpairs_df.sample(frac=1, random_state=20, ignore_index=True)
badpairs_df=badpairs_df.sample(frac=1, random_state=20, ignore_index=True)
# and slice
training_set = pd.concat([goodpairs_df.iloc[:Ntrain],badpairs_df.iloc[:Ntrain]])
validation_set = pd.concat([goodpairs_df.iloc[Ntrain:N],badpairs_df.iloc[Ntrain:N]])
# and reshuffle
training_set = training_set.sample(frac=1, random_state=20, ignore_index=True)
validation_set = validation_set.sample(frac=1, random_state=20, ignore_index=True)
print('\nFrom the overall %d data we prepare:\n\t- training set of %d (half good and half bad) \n\t- validation set of %d (half good and half bad)\n\n'%(len(new_training_df),len(training_set),len(validation_set) ),flush=True)
print(training_set)
# ************** TRAIN
# Notice that autogluon offer different 'presets' option to maximize precision vs data-usage vs time
# if you are not satisfied with the results here, you can try different 'presets' option or build your own
# check which one to use
if myparams.Fast_class==True:
print('We are training in the fast way')
presets='good_quality_faster_inference_only_refit'
else:
presets='high_quality_fast_inference_only_refit'
# you can also change the training time
training_hours=myparams.class_train_hours
time_limit = training_hours*60*60
# train
# * I am excluding the KNN model because it is problematic
predictor = TabularPredictor(label='class', path=model_path, eval_metric='accuracy').fit(TabularDataset(training_set.drop(columns=['i','j','conf'])), time_limit=time_limit, presets=presets,excluded_model_types=['KNN'])
# store
new_training_df.reset_index().drop(columns='index').to_feather('MLmodel/data-used-by-classifier-{}.feather'.format(In_label), compression='zstd')
training_set.reset_index().drop(columns='index').to_feather('MLmodel/classifier-training-set-{}.feather'.format(In_label), compression='zstd')
validation_set.reset_index().drop(columns='index').to_feather('MLmodel/classifier-validation-set-{}.feather'.format(In_label), compression='zstd')