-
Notifications
You must be signed in to change notification settings - Fork 0
/
step2.py
86 lines (70 loc) · 3.16 KB
/
step2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import numpy as np
import time
import glob
import pandas as pd
import csv
import gzip
import io
import os
import sys
import fnmatch
import pickle
import autogluon as ag
from autogluon.tabular import TabularDataset, TabularPredictor
import multiprocess as mp
import myparams
import dask.dataframe as dd
# Step 2: apply the trained good-vs-bad pair classifier to all the available
# pairs and save the ones predicted as class-1.
def ensure_dir(filename):
    """Create the directory part of *filename* if it does not exist yet.

    Only ``os.path.dirname(filename)`` is created, so a path ending with a
    slash creates that whole directory, while a bare file name (no directory
    component) is a no-op.

    Args:
        filename: Path of a file (or a directory path ending with a
            separator) whose parent directories must exist.

    Raises:
        OSError: If the directory cannot be created for a reason other than
            it already existing (e.g. permission denied, or a regular file
            occupies that path).
    """
    dirname = os.path.dirname(filename)
    if not dirname:
        return
    # exist_ok=True tolerates an already existing directory but, unlike a
    # blanket `except OSError: pass`, still surfaces real failures early.
    os.makedirs(dirname, exist_ok=True)
if __name__ == "__main__":
    # Apply the previously trained pair classifier to every pair of the input
    # file and save only the pairs predicted as class-1.
    In_file = myparams.In_file
    # Label used in the model/output paths: the file name without its
    # directories, truncated at the first '.'.
    In_label = In_file.split('/')[-1].split('.')[0]
    print('\n*** Requested to apply the classifier to all the pairs in {}'.format(In_file))
    ensure_dir('output_ML/{}/'.format(In_label))

    # *************
    # (1) Load the preprocessed data
    new_df = pd.read_feather('In_data/{}'.format(In_file))
    print('\n\t@@@@ Overall we have {} pairs'.format(len(new_df)))

    # *************
    # (2) (optionally) remove pairs with a classic energy splitting which is too large
    max_delta_e = 0.1
    print(new_df)
    new_df = new_df[new_df[r'$\Delta E$'] < max_delta_e]
    print('*-> We decide to keep only the ones with Delta_E<{}, which are {}'.format(max_delta_e, len(new_df)))

    # *************
    # (3) apply the classifier
    start = time.time()
    classifier_save_path = 'MLmodel/classification-{}'.format(In_label)
    # check if the model is there
    if not os.path.isdir(classifier_save_path):
        print('Error: I am looking for the classifier in {}, but I can not find it. You probably have to run step1 before this'.format(classifier_save_path))
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)
    print('\nUsing the filter trained in {}'.format(classifier_save_path))

    print('\n* Classifier loading', flush=True)
    classifier = TabularPredictor.load(classifier_save_path)

    # The data is classified in nearly equal parts of at most `chunk_size`
    # rows each, so that only one part at a time is passed to the predictor.
    npairs = len(new_df)
    chunk_size = 1000000
    nchunks = npairs // chunk_size + 1
    print('Temporarily splitting the data ({} pairs) in {} parts'.format(npairs, nchunks))
    # Same partition as np.array_split: the first `extra` parts get one
    # additional row.
    base, extra = divmod(npairs, nchunks)

    print('Classification starting:', flush=True)
    class1_parts = []
    n_class1 = 0
    stop = 0
    for chunk_id in range(nchunks):
        first = stop
        stop = first + base + (1 if chunk_id < extra else 0)
        print('\n* Classifying part {}/{}'.format(chunk_id + 1, nchunks), flush=True)
        # Work on an explicit copy so that adding the 'class' column never
        # writes into a view of the full table.
        chunk = new_df.iloc[first:stop].copy()
        chunk['class'] = classifier.predict(chunk.drop(columns=['conf', 'i', 'j']))
        # I only keep the predicted class-1
        kept = chunk[chunk['class'] > 0]
        class1_parts.append(kept)
        n_class1 += len(kept)
        print('done in {} sec (collected up to {} class-1) '.format(time.time() - start, n_class1))

    # The full table is no longer needed; release it before the final concat.
    del new_df
    # A single concatenation at the end avoids re-copying the already
    # collected rows at every iteration.
    filtered_df = pd.concat(class1_parts)

    timeclass = time.time() - start
    # Guard the per-pair timing against an empty (fully filtered-out) input.
    time_per_pair = timeclass / npairs if npairs else float('nan')
    print('From the {} pairs, only {} are in class-1 (in {} sec = {} sec per pair), so {} are class-0.'.format(npairs, len(filtered_df), timeclass, time_per_pair, npairs - len(filtered_df)))

    filtered_df_name = 'output_ML/{}/classified_{}.feather'.format(In_label, In_label)
    # Feather requires a default index, hence the reset before writing.
    filtered_df.reset_index(drop=True).to_feather(filtered_df_name, compression='zstd')