Skip to content

Commit

Permalink
Prepare ML training for HF jets
Browse files Browse the repository at this point in the history
  • Loading branch information
qgp committed Nov 27, 2023
1 parent 7034555 commit 09d16fa
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 52 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ D0pp_jet:
fErrorDecayLength, fErrorDecayLengthXY, fChi2PCA, fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised,
fImpactParameterNormalised0, fPtProng0, fImpactParameterNormalised1, fPtProng1,
fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1,
fNSigTpcPi0, fNSigTpcKa0, fNSigTpcPi1, fNSigTpcKa1,
fNSigTofPi0, fNSigTofKa0, fNSigTofPi1, fNSigTofKa1, fIndexHfCand2Prong_0]
var_jet_data: [fIndexCollisions, fIndexD0ChargedJets, fIndexHfCand2Prong_0, fJetPt, fJetEta, fJetPhi, fJetNConstituents]
var_jet_det: [fIndexCollisions, fIndexD0ChargedMCDetectorLevelJets, fIndexHfCand2Prong_0, fJetPt, fJetEta, fJetPhi, fJetNConstituents]
Expand All @@ -78,7 +79,7 @@ D0pp_jet:
var_gen: [fIndexCollisions, fPt, fY, fFlagMc, fOriginMcGen, fIndexHfCand2Prong_0]
var_evt_match: [df, fIndexCollisions]
var_evt_match_mc: [df, fIndexCollisions]
var_training: [[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, 
fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1]]
var_training: [[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, 
fNSigTofPi1, fNSigTofKa1]]
#TODO: add new variables for dca, max_norm_d0d0exp
# sel_skim_binmin bins
var_boundaries: [fCosThetaStar, fPtProng]
Expand Down Expand Up @@ -141,38 +142,38 @@ D0pp_jet:

multi:
data:
nprocessesparallel: 1
maxfiles: [5] #list of periods
nprocessesparallel: 20
maxfiles: [-1] #list of periods
chunksizeunp: [100] #list of periods
chunksizeskim: [100] #list of periods
fracmerge: [0.08] #list of periods
seedmerge: [12] #list of periods
period: [LHC22o_pass4] #list of periods
period: [LHC22o] #list of periods
select_period: [1]
prefix_dir: /home/jklein/
unmerged_tree_dir: [data/alice/cern.ch/user/a/alihyperloop/jobs/0024/hy_240062/] #list of periods
pkl: [data/test/d0jet/pkl] #list of periods
pkl_skimmed: [data/test/d0jet/pklsk] #list of periods
pkl_skimmed_merge_for_ml: [data/test/d0jet/pklskml] #list of periods
pkl_skimmed_merge_for_ml_all: data/test/d0jet/pp_data_mltot
pkl_evtcounter_all: data/test/d0jet/pp_data_evttot
prefix_dir: /data2/MLhep/real/train_131050/
unmerged_tree_dir: [alice/cern.ch/user/a/alihyperloop/jobs/0024/] #list of periods
pkl: [d0jet/pkl] #list of periods
pkl_skimmed: [d0jet/pklsk] #list of periods
pkl_skimmed_merge_for_ml: [d0jet/pklskml] #list of periods
pkl_skimmed_merge_for_ml_all: d0jet/pp_data_mltot
pkl_evtcounter_all: d0jet/pp_data_evttot
mcreweights: [../Analyses] #list of periods
mc:
nprocessesparallel: 40
maxfiles: [5] #list of periods
nprocessesparallel: 20
maxfiles: [-1] #list of periods
chunksizeunp: [100] #list of periods
chunksizeskim: [1000] #list of periods
fracmerge: [1.0] #list of periods
seedmerge: [12] #list of periods
period: [mctest] #list of periods
period: [LHC22b1b] #list of periods
select_period: [1]
prefix_dir: /home/jklein/
unmerged_tree_dir: [data/alice/cern.ch/user/a/alihyperloop/jobs/0024/hy_240092] #list of periods
pkl: [data/mctest/d0jet/pkl] #list of periods
pkl_skimmed: [data/mctest/d0jet/pklsk] #list of periods
pkl_skimmed_merge_for_ml: [data/mctest/d0jet/pklskml] #list of periods
pkl_skimmed_merge_for_ml_all: data/mctest/d0jet/pp_mc_prod_mltot
pkl_evtcounter_all: data/mctest/d0jet/pp_mc_prod_evttot
prefix_dir: /data2/MLhep/sim/train_131049/
unmerged_tree_dir: [alice/cern.ch/user/a/alihyperloop/jobs/0024]
pkl: [d0jet/pkl] #list of periods
pkl_skimmed: [d0jet/pklsk] #list of periods
pkl_skimmed_merge_for_ml: [d0jet/pklskml] #list of periods
pkl_skimmed_merge_for_ml_all: d0jet/pp_mc_prod_mltot
pkl_evtcounter_all: d0jet/pp_mc_prod_evttot
mcreweights: [../Analyses] #list of periods

ml:
Expand All @@ -195,8 +196,9 @@ D0pp_jet:
binmax: [2,4,6,8,12,24,48] # must be equal to sel_skim_binmax (sel_skim_binmin bins)
mltype: BinaryClassification
ncorescrossval: 10
mlplot: /home/jklein/data/mlplot # to be removed
mlout: /home/jklein/data/mlout # to be removed
prefix_dir_ml: /data2/jklein/MLhep
mlplot: mlplot
mlout: mlout

opt:
isFONLLfromROOT: true
Expand All @@ -213,15 +215,17 @@ D0pp_jet:
bkg_function: pol2 # fit function for bkg (among TH1 predefined fit functions, e.g. expo, pol1, pol2, ...)
save_fit: True # save bkg fits with the various cuts on ML output
raahp: [1,1,1,1,1,1,1] # sel_skim_binmin bins
presel_gen_eff: "abs(y_cand) < 0.5 and abs(z_vtx_gen) < 10"
presel_gen_eff: "abs(fY) < 0.5 and abs(fPosZ) < 10"

mlapplication:
data:
pkl_skimmed_dec: [/home/jklein/data/test/d0jet/pklskdec] #list of periods
pkl_skimmed_decmerged: [/home/jklein/data/test/d0jet/pklskdecmerged] #list of periods
prefix_dir_res: /data2/jklein/
pkl_skimmed_dec: [LHC22pp/MLapplication/prod_LHC22o/skpkldecdata] #list of periods
pkl_skimmed_decmerged: [LHC22pp/MLapplication/prod_LHC22o/skpkldecdatamerged] #list of periods
mc:
pkl_skimmed_dec: [/home/jklein/data/mctest/d0jet/pklskdec] #list of periods
pkl_skimmed_decmerged: [/home/jklein/mctest/d0jet/pklskdecmerged] #list of periods
prefix_dir_res: /data2/jklein/
pkl_skimmed_dec: [LHC22pp_mc/MLapplication/prod_LHC22b1b/skpkldecmc] #list of periods
pkl_skimmed_decmerged: [LHC22pp_mc/MLapplication/prod_LHC22b1b/skpkldecmcmerged] #list of periods
modelname: xgboost
modelsperptbin: [xgboost_classifierD0pp_FF_dfselection_pt_cand_1.0_2.0.sav,
xgboost_classifierD0pp_FF_dfselection_pt_cand_2.0_4.0.sav,
Expand All @@ -241,7 +245,7 @@ D0pp_jet:
cctype: 1 #kpp7
sigmav0: 57.8e-3 #NB: multiplied by 1e12 before giving to HFPtSpectrum!
inputfonllpred: data/fonll/D0DplusDstarPredictions_13TeV_y05_all_300416_BDShapeCorrected.root
dir_general_plots: /home/jklein/data/analysis_plots
dir_general_plots: /data2/jklein/data/analysis_plots

jet_zg: &jet_default
proc_type: Jets
Expand Down Expand Up @@ -284,12 +288,12 @@ D0pp_jet:
mc: null
data: &data_out_default
runselection: [null] #FIXME
results: [/home/jklein/data/test/d0jet/resultsMBjetvspt] #list of periods
resultsallp: /home/jklein/data/test/d0jet/resultsMBjetvspt_all
results: [/data2/jklein/data/test/d0jet/resultsMBjetvspt] #list of periods
resultsallp: /data2/jklein/data/test/d0jet/resultsMBjetvspt_all
mc: &mc_out_default
runselection: [null] #FIXME
results: [/home/jklein/data/mctest/d0jet/resultsMBjetvspt] #list of periods
resultsallp: /home/jklein/data/mctest/d0jet/resultsMBjetvspt_all
results: [/data2/jklein/data/mctest/d0jet/resultsMBjetvspt] #list of periods
resultsallp: /data2/jklein/data/mctest/d0jet/resultsMBjetvspt_all
data_proc: # alternative processor output used as the analyzer input
<<: *data_out_default
mc_proc: # alternative processor output used as the analyzer input
Expand Down
27 changes: 12 additions & 15 deletions machine_learning_hep/optimiser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#############################################################################
## © Copyright CERN 2018. All rights not expressly granted are reserved. ##
## © Copyright CERN 2023. All rights not expressly granted are reserved. ##
## Author: [email protected] ##
## This program is free software: you can redistribute it and/or modify it ##
## under the terms of the GNU General Public License as published by the ##
Expand Down Expand Up @@ -31,7 +31,7 @@
from machine_learning_hep.correlations import vardistplot, scatterplot, correlationmatrix
from machine_learning_hep.models import getclf_scikit, getclf_xgboost, getclf_keras
from machine_learning_hep.models import fit, savemodels, readmodels, test, apply, decisionboundaries
from machine_learning_hep.root import write_tree
# from machine_learning_hep.root import write_tree
from machine_learning_hep.mlperformance import cross_validation_mse, plot_cross_validation_mse
from machine_learning_hep.mlperformance import plot_learning_curves, precision_recall
from machine_learning_hep.mlperformance import roc_train_test, plot_overtraining
Expand All @@ -54,10 +54,11 @@ def __init__(self, data_param, case, typean, model_config, binmin,

self.logger = get_logger()

dirprefix = data_param["multi"]["data"].get("prefix_dir", "")
dirprefixdata = data_param["multi"]["data"].get("prefix_dir", "")
dirprefixmc = data_param["multi"]["mc"].get("prefix_dir", "")
dirprefix_ml = data_param["ml"].get("prefix_dir_ml", "")
dirmcml = dirprefix + data_param["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]
dirdataml = dirprefix + data_param["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]
dirmcml = dirprefixmc + data_param["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]
dirdataml = dirprefixdata + data_param["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]
self.v_bin = data_param["var_binning"]
#directory
self.dirmlout = dirprefix_ml + data_param["ml"]["mlout"]
Expand All @@ -75,6 +76,8 @@ def __init__(self, data_param, case, typean, model_config, binmin,
print(f"rm -r {self.dirmlplot}")
self.logger.fatal("Please remove above directories as indicated above first and " \
"run again")
if self.steps_done is None:
self.steps_done = []

#ml file names
self.n_reco = data_param["files_names"]["namefile_reco"]
Expand Down Expand Up @@ -205,7 +208,7 @@ def __init__(self, data_param, case, typean, model_config, binmin,
self.f_mltest_applied = f"{self.dirmlout}/testsample_{self.s_suffix}_mldecision.pkl"
self.df_mltest_applied = None

print(training_var)
self.logger.info('training variables: %s', training_var)

def create_suffix(self):
string_selection = createstringselection(self.v_bin, self.p_binmin, self.p_binmax)
Expand Down Expand Up @@ -320,16 +323,10 @@ def preparesample(self):
self.step_done("preparemlsamples")

def step_done(self, step):
if self.steps_done is None:
self.steps_done = []

step_name = f"{step}_{self.p_binmin}_{self.p_binmax}"
if step_name in self.steps_done:
print("\n\n")
self.logger.warning("Done ML step %s already. It's skipped now. Remove the step " \
"from the list in the following file", step_name)
print(self.file_steps_done)
print("\n\n")
"from the list in %s", step_name, self.file_steps_done)
return True

# Add this step and update the corresponding file
Expand Down Expand Up @@ -443,9 +440,9 @@ def do_test(self):
self.logger.info("Testing")
self.df_mltest_applied = test(self.p_mltype, self.p_classname, self.p_trainedmod,
self.df_mltest, self.v_train, self.v_sig)
df_ml_test_to_root = self.dirmlout+"/testsample_%s_mldecision.root" % (self.s_suffix)
pickle.dump(self.df_mltest_applied, openfile(self.f_mltest_applied, "wb"), protocol=4)
write_tree(df_ml_test_to_root, self.n_treetest, self.df_mltest_applied)
# df_ml_test_to_root = self.dirmlout+"/testsample_%s_mldecision.root" % (self.s_suffix)
# write_tree(df_ml_test_to_root, self.n_treetest, self.df_mltest_applied)

def do_apply(self):

Expand Down
1 change: 0 additions & 1 deletion machine_learning_hep/processer_jet.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,5 +69,4 @@ def process_histomass_single(self, index):
f'hmass_{ipt}', "",
self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
fill_hist(h_invmass_all, df.fM)
h_invmass_all.Print()
h_invmass_all.Write()
4 changes: 1 addition & 3 deletions machine_learning_hep/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,7 @@ def openfile(filename, attr):
return gzip.open(filename, attr)
if filename.lower().endswith('.lz4'):
return lz4.frame.open(filename, attr)
if filename.lower().endswith('.pkl'):
return open(filename, attr, encoding='utf-8')
return open(filename, attr)
return open(filename, attr, encoding='utf-8' if 'b' not in attr else None)

def mask_df(df_to_mask, mask_config):
"""
Expand Down

0 comments on commit 09d16fa

Please sign in to comment.