Skip to content

Commit

Permalink
Cleanup and fixes (#832)
Browse files Browse the repository at this point in the history
- update database for D0 jets (incl. D0 flags from Luigi)
- fix fill_hist
- simplification of code
  • Loading branch information
qgp authored Nov 17, 2023
1 parent fdd66e9 commit acad7d9
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 82 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@ D0pp_jet:
var_ismcbkg: ismcbkg
var_ismcrefl: ismcrefl
isstd : [[1],[]]
ismcsignal: [[1],[]]
ismcprompt: [[1],[]]
ismcfd: [[2],[]]
ismcsignal: [[0],[]]
ismcprompt: [[0],[]]
ismcfd: [[1],[]]
ismcbkg: [[],[1]]
ismcrefl: [[1],[1]]
ismcrefl: [[1],[1]] # probably missing from tree creator

variables:
var_all: [fIndexCollisions, fFlagMc, fCandidateSelFlag, fOriginMcRec, fY, fEta, fPt, fCpa, fCpaXY, fM,
Expand Down
36 changes: 18 additions & 18 deletions machine_learning_hep/processer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#############################################################################
## © Copyright CERN 2018. All rights not expressly granted are reserved. ##
## © Copyright CERN 2023. All rights not expressly granted are reserved. ##
## Author: [email protected] ##
## This program is free software: you can redistribute it and/or modify it ##
## under the terms of the GNU General Public License as published by the ##
Expand Down Expand Up @@ -313,7 +313,7 @@ def unpack(self, file_index, max_no_keys = None): # pylint: disable=too-many-br
dfjetsubgen = None

with uproot.open(self.l_root[file_index]) as rfile:
def read_tree(tree, df_base, var):
def read_df(tree, df_base, var):
try:
df = pd.DataFrame(
columns=var,
Expand All @@ -325,40 +325,40 @@ def read_tree(tree, df_base, var):
sys.exit()

df_processed = set()
keys = rfile.keys()
keys = rfile.keys(recursive=False, filter_name='DF_*')
for (idx, key) in enumerate(keys[:max_no_keys]):
if not (df_key := re.match('^DF_(\\d+);', key)):
continue

if (df_no := df_key.group(1)) in df_processed:
self.logger.warning('multiple versions of DF %d', df_no)
continue
self.logger.debug('processing DF %d - %d / %d',
df_no, idx, len(keys))
self.logger.debug('processing DF %d - %d / %d', df_no, idx, len(keys))
df_processed.add(df_no)
rdir = rfile[key]

dfevtorig = read_tree(rfile[f'{key}/{self.n_treeevt}'], dfevtorig, self.v_evt)
dfreco = read_tree(rfile[f'{key}/{self.n_treereco}'], dfreco, self.v_all)
tree = rdir[self.n_treereco] # accessing the tree is the slow bit!
dfreco = read_df(tree, dfreco, self.v_all)
dfevtorig = read_df(rdir[self.n_treeevt], dfevtorig, self.v_evt)

if self.n_treejetreco:
dfjetreco = read_tree(rfile[f'{key}/{self.n_treejetreco}'],
dfjetreco, self.v_jet)
dfjetreco = read_df(rdir[self.n_treejetreco],
dfjetreco, self.v_jet)

if self.n_treejetsubreco:
dfjetsubreco = read_tree(rfile[f'{key}/{self.n_treejetsubreco}'],
dfjetsubreco, self.v_jetsub)
dfjetsubreco = read_df(rdir[self.n_treejetsubreco],
dfjetsubreco, self.v_jetsub)

if self.mcordata == 'mc':
dfgen = read_tree(rfile[f'{key}/{self.n_treegen}'],
dfgen, self.v_gen)
dfgen = read_df(rdir[self.n_treegen],
dfgen, self.v_gen)

if self.n_treejetgen:
dfjetgen = read_tree(rfile[f'{key}/{self.n_treejetgen}'],
dfjetgen, self.v_jet_gen)
dfjetgen = read_df(rdir[self.n_treejetgen],
dfjetgen, self.v_jet_gen)

if self.n_treejetsubgen:
dfjetsubgen = read_tree(rfile[f'{key}/{self.n_treejetsubgen}'],
dfjetsubgen, self.v_jetsub_gen)
dfjetsubgen = read_df(rdir[self.n_treejetsubgen],
dfjetsubgen, self.v_jetsub_gen)

dfevtorig = selectdfquery(dfevtorig, self.s_cen_unp)
dfevtorig = dfevtorig.reset_index(drop=True)
Expand Down
5 changes: 3 additions & 2 deletions machine_learning_hep/processer_jet.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pickle
from ROOT import TFile, TH1F # pylint: disable=import-error, no-name-in-module
from machine_learning_hep.processer import Processer
from machine_learning_hep.utilities import openfile
from machine_learning_hep.utilities import fill_hist, openfile

class ProcesserJets(Processer): # pylint: disable=invalid-name, too-many-instance-attributes
species = "processer"
Expand Down Expand Up @@ -68,5 +68,6 @@ def process_histomass_single(self, index):
h_invmass_all = TH1F(
f'hmass_{ipt}', "",
self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
h_invmass_all.FillN(df.size, df.inv_mass)
fill_hist(h_invmass_all, df.fM)
h_invmass_all.Print()
h_invmass_all.Write()
87 changes: 34 additions & 53 deletions machine_learning_hep/steer_analysis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#############################################################################
## © Copyright CERN 2018. All rights not expressly granted are reserved. ##
## © Copyright CERN 2023. All rights not expressly granted are reserved. ##
## Author: [email protected] ##
## This program is free software: you can redistribute it and/or modify it ##
## under the terms of the GNU General Public License as published by the ##
Expand Down Expand Up @@ -123,58 +123,39 @@ def do_entire_analysis(data_config: dict, data_param: dict, data_param_overwrite

dojetstudies = data_config["analysis"]["dojetstudies"]

dirpklmc = []
dirpklskmc = []
dirpklmlmc = []
dirprefixmc = data_param[case]["multi"]["mc"].get("prefix_dir", "")
for s in data_param[case]["multi"]["mc"]["pkl"]:
dirpklmc.append(dirprefixmc + s)
for s in data_param[case]["multi"]["mc"]["pkl_skimmed"]:
dirpklskmc.append(dirprefixmc + s)
for s in data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml"]:
dirpklmlmc.append(dirprefixmc + s)
dirpklevtcounter_allmc = dirprefixmc + data_param[case]["multi"]["mc"]["pkl_evtcounter_all"]
dirpklmltotmc = dirprefixmc + data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]

dirpkldata = []
dirpklskdata = []
dirpklmldata = []
dirprefixdata = data_param[case]["multi"]["data"].get("prefix_dir", "")
for s in data_param[case]["multi"]["data"]["pkl"]:
dirpkldata.append(dirprefixdata + s)
for s in data_param[case]["multi"]["data"]["pkl_skimmed"]:
dirpklskdata.append(dirprefixdata + s)
for s in data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml"]:
dirpklmldata.append(dirprefixdata + s)
dirpklevtcounter_alldata = dirprefixdata + \
data_param[case]["multi"]["data"]["pkl_evtcounter_all"]
dirpklmltotdata = dirprefixdata + \
data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]

dirpklskdecmc = []
dirpklskdec_mergedmc = []
dirpklskdecdata = []
dirpklskdec_mergeddata = []
dirprefixmcres = data_param[case]["mlapplication"]["mc"].get("prefix_dir_res", "")
for s in data_param[case]["mlapplication"]["mc"]["pkl_skimmed_dec"]:
dirpklskdecmc.append(dirprefixmcres + s)
for s in data_param[case]["mlapplication"]["mc"]["pkl_skimmed_decmerged"]:
dirpklskdec_mergedmc.append(dirprefixmcres + s)
dirprefixdatares = data_param[case]["mlapplication"]["data"].get("prefix_dir_res", "")
for s in data_param[case]["mlapplication"]["data"]["pkl_skimmed_dec"]:
dirpklskdecdata.append(dirprefixdatares + s)
for s in data_param[case]["mlapplication"]["data"]["pkl_skimmed_decmerged"]:
dirpklskdec_mergeddata.append(dirprefixdatares + s)

dirresultsdata = []
dirresultsmc = []
for s in data_param[case]["analysis"][typean]["data"]["results"]:
dirresultsdata.append(dirprefixdatares + s)
for s in data_param[case]["analysis"][typean]["mc"]["results"]:
dirresultsmc.append(dirprefixmcres + s)
dirresultsdatatot = dirprefixdatares + \
data_param[case]["analysis"][typean]["data"]["resultsallp"]
dirresultsmctot = dirprefixmcres + data_param[case]["analysis"][typean]["mc"]["resultsallp"]
dp = data_param[case]["multi"]["mc"]
dirprefixmc = dp.get("prefix_dir", "")
dirpklmc = [dirprefixmc + p for p in dp["pkl"]]
dirpklskmc = [dirprefixmc + p for p in dp["pkl_skimmed"]]
dirpklmlmc = [dirprefixmc + p for p in dp["pkl_skimmed_merge_for_ml"]]
dirpklevtcounter_allmc = dirprefixmc + dp["pkl_evtcounter_all"]
dirpklmltotmc = dirprefixmc + dp["pkl_skimmed_merge_for_ml_all"]

dp = data_param[case]["multi"]["data"]
dirprefixdata = dp.get("prefix_dir", "")
dirpkldata = [dirprefixdata + p for p in dp["pkl"]]
dirpklskdata = [dirprefixdata + p for p in dp["pkl_skimmed"]]
dirpklmldata = [dirprefixdata + p for p in dp["pkl_skimmed_merge_for_ml"]]
dirpklevtcounter_alldata = dirprefixdata + dp["pkl_evtcounter_all"]
dirpklmltotdata = dirprefixdata + dp["pkl_skimmed_merge_for_ml_all"]

dp = data_param[case]["mlapplication"]["mc"]
dirprefixmcres = dp.get("prefix_dir_res", "")
dirpklskdecmc = [dirprefixmcres + p for p in dp["pkl_skimmed_dec"]]
dirpklskdec_mergedmc = [dirprefixmcres + p for p in dp["pkl_skimmed_decmerged"]]

dp = data_param[case]["mlapplication"]["data"]
dirprefixdatares = dp.get("prefix_dir_res", "")
dirpklskdecdata = [dirprefixdatares + p for p in dp["pkl_skimmed_dec"]]
dirpklskdec_mergeddata = [dirprefixdatares + p for p in dp["pkl_skimmed_decmerged"]]

dp = data_param[case]["analysis"][typean]["data"]
dirresultsdata = [dirprefixdatares + p for p in dp["results"]]
dirresultsdatatot = dirprefixdatares + dp["resultsallp"]

dp = data_param[case]["analysis"][typean]["mc"]
dirresultsmc = [dirprefixmcres + p for p in dp["results"]]
dirresultsmctot = dirprefixmcres + dp["resultsallp"]

binminarray = data_param[case]["ml"]["binmin"]
binmaxarray = data_param[case]["ml"]["binmax"]
Expand Down
1 change: 0 additions & 1 deletion machine_learning_hep/submission/default_complete.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ mergingperiods: # pkl_skimmed_merge_for_ml_all

ml_study: # mlout, mlplot
activate: false
dopca: false
docorrelation: false
dotraining: false
dotesting: false
Expand Down
10 changes: 6 additions & 4 deletions machine_learning_hep/utilities.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#############################################################################
## © Copyright CERN 2018. All rights not expressly granted are reserved. ##
## © Copyright CERN 2023. All rights not expressly granted are reserved. ##
## Author: [email protected] ##
## This program is free software: you can redistribute it and/or modify it ##
## under the terms of the GNU General Public License as published by the ##
Expand Down Expand Up @@ -46,9 +46,11 @@
# pylint: disable=line-too-long, consider-using-f-string, too-many-lines
# pylint: disable=unspecified-encoding, consider-using-generator, invalid-name, import-outside-toplevel

def fill_hist(hist, arr, weights = 0):
assert arr.ndim == 1 and weights.ndim == 1, 'fill_hist handles 1d histos only'
hist.FillN(len(arr), arr, weights)
def fill_hist(hist, arr, weights = None):
    """Fill a 1-d ROOT histogram in bulk from an array of values.

    :param hist: ROOT TH1-like histogram (anything exposing FillN)
    :param arr: 1-d array-like of values to fill
    :param weights: optional 1-d array-like of per-entry weights;
        when None, FillN receives 0 (null pointer -> unit weights)
    """
    assert arr.ndim == 1, 'fill_hist supports 1d input arrays only'
    # FillN with n == 0 is pointless and some ROOT builds warn; bail out early.
    if len(arr) == 0:
        return
    # NOTE: `weights or 0` would raise ValueError for a multi-element numpy
    # array (ambiguous truth value), so compare against None explicitly.
    # FillN expects C doubles, hence the float64 conversions.
    w = np.float64(weights) if weights is not None else 0
    hist.FillN(len(arr), np.float64(arr), w)

def hist2array(hist):
assert hist.GetDimension() == 1
Expand Down

0 comments on commit acad7d9

Please sign in to comment.