# -*- coding: utf-8 -*-
# region
############################## define relevant variables ########################
spectators = ['__weight__', 'D_CMS_p', 'ell_CMS_p', 'B0_CMS3_weMissM2', 'B0_CMS3_weQ2lnuSimple']
CS_variables = ["B0_R2", "B0_thrustOm", "B0_cosTBTO", "B0_cosTBz",
"B0_KSFWV3", "B0_KSFWV4", "B0_KSFWV5", "B0_KSFWV6",
"B0_KSFWV7", "B0_KSFWV8", "B0_KSFWV9", "B0_KSFWV10",
"B0_KSFWV13", "B0_KSFWV14", "B0_KSFWV15", "B0_KSFWV16",
"B0_KSFWV17", "B0_KSFWV18",]
# "B0_thrustBm", "B0_KSFWV1", "B0_KSFWV2", "B0_KSFWV11",
# "B0_KSFWV12" correlates with mm2 or p_D_l
DTC_variables = ['D_vtxReChi2', 'D_A1FflightDistanceSig_IP',
'D_daughterInvM_1_2', 'D_daughterInvM_0_1',]
B_variables = ['B0_vtxReChi2', 'B0_dr',
'B0_CMS_cos_angle_0_1', 'B0_D_l_DisSig',
'B0_roeMbc_my_mask', 'B0_roeDeltae_my_mask',
'B0_roeEextra_my_mask', 'B0_TagVReChi2IP',]
training_variables = CS_variables + DTC_variables + B_variables
mva_variables = training_variables + spectators
analysis_variables=['__experiment__', '__run__', '__event__', '__production__',
'B0_isContinuumEvent','B0_mcPDG', 'B0_mcErrors', 'B0_mcDaughter_0_PDG',
'B0_mcDaughter_1_PDG','B0_deltaE', 'B0_Mbc', 'B0_CMS2_weMbc',
'B0_CMS0_weDeltae', 'B0_cos_angle_0_1',
'D_mcErrors', 'D_genGMPDG', 'D_genMotherPDG', 'D_mcPDG',
'D_BFM', 'D_M', 'D_p',
'D_K_mcErrors', 'D_pi1_mcErrors','D_pi2_mcErrors', 'D_K_charge',
'D_K_cosTheta', 'D_K_p', 'D_K_PDG', 'D_K_mcPDG',
'ell_genMotherPDG', 'ell_mcPDG', 'ell_mcErrors', 'ell_genGMPDG',
'ell_p', 'ell_pValue', 'ell_charge', 'ell_theta',
'ell_PDG', 'ell_eID',
'mode', 'Ecms', 'p_D_l', 'B_D_ReChi2',
'sig_prob', 'fakeD_prob', 'fakeB_prob', 'continuum_prob',]
                    # excluded: 'D_K_kaonIDNN', 'D_K_pionIDNN', 'D_pi2_kaonIDNN', 'D_pi2_pionIDNN',
                    # 'D_pi1_kaonIDNN', 'D_pi1_pionIDNN', 'B0_Lab5_weMissPTheta', 'B0_roeCharge_my_mask',
                    # 'B0_nROE_Tracks_my_mask', 'B0_nROE_Photons_my_mask', 'B0_nROE_NeutralHadrons_my_mask'
combinatorial_vars = [
'D_511_0_daughterPDG', 'D_511_1_daughterPDG', 'D_511_2_daughterPDG',
'D_511_3_daughterPDG', 'D_511_4_daughterPDG', 'D_511_5_daughterPDG',
'D_511_6_daughterPDG', 'D_521_0_daughterPDG', 'D_521_1_daughterPDG',
'D_521_2_daughterPDG', 'D_521_3_daughterPDG', 'D_521_4_daughterPDG',
'D_521_5_daughterPDG', 'D_521_6_daughterPDG'
]
veto_vars = ['B0_DstVeto_massDiff_0','B0_DstVeto_massDiffErr_0',
'B0_DstVeto_massDiffSignif_0','B0_DstVeto_vtxReChi2',
'B0_DstVeto_isDst']
all_relevant_variables = mva_variables + analysis_variables + combinatorial_vars + veto_vars
DecayMode_new = {'bkg_fakeTracks':0, 'bkg_fakeD':1, 'bkg_TDFl':2,
'bkg_continuum':3, 'bkg_combinatorial':4, 'bkg_singleBbkg':5,
'bkg_other_TDTl':6, 'bkg_other_signal':7,
r'$D\tau\nu$':8, r'$D^\ast\tau\nu$':9, r'$D\ell\nu$':10,
r'$D^\ast\ell\nu$':11, r'$D^{\ast\ast}\tau\nu$':12,
r'$D^{\ast\ast}\ell\nu$_narrow':13, r'$D^{\ast\ast}\ell\nu$_broad':14,
r'$D\ell\nu$_gap_pi':15, r'$D\ell\nu$_gap_eta':16}
# sideband / signal-region yield ratios (true D(*)lnu leakage, used to correct the sideband-derived fakeD template)
r_D = 753/89529
r_Dst = 557/57253
# for event classification
pi_pdg = [111, 211, -211]
eta_pdg = [221]
D_Dst_pdg = [411, 413, -411, -413]
Dstst_narrow_pdg = [10413, 10423, 415, 425, -10413, -10423, -415, -425]
Dstst_broad_pdg = [10411, 10421, 20413, 20423, -10411, -10421, -20413, -20423]
Dstst_pdg = Dstst_narrow_pdg + Dstst_broad_pdg
# for combinatorial classification
D_mesons_pdg = D_Dst_pdg + Dstst_pdg + [421,-421,423,-423,431,-431] # D0, D*0, D_s
charm_baryons_pdg = [4122, -4122, # Lambda_c
4112, -4112, 4212, -4212, # Sigma_c
4132, -4132, 4232, -4232, # Xi_c
4332, -4332, # Omega_c0
]
single_charm_pdg = D_mesons_pdg + charm_baryons_pdg
double_charm_pdg = {30443, 9010443, # psi
4412, -4412, 4422, -4422, # Xi_cc+
4432, -4432, # Omega_cc+
}
leptons = {11, -11, 13, -13}
Bpdg = {511, -511, 521, -521}
################################ dataframe samples ###########################
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
import lightgbm as lgb
def apply_mva_bcs(df, features, cut, library='ag', version='', model=None, bcs='vtx', importance=False):
    """
    Apply an MVA selection and a best-candidate selection (BCS) to a candidate DataFrame.

    If `library` is given, the corresponding classifier is loaded, its class
    probabilities are appended as new columns, and `cut` is applied with
    DataFrame.query; otherwise only `cut` is applied. One candidate per event is
    then kept: the smallest B_D_ReChi2 for bcs='vtx', or the largest sig_prob for bcs='mva'.
    """
    # load the model and compute per-candidate class probabilities
if library is not None:
if library=='ag':
predictor = TabularPredictor.load(f"/home/belle/zhangboy/inclusive_R_D/AutogluonModels/{version}")
pred = predictor.predict_proba(df, model)
pred = pred.rename(columns={0: 'sig_prob',
1: 'fakeD_prob',
2: 'combinatorial_prob',
3: 'continuum_prob'})
elif library=='lgbm':
predictor = lgb.Booster(model_file='/home/belle/zhangboy/inclusive_R_D/BDTs/LightGBM/lgbm_multiclass.txt')
pred_array = predictor.predict(df[features], num_iteration=predictor.best_iteration)
pred = pd.DataFrame(pred_array, columns=['sig_prob','fakeD_prob',
'combinatorial_prob','continuum_prob'])
if importance: # feature importances
lgb.plot_importance(predictor, figsize=(18,20))
# combine the predict result
pred['largest_prob'] = pred[['sig_prob','fakeD_prob','combinatorial_prob','continuum_prob']].max(axis=1)
df_pred = pd.concat([df, pred], axis=1)
# apply the MVA cut and BCS
df_cut=df_pred.query(cut)
else:
df_cut = df.query(cut)
if bcs=='vtx':
df_bestSelected=df_cut.loc[df_cut.groupby(['__experiment__','__run__','__event__','__production__'])['B_D_ReChi2'].idxmin()]
elif bcs=='mva':
df_bestSelected=df_cut.loc[df_cut.groupby(['__experiment__','__run__','__event__','__production__'])['sig_prob'].idxmax()]
return df_bestSelected
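# Illustrative usage sketch (commented out, not part of the module logic):
# `df_candidates` and the AutoGluon model directory name are hypothetical
# placeholders; the keyword arguments mirror the signature defined above.
# df_selected = apply_mva_bcs(df_candidates, features=training_variables,
#                             cut='sig_prob>0.6', library='ag',
#                             version='example_model_dir', bcs='vtx')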
def classify_mc_dict(df, mode, template=True) -> dict:
    """
    Split truth-matched MC candidates into background and signal categories.

    `mode` selects the lepton channel ('e' or 'mu'). Returns a dict mapping
    category names (the keys of DecayMode_new) to sub-DataFrames; each
    sub-DataFrame gets a 'mode' column assigned from DecayMode_new.
    """
    samples = {}
lepton_PDG = {'e':11, 'mu':13}
################## Define D and lepton #################
trueD = 'D_mcErrors==0'
truel = f'abs(ell_mcPDG)=={lepton_PDG[mode]}'
fakeD = '0<D_mcErrors<512'
fakel = f'abs(ell_mcPDG)!={lepton_PDG[mode]} and ell_mcErrors!=512'
fakeTracks = 'B0_mcErrors==512'
################# Define B ####################
FD = f'{fakeD} and ({fakel} or {truel})' # i.e. Not a fakeTrack lepton
TDFl = f'{trueD} and {fakel}'
TDTl = f'{trueD} and {truel}'
# more categories with TDTl
continuum = f'{TDTl} and B0_isContinuumEvent==1'
combinatorial = f'{TDTl} and B0_mcPDG==300553'
signals = f'{TDTl} and (abs(B0_mcPDG)==511 or abs(B0_mcPDG)==521) and \
(ell_genMotherPDG==B0_mcPDG or ell_genGMPDG==B0_mcPDG and abs(ell_genMotherPDG)==15)'
singleBbkg = f'{TDTl} and B0_isContinuumEvent==0 and B0_mcPDG!=300553 and \
( (abs(B0_mcPDG)!=511 and abs(B0_mcPDG)!=521) or \
( ell_genMotherPDG!=B0_mcPDG and (ell_genGMPDG!=B0_mcPDG or abs(ell_genMotherPDG)!=15) ) )'
# more categories with signals
B2D_tau = f'{signals} and B0_mcDaughter_0_PDG*B0_mcDaughter_1_PDG==411*15'
B2D_ell = f'{signals} and B0_mcDaughter_0_PDG*B0_mcDaughter_1_PDG==411*{lepton_PDG[mode]}'
B2Dst_tau = f'{signals} and B0_mcDaughter_0_PDG*B0_mcDaughter_1_PDG==413*15'
B2Dst_ell = f'{signals} and B0_mcDaughter_0_PDG*B0_mcDaughter_1_PDG==413*{lepton_PDG[mode]}'
B2Dstst_tau = f'{signals} and B0_mcDaughter_0_PDG in @Dstst_pdg and abs(B0_mcDaughter_1_PDG)==15'
B2Dstst_ell_narrow = f'{signals} and B0_mcDaughter_0_PDG in @Dstst_narrow_pdg and abs(B0_mcDaughter_1_PDG)=={lepton_PDG[mode]}'
B2Dstst_ell_broad = f'{signals} and B0_mcDaughter_0_PDG in @Dstst_broad_pdg and abs(B0_mcDaughter_1_PDG)=={lepton_PDG[mode]}'
B2D_ell_gap_pi = f'{signals} and B0_mcDaughter_0_PDG in @D_Dst_pdg and B0_mcDaughter_1_PDG in @pi_pdg'
B2D_ell_gap_eta = f'{signals} and B0_mcDaughter_0_PDG in @D_Dst_pdg and B0_mcDaughter_1_PDG in @eta_pdg'
######################### Apply selection ###########################
# Fake background components:
samples.update({
'bkg_fakeD': df.query(FD).copy(),
'bkg_TDFl': df.query(TDFl).copy(),
'bkg_fakeTracks': df.query(fakeTracks).copy(),
})
# True Dl background components:
bkg_continuum = df.query(continuum).copy()
bkg_combinatorial = df.query(combinatorial).copy()
bkg_singleBbkg = df.query(singleBbkg).copy()
df_signals_all = df.query(signals).copy()
df_TDTl_all = df.query(TDTl).copy()
classified_TDTl_indices = pd.concat([bkg_continuum,bkg_combinatorial,
bkg_singleBbkg,df_signals_all]).index
bkg_other_TDTl = df_TDTl_all.loc[~df_TDTl_all.index.isin(classified_TDTl_indices)].copy()
# bkg_other_TDTl = pd.concat([]).drop_duplicates(subset=['__experiment__', '__run__', '__event__', '__production__'], keep=False)
samples.update({
'bkg_continuum': bkg_continuum,
'bkg_combinatorial': bkg_combinatorial,
'bkg_singleBbkg': bkg_singleBbkg,
'bkg_other_TDTl': bkg_other_TDTl,
})
# True Dl signal components:
D_tau_nu = df.query(B2D_tau).copy()
D_l_nu = df.query(B2D_ell).copy()
Dst_tau_nu = df.query(B2Dst_tau).copy()
Dst_l_nu = df.query(B2Dst_ell).copy()
Dstst_tau_nu = df.query(B2Dstst_tau).copy()
Dstst_l_nu_narrow = df.query(B2Dstst_ell_narrow).copy()
Dstst_l_nu_broad = df.query(B2Dstst_ell_broad).copy()
D_l_nu_gap_pi = df.query(B2D_ell_gap_pi).copy()
D_l_nu_gap_eta = df.query(B2D_ell_gap_eta).copy()
classified_signal_indices = pd.concat([D_tau_nu, Dst_tau_nu, D_l_nu,
Dst_l_nu, Dstst_tau_nu,
Dstst_l_nu_narrow,
Dstst_l_nu_broad,
D_l_nu_gap_pi, D_l_nu_gap_eta,]).index
bkg_other_signal = df_signals_all.loc[~df_signals_all.index.isin(classified_signal_indices)].copy()
# Assign signal samples with LaTeX style names:
samples.update({
r'$D\tau\nu$': D_tau_nu,
r'$D^\ast\tau\nu$': Dst_tau_nu,
r'$D\ell\nu$': D_l_nu,
r'$D^\ast\ell\nu$': Dst_l_nu,
r'$D^{\ast\ast}\tau\nu$': Dstst_tau_nu,
r'$D^{\ast\ast}\ell\nu$_narrow': Dstst_l_nu_narrow,
r'$D^{\ast\ast}\ell\nu$_broad': Dstst_l_nu_broad,
r'$D\ell\nu$_gap_pi': D_l_nu_gap_pi,
r'$D\ell\nu$_gap_eta': D_l_nu_gap_eta,
'bkg_other_signal': bkg_other_signal,
})
# Finally, assign a 'mode' to each sample based on an external mapping (DecayMode_new)
# (Make sure that DecayMode_new is defined in your namespace.)
for name, subset_df in samples.items():
subset_df['mode'] = DecayMode_new.get(name, -1)
return samples
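# Illustrative usage sketch (commented out): `df_mc` is a hypothetical
# truth-matched MC DataFrame, e.g. the output of apply_mva_bcs above.
# mc_samples = classify_mc_dict(df_mc, mode='e')
# for name, sub_df in mc_samples.items():
#     print(name, len(sub_df))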
def classify_combinatorial(df):
"""
Classifies combinatorial background into 7 distinct classes based on:
1. D mother decay type.
2. Lepton mother PDG classification.
Args:
df (pd.DataFrame): Input DataFrame containing necessary columns.
Returns:
dict: A dictionary where keys are class names and values are sub-DataFrames.
"""
# Define relevant columns for D mother decay classification
study_cols = combinatorial_vars
# 1. Semileptonic B: at least one lepton appears in the study columns
mask_sl = df[study_cols].isin(leptons).any(axis=1)
# 2. Hadronic single charm: no leptons and exactly one value in the D_mesons
charm_mask1 = df[study_cols].abs().isin(single_charm_pdg)
mask_cx = (~mask_sl) & (charm_mask1.sum(axis=1) == 1)
# 3. Hadronic double charm: no leptons and two values in the D_mesons or 1 value in psi
charm_mask2 = df[study_cols].abs().isin(double_charm_pdg)
mask_ccx = (~mask_sl) & ( (charm_mask1.sum(axis=1) == 2) | (charm_mask2.sum(axis=1) == 1) )
# Lepton classification:
mask_primary_ell = (df['ell_genMotherPDG'].abs().isin(Bpdg)) | (
(df['ell_genGMPDG'].abs().isin(Bpdg)) & (df['ell_genMotherPDG'].abs() == 15)
)
mask_secondary_ell = ~mask_primary_ell
# Build dictionary of classified samples:
class_dict = {
'DSemiB_ellPri': df[mask_sl & mask_primary_ell].copy(),
'DSemiB_ellSec': df[mask_sl & mask_secondary_ell].copy(),
'DHad1Charm_ellPri': df[mask_cx & mask_primary_ell].copy(),
'DHad1Charm_ellSec': df[mask_cx & mask_secondary_ell].copy(),
'DHad2Charm_ellPri': df[mask_ccx & mask_primary_ell].copy(),
'DHad2Charm_ellSec': df[mask_ccx & mask_secondary_ell].copy(),
}
# Catch unclassified rows
classified_indices = pd.concat(class_dict.values()).index
class_dict['others'] = df.loc[~df.index.isin(classified_indices)].copy()
return class_dict
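# Illustrative usage sketch (commented out): further split the combinatorial
# background returned by classify_mc_dict above into its sub-classes.
# comb_classes = classify_combinatorial(mc_samples['bkg_combinatorial'])
# for name, sub_df in comb_classes.items():
#     print(name, len(sub_df))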
# Function to check for duplicate entries in a dictionary of Pandas DataFrames
def check_duplicate_entries(data_dict):
# Create an empty list to store duplicate pairs
duplicate_pairs = []
# Iterate through the dictionary values (assuming each value is a DataFrame)
keys = list(data_dict.keys())
for i in range(len(keys)):
for j in range(i + 1, len(keys)):
df1 = data_dict[keys[i]][['__experiment__','__run__','__event__','__production__']]
df2 = data_dict[keys[j]][['__experiment__','__run__','__event__','__production__']]
# Check for duplicates between the two DataFrames
duplicates = pd.merge(df1, df2, indicator=True, how='inner')
if not duplicates.empty:
duplicate_pairs.append((keys[i], keys[j]))
if duplicate_pairs:
print("Duplicate pairs found:")
for pair in duplicate_pairs:
print(pair)
else:
print("No duplicate pairs found.")
############################### Templates and workspace ######################
from uncertainties import ufloat, correlated_values
import uncertainties.unumpy as unp
from uncertainties import UFloat
import copy
from termcolor import colored
import matplotlib.pyplot as plt  # used by compare_2d_hist below
def rebin_histogram(counts, threshold):
"""
Rebins a histogram, merging bins until the count exceeds a threshold.
Handles uncertainties if `counts` is a uarray.
Parameters:
counts (array-like or unp.uarray): Counts with or without uncertainties.
threshold (float): Minimum count threshold to stop merging bins.
Returns:
new_counts (array-like or unp.uarray): Rebinned counts (with propagated uncertainties if input has uncertainties).
new_bin_edges (array-like): New bin edges after rebinning.
old_bin_edges (array-like): The original dummy bin edges (0 to len(counts)).
"""
# Generate dummy bin edges
dummy_bin_edges = np.arange(len(counts) + 1)
# Check if counts have uncertainties
has_uncertainties = isinstance(counts[0], UFloat)
# Extract nominal values and uncertainties if necessary
if has_uncertainties:
counts_nominal = unp.nominal_values(counts)
counts_uncertainties_squared = unp.std_devs(counts) ** 2
else:
counts_nominal = counts
counts_uncertainties_squared = None # No uncertainties in this case
new_counts_nominal = []
new_uncertainties_squared = []
new_edges = [dummy_bin_edges[0]]
i = 0
while i < len(counts_nominal):
bin_count_nominal = counts_nominal[i]
bin_uncertainty_squared = (
counts_uncertainties_squared[i] if counts_uncertainties_squared is not None else 0
)
start_edge = dummy_bin_edges[i]
end_edge = dummy_bin_edges[i + 1]
# Merge bins until bin_count is above the threshold
while bin_count_nominal < threshold and i < len(counts_nominal) - 1:
i += 1
bin_count_nominal += counts_nominal[i]
if counts_uncertainties_squared is not None:
bin_uncertainty_squared += counts_uncertainties_squared[i]
end_edge = dummy_bin_edges[i + 1]
new_counts_nominal.append(bin_count_nominal)
if counts_uncertainties_squared is not None:
new_uncertainties_squared.append(bin_uncertainty_squared)
new_edges.append(end_edge)
i += 1
# Combine nominal values and uncertainties into uarray if applicable
if has_uncertainties:
new_counts = unp.uarray(
new_counts_nominal, np.sqrt(new_uncertainties_squared)
)
else:
new_counts = np.array(new_counts_nominal)
return new_counts, np.array(new_edges), dummy_bin_edges
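# Illustrative sketch (commented out): merging a toy counts array with Poisson
# uncertainties until every bin holds at least 5 entries. The numbers are made
# up for illustration only.
# toy = unp.uarray([1, 2, 10, 3, 8], np.sqrt([1, 2, 10, 3, 8]))
# merged, new_edges, old_edges = rebin_histogram(toy, threshold=5)
# # merged ~ [13+/-3.6, 11+/-3.3], new_edges = [0, 3, 5], old_edges = [0, 1, 2, 3, 4, 5]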
# Function to rebin another histogram using new bin edges
def rebin_histogram_with_new_edges(counts_with_uncertainties, old_bin_edges, new_bin_edges):
"""
Rebins a histogram with counts and uncertainties grouped using unp.uarray.
Parameters:
counts_with_uncertainties (unp.uarray): Counts with uncertainties as a single uarray.
old_bin_edges (array-like): Original bin edges.
new_bin_edges (array-like): New bin edges for rebinning.
Returns:
new_counts_with_uncertainties (unp.uarray): Rebinned counts with uncertainties.
"""
# Extract nominal values (counts) and standard deviations (uncertainties)
counts = unp.nominal_values(counts_with_uncertainties).round().astype(int)
uncertainties = unp.std_devs(counts_with_uncertainties)
# Repeat the bin centers based on counts for rebinning
new_counts, _ = np.histogram(
np.repeat(old_bin_edges[:-1], counts), bins=new_bin_edges
)
# Initialize new uncertainties (sum in quadrature)
new_uncertainties_squared = np.zeros_like(new_counts, dtype=float)
# Combine uncertainties for the new bins
for i in range(len(old_bin_edges) - 1):
bin_value = old_bin_edges[i]
new_bin_index = np.digitize(bin_value, new_bin_edges) - 1 # Find new bin index
new_uncertainties_squared[new_bin_index] += uncertainties[i] ** 2 # Sum uncertainties in quadrature
# Take square root of summed uncertainties square
new_uncertainties = np.sqrt(new_uncertainties_squared)
# Combine counts and uncertainties into a single uarray
new_counts_with_uncertainties = unp.uarray(new_counts, new_uncertainties)
return new_counts_with_uncertainties
def create_templates(samples:dict, bins:list, scale_lumi=1,
variables=['B0_CMS3_weMissM2','p_D_l'],
bin_threshold=1, merge_threshold=10,
fakeD_from_sideband=False, data=None,
sample_to_exclude=['bkg_fakeTracks','bkg_other_TDTl','bkg_other_signal'],
sample_weights={r'$D^{\ast\ast}\ell\nu$_broad':1,
r'$D\ell\nu$_gap_pi':1,
r'$D\ell\nu$_gap_eta':1}):
"""
Creates 2D templates with uncertainties from input samples and applies rebinning and flattening.
Parameters:
samples (dict): Dictionary of data samples, where keys are sample names and values are pandas DataFrames.
bins (list): List defining the bin edges for the 2D histogram.
scale_lumi (float, optional): Scaling factor for luminosity. Default is 1.
variables (list, optional): List of two variable names to use for the 2D histogram. Default is ['B0_CMS3_weMissM2', 'p_D_l'].
bin_threshold (float, optional): Minimum count threshold for trimming bins. Default is 1.
merge_threshold (float, optional): Minimum count threshold for merging adjacent bins. Default is 10.
fakeD_from_sideband (bool, optional): Whether to include fakeD templates derived from D_M sidebands. Default is False.
data (pandas.DataFrame, optional): Data to be used for fakeD sidebands if `fakeD_from_sideband` is True. Default is None.
sample_to_exclude (list, optional): List of sample names to exclude from template creation. Default includes specific background samples.
sample_weights (dict, optional): Dictionary specifying custom weights for specific samples.
Keys are sample names, and values are weight factors. Default is:
{
'$D^{\ast\ast}\ell\nu$_broad': 1,
'$D\ell\nu$_gap_pi': 1,
'$D\ell\nu$_gap_eta': 1
}
Returns:
tuple:
- indices_threshold (np.ndarray): Indices of bins that pass the count threshold.
- temp_sig (tuple): Tuple containing:
- template_flat (dict): Flattened templates with keys as sample names and values as uarray of counts and uncertainties.
- asimov_data (unp.uarray): Summed template representing the Asimov dataset (counts and uncertainties).
        - temp_with_sb (tuple): Tuple containing:
            - template_flat_with_sb (dict): Templates including fakeD derived from sidebands (if applicable).
            - asimov_data_with_sb (unp.uarray): Asimov dataset including fakeD contributions.
        - temp_merged (tuple): Tuple containing:
            - template_flat_merged (dict): Re-binned templates with merged bins based on `merge_threshold`.
            - asimov_data_merged (unp.uarray): Merged Asimov dataset.
Notes:
- Templates are represented as `unp.uarray` objects that encapsulate counts and uncertainties.
- Sample weights are applied when computing weighted histograms.
- Bins with counts below `bin_threshold` are trimmed.
- Adjacent bins with counts below `merge_threshold` are merged.
- If `fakeD_from_sideband` is True, additional templates are created using sidebands of the D_M variable.
"""
def round_uarray(uarray):
"""Rounds a uarray to 4 decimal places."""
nominal = np.round(unp.nominal_values(uarray), 4)
std_dev = np.round(unp.std_devs(uarray), 4)
return unp.uarray(nominal, std_dev)
#################### Create template 2d histograms with uncertainties ################
histograms = {}
for name, df_sig_sb in samples.items():
if name in sample_to_exclude:
continue
df_sig_sb = df_sig_sb.copy()
df = df_sig_sb.query('1.855<D_M<1.885')
if name in sample_weights.keys():
df.loc[:, '__weight__'] = sample_weights[name]
# Compute weighted histogram
counts, xedges, yedges = np.histogram2d(
df[variables[0]], df[variables[1]],
bins=bins, weights=df['__weight__'])
# Compute sum of weight^2 for uncertainties
staterr_squared, _, _ = np.histogram2d(
df[variables[0]], df[variables[1]],
bins=bins, weights=(df['__weight__']**2))
# Store as uarray: Transpose to have consistent shape (y,x) if needed
if name in [r'$D^{\ast\ast}\ell\nu$_narrow',r'$D^{\ast\ast}\ell\nu$_broad']:
# merge the 2 resonant D** modes
if r'$D^{\ast\ast}\ell\nu$' in histograms:
histograms[r'$D^{\ast\ast}\ell\nu$'] += round_uarray(unp.uarray(counts.T, np.sqrt(staterr_squared.T)))
else:
histograms[r'$D^{\ast\ast}\ell\nu$'] = round_uarray(unp.uarray(counts.T, np.sqrt(staterr_squared.T)))
elif name in [r'$D\ell\nu$_gap_pi', r'$D\ell\nu$_gap_eta']:
# merge the 2 Dellnu gap modes
if r'$D\ell\nu$_gap' in histograms:
histograms[r'$D\ell\nu$_gap'] += round_uarray(unp.uarray(counts.T, np.sqrt(staterr_squared.T)))
else:
histograms[r'$D\ell\nu$_gap'] = round_uarray(unp.uarray(counts.T, np.sqrt(staterr_squared.T)))
else:
# store other modes individually
histograms[name] = round_uarray(unp.uarray(counts.T, np.sqrt(staterr_squared.T)))
################### Trimming and flattening ###############
# Determine which bins pass the threshold based on sum of all templates
all_2dHists_sum = np.sum(list(histograms.values()), axis=0) # uarray sum
indices_threshold = np.where(unp.nominal_values(all_2dHists_sum) >= bin_threshold)
# Flatten the templates after cutting
template_flat = {name: round_uarray(hist[indices_threshold]) for name, hist in histograms.items()}
# Asimov data is the sum of all templates
asimov_data = round_uarray(np.sum(list(template_flat.values()), axis=0)) # uarray
#################### Create additional templates for fakeD from sidebands ###################
if fakeD_from_sideband and 'bkg_fakeD' not in sample_to_exclude:
print('Creating the fakeD template from the sidebands')
if data is None: # MC
df_all = pd.concat(samples.values(), ignore_index=True)
else:
df_all = data
df_sidebands = df_all.query('D_M<1.83 or 1.91<D_M').copy()
# Compute the sideband histogram and assume poisson error
bin_D_M = np.linspace(1.79,1.95,81)
D_M_s2, _ = np.histogram(df_sidebands['D_M'], bins=bin_D_M)
D_M_side_count = round_uarray(unp.uarray(D_M_s2, np.sqrt(D_M_s2)))
# Fit a polynomial to the D_M sidebands
fitter = fit_Dmass(x_edges=bin_D_M, hist=D_M_side_count, poly_only=True)
m_ml, c_ml, result_ml = fitter.fit_gauss_poly_ML(deg=1)
yields_left = fitter.poly_integral(xrange=[1.79,1.82],result=result_ml)
yields_sig = fitter.poly_integral(xrange=[1.855,1.885],result=result_ml)
yields_right = fitter.poly_integral(xrange=[1.92,1.95],result=result_ml)
print(f'sig/left = {round_uarray(yields_sig/yields_left)}, \
sig/right = {round_uarray(yields_sig/yields_right)}')
# Construct the fakeD 2d template from sidebands
region_yield = {'D_M<1.83': yields_left, '1.91<D_M': yields_right}
hist_sbFakeD = 0
for region_i, yields_i in region_yield.items():
df_i = df_sidebands.query(region_i)
weights_i = df_i['__weight__']
(side_counts_i, _1, _2) = np.histogram2d(
df_i[variables[0]], df_i[variables[1]], bins=bins)
# compute the counts and errors, scale with the fit result
hist_side_i = unp.uarray(side_counts_i.T, np.sqrt(side_counts_i.T))
scaled_side_i = hist_side_i * (yields_sig/yields_i/2)
hist_sbFakeD += scaled_side_i
# Create new 2d hists with fakeD replaced by sideband
hists_with_sbFakeD = {k: v for k, v in histograms.items()}
modified_hist_sbFakeD = hist_sbFakeD - r_D*hists_with_sbFakeD[r'$D\ell\nu$']
modified_hist_sbFakeD -= r_Dst*hists_with_sbFakeD[r'$D^\ast\ell\nu$']
# Replace negative nominal values with zero, retain uncertainties
n_mod_hist_side = unp.nominal_values(modified_hist_sbFakeD)
s_mod_hist_side = unp.std_devs(modified_hist_sbFakeD)
n2_mod_hist_side = np.where(n_mod_hist_side < 0, 0, n_mod_hist_side)
modified_hist_sbFakeD_2 = round_uarray(unp.uarray(n2_mod_hist_side, s_mod_hist_side))
hists_with_sbFakeD['bkg_fakeD'] = modified_hist_sbFakeD_2
################### Trimming and flattening ###############
# Determine which bins pass the threshold based on sum of all templates
all_2dHists_with_sbFakeD_sum = np.sum(list(hists_with_sbFakeD.values()), axis=0) # uarray sum
indices_threshold_with_sbFakeD = np.where(unp.nominal_values(all_2dHists_with_sbFakeD_sum) >= bin_threshold)
if np.array_equal(indices_threshold_with_sbFakeD, indices_threshold):
print(colored('fakeD template from sidebands and signal region have the same global 0-entry bins', "green"))
else:
# Combine row and column indices into a single structured array for both sets
combined_indices_with_sbFakeD = set(zip(indices_threshold_with_sbFakeD[0], indices_threshold_with_sbFakeD[1]))
combined_indices = set(zip(indices_threshold[0], indices_threshold[1]))
# Find the intersection of the two sets
common_indices = combined_indices_with_sbFakeD.intersection(combined_indices)
# Separate back into row and column indices
new_indices_threshold = (
np.array([idx[0] for idx in common_indices]),
np.array([idx[1] for idx in common_indices])
)
print(colored('fakeD template from sidebands and signal region have different global 0-entry bins', "red"))
print('created a new indices_threshold masking the 0-entry bins in sig OR sidebands')
print(colored(f'applying the new mask, number of bins was {len(asimov_data)}, now is {len(common_indices)}', "blue"))
# Flatten the templates after cutting
template_flat_with_sb = {name: round_uarray(hist[new_indices_threshold]) for name, hist in hists_with_sbFakeD.items()}
# Asimov data is the sum of all templates
asimov_data_with_sb = round_uarray(np.sum(list(template_flat_with_sb.values()), axis=0))
# Do the same for the signal region
template_flat = {name: round_uarray(hist[new_indices_threshold]) for name, hist in histograms.items()}
asimov_data = round_uarray(np.sum(list(template_flat.values()), axis=0)) # uarray
indices_threshold = new_indices_threshold
else:
template_flat_with_sb = {}
asimov_data_with_sb = []
################## Create a new set of templates with merged bins ###########
# Rebin asimov_data according to merge_threshold
new_counts, new_dummy_bin_edges, old_dummy_bin_edges = rebin_histogram(asimov_data, merge_threshold)
print(f'creating a new template with merged bins, original template length = {len(asimov_data)}, new template (merge small bins) length = {len(new_counts)}')
template_flat_merged = {}
for name, t in template_flat.items():
# Rebin using new edges
# Note: old_dummy_bin_edges and new_dummy_bin_edges come from rebin_histogram
# For consistency, we must use the same old bin edges as for asimov_data:
rebinned = rebin_histogram_with_new_edges(t, old_dummy_bin_edges, new_dummy_bin_edges)
template_flat_merged[name] = round_uarray(rebinned)
asimov_data_merged = round_uarray(np.sum(list(template_flat_merged.values()), axis=0))
#################### Prepare return tuples: (template dict, asimov data)
temp_sig = (template_flat, asimov_data)
temp_with_sb = (template_flat_with_sb, asimov_data_with_sb)
temp_merged = (template_flat_merged, asimov_data_merged)
return indices_threshold, temp_sig, temp_with_sb, temp_merged
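# Illustrative usage sketch (commented out): the binning below is a hypothetical
# placeholder, and the unpacking order follows the return statement above.
# bins_2d = [np.linspace(-2, 10, 25), np.linspace(0, 5, 11)]  # [MM2 edges, p_D_l edges]
# indices, (temp, asimov), (temp_sb, asimov_sb), (temp_m, asimov_m) = create_templates(
#     mc_samples, bins=bins_2d, merge_threshold=10)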
def create_2d_template_from_1d(template_flat: dict, data_flat: unp.uarray,
indices_threshold: tuple, bins: list):
"""
Convert a flattened 1D template back to a 2D histogram with uncertainties based on provided binning.
Parameters:
template_flat (dict): Flattened 1D templates for different samples, with values as unp.array.
data_flat (unp.uarray): Flattened Asimov data as unp.array.
indices_threshold (tuple): The indices that correspond to the bins retained after applying the threshold.
bins (list): The bin edges for the 2D histogram.
Returns:
temp_2d (dict): 2D histograms for each sample as unp.array.
data_2d (unp.uarray): 2D Asimov dataset as unp.array.
"""
temp_2d = {}
# Extract bin dimensions
xbins, ybins = bins
shape_2d = (len(xbins) - 1, len(ybins) - 1)
for name, flat_data in template_flat.items():
# Initialize an empty 2D array for the histogram
hist_2d = unp.uarray(np.zeros(shape_2d).T, np.zeros(shape_2d).T)
# Assign the flattened data back into the appropriate indices
hist_2d[indices_threshold] = flat_data
temp_2d[name] = hist_2d
# Initialize an empty 2D array for the Asimov data
data_2d = unp.uarray(np.zeros(shape_2d).T, np.zeros(shape_2d).T)
# Assign the flattened data back into the appropriate indices
data_2d[indices_threshold] = data_flat
return temp_2d, data_2d
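# Illustrative usage sketch (commented out): restore the flattened signal-region
# templates and Asimov data from create_templates to their 2D shape.
# temp_2d, data_2d = create_2d_template_from_1d(temp, asimov, indices, bins_2d)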
def compare_2d_hist(data, model, bins_x, bins_y,
xlabel='X-axis', ylabel='Y-axis',
data_label='Data', model_label='Model'):
"""
Compare two 2D histograms with residual plots by projecting onto x and y axes.
Parameters:
data (2D array): First 2D histogram values (data).
model (2D array): Second 2D histogram values (model).
bins_x (1D array): Bin edges for x-axis.
bins_y (1D array): Bin edges for y-axis.
xlabel (str): Label for the x-axis.
ylabel (str): Label for the y-axis.
data_label (str): Label for the first histogram (data).
model_label (str): Label for the second histogram (model).
"""
def compute_residuals(data, model):
# Compute residuals and errors (assuming Poisson for data)
residuals = data - model
residuals[unp.nominal_values(data) == 0] = 0 # Mask bins with 0 data
res_val = unp.nominal_values(residuals)
res_err = unp.std_devs(residuals)
return res_val, res_err
def plot_residuals(ax, bin_centers, res_val, res_err):
# Mask bins with zero errors
mask = res_err != 0
chi2 = np.sum((res_val[mask] / res_err[mask]) ** 2)
ndf = len(res_val[mask])
label = f'reChi2 = {chi2:.3f} / {ndf} = {chi2/ndf:.3f}' if ndf else 'reChi2 not calculated'
ax.errorbar(bin_centers, res_val, yerr=res_err, fmt='ok', label=label)
ax.axhline(0, color='gray', linestyle='--')
ax.set_ylabel('Residuals')
ax.legend()
# Project histograms onto x-axis
projData_x = np.sum(data, axis=0)
projModel_x = np.sum(model, axis=0)
# Project histograms onto y-axis
projData_y = np.sum(data, axis=1)
projModel_y = np.sum(model, axis=1)
# Bin centers
bin_centers_x = (bins_x[:-1] + bins_x[1:]) / 2
bin_centers_y = (bins_y[:-1] + bins_y[1:]) / 2
# Residuals
res_x, res_err_x = compute_residuals(projData_x, projModel_x)
res_y, res_err_y = compute_residuals(projData_y, projModel_y)
# Create the figure
fig, axes = plt.subplots(2, 2, figsize=(12, 7), gridspec_kw={'height_ratios': [4, 1]})
# X-axis projection (top-left)
axes[0, 0].hist(bin_centers_x, bins=bins_x, weights=unp.nominal_values(projModel_x),
histtype='step', label=model_label)
axes[0, 0].errorbar(bin_centers_x, unp.nominal_values(projData_x),
yerr=unp.std_devs(projData_x), fmt='ok', label=data_label)
axes[0, 0].set_ylabel('# of Events')
axes[0, 0].set_title(f'Projection onto {xlabel}')
axes[0, 0].grid()
axes[0, 0].legend()
# X-axis residuals (bottom-left)
plot_residuals(axes[1, 0], bin_centers_x, res_x, res_err_x)
axes[1, 0].set_xlabel(xlabel)
# Y-axis projection (top-right)
axes[0, 1].hist(bin_centers_y, bins=bins_y, weights=unp.nominal_values(projModel_y),
histtype='step', label=model_label)
axes[0, 1].errorbar(bin_centers_y, unp.nominal_values(projData_y),
yerr=unp.std_devs(projData_y), fmt='ok', label=data_label)
axes[0, 1].set_ylabel('# of Events')
axes[0, 1].set_title(f'Projection onto {ylabel}')
axes[0, 1].grid()
axes[0, 1].legend()
# Y-axis residuals (bottom-right)
plot_residuals(axes[1, 1], bin_centers_y, res_y, res_err_y)
axes[1, 1].set_xlabel(ylabel)
plt.tight_layout()
plt.show()
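# Illustrative usage sketch (commented out): compare the Asimov data with the
# summed model templates, both in 2D form from create_2d_template_from_1d above.
# model_2d = np.sum(list(temp_2d.values()), axis=0)
# compare_2d_hist(data_2d, model_2d, bins_x=bins_2d[0], bins_y=bins_2d[1],
#                 xlabel='MM2', ylabel='p_D_l',
#                 data_label='Asimov data', model_label='Sum of templates')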
def create_workspace(temp_asimov_channels: list,
mc_uncer: bool = True, fakeD_uncer: bool = True) -> dict:
"""
Create a structured workspace dictionary for statistical analysis and fitting.
Args:
temp_asimov_channels (list): A list of tuples, where each tuple contains:
- A dictionary of sample templates with bin data.
- The corresponding Asimov dataset.
mc_uncer (bool, optional): If True, includes statistical uncertainties for all MC backgrounds. Default is True.
fakeD_uncer (bool, optional): If True, includes statistical uncertainties for the 'bkg_fakeD' sample. Default is True.
Returns:
dict: A structured dictionary containing:
- 'channels': A list of channels with their respective samples and uncertainties.
- 'measurements': A list defining the measurement setup.
- 'observations': The observed data for each channel.
- 'version': The version identifier of the workspace format.
"""
# Initialize key workspace components
channels = []
observations = []
measurements = [{"name": "R_D", "config": {"poi": "$D\\tau\\nu$_norm", "parameters": []}}]
version = "1.0.0"
# Extract sample names from the first set of templates
sample_names = list(temp_asimov_channels[0][0].keys())
# Loop over each channel (index, tuple of template_flat and asimov_data)
for ch_index, (template_flat, asimov_data) in enumerate(temp_asimov_channels):
# Store observed data for the channel
observations.append({
'name': f'channel_{ch_index}',
'data': unp.nominal_values(asimov_data).tolist() # Extract nominal values from uncertainties
})
# Initialize channel structure
channels.append({
'name': f'channel_{ch_index}',
'samples': []
})
# Loop over each sample in the channel
for sample_index, sample_name in enumerate(sample_names):
if np.sum(template_flat[sample_name])==0:
continue # skip samples with 0 events
# Add the nominal template data for the sample
channels[ch_index]['samples'].append({
'name': sample_name,
'data': unp.nominal_values(template_flat[sample_name]).tolist(),
'modifiers': [
{
'name': sample_name+'_norm',
'type': 'normfactor',
'data': None # Normalization factor modifier
}
]
})
            # Add uncertainty modifiers for statistical errors.
            # Index with [-1] (the sample just appended) instead of sample_index,
            # which no longer matches the list position once a zero-yield sample
            # has been skipped above.
            if sample_name == 'bkg_fakeD' and fakeD_uncer:
                # Add statistical uncertainty for 'bkg_fakeD'
                channels[ch_index]['samples'][-1]['modifiers'].append({
                    'name': f'mc_stat_uncer_ch{ch_index}',  # fakeD_stat
                    'type': 'staterror',  # alternative: 'shapesys'
                    'data': unp.std_devs(template_flat[sample_name]).tolist()
                })
            elif sample_name != 'bkg_fakeD' and mc_uncer:
                # Add statistical uncertainty for all other MC samples using staterror
                channels[ch_index]['samples'][-1]['modifiers'].append({
                    'name': f'mc_stat_uncer_ch{ch_index}',
                    'type': 'staterror',
                    'data': unp.std_devs(template_flat[sample_name]).tolist()
                })
# Define parameter bounds based on whether it's a background or signal sample
if sample_name.startswith('bkg'):
par_config = {"name": sample_name+'_norm', "bounds": [[0, 2]], "inits": [1.0], "fixed":True}
else:
par_config = {"name": sample_name+'_norm', "bounds": [[-5, 5]], "inits": [1.0]}
# Add parameter configuration if it doesn't already exist
if par_config not in measurements[0]['config']['parameters']:
measurements[0]['config']['parameters'].append(par_config)
# Construct the final workspace dictionary
workspace = {
'channels': channels,
'measurements': measurements,
'observations': observations,
'version': version
}
return workspace
# for samp_index, sample in enumerate(workspace['channels'][ch_index]['samples']):
# sample = {'name': 'new_sample'} # This would not update the list in `workspace`
# Using sample as a reference to the list element is perfectly fine and will not cause bugs
# as long as you're modifying the contents of sample (like updating values or appending to a list).
# However, be cautious when assigning a completely new value to sample itself,
# as that won't update the original list.
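# Illustrative usage sketch (commented out): build a single-channel workspace
# from the (templates, Asimov data) tuple produced by create_templates above.
# The resulting dict follows a pyhf-like schema (channels/measurements/observations/version).
# ws = create_workspace([(temp, asimov)], mc_uncer=True, fakeD_uncer=True)
# print(len(ws['channels'][0]['samples']), 'samples in channel_0')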
def extract_temp_asimov_channels(workspace: dict, mc_uncer: bool = True) -> list:
"""
Extracts `temp_asimov_channels` from a workspace.
Parameters:
workspace (dict): The workspace from which to extract templates and Asimov data.
mc_uncer (bool, optional): Whether to include statistical uncertainties. Default is True.
Returns:
list: A list of tuples for each channel:
- template_flat (dict): Flattened templates for each sample as unp.array.
- asimov_data (unp.array): Asimov data as unp.array.
"""
temp_asimov_channels = []
for ch_index, channel in enumerate(workspace['channels']):
# Extract flattened templates
template_flat = {}
for sample in channel['samples']:
# Extract nominal values and uncertainties
nominal_values = np.array(sample['data'])
if mc_uncer:
# Find the staterror modifier for uncertainties
staterror_mod = next((m for m in sample['modifiers'] if m['type'] == 'staterror'), None)
if staterror_mod:
uncertainties = np.array(staterror_mod['data'])
else:
uncertainties = np.sqrt(nominal_values)
else:
uncertainties = np.sqrt(nominal_values)
# Store as unp.array
template_flat[sample['name']] = unp.uarray(nominal_values, uncertainties)
# Extract Asimov data
asimov_data_nominal = np.array(workspace['observations'][ch_index]['data'])
asimov_data_uncertainties = np.sqrt(asimov_data_nominal) # Default uncertainties to poisson
asimov_data = unp.uarray(asimov_data_nominal, asimov_data_uncertainties)
# Append the reconstructed channel to the list
temp_asimov_channels.append((template_flat, asimov_data))
return temp_asimov_channels
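# Illustrative round-trip sketch (commented out): rebuild the channel tuples
# from the workspace and compare them with the originals using the inspection
# helper defined below. Note that the Asimov uncertainties are re-derived here
# as sqrt(N), so they need not match the propagated ones exactly.
# channels_from_ws = extract_temp_asimov_channels(ws, mc_uncer=True)
# inspect_temp_asimov_channels(channels_from_ws, [(temp, asimov)])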
def inspect_temp_asimov_channels(t1, t2=None):
"""
Inspect and compare the templates and Asimov data for multiple channels.
Parameters:
t1 (list): First list of channel data, where each element is a tuple:
- template_flat (dict): Flattened templates for each sample as unp.array.
- asimov_data (unp.array): Asimov data as unp.array.
t2 (list, optional): Second list of channel data for comparison, structured like `t1`. Default is None.
Returns:
None: Prints the inspection and comparison results to the console.
Notes:
- If `t2` is provided, the function compares the templates and Asimov data in `t1` and `t2`.
- The function checks for equality of `unp.array` objects in both inputs using `np.array_equal`.
- Outputs differences in templates and Asimov data for mismatched channels.
"""
for ch_index, (template_flat, asimov_data) in enumerate(t1):
print(f"Channel {ch_index}:")
for name, data in template_flat.items():
print(f" Sample: {name}, Data: {data}")
if t2 is not None:
nominal1 = unp.nominal_values(data)
nominal2 = unp.nominal_values(t2[ch_index][0][name])
std1 = unp.std_devs(data)
std2 = unp.std_devs(t2[ch_index][0][name])
if np.array_equal(nominal1, nominal2) and np.array_equal(std1, std2):
print(colored(f' {name} templates are equal in the 2 inputs','green'))
else:
print(colored(f' {name} templates are different in the 2 inputs','red'))
print(colored(f' {np.array_equal(nominal1, nominal2)=}, {np.array_equal(std1, std2)=}','red'))
print(f" Sample: {name}, Data (from t2): {t2[ch_index][0][name]}")
print(f" Asimov Data: {asimov_data}")
if t2 is not None:
nominal1 = unp.nominal_values(asimov_data)
nominal2 = unp.nominal_values(t2[ch_index][1])
std1 = unp.std_devs(asimov_data)
std2 = unp.std_devs(t2[ch_index][1])
if np.array_equal(nominal1, nominal2) and np.array_equal(std1, std2):
print(colored(' Asimov data are equal in the 2 inputs','green'))
else:
print(colored(' Asimov data are different in the 2 inputs','red'))
print(colored(f' {np.array_equal(nominal1, nominal2)=}, {np.array_equal(std1, std2)=}','red'))
print(f" Asimov Data (from t2): {t2[ch_index][1]}")
# def calculate_FOM3d(sig_data, bkg_data, variables, test_points):
# sig = pd.concat(sig_data)
# bkg = pd.concat(bkg_data)
# sig_tot = len(sig)
# bkg_tot = len(bkg)
# BDT_FOM = []
# BDT_FOM_err = []
# BDT_sigEff = []
# BDT_sigEff_err = []
# BDT_bkgEff = []
# BDT_bkgEff_err = []
# for i in test_points[0]:
# for j in test_points[1]:
# for k in test_points[2]:
# nsig = len(sig.query(f"{variables[0]}>{i} and {variables[1]}>{j} and {variables[2]}>{k}"))
# nbkg = len(bkg.query(f"{variables[0]}>{i} and {variables[1]}>{j} and {variables[2]}>{k}"))
# tot = nsig+nbkg
# tot_err = np.sqrt(tot)
# FOM = nsig / tot_err # s / √(s+b)
# FOM_err = np.sqrt( (tot_err - FOM/2)**2 /tot**2 * nsig + nbkg**3/(4*tot**3) + 9*nbkg**2*np.sqrt(nsig*nbkg)/(4*tot**5) )
# BDT_FOM.append(round(FOM,2))
# BDT_FOM_err.append(round(FOM_err,2))
# sigEff = nsig / sig_tot
# sigEff_err = sigEff * np.sqrt(1/nsig + 1/sig_tot)
# bkgEff = nbkg / bkg_tot
# bkgEff_err = bkgEff * np.sqrt(1/nbkg + 1/bkg_tot)
# BDT_sigEff.append(round(sigEff,2))
# BDT_sigEff_err.append(round(sigEff_err,2))