Skip to content

Commit

Permalink
Adding the basic functionality of FilledHistogram class (#221)
Browse files Browse the repository at this point in the history
* Adding the basic functionality of FilledHistogram class

* Now only one Histogram class that treats the histogram as filled depending on the specification of the 'bin_edges' input.

* correct bin_edges bug for divide

* Reduced line sizes and remove unused input for helper function

* removed some code duplication by merging functions

* bug fix

* made filled argument a optional parameter to pass unit tests...

* added a basic test for filled histogram

* added test case for sumW2 in filled hists

* reformatting to fit lines

* fixed a sumW2 bug and test bug

* fixed formatting with pre-commit

* add changelog line

* added warning upon defining sum_squared_weights but no bin_edges

---------

Co-authored-by: thmaurin <[email protected]>
Co-authored-by: Sam VS <[email protected]>
  • Loading branch information
3 people authored Nov 27, 2023
1 parent 1369431 commit 46adb1e
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 35 deletions.
1 change: 1 addition & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- Update ratio uncertainty calculation [!225](https://github.com/umami-hep/puma/pull/225)
- Add NaN filtering for HL plots. Bump atlas-ftag-tools version to 0.1.11. [!220](https://github.com/umami-hep/puma/pull/220)
- Making Under- and Overflow bins default in histograms [!218](https://github.com/umami-hep/puma/pull/218)
- Adding "bin_edges" and "sum_squared_weights" parameters to the Histogram constructor to define pre-filled histograms.

### [v0.3.0] (2023/10/03)

Expand Down
32 changes: 30 additions & 2 deletions puma/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def __init__(
self,
values: np.ndarray,
weights: np.ndarray = None,
bin_edges: np.ndarray = None,
sum_squared_weights: np.ndarray = None,
ratio_group: str | None = None,
flavour: str | Flavour = None,
add_flavour_label: bool = True,
Expand All @@ -34,12 +36,21 @@ def __init__(
Parameters
----------
values : np.ndarray
Input data for the histogram
Input data for the histogram. If bin_edges is specified (not None)
then this array is treated as the bin heights.
weights : np.ndarray, optional
Weights for the input data. Has to be an array of same length as the input
data with a weight for each entry. If not specified, weight 1 will be given
to each entry. The uncertainties are calculated as the square root of the
squared weights (for each bin separately). By default None.
bin_edges : np.ndarray, optional
If specified, the histogram is considered "filled": the array given to
values is treated as if it was the bin heights corresponding to these
bin_edges and the "weights" input is ignored. By default None.
sum_squared_weights : np.ndarray, optional
Only considered if the histogram is considered filled (i.e bin_edges
is specified). It is the sum_squared_weights per bin.
By default None.
ratio_group : str, optional
Name of the ratio group this histogram is compared with. The ratio group
allows you to compare different groups of histograms within one plot.
Expand Down Expand Up @@ -85,6 +96,18 @@ def __init__(
raise ValueError("`values` and `weights` are not of same length.")

self.values = values
self.bin_edges = bin_edges # Important to have this defined for any histogram
self.sum_squared_weights = sum_squared_weights

if bin_edges is None and sum_squared_weights is not None:
logger.warning("""The Histogram has no bin edges defined and is thus
not considered filled. Parameter `sum_squared_weights`
is ignored. """)

# This attribute allows to know how to handle the histogram later during
# plotting
self.filled = bin_edges is not None

self.weights = weights
self.ratio_group = ratio_group
self.flavour = Flavours[flavour] if isinstance(flavour, str) else flavour
Expand All @@ -94,7 +117,6 @@ def __init__(

# Set histogram attributes to None. They will be defined when the histograms
# are plotted
self.bin_edges = None
self.hist = None
self.unc = None
self.band = None
Expand Down Expand Up @@ -432,12 +454,18 @@ def plot(self, **kwargs):
elem.bin_edges, elem.hist, elem.unc, elem.band = hist_w_unc(
elem.values,
weights=elem.weights,
bin_edges=elem.bin_edges,
sum_squared_weights=elem.sum_squared_weights,
bins=self.bins,
filled=elem.filled,
bins_range=self.bins_range,
normed=self.norm,
underoverflow=self.underoverflow,
)

# MAYBE CHECK HERE THAT self.bins and elem.bin_edges are
# equivalent for plotting or throw error!

if self.discrete_vals is not None:
# bins are recalculated for the discrete values
bins = self.get_discrete_values(elem)
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
49 changes: 49 additions & 0 deletions puma/tests/test_histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,3 +768,52 @@ def test_underoverflow_bin(self):
tol=1,
)
)

def test_plot_filled_hist(self):
bin_edges = [0, 1, 2, 3, 4, 5]
bin_counts = [5, 4, 7, 12, 2]

vals = [0, 1, 1, 5, 4, 2, 1, 3, 3, 5, 5, 5, 5, 5]

hist_filled = Histogram(bin_counts, bin_edges=bin_edges)
hist_notfilled = Histogram(vals)

hist_plot = HistogramPlot(bins=bin_edges, underoverflow=False)
hist_plot.add(hist_filled)
hist_plot.add(hist_notfilled)

hist_plot.draw()
plotname = "test_filled_histogram.png"
hist_plot.savefig(f"{self.actual_plots_dir}/{plotname}")

self.assertIsNone(
compare_images(
f"{self.actual_plots_dir}/{plotname}",
f"{self.expected_plots_dir}/{plotname}",
tol=1,
)
)

def test_plot_filled_hist_sumW2(self):
bin_edges = [0, 1, 2, 3, 4, 5]
bin_counts = [5, 4, 7, 12, 2]
sum_squared_weights = [10, 7, 12, 21, 5]

hist_plot = HistogramPlot(bins=bin_edges, underoverflow=False)
hist_plot.add(
Histogram(
bin_counts, bin_edges=bin_edges, sum_squared_weights=sum_squared_weights
)
)

hist_plot.draw()
plotname = "test_filled_histogram_sumW2.png"
hist_plot.savefig(f"{self.actual_plots_dir}/{plotname}")

self.assertIsNone(
compare_images(
f"{self.actual_plots_dir}/{plotname}",
f"{self.expected_plots_dir}/{plotname}",
tol=1,
)
)
95 changes: 62 additions & 33 deletions puma/utils/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,12 @@ def save_divide(
def hist_w_unc(
arr,
bins,
filled: bool = False,
bins_range=None,
normed: bool = True,
weights: np.ndarray = None,
bin_edges: np.ndarray = None,
sum_squared_weights: np.ndarray = None,
underoverflow: bool = False,
):
"""
Expand Down Expand Up @@ -115,42 +118,68 @@ def hist_w_unc(
if np.sum(inf_mask) > 0:
logger.warning("Histogram values contain %i +-inf values!", np.sum(inf_mask))

# Calculate the counts and the bin edges
counts, bin_edges = np.histogram(arr, bins=bins, range=bins_range, weights=weights)

# calculate the uncertainty with sum of squared weights (per bin, so we use
# np.histogram again here)
unc = np.sqrt(
np.histogram(arr, bins=bins, range=bins_range, weights=weights**2)[0]
)
# If the histogram is not already filled we need to produce the histogram counts
# and bin edges
if not filled:
# Calculate the counts and the bin edges
counts, bin_edges = np.histogram(
arr, bins=bins, range=bins_range, weights=weights
)

if underoverflow:
# add two dummy bins (from outermost bins to +-infinity)
bins_with_overunderflow = np.hstack(
[np.array([-np.inf]), bin_edges, np.array([np.inf])]
# calculate the uncertainty with sum of squared weights (per bin, so we use
# np.histogram again here)
unc = np.sqrt(
np.histogram(arr, bins=bins, range=bins_range, weights=weights**2)[0]
)
# recalculate the histogram with this adjusted binning
counts, _ = np.histogram(arr, bins=bins_with_overunderflow, weights=weights)
counts[1] += counts[0] # add underflow values to underflow bin
counts[-2] += counts[-1] # add overflow values to overflow bin
counts = counts[1:-1] # remove dummy bins

# calculate the sum of squared weights
sum_squared_weights = np.histogram(
arr, bins=bins_with_overunderflow, weights=weights**2
)[0]
# add sum of squared weights from under/overflow values to under/overflow bin
sum_squared_weights[1] += sum_squared_weights[0]
sum_squared_weights[-2] += sum_squared_weights[-1]
sum_squared_weights = sum_squared_weights[1:-1] # remove dummy bins

unc = np.sqrt(sum_squared_weights) # uncertainty is sqrt(sum_squared_weights)

if normed:
sum_of_weights = float(np.sum(weights))
counts = save_divide(counts, sum_of_weights, 0)
unc = save_divide(unc, sum_of_weights, 0)

if underoverflow:
# add two dummy bins (from outermost bins to +-infinity)
bins_with_overunderflow = np.hstack(
[np.array([-np.inf]), bin_edges, np.array([np.inf])]
)
# recalculate the histogram with this adjusted binning
counts, _ = np.histogram(arr, bins=bins_with_overunderflow, weights=weights)
counts[1] += counts[0] # add underflow values to underflow bin
counts[-2] += counts[-1] # add overflow values to overflow bin
counts = counts[1:-1] # remove dummy bins

# calculate the sum of squared weights
sum_squared_weights = np.histogram(
arr, bins=bins_with_overunderflow, weights=weights**2
)[0]

# add sum of squared weights from under/overflow values
# to under/overflow bin
sum_squared_weights[1] += sum_squared_weights[0]
sum_squared_weights[-2] += sum_squared_weights[-1]
# remove dummy bins
sum_squared_weights = sum_squared_weights[1:-1]

# uncertainty is sqrt(sum_squared_weights)
unc = np.sqrt(sum_squared_weights)

if normed:
sum_of_weights = float(np.sum(weights))
counts = save_divide(counts, sum_of_weights, 0)
unc = save_divide(unc, sum_of_weights, 0)

# If the histogram is already filled then the uncertainty is computed
# differently
else:
if sum_squared_weights is not None:
sum_squared_weights = np.array(sum_squared_weights)[~nan_mask]
unc = np.sqrt(sum_squared_weights)
else:
unc = np.sqrt(arr) # treat arr as bin heights (counts)

counts = arr

if normed:
counts_sum = float(np.sum(counts))
counts = save_divide(counts, counts_sum, 0)
unc = save_divide(unc, counts_sum, 0)

# regardless of if the histogram is filled
band = counts - unc
hist = counts

Expand Down

0 comments on commit 46adb1e

Please sign in to comment.