Skip to content

Commit

Permalink
perf: drop use of awkward in yield uncertainty calculation (#408)
Browse files — browse the repository at this point in the history
* use pure numpy instead of awkward in yield uncertainty calculation
* reduced memory footprint and performance improvements for yield uncertainty calculation
  • Branch information (not loaded in this capture)
ekauffma authored Jul 5, 2023
1 parent 5064e38 commit 7b9b6af
Showing 1 changed file with 62 additions and 45 deletions.
107 changes: 62 additions & 45 deletions src/cabinetry/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,6 @@
import logging
from typing import Any, DefaultDict, Dict, List, NamedTuple, Optional, Tuple, Union

try:
# use awkward v2
import awkward._v2 as ak
except ModuleNotFoundError: # pragma: no cover
# fallback if the _v2 submodule disappears after the full v2 release
import awkward as ak # pragma: no cover
import numpy as np
import pyhf

Expand Down Expand Up @@ -300,16 +294,26 @@ def yield_stdev(
up_comb = np.vstack((up_comb, np.sum(up_comb, axis=0)))
# turn into list of channels (keep all samples, select correct bins per channel)
# indices: channel, sample, bin
up_yields = [
up_yields_per_channel = [
up_comb[:, model.config.channel_slices[ch]] for ch in model.config.channels
]
# append list of yields summed per channel
up_yields += [
np.asarray(np.sum(chan_yields, axis=-1, keepdims=True))
for chan_yields in up_yields
]
# indices: variation, channel, sample, bin
up_variations.append(up_yields)
# calculate list of yields summed per channel
up_yields_channel_sum = np.stack(
[
np.sum(chan_yields, axis=-1, keepdims=True)
for chan_yields in up_yields_per_channel
]
)
# reshape to drop bin axis, transpose to turn channel axis into new bin axis
# (channel, sample, bin) -> (sample, bin) where "bin" becomes channel sums
up_yields_channel_sum = up_yields_channel_sum.reshape(
up_yields_channel_sum.shape[:-1]
).T

# concatenate per-channel sums to up_comb (along bin axis)
up_yields = np.concatenate((up_comb, up_yields_channel_sum), axis=1)
# indices: variation, sample, bin
up_variations.append(up_yields.tolist())

# model distribution per sample with this parameter varied down
down_comb = pyhf.tensorlib.to_numpy(
Expand All @@ -318,68 +322,81 @@ def yield_stdev(
# add total prediction (sum over samples)
down_comb = np.vstack((down_comb, np.sum(down_comb, axis=0)))
# turn into list of channels
down_yields = [
down_yields_per_channel = [
down_comb[:, model.config.channel_slices[ch]]
for ch in model.config.channels
]
# append list of yields summed per channel
down_yields += [
np.asarray(np.sum(chan_yields, axis=-1, keepdims=True))
for chan_yields in down_yields
]

# calculate list of yields summed per channel
down_yields_channel_sum = np.stack(
[
np.sum(chan_yields, axis=-1, keepdims=True)
for chan_yields in down_yields_per_channel
]
)
down_yields_channel_sum = down_yields_channel_sum.reshape(
down_yields_channel_sum.shape[:-1]
).T

down_yields = np.concatenate((down_comb, down_yields_channel_sum), axis=1)
down_variations.append(down_yields)

# convert to awkward arrays for further processing
up_variations_ak = ak.from_iter(up_variations)
down_variations_ak = ak.from_iter(down_variations)
# convert to numpy arrays for further processing
up_variations_np = np.asarray(up_variations)
down_variations_np = np.asarray(down_variations)

# calculate symmetric uncertainties for all components
# indices: variation, channel (last entries sums), sample (last entry sum), bin
sym_uncs = (up_variations_ak - down_variations_ak) / 2
sym_uncs = (up_variations_np - down_variations_np) / 2

# calculate total variance, indexed by channel, sample, bin (per-channel numbers act
# like additional channels with one bin each)
# calculate total variance, indexed by sample, bin (per-channel numbers act like
# additional channels with one bin each)
if np.count_nonzero(corr_mat - np.diagflat(np.ones_like(parameters))) == 0:
# no off-diagonal contributions from correlation matrix (e.g. pre-fit)
total_variance = ak.sum(np.power(sym_uncs, 2), axis=0)
total_variance = np.sum(np.power(sym_uncs, 2), axis=0)
else:
# full calculation including off-diagonal contributions
# with v as vector of variations (each element contains yields under variation)
# and M as correlation matrix, calculate variance as follows:
# variance = sum_i sum_j v[i] * M[i, j] * v[j]
# where the product between elements of v again is elementwise (multiplying bin
# yields), and the final variance shape is the same as element of v (yield
# uncertainties per bin, sample, channel)
# uncertainties per sample, bin)

# possible optimizations that could be considered here:
# - skipping staterror-staterror terms for per-bin calculation (orthogonal)
# - taking advantage of correlation matrix symmetry
# - (optional) skipping combinations with correlations below threshold

# calculate M[i, j] * v[j] first
# indices: pars (i), pars (j), channel, sample, bin
m_times_v = (
corr_mat[..., np.newaxis, np.newaxis, np.newaxis]
* sym_uncs[np.newaxis, ...]
)
# now multiply by v[i] as well, indices: pars(i), pars(j), channel, sample, bin
# indices: pars (i), pars (j), sample, bin
m_times_v = corr_mat[..., np.newaxis, np.newaxis] * sym_uncs[np.newaxis, ...]
# now multiply by v[i] as well, indices: pars(i), pars(j), sample, bin
v_times_m_times_v = sym_uncs[:, np.newaxis, ...] * m_times_v
# finally perform sums over i and j, remaining indices: channel, sample, bin
total_variance = ak.sum(ak.sum(v_times_m_times_v, axis=1), axis=0)
# finally perform sums over i and j, remaining indices: sample, bin
total_variance = np.sum(np.sum(v_times_m_times_v, axis=1), axis=0)

# get number of bins from model config
n_bins = sum(model.config.channel_nbins.values())

# convert to standard deviations per bin and per channel
n_channels = len(model.config.channels)
# add back outer channel dimension for per-bin uncertainty
# indices: (channel, sample, bin)
total_stdev_per_bin = np.sqrt(total_variance[:n_channels])
total_stdev_per_bin = [
np.sqrt(total_variance[:, :n_bins][:, model.config.channel_slices[ch]]).tolist()
for ch in model.config.channels
]
# per-channel: transpose to flip remaining dimensions (channel sums acted as
# individual bins before)
# indices: (channel, sample)
total_stdev_per_channel = ak.sum(np.sqrt(total_variance[n_channels:]), axis=-1)
# log total stdev per bin / channel (-1 index for sample sum)
log.debug(f"total stdev is {total_stdev_per_bin[:, -1, :]}")
log.debug(f"total stdev per channel is {total_stdev_per_channel[:, -1]}")
total_stdev_per_channel = np.sqrt(total_variance[:, n_bins:].T).tolist()

# convert to lists
total_stdev_per_bin = ak.to_list(total_stdev_per_bin)
total_stdev_per_channel = ak.to_list(total_stdev_per_channel)
# log total stdev per bin / channel (-1 index for sample sum)
n_channels = len(model.config.channels)
total_stdev_bin = [total_stdev_per_bin[i][-1] for i in range(n_channels)]
log.debug(f"total stdev is {total_stdev_bin}")
total_stdev_chan = [total_stdev_per_channel[i][-1] for i in range(n_channels)]
log.debug(f"total stdev per channel is {total_stdev_chan}")

# save to cache
_YIELD_STDEV_CACHE.update(
Expand Down

0 comments on commit 7b9b6af

Please sign in to comment.