Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce class core.random.RandomChoice for faster background and signal event generation from MC #173

Merged
merged 13 commits into from
Sep 4, 2023
7 changes: 7 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@ v23.2.1

- The iminuit package is an optional tool now.

- Introduce class core.random.RandomChoice to replace numpy.random.choice with a
faster implementation avoiding the recalculation of the cumulative
distribution function (CDF) when called several times (trials) for the same
probabilities (dataset). This new class is used in background and signal
event generation from MC, resulting in a great performance increase when
generating many trials with many events.

v23.2.0
=======
- Complete overhaul of SkyLLH for more generic handling of parameters
Expand Down
101 changes: 39 additions & 62 deletions skyllh/core/background_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
func_has_n_args,
issequenceof,
)
from skyllh.core.random import (
RandomChoice,
)
from skyllh.core.scrambling import (
DataScrambler,
)
Expand Down Expand Up @@ -123,7 +126,6 @@ def __init__(
self,
get_event_prob_func,
get_mean_func=None,
unique_events=False,
data_scrambler=None,
mc_inplace_scrambling=False,
keep_mc_data_fields=None,
Expand Down Expand Up @@ -157,12 +159,8 @@ def __init__(
of events, for which the mean number of background events should get
calculated. This argument can be `None`, which means that the mean
number of background events to generate needs to get specified
through the ``generate_events`` method. However, if an event
through the ``generate_events`` method. However, if a pre event
selection method is provided, this argument cannot be ``None``!
unique_events : bool
Flag if unique events should be drawn from the monte-carlo
(``True``), or if events can be drawn several times (``False``).
Default is ``False``.
data_scrambler : instance of DataScrambler | None
If set to an instance of DataScrambler, the drawn monte-carlo
background events will get scrambled. This can ensure more
Expand All @@ -188,7 +186,6 @@ def __init__(

self.get_event_prob_func = get_event_prob_func
self.get_mean_func = get_mean_func
self.unique_events = unique_events
self.data_scrambler = data_scrambler
self.mc_inplace_scrambling = mc_inplace_scrambling
self.keep_mc_data_field_names = keep_mc_data_fields
Expand All @@ -202,10 +199,11 @@ def __init__(
# Define cache members to cache the background probabilities for each
# monte-carlo event. The probabilities change only if the data changes.
self._cache_data_id = None
self._cache_mc_pre_selected = None
self._cache_mc = None
self._cache_mc_event_bkg_prob = None
self._cache_mc_event_bkg_prob_pre_selected = None
self._cache_mean = None
self._cache_mean_pre_selected = None
self._cache_random_choice = None

@property
def get_event_prob_func(self):
Expand Down Expand Up @@ -251,21 +249,6 @@ def get_mean_func(self, func):
'have 3 arguments!')
self._get_mean_func = func

@property
def unique_events(self):
"""Flag if unique events should be drawn from the monto-carlo (True),
or if the same event can be drawn multiple times from the monte-carlo.
"""
return self._unique_events

@unique_events.setter
def unique_events(self, b):
if not isinstance(b, bool):
raise TypeError(
'The unique_events property must be of type bool! '
f'Its current type is {classname(b)}.')
self._unique_events = b

@property
def data_scrambler(self):
"""The DataScrambler instance that should be used to scramble the drawn
Expand Down Expand Up @@ -453,29 +436,38 @@ def generate_events(
data=data,
events=data_mc)

with TaskTimer(
tl,
'Calculate MC background event probability cache.'):
self._cache_mc_event_bkg_prob = self._get_event_prob_func(
dataset=dataset,
data=data,
events=data_mc)

if self__pre_event_selection_method is not None:
with TaskTimer(
tl,
'Pre-select MC events.'):
(self._cache_mc_pre_selected,
mc_pre_selected_src_evt_idxs,
mc_pre_selected_idxs) =\
(self._cache_mc, _) =\
self__pre_event_selection_method.select_events(
events=data_mc,
ret_original_evt_idxs=True,
ret_original_evt_idxs=False,
tl=tl)
self._cache_mc_event_bkg_prob_pre_selected = np.take(
self._cache_mc_event_bkg_prob, mc_pre_selected_idxs)

with TaskTimer(tl, 'Calculate selected MC background mean.'):
self._cache_mean_pre_selected = self._get_mean_func(
dataset=dataset,
data=data,
events=self._cache_mc)
else:
self._cache_mc_pre_selected = data_mc
self._cache_mc = data_mc

with TaskTimer(
tl,
'Calculate MC background event probability cache.'):
self._cache_mc_event_bkg_prob = self._get_event_prob_func(
dataset=dataset,
data=data,
events=self._cache_mc)

with TaskTimer(
tl,
'Create RandomChoice for MC background events.'):
self._cache_random_choice = RandomChoice(
items=self._cache_mc.indices,
probabilities=self._cache_mc_event_bkg_prob)

if mean is None:
if self._cache_mean is None:
Expand All @@ -496,42 +488,27 @@ def generate_events(
n_bkg = (int(rss.random.poisson(mean)) if poisson else
int(np.round(mean, 0)))

# Apply only event pre-selection before choosing events.
data_mc_selected = self._cache_mc_pre_selected

# Calculate the mean number of background events for the pre-selected
# MC events.
if self__pre_event_selection_method is None:
# No selection at all, use the total mean.
mean_selected = mean
mean_pre_selected = mean
else:
with TaskTimer(tl, 'Calculate selected MC background mean.'):
mean_selected = self._get_mean_func(
dataset=dataset,
data=data,
events=data_mc_selected)
mean_pre_selected = self._cache_mean_pre_selected

# Calculate the actual number of background events for the selected
# events.
p_binomial = mean_selected / mean
with TaskTimer(tl, 'Get p array.'):
if self__pre_event_selection_method is None:
p = self._cache_mc_event_bkg_prob
else:
# Pre-selection.
p = self._cache_mc_event_bkg_prob_pre_selected / p_binomial
n_bkg_selected = int(np.around(n_bkg * p_binomial, 0))
n_bkg_selected = int(np.around(n_bkg * mean_pre_selected / mean, 0))

# Draw the actual background events from the selected events of the
# monto-carlo data set.
with TaskTimer(tl, 'Draw MC background indices.'):
bkg_event_indices = rss.random.choice(
data_mc_selected.indices,
size=n_bkg_selected,
p=p,
replace=(not self._unique_events))
bkg_event_indices = self._cache_random_choice(
rss=rss,
size=n_bkg_selected)

with TaskTimer(tl, 'Select MC background events from indices.'):
bkg_events = data_mc_selected[bkg_event_indices]
bkg_events = self._cache_mc[bkg_event_indices]

# Scramble the drawn MC events if requested.
if self._data_scrambler is not None:
Expand Down
166 changes: 165 additions & 1 deletion skyllh/core/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

import numpy as np

from skyllh.core.py import int_cast
from skyllh.core.py import (
classname,
int_cast,
)


class RandomStateService(
Expand Down Expand Up @@ -70,3 +73,164 @@ def reseed(self, seed):
'The seed argument must be None or castable to type int!',
allow_None=True)
self.random.seed(self._seed)


class RandomChoice(
object,
):
"""This class provides an efficient numpy.random.choice functionality
specialized for SkyLLH. The advantage is that it stores the cumulative
distribution function (CDF), which is assumed to be constant.
"""

def __init__(
self,
items,
probabilities,
**kwargs,
):
"""Creates a new instance of RandomChoice holding the probabilities
and their cumulative distribution function (CDF).

Parameters
----------
items : instance of numpy.ndarray
The (N,)-shaped numpy.ndarray holding the items from which to
choose.
probabilities : instance of numpy.ndarray
The (N,)-shaped numpy.ndarray holding the probability for each item.
"""
super().__init__(**kwargs)

self._assert_items(items)
self._items = items

self._assert_probabilities(probabilities, self._items.size)
self._probabilities = probabilities

# Create the cumulative distribution function (CDF).
self._cdf = self._probabilities.cumsum()
self._cdf /= self._cdf[-1]

@property
def items(self):
"""(read-only) The (N,)-shaped numpy.ndarray holding the items from
which to choose.
"""
return self._items

@property
def probabilities(self):
"""(read-only) The (N,)-shaped numpy.ndarray holding the propability
for each item.
"""
return self._probabilities

def _assert_items(
self,
items,
):
"""Checks for the correct type and shape of the items.

Parameters
----------
items : The (N,)-shaped numpy.ndarray holding the items from which to
choose.

Raises
------
TypeError
If the type of items is not numpy.ndarray.
ValueError
If the items do not have the correct type and shape.
"""
if not isinstance(items, np.ndarray):
raise TypeError(
'The items must be an instance of numpy.ndarray! '
f'Its current type is {classname(items)}!')

if items.ndim != 1:
raise ValueError(
'The items must be a 1-dimensional numpy.ndarray!')

def _assert_probabilities(
self,
p,
n_items,
):
"""Checks for correct values of the probabilities.

Parameters
----------
p : instance of numpy.ndarray
The (N,)-shaped numpy.ndarray holding the probability for each item.
n_items : int
The number of items.

Raises
------
ValueError
If the probabilities do not have the correct type, shape and values.
"""
atol = np.sqrt(np.finfo(np.float64).eps)
atol = max(atol, np.sqrt(np.finfo(p.dtype).eps))

if p.ndim != 1:
raise ValueError(
'The probabilities must be provided as a 1-dimensional '
'numpy.ndarray!')

if p.size != n_items:
raise ValueError(
f'The size ({p.size}) of the probabilities array must match '
f'the number of items ({n_items})!')

if np.any(p < 0):
raise ValueError(
'The probabilities must be greater or equal zero!')

if p.dtype is not np.dtype(np.float64):
raise TypeError(
'The probabilities must be of type numpy.float64!')

p_sum = np.sum(p)
if abs(p_sum - 1.) > atol:
raise ValueError(
f'The sum of the probabilities ({p_sum}) must be 1!')

def __call__(
self,
rss,
size,
):
"""Chooses ``size`` random items from ``self.items`` according to
``self.probabilities``.

Parameters
----------
rss : instance of RandomStateService
The instance of RandomStateService from which random numbers are
drawn from.
size : int
The number of items to draw.

Returns
-------
random_items : instance of numpy.ndarray
The (size,)-shaped numpy.ndarray holding the randomly selected items
from ``self.items``.
"""
uniform_values = rss.random.random(size)

# The np.searchsorted function is much faster when the values are
# sorted. But we want to keep the randomness of the returned items.
idxs_of_sort = np.argsort(uniform_values)
sorted_idxs = np.searchsorted(
self._cdf,
uniform_values[idxs_of_sort],
side='right')
idxs = np.empty_like(sorted_idxs)
idxs[idxs_of_sort] = sorted_idxs
random_items = self._items[idxs]

return random_items
Loading