diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 5139b7aa16..32b3fd60ae 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -18,6 +18,13 @@ v23.2.1 - The iminuit package is an optional tool now. +- Introduce class core.random.RandomChoice to replace numpy.random.choice with a + faster implementation avoiding the recalculation of the cumulative + distribution function (CDF) when called several times (trials) for the same + probabilities (dataset). This new class is used in background and signal + event generation from MC, resulting in a great performance increase when + generating many trials with many events. + v23.2.0 ======= - Complete overhaul of SkyLLH for more generic handling of parameters diff --git a/skyllh/core/background_generation.py b/skyllh/core/background_generation.py index 85b700ed3d..8f01f540d0 100644 --- a/skyllh/core/background_generation.py +++ b/skyllh/core/background_generation.py @@ -24,6 +24,9 @@ func_has_n_args, issequenceof, ) +from skyllh.core.random import ( + RandomChoice, +) from skyllh.core.scrambling import ( DataScrambler, ) @@ -123,7 +126,6 @@ def __init__( self, get_event_prob_func, get_mean_func=None, - unique_events=False, data_scrambler=None, mc_inplace_scrambling=False, keep_mc_data_fields=None, @@ -157,12 +159,8 @@ def __init__( of events, for which the mean number of background events should get calculated. This argument can be `None`, which means that the mean number of background events to generate needs to get specified - through the ``generate_events`` method. However, if an event + through the ``generate_events`` method. However, if a pre event selection method is provided, this argument cannot be ``None``! - unique_events : bool - Flag if unique events should be drawn from the monte-carlo - (``True``), or if events can be drawn several times (``False``). - Default is ``False``. data_scrambler : instance of DataScrambler | None If set to an instance of DataScrambler, the drawn monte-carlo background events will get scrambled. This can ensure more @@ -188,7 +186,6 @@ def __init__( self.get_event_prob_func = get_event_prob_func self.get_mean_func = get_mean_func - self.unique_events = unique_events self.data_scrambler = data_scrambler self.mc_inplace_scrambling = mc_inplace_scrambling self.keep_mc_data_field_names = keep_mc_data_fields @@ -202,10 +199,11 @@ def __init__( # Define cache members to cache the background probabilities for each # monte-carlo event. The probabilities change only if the data changes. self._cache_data_id = None - self._cache_mc_pre_selected = None + self._cache_mc = None self._cache_mc_event_bkg_prob = None - self._cache_mc_event_bkg_prob_pre_selected = None self._cache_mean = None + self._cache_mean_pre_selected = None + self._cache_random_choice = None @property def get_event_prob_func(self): @@ -251,21 +249,6 @@ def get_mean_func(self, func): 'have 3 arguments!') self._get_mean_func = func - @property - def unique_events(self): - """Flag if unique events should be drawn from the monto-carlo (True), - or if the same event can be drawn multiple times from the monte-carlo. - """ - return self._unique_events - - @unique_events.setter - def unique_events(self, b): - if not isinstance(b, bool): - raise TypeError( - 'The unique_events property must be of type bool! ' - f'Its current type is {classname(b)}.') - self._unique_events = b - @property def data_scrambler(self): """The DataScrambler instance that should be used to scramble the drawn @@ -453,29 +436,38 @@ def generate_events( data=data, events=data_mc) - with TaskTimer( - tl, - 'Calculate MC background event probability cache.'): - self._cache_mc_event_bkg_prob = self._get_event_prob_func( - dataset=dataset, - data=data, - events=data_mc) - if self__pre_event_selection_method is not None: with TaskTimer( tl, 'Pre-select MC events.'): - (self._cache_mc_pre_selected, - mc_pre_selected_src_evt_idxs, - mc_pre_selected_idxs) =\ + (self._cache_mc, _) =\ self__pre_event_selection_method.select_events( events=data_mc, - ret_original_evt_idxs=True, + ret_original_evt_idxs=False, tl=tl) - self._cache_mc_event_bkg_prob_pre_selected = np.take( - self._cache_mc_event_bkg_prob, mc_pre_selected_idxs) + + with TaskTimer(tl, 'Calculate selected MC background mean.'): + self._cache_mean_pre_selected = self._get_mean_func( + dataset=dataset, + data=data, + events=self._cache_mc) else: - self._cache_mc_pre_selected = data_mc + self._cache_mc = data_mc + + with TaskTimer( + tl, + 'Calculate MC background event probability cache.'): + self._cache_mc_event_bkg_prob = self._get_event_prob_func( + dataset=dataset, + data=data, + events=self._cache_mc) + + with TaskTimer( + tl, + 'Create RandomChoice for MC background events.'): + self._cache_random_choice = RandomChoice( + items=self._cache_mc.indices, + probabilities=self._cache_mc_event_bkg_prob) if mean is None: if self._cache_mean is None: @@ -496,42 +488,27 @@ def generate_events( n_bkg = (int(rss.random.poisson(mean)) if poisson else int(np.round(mean, 0))) - # Apply only event pre-selection before choosing events. - data_mc_selected = self._cache_mc_pre_selected - # Calculate the mean number of background events for the pre-selected # MC events. if self__pre_event_selection_method is None: # No selection at all, use the total mean. - mean_selected = mean + mean_pre_selected = mean else: - with TaskTimer(tl, 'Calculate selected MC background mean.'): - mean_selected = self._get_mean_func( - dataset=dataset, - data=data, - events=data_mc_selected) + mean_pre_selected = self._cache_mean_pre_selected # Calculate the actual number of background events for the selected # events. - p_binomial = mean_selected / mean - with TaskTimer(tl, 'Get p array.'): - if self__pre_event_selection_method is None: - p = self._cache_mc_event_bkg_prob - else: - # Pre-selection. - p = self._cache_mc_event_bkg_prob_pre_selected / p_binomial - n_bkg_selected = int(np.around(n_bkg * p_binomial, 0)) + n_bkg_selected = int(np.around(n_bkg * mean_pre_selected / mean, 0)) # Draw the actual background events from the selected events of the # monto-carlo data set. with TaskTimer(tl, 'Draw MC background indices.'): - bkg_event_indices = rss.random.choice( - data_mc_selected.indices, - size=n_bkg_selected, - p=p, - replace=(not self._unique_events)) + bkg_event_indices = self._cache_random_choice( + rss=rss, + size=n_bkg_selected) + with TaskTimer(tl, 'Select MC background events from indices.'): - bkg_events = data_mc_selected[bkg_event_indices] + bkg_events = self._cache_mc[bkg_event_indices] # Scramble the drawn MC events if requested. if self._data_scrambler is not None: diff --git a/skyllh/core/random.py b/skyllh/core/random.py index edf55c4cc7..227a598b9f 100644 --- a/skyllh/core/random.py +++ b/skyllh/core/random.py @@ -2,7 +2,10 @@ import numpy as np -from skyllh.core.py import int_cast +from skyllh.core.py import ( + classname, + int_cast, +) class RandomStateService( @@ -70,3 +73,164 @@ def reseed(self, seed): 'The seed argument must be None or castable to type int!', allow_None=True) self.random.seed(self._seed) + + +class RandomChoice( + object, +): + """This class provides an efficient numpy.random.choice functionality + specialized for SkyLLH. The advantage is that it stores the cumulative + distribution function (CDF), which is assumed to be constant. + """ + + def __init__( + self, + items, + probabilities, + **kwargs, + ): + """Creates a new instance of RandomChoice holding the probabilities + and their cumulative distribution function (CDF). + + Parameters + ---------- + items : instance of numpy.ndarray + The (N,)-shaped numpy.ndarray holding the items from which to + choose. + probabilities : instance of numpy.ndarray + The (N,)-shaped numpy.ndarray holding the probability for each item. + """ + super().__init__(**kwargs) + + self._assert_items(items) + self._items = items + + self._assert_probabilities(probabilities, self._items.size) + self._probabilities = probabilities + + # Create the cumulative distribution function (CDF). + self._cdf = self._probabilities.cumsum() + self._cdf /= self._cdf[-1] + + @property + def items(self): + """(read-only) The (N,)-shaped numpy.ndarray holding the items from + which to choose. + """ + return self._items + + @property + def probabilities(self): + """(read-only) The (N,)-shaped numpy.ndarray holding the propability + for each item. + """ + return self._probabilities + + def _assert_items( + self, + items, + ): + """Checks for the correct type and shape of the items. + + Parameters + ---------- + items : The (N,)-shaped numpy.ndarray holding the items from which to + choose. + + Raises + ------ + TypeError + If the type of items is not numpy.ndarray. + ValueError + If the items do not have the correct type and shape. + """ + if not isinstance(items, np.ndarray): + raise TypeError( + 'The items must be an instance of numpy.ndarray! ' + f'Its current type is {classname(items)}!') + + if items.ndim != 1: + raise ValueError( + 'The items must be a 1-dimensional numpy.ndarray!') + + def _assert_probabilities( + self, + p, + n_items, + ): + """Checks for correct values of the probabilities. + + Parameters + ---------- + p : instance of numpy.ndarray + The (N,)-shaped numpy.ndarray holding the probability for each item. + n_items : int + The number of items. + + Raises + ------ + ValueError + If the probabilities do not have the correct type, shape and values. + """ + atol = np.sqrt(np.finfo(np.float64).eps) + atol = max(atol, np.sqrt(np.finfo(p.dtype).eps)) + + if p.ndim != 1: + raise ValueError( + 'The probabilities must be provided as a 1-dimensional ' + 'numpy.ndarray!') + + if p.size != n_items: + raise ValueError( + f'The size ({p.size}) of the probabilities array must match ' + f'the number of items ({n_items})!') + + if np.any(p < 0): + raise ValueError( + 'The probabilities must be greater or equal zero!') + + if p.dtype is not np.dtype(np.float64): + raise TypeError( + 'The probabilities must be of type numpy.float64!') + + p_sum = np.sum(p) + if abs(p_sum - 1.) > atol: + raise ValueError( + f'The sum of the probabilities ({p_sum}) must be 1!') + + def __call__( + self, + rss, + size, + ): + """Chooses ``size`` random items from ``self.items`` according to + ``self.probabilities``. + + Parameters + ---------- + rss : instance of RandomStateService + The instance of RandomStateService from which random numbers are + drawn from. + size : int + The number of items to draw. + + Returns + ------- + random_items : instance of numpy.ndarray + The (size,)-shaped numpy.ndarray holding the randomly selected items + from ``self.items``. + """ + uniform_values = rss.random.random(size) + + # The np.searchsorted function is much faster when the values are + # sorted. But we want to keep the randomness of the returned items. + idxs_of_sort = np.argsort(uniform_values) + sorted_idxs = np.searchsorted( + self._cdf, + uniform_values[idxs_of_sort], + side='right') + idxs = np.empty_like(sorted_idxs) + idxs[idxs_of_sort] = sorted_idxs + random_items = self._items[idxs] + + return random_items diff --git a/skyllh/core/signal_generator.py b/skyllh/core/signal_generator.py index f7025e392e..be1a267a0a 100644 --- a/skyllh/core/signal_generator.py +++ b/skyllh/core/signal_generator.py @@ -25,6 +25,9 @@ int_cast, get_smallest_numpy_int_type, ) +from skyllh.core.random import ( + RandomChoice, +) from skyllh.core.services import ( DatasetSignalWeightFactorsService, ) @@ -521,6 +524,11 @@ def _construct_signal_candidates(self): self._sig_candidates_weight_sum = np.sum(self._sig_candidates['weight']) self._sig_candidates['weight'] /= self._sig_candidates_weight_sum + # Create new RandomChoice instance for the signal candidates. + self._sig_candidates_random_choice = RandomChoice( + items=self._sig_candidates, + probabilities=self._sig_candidates['weight']) + def change_shg_mgr( self, shg_mgr): @@ -642,11 +650,11 @@ def generate_signal_events( 'The mean argument must be castable to type of int!') # Draw n_signal signal candidates according to their weight. - sig_events_meta = rss.random.choice( - self._sig_candidates, + sig_events_meta = self._sig_candidates_random_choice( + rss=rss, size=n_signal, - p=self._sig_candidates['weight'] ) + # Get the list of unique dataset and source hypothesis group indices of # the drawn signal events. # Note: This code does not assume the same format for each of the diff --git a/tests/core/test_random.py b/tests/core/test_random.py new file mode 100644 index 0000000000..9687fca81b --- /dev/null +++ b/tests/core/test_random.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- + +import unittest + +import numpy as np + +from skyllh.core.random import ( + RandomChoice, + RandomStateService, +) + + +class RandomChoice_TestCase( + unittest.TestCase, +): + def setUp(self): + self.size = 100 + self.items = np.arange(self.size) + self.probs = np.random.uniform(low=0, high=1, size=self.size) + self.probs /= np.sum(self.probs) + + def test_choice(self): + rss = RandomStateService(seed=1) + np_items = rss.random.choice( + self.items, + size=5, + replace=True, + p=self.probs) + + rss = RandomStateService(seed=1) + random_choice = RandomChoice( + items=self.items, + probabilities=self.probs) + rc_items = random_choice( + rss=rss, + size=5) + + np.testing.assert_equal(np_items, rc_items) + + +if __name__ == '__main__': + unittest.main()