From 4a2f083b2517fdcd562a66d03b4e5804e104e417 Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Fri, 1 Sep 2023 11:14:04 +0200 Subject: [PATCH 01/12] Introduce efficient random.choice class --- skyllh/core/random.py | 153 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 1 deletion(-) diff --git a/skyllh/core/random.py b/skyllh/core/random.py index edf55c4cc7..05ad988408 100644 --- a/skyllh/core/random.py +++ b/skyllh/core/random.py @@ -2,7 +2,10 @@ import numpy as np -from skyllh.core.py import int_cast +from skyllh.core.py import ( + classname, + int_cast, +) class RandomStateService( @@ -70,3 +73,151 @@ def reseed(self, seed): 'The seed argument must be None or castable to type int!', allow_None=True) self.random.seed(self._seed) + + +class RandomChoice( + object, +): + """This class provides an efficient numpy.random.choice functionality + specialized for SkyLLH. The advantage is that it stores the cumulative + distribution function (CDF), which is assumed to be constant. + """ + + def __init__( + self, + items, + probabilities, + **kwargs, + ): + """Creates a new instance of RandomChoice holding the probabilities + and their cumulative distribution function (CDF). + + Parameters + ---------- + items : instance of numpy.ndarray + The (N,)-shaped numpy.ndarray holding the items from which to + choose. + probabilities : instance of numpy.ndarray + The (N,)-shaped numpy.ndarray holding the probability for each item. + """ + super().__init__(**kwargs) + + self._assert_items(items) + self._items = items + + self._assert_probabilitie(probabilities, self._items.size) + self._probabilities = probabilities + + # Create the cumulative distribution function (CDF). + self._cdf = self._probabilities.cumsum() + self._cdf /= self._cdf[-1] + + @property + def items(self): + """(read-only) The (N,)-shaped numpy.ndarray holding the items from + which to choose. + """ + return self._items + + @property + def probabilities(self): + """(read-only) The (N,)-shaped numpy.ndarray holding the propability + for each item. + """ + return self._probabilities + + def _assert_items( + self, + items, + ): + """Checks for the correct type and shape of the items. + + Parameters + ---------- + items : The (N,)-shaped numpy.ndarray holding the items from which to + choose. + + Raises + ------ + TypeError + If the type of items is not numpy.ndarray. + ValueError + If the items do not have the correct type and shape. + """ + if not isinstance(items, np.ndarray): + raise TypeError( + 'The items must be an instance of numpy.ndarray! ' + f'Its current type is {classname(items)}!') + + if items.ndim != 1: + raise ValueError( + 'The items must be a 1-dimensional numpy.ndarray!') + + def _assert_probabilitie( + self, + p, + n_items, + ): + """Checks for correct values of the probabilities. + + Parameters + ---------- + p : instance of numpy.ndarray + The (N,)-shaped numpy.ndarray holding the probability for each item. + n_items : int + The number of items. + + Raises + ------ + ValueError + If the probabilities do not have the correct type, shape and values. + """ + atol = np.sqrt(np.finfo(np.float64).eps) + atol = max(atol, np.sqrt(np.finfo(p.dtype).eps)) + + if p.ndim != 1: + raise ValueError( + 'The probabilities must be provided as a 1-dimensional ' + 'numpy.ndarray!') + + if p.size != n_items: + raise ValueError( + f'The size ({p.size}) of the probabilities array must match ' + f'the number of items ({n_items})!') + + if np.any(p < 0): + raise ValueError( + 'The probabilities must be greater or equal zero!') + + p_sum = np.sum(p) + if abs(p_sum - 1.) > atol: + raise ValueError( + f'The sum of the probabilities ({p_sum}) must be 1!') + + def __call__( + self, + rss, + size, + ): + """Chooses ``size`` random items from ``self.items`` according to + ``self.probabilities``. + + Parameters + ---------- + rss : instance of RandomStateService + The instance of RandomStateService from which random numbers are + drawn from. + size : int + The number of items to draw. + + Returns + ------- + random_items : instance of numpy.ndarray + The (size,)-shaped numpy.ndarray holding the randomly selected items + from ``self.items``. + """ + uniform_samples = rss.random.random(size) + idxs = np.searchsorted(self._cdf, uniform_samples, side='right') + random_items = self._items[idxs] + + return random_items From 6009f0bfdefe154ae63a86ffc5aade72212f08e6 Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Fri, 1 Sep 2023 11:14:49 +0200 Subject: [PATCH 02/12] Use new RandomChoice for signal generation --- skyllh/core/signal_generator.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/skyllh/core/signal_generator.py b/skyllh/core/signal_generator.py index f7025e392e..be1a267a0a 100644 --- a/skyllh/core/signal_generator.py +++ b/skyllh/core/signal_generator.py @@ -25,6 +25,9 @@ int_cast, get_smallest_numpy_int_type, ) +from skyllh.core.random import ( + RandomChoice, +) from skyllh.core.services import ( DatasetSignalWeightFactorsService, ) @@ -521,6 +524,11 @@ def _construct_signal_candidates(self): self._sig_candidates_weight_sum = np.sum(self._sig_candidates['weight']) self._sig_candidates['weight'] /= self._sig_candidates_weight_sum + # Create new RandomChoice instance for the signal candidates. + self._sig_candidates_random_choice = RandomChoice( + items=self._sig_candidates, + probabilities=self._sig_candidates['weight']) + def change_shg_mgr( self, shg_mgr): @@ -642,11 +650,11 @@ def generate_signal_events( 'The mean argument must be castable to type of int!') # Draw n_signal signal candidates according to their weight. - sig_events_meta = rss.random.choice( - self._sig_candidates, + sig_events_meta = self._sig_candidates_random_choice( + rss=rss, size=n_signal, - p=self._sig_candidates['weight'] ) + # Get the list of unique dataset and source hypothesis group indices of # the drawn signal events. # Note: This code does not assume the same format for each of the From 98f0fa3013e111051be9b87c8c822fdbcf09fbac Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Fri, 1 Sep 2023 16:35:58 +0200 Subject: [PATCH 03/12] Simplify background event generation --- skyllh/core/background_generation.py | 63 ++++++++++++---------------- 1 file changed, 26 insertions(+), 37 deletions(-) diff --git a/skyllh/core/background_generation.py b/skyllh/core/background_generation.py index 85b700ed3d..035b79616e 100644 --- a/skyllh/core/background_generation.py +++ b/skyllh/core/background_generation.py @@ -157,7 +157,7 @@ def __init__( of events, for which the mean number of background events should get calculated. This argument can be `None`, which means that the mean number of background events to generate needs to get specified - through the ``generate_events`` method. However, if an event + through the ``generate_events`` method. However, if a pre event selection method is provided, this argument cannot be ``None``! unique_events : bool Flag if unique events should be drawn from the monte-carlo @@ -202,10 +202,10 @@ def __init__( # Define cache members to cache the background probabilities for each # monte-carlo event. The probabilities change only if the data changes. self._cache_data_id = None - self._cache_mc_pre_selected = None + self._cache_mc = None self._cache_mc_event_bkg_prob = None - self._cache_mc_event_bkg_prob_pre_selected = None self._cache_mean = None + self._cache_mean_pre_selected = None @property def get_event_prob_func(self): @@ -453,29 +453,31 @@ def generate_events( data=data, events=data_mc) - with TaskTimer( - tl, - 'Calculate MC background event probability cache.'): - self._cache_mc_event_bkg_prob = self._get_event_prob_func( - dataset=dataset, - data=data, - events=data_mc) - if self__pre_event_selection_method is not None: with TaskTimer( tl, 'Pre-select MC events.'): - (self._cache_mc_pre_selected, - mc_pre_selected_src_evt_idxs, - mc_pre_selected_idxs) =\ + (self._cache_mc, _) =\ self__pre_event_selection_method.select_events( events=data_mc, - ret_original_evt_idxs=True, + ret_original_evt_idxs=False, tl=tl) - self._cache_mc_event_bkg_prob_pre_selected = np.take( - self._cache_mc_event_bkg_prob, mc_pre_selected_idxs) + + with TaskTimer(tl, 'Calculate selected MC background mean.'): + self._cache_mean_pre_selected = self._get_mean_func( + dataset=dataset, + data=data, + events=self._cache_mc) else: - self._cache_mc_pre_selected = data_mc + self._cache_mc = data_mc + + with TaskTimer( + tl, + 'Calculate MC background event probability cache.'): + self._cache_mc_event_bkg_prob = self._get_event_prob_func( + dataset=dataset, + data=data, + events=self._cache_mc) if mean is None: if self._cache_mean is None: @@ -496,42 +498,29 @@ def generate_events( n_bkg = (int(rss.random.poisson(mean)) if poisson else int(np.round(mean, 0))) - # Apply only event pre-selection before choosing events. - data_mc_selected = self._cache_mc_pre_selected - # Calculate the mean number of background events for the pre-selected # MC events. if self__pre_event_selection_method is None: # No selection at all, use the total mean. - mean_selected = mean + mean_pre_selected = mean else: - with TaskTimer(tl, 'Calculate selected MC background mean.'): - mean_selected = self._get_mean_func( - dataset=dataset, - data=data, - events=data_mc_selected) + mean_pre_selected = self._cache_mean_pre_selected # Calculate the actual number of background events for the selected # events. - p_binomial = mean_selected / mean - with TaskTimer(tl, 'Get p array.'): - if self__pre_event_selection_method is None: - p = self._cache_mc_event_bkg_prob - else: - # Pre-selection. - p = self._cache_mc_event_bkg_prob_pre_selected / p_binomial + p_binomial = mean_pre_selected / mean n_bkg_selected = int(np.around(n_bkg * p_binomial, 0)) # Draw the actual background events from the selected events of the # monto-carlo data set. with TaskTimer(tl, 'Draw MC background indices.'): bkg_event_indices = rss.random.choice( - data_mc_selected.indices, + self._cache_mc.indices, size=n_bkg_selected, - p=p, + p=self._cache_mc_event_bkg_prob, replace=(not self._unique_events)) with TaskTimer(tl, 'Select MC background events from indices.'): - bkg_events = data_mc_selected[bkg_event_indices] + bkg_events = self._cache_mc[bkg_event_indices] # Scramble the drawn MC events if requested. if self._data_scrambler is not None: From 5fabe10e617b89f729c2db72c4cfa72cf6847e93 Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Fri, 1 Sep 2023 17:32:36 +0200 Subject: [PATCH 04/12] Add unit test for RandomChoice --- tests/core/test_random.py | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tests/core/test_random.py diff --git a/tests/core/test_random.py b/tests/core/test_random.py new file mode 100644 index 0000000000..9687fca81b --- /dev/null +++ b/tests/core/test_random.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- + +import unittest + +import numpy as np + +from skyllh.core.random import ( + RandomChoice, + RandomStateService, +) + + +class RandomChoice_TestCase( + unittest.TestCase, +): + def setUp(self): + self.size = 100 + self.items = np.arange(self.size) + self.probs = np.random.uniform(low=0, high=1, size=self.size) + self.probs /= np.sum(self.probs) + + def test_choice(self): + rss = RandomStateService(seed=1) + np_items = rss.random.choice( + self.items, + size=5, + replace=True, + p=self.probs) + + rss = RandomStateService(seed=1) + random_choice = RandomChoice( + items=self.items, + probabilities=self.probs) + rc_items = random_choice( + rss=rss, + size=5) + + np.testing.assert_equal(np_items, rc_items) + + +if __name__ == '__main__': + unittest.main() From db534587a538ce7576962a3b14c74f372526293b Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Fri, 1 Sep 2023 18:32:17 +0200 Subject: [PATCH 05/12] Check for float64 data type --- skyllh/core/random.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skyllh/core/random.py b/skyllh/core/random.py index 05ad988408..39efcbf245 100644 --- a/skyllh/core/random.py +++ b/skyllh/core/random.py @@ -189,6 +189,10 @@ def _assert_probabilitie( raise ValueError( 'The probabilities must be greater or equal zero!') + if p.dtype is not np.dtype(np.float64): + raise TypeError( + 'The probabilities must be of type numpy.float64!') + p_sum = np.sum(p) if abs(p_sum - 1.) > atol: raise ValueError( From 717cc2b489b1a72ce641fa73a9012de67f147403 Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Fri, 1 Sep 2023 18:33:23 +0200 Subject: [PATCH 06/12] Use RandomChoice --- skyllh/core/background_generation.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/skyllh/core/background_generation.py b/skyllh/core/background_generation.py index 035b79616e..fdf1b25c0e 100644 --- a/skyllh/core/background_generation.py +++ b/skyllh/core/background_generation.py @@ -24,6 +24,9 @@ func_has_n_args, issequenceof, ) +from skyllh.core.random import ( + RandomChoice, +) from skyllh.core.scrambling import ( DataScrambler, ) @@ -206,6 +209,7 @@ def __init__( self._cache_mc_event_bkg_prob = None self._cache_mean = None self._cache_mean_pre_selected = None + self._cache_random_choice = None @property def get_event_prob_func(self): @@ -479,6 +483,13 @@ def generate_events( data=data, events=self._cache_mc) + with TaskTimer( + tl, + 'Create RandomChoice for MC background events.'): + self._cache_random_choice = RandomChoice( + items=self._cache_mc.indices, + probabilities=self._cache_mc_event_bkg_prob) + if mean is None: if self._cache_mean is None: raise ValueError( @@ -508,17 +519,15 @@ def generate_events( # Calculate the actual number of background events for the selected # events. - p_binomial = mean_pre_selected / mean - n_bkg_selected = int(np.around(n_bkg * p_binomial, 0)) + n_bkg_selected = int(np.around(n_bkg * mean_pre_selected / mean, 0)) # Draw the actual background events from the selected events of the # monto-carlo data set. with TaskTimer(tl, 'Draw MC background indices.'): - bkg_event_indices = rss.random.choice( - self._cache_mc.indices, - size=n_bkg_selected, - p=self._cache_mc_event_bkg_prob, - replace=(not self._unique_events)) + bkg_event_indices = self._cache_random_choice( + rss=rss, + size=n_bkg_selected) + with TaskTimer(tl, 'Select MC background events from indices.'): bkg_events = self._cache_mc[bkg_event_indices] From b06a4b56ddd14fa0391a62bc2bfd324aa24b88ce Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Sat, 2 Sep 2023 17:23:25 +0200 Subject: [PATCH 07/12] Sorting the uniform values makes searchsorted much faster --- skyllh/core/random.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skyllh/core/random.py b/skyllh/core/random.py index 39efcbf245..e4cdc248e0 100644 --- a/skyllh/core/random.py +++ b/skyllh/core/random.py @@ -221,6 +221,7 @@ def __call__( from ``self.items``. """ uniform_samples = rss.random.random(size) + uniform_samples.sort() idxs = np.searchsorted(self._cdf, uniform_samples, side='right') random_items = self._items[idxs] From a8d9dbca9b82cb72f9081789da386ba1ae5d6db2 Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Sat, 2 Sep 2023 18:54:48 +0200 Subject: [PATCH 08/12] Keep the randomness of the returned items --- skyllh/core/random.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/skyllh/core/random.py b/skyllh/core/random.py index e4cdc248e0..a6dfdcbd97 100644 --- a/skyllh/core/random.py +++ b/skyllh/core/random.py @@ -220,9 +220,17 @@ def __call__( The (size,)-shaped numpy.ndarray holding the randomly selected items from ``self.items``. """ - uniform_samples = rss.random.random(size) - uniform_samples.sort() - idxs = np.searchsorted(self._cdf, uniform_samples, side='right') + uniform_values = rss.random.random(size) + + # The np.searchsorted function is much faster when the values are + # sorted. But we want to keep the randomness of the returned items. + idxs_of_sort = np.argsort(uniform_values) + sorted_idxs = np.searchsorted( + self._cdf, + uniform_values[idxs_of_sort], + side='right') + idxs = np.empty_like(sorted_idxs) + idxs[idxs_of_sort] = sorted_idxs random_items = self._items[idxs] return random_items From 6a9351060a326aa2694daa2180679ef064ab4843 Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Mon, 4 Sep 2023 10:29:43 +0200 Subject: [PATCH 09/12] Update CHANGELOG --- CHANGELOG.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 5139b7aa16..32b3fd60ae 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -18,6 +18,13 @@ v23.2.1 - The iminuit package is an optional tool now. +- Introduce class core.random.RandomChoice to replace numpy.random.choice with a + faster implementation avoiding the recalculation of the cumulative + distribution function (CDF) when called several times (trials) for the same + probabilities (dataset). This new class is used in background and signal + event generation from MC, resulting in a great performance increase when + generating many trials with many events. + v23.2.0 ======= - Complete overhaul of SkyLLH for more generic handling of parameters From dd1c07740e100d73f01502cdf21a50127cd343bc Mon Sep 17 00:00:00 2001 From: Martin Wolf Date: Mon, 4 Sep 2023 11:28:38 +0200 Subject: [PATCH 10/12] Remove obsolete option --- skyllh/core/background_generation.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/skyllh/core/background_generation.py b/skyllh/core/background_generation.py index fdf1b25c0e..8f01f540d0 100644 --- a/skyllh/core/background_generation.py +++ b/skyllh/core/background_generation.py @@ -126,7 +126,6 @@ def __init__( self, get_event_prob_func, get_mean_func=None, - unique_events=False, data_scrambler=None, mc_inplace_scrambling=False, keep_mc_data_fields=None, @@ -162,10 +161,6 @@ def __init__( number of background events to generate needs to get specified through the ``generate_events`` method. However, if a pre event selection method is provided, this argument cannot be ``None``! - unique_events : bool - Flag if unique events should be drawn from the monte-carlo - (``True``), or if events can be drawn several times (``False``). - Default is ``False``. data_scrambler : instance of DataScrambler | None If set to an instance of DataScrambler, the drawn monte-carlo background events will get scrambled. This can ensure more @@ -191,7 +186,6 @@ def __init__( self.get_event_prob_func = get_event_prob_func self.get_mean_func = get_mean_func - self.unique_events = unique_events self.data_scrambler = data_scrambler self.mc_inplace_scrambling = mc_inplace_scrambling self.keep_mc_data_field_names = keep_mc_data_fields @@ -255,21 +249,6 @@ def get_mean_func(self, func): 'have 3 arguments!') self._get_mean_func = func - @property - def unique_events(self): - """Flag if unique events should be drawn from the monto-carlo (True), - or if the same event can be drawn multiple times from the monte-carlo. - """ - return self._unique_events - - @unique_events.setter - def unique_events(self, b): - if not isinstance(b, bool): - raise TypeError( - 'The unique_events property must be of type bool! ' - f'Its current type is {classname(b)}.') - self._unique_events = b - @property def data_scrambler(self): """The DataScrambler instance that should be used to scramble the drawn From d08488864c8aa8cd552bb0a4794d5c7d2a078ea0 Mon Sep 17 00:00:00 2001 From: "Martin Wolf, PhD" Date: Mon, 4 Sep 2023 11:35:52 +0200 Subject: [PATCH 11/12] Update skyllh/core/random.py Co-authored-by: Tomas Kontrimas <52071038+tomaskontrimas@users.noreply.github.com> --- skyllh/core/random.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skyllh/core/random.py b/skyllh/core/random.py index a6dfdcbd97..cb06dcf1ac 100644 --- a/skyllh/core/random.py +++ b/skyllh/core/random.py @@ -153,7 +153,7 @@ def _assert_items( raise ValueError( 'The items must be a 1-dimensional numpy.ndarray!') - def _assert_probabilitie( + def _assert_probabilities( self, p, n_items, From 79f84dca4b543a12be3a53e0d28db223817d5cf0 Mon Sep 17 00:00:00 2001 From: "Martin Wolf, PhD" Date: Mon, 4 Sep 2023 11:35:59 +0200 Subject: [PATCH 12/12] Update skyllh/core/random.py Co-authored-by: Tomas Kontrimas <52071038+tomaskontrimas@users.noreply.github.com> --- skyllh/core/random.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skyllh/core/random.py b/skyllh/core/random.py index cb06dcf1ac..227a598b9f 100644 --- a/skyllh/core/random.py +++ b/skyllh/core/random.py @@ -105,7 +105,7 @@ def __init__( self._assert_items(items) self._items = items - self._assert_probabilitie(probabilities, self._items.size) + self._assert_probabilities(probabilities, self._items.size) self._probabilities = probabilities # Create the cumulative distribution function (CDF).