icecube · martwo · Sep 4, 2023 · Sep 1, 2023 · Sep 1, 2023 · Sep 1, 2023
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -18,6 +18,13 @@ v23.2.1
 
 - The iminuit package is an optional tool now.
 
+- Introduce class core.random.RandomChoice to replace numpy.random.choice with a
+  faster implementation avoiding the recalculation of the cumulative
+  distribution function (CDF) when called several times (trials) for the same
+  probabilities (dataset). This new class is used in background and signal
+  event generation from MC, resulting in a great performance increase when
+  generating many trials with many events.
+
 v23.2.0
 =======
 - Complete overhaul of SkyLLH for more generic handling of parameters

diff --git a/skyllh/core/background_generation.py b/skyllh/core/background_generation.py
@@ -24,6 +24,9 @@
     func_has_n_args,
     issequenceof,
 )
+from skyllh.core.random import (
+    RandomChoice,
+)
 from skyllh.core.scrambling import (
     DataScrambler,
 )
@@ -123,7 +126,6 @@ def __init__(
             self,
             get_event_prob_func,
             get_mean_func=None,
-            unique_events=False,
             data_scrambler=None,
             mc_inplace_scrambling=False,
             keep_mc_data_fields=None,
@@ -157,12 +159,8 @@ def __init__(
             of events, for which the mean number of background events should get
             calculated. This argument can be `None`, which means that the mean
             number of background events to generate needs to get specified
-            through the ``generate_events`` method. However, if an event
+            through the ``generate_events`` method. However, if a pre event
             selection method is provided, this argument cannot be ``None``!
-        unique_events : bool
-            Flag if unique events should be drawn from the monte-carlo
-            (``True``), or if events can be drawn several times (``False``).
-            Default is ``False``.
         data_scrambler : instance of DataScrambler | None
             If set to an instance of DataScrambler, the drawn monte-carlo
             background events will get scrambled. This can ensure more
@@ -188,7 +186,6 @@ def __init__(
 
         self.get_event_prob_func = get_event_prob_func
         self.get_mean_func = get_mean_func
-        self.unique_events = unique_events
         self.data_scrambler = data_scrambler
         self.mc_inplace_scrambling = mc_inplace_scrambling
         self.keep_mc_data_field_names = keep_mc_data_fields
@@ -202,10 +199,11 @@ def __init__(
         # Define cache members to cache the background probabilities for each
         # monte-carlo event. The probabilities change only if the data changes.
         self._cache_data_id = None
-        self._cache_mc_pre_selected = None
+        self._cache_mc = None
         self._cache_mc_event_bkg_prob = None
-        self._cache_mc_event_bkg_prob_pre_selected = None
         self._cache_mean = None
+        self._cache_mean_pre_selected = None
+        self._cache_random_choice = None
 
     @property
     def get_event_prob_func(self):
@@ -251,21 +249,6 @@ def get_mean_func(self, func):
                 'have 3 arguments!')
         self._get_mean_func = func
 
-    @property
-    def unique_events(self):
-        """Flag if unique events should be drawn from the monto-carlo (True),
-        or if the same event can be drawn multiple times from the monte-carlo.
-        """
-        return self._unique_events
-
-    @unique_events.setter
-    def unique_events(self, b):
-        if not isinstance(b, bool):
-            raise TypeError(
-                'The unique_events property must be of type bool! '
-                f'Its current type is {classname(b)}.')
-        self._unique_events = b
-
     @property
     def data_scrambler(self):
         """The DataScrambler instance that should be used to scramble the drawn
@@ -453,29 +436,38 @@ def generate_events(
                         data=data,
                         events=data_mc)
 
-            with TaskTimer(
-                    tl,
-                    'Calculate MC background event probability cache.'):
-                self._cache_mc_event_bkg_prob = self._get_event_prob_func(
-                    dataset=dataset,
-                    data=data,
-                    events=data_mc)
-
             if self__pre_event_selection_method is not None:
                 with TaskTimer(
                         tl,
                         'Pre-select MC events.'):
-                    (self._cache_mc_pre_selected,
-                     mc_pre_selected_src_evt_idxs,
-                     mc_pre_selected_idxs) =\
+                    (self._cache_mc, _) =\
                         self__pre_event_selection_method.select_events(
                             events=data_mc,
-                            ret_original_evt_idxs=True,
+                            ret_original_evt_idxs=False,
                             tl=tl)
-                self._cache_mc_event_bkg_prob_pre_selected = np.take(
-                    self._cache_mc_event_bkg_prob, mc_pre_selected_idxs)
+
+                with TaskTimer(tl, 'Calculate selected MC background mean.'):
+                    self._cache_mean_pre_selected = self._get_mean_func(
+                        dataset=dataset,
+                        data=data,
+                        events=self._cache_mc)
             else:
-                self._cache_mc_pre_selected = data_mc
+                self._cache_mc = data_mc
+
+            with TaskTimer(
+                    tl,
+                    'Calculate MC background event probability cache.'):
+                self._cache_mc_event_bkg_prob = self._get_event_prob_func(
+                    dataset=dataset,
+                    data=data,
+                    events=self._cache_mc)
+
+            with TaskTimer(
+                    tl,
+                    'Create RandomChoice for MC background events.'):
+                self._cache_random_choice = RandomChoice(
+                    items=self._cache_mc.indices,
+                    probabilities=self._cache_mc_event_bkg_prob)
 
         if mean is None:
             if self._cache_mean is None:
@@ -496,42 +488,27 @@ def generate_events(
         n_bkg = (int(rss.random.poisson(mean)) if poisson else
                  int(np.round(mean, 0)))
 
-        # Apply only event pre-selection before choosing events.
-        data_mc_selected = self._cache_mc_pre_selected
-
         # Calculate the mean number of background events for the pre-selected
         # MC events.
         if self__pre_event_selection_method is None:
             # No selection at all, use the total mean.
-            mean_selected = mean
+            mean_pre_selected = mean
         else:
-            with TaskTimer(tl, 'Calculate selected MC background mean.'):
-                mean_selected = self._get_mean_func(
-                    dataset=dataset,
-                    data=data,
-                    events=data_mc_selected)
+            mean_pre_selected = self._cache_mean_pre_selected
 
         # Calculate the actual number of background events for the selected
         # events.
-        p_binomial = mean_selected / mean
-        with TaskTimer(tl, 'Get p array.'):
-            if self__pre_event_selection_method is None:
-                p = self._cache_mc_event_bkg_prob
-            else:
-                # Pre-selection.
-                p = self._cache_mc_event_bkg_prob_pre_selected / p_binomial
-        n_bkg_selected = int(np.around(n_bkg * p_binomial, 0))
+        n_bkg_selected = int(np.around(n_bkg * mean_pre_selected / mean, 0))
 
         # Draw the actual background events from the selected events of the
         # monto-carlo data set.
         with TaskTimer(tl, 'Draw MC background indices.'):
-            bkg_event_indices = rss.random.choice(
-                data_mc_selected.indices,
-                size=n_bkg_selected,
-                p=p,
-                replace=(not self._unique_events))
+            bkg_event_indices = self._cache_random_choice(
+                rss=rss,
+                size=n_bkg_selected)
+
         with TaskTimer(tl, 'Select MC background events from indices.'):
-            bkg_events = data_mc_selected[bkg_event_indices]
+            bkg_events = self._cache_mc[bkg_event_indices]
 
         # Scramble the drawn MC events if requested.
         if self._data_scrambler is not None:

diff --git a/skyllh/core/random.py b/skyllh/core/random.py
@@ -2,7 +2,10 @@
 
 import numpy as np
 
-from skyllh.core.py import int_cast
+from skyllh.core.py import (
+    classname,
+    int_cast,
+)
 
 
 class RandomStateService(
@@ -70,3 +73,164 @@ def reseed(self, seed):
             'The seed argument must be None or castable to type int!',
             allow_None=True)
         self.random.seed(self._seed)
+
+
+class RandomChoice(
+        object,
+):
+    """This class provides an efficient numpy.random.choice functionality
+    specialized for SkyLLH. The advantage is that it stores the cumulative
+    distribution function (CDF), which is assumed to be constant.
+    """
+
+    def __init__(
+            self,
+            items,
+            probabilities,
+            **kwargs,
+    ):
+        """Creates a new instance of RandomChoice holding the probabilities
+        and their cumulative distribution function (CDF).
+
+        Parameters
+        ----------
+        items : instance of numpy.ndarray
+            The (N,)-shaped numpy.ndarray holding the items from which to
+            choose.
+        probabilities : instance of numpy.ndarray
+            The (N,)-shaped numpy.ndarray holding the probability for each item.
+        """
+        super().__init__(**kwargs)
+
+        self._assert_items(items)
+        self._items = items
+
+        self._assert_probabilities(probabilities, self._items.size)
+        self._probabilities = probabilities
+
+        # Create the cumulative distribution function (CDF).
+        self._cdf = self._probabilities.cumsum()
+        self._cdf /= self._cdf[-1]
+
+    @property
+    def items(self):
+        """(read-only) The (N,)-shaped numpy.ndarray holding the items from
+        which to choose.
+        """
+        return self._items
+
+    @property
+    def probabilities(self):
+        """(read-only) The (N,)-shaped numpy.ndarray holding the propability
+        for each item.
+        """
+        return self._probabilities
+
+    def _assert_items(
+            self,
+            items,
+    ):
+        """Checks for the correct type and shape of the items.
+
+        Parameters
+        ----------
+        items : The (N,)-shaped numpy.ndarray holding the items from which to
+            choose.
+
+        Raises
+        ------
+        TypeError
+            If the type of items is not numpy.ndarray.
+        ValueError
+            If the items do not have the correct type and shape.
+        """
+        if not isinstance(items, np.ndarray):
+            raise TypeError(
+                'The items must be an instance of numpy.ndarray! '
+                f'Its current type is {classname(items)}!')
+
+        if items.ndim != 1:
+            raise ValueError(
+                'The items must be a 1-dimensional numpy.ndarray!')
+
+    def _assert_probabilities(
+            self,
+            p,
+            n_items,
+    ):
+        """Checks for correct values of the probabilities.
+
+        Parameters
+        ----------
+        p : instance of numpy.ndarray
+            The (N,)-shaped numpy.ndarray holding the probability for each item.
+        n_items : int
+            The number of items.
+
+        Raises
+        ------
+        ValueError
+            If the probabilities do not have the correct type, shape and values.
+        """
+        atol = np.sqrt(np.finfo(np.float64).eps)
+        atol = max(atol, np.sqrt(np.finfo(p.dtype).eps))
+
+        if p.ndim != 1:
+            raise ValueError(
+                'The probabilities must be provided as a 1-dimensional '
+                'numpy.ndarray!')
+
+        if p.size != n_items:
+            raise ValueError(
+                f'The size ({p.size}) of the probabilities array must match '
+                f'the number of items ({n_items})!')
+
+        if np.any(p < 0):
+            raise ValueError(
+                'The probabilities must be greater or equal zero!')
+
+        if p.dtype is not np.dtype(np.float64):
+            raise TypeError(
+                'The probabilities must be of type numpy.float64!')
+
+        p_sum = np.sum(p)
+        if abs(p_sum - 1.) > atol:
+            raise ValueError(
+                f'The sum of the probabilities ({p_sum}) must be 1!')
+
+    def __call__(
+            self,
+            rss,
+            size,
+    ):
+        """Chooses ``size`` random items from ``self.items`` according to
+        ``self.probabilities``.
+
+        Parameters
+        ----------
+        rss : instance of RandomStateService
+            The instance of RandomStateService from which random numbers are
+            drawn from.
+        size : int
+            The number of items to draw.
+
+        Returns
+        -------
+        random_items : instance of numpy.ndarray
+            The (size,)-shaped numpy.ndarray holding the randomly selected items
+            from ``self.items``.
+        """
+        uniform_values = rss.random.random(size)
+
+        # The np.searchsorted function is much faster when the values are
+        # sorted. But we want to keep the randomness of the returned items.
+        idxs_of_sort = np.argsort(uniform_values)
+        sorted_idxs = np.searchsorted(
+            self._cdf,
+            uniform_values[idxs_of_sort],
+            side='right')
+        idxs = np.empty_like(sorted_idxs)
+        idxs[idxs_of_sort] = sorted_idxs
+        random_items = self._items[idxs]
+
+        return random_items