From 440669e7675a063cbf7bb1cbe897b99c928a82d3 Mon Sep 17 00:00:00 2001
From: Janos Gabler <janos.gabler@gmail.com>
Date: Mon, 4 Jan 2021 21:54:07 +0100
Subject: [PATCH] Small fixes for estimation of christmas models (#62)

Co-authored-by: Klara Roehrl <klara.roehrl@gmail.com>
Co-authored-by: Tobias Raabe <raabe@posteo.de>
---
 pyproject.toml               |   2 +-
 src/sid/config.py            |   2 +-
 src/sid/contacts.py          | 168 +++++++++++++----------------------
 src/sid/covid_epi_params.csv |   8 +-
 src/sid/msm.py               |   4 +-
 src/sid/parse_model.py       |   5 +-
 src/sid/shared.py            |   1 +
 tests/test_contacts.py       |  11 ---
 8 files changed, 76 insertions(+), 125 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7312048a..430e8224 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,5 +8,5 @@ black = 1
 pyupgrade = 1
 
 [tool.nbqa.addopts]
-isort = ["--treat-comment-as-code", "# %%"]
+isort = ["--treat-comment-as-code", "# %%", "--profile=black"]
 pyupgrade = ["--py36-plus"]
diff --git a/src/sid/config.py b/src/sid/config.py
index ed6a0878..ee9f533f 100644
--- a/src/sid/config.py
+++ b/src/sid/config.py
@@ -35,7 +35,7 @@
 DTYPE_INDEX = np.uint32
 DTYPE_INFECTED = np.bool_
 DTYPE_INFECTION_COUNTER = np.uint16
-DTYPE_N_CONTACTS = np.uint32
+DTYPE_N_CONTACTS = np.uint16
 DTYPE_SID_PERIOD = np.int16
 
 INDEX_NAMES = ["category", "subcategory", "name"]
diff --git a/src/sid/contacts.py b/src/sid/contacts.py
index e22e8aca..a5a4eaa2 100644
--- a/src/sid/contacts.py
+++ b/src/sid/contacts.py
@@ -108,7 +108,7 @@ def calculate_infections_by_contacts(
         [params.loc[("infection_prob", cm, cm), "value"] for cm in indexers]
     )
 
-    reduced_contacts = _reduce_contacts_with_infection_probs(
+    contacts = _reduce_contacts_with_infection_probs(
         contacts, is_recurrent, infect_probs, next(seed)
     )
 
@@ -123,10 +123,6 @@ def calculate_infections_by_contacts(
     for ind in indexers.values():
         indexers_list.append(ind)
 
-    np.random.seed(next(seed))
-
-    loop_order = _get_shuffled_loop_entries(len(states), len(indexers), next(seed))
-
     (
         infected,
         infection_counter,
@@ -134,7 +130,7 @@ def calculate_infections_by_contacts(
         missed,
         was_infected_by,
     ) = _calculate_infections_by_contacts_numba(
-        reduced_contacts,
+        contacts,
         infectious,
         immune,
         group_codes,
@@ -143,7 +139,6 @@ def calculate_infections_by_contacts(
         infect_probs,
         next(seed),
         is_recurrent,
-        loop_order,
     )
 
     infected = pd.Series(infected, index=states.index)
@@ -165,47 +160,6 @@ def calculate_infections_by_contacts(
     return infected, n_has_additionally_infected, missed_contacts, was_infected_by
 
 
-@nb.njit
-def _get_shuffled_loop_entries(n_states, n_contact_models, seed, n_model_orders=1000):
-    """Create an array of loop entries.
-
-    We save the loop entries of the following loop in shuffled order:
-
-    .. code-block:: python
-
-        for i in range(n_states):
-            for j in range(n_contact_models):
-                pass
-
-    The loop entries are stored in an array with ``n_states * n_contact_model`` rows
-    and two columns. The first column contains the i, the second the j elements.
-
-    Achieving complete randomness would require us to first store all loop entries
-    in an array and then shuffle it. However, this would be very slow. Instead
-    we loop over states in random order and cycle through previously.
-
-    """
-    np.random.seed(seed)
-    res = np.empty((n_states * n_contact_models, 2), dtype=np.int64)
-    shuffled_state_indices = np.random.choice(n_states, replace=False, size=n_states)
-
-    # create random permutations of the model orders
-    model_orders = np.zeros((n_model_orders, n_contact_models))
-    for m in range(n_model_orders):
-        model_orders[m] = np.random.choice(
-            n_contact_models, replace=False, size=n_contact_models
-        )
-
-    counter = 0
-
-    for i in shuffled_state_indices:
-        for j in model_orders[i % n_model_orders]:
-            res[counter, 0] = i
-            res[counter, 1] = j
-            counter += 1
-    return res
-
-
 @nb.njit
 def _reduce_contacts_with_infection_probs(contacts, is_recurrent, probs, seed):
     """Reduce the number of contacts stochastically.
@@ -227,7 +181,7 @@ def _reduce_contacts_with_infection_probs(contacts, is_recurrent, probs, seed):
 
     """
 
-    reduced_contacts = contacts.copy()
+    contacts = contacts.copy()
     np.random.seed(seed)
     n_obs, n_contacts = contacts.shape
     for i in range(n_obs):
@@ -237,8 +191,8 @@ def _reduce_contacts_with_infection_probs(contacts, is_recurrent, probs, seed):
                 for _ in range(contacts[i, j]):
                     if boolean_choice(probs[j]):
                         success += 1
-                reduced_contacts[i, j] = success
-    return reduced_contacts
+                contacts[i, j] = success
+    return contacts
 
 
 @nb.njit
@@ -252,7 +206,6 @@ def _calculate_infections_by_contacts_numba(
     infection_probs,
     seed,
     is_recurrent,
-    loop_order,
 ):
     """Match people, draw if they get infected and record who infected whom.
 
@@ -275,8 +228,6 @@ def _calculate_infections_by_contacts_numba(
             probability of infection for each contact model.
         seed (int): Seed value to control randomness.
         is_recurrent (numpy.ndarray): Boolean array of length n_contact_models.
-        loop_orrder (numpy.ndarray): 2d numpy array with two columns. The first column
-            indicates an individual. The second indicates a contact model.
 
     Returns:
         (tuple) Tuple containing
@@ -297,66 +248,67 @@ def _calculate_infections_by_contacts_numba(
     groups_list = [np.arange(len(gp)) for gp in group_cdfs]
     was_infected_by = np.full(len(contacts), -1, dtype=np.int16)
 
+    n_obs, n_contact_models = contacts.shape
     # Loop over all individual-contact_model combinations
-    for k in range(len(loop_order)):
-        i, cm = loop_order[k]
-        if is_recurrent[cm]:
-            # We only check if i infects someone else from his group. Whether
-            # he is infected by some j is only checked, when the main loop arrives at j.
-            # This allows us to skip completely if i is not infectious or has no
-            # contacts under contact model cm.
-            group_i = group_codes[i, cm]
-            if group_i >= 0 and infectious[i] and contacts[i, cm] > 0:
-                others = indexers_list[cm][group_i]
-                # extract infection probability into a variable for faster access
-                prob = infection_probs[cm]
-                for j in others:
-                    # the case i == j is skipped by the next if condition because it
-                    # never happens that i is infectious but not immune
-                    if not immune[j] and contacts[j, cm] > 0:
-                        is_infection = boolean_choice(prob)
-                        if is_infection:
+    for i in range(n_obs):
+        for cm in range(n_contact_models):
+            if is_recurrent[cm]:
+                # We only check if i infects someone else from his group. Whether
+                # he is infected by some j is only checked, when the main loop arrives
+                # at j. This allows us to skip completely if i is not infectious or has
+                # no contacts under contact model cm.
+                group_i = group_codes[i, cm]
+                if group_i >= 0 and infectious[i] and contacts[i, cm] > 0:
+                    others = indexers_list[cm][group_i]
+                    # extract infection probability into a variable for faster access
+                    prob = infection_probs[cm]
+                    for j in others:
+                        # the case i == j is skipped by the next if condition because it
+                        # never happens that i is infectious but not immune
+                        if not immune[j] and contacts[j, cm] > 0:
+                            is_infection = boolean_choice(prob)
+                            if is_infection:
+                                infection_counter[i] += 1
+                                infected[j] = 1
+                                immune[j] = True
+                                was_infected_by[j] = cm
+
+            else:
+                # get the probabilities for meeting another group which depend on the
+                # individual's group.
+                group_i = group_codes[i, cm]
+                group_i_cdf = group_cdfs[cm][group_i]
+
+                # Loop over each contact the individual has, sample the contact's group
+                # and compute the sum of possible contacts in this group.
+                n_contacts = contacts[i, cm]
+                for _ in range(n_contacts):
+                    contact_takes_place = True
+                    group_j = choose_other_group(groups_list[cm], cdf=group_i_cdf)
+                    choice_indices = indexers_list[cm][group_j]
+                    contacts_j = contacts[choice_indices, cm]
+
+                    j = choose_other_individual(choice_indices, weights=contacts_j)
+
+                    if j < 0 or j == i:
+                        contact_takes_place = False
+
+                    # If a contact takes place, find out if one individual got infected.
+                    if contact_takes_place:
+                        contacts[i, cm] -= 1
+                        contacts[j, cm] -= 1
+
+                        if infectious[i] and not immune[j]:
                             infection_counter[i] += 1
                             infected[j] = 1
                             immune[j] = True
                             was_infected_by[j] = cm
 
-        else:
-            # get the probabilities for meeting another group which depend on the
-            # individual's group.
-            group_i = group_codes[i, cm]
-            group_i_cdf = group_cdfs[cm][group_i]
-
-            # Loop over each contact the individual has, sample the contact's group and
-            # compute the sum of possible contacts in this group.
-            n_contacts = contacts[i, cm]
-            for _ in range(n_contacts):
-                contact_takes_place = True
-                group_j = choose_other_group(groups_list[cm], cdf=group_i_cdf)
-                choice_indices = indexers_list[cm][group_j]
-                contacts_j = contacts[choice_indices, cm]
-
-                j = choose_other_individual(choice_indices, weights=contacts_j)
-
-                if j < 0 or j == i:
-                    contact_takes_place = False
-
-                # If a contact takes place, find out if one individual got infected.
-                if contact_takes_place:
-                    contacts[i, cm] -= 1
-                    contacts[j, cm] -= 1
-
-                    if infectious[i] and not immune[j]:
-                        infection_counter[i] += 1
-                        infected[j] = 1
-                        immune[j] = True
-                        was_infected_by[j] = cm
-
-                    elif infectious[j] and not immune[i]:
-                        infection_counter[j] += 1
-                        infected[i] = 1
-                        immune[i] = True
-                        was_infected_by[i] = cm
+                        elif infectious[j] and not immune[i]:
+                            infection_counter[j] += 1
+                            infected[i] = 1
+                            immune[i] = True
+                            was_infected_by[i] = cm
 
     missed = contacts
 
diff --git a/src/sid/covid_epi_params.csv b/src/sid/covid_epi_params.csv
index f283a58a..a3875901 100644
--- a/src/sid/covid_epi_params.csv
+++ b/src/sid/covid_epi_params.csv
@@ -95,7 +95,13 @@ cd_needs_icu_false,all,1,0.22
 cd_needs_icu_false,all,15,0.30
 cd_needs_icu_false,all,25,0.30
 cd_needs_icu_false,all,45,0.18
-cd_received_test_result_true,all,2,1
+cd_received_test_result_true,all,2,0.1
+cd_received_test_result_true,all,3,0.1
+cd_received_test_result_true,all,4,0.1
+cd_received_test_result_true,all,5,0.1
+cd_received_test_result_true,all,6,0.1
+cd_received_test_result_true,all,7,0.2
+cd_received_test_result_true,all,8,0.3
 cd_knows_immune_false,all,-1,1
 cd_knows_infectious_false,all,-1,1
 cd_ever_infected,all,-1,1
diff --git a/src/sid/msm.py b/src/sid/msm.py
index 0af29611..69c94c50 100644
--- a/src/sid/msm.py
+++ b/src/sid/msm.py
@@ -125,7 +125,6 @@ def _msm(
     empirical_moments = copy.deepcopy(empirical_moments)
 
     df = simulate(params)
-    df = df.compute()
 
     simulated_moments = {name: func(df) for name, func in calc_moments.items()}
 
@@ -141,7 +140,8 @@ def _msm(
     flat_empirical_moments = _flatten_index(empirical_moments)
     flat_simulated_moments = _flatten_index(simulated_moments)
 
-    moment_errors = flat_empirical_moments - flat_simulated_moments
+    # Order is important to manfred.
+    moment_errors = flat_simulated_moments - flat_empirical_moments
 
     # Return moment errors as indexed DataFrame or calculate weighted square product of
     # moment errors depending on return_scalar.
diff --git a/src/sid/parse_model.py b/src/sid/parse_model.py
index eab87d28..9cd3a547 100644
--- a/src/sid/parse_model.py
+++ b/src/sid/parse_model.py
@@ -56,7 +56,10 @@ def parse_share_known_cases(share_known_cases, duration, burn_in_periods):
             )
         # Extend series with burn-in periods and if shares for burn-in periods do not
         # exist, backfill NaNs.
-        share_known_cases = share_known_cases.reindex(extended_index).backfill()
+        # .backfill() is only available in pandas which is not supported by estimagic.
+        share_known_cases = share_known_cases.reindex(extended_index).fillna(
+            method="bfill"
+        )
 
     elif share_known_cases is None:
         share_known_cases = pd.Series(index=extended_index, data=0)
diff --git a/src/sid/shared.py b/src/sid/shared.py
index cb3422f6..07628edf 100644
--- a/src/sid/shared.py
+++ b/src/sid/shared.py
@@ -48,6 +48,7 @@ def factorize_assortative_variables(states, assort_by, is_recurrent):
     if is_recurrent:
         assort_by_series = states[assort_by[0]].astype(int).replace({-1: pd.NA})
         group_codes, group_codes_values = pd.factorize(assort_by_series)
+        group_codes = group_codes.astype(DTYPE_GROUP_CODE)
     elif assort_by:
         assort_by_series = [states[col].to_numpy() for col in assort_by]
         group_codes, group_codes_values = pd.factorize(
diff --git a/tests/test_contacts.py b/tests/test_contacts.py
index 23d4e193..2a6da970 100644
--- a/tests/test_contacts.py
+++ b/tests/test_contacts.py
@@ -51,7 +51,6 @@ def test_calculate_infections_numba_with_single_group(num_regression, seed):
         indexer,
         infection_prob,
         is_recurrent,
-        loop_order,
     ) = _sample_data_for_calculate_infections_numba(n_individuals=100, seed=seed)
 
     (
@@ -70,7 +69,6 @@ def test_calculate_infections_numba_with_single_group(num_regression, seed):
         infection_prob,
         seed,
         is_recurrent,
-        loop_order,
     )
 
     num_regression.check(
@@ -143,8 +141,6 @@ def _sample_data_for_calculate_infections_numba(
 
     is_recurrent = np.array([False])
 
-    loop_order = np.array(list(itertools.product(range(n_individuals), range(1))))
-
     return (
         contacts.reshape(-1, 1),
         infectious,
@@ -154,7 +150,6 @@ def _sample_data_for_calculate_infections_numba(
         indexers_list,
         infection_prob,
         is_recurrent,
-        loop_order,
     )
 
 
@@ -332,12 +327,6 @@ def fake_choose_j(a, weights):
 
     m.setattr("sid.contacts.choose_other_individual", fake_choose_j)
 
-    @njit
-    def fix_loop_order(x, replace, size):
-        return NumbaList(range(x))
-
-    m.setattr("sid.contacts.np.random.choice", fix_loop_order)
-
 
 def test_calculate_infections_only_non_recurrent(
     setup_households_w_one_infection, monkeypatch