From 440669e7675a063cbf7bb1cbe897b99c928a82d3 Mon Sep 17 00:00:00 2001 From: Janos Gabler Date: Mon, 4 Jan 2021 21:54:07 +0100 Subject: [PATCH] Small fixes for estimation of christmas models (#62) Co-authored-by: Klara Roehrl Co-authored-by: Tobias Raabe --- pyproject.toml | 2 +- src/sid/config.py | 2 +- src/sid/contacts.py | 168 +++++++++++++---------------------- src/sid/covid_epi_params.csv | 8 +- src/sid/msm.py | 4 +- src/sid/parse_model.py | 5 +- src/sid/shared.py | 1 + tests/test_contacts.py | 11 --- 8 files changed, 76 insertions(+), 125 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7312048a..430e8224 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,5 +8,5 @@ black = 1 pyupgrade = 1 [tool.nbqa.addopts] -isort = ["--treat-comment-as-code", "# %%"] +isort = ["--treat-comment-as-code", "# %%", "--profile=black"] pyupgrade = ["--py36-plus"] diff --git a/src/sid/config.py b/src/sid/config.py index ed6a0878..ee9f533f 100644 --- a/src/sid/config.py +++ b/src/sid/config.py @@ -35,7 +35,7 @@ DTYPE_INDEX = np.uint32 DTYPE_INFECTED = np.bool_ DTYPE_INFECTION_COUNTER = np.uint16 -DTYPE_N_CONTACTS = np.uint32 +DTYPE_N_CONTACTS = np.uint16 DTYPE_SID_PERIOD = np.int16 INDEX_NAMES = ["category", "subcategory", "name"] diff --git a/src/sid/contacts.py b/src/sid/contacts.py index e22e8aca..a5a4eaa2 100644 --- a/src/sid/contacts.py +++ b/src/sid/contacts.py @@ -108,7 +108,7 @@ def calculate_infections_by_contacts( [params.loc[("infection_prob", cm, cm), "value"] for cm in indexers] ) - reduced_contacts = _reduce_contacts_with_infection_probs( + contacts = _reduce_contacts_with_infection_probs( contacts, is_recurrent, infect_probs, next(seed) ) @@ -123,10 +123,6 @@ def calculate_infections_by_contacts( for ind in indexers.values(): indexers_list.append(ind) - np.random.seed(next(seed)) - - loop_order = _get_shuffled_loop_entries(len(states), len(indexers), next(seed)) - ( infected, infection_counter, @@ -134,7 +130,7 @@ def calculate_infections_by_contacts( missed, was_infected_by, ) = _calculate_infections_by_contacts_numba( - reduced_contacts, + contacts, infectious, immune, group_codes, @@ -143,7 +139,6 @@ def calculate_infections_by_contacts( infect_probs, next(seed), is_recurrent, - loop_order, ) infected = pd.Series(infected, index=states.index) @@ -165,47 +160,6 @@ def calculate_infections_by_contacts( return infected, n_has_additionally_infected, missed_contacts, was_infected_by -@nb.njit -def _get_shuffled_loop_entries(n_states, n_contact_models, seed, n_model_orders=1000): - """Create an array of loop entries. - - We save the loop entries of the following loop in shuffled order: - - .. code-block:: python - - for i in range(n_states): - for j in range(n_contact_models): - pass - - The loop entries are stored in an array with ``n_states * n_contact_model`` rows - and two columns. The first column contains the i, the second the j elements. - - Achieving complete randomness would require us to first store all loop entries - in an array and then shuffle it. However, this would be very slow. Instead - we loop over states in random order and cycle through previously. - - """ - np.random.seed(seed) - res = np.empty((n_states * n_contact_models, 2), dtype=np.int64) - shuffled_state_indices = np.random.choice(n_states, replace=False, size=n_states) - - # create random permutations of the model orders - model_orders = np.zeros((n_model_orders, n_contact_models)) - for m in range(n_model_orders): - model_orders[m] = np.random.choice( - n_contact_models, replace=False, size=n_contact_models - ) - - counter = 0 - - for i in shuffled_state_indices: - for j in model_orders[i % n_model_orders]: - res[counter, 0] = i - res[counter, 1] = j - counter += 1 - return res - - @nb.njit def _reduce_contacts_with_infection_probs(contacts, is_recurrent, probs, seed): """Reduce the number of contacts stochastically. @@ -227,7 +181,7 @@ def _reduce_contacts_with_infection_probs(contacts, is_recurrent, probs, seed): """ - reduced_contacts = contacts.copy() + contacts = contacts.copy() np.random.seed(seed) n_obs, n_contacts = contacts.shape for i in range(n_obs): @@ -237,8 +191,8 @@ def _reduce_contacts_with_infection_probs(contacts, is_recurrent, probs, seed): for _ in range(contacts[i, j]): if boolean_choice(probs[j]): success += 1 - reduced_contacts[i, j] = success - return reduced_contacts + contacts[i, j] = success + return contacts @nb.njit @@ -252,7 +206,6 @@ def _calculate_infections_by_contacts_numba( infection_probs, seed, is_recurrent, - loop_order, ): """Match people, draw if they get infected and record who infected whom. @@ -275,8 +228,6 @@ def _calculate_infections_by_contacts_numba( probability of infection for each contact model. seed (int): Seed value to control randomness. is_recurrent (numpy.ndarray): Boolean array of length n_contact_models. - loop_orrder (numpy.ndarray): 2d numpy array with two columns. The first column - indicates an individual. The second indicates a contact model. Returns: (tuple) Tuple containing @@ -297,66 +248,67 @@ def _calculate_infections_by_contacts_numba( groups_list = [np.arange(len(gp)) for gp in group_cdfs] was_infected_by = np.full(len(contacts), -1, dtype=np.int16) + n_obs, n_contact_models = contacts.shape # Loop over all individual-contact_model combinations - for k in range(len(loop_order)): - i, cm = loop_order[k] - if is_recurrent[cm]: - # We only check if i infects someone else from his group. Whether - # he is infected by some j is only checked, when the main loop arrives at j. - # This allows us to skip completely if i is not infectious or has no - # contacts under contact model cm. - group_i = group_codes[i, cm] - if group_i >= 0 and infectious[i] and contacts[i, cm] > 0: - others = indexers_list[cm][group_i] - # extract infection probability into a variable for faster access - prob = infection_probs[cm] - for j in others: - # the case i == j is skipped by the next if condition because it - # never happens that i is infectious but not immune - if not immune[j] and contacts[j, cm] > 0: - is_infection = boolean_choice(prob) - if is_infection: + for i in range(n_obs): + for cm in range(n_contact_models): + if is_recurrent[cm]: + # We only check if i infects someone else from his group. Whether + # he is infected by some j is only checked, when the main loop arrives + # at j. This allows us to skip completely if i is not infectious or has + # no contacts under contact model cm. + group_i = group_codes[i, cm] + if group_i >= 0 and infectious[i] and contacts[i, cm] > 0: + others = indexers_list[cm][group_i] + # extract infection probability into a variable for faster access + prob = infection_probs[cm] + for j in others: + # the case i == j is skipped by the next if condition because it + # never happens that i is infectious but not immune + if not immune[j] and contacts[j, cm] > 0: + is_infection = boolean_choice(prob) + if is_infection: + infection_counter[i] += 1 + infected[j] = 1 + immune[j] = True + was_infected_by[j] = cm + + else: + # get the probabilities for meeting another group which depend on the + # individual's group. + group_i = group_codes[i, cm] + group_i_cdf = group_cdfs[cm][group_i] + + # Loop over each contact the individual has, sample the contact's group + # and compute the sum of possible contacts in this group. + n_contacts = contacts[i, cm] + for _ in range(n_contacts): + contact_takes_place = True + group_j = choose_other_group(groups_list[cm], cdf=group_i_cdf) + choice_indices = indexers_list[cm][group_j] + contacts_j = contacts[choice_indices, cm] + + j = choose_other_individual(choice_indices, weights=contacts_j) + + if j < 0 or j == i: + contact_takes_place = False + + # If a contact takes place, find out if one individual got infected. + if contact_takes_place: + contacts[i, cm] -= 1 + contacts[j, cm] -= 1 + + if infectious[i] and not immune[j]: infection_counter[i] += 1 infected[j] = 1 immune[j] = True was_infected_by[j] = cm - else: - # get the probabilities for meeting another group which depend on the - # individual's group. - group_i = group_codes[i, cm] - group_i_cdf = group_cdfs[cm][group_i] - - # Loop over each contact the individual has, sample the contact's group and - # compute the sum of possible contacts in this group. - n_contacts = contacts[i, cm] - for _ in range(n_contacts): - contact_takes_place = True - group_j = choose_other_group(groups_list[cm], cdf=group_i_cdf) - choice_indices = indexers_list[cm][group_j] - contacts_j = contacts[choice_indices, cm] - - j = choose_other_individual(choice_indices, weights=contacts_j) - - if j < 0 or j == i: - contact_takes_place = False - - # If a contact takes place, find out if one individual got infected. - if contact_takes_place: - contacts[i, cm] -= 1 - contacts[j, cm] -= 1 - - if infectious[i] and not immune[j]: - infection_counter[i] += 1 - infected[j] = 1 - immune[j] = True - was_infected_by[j] = cm - - elif infectious[j] and not immune[i]: - infection_counter[j] += 1 - infected[i] = 1 - immune[i] = True - was_infected_by[i] = cm + elif infectious[j] and not immune[i]: + infection_counter[j] += 1 + infected[i] = 1 + immune[i] = True + was_infected_by[i] = cm missed = contacts diff --git a/src/sid/covid_epi_params.csv b/src/sid/covid_epi_params.csv index f283a58a..a3875901 100644 --- a/src/sid/covid_epi_params.csv +++ b/src/sid/covid_epi_params.csv @@ -95,7 +95,13 @@ cd_needs_icu_false,all,1,0.22 cd_needs_icu_false,all,15,0.30 cd_needs_icu_false,all,25,0.30 cd_needs_icu_false,all,45,0.18 -cd_received_test_result_true,all,2,1 +cd_received_test_result_true,all,2,0.1 +cd_received_test_result_true,all,3,0.1 +cd_received_test_result_true,all,4,0.1 +cd_received_test_result_true,all,5,0.1 +cd_received_test_result_true,all,6,0.1 +cd_received_test_result_true,all,7,0.2 +cd_received_test_result_true,all,8,0.3 cd_knows_immune_false,all,-1,1 cd_knows_infectious_false,all,-1,1 cd_ever_infected,all,-1,1 diff --git a/src/sid/msm.py b/src/sid/msm.py index 0af29611..69c94c50 100644 --- a/src/sid/msm.py +++ b/src/sid/msm.py @@ -125,7 +125,6 @@ def _msm( empirical_moments = copy.deepcopy(empirical_moments) df = simulate(params) - df = df.compute() simulated_moments = {name: func(df) for name, func in calc_moments.items()} @@ -141,7 +140,8 @@ def _msm( flat_empirical_moments = _flatten_index(empirical_moments) flat_simulated_moments = _flatten_index(simulated_moments) - moment_errors = flat_empirical_moments - flat_simulated_moments + # Order is important to manfred. + moment_errors = flat_simulated_moments - flat_empirical_moments # Return moment errors as indexed DataFrame or calculate weighted square product of # moment errors depending on return_scalar. diff --git a/src/sid/parse_model.py b/src/sid/parse_model.py index eab87d28..9cd3a547 100644 --- a/src/sid/parse_model.py +++ b/src/sid/parse_model.py @@ -56,7 +56,10 @@ def parse_share_known_cases(share_known_cases, duration, burn_in_periods): ) # Extend series with burn-in periods and if shares for burn-in periods do not # exist, backfill NaNs. - share_known_cases = share_known_cases.reindex(extended_index).backfill() + # .backfill() is only available in pandas which is not supported by estimagic. + share_known_cases = share_known_cases.reindex(extended_index).fillna( + method="bfill" + ) elif share_known_cases is None: share_known_cases = pd.Series(index=extended_index, data=0) diff --git a/src/sid/shared.py b/src/sid/shared.py index cb3422f6..07628edf 100644 --- a/src/sid/shared.py +++ b/src/sid/shared.py @@ -48,6 +48,7 @@ def factorize_assortative_variables(states, assort_by, is_recurrent): if is_recurrent: assort_by_series = states[assort_by[0]].astype(int).replace({-1: pd.NA}) group_codes, group_codes_values = pd.factorize(assort_by_series) + group_codes = group_codes.astype(DTYPE_GROUP_CODE) elif assort_by: assort_by_series = [states[col].to_numpy() for col in assort_by] group_codes, group_codes_values = pd.factorize( diff --git a/tests/test_contacts.py b/tests/test_contacts.py index 23d4e193..2a6da970 100644 --- a/tests/test_contacts.py +++ b/tests/test_contacts.py @@ -51,7 +51,6 @@ def test_calculate_infections_numba_with_single_group(num_regression, seed): indexer, infection_prob, is_recurrent, - loop_order, ) = _sample_data_for_calculate_infections_numba(n_individuals=100, seed=seed) ( @@ -70,7 +69,6 @@ def test_calculate_infections_numba_with_single_group(num_regression, seed): infection_prob, seed, is_recurrent, - loop_order, ) num_regression.check( @@ -143,8 +141,6 @@ def _sample_data_for_calculate_infections_numba( is_recurrent = np.array([False]) - loop_order = np.array(list(itertools.product(range(n_individuals), range(1)))) - return ( contacts.reshape(-1, 1), infectious, @@ -154,7 +150,6 @@ def _sample_data_for_calculate_infections_numba( indexers_list, infection_prob, is_recurrent, - loop_order, ) @@ -332,12 +327,6 @@ def fake_choose_j(a, weights): m.setattr("sid.contacts.choose_other_individual", fake_choose_j) - @njit - def fix_loop_order(x, replace, size): - return NumbaList(range(x)) - - m.setattr("sid.contacts.np.random.choice", fix_loop_order) - def test_calculate_infections_only_non_recurrent( setup_households_w_one_infection, monkeypatch