From 791eefe93ff142134bf1d138f22efd45387e930f Mon Sep 17 00:00:00 2001 From: David Gardenier Date: Thu, 3 Dec 2020 13:59:14 +0100 Subject: [PATCH] Adding Monte Carlo functionality --- tests/monte_carlo/goodness_of_fit.py | 294 +++++++++++++++++ tests/monte_carlo/plot.py | 400 +++++++++++++++++++++++ tests/monte_carlo/plot_run.py | 75 +++++ tests/monte_carlo/run_mc.py | 142 +++++++++ tests/monte_carlo/simulations.py | 402 ++++++++++++++++++++++++ tests/monte_carlo/weighted_quantiles.py | 83 +++++ 6 files changed, 1396 insertions(+) create mode 100644 tests/monte_carlo/goodness_of_fit.py create mode 100644 tests/monte_carlo/plot.py create mode 100644 tests/monte_carlo/plot_run.py create mode 100644 tests/monte_carlo/run_mc.py create mode 100644 tests/monte_carlo/simulations.py create mode 100644 tests/monte_carlo/weighted_quantiles.py diff --git a/tests/monte_carlo/goodness_of_fit.py b/tests/monte_carlo/goodness_of_fit.py new file mode 100644 index 0000000..a54ad03 --- /dev/null +++ b/tests/monte_carlo/goodness_of_fit.py @@ -0,0 +1,294 @@ +from weighted_quantiles import median +from scipy.stats import ks_2samp +import numpy as np +import os +import matplotlib.pyplot as plt + +from frbpoppy import unpickle, TNS, poisson_interval, pprint +from tests.rates.alpha_real import EXPECTED +from tests.convenience import plot_aa_style, rel_path +from simulations import SimulationOverview, POP_SIZE + +NORM_SURV = 'parkes-htru' + + +class GoodnessOfFit: + + def __init__(self): + self.run_pars = {1: ['alpha', 'si', 'li'], + 2: ['li', 'lum_min', 'lum_max'], + 3: ['w_mean', 'w_std'], + 4: ['dm_igm_slope', 'dm_host']} + self.norm_surv = NORM_SURV + self.so = SimulationOverview() + self.tns = self.get_tns() + + def get_tns(self): + # Only get one-offs + return TNS(repeaters=False, mute=True, update=False).df + + def dm(self, pop, survey_name): + """Calculate GoodnessOfFit for DM distributions.""" + mask = ((self.tns.survey == survey_name) & (self.tns.dm <= 950)) + try: + ks_dm = ks_2samp(pop.frbs.dm, self.tns[mask].dm)[1] + except ValueError: + ks_dm = np.nan + return ks_dm + + def snr(self, pop, survey_name): + mask = ((self.tns.survey == survey_name) & (self.tns.dm <= 950)) + try: + ks_snr = ks_2samp(pop.frbs.snr, self.tns[mask].snr)[1] + except ValueError: + ks_snr = np.nan + return ks_snr + + def rate(self, pop, survey_name, norm_uuid, run, errs=False): + # Add rate details + sr = pop.source_rate + surv_sim_rate = sr.det / sr.days + + # Perhaps use at some stage + if errs: + p_int = poisson_interval(sr.det, sigma=1) + surv_sim_rate_errs = [p/sr.days for p in p_int] + + # Determine ratio of detection rates + if survey_name in EXPECTED: + n_frbs, n_days = EXPECTED[survey_name] + else: + n_frbs, n_days = [np.nan, np.nan] + surv_real_rate = n_frbs/n_days + + # Get normalisation properties + norm_real_n_frbs, norm_real_n_days = EXPECTED[self.norm_surv] + norm_pop = unpickle(f'mc/run_{run}/{norm_uuid}') + norm_sim_n_frbs = norm_pop.source_rate.det + norm_sim_n_days = norm_pop.source_rate.days + norm_sim_rate = norm_sim_n_frbs / norm_sim_n_days + norm_real_rate = norm_real_n_frbs / norm_real_n_days + + if norm_sim_rate == 0: + norm_sim_rate = POP_SIZE / norm_sim_n_days + + sim_ratio = surv_sim_rate / norm_sim_rate + real_ratio = surv_real_rate / norm_real_rate + + diff = np.abs(sim_ratio - real_ratio) + if diff == 0: + rate_diff = 1e-3 + else: + rate_diff = 1 / diff + + return rate_diff, pop.n_sources() + + def calc_gofs(self, run): + + # For each requested run + self.so = SimulationOverview() + par_set = self.so.df[self.so.df.run == run].par_set.iloc[0] + pprint(f'Calculating goodness of fit for run {run}, par set {par_set}') + pars = self.run_pars[par_set] + values = [] + + # Loop through all combination of parameters + for values, group in self.so.df[self.so.df.run == run].groupby(pars): + pprint(f' - {list(zip(pars, values))}') + # Calculate goodness of fit values for each simulation + for row_ix, row in group.iterrows(): + survey_name = row.survey + uuid = row.uuid + pop = unpickle(f'mc/run_{run}/{uuid}') + + # Apply a DM cutoff + mask = (pop.frbs.dm <= 950) + pop.frbs.apply(mask) + pop.source_rate.det = pop.n_sources() * pop.source_rate.f_area + + dm_gof = self.dm(pop, survey_name) + snr_gof = self.snr(pop, survey_name) + self.so.df.at[row_ix, 'dm_gof'] = dm_gof + self.so.df.at[row_ix, 'snr_gof'] = snr_gof + + if pop.n_sources() == 0: + self.so.df.at[row_ix, 'weight'] = 0 + self.so.df.at[row_ix, 'n_det'] = pop.n_sources() + pprint(f' - No sources in {survey_name}') + continue + + # Find corresponding rate normalisation population uuid + norm_mask = dict(zip(pars, values)) + norm_mask['survey'] = self.norm_surv + norm_mask['run'] = run + k = norm_mask.keys() + v = norm_mask.values() + norm_uuid = group.loc[group[k].isin(v).all(axis=1), :].uuid + norm_uuid = norm_uuid.values[0] + rate_diff, n_det = self.rate(pop, survey_name, norm_uuid, run) + + # Get rate weighting + self.so.df.at[row_ix, 'weight'] = rate_diff + self.so.df.at[row_ix, 'n_det'] = n_det + + pprint(f'Saving the results for run {run}') + # Best matching in terms of rates + max_w = np.nanmax(self.so.df.weight) + self.so.df.loc[self.so.df.weight == 1e3]['weight'] = max_w + self.so.save() + + def plot(self, run): + # Get data + # For each requested run + df = self.so.df + par_set = df[df.run == run].par_set.iloc[0] + + # For each parameter + for main_par in self.run_pars[par_set]: + pprint(f'Plotting {main_par}') + other_pars = [e for e in self.run_pars[par_set] if e != main_par] + + for compare_par in ['dm', 'snr']: + compare_col = f'{compare_par}_gof' + + pprint(f' - {compare_col}') + for survey, group_surv in df[df.run == run].groupby('survey'): + + pprint(f' - {survey}') + + # Set up plot + plot_aa_style() + plt.rcParams["figure.figsize"] = (5.75373*3, 5.75373*3) + plt.rcParams['figure.max_open_warning'] = 125 + n_x = group_surv[other_pars[0]].nunique() + if len(other_pars) > 1: + n_y = group_surv[other_pars[1]].nunique() + else: + n_y = 1 + fig, ax = plt.subplots(n_x, n_y, + sharex='col', sharey='row') + + groups = group_surv.groupby(other_pars) + x = -1 + for i, (other_pars_vals, group) in enumerate(groups): + bins = group[main_par].values + values = group[compare_col].values + bins, values = self.add_edges_to_hist(bins, values) + + if n_y > 1: + y = i % n_y + if y == 0: + x += 1 + a = ax[y, x] + else: + y = i + a = ax[y] + + a.step(bins, values, where='mid') + a.set_title = str(other_pars_vals) + + diff = np.diff(bins) + if diff[1] != diff[0]: + a.set_xscale('log') + + # Set axis label + if y == n_y - 1: + p = other_pars[0] + if isinstance(other_pars_vals, float): + val = other_pars_vals + else: + val = other_pars_vals[0] + p = p.replace('_', ' ') + a.set_xlabel(f'{p} = {val:.2}') + + if x == 0: + p = other_pars[1] + val = other_pars_vals[1] + p = p.replace('_', ' ') + a.set_ylabel(f'{p} = {val:.2}') + + # Set axis limits + subset = df[df.run == run][main_par] + y_subset = group_surv[compare_col].copy() + try: + low = np.nanmin(y_subset) + high = np.nanmax(y_subset) + except ValueError: + low = 0.0001 + high = 1 + log = False + if low > 0 and high > 0: + log = True + + for a in ax.flatten(): + a.set_xlim(subset.min(), subset.max()) + if log: + a.set_yscale('log', nonposy='clip') + a.set_ylim(low, high) + + p = main_par.replace('_', ' ') + fig.suptitle(f'{p} - {compare_par} - {survey}') + plt.tight_layout() + plt.subplots_adjust(top=0.95) + + # Save to subdirectory + path_to_save = rel_path(f'./plots/mc/{main_par}_run{run}/') + if not os.path.isdir(path_to_save): + os.mkdir(path_to_save) + path_to_save += f'{compare_par}_{survey}.pdf' + plt.savefig(path_to_save) + plt.clf() + + def add_edges_to_hist(self, bins, n, bin_type='lin'): + """Add edges to histograms""" + + np.seterr(divide='ignore', invalid='ignore') + + if bin_type == 'lin': + bin_dif = np.diff(bins)[-1] + bins = np.insert(bins, 0, bins[0] - bin_dif) + bins = np.insert(bins, len(bins), bins[-1] + bin_dif) + else: + bin_dif = np.diff(np.log10(bins))[-1] + bins = np.insert(bins, 0, 10**(np.log10(bins[0])-bin_dif)) + bins = np.insert(bins, len(bins), 10**(np.log10(bins[-1])+bin_dif)) + + n = np.insert(n, 0, np.nan) + n = np.insert(n, len(n), np.nan) + + return bins, n + + def weighted_median(self, df): + dm_gof = df['dm_gof'].values + dm_weight = df['n_det'].values + snr_gof = df['snr_gof'].values + snr_weight = df['n_det'].values + gofs = np.concatenate([dm_gof, snr_gof]) + weights = np.concatenate([dm_weight, snr_weight]) + return median(gofs, weights) + + def calc_global_max(self, run): + self.so = SimulationOverview() + df = self.so.df[self.so.df.run == run] + par_set = df[df.run == run].par_set.iloc[0] + cols = self.run_pars[par_set] + values = [] + gofs = [] + + # Loop through all combination of parameters + for value, group in df.groupby(cols): + gof = self.weighted_median(group) + values.append(value) + gofs.append(gof) + + gofs = np.array(gofs) + + # Find maximum (best gof) + if np.isnan(gofs).all(): + return dict(zip(cols, [(np.nan, np.nan) for i in cols])) + else: + best_ix = np.nanargmax(gofs) + best_values = values[best_ix] + best_gofs = [gofs[best_ix]]*len(cols) + + return dict(zip(cols, zip(best_values, best_gofs))) diff --git a/tests/monte_carlo/plot.py b/tests/monte_carlo/plot.py new file mode 100644 index 0000000..11d80dd --- /dev/null +++ b/tests/monte_carlo/plot.py @@ -0,0 +1,400 @@ +from frbpoppy import hist +from goodness_of_fit import GoodnessOfFit +from matplotlib.lines import Line2D +from scipy.optimize import curve_fit +from tests.convenience import plot_aa_style, rel_path +import matplotlib.pyplot as plt +import numpy as np + + +class Plot(): + """Plot runs.""" + + def __init__(self): + plot_aa_style() + plt.rcParams['figure.figsize'] = (5.75373, (5.75373/3)*4) + plt.rcParams['font.size'] = 9 + # plt.rcParams['xtick.major.pad'] = 10 + # plt.rcParams['ytick.major.pad'] = 10 + # plt.rcParams['axes.titlepad'] = 10 + self.fig, self.axes = plt.subplots(4, 3, sharey='row') + self.colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] + self.gf = GoodnessOfFit() + self.df = self.gf.so.df + + # Calculate global maximums + self.gm = {} + for run in self.df.run.unique(): + self.gm[run] = self.gf.calc_global_max(run) + print(self.gm) + + # Plot various subplots + self.alpha() + self.si() + self.li() + self.li_2() + self.lum_min() + self.lum_max() + self.w_int_mean() + self.w_int_std() + self.legend() + self.dm_igm_slope() + self.dm_host() + self.axes[3, 2].set_axis_off() + + # Plot run rectangles + # self.runs() + + # Save plot + plt.tight_layout() # rect=[0, 0, 0.98, 1]) + plt.subplots_adjust(wspace=0.1) + plt.savefig(rel_path('./plots/mc/mc.pdf')) + + def alpha(self): + ax = self.axes[0, 0] + parm = 'alpha' + label = r'$\alpha$' + runs = [1, 5, 8] + + ax.set_yscale('log', nonposy='clip') + ax.set_ylabel(r'GoF') + ax.set_xlabel(label) + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + if i==1 and not np.isnan(best_value): + title = fr'{label}=${best_value:.1f}$' + ax.set_title(title, fontsize=10, color=self.colors[i]) + + def si(self): + ax = self.axes[0, 1] + parm = 'si' + label = r'\text{si}' + runs = [1, 5, 8] + + ax.set_yscale('log', nonposy='clip') + ax.set_xlabel(label) + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + if i == 1 and not np.isnan(best_value): + title = fr'{label}=${best_value:.1f}$' + ax.set_title(title, fontsize=10, color=self.colors[i]) + + def li(self): + ax = self.axes[0, 2] + parm = 'li' + # label = r'\text{lum$_{\text{i}}$}' + label = 'li' + runs = [1, 5, 8] + + ax.set_yscale('log', nonposy='clip') + ax.set_xlabel(label) + ax_right = ax.twinx() + ax_right.set_ylabel('Set 1', labelpad=10) + ax_right.tick_params(axis='y', which='both', right=False, labelright=False) + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + if i == 1 and not np.isnan(best_value): + title = fr'{label}=${best_value:.1f}$' + ax.set_title(title, fontsize=10, color=self.colors[i]) + + def li_2(self): + ax = self.axes[1, 0] + ax.set_ylabel(r'GoF') + parm = 'li' + label = 'li' + runs = [2] + + ax.set_yscale('log', nonposy='clip') + ax.set_xlabel(label) + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + # if not np.isnan(best_value): + # title = fr'{label}=${best_value:.1f}$' + # ax.set_title(title, fontsize=10, color=self.colors[i]) + + def lum_min(self): + ax = self.axes[1, 1] + ax.set_xscale('log') + label = r'\text{lum$_{\text{min}}$}' + parm = 'lum_min' + runs = [2] + + ax.set_yscale('log', nonposy='clip') + ax.set_xlabel(label + r' (erg s$^{-1}$)') + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + # if not np.isnan(best_value): + # title = fr'{label}=${best_value:.1e}$' + # ax.set_title(title, fontsize=10, color=self.colors[i]) + + def lum_max(self): + ax = self.axes[1, 2] + ax.set_xscale('log') + label = r'\text{lum$_{\text{max}}$}' + parm = 'lum_max' + runs = [2] + + ax.set_yscale('log', nonposy='clip') + ax.set_xlabel(label + r' (erg s$^{-1}$)') + ax_right = ax.twinx() + ax_right.set_ylabel('Set 2', labelpad=10) + ax_right.tick_params(axis='y', which='both', right=False, labelright=False) + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + # if not np.isnan(best_value): + # title = fr'{label}=${best_value:.1e}$' + # ax.set_title(title, fontsize=10, color=self.colors[i]) + + def w_int_mean(self): + ax = self.axes[2, 0] + ax.set_xscale('log') + ax.set_ylabel(r'GoF') + label = r'\text{w$_{\text{int, mean}}$}' + parm = 'w_mean' + runs = [3, 6, 9] + + ax.set_yscale('log', nonposy='clip') + ax.set_xlabel(fr'{label} (ms)') + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + if i == 1 and not np.isnan(best_value): + title = fr'{label}=${best_value:.1e}$' + ax.set_title(title, fontsize=10, color=self.colors[i]) + + def w_int_std(self): + ax = self.axes[2, 1] + label = r'\text{w$_{\text{int, std}}$}' + parm = 'w_std' + runs = [3, 6, 9] + + ax.set_yscale('log', nonposy='clip') + ax.set_xlabel(fr'{label} (ms)') + ax_right = ax.twinx() + ax_right.set_ylabel('Set 3', labelpad=10) + ax_right.tick_params(axis='y', which='both', right=False, labelright=False) + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + if i == 1 and not np.isnan(best_value): + title = fr'{label}=${best_value:.1f}$' + ax.set_title(title, fontsize=10, color=self.colors[i]) + + def dm_igm_slope(self): + ax = self.axes[3, 0] + ax.set_ylabel(r'GoF') + label = r'\text{DM$_{\text{IGM, slope}}$}' + parm = 'dm_igm_slope' + runs = [4, 7, 10] + + ax.set_yscale('log', nonposy='clip') + ax.set_xlabel(label + r' ($\textrm{pc}\ \textrm{cm}^{-3}$)') + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + if i == 1 and not np.isnan(best_value): + title = fr'{label}=${best_value:.0f}$' + ax.set_title(title, fontsize=10, color=self.colors[i]) + + def dm_host(self): + ax = self.axes[3, 1] + label = r'\text{DM$_{\text{Host}}$}' + parm = 'dm_host' + runs = [4, 7, 10] + + ax.set_yscale('log', nonposy='clip') + ax.set_xlabel(label + r' ($\textrm{pc}\ \textrm{cm}^{-3}$)') + ax_right = ax.twinx() + ax_right.set_ylabel('Set 4', labelpad=10) + ax_right.tick_params(axis='y', which='both', right=False, labelright=False) + best_value = np.nan + + # Plot runs + for i, run in enumerate(runs): + bins, gofs = self.get_data(run, parm) + ax.step(bins, gofs, where='mid') + + # Plot global maximum + try: + best_value, best_gof = self.gm[run][parm] + ax.plot([best_value]*2, [1e-1, best_gof], color=self.colors[i], linestyle='--') + ax.scatter([best_value], [best_gof], marker='x', color=self.colors[i]) + except KeyError: + i -= 1 + continue + + if i == 1 and not np.isnan(best_value): + title = fr'{label}=${best_value:.0f}$' + ax.set_title(title, fontsize=10, color=self.colors[i]) + + def legend(self): + ax = self.axes[2, 2] + + # Add legend elements + elements = [] + + line = Line2D([0], [0], color=self.colors[0]) + elements.append((line, r'Cycle 1')) + + line = Line2D([0], [0], color=self.colors[1]) + elements.append((line, r'Cycle 2')) + + line = Line2D([0], [0], color=self.colors[2]) + elements.append((line, r'Cycle 3')) + + line = Line2D([0], [0], color='grey', linestyle='--') + label = r'Max GoF' + elements.append((line, label)) + + lines, labels = zip(*elements) + self.fig.legend(lines, labels, bbox_to_anchor=(0.84, 0.4), + loc='center') + + ax.set_axis_off() + + def get_data(self, run_number, par): + df = self.df[self.df.run == run_number] + if df.empty: + return [np.nan], [np.nan] + gofs = [] + bins = [] + for bin_val, group in df.groupby(par): + gof = self.gf.weighted_median(group) + gofs.append(gof) + bins.append(bin_val) + bins = np.array(bins) + gofs = np.array(gofs) + diff = np.diff(bins) + bin_type = 'lin' + if not np.isclose(diff[0], diff[1]): + bin_type = 'log' + bins, gofs = self.gf.add_edges_to_hist(bins, gofs, bin_type=bin_type) + gofs[np.isnan(gofs)] = 1e-1 + return bins, gofs + + +if __name__ == '__main__': + Plot() diff --git a/tests/monte_carlo/plot_run.py b/tests/monte_carlo/plot_run.py new file mode 100644 index 0000000..1283f6f --- /dev/null +++ b/tests/monte_carlo/plot_run.py @@ -0,0 +1,75 @@ +"""Visualise run.""" +import matplotlib.pyplot as plt +import numpy as np +from mpl_toolkits.mplot3d import Axes3D + +from simulations import SimulationOverview +from goodness_of_fit import GoodnessOfFit + +RUN = 5 +run_pars = {1: ['alpha', 'si', 'li'], + 2: ['lum_min', 'lum_max', 'li'], + 3: ['w_mean', 'w_std'], + 4: ['dm_igm_slope', 'dm_host']} + +so = SimulationOverview() +df = so.df[so.df.run == RUN].copy() +par_set = df.par_set.iloc[0] +cols = run_pars[par_set] + +# Group together survey results +gofs = {c: [] for c in cols} +gofs['gof'] = [] +gf = GoodnessOfFit() +for vals, group in df.groupby(cols): + gofs['gof'].append(gf.weighted_median(group)) + for i, c in enumerate(cols): + gofs[c].append(vals[i]) + +fig = plt.figure() +ax = fig.add_subplot(111, projection='3d') + +# Log business +labels = [] +for col in cols: + diff = np.diff(np.sort(list(set(gofs[col])))) + if not np.isclose(diff[0], diff[-1]): + gofs[col] = np.log10(gofs[col]) + label = f'log {col}' + else: + label = col + labels.append(label) + +x = gofs[cols[0]] +y = gofs[cols[1]] +colours = np.array(gofs['gof']) +if len(cols) == 2: + z = 0 +else: + z = gofs[cols[2]] + ax.set_zlabel(labels[2]) + +p = ax.scatter(x, y, z, c=colours, s=20*10**colours) +cbar = fig.colorbar(p, orientation='horizontal', + cax=fig.add_axes([0.05, 0.9, 0.2, 0.03])) +cbar.ax.set_ylabel('GoF') #, rotation=-90, va="bottom") + +xlabel = labels[0] +ylabel = labels[1] + +# ugly, but quick +if xlabel == 'alpha': + xlabel = r'$\alpha$' +if xlabel == 'log lum_min': + xlabel = r'lum$_{\rm min}$' +if xlabel == 'log lum_max': + xlabel = r'lum$_{\rm max}$' +if ylabel == 'log lum_min': + ylabel = r'lum$_{\rm min}$' +if ylabel == 'log lum_max': + ylabel = r'lum$_{\rm max}$' +ax.set_xlabel(xlabel) +ax.set_ylabel(ylabel) + +plt.tight_layout() +plt.show() diff --git a/tests/monte_carlo/run_mc.py b/tests/monte_carlo/run_mc.py new file mode 100644 index 0000000..01c4adf --- /dev/null +++ b/tests/monte_carlo/run_mc.py @@ -0,0 +1,142 @@ +"""Link together all classes to run a full Monte Carlo.""" +import pandas as pd +import numpy as np +import frbpoppy.paths +import os + +from simulations import MonteCarlo, POP_SIZE +from goodness_of_fit import GoodnessOfFit +from plot import Plot + +GENERATE = True +CALC_GOFS = True +RUNS = [10] + + +class RunOverview: + """Gather input for each run.""" + + def __init__(self, load_csv=True): + p = frbpoppy.paths.populations() + self.filename = f'{p}mc/run_overview.csv' + + if load_csv and os.path.isfile(self.filename): + self.df = self.load() + else: + self.df = self.gen_runs() + + def gen_run(self): + return {'alpha': None, + 'si': None, + 'li': None, + 'lum_min': None, + 'lum_max': None, + 'w_mean': None, + 'w_std': None, + 'dm_igm_slope': None, + 'dm_host': None, + 'execute': True, + 'par_set': 0, + 'run': 0} + + def gen_runs(self): + runs = [] + for i in range(10): + r = self.gen_run() + r['run_number'] = i + 1 + r['execute'] = True + r['par_set'] = i % 4 + 1 + if i == 9: # Holder for best values + r['execute'] = False + runs.append(r) + df = pd.DataFrame(runs) + df.set_index('run', inplace=True) + return df + + def load(self): + df = pd.read_csv(self.filename) + df.run = df.run.astype(int) + df.par_set = df.par_set.astype(int) + df = df.loc[:, ~df.columns.str.contains('^Unnamed')] + return df + + def save(self, df=None): + if df is None: + df = self.df + df.to_csv(self.filename) + + +if __name__ == '__main__': + print('Commencing') + runs = RunOverview(load_csv=True) + mc = MonteCarlo(pop_size=POP_SIZE) + + for i, run in runs.df.iterrows(): + + run = runs.df.iloc[i] + + print('='*50) + print(f'On Run {run.run} with par_set {run.par_set}') + print('='*50) + print(run) + + if run.run not in RUNS: + continue + + # Generate parameter sets + if GENERATE: + if run.par_set == 1: + mc.gen_par_set_1(lum_min=run.lum_min, + lum_max=run.lum_max, + w_mean=run.w_mean, + w_std=run.w_std, + dm_igm_slope=run.dm_igm_slope, + dm_host=run.dm_host, + run=run.run) + if run.par_set == 2: + mc.gen_par_set_2(alpha=run.alpha, + si=run.si, + w_mean=run.w_mean, + w_std=run.w_std, + dm_igm_slope=run.dm_igm_slope, + dm_host=run.dm_host, + run=run.run) + if run.par_set == 3: + mc.gen_par_set_3(alpha=run.alpha, + si=run.si, + li=run.li, + lum_min=run.lum_min, + lum_max=run.lum_max, + dm_igm_slope=run.dm_igm_slope, + dm_host=run.dm_host, + run=run.run) + if run.par_set == 4: + mc.gen_par_set_4(alpha=run.alpha, + si=run.si, + li=run.li, + lum_min=run.lum_min, + lum_max=run.lum_max, + w_mean=run.w_mean, + w_std=run.w_std, + run=run.run) + + # Determine the goodness of fit + gf = GoodnessOfFit() + if CALC_GOFS: + gf.calc_gofs(run.run) + + # Find global maximums + gf = GoodnessOfFit() + gms = gf.calc_global_max(run.run) + print('\n') + print(f' Best fits from run {run.run}-> {gms}') + print('\n') + + # Adapt the input for future runs + for j in range(i+1, len(runs.df)): + for par in gms: + runs.df.at[j, par] = gms[par][0] + + runs.save() + + Plot() diff --git a/tests/monte_carlo/simulations.py b/tests/monte_carlo/simulations.py new file mode 100644 index 0000000..1aa4788 --- /dev/null +++ b/tests/monte_carlo/simulations.py @@ -0,0 +1,402 @@ +"""Run Monte Carlo simulations.""" +from joblib import Parallel, delayed +from frbpoppy import Survey, CosmicPopulation, SurveyPopulation, pprint +from datetime import datetime +from copy import deepcopy + +from glob import glob +import frbpoppy.paths +import os +import numpy as np +import pandas as pd +from tqdm import tqdm +import uuid + +POP_SIZE = 5e7 + + +class SimulationOverview: + """Given values, return uid + + Load from file, or make.""" + + def __init__(self, load_csv=True): + p = frbpoppy.paths.populations() + self.filename = f'{p}mc/simluation_overview.csv' + + if load_csv and os.path.isfile(self.filename): + self.load() + else: + self.df = pd.DataFrame() + + def load(self): + self.df = pd.read_csv(self.filename, index_col=0) + self.df = self.df.loc[:, ~self.df.columns.str.contains('^Unnamed')] + + def save(self): + self.df.to_csv(self.filename) + + def append(self, df): + self.df = self.df.append(df, ignore_index=True) + + def map_surveys(self, ix, names): + mapping = dict(zip(ix, names)) + self.df.replace({"survey": mapping}, inplace=True) + + +class MonteCarlo: + + def __init__(self, pop_size=1e2, load_csv=True): + self.survey_names = ['parkes-htru', + 'chime-frb', + 'askap-incoh', + 'wsrt-apertif'] + self.load_csv = load_csv + self.pop_size = pop_size + self.survey_ix = [i for i in range(len(self.survey_names))] + self.surveys = self.set_up_surveys() + self.so = SimulationOverview(load_csv=self.load_csv) + self.set_up_dirs() + + def set_up_surveys(self): + """Set up surveys.""" + surveys = [] + for name in self.survey_names: + survey = Survey(name=name) + survey.set_beam(model='airy', n_sidelobes=1) + if name in ('chime-frb', 'wsrt-apertif', 'parkes-htru'): + survey.set_beam(model=name) + surveys.append(survey) + return surveys + + def set_up_dirs(self, run=np.nan): + """Create subdirectory for saving populations. + + Returns True if directory had to be set up.""" + f = f'{frbpoppy.paths.populations()}mc/' + if not os.path.isdir(f): + os.mkdir(f) + return True + + if not np.isnan(run): + f = f'{frbpoppy.paths.populations()}mc/run_{run}/' + if not os.path.isdir(f): + os.mkdir(f) + return True + + return False + + def gen_par_set_1(self, + parallel=True, + lum_min=np.nan, + lum_max=np.nan, + w_mean=np.nan, + w_std=np.nan, + dm_igm_slope=np.nan, + dm_host=np.nan, + run=0): + alphas = np.linspace(-2.5, -1, 11) + sis = np.linspace(-2, 2, 11) + lis = np.linspace(-2, 0, 11) + + # Put all options into a dataframe + if 'run' in self.so.df: + self.so.df = self.so.df[self.so.df.run != run] + opt = np.meshgrid(alphas, sis, lis, self.survey_ix) + options = np.array(opt).T.reshape(-1, 4) + df = pd.DataFrame(options, columns=('alpha', 'si', 'li', 'survey')) + df['run'] = run + df['par_set'] = 1 + df['uuid'] = [uuid.uuid4() for _ in range(len(df.index))] + df['date'] = datetime.today() + self.so.append(df) + self.so.map_surveys(self.survey_ix, self.survey_names) + self.so.save() + + # Remove previous par_set of the same number + if not self.set_up_dirs(run=run): + fs = f'{frbpoppy.paths.populations()}mc/run_{run}/*' + for f in glob(fs): + os.remove(f) + + def iter_alpha(i): + alpha = alphas[i] + pop = CosmicPopulation.complex(self.pop_size) + + pop.set_dist(model='vol_co', z_max=1.0, alpha=alpha) + pop.set_lum(model='constant', value=1) + + if not np.isnan(w_mean): + pop.set_w(model='lognormal', mean=w_mean, std=w_std) + if not np.isnan(dm_igm_slope): + pop.set_dm_igm(model='ioka', slope=dm_igm_slope) + pop.set_dm_host(model='constant', value=dm_host) + + pop.generate() + + for si in sis: + pop.set_si(model='constant', value=si) + pop.gen_si() + + for li in lis: + pop.set_lum(model='powerlaw', + low=1e40, + high=1e45, power=li) + + if not np.isnan(lum_min): + pop.set_lum(model='powerlaw', low=lum_min, + high=lum_max, index=li) + + pop.gen_lum() + + for survey in self.surveys: + surv_pop = SurveyPopulation(pop, survey) + + # Get unique identifier + mask = (self.so.df.par_set == 1) + mask &= (self.so.df.run == run) + mask &= (self.so.df.alpha == alpha) + mask &= (self.so.df.si == si) + mask &= (self.so.df.li == li) + mask &= (self.so.df.survey == survey.name) + uuid = self.so.df[mask].uuid.iloc[0] + surv_pop.name = f'mc/run_{run}/{uuid}' + surv_pop.save() + + if parallel: + n_cpu = min([3, os.cpu_count() - 1]) + pprint(f'{os.cpu_count()} CPUs available') + r = range(len(alphas)) + Parallel(n_jobs=n_cpu)(delayed(iter_alpha)(i) for i in tqdm(r)) + else: + [iter_alpha(i) for i in tqdm(range(len(alphas)))] + + def gen_par_set_2(self, + parallel=True, + alpha=-1.5, + si=0, + w_mean=np.nan, + w_std=np.nan, + dm_igm_slope=np.nan, + dm_host=np.nan, + run=np.nan): + lis = np.linspace(-1.5, 0, 11) + lum_mins = 10**np.linspace(38, 46, 11) + lum_maxs = 10**np.linspace(38, 46, 11) + + # Put all options into a dataframe + self.so.df = self.so.df[self.so.df.run != run] + opt = np.meshgrid(lis, lum_mins, lum_maxs, self.survey_ix) + options = np.array(opt).T.reshape(-1, 4) + cols = ('li', 'lum_min', 'lum_max', 'survey') + df = pd.DataFrame(options, columns=cols) + df['par_set'] = 2 + df['run'] = run + df['uuid'] = [uuid.uuid4() for _ in range(len(df.index))] + df['date'] = datetime.today() + df = df[~(df.lum_max < df.lum_min)] + self.so.append(df) + self.so.map_surveys(self.survey_ix, self.survey_names) + self.so.save() + + # Remove previous par_set of the same number + if not self.set_up_dirs(run=run): + fs = f'{frbpoppy.paths.populations()}mc/run_{run}/*' + for f in glob(fs): + os.remove(f) + + pop = CosmicPopulation.complex(self.pop_size) + if not np.isnan(alpha): + pop.set_dist(model='vol_co', z_max=1.0, alpha=alpha) + pop.set_si(model='constant', value=si) + pop.set_lum(model='constant', value=1) + if not np.isnan(w_mean): + pop.set_w(model='lognormal', mean=w_mean, std=w_std) + if not np.isnan(dm_igm_slope): + pop.set_dm_igm(model='ioka', slope=dm_igm_slope) + pop.set_dm_host(model='constant', value=dm_host) + + pop.generate() + + def adapt_pop(e): + li, lum_min, lum_max = e + if lum_max < lum_min: + return + t_pop = deepcopy(pop) + t_pop.set_lum(model='powerlaw', low=lum_min, high=lum_max, + power=li) + t_pop.gen_lum() + + for survey in self.surveys: + surv_pop = SurveyPopulation(t_pop, survey) + + # Get unique identifier + mask = (self.so.df.par_set == 2) + mask &= (self.so.df.run == run) + mask &= (self.so.df.li == li) + mask &= (self.so.df.lum_min == lum_min) + mask &= (self.so.df.lum_max == lum_max) + mask &= (self.so.df.survey == survey.name) + uuid = self.so.df[mask].uuid.iloc[0] + surv_pop.name = f'mc/run_{run}/{uuid}' + surv_pop.save() + + n_cpu = min([3, os.cpu_count() - 1]) + pprint(f'{os.cpu_count()} CPUs available') + mg = np.meshgrid(lis, lum_mins, lum_maxs) + loop = np.array(mg).T.reshape(-1, 3) + if parallel: + Parallel(n_jobs=n_cpu)(delayed(adapt_pop)(e) for e in tqdm(loop)) + else: + [adapt_pop(e) for e in tqdm(loop)] + + def gen_par_set_3(self, + parallel=True, + alpha=-1.5, + si=0, + li=-1, + lum_min=1e40, + lum_max=1e40, + dm_igm_slope=np.nan, + dm_host=np.nan, + run=np.nan): + w_means = 10**np.linspace(-3, 1, 11) + w_stds = np.linspace(0, 3, 11) + + # Put all options into a dataframe + self.so.df = self.so.df[self.so.df.run != run] + opt = np.meshgrid(w_means, w_stds, self.survey_ix) + options = np.array(opt).T.reshape(-1, 3) + cols = ('w_mean', 'w_std', 'survey') + df = pd.DataFrame(options, columns=cols) + df['run'] = run + df['par_set'] = 3 + df['uuid'] = [uuid.uuid4() for _ in range(len(df.index))] + df['date'] = datetime.today() + self.so.append(df) + self.so.map_surveys(self.survey_ix, self.survey_names) + self.so.save() + + # Remove previous par_set of the same number + if not self.set_up_dirs(run=run): + fs = f'{frbpoppy.paths.populations()}mc/run_{run}/*' + for f in glob(fs): + os.remove(f) + + pop = CosmicPopulation.complex(self.pop_size) + + if not np.isnan(alpha): + pop.set_dist(model='vol_co', z_max=1.0, alpha=alpha) + pop.set_si(model='constant', value=si) + if not np.isnan(lum_min): + pop.set_lum(model='powerlaw', low=lum_min, high=lum_max, index=li) + if not np.isnan(dm_igm_slope): + pop.set_dm_igm(model='ioka', slope=dm_igm_slope) + pop.set_dm_host(model='constant', value=dm_host) + + pop.generate() + + def adapt_pop(e): + w_mean, w_std = e + t_pop = deepcopy(pop) + t_pop.set_w(model='lognormal', mean=w_mean, std=w_std) + t_pop.gen_w() + + for survey in self.surveys: + surv_pop = SurveyPopulation(t_pop, survey) + + # Get unique identifier + mask = (self.so.df.par_set == 3) + mask &= (self.so.df.run == run) + mask &= (self.so.df.run == run) + mask &= (self.so.df.w_mean == w_mean) + mask &= (self.so.df.w_std == w_std) + mask &= (self.so.df.survey == survey.name) + uuid = self.so.df[mask].uuid.iloc[0] + surv_pop.name = f'mc/run_{run}/{uuid}' + surv_pop.save() + + n_cpu = min([3, os.cpu_count() - 1]) + pprint(f'{os.cpu_count()} CPUs available') + mg = np.meshgrid(w_means, w_stds) + loop = np.array(mg).T.reshape(-1, 2) + if parallel: + Parallel(n_jobs=n_cpu)(delayed(adapt_pop)(e) for e in tqdm(loop)) + else: + [adapt_pop(e) for e in tqdm(loop)] + + def gen_par_set_4(self, + parallel=True, + alpha=-1.5, + si=0, + li=-1, + lum_min=1e40, + lum_max=1e40, + w_mean=np.nan, + w_std=np.nan, + run=np.nan): + dm_igm_slopes = np.linspace(800, 1200, 11) + dm_hosts = np.linspace(0, 500, 11) + + # Put all options into a dataframe + self.so.df = self.so.df[self.so.df.run != run] + opt = np.meshgrid(dm_igm_slopes, dm_hosts, self.survey_ix) + options = np.array(opt).T.reshape(-1, 3) + cols = ('dm_igm_slope', 'dm_host', 'survey') + df = pd.DataFrame(options, columns=cols) + df['run'] = run + df['par_set'] = 4 + df['uuid'] = [uuid.uuid4() for _ in range(len(df.index))] + df['date'] = datetime.today() + self.so.append(df) + self.so.map_surveys(self.survey_ix, self.survey_names) + self.so.save() + + # Remove previous par_set of the same number + if not self.set_up_dirs(run=run): + fs = f'{frbpoppy.paths.populations()}mc/run_{run}/*' + for f in glob(fs): + os.remove(f) + + pop = CosmicPopulation.complex(self.pop_size) + + if not np.isnan(alpha): + pop.set_dist(model='vol_co', z_max=1.0, alpha=alpha) + pop.set_si(model='constant', value=si) + if not np.isnan(lum_min): + pop.set_lum(model='powerlaw', low=lum_min, high=lum_max, index=li) + if not np.isnan(w_mean): + pop.set_w(model='lognormal', mean=w_mean, std=w_std) + pop.generate() + + def adapt_pop(e): + dm_igm_slope, dm_host = e + t_pop = deepcopy(pop) + t_pop.set_dm_igm(model='ioka', slope=dm_igm_slope) + t_pop.gen_dm_igm() + t_pop.set_dm_host(model='constant', value=dm_host) + t_pop.gen_dm_host() + t_pop.frbs.dm = t_pop.frbs.dm_mw + t_pop.frbs.dm_igm + t_pop.frbs.dm += t_pop.frbs.dm_host + + for survey in self.surveys: + surv_pop = SurveyPopulation(t_pop, survey) + + # Get unique identifier + mask = (self.so.df.par_set == 4) + mask &= (self.so.df.run == run) + mask &= (self.so.df.dm_igm_slope == dm_igm_slope) + mask &= (self.so.df.dm_host == dm_host) + mask &= (self.so.df.survey == survey.name) + uuid = self.so.df[mask].uuid.iloc[0] + surv_pop.name = f'mc/run_{run}/{uuid}' + surv_pop.save() + + n_cpu = min([4, os.cpu_count() - 1]) + pprint(f'{os.cpu_count()} CPUs available') + mg = np.meshgrid(dm_igm_slopes, dm_hosts) + loop = np.array(mg).T.reshape(-1, 2) + if parallel: + Parallel(n_jobs=n_cpu)(delayed(adapt_pop)(e) for e in tqdm(loop)) + else: + [adapt_pop(e) for e in tqdm(loop)] diff --git a/tests/monte_carlo/weighted_quantiles.py b/tests/monte_carlo/weighted_quantiles.py new file mode 100644 index 0000000..136b5d4 --- /dev/null +++ b/tests/monte_carlo/weighted_quantiles.py @@ -0,0 +1,83 @@ +import numpy as np + + +def quantile_1D(data, weights, quantile): + """ + Compute the weighted quantile of a 1D numpy array. + Parameters + ---------- + data : ndarray + Input array (one dimension). + weights : ndarray + Array with the weights of the same size of `data`. + quantile : float + Quantile to compute. It must have a value between 0 and 1. + Returns + ------- + quantile_1D : float + The output value. + """ + # Check the data + if not isinstance(data, np.matrix): + data = np.asarray(data) + if not isinstance(weights, np.matrix): + weights = np.asarray(weights) + nd = data.ndim + if nd != 1: + raise TypeError("data must be a one dimensional array") + ndw = weights.ndim + if ndw != 1: + raise TypeError("weights must be a one dimensional array") + if data.shape != weights.shape: + raise TypeError("the length of data and weights must be the same") + if ((quantile > 1.) or (quantile < 0.)): + raise ValueError("quantile must have a value between 0. and 1.") + # Sort the data + ind_sorted = np.argsort(data) + sorted_data = data[ind_sorted] + sorted_weights = weights[ind_sorted] + # Compute the auxiliary arrays + Sn = np.cumsum(sorted_weights) + # TODO: Check that the weights do not sum zero + # assert Sn != 0, "The sum of the weights must not be zero" + Pn = (Sn-0.5*sorted_weights)/Sn[-1] + # Get the value of the weighted median + return np.interp(quantile, Pn, sorted_data) + + +def quantile(data, weights, quantile): + """ + Weighted quantile of an array with respect to the last axis. + Parameters + ---------- + data : ndarray + Input array. + weights : ndarray + Array with the weights. It must have the same size of the last + axis of `data`. + quantile : float + Quantile to compute. It must have a value between 0 and 1. + Returns + ------- + quantile : float + The output value. + """ + # TODO: Allow to specify the axis + nd = data.ndim + if nd == 0: + TypeError("data must have at least one dimension") + elif nd == 1: + return quantile_1D(data, weights, quantile) + elif nd > 1: + n = data.shape + imr = data.reshape((np.prod(n[:-1]), n[-1])) + result = np.apply_along_axis(quantile_1D, -1, imr, weights, quantile) + return result.reshape(n[:-1]) + + +def median(data, weights): + """ + Weighted median of an array with respect to the last axis. + Alias for `quantile(data, weights, 0.5)`. + """ + return quantile(data, weights, 0.5)