From d8de9ec893179eed699d62ae28d3a1896a9d045b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20K=C3=B6nig?= Date: Mon, 26 Oct 2020 18:12:37 +0100 Subject: [PATCH 1/4] add plot_npc and plot_active_latent_units --- sfaira/train/summaries.py | 114 +++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 2 deletions(-) diff --git a/sfaira/train/summaries.py b/sfaira/train/summaries.py index c42fa6779..d90aa5fef 100644 --- a/sfaira/train/summaries.py +++ b/sfaira/train/summaries.py @@ -506,7 +506,7 @@ def plot_training_history( ).tolist(): sns_data_temp = pandas.DataFrame(self.histories[run]) sns_data_temp["epoch"] = np.arange(0, sns_data_temp.shape[0]) - sns_data_temp["cv"] = run.split("_")[-1] + sns_data_temp["cv"] = int(run.split("_")[-1]) sns_data.append(sns_data_temp) sns_data = pandas.concat(sns_data, axis=0) else: @@ -1594,4 +1594,114 @@ def plot_gradient_cor( plt.tight_layout() if save is not None: plt.savefig(save) - plt.show() \ No newline at end of file + plt.show() + + def plot_npc( + self, + organ, + topology_version, + cvs=None + ): + import matplotlib.pyplot as plt + if self.summary_tab is None: + self.create_summary_tab() + models = np.unique(self.summary_tab["model_type"]).tolist() + mode = np.mean + self.summary_tab["topology"] = [x.split("_")[5] for x in self.summary_tab["model_gs_id"].values] + + with plt.style.context("seaborn-whitegrid"): + fig, ax = plt.subplots(figsize=(12,6)) + for model in models: + model_id, embedding, covar = self.best_model_embedding( + subset={"model_type": model, "organ": organ, "topology": topology_version}, + partition="val", + metric="loss", + cvs=cvs, + ) + if len(embedding[0].shape) == 3: + z, mean, logvar = embedding[0] + else: + z = embedding[0] + latent_dim = z.shape[1] + cov = np.cov(z.T) + eig_vals, eig_vecs = np.linalg.eig(cov) + eig_sum = sum(eig_vals) + var_exp = [(i / eig_sum) for i in sorted(eig_vals, reverse=True)] + cum_var_exp = np.cumsum([0] + var_exp) + + #plt.bar(range(eig_vals.shape[0]), var_exp, alpha=0.5, align="center", + # label="%s individual explained variance" % key) + plt.step(range(0, eig_vals.shape[0]+1), cum_var_exp, where="post", linewidth=3, + label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99))) + plt.yticks([0.0, .25 ,.50, .75, .95, .99]) + #if latent_dim > 20: + # plt.xticks([1,5, 10, 15, 20]) + # plt.xlim([0, 20]) + plt.ylabel("Explained variance ratio", fontsize=16) + plt.xlabel("Principal components", fontsize=16) + plt.legend(loc="best", fontsize=16, frameon=True) + plt.tight_layout() + #plt.savefig(fname=figdir + organ + "_pca_" + model + ".png") + plt.show() + + def plot_active_latent_units( + self, + organ, + topology_version, + cvs=None + ): + colors=['red', 'blue', 'green', 'cyan', 'magenta', 'yellow', 'darkgreen', 'lime', 'navy', 'royalblue', 'pink', 'peru'] + def active_latent_units_mask(z): + var_x = np.diagonal(np.cov(z.T)) + min_var_x = 0.01 + active_units_mask = var_x > min_var_x + return active_units_mask + + import matplotlib.pyplot as plt + if self.summary_tab is None: + self.create_summary_tab() + models = np.unique(self.summary_tab["model_type"]).tolist() + self.summary_tab["topology"] = [x.split("_")[5] for x in self.summary_tab["model_gs_id"].values] + + with plt.style.context("seaborn-whitegrid"): + plt.figure(figsize=(12, 6)) + plt.axhline(np.log(0.01), color="k", linestyle='dashed', linewidth=2, + label="active unit threshold") + for i, model in enumerate(models): + model_id, embedding, covar = self.best_model_embedding( + subset={"model_type": model, "organ": organ, "topology": topology_version}, + partition="val", + metric="loss", + cvs=cvs, + ) + if len(embedding[0].shape) == 3: + z, mean, logvar = embedding[0] + else: + mean = embedding[0] + latent_dim = mean.shape[1] + var = np.sort(np.diagonal(np.cov(mean.T)))[::-1] + log_var = np.log(var) + active_units = np.log(var[active_latent_units_mask(mean)]) + + plt.plot(range(1,log_var.shape[0]+1), log_var, color=colors[i], alpha=1.0, linewidth=3, + label="%s active units: %i" % (model, len(active_units))) + # to plot vertical lines + log_var_cut = var.copy() + log_var_cut[~active_latent_units_mask(mean)] = 0 + log_var_cut = np.log(log_var_cut) + num_active = np.argmax(log_var_cut) + if num_active > 0: + plt.vlines(num_active, ymin = -.15, ymax = 0.15, color=colors[i], linestyle='solid', linewidth=3) + if model == "vaevamp": + z1,z2 = np.split(np.log(np.diagonal(np.cov(mean.T))),2) + plt.plot(range(1,int(latent_dim/2)+1), np.sort(z2)[::-1], color=colors[i], alpha=1.0, + label=r"%s $z_2$ active units: %i" % (model, len(z2[z2>np.log(0.01)])), linestyle='dashed', linewidth=3) + plt.plot(range(1,int(latent_dim/2)+1), np.sort(z1)[::-1], color=colors[i], alpha=1.0, + label=r"%s $z_1$ active units: %i" % (model, len(z1[z1>np.log(0.01)])), linestyle='dotted', linewidth=3) + plt.xlabel(r'Latent unit $i$', fontsize=16) + plt.ylabel(r'$\log\,{(A_{\bf z})}_i$', fontsize=16) + #plt.title(r"Latent unit activity", fontsize=16) + plt.legend(loc="upper right", frameon=True, fontsize=12) + plt.tight_layout() + # plt.savefig(fname=figdir + organ + "_activity_statistic_" + model + ".png") + plt.show() From fd02b11a73dc399b72291f8c049fc4049182a227 Mon Sep 17 00:00:00 2001 From: le-ander <20015434+le-ander@users.noreply.github.com> Date: Mon, 2 Nov 2020 14:51:50 +0100 Subject: [PATCH 2/4] make sure handling of z and z_mean is consistent for VAE embeddings --- sfaira/train/summaries.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/sfaira/train/summaries.py b/sfaira/train/summaries.py index d90aa5fef..2301785c1 100644 --- a/sfaira/train/summaries.py +++ b/sfaira/train/summaries.py @@ -1581,7 +1581,6 @@ def plot_gradient_cor( if by_type: v = avg_grads[model_type[0]] celltypes_coord = celltypes[model_type[0]] - cell_names = [str(i) for i in range(v.shape[0])] cormat = pandas.DataFrame( np.corrcoef(v), index=celltypes_coord, @@ -1602,6 +1601,10 @@ def plot_npc( topology_version, cvs=None ): + """ + If an embedding file is found that contains z, z_mean, z_var (eg. output from predict_variational() function) + the model will use z, and not z_mean. + """ import matplotlib.pyplot as plt if self.summary_tab is None: self.create_summary_tab() @@ -1619,7 +1622,7 @@ def plot_npc( cvs=cvs, ) if len(embedding[0].shape) == 3: - z, mean, logvar = embedding[0] + z = embedding[0][0] # in case of three-dimensional VAE embedding (z, z_mean, z_var), use z else: z = embedding[0] latent_dim = z.shape[1] @@ -1629,19 +1632,19 @@ def plot_npc( var_exp = [(i / eig_sum) for i in sorted(eig_vals, reverse=True)] cum_var_exp = np.cumsum([0] + var_exp) - #plt.bar(range(eig_vals.shape[0]), var_exp, alpha=0.5, align="center", + # plt.bar(range(eig_vals.shape[0]), var_exp, alpha=0.5, align="center", # label="%s individual explained variance" % key) plt.step(range(0, eig_vals.shape[0]+1), cum_var_exp, where="post", linewidth=3, label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99))) plt.yticks([0.0, .25 ,.50, .75, .95, .99]) - #if latent_dim > 20: + # if latent_dim > 20: # plt.xticks([1,5, 10, 15, 20]) # plt.xlim([0, 20]) plt.ylabel("Explained variance ratio", fontsize=16) plt.xlabel("Principal components", fontsize=16) plt.legend(loc="best", fontsize=16, frameon=True) plt.tight_layout() - #plt.savefig(fname=figdir + organ + "_pca_" + model + ".png") + # plt.savefig(fname=figdir + organ + "_pca_" + model + ".png") plt.show() def plot_active_latent_units( @@ -1650,7 +1653,13 @@ def plot_active_latent_units( topology_version, cvs=None ): - colors=['red', 'blue', 'green', 'cyan', 'magenta', 'yellow', 'darkgreen', 'lime', 'navy', 'royalblue', 'pink', 'peru'] + """ + If an embedding file is found that contains z, z_mean, z_var (eg. output from predict_variational() function) + the model will use z, and not z_mean. + """ + + colors = ['red', 'blue', 'green', 'cyan', 'magenta', 'yellow', 'darkgreen', 'lime', 'navy', 'royalblue', 'pink', 'peru'] + def active_latent_units_mask(z): var_x = np.diagonal(np.cov(z.T)) min_var_x = 0.01 @@ -1675,25 +1684,25 @@ def active_latent_units_mask(z): cvs=cvs, ) if len(embedding[0].shape) == 3: - z, mean, logvar = embedding[0] + z = embedding[0][0] # in case of three-dimensional VAE embedding (z, z_mean, z_var), use z else: - mean = embedding[0] - latent_dim = mean.shape[1] - var = np.sort(np.diagonal(np.cov(mean.T)))[::-1] + z = embedding[0] + latent_dim = z.shape[1] + var = np.sort(np.diagonal(np.cov(z.T)))[::-1] log_var = np.log(var) - active_units = np.log(var[active_latent_units_mask(mean)]) + active_units = np.log(var[active_latent_units_mask(z)]) plt.plot(range(1,log_var.shape[0]+1), log_var, color=colors[i], alpha=1.0, linewidth=3, label="%s active units: %i" % (model, len(active_units))) # to plot vertical lines log_var_cut = var.copy() - log_var_cut[~active_latent_units_mask(mean)] = 0 + log_var_cut[~active_latent_units_mask(z)] = 0 log_var_cut = np.log(log_var_cut) num_active = np.argmax(log_var_cut) if num_active > 0: plt.vlines(num_active, ymin = -.15, ymax = 0.15, color=colors[i], linestyle='solid', linewidth=3) if model == "vaevamp": - z1,z2 = np.split(np.log(np.diagonal(np.cov(mean.T))),2) + z1,z2 = np.split(np.log(np.diagonal(np.cov(z.T))),2) plt.plot(range(1,int(latent_dim/2)+1), np.sort(z2)[::-1], color=colors[i], alpha=1.0, label=r"%s $z_2$ active units: %i" % (model, len(z2[z2>np.log(0.01)])), linestyle='dashed', linewidth=3) plt.plot(range(1,int(latent_dim/2)+1), np.sort(z1)[::-1], color=colors[i], alpha=1.0, From 6696cd120e5e25b05ed3954382274de317dabb98 Mon Sep 17 00:00:00 2001 From: mko Date: Fri, 6 Nov 2020 14:49:46 +0100 Subject: [PATCH 3/4] clean up and documentation --- sfaira/train/summaries.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/sfaira/train/summaries.py b/sfaira/train/summaries.py index 2301785c1..886ad5ea4 100644 --- a/sfaira/train/summaries.py +++ b/sfaira/train/summaries.py @@ -1602,6 +1602,8 @@ def plot_npc( cvs=None ): """ + Plots the explained variance ration that accumulates explained variation of the latent space’s ordered + principal components. If an embedding file is found that contains z, z_mean, z_var (eg. output from predict_variational() function) the model will use z, and not z_mean. """ @@ -1609,11 +1611,10 @@ def plot_npc( if self.summary_tab is None: self.create_summary_tab() models = np.unique(self.summary_tab["model_type"]).tolist() - mode = np.mean self.summary_tab["topology"] = [x.split("_")[5] for x in self.summary_tab["model_gs_id"].values] with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=(12,6)) + plt.figure(figsize=(12, 6)) for model in models: model_id, embedding, covar = self.best_model_embedding( subset={"model_type": model, "organ": organ, "topology": topology_version}, @@ -1625,26 +1626,18 @@ def plot_npc( z = embedding[0][0] # in case of three-dimensional VAE embedding (z, z_mean, z_var), use z else: z = embedding[0] - latent_dim = z.shape[1] cov = np.cov(z.T) eig_vals, eig_vecs = np.linalg.eig(cov) eig_sum = sum(eig_vals) var_exp = [(i / eig_sum) for i in sorted(eig_vals, reverse=True)] cum_var_exp = np.cumsum([0] + var_exp) - - # plt.bar(range(eig_vals.shape[0]), var_exp, alpha=0.5, align="center", - # label="%s individual explained variance" % key) plt.step(range(0, eig_vals.shape[0]+1), cum_var_exp, where="post", linewidth=3, label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99))) plt.yticks([0.0, .25 ,.50, .75, .95, .99]) - # if latent_dim > 20: - # plt.xticks([1,5, 10, 15, 20]) - # plt.xlim([0, 20]) plt.ylabel("Explained variance ratio", fontsize=16) plt.xlabel("Principal components", fontsize=16) plt.legend(loc="best", fontsize=16, frameon=True) plt.tight_layout() - # plt.savefig(fname=figdir + organ + "_pca_" + model + ".png") plt.show() def plot_active_latent_units( @@ -1654,6 +1647,8 @@ def plot_active_latent_units( cvs=None ): """ + Plots latent unit activity measured by empirical variance of the expected latent space. + See: https://arxiv.org/abs/1509.00519 If an embedding file is found that contains z, z_mean, z_var (eg. output from predict_variational() function) the model will use z, and not z_mean. """ @@ -1709,8 +1704,7 @@ def active_latent_units_mask(z): label=r"%s $z_1$ active units: %i" % (model, len(z1[z1>np.log(0.01)])), linestyle='dotted', linewidth=3) plt.xlabel(r'Latent unit $i$', fontsize=16) plt.ylabel(r'$\log\,{(A_{\bf z})}_i$', fontsize=16) - #plt.title(r"Latent unit activity", fontsize=16) + plt.title(r"Latent unit activity", fontsize=16) plt.legend(loc="upper right", frameon=True, fontsize=12) plt.tight_layout() - # plt.savefig(fname=figdir + organ + "_activity_statistic_" + model + ".png") plt.show() From 6db238ba75edc77df0a57bc7f89504c51b5e703d Mon Sep 17 00:00:00 2001 From: le-ander <20015434+le-ander@users.noreply.github.com> Date: Fri, 6 Nov 2020 16:25:15 +0100 Subject: [PATCH 4/4] formatting --- sfaira/train/summaries.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sfaira/train/summaries.py b/sfaira/train/summaries.py index 886ad5ea4..0f20a8748 100644 --- a/sfaira/train/summaries.py +++ b/sfaira/train/summaries.py @@ -1632,8 +1632,8 @@ def plot_npc( var_exp = [(i / eig_sum) for i in sorted(eig_vals, reverse=True)] cum_var_exp = np.cumsum([0] + var_exp) plt.step(range(0, eig_vals.shape[0]+1), cum_var_exp, where="post", linewidth=3, - label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99))) - plt.yticks([0.0, .25 ,.50, .75, .95, .99]) + label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99))) + plt.yticks([0.0, .25, .50, .75, .95, .99]) plt.ylabel("Explained variance ratio", fontsize=16) plt.xlabel("Principal components", fontsize=16) plt.legend(loc="best", fontsize=16, frameon=True) @@ -1669,8 +1669,7 @@ def active_latent_units_mask(z): with plt.style.context("seaborn-whitegrid"): plt.figure(figsize=(12, 6)) - plt.axhline(np.log(0.01), color="k", linestyle='dashed', linewidth=2, - label="active unit threshold") + plt.axhline(np.log(0.01), color="k", linestyle='dashed', linewidth=2, label="active unit threshold") for i, model in enumerate(models): model_id, embedding, covar = self.best_model_embedding( subset={"model_type": model, "organ": organ, "topology": topology_version}, @@ -1697,11 +1696,13 @@ def active_latent_units_mask(z): if num_active > 0: plt.vlines(num_active, ymin = -.15, ymax = 0.15, color=colors[i], linestyle='solid', linewidth=3) if model == "vaevamp": - z1,z2 = np.split(np.log(np.diagonal(np.cov(z.T))),2) - plt.plot(range(1,int(latent_dim/2)+1), np.sort(z2)[::-1], color=colors[i], alpha=1.0, - label=r"%s $z_2$ active units: %i" % (model, len(z2[z2>np.log(0.01)])), linestyle='dashed', linewidth=3) - plt.plot(range(1,int(latent_dim/2)+1), np.sort(z1)[::-1], color=colors[i], alpha=1.0, - label=r"%s $z_1$ active units: %i" % (model, len(z1[z1>np.log(0.01)])), linestyle='dotted', linewidth=3) + z1, z2 = np.split(np.log(np.diagonal(np.cov(z.T))),2) + plt.plot(range(1, int(latent_dim/2)+1), np.sort(z2)[::-1], color=colors[i], alpha=1.0, + label=r"%s $z_2$ active units: %i" % (model, len(z2[z2>np.log(0.01)])), linestyle='dashed', + linewidth=3) + plt.plot(range(1, int(latent_dim/2)+1), np.sort(z1)[::-1], color=colors[i], alpha=1.0, + label=r"%s $z_1$ active units: %i" % (model, len(z1[z1 > np.log(0.01)])), + linestyle='dotted', linewidth=3) plt.xlabel(r'Latent unit $i$', fontsize=16) plt.ylabel(r'$\log\,{(A_{\bf z})}_i$', fontsize=16) plt.title(r"Latent unit activity", fontsize=16)