From d8de9ec893179eed699d62ae28d3a1896a9d045b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20K=C3=B6nig?= <martin.koenig@ymail.com>
Date: Mon, 26 Oct 2020 18:12:37 +0100
Subject: [PATCH 1/4] add plot_npc and plot_active_latent_units

---
 sfaira/train/summaries.py | 114 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 112 insertions(+), 2 deletions(-)

diff --git a/sfaira/train/summaries.py b/sfaira/train/summaries.py
index c42fa6779..d90aa5fef 100644
--- a/sfaira/train/summaries.py
+++ b/sfaira/train/summaries.py
@@ -506,7 +506,7 @@ def plot_training_history(
                 ).tolist():
                     sns_data_temp = pandas.DataFrame(self.histories[run])
                     sns_data_temp["epoch"] = np.arange(0, sns_data_temp.shape[0])
-                    sns_data_temp["cv"] = run.split("_")[-1]
+                    sns_data_temp["cv"] = int(run.split("_")[-1])
                     sns_data.append(sns_data_temp)
                 sns_data = pandas.concat(sns_data, axis=0)
             else:
@@ -1594,4 +1594,114 @@ def plot_gradient_cor(
         plt.tight_layout()
         if save is not None:
             plt.savefig(save)
-        plt.show()
\ No newline at end of file
+        plt.show()
+
+    def plot_npc(
+        self,
+        organ,
+        topology_version,
+        cvs=None
+    ):
+        import matplotlib.pyplot as plt
+        if self.summary_tab is None:
+            self.create_summary_tab() 
+        models = np.unique(self.summary_tab["model_type"]).tolist()
+        mode = np.mean
+        self.summary_tab["topology"] = [x.split("_")[5] for x in self.summary_tab["model_gs_id"].values]
+        
+        with plt.style.context("seaborn-whitegrid"):
+            fig, ax = plt.subplots(figsize=(12,6))
+            for model in models:
+                model_id, embedding, covar = self.best_model_embedding(
+                    subset={"model_type": model, "organ": organ, "topology": topology_version},
+                    partition="val",
+                    metric="loss",
+                    cvs=cvs,
+                )
+                if len(embedding[0].shape) == 3:
+                    z, mean, logvar = embedding[0]
+                else:
+                    z = embedding[0]
+                latent_dim = z.shape[1]
+                cov = np.cov(z.T)
+                eig_vals, eig_vecs = np.linalg.eig(cov)
+                eig_sum = sum(eig_vals)
+                var_exp = [(i / eig_sum) for i in sorted(eig_vals, reverse=True)]
+                cum_var_exp = np.cumsum([0] + var_exp)
+
+                #plt.bar(range(eig_vals.shape[0]), var_exp, alpha=0.5, align="center",
+                #        label="%s individual explained variance" % key)
+                plt.step(range(0, eig_vals.shape[0]+1), cum_var_exp, where="post", linewidth=3,
+                        label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99)))
+            plt.yticks([0.0, .25 ,.50, .75, .95, .99])
+            #if latent_dim > 20:
+            #    plt.xticks([1,5, 10, 15, 20])
+            #    plt.xlim([0, 20])
+            plt.ylabel("Explained variance ratio", fontsize=16)
+            plt.xlabel("Principal components", fontsize=16)
+            plt.legend(loc="best", fontsize=16, frameon=True)
+            plt.tight_layout()
+            #plt.savefig(fname=figdir + organ + "_pca_" + model + ".png")
+            plt.show()
+
+    def plot_active_latent_units(
+        self, 
+        organ, 
+        topology_version,
+        cvs=None
+    ):
+        colors=['red', 'blue', 'green', 'cyan', 'magenta', 'yellow', 'darkgreen', 'lime', 'navy', 'royalblue', 'pink', 'peru']
+        def active_latent_units_mask(z):
+            var_x = np.diagonal(np.cov(z.T))
+            min_var_x = 0.01
+            active_units_mask = var_x > min_var_x
+            return active_units_mask
+        
+        import matplotlib.pyplot as plt
+        if self.summary_tab is None:
+            self.create_summary_tab()
+        models = np.unique(self.summary_tab["model_type"]).tolist()
+        self.summary_tab["topology"] = [x.split("_")[5] for x in self.summary_tab["model_gs_id"].values]
+
+        with plt.style.context("seaborn-whitegrid"):
+            plt.figure(figsize=(12, 6))
+            plt.axhline(np.log(0.01), color="k", linestyle='dashed', linewidth=2,
+                   label="active unit threshold")
+            for i, model in enumerate(models):
+                model_id, embedding, covar = self.best_model_embedding(
+                        subset={"model_type": model, "organ": organ, "topology": topology_version},
+                        partition="val",
+                        metric="loss",
+                        cvs=cvs,
+                    )
+                if len(embedding[0].shape) == 3:
+                    z, mean, logvar = embedding[0]
+                else:
+                    mean = embedding[0]
+                latent_dim = mean.shape[1]
+                var = np.sort(np.diagonal(np.cov(mean.T)))[::-1]
+                log_var = np.log(var)
+                active_units = np.log(var[active_latent_units_mask(mean)])
+
+                plt.plot(range(1,log_var.shape[0]+1), log_var, color=colors[i], alpha=1.0, linewidth=3,
+                         label="%s active units: %i" % (model, len(active_units)))
+                # to plot vertical lines
+                log_var_cut = var.copy()
+                log_var_cut[~active_latent_units_mask(mean)] = 0
+                log_var_cut = np.log(log_var_cut)
+                num_active = np.argmax(log_var_cut)
+                if num_active > 0:
+                    plt.vlines(num_active, ymin = -.15, ymax = 0.15, color=colors[i], linestyle='solid', linewidth=3)
+                if model == "vaevamp":
+                    z1,z2 = np.split(np.log(np.diagonal(np.cov(mean.T))),2)
+                    plt.plot(range(1,int(latent_dim/2)+1), np.sort(z2)[::-1], color=colors[i], alpha=1.0,
+                         label=r"%s $z_2$ active units: %i" % (model, len(z2[z2>np.log(0.01)])), linestyle='dashed', linewidth=3)
+                    plt.plot(range(1,int(latent_dim/2)+1), np.sort(z1)[::-1], color=colors[i], alpha=1.0,
+                         label=r"%s $z_1$ active units: %i" % (model, len(z1[z1>np.log(0.01)])), linestyle='dotted', linewidth=3)
+            plt.xlabel(r'Latent unit $i$', fontsize=16)
+            plt.ylabel(r'$\log\,{(A_{\bf z})}_i$', fontsize=16)
+            #plt.title(r"Latent unit activity", fontsize=16)
+            plt.legend(loc="upper right", frameon=True, fontsize=12)
+            plt.tight_layout()
+            # plt.savefig(fname=figdir + organ + "_activity_statistic_" + model + ".png")
+            plt.show()

From fd02b11a73dc399b72291f8c049fc4049182a227 Mon Sep 17 00:00:00 2001
From: le-ander <20015434+le-ander@users.noreply.github.com>
Date: Mon, 2 Nov 2020 14:51:50 +0100
Subject: [PATCH 2/4] make sure handling of z and z_mean is consistent for VAE
 embeddings

---
 sfaira/train/summaries.py | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/sfaira/train/summaries.py b/sfaira/train/summaries.py
index d90aa5fef..2301785c1 100644
--- a/sfaira/train/summaries.py
+++ b/sfaira/train/summaries.py
@@ -1581,7 +1581,6 @@ def plot_gradient_cor(
         if by_type:
             v = avg_grads[model_type[0]]
             celltypes_coord = celltypes[model_type[0]]
-            cell_names = [str(i) for i in range(v.shape[0])]
             cormat = pandas.DataFrame(
                 np.corrcoef(v),
                 index=celltypes_coord,
@@ -1602,6 +1601,10 @@ def plot_npc(
         topology_version,
         cvs=None
     ):
+        """
+        If an embedding file is found that contains z, z_mean, z_var (eg. output from predict_variational() function)
+        the model will use z, and not z_mean.
+        """
         import matplotlib.pyplot as plt
         if self.summary_tab is None:
             self.create_summary_tab() 
@@ -1619,7 +1622,7 @@ def plot_npc(
                     cvs=cvs,
                 )
                 if len(embedding[0].shape) == 3:
-                    z, mean, logvar = embedding[0]
+                    z = embedding[0][0]  # in case of three-dimensional VAE embedding (z, z_mean, z_var), use z
                 else:
                     z = embedding[0]
                 latent_dim = z.shape[1]
@@ -1629,19 +1632,19 @@ def plot_npc(
                 var_exp = [(i / eig_sum) for i in sorted(eig_vals, reverse=True)]
                 cum_var_exp = np.cumsum([0] + var_exp)
 
-                #plt.bar(range(eig_vals.shape[0]), var_exp, alpha=0.5, align="center",
+                # plt.bar(range(eig_vals.shape[0]), var_exp, alpha=0.5, align="center",
                 #        label="%s individual explained variance" % key)
                 plt.step(range(0, eig_vals.shape[0]+1), cum_var_exp, where="post", linewidth=3,
                         label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99)))
             plt.yticks([0.0, .25 ,.50, .75, .95, .99])
-            #if latent_dim > 20:
+            # if latent_dim > 20:
             #    plt.xticks([1,5, 10, 15, 20])
             #    plt.xlim([0, 20])
             plt.ylabel("Explained variance ratio", fontsize=16)
             plt.xlabel("Principal components", fontsize=16)
             plt.legend(loc="best", fontsize=16, frameon=True)
             plt.tight_layout()
-            #plt.savefig(fname=figdir + organ + "_pca_" + model + ".png")
+            # plt.savefig(fname=figdir + organ + "_pca_" + model + ".png")
             plt.show()
 
     def plot_active_latent_units(
@@ -1650,7 +1653,13 @@ def plot_active_latent_units(
         topology_version,
         cvs=None
     ):
-        colors=['red', 'blue', 'green', 'cyan', 'magenta', 'yellow', 'darkgreen', 'lime', 'navy', 'royalblue', 'pink', 'peru']
+        """
+        If an embedding file is found that contains z, z_mean, z_var (eg. output from predict_variational() function)
+        the model will use z, and not z_mean.
+        """
+
+        colors = ['red', 'blue', 'green', 'cyan', 'magenta', 'yellow', 'darkgreen', 'lime', 'navy', 'royalblue', 'pink', 'peru']
+
         def active_latent_units_mask(z):
             var_x = np.diagonal(np.cov(z.T))
             min_var_x = 0.01
@@ -1675,25 +1684,25 @@ def active_latent_units_mask(z):
                         cvs=cvs,
                     )
                 if len(embedding[0].shape) == 3:
-                    z, mean, logvar = embedding[0]
+                    z = embedding[0][0]  # in case of three-dimensional VAE embedding (z, z_mean, z_var), use z
                 else:
-                    mean = embedding[0]
-                latent_dim = mean.shape[1]
-                var = np.sort(np.diagonal(np.cov(mean.T)))[::-1]
+                    z = embedding[0]
+                latent_dim = z.shape[1]
+                var = np.sort(np.diagonal(np.cov(z.T)))[::-1]
                 log_var = np.log(var)
-                active_units = np.log(var[active_latent_units_mask(mean)])
+                active_units = np.log(var[active_latent_units_mask(z)])
 
                 plt.plot(range(1,log_var.shape[0]+1), log_var, color=colors[i], alpha=1.0, linewidth=3,
                          label="%s active units: %i" % (model, len(active_units)))
                 # to plot vertical lines
                 log_var_cut = var.copy()
-                log_var_cut[~active_latent_units_mask(mean)] = 0
+                log_var_cut[~active_latent_units_mask(z)] = 0
                 log_var_cut = np.log(log_var_cut)
                 num_active = np.argmax(log_var_cut)
                 if num_active > 0:
                     plt.vlines(num_active, ymin = -.15, ymax = 0.15, color=colors[i], linestyle='solid', linewidth=3)
                 if model == "vaevamp":
-                    z1,z2 = np.split(np.log(np.diagonal(np.cov(mean.T))),2)
+                    z1,z2 = np.split(np.log(np.diagonal(np.cov(z.T))),2)
                     plt.plot(range(1,int(latent_dim/2)+1), np.sort(z2)[::-1], color=colors[i], alpha=1.0,
                          label=r"%s $z_2$ active units: %i" % (model, len(z2[z2>np.log(0.01)])), linestyle='dashed', linewidth=3)
                     plt.plot(range(1,int(latent_dim/2)+1), np.sort(z1)[::-1], color=colors[i], alpha=1.0,

From 6696cd120e5e25b05ed3954382274de317dabb98 Mon Sep 17 00:00:00 2001
From: mko <martin.koenig@tum.de>
Date: Fri, 6 Nov 2020 14:49:46 +0100
Subject: [PATCH 3/4] clean up and documentation

---
 sfaira/train/summaries.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/sfaira/train/summaries.py b/sfaira/train/summaries.py
index 2301785c1..886ad5ea4 100644
--- a/sfaira/train/summaries.py
+++ b/sfaira/train/summaries.py
@@ -1602,6 +1602,8 @@ def plot_npc(
         cvs=None
     ):
         """
+        Plots the explained variance ration that accumulates explained variation of the latent space’s ordered
+        principal components.
         If an embedding file is found that contains z, z_mean, z_var (eg. output from predict_variational() function)
         the model will use z, and not z_mean.
         """
@@ -1609,11 +1611,10 @@ def plot_npc(
         if self.summary_tab is None:
             self.create_summary_tab() 
         models = np.unique(self.summary_tab["model_type"]).tolist()
-        mode = np.mean
         self.summary_tab["topology"] = [x.split("_")[5] for x in self.summary_tab["model_gs_id"].values]
         
         with plt.style.context("seaborn-whitegrid"):
-            fig, ax = plt.subplots(figsize=(12,6))
+            plt.figure(figsize=(12, 6))
             for model in models:
                 model_id, embedding, covar = self.best_model_embedding(
                     subset={"model_type": model, "organ": organ, "topology": topology_version},
@@ -1625,26 +1626,18 @@ def plot_npc(
                     z = embedding[0][0]  # in case of three-dimensional VAE embedding (z, z_mean, z_var), use z
                 else:
                     z = embedding[0]
-                latent_dim = z.shape[1]
                 cov = np.cov(z.T)
                 eig_vals, eig_vecs = np.linalg.eig(cov)
                 eig_sum = sum(eig_vals)
                 var_exp = [(i / eig_sum) for i in sorted(eig_vals, reverse=True)]
                 cum_var_exp = np.cumsum([0] + var_exp)
-
-                # plt.bar(range(eig_vals.shape[0]), var_exp, alpha=0.5, align="center",
-                #        label="%s individual explained variance" % key)
                 plt.step(range(0, eig_vals.shape[0]+1), cum_var_exp, where="post", linewidth=3,
                         label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99)))
             plt.yticks([0.0, .25 ,.50, .75, .95, .99])
-            # if latent_dim > 20:
-            #    plt.xticks([1,5, 10, 15, 20])
-            #    plt.xlim([0, 20])
             plt.ylabel("Explained variance ratio", fontsize=16)
             plt.xlabel("Principal components", fontsize=16)
             plt.legend(loc="best", fontsize=16, frameon=True)
             plt.tight_layout()
-            # plt.savefig(fname=figdir + organ + "_pca_" + model + ".png")
             plt.show()
 
     def plot_active_latent_units(
@@ -1654,6 +1647,8 @@ def plot_active_latent_units(
         cvs=None
     ):
         """
+        Plots latent unit activity measured by empirical variance of the expected latent space.
+        See: https://arxiv.org/abs/1509.00519
         If an embedding file is found that contains z, z_mean, z_var (eg. output from predict_variational() function)
         the model will use z, and not z_mean.
         """
@@ -1709,8 +1704,7 @@ def active_latent_units_mask(z):
                          label=r"%s $z_1$ active units: %i" % (model, len(z1[z1>np.log(0.01)])), linestyle='dotted', linewidth=3)
             plt.xlabel(r'Latent unit $i$', fontsize=16)
             plt.ylabel(r'$\log\,{(A_{\bf z})}_i$', fontsize=16)
-            #plt.title(r"Latent unit activity", fontsize=16)
+            plt.title(r"Latent unit activity", fontsize=16)
             plt.legend(loc="upper right", frameon=True, fontsize=12)
             plt.tight_layout()
-            # plt.savefig(fname=figdir + organ + "_activity_statistic_" + model + ".png")
             plt.show()

From 6db238ba75edc77df0a57bc7f89504c51b5e703d Mon Sep 17 00:00:00 2001
From: le-ander <20015434+le-ander@users.noreply.github.com>
Date: Fri, 6 Nov 2020 16:25:15 +0100
Subject: [PATCH 4/4] formatting

---
 sfaira/train/summaries.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/sfaira/train/summaries.py b/sfaira/train/summaries.py
index 886ad5ea4..0f20a8748 100644
--- a/sfaira/train/summaries.py
+++ b/sfaira/train/summaries.py
@@ -1632,8 +1632,8 @@ def plot_npc(
                 var_exp = [(i / eig_sum) for i in sorted(eig_vals, reverse=True)]
                 cum_var_exp = np.cumsum([0] + var_exp)
                 plt.step(range(0, eig_vals.shape[0]+1), cum_var_exp, where="post", linewidth=3,
-                        label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99)))
-            plt.yticks([0.0, .25 ,.50, .75, .95, .99])
+                         label="%s cumulative explained variance (95%%: %s / 99%%: %s)" % (model, np.sum(cum_var_exp < .95), np.sum(cum_var_exp < .99)))
+            plt.yticks([0.0, .25, .50, .75, .95, .99])
             plt.ylabel("Explained variance ratio", fontsize=16)
             plt.xlabel("Principal components", fontsize=16)
             plt.legend(loc="best", fontsize=16, frameon=True)
@@ -1669,8 +1669,7 @@ def active_latent_units_mask(z):
 
         with plt.style.context("seaborn-whitegrid"):
             plt.figure(figsize=(12, 6))
-            plt.axhline(np.log(0.01), color="k", linestyle='dashed', linewidth=2,
-                   label="active unit threshold")
+            plt.axhline(np.log(0.01), color="k", linestyle='dashed', linewidth=2, label="active unit threshold")
             for i, model in enumerate(models):
                 model_id, embedding, covar = self.best_model_embedding(
                         subset={"model_type": model, "organ": organ, "topology": topology_version},
@@ -1697,11 +1696,13 @@ def active_latent_units_mask(z):
                 if num_active > 0:
                     plt.vlines(num_active, ymin = -.15, ymax = 0.15, color=colors[i], linestyle='solid', linewidth=3)
                 if model == "vaevamp":
-                    z1,z2 = np.split(np.log(np.diagonal(np.cov(z.T))),2)
-                    plt.plot(range(1,int(latent_dim/2)+1), np.sort(z2)[::-1], color=colors[i], alpha=1.0,
-                         label=r"%s $z_2$ active units: %i" % (model, len(z2[z2>np.log(0.01)])), linestyle='dashed', linewidth=3)
-                    plt.plot(range(1,int(latent_dim/2)+1), np.sort(z1)[::-1], color=colors[i], alpha=1.0,
-                         label=r"%s $z_1$ active units: %i" % (model, len(z1[z1>np.log(0.01)])), linestyle='dotted', linewidth=3)
+                    z1, z2 = np.split(np.log(np.diagonal(np.cov(z.T))),2)
+                    plt.plot(range(1, int(latent_dim/2)+1), np.sort(z2)[::-1], color=colors[i], alpha=1.0,
+                             label=r"%s $z_2$ active units: %i" % (model, len(z2[z2>np.log(0.01)])), linestyle='dashed',
+                             linewidth=3)
+                    plt.plot(range(1, int(latent_dim/2)+1), np.sort(z1)[::-1], color=colors[i], alpha=1.0,
+                             label=r"%s $z_1$ active units: %i" % (model, len(z1[z1 > np.log(0.01)])),
+                             linestyle='dotted', linewidth=3)
             plt.xlabel(r'Latent unit $i$', fontsize=16)
             plt.ylabel(r'$\log\,{(A_{\bf z})}_i$', fontsize=16)
             plt.title(r"Latent unit activity", fontsize=16)