MPI-Dortmund · thorstenwagner · Jul 12, 2023 · Jun 20, 2023 · Jun 20, 2023 · Jun 20, 2023
diff --git a/docs/changes.rst b/docs/changes.rst
@@ -1,6 +1,16 @@
 Changes
 =======
 
+Version 0.5.0 (upcoming)
+************************
+
+.. admonition:: **CHANGES**
+
+    * The ``tomotwin_embed.py tomogram`` command now has an optional ``--mask`` option to select regions of interest for embedding.
+    * The ``tomotwin_tools.py embedding_mask`` command now estimates a ROI mask that masks out some portions of empty tomogram volume. When the generated mask is used with ``tomotwin_embed.py tomogram``, the embedding step is 2 times faster. CAUTION: In TomoTwin 0.4 the ``embedding_mask`` command calculated a label mask for the clustering workflow. That label mask is now generated automatically during the calculation of the UMAP (``tomotwin_tools.py umap``).
+    * For the clustering workflow, you can now calculate the medoid instead of the column-wise average. This should be a much better representation of the cluster center.
+
+
 Version 0.4.3
 *************
 
@@ -14,7 +24,7 @@ Version 0.4.0
 *************
 
 * Official clustering workflow release. Please checkout the updated installation instructions and in depth tutorial.
-* Added important tools like ``tomotwin_tools umap`` and ``tomotwin_tools embeddings_mask``
+* Added important tools like ``tomotwin_tools.py umap`` and ``tomotwin_tools.py embedding_mask``
 * Added more unit tests
 
 Version 0.3.0

diff --git a/docs/installation.rst b/docs/installation.rst
@@ -58,7 +58,7 @@ Download latest model
 
 :Number of proteins: 120
 
-:Link: `https <https://ftp.gwdg.de/pub/misc/sphire/TomoTwin/models/tomotwin_model_p120_052022_loss.pth>`_
+:Link: `Zenodo <https://doi.org/10.5281/zenodo.8137931>`_
 
 System requirements
 ^^^^^^^^^^^^^^^^^^^

diff --git a/docs/tutorials/text_modules/embed.rst b/docs/tutorials/text_modules/embed.rst
@@ -4,4 +4,20 @@ To embed your tomogram using two GPUs use:
 
 .. prompt:: bash $
 
-    CUDA_VISIBLE_DEVICES=0,1 tomotwin_embed.py tomogram -m LATEST_TOMOTWIN_MODEL.pth -v your_tomo_a10.mrc -b 256 -o out/embed/tomo/ -s 2
+    CUDA_VISIBLE_DEVICES=0,1 tomotwin_embed.py tomogram -m LATEST_TOMOTWIN_MODEL.pth -v your_tomo_a10.mrc -b 256 -o out/embed/tomo/ -s 2
+
+.. admonition:: Speed up embedding using ROI mask
+
+    With TomoTwin 0.5, the embedding command supports the use of masks. With a mask you can define which regions of your tomogram actually get embedded and therefore speed up the embedding.
+    We also provide a new tool that calculates a mask that excludes areas that probably do not contain any protein. You can run it with:
+
+    .. prompt:: bash $
+
+        tomotwin_tools.py embedding_mask -i your_tomo_a10.mrc -o out/mask/
+
+    The mask you find there can be used when running ``tomotwin_embed.py`` using the argument ``--mask``.
+    As this is still experimental, please check that the mask does not exclude any important areas. You can do that easily with napari by opening the tomogram together with your mask and then changing the opacity of the mask:
+
+    .. prompt:: bash $
+
+        napari your_tomo_a10.mrc out/mask/your_tomo_a10_mask.mrc
diff --git a/docs/tutorials/tut02_cluster.rst b/docs/tutorials/tut02_cluster.rst
@@ -16,18 +16,14 @@ Now we will approximate the tomogram embeddings to 2D to allow for efficient vis
 
 .. prompt:: bash $
 
-    tomotwin_tools.py umap -i your_tomo_a10/embed/tomo/tomo_embeddings.temb -o your_tomo_a10/clustering/
+    tomotwin_tools.py umap -i out/embed/tomo/tomo_embeddings.temb -o out/clustering/
 
 .. note::
 
     If you encounter an out of memory error here, you may need to reduce the :guilabel:`fit_sample_size` and/or :guilabel:`chunk_size` values (default 400,000).
 
 
-Additionally, we will generate a mask of the embeddings to allow us to track which UMAP values correspond to which points in the tomogram. To generate this mask:
-
-.. prompt:: bash $
-
-    tomotwin_tools.py embedding_mask -i your_tomo_a10/embed/tomo/tomo_embeddings.temb -o your_tomo_a10/clustering/
+Additionally, this command generates a mask (:file:`tomo_embeddings_label_mask.mrci`) of the embeddings to allow us to track which UMAP values correspond to which points in the tomogram.
 
 4. Load data for clustering in Napari
 ^^^^^^^^^^^^^^^^^^^^^^^^
@@ -36,7 +32,7 @@ Now that we have all the input files for the clustering workflow we can get star
 
 .. prompt:: bash $
 
-    napari your_tomo_a10.mrc your_tomo_a10/clustering/your_tomo_a10_embedding_label_mask.mrci
+    napari your_tomo_a10.mrc out/clustering/your_tomo_a10_embedding_label_mask.mrci
 
 Next open the napari-tomotwin clustering tool via :guilabel:`Plugins` -> :guilabel:`napari-tomotwin` -> :guilabel:`Cluster UMAP embeddings`. Then choose the :guilabel:`Path to UMAP` by clicking on :guilabel:`Select file` and provide the path to your :file:`your_tomo_a10_embeddings.tumap`. 
 Click :guilabel:`Load` and after a second, a 2D plot of the umap embeddings should appear in the plugin window.
@@ -101,7 +97,7 @@ The map command will calculate the pairwise distances/similarity between the tar
 
 .. prompt:: bash $
 
-    tomotwin_map.py distance -r your_tomo_a10/clustering/cluster_targets.temb -v your_tomo_a10/embed/tomo/your_tomo_a10_embeddings.temb -o your_tomo_a10/map/
+    tomotwin_map.py distance -r out/clustering/cluster_targets.temb -v out/embed/tomo/your_tomo_a10_embeddings.temb -o out/map/
 
 8. Localize potential particles
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/tests/test_boxer.py b/tests/test_boxer.py
@@ -11,7 +11,7 @@ def test_SlidingWindowBoxer_stride1(self):
         tomo = np.random.randn(9,9,9)
         boxes = boxer.box(tomogram=tomo)
 
-        self.assertEqual(boxes.volumes.shape[0]*boxes.volumes.shape[1]*boxes.volumes.shape[2], 7*7*7)
+        self.assertEqual(7*7*7,len(boxes))
 
     def test_SlidingWindowBoxer_stride2(self):
         boxer = SlidingWindowBoxer(
@@ -21,7 +21,24 @@ def test_SlidingWindowBoxer_stride2(self):
         import numpy as np
         tomo = np.random.randn(9,9,9)
         boxes = boxer.box(tomogram=tomo)
-        self.assertEqual(boxes.volumes.shape[0]*boxes.volumes.shape[1]*boxes.volumes.shape[2], 4*4*4)
+        self.assertEqual(len(boxes), 4*4*4)
+
+    def test_SlidingWindowBoxer_stride2_mask(self):
+
+        import numpy as np
+        tomo = np.random.randn(9, 9, 9)
+        mask = np.zeros(shape=(9,9,9))
+        mask[3:6,3:6,3:6] = 1
+        mask = mask != 0
+
+        boxer = SlidingWindowBoxer(
+            stride=3,
+            box_size=3,
+            mask=mask
+        )
+
+        boxes = boxer.box(tomogram=tomo)
+        self.assertEqual(len(boxes), 1)
 
 
 if __name__ == '__main__':

diff --git a/tests/test_volumedata.py b/tests/test_volumedata.py
@@ -1,5 +1,6 @@
 import unittest
-from tomotwin.modules.inference.volumedata import SlidingWindowVolumeData
+from tomotwin.modules.inference.volumedata import SimpleVolumeData
+from tomotwin.modules.inference.boxer import SlidingWindowBoxer
 import numpy as np
 import numpy.lib.stride_tricks as tricks
 
@@ -15,18 +16,17 @@ def test_something(self):
         box_size = 3
         stride = 2
 
-        window_shape = (box_size, box_size, box_size)
-        sliding_window_views = tricks.sliding_window_view(
-            vol, window_shape=window_shape
+        sliding_window_strides = SlidingWindowBoxer._calc_sliding_volumes(
+            tomogram=vol,
+            stride=(stride,stride,stride),
+            window_shape=(box_size,box_size,box_size)
         )
-
-        sliding_window_strides = sliding_window_views[
-                                 :: stride, :: stride, :: stride
-                                 ]
-        dat = SlidingWindowVolumeData(volumes=sliding_window_strides, stride=stride, boxsize=box_size)
+        roi = SlidingWindowBoxer._calc_volume_roi(sliding_window_strides,stride=(stride,stride,stride),box_size=box_size)
+        dat = SimpleVolumeData(volumes=sliding_window_strides, roi=roi)
 
         for i in range(len(dat)):
-            loc = dat.get_localization(i)
+            loc = roi.center_coords[i]
+
             sub = dat[i]
             if loc[0] == pos0 and loc[1] == pos1 and loc[2] == pos2:
                 self.assertEqual(sub[1, 1, 1], val)

diff --git a/tomotwin/embed_main.py b/tomotwin/embed_main.py
@@ -480,11 +480,16 @@ def embed_tomogram(
         tomo: np.array,
         embedor: Embedor,
         conf: EmbedConfiguration,
-        window_size: int) -> pd.DataFrame:
+        window_size: int,
+        mask: np.array = None) -> pd.DataFrame:
     """
     Embeds a tomogram
     :return: DataFrame of embeddings
     """
+
+    if mask is not None:
+        assert tomo.shape == mask.shape, f"Tomogram shape ({tomo.shape}) and mask shape ({mask.shape}) need to be equal."
+
     if conf.zrange:
         hb = int((window_size - 1) // 2)
         minz = max(0, conf.zrange[0] - hb)
@@ -495,7 +500,7 @@ def embed_tomogram(
         )  # here we need to take make sure that the box size is subtracted etc.
 
     boxer = SlidingWindowBoxer(
-        box_size=window_size, stride=conf.stride, zrange=conf.zrange
+        box_size=window_size, stride=conf.stride, zrange=conf.zrange, mask=mask
     )
     embeddings = sliding_window_embedding(tomo=tomo, boxer=boxer, embedor=embedor)
 
@@ -557,7 +562,10 @@ def run(conf: EmbedConfiguration) -> None:
     window_size = get_window_size(conf.model_path)
     if conf.mode == EmbedMode.TOMO:
         tomo = -1 * MrcFormat.read(conf.volumes_path)  # -1 to invert the contrast
-        embed_tomogram(tomo, embedor, conf, window_size)
+        mask = None
+        if conf.maskpth is not None:
+            mask = MrcFormat.read(conf.maskpth)!=0
+        embed_tomogram(tomo, embedor, conf, window_size, mask)
     elif conf.mode == EmbedMode.VOLUMES:
         paths = []
         for p in conf.volumes_path:

diff --git a/tomotwin/modules/inference/argparse_embed_ui.py b/tomotwin/modules/inference/argparse_embed_ui.py
@@ -393,6 +393,7 @@ def __init__(self):
         self.stride = None
         self.mode = None
         self.zrange = None
+        self.maskpth = None
 
     def run(self, args=None) -> None:
         parser = self.create_parser()
@@ -413,6 +414,7 @@ def run(self, args=None) -> None:
             self.zrange = args.zrange
             if len(self.stride) == 1:
                 self.stride = self.stride*3
+            self.maskpth = args.mask
 
     def get_embed_configuration(self) -> EmbedConfiguration:
         conf = EmbedConfiguration(
@@ -422,7 +424,8 @@ def get_embed_configuration(self) -> EmbedConfiguration:
             mode=self.mode,
             batchsize=self.batchsize,
             stride=self.stride,
-            zrange=self.zrange
+            zrange=self.zrange,
+            maskpth=self.maskpth
         )
         return conf
 
@@ -466,6 +469,8 @@ def create_volume_parser(parser):
             help="All output files are written in that path.",
         )
 
+
+
     @staticmethod
     def create_tomo_parser(parser):
         """
@@ -523,6 +528,14 @@ def create_tomo_parser(parser):
             help="Minimum z and maximum z for to run the sliding window on. Handy to skip the void volume in order to speed up the embedding.",
         )
 
+        parser.add_argument(
+            "--mask",
+            type=str,
+            required=False,
+            default=None,
+            help="Path to binary mask to define embedding region (mrc format). All values != 0 are interpreted as 'True'.",
+        )
+
     def create_parser(self) -> argparse.ArgumentParser:
         """
         Create the embedor parser