Add docs for SpeakerManager

coqui-ai · Jul 3, 2021 · c25a218 · c25a218
1 parent f382e4c
commit c25a218
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 30 deletions.
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py
@@ -13,40 +13,43 @@
 
 
 class SpeakerManager:
-    """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information
-    in a way that you can query. There are 3 different scenarios considered.
-
-    1. Models using speaker embedding layers. The metafile only includes a mapping of speaker names to ids.
-    2. Models using external embedding vectors (x vectors). The metafile includes a dictionary in the following
-    format.
-
-    ```
-    {
-        'clip_name.wav':{
-            'name': 'speakerA',
-            'embedding'[<d_vector_values>]
-        },
-        ...
-    }
-    ```
-
-    3. Computing x vectors at inference with the speaker encoder. It loads the speaker encoder model and
-    computes x vectors for a given instance.
-
-    >>> >>> # load audio processor and speaker encoder
-    >>> ap = AudioProcessor(**config.audio)
-    >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
-    >>> # load a sample audio and compute embedding
-    >>> waveform = ap.load_wav(sample_wav_path)
-    >>> mel = ap.melspectrogram(waveform)
-    >>> d_vector = manager.compute_d_vector(mel.T)
+    """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
+    in a way that can be queried by speaker or clip.
+
+    There are 3 different scenarios considered:
+
+    1. Models using speaker embedding layers. The datafile only maps speaker names to ids used by the embedding layer.
+    2. Models using d-vectors. The datafile includes a dictionary in the following format.
+
+    ::
+
+        {
+            'clip_name.wav':{
+                'name': 'speakerA',
+                'embedding': [<d_vector_values>]
+            },
+            ...
+        }
+
+
+    3. Computing the d-vectors with the speaker encoder. The manager loads the speaker encoder model and
+    computes the d-vectors for a given clip or speaker.
 
     Args:
         d_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "".
         speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by
         TTS models. Defaults to "".
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
         encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "".
+
+    Examples:
+        >>> # load audio processor and speaker encoder
+        >>> ap = AudioProcessor(**config.audio)
+        >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
+        >>> # load a sample audio and compute embedding
+        >>> waveform = ap.load_wav(sample_wav_path)
+        >>> mel = ap.melspectrogram(waveform)
+        >>> d_vector = manager.compute_d_vector(mel.T)
     """
 
     def __init__(
@@ -188,7 +191,7 @@ def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize
         Args:
             speaker_idx (str): Target speaker ID.
             num_samples (int, optional): Number of samples to be averaged. Defaults to None.
-            randomize (bool, optional): Pick random `num_samples`of d_vectors. Defaults to False.
+            randomize (bool, optional): Pick random `num_samples` of d_vectors. Defaults to False.
 
         Returns:
             np.ndarray: Mean d_vector.
@@ -311,7 +314,7 @@ def save_speaker_mapping(out_path, speaker_mapping):
 
 
 def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager:
-    """Create a SpeakerManager instance based on provided configuration.
+    """Initialize a `SpeakerManager` instance from the provided config.
 
     Args:
         c (Coqpit): Model configuration.
@@ -321,7 +324,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
         out_path (str, optional): Save the generated speaker IDs to a output path. Defaults to None.
 
     Returns:
-        SpeakerManager:
+        SpeakerManager: An initialized, ready-to-use instance.
     """
     speaker_manager = SpeakerManager()
     if c.use_speaker_embedding:

diff --git a/docs/source/index.md b/docs/source/index.md
@@ -37,6 +37,7 @@
     main_classes/model_api
     main_classes/dataset
     main_classes/gan
+    main_classes/speaker_manager
 
 .. toctree::
     :maxdepth: 2

diff --git a/docs/source/main_classes/speaker_manager.md b/docs/source/main_classes/speaker_manager.md
@@ -0,0 +1,11 @@
+# Speaker Manager API
+
+The {class}`TTS.tts.utils.speakers.SpeakerManager` organizes speaker-related data and information for 🐸TTS models. It is
+especially useful for multi-speaker models.
+
+
+## Speaker Manager
+```{eval-rst}
+.. automodule:: TTS.tts.utils.speakers
+    :members:
+```