From 880f114f6a8232482775db44631c388e35aa72c5 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 28 Feb 2022 09:30:35 -0300
Subject: [PATCH 1/2] Add support for the speaker encoder training using torch spectrograms

---
 TTS/bin/train_encoder.py       |  1 +
 TTS/speaker_encoder/dataset.py | 10 ++++++++--
 TTS/tts/datasets/formatters.py | 11 ++++++-----
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index 5828411c2a..b7424698f6 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -46,6 +46,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
             sample_from_storage_p=c.storage["sample_from_storage_p"],
             verbose=verbose,
             augmentation_config=c.audio_augmentation,
+            use_torch_spec=c.model_params.get("use_torch_spec", False),
         )
 
         # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py
index 28a23e2fa3..ece424eee3 100644
--- a/TTS/speaker_encoder/dataset.py
+++ b/TTS/speaker_encoder/dataset.py
@@ -20,6 +20,7 @@ def __init__(
         skip_speakers=False,
         verbose=False,
         augmentation_config=None,
+        use_torch_spec=None,
     ):
         """
         Args:
@@ -37,6 +38,7 @@ def __init__(
         self.skip_speakers = skip_speakers
         self.ap = ap
         self.verbose = verbose
+        self.use_torch_spec = use_torch_spec
         self.__parse_items()
         storage_max_size = storage_size * num_speakers_in_batch
         self.storage = Storage(
@@ -241,8 +243,12 @@ def collate_fn(self, batch):
                         self.gaussian_augmentation_config["max_amplitude"],
                         size=len(wav),
                     )
-            mel = self.ap.melspectrogram(wav)
-            feats_.append(torch.FloatTensor(mel))
+
+            if not self.use_torch_spec:
+                mel = self.ap.melspectrogram(wav)
+                feats_.append(torch.FloatTensor(mel))
+            else:
+                feats_.append(torch.FloatTensor(wav))
 
             labels.append(torch.LongTensor(labels_))
             feats.extend(feats_)
diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index aacfc64745..7555e0f751 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -334,21 +334,22 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic
     return items
 
 
-def vctk_old(root_path, meta_files=None, wavs_path="wav48"):
+def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None):
     """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
-    test_speakers = meta_files
     items = []
     meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True)
     for meta_file in meta_files:
         _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep)
         file_id = txt_file.split(".")[0]
-        if isinstance(test_speakers, list):  # if is list ignore this speakers ids
-            if speaker_id in test_speakers:
+        # ignore speakers
+        if isinstance(ignored_speakers, list):
+            if speaker_id in ignored_speakers:
                 continue
         with open(meta_file, "r", encoding="utf-8") as file_text:
             text = file_text.readlines()[0]
         wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav")
-        items.append([text, wav_file, "VCTK_old_" + speaker_id])
+        items.append([text, wav_file, "VCTK_" + speaker_id])
+
     return items
 
 
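A minimal sketch, not part of the patch itself, of what the new use_torch_spec flag changes in collate_fn: when it is false the AudioProcessor keeps computing mel spectrograms as before; when it is true the batch carries raw waveforms so that a torch-based spectrogram layer inside the model can compute the features. The helper name build_features and its arguments are illustrative stand-ins, assuming ap is a TTS AudioProcessor with a melspectrogram() method.

import torch


def build_features(wavs, ap, use_torch_spec=False):
    """Mirror the patched collate_fn branch: mel frames from the AudioProcessor
    when use_torch_spec is False, raw waveforms for an in-model torch
    spectrogram when it is True."""
    feats = []
    for wav in wavs:
        if not use_torch_spec:
            mel = ap.melspectrogram(wav)          # numpy array, shape [n_mels, T]
            feats.append(torch.FloatTensor(mel))
        else:
            feats.append(torch.FloatTensor(wav))  # raw audio samples
    return feats
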
From f5b964d0dc1ca3128e5b5724a2ce26a51b04c5f4 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 28 Feb 2022 09:40:33 -0300
Subject: [PATCH 2/2] Remove useless function in speaker encoder dataset class

---
 TTS/speaker_encoder/dataset.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py
index ece424eee3..07fa924660 100644
--- a/TTS/speaker_encoder/dataset.py
+++ b/TTS/speaker_encoder/dataset.py
@@ -74,22 +74,6 @@ def load_wav(self, filename):
         audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
         return audio
 
-    def load_data(self, idx):
-        text, wav_file, speaker_name = self.items[idx]
-        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
-        mel = self.ap.melspectrogram(wav).astype("float32")
-        # sample seq_len
-
-        assert text.size > 0, self.items[idx]["audio_file"]
-        assert wav.size > 0, self.items[idx]["audio_file"]
-
-        sample = {
-            "mel": mel,
-            "item_idx": self.items[idx]["audio_file"],
-            "speaker_name": speaker_name,
-        }
-        return sample
-
     def __parse_items(self):
         self.speaker_to_utters = {}
         for i in self.items:
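A usage sketch, again not part of the patch series, for the vctk_old formatter updated in PATCH 1/2: the new ignored_speakers argument replaces the old meta_files-based filtering, and utterances from matching speaker IDs are skipped. The dataset path and speaker IDs below are placeholders, and the call assumes a TTS checkout with these patches applied.

from TTS.tts.datasets.formatters import vctk_old

items = vctk_old(
    root_path="/data/VCTK-Corpus",      # placeholder path to the extracted corpus
    ignored_speakers=["p225", "p226"],  # entries for these speaker IDs are skipped
)
# Each entry is [text, wav_file, "VCTK_" + speaker_id], per the patched return value.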