diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index eec493ecf3..cdc6766909 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -1,3 +1,4 @@ +import base64 import collections import os import random @@ -34,6 +35,12 @@ def noise_augment_audio(wav): return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape) +def string2filename(string): + # generate a safe and reversible filename based on a string + filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore") + return filename + + class TTSDataset(Dataset): def __init__( self, @@ -201,7 +208,7 @@ def get_phonemes(self, idx, text): def get_f0(self, idx): out_dict = self.f0_dataset[idx] item = self.samples[idx] - assert item["audio_file"] == out_dict["audio_file"] + assert item["audio_unique_name"] == out_dict["audio_unique_name"] return out_dict @staticmethod @@ -561,19 +568,18 @@ def __init__( def __getitem__(self, index): item = self.samples[index] - ids = self.compute_or_load(item["audio_file"], item["text"]) + ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"]) ph_hat = self.tokenizer.ids_to_text(ids) return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)} def __len__(self): return len(self.samples) - def compute_or_load(self, wav_file, text): + def compute_or_load(self, file_name, text): """Compute phonemes for the given text. If the phonemes are already cached, load them from cache. """ - file_name = os.path.splitext(os.path.basename(wav_file))[0] file_ext = "_phoneme.npy" cache_path = os.path.join(self.cache_path, file_name + file_ext) try: @@ -670,11 +676,11 @@ def __init__( def __getitem__(self, idx): item = self.samples[idx] - f0 = self.compute_or_load(item["audio_file"]) + f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"])) if self.normalize_f0: assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available" f0 = self.normalize(f0) - return {"audio_file": item["audio_file"], "f0": f0} + return {"audio_unique_name": item["audio_unique_name"], "f0": f0} def __len__(self): return len(self.samples) @@ -706,8 +712,7 @@ def get_pad_id(self): return self.pad_id @staticmethod - def create_pitch_file_path(wav_file, cache_path): - file_name = os.path.splitext(os.path.basename(wav_file))[0] + def create_pitch_file_path(file_name, cache_path): pitch_file = os.path.join(cache_path, file_name + "_pitch.npy") return pitch_file @@ -745,11 +750,11 @@ def denormalize(self, pitch): pitch[zero_idxs] = 0.0 return pitch - def compute_or_load(self, wav_file): + def compute_or_load(self, wav_file, audio_unique_name): """ compute pitch and return a numpy array of pitch values """ - pitch_file = self.create_pitch_file_path(wav_file, self.cache_path) + pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path) if not os.path.exists(pitch_file): pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file) else: @@ -757,14 +762,14 @@ def compute_or_load(self, wav_file): return pitch.astype(np.float32) def collate_fn(self, batch): - audio_file = [item["audio_file"] for item in batch] + audio_unique_name = [item["audio_unique_name"] for item in batch] f0s = [item["f0"] for item in batch] f0_lens = [len(item["f0"]) for item in batch] f0_lens_max = max(f0_lens) f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id()) for i, f0_len in enumerate(f0_lens): f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i]) - return {"audio_file": audio_file, "f0": f0s_torch, "f0_lens": f0_lens} + return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens} def print_logs(self, level: int = 0) -> None: indent = "\t" * level diff --git a/tests/data/ljspeech/f0_cache/pitch_stats.npy b/tests/data/ljspeech/f0_cache/pitch_stats.npy index aaa385c3c0..051118236f 100644 Binary files a/tests/data/ljspeech/f0_cache/pitch_stats.npy and b/tests/data/ljspeech/f0_cache/pitch_stats.npy differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0001_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0001_phoneme.npy deleted file mode 100644 index fc024a6d0f..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0001_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0002_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0002_phoneme.npy deleted file mode 100644 index 9ac355d9f5..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0002_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0003_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0003_phoneme.npy deleted file mode 100644 index ea3944c1ac..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0003_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0004_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0004_phoneme.npy deleted file mode 100644 index ff7f0e3ea4..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0004_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0005_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0005_phoneme.npy deleted file mode 100644 index d7c6b2e396..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0005_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0006_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0006_phoneme.npy deleted file mode 100644 index 4e7fa19a56..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0006_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0007_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0007_phoneme.npy deleted file mode 100644 index b6db446041..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0007_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0008_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0008_phoneme.npy deleted file mode 100644 index ff9fb8bc13..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0008_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0009_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0009_phoneme.npy deleted file mode 100644 index d273bbb2dd..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0009_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0010_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0010_phoneme.npy deleted file mode 100644 index c53050da7a..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0010_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0011_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0011_phoneme.npy deleted file mode 100644 index 98921023c2..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0011_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0012_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0012_phoneme.npy deleted file mode 100644 index bfdf81234d..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0012_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0013_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0013_phoneme.npy deleted file mode 100644 index f0e7c1a1e5..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0013_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0014_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0014_phoneme.npy deleted file mode 100644 index f0ed86b0bb..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0014_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0015_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0015_phoneme.npy deleted file mode 100644 index 381b981ab9..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0015_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0016_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0016_phoneme.npy deleted file mode 100644 index 5667d54bc7..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0016_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0017_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0017_phoneme.npy deleted file mode 100644 index 7f6c7fc734..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0017_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0018_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0018_phoneme.npy deleted file mode 100644 index b060181d52..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0018_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0019_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0019_phoneme.npy deleted file mode 100644 index b0e818cc5d..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0019_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0020_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0020_phoneme.npy deleted file mode 100644 index 472ac10be0..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0020_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0021_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0021_phoneme.npy deleted file mode 100644 index 8fd4d5ab88..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0021_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0022_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0022_phoneme.npy deleted file mode 100644 index 25273f681c..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0022_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0023_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0023_phoneme.npy deleted file mode 100644 index 94b639df74..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0023_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0024_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0024_phoneme.npy deleted file mode 100644 index 1178fc90b8..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0024_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0025_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0025_phoneme.npy deleted file mode 100644 index b00b1750c1..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0025_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0026_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0026_phoneme.npy deleted file mode 100644 index da78e6afc3..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0026_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0027_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0027_phoneme.npy deleted file mode 100644 index 313f2b4113..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0027_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0028_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0028_phoneme.npy deleted file mode 100644 index d6cf441c01..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0028_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0029_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0029_phoneme.npy deleted file mode 100644 index 0c0d379da1..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0029_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0030_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0030_phoneme.npy deleted file mode 100644 index a8bcc0ae26..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0030_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0031_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0031_phoneme.npy deleted file mode 100644 index 835ea796b1..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0031_phoneme.npy and /dev/null differ diff --git a/tests/data/ljspeech/phoneme_cache/LJ001-0032_phoneme.npy b/tests/data/ljspeech/phoneme_cache/LJ001-0032_phoneme.npy deleted file mode 100644 index 60a7b433a3..0000000000 Binary files a/tests/data/ljspeech/phoneme_cache/LJ001-0032_phoneme.npy and /dev/null differ