diff --git a/.gitignore b/.gitignore
index 579bfbea1..668107c85 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,3 +130,6 @@ TODO.txt
 data/*
 notebooks/data/*
 TTS/tts/layers/glow_tts/monotonic_align/core.c
+
+.vscode
+temp_build
\ No newline at end of file
diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
index 7642f86bf..aed5a236a 100755
--- a/TTS/bin/compute_statistics.py
+++ b/TTS/bin/compute_statistics.py
@@ -19,7 +19,7 @@ def main():
         description="Compute mean and variance of spectrogtram features.")
     parser.add_argument("--config_path", type=str, required=True,
                         help="TTS config file path to define audio processin parameters.")
-    parser.add_argument("--out_path", default=None, type=str,
+    parser.add_argument("--out_path", default=None, type=str, required=True,
                         help="directory to save the output file.")
     args = parser.parse_args()
 
diff --git a/TTS/setup_datasets.sh b/TTS/setup_datasets.sh
new file mode 100644
index 000000000..9dccf1f57
--- /dev/null
+++ b/TTS/setup_datasets.sh
@@ -0,0 +1,8 @@
+mkdir -p /home/models/sinhala /home/models/phoneme_cache
+
+cd /home
+mkdir -p datasets/sinhala
+cd datasets/sinhala
+wget https://github.com/pathnirvana/tacotron2/releases/download/1/sinhala.zip
+unzip sinhala.zip
+cd /home
\ No newline at end of file
diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json
index 48f20e8f2..2a1daff36 100644
--- a/TTS/tts/configs/config.json
+++ b/TTS/tts/configs/config.json
@@ -1,6 +1,6 @@
 {
     "model": "Tacotron2",
-    "run_name": "ljspeech-ddc",
+    "run_name": "sinhala-tacotron2-ddc",
     "run_description": "tacotron2 with DDC and differential spectral loss.",
 
     // AUDIO PARAMETERS
@@ -37,7 +37,8 @@
         "symmetric_norm": true,  // move normalization to range [-1, 1]
         "max_norm": 4.0,         // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
         "clip_norm": true,       // clip normalized values into the range.
-        "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy"    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
+        // python TTS/bin/compute_statistics.py --config_path TTS/tts/configs/config.json --out_path=/home/datasets/sinhala/scale_stats.npy
+        "stats_path": "/home/datasets/sinhala/scale_stats.npy"    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
     },
 
     // VOCABULARY PARAMETERS
@@ -135,12 +136,13 @@
     "use_noise_augment": true,
 
     // PATHS
-    "output_path": "/home/erogol/Models/LJSpeech/",
+    "output_path": "/home/models/sinhala/",
 
     // PHONEMES
-    "phoneme_cache_path": "/home/erogol/Models/phoneme_cache/",  // phoneme computation is slow, therefore, it caches results in the given folder.
+    "phoneme_cache_path": "/home/models/phoneme_cache/",  // phoneme computation is slow, therefore, it caches results in the given folder.
     "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronounciation.
- "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + "phoneme_language": "kn", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + // for sinhala use kn and add language_switch='remove-flags' when the phenomize is called - input should be in sinhala letters // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. @@ -162,10 +164,12 @@ "datasets": // List of datasets. They all merged and they get different speaker_ids. [ { - "name": "ljspeech", - "path": "/home/erogol/Data/LJSpeech-1.1/", - "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers - "meta_file_val": null + "name": "sinhala", // "ljspeech", + "path": "/home/datasets/sinhala/", //"/home/erogol/Data/LJSpeech-1.1/", + //"meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers + //"meta_file_val": null, + "meta_file_train": "train_filelist.txt", + "meta_file_val": "val_filelist.txt" } ] } diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 56fc75f5f..122af07fb 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -166,6 +166,19 @@ def ljspeech(root_path, meta_file): return items +def sinhala(root_path, meta_file): + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "sinhala" # todo can take the speaker id from the sinhala dataset if needed + with open(txt_file, 'r') as ttf: + for line in ttf: + cols = line.split('|') + wav_file = cols[0] #os.path.join(root_path, 'wavs', cols[0] + '.wav') + text = cols[1] + items.append([text, wav_file, speaker_name]) + return items + + def nancy(root_path, meta_file): """Normalizes the Nancy meta data file to TTS format""" txt_file = os.path.join(root_path, meta_file) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 29f4af1d4..4daec52ae 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -33,7 +33,7 @@ def text2phone(text, language): #try: punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text) if version.parse(phonemizer.__version__) < version.parse('2.1'): - ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language) + ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, language_switch='remove-flags') ph = ph[:-1].strip() # skip the last empty character # phonemizer does not tackle punctuations. Here we do. # Replace \n with matching punctuations.