Patch summary (free text ahead of the first file header; patch tools skip it).

TTS/bin/remove_silence_using_vad.py
  * adjust_path_and_remove_silence returns (output_path, False) when the
    output file already exists and --force is not set.
  * Entries written to filtered_files.txt are passed through str() first.
  * --input_dir becomes required; --output_dir defaults to "" and then
    falls back to the value of --input_dir.
  * use_onnx=args.use_onnx is forwarded to get_vad_model_and_utils.
    NOTE(review): no hunk below defines a --use_onnx argument; confirm it
    already exists in the script before applying.

TTS/utils/vad.py
  * The soundfile import is removed; audio is written with torchaudio.save
    instead of sf.write.
  * get_vad_model_and_utils gains use_onnx (default False) and passes
    onnx=use_onnx and force_onnx_cpu=True to torch.hub.load.
  * A failure in read_audio prints a message and returns (None, False).
    The handler catches Exception rather than using a bare except, so
    KeyboardInterrupt and SystemExit still propagate.

diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py
index d19e77872a..a1eaf4c9a7 100755
--- a/TTS/bin/remove_silence_using_vad.py
+++ b/TTS/bin/remove_silence_using_vad.py
@@ -16,7 +16,7 @@ def adjust_path_and_remove_silence(audio_path):
     output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
     # ignore if the file exists
     if os.path.exists(output_path) and not args.force:
-        return output_path
+        return output_path, False
 
     # create all directory structure
     pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
@@ -28,7 +28,6 @@ def adjust_path_and_remove_silence(audio_path):
         trim_just_beginning_and_end=args.trim_just_beginning_and_end,
         use_cuda=args.use_cuda,
     )
-
     return output_path, is_speech
 
 
@@ -70,7 +69,7 @@ def preprocess_audios():
         # write files that do not have speech
         with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
             for file in filtered_files:
-                f.write(file + "\n")
+                f.write(str(file) + "\n")
     else:
         print("> No files Found !")
 
@@ -79,10 +78,8 @@ def preprocess_audios():
     parser = argparse.ArgumentParser(
         description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
     )
-    parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
-    parser.add_argument(
-        "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
-    )
+    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
+    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
     parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
     parser.add_argument(
         "-g",
@@ -118,6 +115,10 @@ def preprocess_audios():
         help="Number of processes to use",
     )
     args = parser.parse_args()
+
+    if args.output_dir == "":
+        args.output_dir = args.input_dir
+
     # load the model and utils
-    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
+    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
     preprocess_audios()
diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py
index 90c45e4961..aefce2b50b 100644
--- a/TTS/utils/vad.py
+++ b/TTS/utils/vad.py
@@ -1,4 +1,3 @@
-import soundfile as sf
 import torch
 import torchaudio
 
@@ -35,8 +34,10 @@ def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False)
     return new_timestamps
 
 
-def get_vad_model_and_utils(use_cuda=False):
-    model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=False)
+def get_vad_model_and_utils(use_cuda=False, use_onnx=False):
+    model, utils = torch.hub.load(
+        repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=use_onnx, force_onnx_cpu=True
+    )
     if use_cuda:
         model = model.cuda()
 
@@ -51,7 +52,11 @@ def remove_silence(
     model, get_speech_timestamps, _, collect_chunks = model_and_utils
 
     # read ground truth wav and resample the audio for the VAD
-    wav, gt_sample_rate = read_audio(audio_path)
+    try:
+        wav, gt_sample_rate = read_audio(audio_path)
+    except Exception:
+        print(f"> ❗ Failed to read {audio_path}")
+        return None, False
 
     # if needed, resample the audio for the VAD model
     if gt_sample_rate != vad_sample_rate:
@@ -78,6 +83,6 @@ def remove_silence(
         print(f"> The file {audio_path} probably does not have speech please check it !!")
         is_speech = False
 
-    # save audio
-    sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
+    # save
+    torchaudio.save(out_path, wav[None, :], gt_sample_rate)
     return out_path, is_speech