Update VAD for silence trimming. (coqui-ai#2604)

* Update VAD for mp3 and fault tolerance * Make style * Remove import * Remove stupid defaults
pugtech-co · Aug 14, 2023 · bd34b16 · bd34b16
1 parent d4d7d9d
commit bd34b16
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 14 deletions.
diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py
@@ -16,7 +16,7 @@ def adjust_path_and_remove_silence(audio_path):
     output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
     # ignore if the file exists
     if os.path.exists(output_path) and not args.force:
-        return output_path
+        return output_path, False
 
     # create all directory structure
     pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
@@ -28,7 +28,6 @@ def adjust_path_and_remove_silence(audio_path):
         trim_just_beginning_and_end=args.trim_just_beginning_and_end,
         use_cuda=args.use_cuda,
     )
-
     return output_path, is_speech
 
 
@@ -70,7 +69,7 @@ def preprocess_audios():
         # write files that do not have speech
         with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
             for file in filtered_files:
-                f.write(file + "\n")
+                f.write(str(file) + "\n")
     else:
         print("> No files Found !")
 
@@ -79,10 +78,8 @@ def preprocess_audios():
     parser = argparse.ArgumentParser(
         description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
     )
-    parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
-    parser.add_argument(
-        "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
-    )
+    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
+    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
     parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
     parser.add_argument(
         "-g",
@@ -118,6 +115,10 @@ def preprocess_audios():
         help="Number of processes to use",
     )
     args = parser.parse_args()
+
+    if args.output_dir == "":
+        args.output_dir = args.input_dir
+
     # load the model and utils
-    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
+    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
     preprocess_audios()
diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py
@@ -1,4 +1,3 @@
-import soundfile as sf
 import torch
 import torchaudio
 
@@ -35,8 +34,10 @@ def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False)
     return new_timestamps
 
 
-def get_vad_model_and_utils(use_cuda=False):
-    model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=False)
+def get_vad_model_and_utils(use_cuda=False, use_onnx=False):
+    model, utils = torch.hub.load(
+        repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=use_onnx, force_onnx_cpu=True
+    )
     if use_cuda:
         model = model.cuda()
 
@@ -51,7 +52,11 @@ def remove_silence(
     model, get_speech_timestamps, _, collect_chunks = model_and_utils
 
     # read ground truth wav and resample the audio for the VAD
-    wav, gt_sample_rate = read_audio(audio_path)
+    try:
+        wav, gt_sample_rate = read_audio(audio_path)
+    except:
+        print(f"> ❗ Failed to read {audio_path}")
+        return None, False
 
     # if needed, resample the audio for the VAD model
     if gt_sample_rate != vad_sample_rate:
@@ -78,6 +83,6 @@ def remove_silence(
         print(f"> The file {audio_path} probably does not have speech please check it !!")
         is_speech = False
 
-    # save audio
-    sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
+    # save
+    torchaudio.save(out_path, wav[None, :], gt_sample_rate)
     return out_path, is_speech