Merge pull request #598 from ShiromiyaG/new-functions

Option to cut or not cut the audios in preprocess
IAHispano · Aug 16, 2024 · commit 29c5aac
2 parents: 642c6da + c870706

Showing 4 changed files with 57 additions and 23 deletions.
diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json
@@ -17,6 +17,8 @@
   "Settings": "Settings",
 
   "Preprocess": "Preprocess",
+  "Audio cutting": "Audio cutting",
+  "It's recommended to deactivate this option if your dataset has already been processed.": "It's recommended to deactivate this option if your dataset has already been processed.",
   "Model Name": "Model Name",
   "Name of the new model.": "Name of the new model.",
   "Enter model name": "Enter model name",

diff --git a/core.py b/core.py
@@ -243,7 +243,11 @@ def run_tts_script(
 
 # Preprocess
 def run_preprocess_script(
-    model_name: str, dataset_path: str, sample_rate: int, cpu_cores: int
+    model_name: str,
+    dataset_path: str,
+    sample_rate: int,
+    cpu_cores: int,
+    cut_preprocess: bool,
 ):
     config = get_config()
     per = 3.0 if config.is_half else 3.7
@@ -259,6 +263,7 @@ def run_preprocess_script(
                 sample_rate,
                 per,
                 cpu_cores,
+                cut_preprocess,
             ],
         ),
     ]
@@ -976,6 +981,14 @@ def parse_arguments():
         help="Number of CPU cores to use for preprocessing.",
         choices=range(1, 65),
     )
+    preprocess_parser.add_argument(
+        "--cut_preprocess",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Cut the dataset into smaller segments for faster preprocessing.",
+        default=True,
+        required=False,
+    )
 
     # Parser for 'extract' mode
     extract_parser = subparsers.add_parser(
@@ -1442,6 +1455,7 @@ def main():
                 dataset_path=args.dataset_path,
                 sample_rate=args.sample_rate,
                 cpu_cores=args.cpu_cores,
+                cut_preprocess=args.cut_preprocess,
             )
         elif args.mode == "extract":
             run_extract_script(

diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py
@@ -8,6 +8,7 @@
 import numpy as np
 import multiprocessing
 from pydub import AudioSegment
+from distutils.util import strtobool
 
 multiprocessing.set_start_method("spawn", force=True)
 
@@ -81,42 +82,47 @@ def process_audio_segment(self, audio_segment: torch.Tensor, idx0: int, idx1: in
         wav_16k_path = os.path.join(self.wavs16k_dir, f"{idx0}_{idx1}.wav")
         self._write_audio(audio_16k, wav_16k_path, SAMPLE_RATE_16K)
 
-    def process_audio(self, path: str, idx0: int):
+    def process_audio(self, path: str, idx0: int, cut_preprocess: bool):
         try:
             audio = load_audio(path, self.sr)
             audio = torch.tensor(
                 signal.lfilter(self.b_high, self.a_high, audio), device=self.device
             ).float()
 
             idx1 = 0
-            for audio_segment in self.slicer.slice(audio.cpu().numpy()):
-                audio_segment = torch.tensor(audio_segment, device=self.device).float()
-                i = 0
-                while True:
-                    start = int(self.sr * (self.per - OVERLAP) * i)
-                    i += 1
-                    if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr:
-                        tmp_audio = audio_segment[
-                            start : start + int(self.per * self.sr)
-                        ]
-                        self.process_audio_segment(tmp_audio, idx0, idx1)
-                        idx1 += 1
-                    else:
-                        tmp_audio = audio_segment[start:]
-                        self.process_audio_segment(tmp_audio, idx0, idx1)
-                        idx1 += 1
-                        break
+            if cut_preprocess:
+                for audio_segment in self.slicer.slice(audio.cpu().numpy()):
+                    audio_segment = torch.tensor(
+                        audio_segment, device=self.device
+                    ).float()
+                    i = 0
+                    while True:
+                        start = int(self.sr * (self.per - OVERLAP) * i)
+                        i += 1
+                        if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr:
+                            tmp_audio = audio_segment[
+                                start : start + int(self.per * self.sr)
+                            ]
+                            self.process_audio_segment(tmp_audio, idx0, idx1)
+                            idx1 += 1
+                        else:
+                            tmp_audio = audio_segment[start:]
+                            self.process_audio_segment(tmp_audio, idx0, idx1)
+                            idx1 += 1
+                            break
+            else:
+                self.process_audio_segment(audio, idx0, idx1)
         except Exception as error:
             print(f"An error occurred on {path} path: {error}")
 
-    def process_audio_file(self, file_path_idx):
+    def process_audio_file(self, file_path_idx, cut_preprocess):
         file_path, idx0 = file_path_idx
         ext = os.path.splitext(file_path)[1].lower()
         if ext not in [".wav"]:
             audio = AudioSegment.from_file(file_path)
             file_path = os.path.join("/tmp", f"{idx0}.wav")
             audio.export(file_path, format="wav")
-        self.process_audio(file_path, idx0)
+        self.process_audio(file_path, idx0, cut_preprocess)
 
 
 def preprocess_training_set(
@@ -125,6 +131,7 @@ def preprocess_training_set(
     num_processes: int,
     exp_dir: str,
     per: float,
+    cut_preprocess: bool,
 ):
     start_time = time.time()
 
@@ -139,7 +146,7 @@ def preprocess_training_set(
 
     ctx = multiprocessing.get_context("spawn")
     with ctx.Pool(processes=num_processes) as pool:
-        pool.map(pp.process_audio_file, files)
+        pool.starmap(pp.process_audio_file, [(file, cut_preprocess) for file in files])
 
     elapsed_time = time.time() - start_time
     print(f"Preprocess completed in {elapsed_time:.2f} seconds.")
@@ -153,11 +160,13 @@ def preprocess_training_set(
     num_processes = (
         int(sys.argv[5]) if len(sys.argv) > 5 else multiprocessing.cpu_count()
     )
+    cut_preprocess = strtobool(sys.argv[6])
 
     preprocess_training_set(
         input_root,
         sample_rate,
         num_processes,
         experiment_directory,
         percentage,
+        cut_preprocess,
     )
diff --git a/tabs/train/train.py b/tabs/train/train.py
@@ -362,7 +362,15 @@ def train_tab():
                     ),
                     interactive=True,
                 )
-
+                cut_preprocess = gr.Checkbox(
+                    label=i18n("Audio cutting"),
+                    info=i18n(
+                        "It's recommended to deactivate this option if your dataset has already been processed."
+                    ),
+                    value=True,
+                    interactive=True,
+                    visible=True,
+                )
         preprocess_output_info = gr.Textbox(
             label=i18n("Output Information"),
             info=i18n("The output information will be displayed here."),
@@ -380,6 +388,7 @@ def train_tab():
                     dataset_path,
                     sampling_rate,
                     cpu_cores_preprocess,
+                    cut_preprocess,
                 ],
                 outputs=[preprocess_output_info],
                 api_name="preprocess_dataset",