Merge pull request #892 from IAHispano/exp/dev

dev to main
IAHispano · Dec 1, 2024 · de5b3d8
2 parents 2a35f50 + 4d689d2
commit de5b3d8
Show file tree

Hide file tree

Showing 22 changed files with 1,087 additions and 1,289 deletions.
diff --git a/assets/Applio_NoUI.ipynb b/assets/Applio_NoUI.ipynb
diff --git a/assets/discord_presence.py b/assets/discord_presence.py
@@ -1,7 +1,5 @@
 from pypresence import Presence
 import datetime as dt
-import time
-
 
 class RichPresenceManager:
     def __init__(self):
@@ -32,7 +30,7 @@ def update_presence(self):
                 details="Open ecosystem for voice cloning",
                 buttons=[
                     {"label": "Home", "url": "https://applio.org"},
-                    {"label": "Download", "url": "https://applio.org/download"},
+                    {"label": "Download", "url": "https://applio.org/products/applio"},
                 ],
                 large_image="logo",
                 large_text="Experimenting with applio",

diff --git a/assets/zluda/README.md b/assets/zluda/README.md
diff --git a/core.py b/core.py
@@ -33,7 +33,7 @@ def load_voices_data():
 
 
 voices_data = load_voices_data()
-locales = list({voice["Locale"] for voice in voices_data})
+locales = list({voice["ShortName"] for voice in voices_data})
 
 
 @lru_cache(maxsize=None)
@@ -69,7 +69,6 @@ def run_infer_script(
     clean_audio: bool,
     clean_strength: float,
     export_format: str,
-    upscale_audio: bool,
     f0_file: str,
     embedder_model: str,
     embedder_model_custom: str = None,
@@ -134,7 +133,6 @@ def run_infer_script(
         "clean_audio": clean_audio,
         "clean_strength": clean_strength,
         "export_format": export_format,
-        "upscale_audio": upscale_audio,
         "f0_file": f0_file,
         "embedder_model": embedder_model,
         "embedder_model_custom": embedder_model_custom,
@@ -207,7 +205,6 @@ def run_batch_infer_script(
     clean_audio: bool,
     clean_strength: float,
     export_format: str,
-    upscale_audio: bool,
     f0_file: str,
     embedder_model: str,
     embedder_model_custom: str = None,
@@ -272,7 +269,6 @@ def run_batch_infer_script(
         "clean_audio": clean_audio,
         "clean_strength": clean_strength,
         "export_format": export_format,
-        "upscale_audio": upscale_audio,
         "f0_file": f0_file,
         "embedder_model": embedder_model,
         "embedder_model_custom": embedder_model_custom,
@@ -348,7 +344,6 @@ def run_tts_script(
     clean_audio: bool,
     clean_strength: float,
     export_format: str,
-    upscale_audio: bool,
     f0_file: str,
     embedder_model: str,
     embedder_model_custom: str = None,
@@ -394,7 +389,6 @@ def run_tts_script(
         clean_audio=clean_audio,
         clean_strength=clean_strength,
         export_format=export_format,
-        upscale_audio=upscale_audio,
         f0_file=f0_file,
         embedder_model=embedder_model,
         embedder_model_custom=embedder_model_custom,
@@ -824,14 +818,6 @@ def parse_arguments():
         help=embedder_model_custom_description,
         default=None,
     )
-    upscale_audio_description = "Upscale the input audio to a higher quality before processing. This can improve the overall quality of the output, especially for low-quality input audio."
-    infer_parser.add_argument(
-        "--upscale_audio",
-        type=lambda x: bool(strtobool(x)),
-        choices=[True, False],
-        help=upscale_audio_description,
-        default=False,
-    )
     f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio."
     infer_parser.add_argument(
         "--f0_file",
@@ -1346,13 +1332,6 @@ def parse_arguments():
         help=embedder_model_custom_description,
         default=None,
     )
-    batch_infer_parser.add_argument(
-        "--upscale_audio",
-        type=lambda x: bool(strtobool(x)),
-        choices=[True, False],
-        help=upscale_audio_description,
-        default=False,
-    )
     batch_infer_parser.add_argument(
         "--f0_file",
         type=str,
@@ -1840,13 +1819,6 @@ def parse_arguments():
         help=embedder_model_custom_description,
         default=None,
     )
-    tts_parser.add_argument(
-        "--upscale_audio",
-        type=lambda x: bool(strtobool(x)),
-        choices=[True, False],
-        help=upscale_audio_description,
-        default=False,
-    )
     tts_parser.add_argument(
         "--f0_file",
         type=str,
@@ -2317,7 +2289,6 @@ def main():
                 export_format=args.export_format,
                 embedder_model=args.embedder_model,
                 embedder_model_custom=args.embedder_model_custom,
-                upscale_audio=args.upscale_audio,
                 f0_file=args.f0_file,
                 formant_shifting=args.formant_shifting,
                 formant_qfrency=args.formant_qfrency,
@@ -2381,7 +2352,6 @@ def main():
                 export_format=args.export_format,
                 embedder_model=args.embedder_model,
                 embedder_model_custom=args.embedder_model_custom,
-                upscale_audio=args.upscale_audio,
                 f0_file=args.f0_file,
                 formant_shifting=args.formant_shifting,
                 formant_qfrency=args.formant_qfrency,
@@ -2437,8 +2407,8 @@ def main():
                 protect=args.protect,
                 hop_length=args.hop_length,
                 f0_method=args.f0_method,
-                input_path=args.input_path,
-                output_path=args.output_path,
+                output_tts_path=args.output_tts_path,
+                output_rvc_path=args.output_rvc_path,
                 pth_path=args.pth_path,
                 index_path=args.index_path,
                 split_audio=args.split_audio,
@@ -2449,7 +2419,6 @@ def main():
                 export_format=args.export_format,
                 embedder_model=args.embedder_model,
                 embedder_model_custom=args.embedder_model_custom,
-                upscale_audio=args.upscale_audio,
                 f0_file=args.f0_file,
             )
         elif args.mode == "preprocess":

diff --git a/requirements.txt b/requirements.txt
@@ -6,20 +6,14 @@ numpy==1.23.5
 requests>=2.31.0,<2.32.0
 tqdm
 wget
-pydantic==2.8.2
-fastapi==0.112.0
-starlette==0.37.2
 
 # Audio processing
 ffmpeg-python>=0.2.0
 faiss-cpu==1.7.3
 librosa==0.9.2
-pyworld==0.3.4
 scipy==1.11.1
 soundfile==0.12.1
-praat-parselmouth
 noisereduce
-versatile-audio-upscaler
 pedalboard
 stftpitchshift
 
@@ -44,11 +38,8 @@ gradio==4.43.0
 # Miscellaneous utilities
 certifi>=2023.07.22; sys_platform == 'darwin'  
 antlr4-python3-runtime==4.8; sys_platform == 'darwin'
-ffmpy==0.3.1
 tensorboardX
 edge-tts==6.1.9
 pypresence
 beautifulsoup4
-flask
-local-attention
-
+flask
diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py
@@ -108,7 +108,7 @@ def convert_audio_format(input_path, output_path, output_format):
         """
         try:
             if output_format != "WAV":
-                print(f"Converting audio to {output_format} format...")
+                print(f"Saving audio as {output_format}...")
                 audio, sample_rate = librosa.load(input_path, sr=None)
                 common_sample_rates = [
                     8000,
@@ -255,10 +255,6 @@ def convert_audio(
             start_time = time.time()
             print(f"Converting audio '{audio_input_path}'...")
 
-            if upscale_audio == True:
-                from audio_upscaler import upscale
-
-                upscale(audio_input_path, audio_input_path)
             audio = load_audio_infer(
                 audio_input_path,
                 16000,

diff --git a/rvc/infer/pipeline.py b/rvc/infer/pipeline.py
@@ -286,7 +286,7 @@ def get_f0_hybrid(
         if methods_str:
             methods = [method.strip() for method in methods_str.group(1).split("+")]
         f0_computation_stack = []
-        print(f"Calculating f0 pitch estimations for methods {str(methods)}")
+        print(f"Calculating f0 pitch estimations for methods: {', '.join(methods)}")
         x = x.astype(np.float32)
         x /= np.quantile(np.abs(x), 0.999)
         for method in methods: