Introduce MVA (Multi-Vocoder Architecture)

IAHispano · Dec 22, 2024 · c306f1c
2 parents 908aa8c + c22a7d4
commit c306f1c
Show file tree

Hide file tree

Showing 32 changed files with 1,858 additions and 644 deletions.
diff --git a/assets/Applio_NoUI.ipynb b/assets/Applio_NoUI.ipynb
@@ -429,12 +429,14 @@
     "sample_rate = \"40k\"  # @param [\"32k\", \"40k\", \"48k\"] {allow-input: false}\n",
     "sr = int(sample_rate.rstrip(\"k\")) * 1000\n",
     "cpu_cores = 2 # @param {type:\"slider\", min:1, max:2, step:1}\n",
-    "cut_preprocess = True # @param{type:\"boolean\"}\n",
+    "cut_preprocess = \"Automatic\" # @param [\"Skip\", \"Simple\", \"Automatic\"] {allow-input: false}\n",
     "process_effects = False # @param{type:\"boolean\"}\n",
     "noise_reduction = False # @param{type:\"boolean\"}\n",
     "noise_reduction_strength = 0.7 # @param {type:\"slider\", min:0.0, max:1.0, step:0.1}\n",
+    "chunk_len = 3.0  # @param {type:\"slider\", min:0.5, max:5.0, step:0.5}\n",
+    "overlap_len = 0.3 # @param {type:\"slider\", min:0.0, max:0.4, step:0.1}\n",
     "\n",
-    "!python core.py preprocess --model_name \"{model_name}\" --dataset_path \"{dataset_path}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --cut_preprocess \"{cut_preprocess}\" --process_effects \"{process_effects}\" --noise_reduction \"{noise_reduction}\" --noise_reduction_strength \"{noise_reduction_strength}\""
+    "!python core.py preprocess --model_name \"{model_name}\" --dataset_path \"{dataset_path}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --cut_preprocess \"{cut_preprocess}\" --process_effects \"{process_effects}\" --noise_reduction \"{noise_reduction}\" --noise_reduction_strength \"{noise_reduction_strength}\"  --chunk_len \"{chunk_len}\" --overlap_len \"{overlap_len}\""
    ]
   },
   {
@@ -453,10 +455,11 @@
     "\n",
     "sr = int(sample_rate.rstrip(\"k\")) * 1000\n",
     "cpu_cores = 2 # @param {type:\"slider\", min:1, max:2, step:1}\n",
+    "include_mutes = 2 # @param {type:\"slider\", min:0, max:10, step:1}\n",
     "embedder_model = \"contentvec\" # @param [\"contentvec\", \"chinese-hubert-base\", \"japanese-hubert-base\", \"korean-hubert-base\", \"custom\"] {allow-input: false}\n",
     "embedder_model_custom = \"\" # @param {type:\"string\"}\n",
     "\n",
-    "!python core.py extract --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --f0_method \"{f0_method}\" --hop_length \"{hop_length}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --gpu \"0\" --embedder_model \"{embedder_model}\" --embedder_model_custom \"{embedder_model_custom}\""
+    "!python core.py extract --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --f0_method \"{f0_method}\" --hop_length \"{hop_length}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --gpu \"0\" --embedder_model \"{embedder_model}\" --embedder_model_custom \"{embedder_model_custom}\" --include_mutes \"{include_mutes}\""
    ]
   },
   {
@@ -597,7 +600,7 @@
     "    print(\"Autobackup Disabled\")\n",
     "else:\n",
     "    autobackups = True\n",
-    "    print(\"Autobackup Enabled\") \n",
+    "    print(\"Autobackup Enabled\")\n",
     "# @markdown ### ⚙️ Train Settings\n",
     "total_epoch = 800  # @param {type:\"integer\"}\n",
     "batch_size = 15  # @param {type:\"slider\", min:1, max:25, step:0}\n",
@@ -618,6 +621,8 @@
     "custom_pretrained = False  # @param{type:\"boolean\"}\n",
     "g_pretrained_path = \"/content/Applio/rvc/models/pretraineds/pretraineds_custom/G48k.pth\"  # @param {type:\"string\"}\n",
     "d_pretrained_path = \"/content/Applio/rvc/models/pretraineds/pretraineds_custom/D48k.pth\"  # @param {type:\"string\"}\n",
+    "vocoder = \"HiFi-GAN\" # @param [\"HiFi-GAN\", \"MRF HiFi-GAN\", \"RefineGAN\"] {allow-input: false}\n",
+    "checkpointing = False # @param{type:\"boolean\"}\n",
     "\n",
     "if \"pretrained\" not in globals():\n",
     "    pretrained = True\n",
@@ -636,8 +641,7 @@
     "    if tensorboard == True:\n",
     "        %load_ext tensorboard\n",
     "        %tensorboard --logdir /content/Applio/logs/\n",
-    "    !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --cleanup \"{cleanup}\" --cache_data_in_gpu \"{cache_data_in_gpu}\"\n",
-    "\n",
+    "    !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --cleanup \"{cleanup}\" --cache_data_in_gpu \"{cache_data_in_gpu}\" --vocoder \"{vocoder}\" --checkpointing \"{checkpointing}\"\n",
     "\n",
     "server_thread = threading.Thread(target=start_train)\n",
     "server_thread.start()\n",

diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json
@@ -78,8 +78,7 @@
   "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential.": "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential.",
   "Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality.": "Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality.",
   "Extract Features": "Extract Features",
-  "We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.": "We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.",
-  "We prioritize running the model preprocessing on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.": "We prioritize running the model preprocessing on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.",
+  "Configure GPU and CPU settings.": "Configure GPU and CPU settings.",
   "Cache Dataset in GPU": "Cache Dataset in GPU",
   "Cache the dataset in GPU memory to speed up the training process.": "Cache the dataset in GPU memory to speed up the training process.",
   "Index Algorithm": "Index Algorithm",
@@ -321,5 +320,19 @@
   "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid.": "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid.",
   "Model Author Name": "Model Author Name",
   "The name that will appear in the model information.": "The name that will appear in the model information.",
-  "Set name": "Set name"
+  "Set name": "Set name",
+  "Vocoder": "Vocoder",
+  "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only).": "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only).",
+  "Checkpointing": "Checkpointing",
+  "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.": "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.",
+  "Enable Experimental Options": "Enable Experimental Options",
+  "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.": "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.",
+  "Model Settings": "Model Settings",
+  "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.": "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.",
+  "Chunk length (sec)": "Chunk length (sec)",
+  "Length of the audio slice for 'Simple' method.": "Length of the audio slice for 'Simple' method.",
+  "Overlap length (sec)": "Overlap length (sec)",
+  "Length of the overlap between slices for 'Simple' method.": "Length of the overlap between slices for 'Simple' method.",
+  "Silent training files": "Silent training files",
+  "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence.": "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence."
 }
diff --git a/core.py b/core.py
@@ -421,10 +421,12 @@ def run_preprocess_script(
     dataset_path: str,
     sample_rate: int,
     cpu_cores: int,
-    cut_preprocess: bool,
+    cut_preprocess: str,
     process_effects: bool,
     noise_reduction: bool,
     clean_strength: float,
+    chunk_len: float,
+    overlap_len: float,
 ):
     config = get_config()
     per = 3.0 if config.is_half else 3.7
@@ -444,6 +446,8 @@ def run_preprocess_script(
                 process_effects,
                 noise_reduction,
                 clean_strength,
+                chunk_len,
+                overlap_len,
             ],
         ),
     ]
@@ -462,6 +466,7 @@ def run_extract_script(
     sample_rate: int,
     embedder_model: str,
     embedder_model_custom: str = None,
+    include_mutes: int = 2,
 ):
 
     model_path = os.path.join(logs_path, model_name)
@@ -482,6 +487,7 @@ def run_extract_script(
                 sample_rate,
                 embedder_model,
                 embedder_model_custom,
+                include_mutes
             ],
         ),
     ]
@@ -502,7 +508,6 @@ def run_train_script(
     sample_rate: int,
     batch_size: int,
     gpu: int,
-    pitch_guidance: bool,
     overtraining_detector: bool,
     overtraining_threshold: int,
     pretrained: bool,
@@ -512,15 +517,15 @@ def run_train_script(
     custom_pretrained: bool = False,
     g_pretrained_path: str = None,
     d_pretrained_path: str = None,
+    vocoder: str = "HiFi-GAN",
+    checkpointing: bool = False,
 ):
 
     if pretrained == True:
         from rvc.lib.tools.pretrained_selector import pretrained_selector
 
         if custom_pretrained == False:
-            pg, pd = pretrained_selector(bool(pitch_guidance))[str(rvc_version)][
-                int(sample_rate)
-            ]
+            pg, pd = pretrained_selector(str(rvc_version), str(vocoder), True, int(sample_rate))
         else:
             if g_pretrained_path is None or d_pretrained_path is None:
                 raise ValueError(
@@ -546,13 +551,14 @@ def run_train_script(
                 gpu,
                 batch_size,
                 sample_rate,
-                pitch_guidance,
                 save_only_latest,
                 save_every_weights,
                 cache_data_in_gpu,
                 overtraining_detector,
                 overtraining_threshold,
                 cleanup,
+                vocoder,
+                checkpointing
             ],
         ),
     ]
@@ -1840,7 +1846,7 @@ def parse_arguments():
         "--sample_rate",
         type=int,
         help="Target sampling rate for the audio data.",
-        choices=[32000, 40000, 48000],
+        choices=[32000, 40000, 44100, 48000],
         required=True,
     )
     preprocess_parser.add_argument(
@@ -1851,11 +1857,11 @@ def parse_arguments():
     )
     preprocess_parser.add_argument(
         "--cut_preprocess",
-        type=lambda x: bool(strtobool(x)),
-        choices=[True, False],
+        type=str,
+        choices=['Skip', 'Simple', 'Automatic'],
         help="Cut the dataset into smaller segments for faster preprocessing.",
-        default=True,
-        required=False,
+        default='Automatic',
+        required=True,
     )
     preprocess_parser.add_argument(
         "--process_effects",
@@ -1881,6 +1887,22 @@ def parse_arguments():
         default=0.7,
         required=False,
     )
+    preprocess_parser.add_argument(
+        "--chunk_len",
+        type=float,
+        help="Chunk length.",
+        choices=[i * 0.5 for i in range(1, 11)],
+        default=3.0,
+        required=False,
+    )
+    preprocess_parser.add_argument(
+        "--overlap_len",
+        type=float,
+        help="Overlap length.",
+        choices=[0.0, 0.1, 0.2, 0.3, 0.4],
+        default=0.3,
+        required=False,
+    )    
 
     # Parser for 'extract' mode
     extract_parser = subparsers.add_parser(
@@ -1923,15 +1945,15 @@ def parse_arguments():
     )
     extract_parser.add_argument(
         "--gpu",
-        type=int,
+        type=str,
         help="GPU device to use for feature extraction (optional).",
         default="-",
     )
     extract_parser.add_argument(
         "--sample_rate",
         type=int,
         help="Target sampling rate for the audio data.",
-        choices=[32000, 40000, 48000],
+        choices=[32000, 40000, 44100, 48000],
         required=True,
     )
     extract_parser.add_argument(
@@ -1953,6 +1975,14 @@ def parse_arguments():
         help=embedder_model_custom_description,
         default=None,
     )
+    extract_parser.add_argument(
+        "--include_mutes",
+        type=int,
+        help="Number of silent files to include.",
+        choices=range(0, 11),
+        default=2,
+        required=True
+    )    
 
     # Parser for 'train' mode
     train_parser = subparsers.add_parser("train", help="Train an RVC model.")
@@ -1966,6 +1996,21 @@ def parse_arguments():
         choices=["v1", "v2"],
         default="v2",
     )
+    train_parser.add_argument(
+        "--vocoder",
+        type=str,
+        help="Vocoder name",
+        choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"],
+        default="HiFi-GAN",
+    )
+    train_parser.add_argument(
+        "--checkpointing",
+        type=lambda x: bool(strtobool(x)),
+        choices=[True, False],
+        help="Enables memory-efficient training.",
+        default=False,
+        required=False,
+    )    
     train_parser.add_argument(
         "--save_every_epoch",
         type=int,
@@ -2014,13 +2059,6 @@ def parse_arguments():
         help="GPU device to use for training (e.g., '0').",
         default="0",
     )
-    train_parser.add_argument(
-        "--pitch_guidance",
-        type=lambda x: bool(strtobool(x)),
-        choices=[True, False],
-        help="Enable or disable pitch guidance during training.",
-        default=True,
-    )
     train_parser.add_argument(
         "--pretrained",
         type=lambda x: bool(strtobool(x)),
@@ -2431,6 +2469,8 @@ def main():
                 process_effects=args.process_effects,
                 noise_reduction=args.noise_reduction,
                 clean_strength=args.noise_reduction_strength,
+                chunk_len=args.chunk_len,
+                overlap_len=args.overlap_len,
             )
         elif args.mode == "extract":
             run_extract_script(
@@ -2443,6 +2483,7 @@ def main():
                 sample_rate=args.sample_rate,
                 embedder_model=args.embedder_model,
                 embedder_model_custom=args.embedder_model_custom,
+                include_mutes=args.include_mutes,
             )
         elif args.mode == "train":
             run_train_script(
@@ -2455,7 +2496,6 @@ def main():
                 sample_rate=args.sample_rate,
                 batch_size=args.batch_size,
                 gpu=args.gpu,
-                pitch_guidance=args.pitch_guidance,
                 overtraining_detector=args.overtraining_detector,
                 overtraining_threshold=args.overtraining_threshold,
                 pretrained=args.pretrained,
@@ -2465,6 +2505,8 @@ def main():
                 cache_data_in_gpu=args.cache_data_in_gpu,
                 g_pretrained_path=args.g_pretrained_path,
                 d_pretrained_path=args.d_pretrained_path,
+                vocoder=args.vocoder,
+                checkpointing=args.checkpointing,
             )
         elif args.mode == "index":
             run_index_script(

diff --git a/logs/mute/sliced_audios/mute44100.wav b/logs/mute/sliced_audios/mute44100.wav
diff --git a/requirements.txt b/requirements.txt
@@ -16,6 +16,7 @@ soundfile==0.12.1
 noisereduce
 pedalboard
 stftpitchshift
+soxr
 
 # Machine learning and deep learning
 omegaconf>=2.0.6; sys_platform == 'darwin' 

diff --git a/rvc/configs/config.py b/rvc/configs/config.py
@@ -5,9 +5,11 @@
 version_config_paths = [
     os.path.join("v1", "32000.json"),
     os.path.join("v1", "40000.json"),
+    os.path.join("v1", "44100.json"),
     os.path.join("v1", "48000.json"),
     os.path.join("v2", "48000.json"),
     os.path.join("v2", "40000.json"),
+    os.path.join("v2", "44100.json"),
     os.path.join("v2", "32000.json"),
 ]
 

diff --git a/rvc/configs/v1/44100.json b/rvc/configs/v1/44100.json
@@ -0,0 +1,43 @@
+{
+    "train": {
+        "log_interval": 200,
+        "seed": 1234,
+        "learning_rate": 0.0001,
+        "betas": [0.8, 0.99],
+        "eps": 1e-09,
+        "fp16_run": true,
+        "lr_decay": 0.999875,
+        "segment_size": 15876,
+        "c_mel": 45,
+        "c_kl": 1.0
+    },
+    "data": {
+        "max_wav_value": 32768.0,
+        "sample_rate": 44100,
+        "filter_length": 2048,
+        "hop_length": 441,
+        "win_length": 2048,
+        "n_mel_channels": 160,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "model": {
+        "inter_channels": 192,
+        "hidden_channels": 192,
+        "filter_channels": 768,
+        "text_enc_hidden_dim": 256,
+        "n_heads": 2,
+        "n_layers": 6,
+        "kernel_size": 3,
+        "p_dropout": 0,
+        "resblock": "1",
+        "resblock_kernel_sizes": [3,7,11],
+        "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+        "upsample_rates": [7,7,3,3],
+        "upsample_initial_channel": 512,
+        "upsample_kernel_sizes": [14,14,6,6],
+        "use_spectral_norm": false,
+        "gin_channels": 256,
+        "spk_embed_dim": 109
+    }
+}