Skip to content

Commit

Permalink
Introduce MVA (Multi-Vocoder Architecture)
Browse files (browse the repository at this point in the history)
Introduce MVA (Multi-Vocoder Architecture)
  • Loading branch information
blaisewf authored Dec 22, 2024
2 parents 908aa8c + c22a7d4 commit c306f1c
Show file tree
Hide file tree
Showing 32 changed files with 1,858 additions and 644 deletions.
16 changes: 10 additions & 6 deletions assets/Applio_NoUI.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -429,12 +429,14 @@
"sample_rate = \"40k\" # @param [\"32k\", \"40k\", \"48k\"] {allow-input: false}\n",
"sr = int(sample_rate.rstrip(\"k\")) * 1000\n",
"cpu_cores = 2 # @param {type:\"slider\", min:1, max:2, step:1}\n",
"cut_preprocess = True # @param{type:\"boolean\"}\n",
"cut_preprocess = \"Automatic\" # @param [\"Skip\", \"Simple\", \"Automatic\"] {allow-input: false}\n",
"process_effects = False # @param{type:\"boolean\"}\n",
"noise_reduction = False # @param{type:\"boolean\"}\n",
"noise_reduction_strength = 0.7 # @param {type:\"slider\", min:0.0, max:1.0, step:0.1}\n",
"chunk_len = 3.0 # @param {type:\"slider\", min:0.5, max:5.0, step:0.5}\n",
"overlap_len = 0.3 # @param {type:\"slider\", min:0.0, max:0.4, step:0.1}\n",
"\n",
"!python core.py preprocess --model_name \"{model_name}\" --dataset_path \"{dataset_path}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --cut_preprocess \"{cut_preprocess}\" --process_effects \"{process_effects}\" --noise_reduction \"{noise_reduction}\" --noise_reduction_strength \"{noise_reduction_strength}\""
"!python core.py preprocess --model_name \"{model_name}\" --dataset_path \"{dataset_path}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --cut_preprocess \"{cut_preprocess}\" --process_effects \"{process_effects}\" --noise_reduction \"{noise_reduction}\" --noise_reduction_strength \"{noise_reduction_strength}\" --chunk_len \"{chunk_len}\" --overlap_len \"{overlap_len}\""
]
},
{
Expand All @@ -453,10 +455,11 @@
"\n",
"sr = int(sample_rate.rstrip(\"k\")) * 1000\n",
"cpu_cores = 2 # @param {type:\"slider\", min:1, max:2, step:1}\n",
"include_mutes = 2 # @param {type:\"slider\", min:0, max:10, step:1}\n",
"embedder_model = \"contentvec\" # @param [\"contentvec\", \"chinese-hubert-base\", \"japanese-hubert-base\", \"korean-hubert-base\", \"custom\"] {allow-input: false}\n",
"embedder_model_custom = \"\" # @param {type:\"string\"}\n",
"\n",
"!python core.py extract --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --f0_method \"{f0_method}\" --hop_length \"{hop_length}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --gpu \"0\" --embedder_model \"{embedder_model}\" --embedder_model_custom \"{embedder_model_custom}\""
"!python core.py extract --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --f0_method \"{f0_method}\" --hop_length \"{hop_length}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --gpu \"0\" --embedder_model \"{embedder_model}\" --embedder_model_custom \"{embedder_model_custom}\" --include_mutes \"{include_mutes}\""
]
},
{
Expand Down Expand Up @@ -597,7 +600,7 @@
" print(\"Autobackup Disabled\")\n",
"else:\n",
" autobackups = True\n",
" print(\"Autobackup Enabled\") \n",
" print(\"Autobackup Enabled\")\n",
"# @markdown ### ⚙️ Train Settings\n",
"total_epoch = 800 # @param {type:\"integer\"}\n",
"batch_size = 15 # @param {type:\"slider\", min:1, max:25, step:0}\n",
Expand All @@ -618,6 +621,8 @@
"custom_pretrained = False # @param{type:\"boolean\"}\n",
"g_pretrained_path = \"/content/Applio/rvc/models/pretraineds/pretraineds_custom/G48k.pth\" # @param {type:\"string\"}\n",
"d_pretrained_path = \"/content/Applio/rvc/models/pretraineds/pretraineds_custom/D48k.pth\" # @param {type:\"string\"}\n",
"vocoder = \"HiFi-GAN\" # @param [\"HiFi-GAN\", \"MRF HiFi-GAN\", \"RefineGAN\"] {allow-input: false}\n",
"checkpointing = False # @param{type:\"boolean\"}\n",
"\n",
"if \"pretrained\" not in globals():\n",
" pretrained = True\n",
Expand All @@ -636,8 +641,7 @@
" if tensorboard == True:\n",
" %load_ext tensorboard\n",
" %tensorboard --logdir /content/Applio/logs/\n",
" !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --cleanup \"{cleanup}\" --cache_data_in_gpu \"{cache_data_in_gpu}\"\n",
"\n",
" !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --cleanup \"{cleanup}\" --cache_data_in_gpu \"{cache_data_in_gpu}\" --vocoder \"{vocoder}\" --checkpointing \"{checkpointing}\"\n",
"\n",
"server_thread = threading.Thread(target=start_train)\n",
"server_thread.start()\n",
Expand Down
19 changes: 16 additions & 3 deletions assets/i18n/languages/en_US.json
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,7 @@
"By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential.": "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential.",
"Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality.": "Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality.",
"Extract Features": "Extract Features",
"We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.": "We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.",
"We prioritize running the model preprocessing on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.": "We prioritize running the model preprocessing on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.",
"Configure GPU and CPU settings.": "Configure GPU and CPU settings.",
"Cache Dataset in GPU": "Cache Dataset in GPU",
"Cache the dataset in GPU memory to speed up the training process.": "Cache the dataset in GPU memory to speed up the training process.",
"Index Algorithm": "Index Algorithm",
Expand Down Expand Up @@ -321,5 +320,19 @@
"Set the autotune strength - the more you increase it the more it will snap to the chromatic grid.": "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid.",
"Model Author Name": "Model Author Name",
"The name that will appear in the model information.": "The name that will appear in the model information.",
"Set name": "Set name"
"Set name": "Set name",
"Vocoder": "Vocoder",
"Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only).": "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only).",
"Checkpointing": "Checkpointing",
"Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.": "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.",
"Enable Experimental Options": "Enable Experimental Options",
"Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.": "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.",
"Model Settings": "Model Settings",
"Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.": "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.",
"Chunk length (sec)": "Chunk length (sec)",
"Length of the audio slice for 'Simple' method.": "Length of the audio slice for 'Simple' method.",
"Overlap length (sec)": "Overlap length (sec)",
"Length of the overlap between slices for 'Simple' method.": "Length of the overlap between slices for 'Simple' method.",
"Silent training files": "Silent training files",
"Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence.": "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence."
}
84 changes: 63 additions & 21 deletions core.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,10 +421,12 @@ def run_preprocess_script(
dataset_path: str,
sample_rate: int,
cpu_cores: int,
cut_preprocess: bool,
cut_preprocess: str,
process_effects: bool,
noise_reduction: bool,
clean_strength: float,
chunk_len: float,
overlap_len: float,
):
config = get_config()
per = 3.0 if config.is_half else 3.7
Expand All @@ -444,6 +446,8 @@ def run_preprocess_script(
process_effects,
noise_reduction,
clean_strength,
chunk_len,
overlap_len,
],
),
]
Expand All @@ -462,6 +466,7 @@ def run_extract_script(
sample_rate: int,
embedder_model: str,
embedder_model_custom: str = None,
include_mutes: int = 2,
):

model_path = os.path.join(logs_path, model_name)
Expand All @@ -482,6 +487,7 @@ def run_extract_script(
sample_rate,
embedder_model,
embedder_model_custom,
include_mutes
],
),
]
Expand All @@ -502,7 +508,6 @@ def run_train_script(
sample_rate: int,
batch_size: int,
gpu: int,
pitch_guidance: bool,
overtraining_detector: bool,
overtraining_threshold: int,
pretrained: bool,
Expand All @@ -512,15 +517,15 @@ def run_train_script(
custom_pretrained: bool = False,
g_pretrained_path: str = None,
d_pretrained_path: str = None,
vocoder: str = "HiFi-GAN",
checkpointing: bool = False,
):

if pretrained == True:
from rvc.lib.tools.pretrained_selector import pretrained_selector

if custom_pretrained == False:
pg, pd = pretrained_selector(bool(pitch_guidance))[str(rvc_version)][
int(sample_rate)
]
pg, pd = pretrained_selector(str(rvc_version), str(vocoder), True, int(sample_rate))
else:
if g_pretrained_path is None or d_pretrained_path is None:
raise ValueError(
Expand All @@ -546,13 +551,14 @@ def run_train_script(
gpu,
batch_size,
sample_rate,
pitch_guidance,
save_only_latest,
save_every_weights,
cache_data_in_gpu,
overtraining_detector,
overtraining_threshold,
cleanup,
vocoder,
checkpointing
],
),
]
Expand Down Expand Up @@ -1840,7 +1846,7 @@ def parse_arguments():
"--sample_rate",
type=int,
help="Target sampling rate for the audio data.",
choices=[32000, 40000, 48000],
choices=[32000, 40000, 44100, 48000],
required=True,
)
preprocess_parser.add_argument(
Expand All @@ -1851,11 +1857,11 @@ def parse_arguments():
)
preprocess_parser.add_argument(
"--cut_preprocess",
type=lambda x: bool(strtobool(x)),
choices=[True, False],
type=str,
choices=['Skip', 'Simple', 'Automatic'],
help="Cut the dataset into smaller segments for faster preprocessing.",
default=True,
required=False,
default='Automatic',
required=True,
)
preprocess_parser.add_argument(
"--process_effects",
Expand All @@ -1881,6 +1887,22 @@ def parse_arguments():
default=0.7,
required=False,
)
preprocess_parser.add_argument(
"--chunk_len",
type=float,
help="Chunk length.",
choices=[i * 0.5 for i in range(1, 11)],
default=3.0,
required=False,
)
preprocess_parser.add_argument(
"--overlap_len",
type=float,
help="Overlap length.",
choices=[0.0, 0.1, 0.2, 0.3, 0.4],
default=0.3,
required=False,
)

# Parser for 'extract' mode
extract_parser = subparsers.add_parser(
Expand Down Expand Up @@ -1923,15 +1945,15 @@ def parse_arguments():
)
extract_parser.add_argument(
"--gpu",
type=int,
type=str,
help="GPU device to use for feature extraction (optional).",
default="-",
)
extract_parser.add_argument(
"--sample_rate",
type=int,
help="Target sampling rate for the audio data.",
choices=[32000, 40000, 48000],
choices=[32000, 40000, 44100, 48000],
required=True,
)
extract_parser.add_argument(
Expand All @@ -1953,6 +1975,14 @@ def parse_arguments():
help=embedder_model_custom_description,
default=None,
)
extract_parser.add_argument(
"--include_mutes",
type=int,
help="Number of silent files to include.",
choices=range(0, 11),
default=2,
required=True
)

# Parser for 'train' mode
train_parser = subparsers.add_parser("train", help="Train an RVC model.")
Expand All @@ -1966,6 +1996,21 @@ def parse_arguments():
choices=["v1", "v2"],
default="v2",
)
train_parser.add_argument(
"--vocoder",
type=str,
help="Vocoder name",
choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"],
default="HiFi-GAN",
)
train_parser.add_argument(
"--checkpointing",
type=lambda x: bool(strtobool(x)),
choices=[True, False],
help="Enables memory-efficient training.",
default=False,
required=False,
)
train_parser.add_argument(
"--save_every_epoch",
type=int,
Expand Down Expand Up @@ -2014,13 +2059,6 @@ def parse_arguments():
help="GPU device to use for training (e.g., '0').",
default="0",
)
train_parser.add_argument(
"--pitch_guidance",
type=lambda x: bool(strtobool(x)),
choices=[True, False],
help="Enable or disable pitch guidance during training.",
default=True,
)
train_parser.add_argument(
"--pretrained",
type=lambda x: bool(strtobool(x)),
Expand Down Expand Up @@ -2431,6 +2469,8 @@ def main():
process_effects=args.process_effects,
noise_reduction=args.noise_reduction,
clean_strength=args.noise_reduction_strength,
chunk_len=args.chunk_len,
overlap_len=args.overlap_len,
)
elif args.mode == "extract":
run_extract_script(
Expand All @@ -2443,6 +2483,7 @@ def main():
sample_rate=args.sample_rate,
embedder_model=args.embedder_model,
embedder_model_custom=args.embedder_model_custom,
include_mutes=args.include_mutes,
)
elif args.mode == "train":
run_train_script(
Expand All @@ -2455,7 +2496,6 @@ def main():
sample_rate=args.sample_rate,
batch_size=args.batch_size,
gpu=args.gpu,
pitch_guidance=args.pitch_guidance,
overtraining_detector=args.overtraining_detector,
overtraining_threshold=args.overtraining_threshold,
pretrained=args.pretrained,
Expand All @@ -2465,6 +2505,8 @@ def main():
cache_data_in_gpu=args.cache_data_in_gpu,
g_pretrained_path=args.g_pretrained_path,
d_pretrained_path=args.d_pretrained_path,
vocoder=args.vocoder,
checkpointing=args.checkpointing,
)
elif args.mode == "index":
run_index_script(
Expand Down
Binary file added logs/mute/sliced_audios/mute44100.wav
Binary file not shown.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ soundfile==0.12.1
noisereduce
pedalboard
stftpitchshift
soxr

# Machine learning and deep learning
omegaconf>=2.0.6; sys_platform == 'darwin'
Expand Down
2 changes: 2 additions & 0 deletions rvc/configs/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
# Relative paths of the bundled config files, one per
# (version directory, sample-rate JSON file) pair.
# The entry order is kept exactly as it was originally declared.
# NOTE(review): how this list is consumed is not visible here — confirm
# before reordering or deduplicating entries.
version_config_paths = [
    os.path.join(version_dir, config_file)
    for version_dir, config_file in (
        ("v1", "32000.json"),
        ("v1", "40000.json"),
        ("v1", "44100.json"),
        ("v1", "48000.json"),
        ("v2", "48000.json"),
        ("v2", "40000.json"),
        ("v2", "44100.json"),
        ("v2", "32000.json"),
    )
]

Expand Down
43 changes: 43 additions & 0 deletions rvc/configs/v1/44100.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"train": {
"log_interval": 200,
"seed": 1234,
"learning_rate": 0.0001,
"betas": [0.8, 0.99],
"eps": 1e-09,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 15876,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"max_wav_value": 32768.0,
"sample_rate": 44100,
"filter_length": 2048,
"hop_length": 441,
"win_length": 2048,
"n_mel_channels": 160,
"mel_fmin": 0.0,
"mel_fmax": null
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"text_enc_hidden_dim": 256,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [7,7,3,3],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [14,14,6,6],
"use_spectral_norm": false,
"gin_channels": 256,
"spk_embed_dim": 109
}
}
Loading

0 comments on commit c306f1c

Please sign in to comment.