diff --git a/assets/Applio_NoUI.ipynb b/assets/Applio_NoUI.ipynb index 9a0ac6c03..edd9a9ca0 100644 --- a/assets/Applio_NoUI.ipynb +++ b/assets/Applio_NoUI.ipynb @@ -429,12 +429,14 @@ "sample_rate = \"40k\" # @param [\"32k\", \"40k\", \"48k\"] {allow-input: false}\n", "sr = int(sample_rate.rstrip(\"k\")) * 1000\n", "cpu_cores = 2 # @param {type:\"slider\", min:1, max:2, step:1}\n", - "cut_preprocess = True # @param{type:\"boolean\"}\n", + "cut_preprocess = \"Automatic\" # @param [\"Skip\", \"Simple\", \"Automatic\"] {allow-input: false}\n", "process_effects = False # @param{type:\"boolean\"}\n", "noise_reduction = False # @param{type:\"boolean\"}\n", "noise_reduction_strength = 0.7 # @param {type:\"slider\", min:0.0, max:1.0, step:0.1}\n", + "chunk_len = 3.0 # @param {type:\"slider\", min:0.5, max:5.0, step:0.5}\n", + "overlap_len = 0.3 # @param {type:\"slider\", min:0.0, max:0.4, step:0.1}\n", "\n", - "!python core.py preprocess --model_name \"{model_name}\" --dataset_path \"{dataset_path}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --cut_preprocess \"{cut_preprocess}\" --process_effects \"{process_effects}\" --noise_reduction \"{noise_reduction}\" --noise_reduction_strength \"{noise_reduction_strength}\"" + "!python core.py preprocess --model_name \"{model_name}\" --dataset_path \"{dataset_path}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --cut_preprocess \"{cut_preprocess}\" --process_effects \"{process_effects}\" --noise_reduction \"{noise_reduction}\" --noise_reduction_strength \"{noise_reduction_strength}\" --chunk_len \"{chunk_len}\" --overlap_len \"{overlap_len}\"" ] }, { @@ -453,10 +455,11 @@ "\n", "sr = int(sample_rate.rstrip(\"k\")) * 1000\n", "cpu_cores = 2 # @param {type:\"slider\", min:1, max:2, step:1}\n", + "include_mutes = 2 # @param {type:\"slider\", min:0, max:10, step:1}\n", "embedder_model = \"contentvec\" # @param [\"contentvec\", \"chinese-hubert-base\", \"japanese-hubert-base\", \"korean-hubert-base\", \"custom\"] {allow-input: false}\n", "embedder_model_custom = \"\" # @param {type:\"string\"}\n", "\n", - "!python core.py extract --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --f0_method \"{f0_method}\" --hop_length \"{hop_length}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --gpu \"0\" --embedder_model \"{embedder_model}\" --embedder_model_custom \"{embedder_model_custom}\"" + "!python core.py extract --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --f0_method \"{f0_method}\" --hop_length \"{hop_length}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --gpu \"0\" --embedder_model \"{embedder_model}\" --embedder_model_custom \"{embedder_model_custom}\" --include_mutes \"{include_mutes}\"" ] }, { @@ -597,7 +600,7 @@ " print(\"Autobackup Disabled\")\n", "else:\n", " autobackups = True\n", - " print(\"Autobackup Enabled\") \n", + " print(\"Autobackup Enabled\")\n", "# @markdown ### ⚙️ Train Settings\n", "total_epoch = 800 # @param {type:\"integer\"}\n", "batch_size = 15 # @param {type:\"slider\", min:1, max:25, step:0}\n", @@ -618,6 +621,8 @@ "custom_pretrained = False # @param{type:\"boolean\"}\n", "g_pretrained_path = \"/content/Applio/rvc/models/pretraineds/pretraineds_custom/G48k.pth\" # @param {type:\"string\"}\n", "d_pretrained_path = \"/content/Applio/rvc/models/pretraineds/pretraineds_custom/D48k.pth\" # @param {type:\"string\"}\n", + "vocoder = \"HiFi-GAN\" # @param [\"HiFi-GAN\", \"MRF HiFi-GAN\", \"RefineGAN\"] {allow-input: false}\n", + "checkpointing = False # @param{type:\"boolean\"}\n", "\n", "if 
\"pretrained\" not in globals():\n", " pretrained = True\n", @@ -636,8 +641,7 @@ " if tensorboard == True:\n", " %load_ext tensorboard\n", " %tensorboard --logdir /content/Applio/logs/\n", - " !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --cleanup \"{cleanup}\" --cache_data_in_gpu \"{cache_data_in_gpu}\"\n", - "\n", + " !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --cleanup \"{cleanup}\" --cache_data_in_gpu \"{cache_data_in_gpu}\" --vocoder \"{vocoder}\" --checkpointing \"{checkpointing}\"\n", "\n", "server_thread = threading.Thread(target=start_train)\n", "server_thread.start()\n", diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index 083677160..0b3367c3a 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -78,8 +78,7 @@ "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential.": "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential.", "Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality.": "Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality.", "Extract Features": "Extract Features", - "We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.": "We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.", - "We prioritize running the model preprocessing on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.": "We prioritize running the model preprocessing on the GPU for faster performance. 
If you prefer to use the CPU, simply leave the GPU field blank.", + "Configure GPU and CPU settings.": "Configure GPU and CPU settings.", "Cache Dataset in GPU": "Cache Dataset in GPU", "Cache the dataset in GPU memory to speed up the training process.": "Cache the dataset in GPU memory to speed up the training process.", "Index Algorithm": "Index Algorithm", @@ -321,5 +320,19 @@ "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid.": "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid.", "Model Author Name": "Model Author Name", "The name that will appear in the model information.": "The name that will appear in the model information.", - "Set name": "Set name" + "Set name": "Set name", + "Vocoder": "Vocoder", + "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only).": "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only).", + "Checkpointing": "Checkpointing", + "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.": "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.", + "Enable Experimental Options": "Enable Experimental Options", + "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.": "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.", + "Model Settings": "Model Settings", + "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.": "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.", + "Chunk length (sec)": "Chunk length (sec)", + "Length of the audio slice for 'Simple' method.": "Length of the audio slice for 'Simple' method.", + "Overlap length (sec)": "Overlap length (sec)", + "Length of the overlap between slices for 'Simple' method.": "Length of the overlap between slices for 'Simple' method.", + "Silent training files": "Silent training files", + "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence.": "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence." 
} \ No newline at end of file diff --git a/core.py b/core.py index dc3889c19..c50d408ab 100644 --- a/core.py +++ b/core.py @@ -421,10 +421,12 @@ def run_preprocess_script( dataset_path: str, sample_rate: int, cpu_cores: int, - cut_preprocess: bool, + cut_preprocess: str, process_effects: bool, noise_reduction: bool, clean_strength: float, + chunk_len: float, + overlap_len: float, ): config = get_config() per = 3.0 if config.is_half else 3.7 @@ -444,6 +446,8 @@ def run_preprocess_script( process_effects, noise_reduction, clean_strength, + chunk_len, + overlap_len, ], ), ] @@ -462,6 +466,7 @@ def run_extract_script( sample_rate: int, embedder_model: str, embedder_model_custom: str = None, + include_mutes: int = 2, ): model_path = os.path.join(logs_path, model_name) @@ -482,6 +487,7 @@ def run_extract_script( sample_rate, embedder_model, embedder_model_custom, + include_mutes ], ), ] @@ -502,7 +508,6 @@ def run_train_script( sample_rate: int, batch_size: int, gpu: int, - pitch_guidance: bool, overtraining_detector: bool, overtraining_threshold: int, pretrained: bool, @@ -512,15 +517,15 @@ def run_train_script( custom_pretrained: bool = False, g_pretrained_path: str = None, d_pretrained_path: str = None, + vocoder: str = "HiFi-GAN", + checkpointing: bool = False, ): if pretrained == True: from rvc.lib.tools.pretrained_selector import pretrained_selector if custom_pretrained == False: - pg, pd = pretrained_selector(bool(pitch_guidance))[str(rvc_version)][ - int(sample_rate) - ] + pg, pd = pretrained_selector(str(rvc_version), str(vocoder), True, int(sample_rate)) else: if g_pretrained_path is None or d_pretrained_path is None: raise ValueError( @@ -546,13 +551,14 @@ def run_train_script( gpu, batch_size, sample_rate, - pitch_guidance, save_only_latest, save_every_weights, cache_data_in_gpu, overtraining_detector, overtraining_threshold, cleanup, + vocoder, + checkpointing ], ), ] @@ -1840,7 +1846,7 @@ def parse_arguments(): "--sample_rate", type=int, help="Target sampling rate for the audio data.", - choices=[32000, 40000, 48000], + choices=[32000, 40000, 44100, 48000], required=True, ) preprocess_parser.add_argument( @@ -1851,11 +1857,11 @@ def parse_arguments(): ) preprocess_parser.add_argument( "--cut_preprocess", - type=lambda x: bool(strtobool(x)), - choices=[True, False], + type=str, + choices=['Skip', 'Simple', 'Automatic'], help="Cut the dataset into smaller segments for faster preprocessing.", - default=True, - required=False, + default='Automatic', + required=True, ) preprocess_parser.add_argument( "--process_effects", @@ -1881,6 +1887,22 @@ def parse_arguments(): default=0.7, required=False, ) + preprocess_parser.add_argument( + "--chunk_len", + type=float, + help="Chunk length.", + choices=[i * 0.5 for i in range(1, 11)], + default=3.0, + required=False, + ) + preprocess_parser.add_argument( + "--overlap_len", + type=float, + help="Overlap length.", + choices=[0.0, 0.1, 0.2, 0.3, 0.4], + default=0.3, + required=False, + ) # Parser for 'extract' mode extract_parser = subparsers.add_parser( @@ -1923,7 +1945,7 @@ def parse_arguments(): ) extract_parser.add_argument( "--gpu", - type=int, + type=str, help="GPU device to use for feature extraction (optional).", default="-", ) @@ -1931,7 +1953,7 @@ def parse_arguments(): "--sample_rate", type=int, help="Target sampling rate for the audio data.", - choices=[32000, 40000, 48000], + choices=[32000, 40000, 44100, 48000], required=True, ) extract_parser.add_argument( @@ -1953,6 +1975,14 @@ def parse_arguments(): 
help=embedder_model_custom_description, default=None, ) + extract_parser.add_argument( + "--include_mutes", + type=int, + help="Number of silent files to include.", + choices=range(0, 11), + default=2, + required=True + ) # Parser for 'train' mode train_parser = subparsers.add_parser("train", help="Train an RVC model.") @@ -1966,6 +1996,21 @@ def parse_arguments(): choices=["v1", "v2"], default="v2", ) + train_parser.add_argument( + "--vocoder", + type=str, + help="Vocoder name", + choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], + default="HiFi-GAN", + ) + train_parser.add_argument( + "--checkpointing", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help="Enables memory-efficient training.", + default=False, + required=False, + ) train_parser.add_argument( "--save_every_epoch", type=int, @@ -2014,13 +2059,6 @@ def parse_arguments(): help="GPU device to use for training (e.g., '0').", default="0", ) - train_parser.add_argument( - "--pitch_guidance", - type=lambda x: bool(strtobool(x)), - choices=[True, False], - help="Enable or disable pitch guidance during training.", - default=True, - ) train_parser.add_argument( "--pretrained", type=lambda x: bool(strtobool(x)), @@ -2431,6 +2469,8 @@ def main(): process_effects=args.process_effects, noise_reduction=args.noise_reduction, clean_strength=args.noise_reduction_strength, + chunk_len=args.chunk_len, + overlap_len=args.overlap_len, ) elif args.mode == "extract": run_extract_script( @@ -2443,6 +2483,7 @@ def main(): sample_rate=args.sample_rate, embedder_model=args.embedder_model, embedder_model_custom=args.embedder_model_custom, + include_mutes=args.include_mutes, ) elif args.mode == "train": run_train_script( @@ -2455,7 +2496,6 @@ def main(): sample_rate=args.sample_rate, batch_size=args.batch_size, gpu=args.gpu, - pitch_guidance=args.pitch_guidance, overtraining_detector=args.overtraining_detector, overtraining_threshold=args.overtraining_threshold, pretrained=args.pretrained, @@ -2465,6 +2505,8 @@ def main(): cache_data_in_gpu=args.cache_data_in_gpu, g_pretrained_path=args.g_pretrained_path, d_pretrained_path=args.d_pretrained_path, + vocoder=args.vocoder, + checkpointing=args.checkpointing, ) elif args.mode == "index": run_index_script( diff --git a/logs/mute/sliced_audios/mute44100.wav b/logs/mute/sliced_audios/mute44100.wav new file mode 100644 index 000000000..de029a9c1 Binary files /dev/null and b/logs/mute/sliced_audios/mute44100.wav differ diff --git a/requirements.txt b/requirements.txt index 78cf0a35b..5167f3766 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ soundfile==0.12.1 noisereduce pedalboard stftpitchshift +soxr # Machine learning and deep learning omegaconf>=2.0.6; sys_platform == 'darwin' diff --git a/rvc/configs/config.py b/rvc/configs/config.py index 71f12bc22..dbdbdecf0 100644 --- a/rvc/configs/config.py +++ b/rvc/configs/config.py @@ -5,9 +5,11 @@ version_config_paths = [ os.path.join("v1", "32000.json"), os.path.join("v1", "40000.json"), + os.path.join("v1", "44100.json"), os.path.join("v1", "48000.json"), os.path.join("v2", "48000.json"), os.path.join("v2", "40000.json"), + os.path.join("v2", "44100.json"), os.path.join("v2", "32000.json"), ] diff --git a/rvc/configs/v1/44100.json b/rvc/configs/v1/44100.json new file mode 100644 index 000000000..39246c326 --- /dev/null +++ b/rvc/configs/v1/44100.json @@ -0,0 +1,43 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 0.0001, + "betas": [0.8, 0.99], + "eps": 1e-09, + "fp16_run": true, + "lr_decay": 
0.999875, + "segment_size": 15876, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 44100, + "filter_length": 2048, + "hop_length": 441, + "win_length": 2048, + "n_mel_channels": 160, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [7,7,3,3], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [14,14,6,6], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} \ No newline at end of file diff --git a/rvc/configs/v2/44100.json b/rvc/configs/v2/44100.json new file mode 100644 index 000000000..dd1e57b21 --- /dev/null +++ b/rvc/configs/v2/44100.json @@ -0,0 +1,43 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 0.0001, + "betas": [0.8, 0.99], + "eps": 1e-09, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 15876, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 44100, + "filter_length": 2048, + "hop_length": 441, + "win_length": 2048, + "n_mel_channels": 160, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [7,7,3,3], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [14,14,6,6], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} \ No newline at end of file diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py index 69813942a..ad8cddea3 100644 --- a/rvc/infer/infer.py +++ b/rvc/infer/infer.py @@ -481,15 +481,13 @@ def setup_network(self): *self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=self.text_enc_hidden_dim, - is_half=self.config.is_half, - vocoder=self.vocoder, + is_half=False, + vocoder=self.vocoder ) del self.net_g.enc_q self.net_g.load_state_dict(self.cpt["weight"], strict=False) self.net_g.eval().to(self.config.device) - self.net_g = ( - self.net_g.half() if self.config.is_half else self.net_g.float() - ) + self.net_g = self.net_g.float() def setup_vc_instance(self): """ diff --git a/rvc/infer/pipeline.py b/rvc/infer/pipeline.py index bf82afd4c..6e73efcbc 100644 --- a/rvc/infer/pipeline.py +++ b/rvc/infer/pipeline.py @@ -485,7 +485,7 @@ def voice_conversion( pitch, pitchf = None, None p_len = torch.tensor([p_len], device=self.device).long() audio1 = ( - (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) + (net_g.infer(feats.float(), p_len, pitch, pitchf.float(), sid)[0][0, 0]) .data.cpu() .float() .numpy() diff --git a/rvc/lib/algorithm/commons.py b/rvc/lib/algorithm/commons.py index eed8d7c37..cd7d9da2c 100644 --- a/rvc/lib/algorithm/commons.py +++ b/rvc/lib/algorithm/commons.py @@ -98,7 +98,7 @@ def rand_slice_segments(x, x_lengths=None, segment_size=4): if x_lengths is None: x_lengths = t ids_str_max = x_lengths - segment_size + 1 - ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long) ret = slice_segments(x, ids_str, 
segment_size, dim=3) return ret, ids_str diff --git a/rvc/lib/algorithm/discriminators.py b/rvc/lib/algorithm/discriminators.py index ecef449a5..c9ede6c27 100644 --- a/rvc/lib/algorithm/discriminators.py +++ b/rvc/lib/algorithm/discriminators.py @@ -1,5 +1,6 @@ import torch from torch.nn.utils.parametrizations import spectral_norm, weight_norm +import torch.utils.checkpoint as checkpoint from rvc.lib.algorithm.commons import get_padding from rvc.lib.algorithm.residuals import LRELU_SLOPE @@ -20,28 +21,29 @@ class MultiPeriodDiscriminator(torch.nn.Module): Defaults to False. """ - def __init__(self, version: str, use_spectral_norm: bool = False): + def __init__(self, version: str, use_spectral_norm: bool = False, checkpointing: bool = False): super(MultiPeriodDiscriminator, self).__init__() periods = ( [2, 3, 5, 7, 11, 17] if version == "v1" else [2, 3, 5, 7, 11, 17, 23, 37] ) + self.checkpointing = checkpointing self.discriminators = torch.nn.ModuleList( - [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods] + [DiscriminatorS(use_spectral_norm=use_spectral_norm, checkpointing=checkpointing)] + + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm, checkpointing=checkpointing) for p in periods] ) def forward(self, y, y_hat): - """ - Forward pass of the multi-period discriminator. - - Args: - y (torch.Tensor): Real audio signal. - y_hat (torch.Tensor): Fake audio signal. - """ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], [] for d in self.discriminators: - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) + if self.training and self.checkpointing: + def forward_discriminator(d, y, y_hat): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + return y_d_r, fmap_r, y_d_g, fmap_g + y_d_r, fmap_r, y_d_g, fmap_g = checkpoint.checkpoint(forward_discriminator, d, y, y_hat, use_reentrant=False) + else: + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) y_d_rs.append(y_d_r) y_d_gs.append(y_d_g) fmap_rs.append(fmap_r) @@ -59,8 +61,9 @@ class DiscriminatorS(torch.nn.Module): convolutional layers that are applied to the input signal. """ - def __init__(self, use_spectral_norm: bool = False): + def __init__(self, use_spectral_norm: bool = False, checkpointing: bool = False): super(DiscriminatorS, self).__init__() + self.checkpointing = checkpointing norm_f = spectral_norm if use_spectral_norm else weight_norm self.convs = torch.nn.ModuleList( [ @@ -73,18 +76,16 @@ def __init__(self, use_spectral_norm: bool = False): ] ) self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1)) - self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) + self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE, inplace=True) def forward(self, x): - """ - Forward pass of the discriminator. - - Args: - x (torch.Tensor): Input audio signal. 
- """ fmap = [] for conv in self.convs: - x = self.lrelu(conv(x)) + if self.training and self.checkpointing: + x = checkpoint.checkpoint(conv, x, use_reentrant = False) + x = checkpoint.checkpoint(self.lrelu, x, use_reentrant = False) + else: + x = self.lrelu(conv(x)) fmap.append(x) x = self.conv_post(x) fmap.append(x) @@ -114,8 +115,10 @@ def __init__( kernel_size: int = 5, stride: int = 3, use_spectral_norm: bool = False, + checkpointing: bool = False, ): super(DiscriminatorP, self).__init__() + self.checkpointing = checkpointing self.period = period norm_f = spectral_norm if use_spectral_norm else weight_norm @@ -138,15 +141,9 @@ def __init__( ) self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) + self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE, inplace=True) def forward(self, x): - """ - Forward pass of the discriminator. - - Args: - x (torch.Tensor): Input audio signal. - """ fmap = [] b, c, t = x.shape if t % self.period != 0: @@ -155,7 +152,11 @@ def forward(self, x): x = x.view(b, c, -1, self.period) for conv in self.convs: - x = self.lrelu(conv(x)) + if self.training and self.checkpointing: + x = checkpoint.checkpoint(conv, x, use_reentrant = False) + x = checkpoint.checkpoint(self.lrelu, x, use_reentrant = False) + else: + x = self.lrelu(conv(x)) fmap.append(x) x = self.conv_post(x) diff --git a/rvc/lib/algorithm/encoders.py b/rvc/lib/algorithm/encoders.py index e742378ae..3f94a1478 100644 --- a/rvc/lib/algorithm/encoders.py +++ b/rvc/lib/algorithm/encoders.py @@ -85,7 +85,8 @@ def forward(self, x, x_mask): class TextEncoder(torch.nn.Module): - """Text Encoder with configurable embedding dimension. + """ + Text Encoder with configurable embedding dimension. Args: out_channels (int): Output channels of the encoder. @@ -152,7 +153,8 @@ def forward( class PosteriorEncoder(torch.nn.Module): - """Posterior Encoder for inferring latent representation. + """ + Posterior Encoder for inferring latent representation. Args: in_channels (int): Number of channels in the input. @@ -211,11 +213,9 @@ def forward( return z, m, logs, x_mask def remove_weight_norm(self): - """Removes weight normalization from the encoder.""" self.enc.remove_weight_norm() def __prepare_scriptable__(self): - """Prepares the module for scripting.""" for hook in self.enc._forward_pre_hooks.values(): if ( hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" diff --git a/rvc/lib/algorithm/generators.py b/rvc/lib/algorithm/generators/hifigan.py similarity index 74% rename from rvc/lib/algorithm/generators.py rename to rvc/lib/algorithm/generators/hifigan.py index c380eabfc..4f1f6cab5 100644 --- a/rvc/lib/algorithm/generators.py +++ b/rvc/lib/algorithm/generators/hifigan.py @@ -7,19 +7,22 @@ from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock from rvc.lib.algorithm.commons import init_weights +class HiFiGANGenerator(torch.nn.Module): + """ + HiFi-GAN Generator module for audio synthesis. -class Generator(torch.nn.Module): - """Generator for synthesizing audio. + This module implements the generator part of the HiFi-GAN architecture, + which uses transposed convolutions for upsampling and residual blocks for + refining the audio output. It can also incorporate global conditioning. Args: - initial_channel (int): Number of channels in the initial convolutional layer. - resblock (str): Type of residual block to use (1 or 2). - resblock_kernel_sizes (list): Kernel sizes of the residual blocks. 
- resblock_dilation_sizes (list): Dilation rates of the residual blocks. - upsample_rates (list): Upsampling rates. - upsample_initial_channel (int): Number of channels in the initial upsampling layer. - upsample_kernel_sizes (list): Kernel sizes of the upsampling layers. - gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0. + initial_channel (int): Number of input channels to the initial convolutional layer. + resblock_kernel_sizes (list): List of kernel sizes for the residual blocks. + resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size. + upsample_rates (list): List of upsampling factors for each upsampling layer. + upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer. + upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling. + gin_channels (int, optional): Number of input channels for the global conditioning. If 0, no global conditioning is used. Defaults to 0. """ def __init__( @@ -32,7 +35,7 @@ def __init__( upsample_kernel_sizes: list, gin_channels: int = 0, ): - super(Generator, self).__init__() + super(HiFiGANGenerator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = torch.nn.Conv1d( @@ -76,7 +79,7 @@ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): x = self.ups[i](x) xs = None for j in range(self.num_kernels): - if xs == None: + if xs is None: xs = self.resblocks[i * self.num_kernels + j](x) else: xs += self.resblocks[i * self.num_kernels + j](x) @@ -89,7 +92,6 @@ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): return x def __prepare_scriptable__(self): - """Prepares the module for scripting.""" for l in self.ups_and_resblocks: for hook in l._forward_pre_hooks.values(): if ( @@ -100,23 +102,24 @@ def __prepare_scriptable__(self): return self def remove_weight_norm(self): - """Removes weight normalization from the upsampling and residual blocks.""" for l in self.ups: remove_weight_norm(l) for l in self.resblocks: l.remove_weight_norm() - class SineGenerator(torch.nn.Module): """ - A sine wave generator that synthesizes waveforms with optional harmonic overtones and noise. + Sine wave generator with optional harmonic overtones and noise. + + This module generates sine waves for a fundamental frequency and its harmonics. + It can also add Gaussian noise and apply a voiced/unvoiced mask. Args: - sampling_rate (int): The sampling rate in Hz. - num_harmonics (int, optional): The number of harmonic overtones to include. Defaults to 0. - sine_amplitude (float, optional): The amplitude of the sine waveform. Defaults to 0.1. - noise_stddev (float, optional): The standard deviation of Gaussian noise. Defaults to 0.003. - voiced_threshold (float, optional): F0 threshold for distinguishing voiced/unvoiced frames. Defaults to 0. + sampling_rate (int): The sampling rate of the audio in Hz. + num_harmonics (int, optional): The number of harmonic overtones to generate. Defaults to 0. + sine_amplitude (float, optional): The amplitude of the sine wave components. Defaults to 0.1. + noise_stddev (float, optional): The standard deviation of the additive Gaussian noise. Defaults to 0.003. + voiced_threshold (float, optional): The threshold for the fundamental frequency (F0) to determine if a frame is voiced. 
Defaults to 0.0. """ def __init__( @@ -137,21 +140,21 @@ def __init__( def _compute_voiced_unvoiced(self, f0: torch.Tensor): """ - Generate a binary mask to indicate voiced/unvoiced frames. + Generates a binary mask indicating voiced/unvoiced frames based on the fundamental frequency. Args: - f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length). + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length). """ uv_mask = (f0 > self.voiced_threshold).float() return uv_mask def _generate_sine_wave(self, f0: torch.Tensor, upsampling_factor: int): """ - Generate sine waves for the fundamental frequency and its harmonics. + Generates sine waves for the fundamental frequency and its harmonics. Args: - f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1). - upsampling_factor (int): Upsampling factor. + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1). + upsampling_factor (int): The factor by which to upsample the sine wave. """ batch_size, length, _ = f0.shape @@ -187,13 +190,6 @@ def _generate_sine_wave(self, f0: torch.Tensor, upsampling_factor: int): return sine_waves def forward(self, f0: torch.Tensor, upsampling_factor: int): - """ - Forward pass to generate sine waveforms with noise and voiced/unvoiced masking. - - Args: - f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1). - upsampling_factor (int): Upsampling factor. - """ with torch.no_grad(): # Expand `f0` to include waveform dimensions f0 = f0.unsqueeze(-1) @@ -224,4 +220,4 @@ def forward(self, f0: torch.Tensor, upsampling_factor: int): # Combine sine waves and noise sine_waveforms = sine_waves * voiced_mask + noise - return sine_waveforms, voiced_mask, noise + return sine_waveforms, voiced_mask, noise \ No newline at end of file diff --git a/rvc/lib/algorithm/generators/hifigan_mrf.py b/rvc/lib/algorithm/generators/hifigan_mrf.py new file mode 100644 index 000000000..e3834ab82 --- /dev/null +++ b/rvc/lib/algorithm/generators/hifigan_mrf.py @@ -0,0 +1,379 @@ +import math +import numpy as np +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +import torch.utils.checkpoint as checkpoint +from typing import Optional + +LRELU_SLOPE = 0.1 + + +class MRFLayer(torch.nn.Module): + """ + A single layer of the Multi-Receptive Field (MRF) block. + + This layer consists of two 1D convolutional layers with weight normalization + and Leaky ReLU activation in between. The first convolution has a dilation, + while the second has a dilation of 1. A skip connection is added from the input + to the output. + + Args: + channels (int): The number of input and output channels. + kernel_size (int): The kernel size of the convolutional layers. + dilation (int): The dilation rate for the first convolutional layer. 
+ """ + + def __init__(self, channels, kernel_size, dilation): + super().__init__() + self.conv1 = weight_norm( + torch.nn.Conv1d( + channels, + channels, + kernel_size, + padding=(kernel_size * dilation - dilation) // 2, + dilation=dilation, + ) + ) + self.conv2 = weight_norm( + torch.nn.Conv1d( + channels, channels, kernel_size, padding=kernel_size // 2, dilation=1 + ) + ) + + def forward(self, x: torch.Tensor): + y = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + y = self.conv1(y) + y = torch.nn.functional.leaky_relu(y, LRELU_SLOPE) + y = self.conv2(y) + return x + y + + def remove_weight_norm(self): + remove_weight_norm(self.conv1) + remove_weight_norm(self.conv2) + + +class MRFBlock(torch.nn.Module): + """ + A Multi-Receptive Field (MRF) block. + + This block consists of multiple MRFLayers with different dilation rates. + It applies each layer sequentially to the input. + + Args: + channels (int): The number of input and output channels for the MRFLayers. + kernel_size (int): The kernel size for the convolutional layers in the MRFLayers. + dilations (list[int]): A list of dilation rates for the MRFLayers. + """ + + def __init__(self, channels, kernel_size, dilations): + super().__init__() + self.layers = torch.nn.ModuleList() + for dilation in dilations: + self.layers.append(MRFLayer(channels, kernel_size, dilation)) + + def forward(self, x: torch.Tensor): + for layer in self.layers: + x = layer(x) + return x + + def remove_weight_norm(self): + for layer in self.layers: + layer.remove_weight_norm() + + +class SineGenerator(torch.nn.Module): + """ + Definition of sine generator + + Generates sine waveforms with optional harmonics and additive noise. + Can be used to create harmonic noise source for neural vocoders. + + Args: + samp_rate (int): Sampling rate in Hz. + harmonic_num (int): Number of harmonic overtones (default 0). + sine_amp (float): Amplitude of sine-waveform (default 0.1). + noise_std (float): Standard deviation of Gaussian noise (default 0.003). + voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0). + """ + + def __init__( + self, + samp_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + noise_std: float = 0.003, + voiced_threshold: float = 0, + ): + super(SineGenerator, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0: torch.Tensor): + """ + Generates voiced/unvoiced (UV) signal based on the fundamental frequency (F0). + + Args: + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1). + """ + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def _f02sine(self, f0_values: torch.Tensor): + """ + Generates sine waveforms based on the fundamental frequency (F0) and its harmonics. + + Args: + f0_values (torch.Tensor): Tensor of fundamental frequency and its harmonics, + shape (batch_size, length, dim), where dim indicates + the fundamental tone and overtones. + """ + # convert to F0 in rad. 
The integer part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand( + f0_values.shape[0], f0_values.shape[2], device=f0_values.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) + + return sines + + def forward(self, f0: torch.Tensor): + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) + + sine_waves = self._f02sine(f0_buf) * self.sine_amp + + uv = self._f02uv(f0) + + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """ + Generates harmonic and noise source features. + + This module uses the SineGenerator to create harmonic signals based on the + fundamental frequency (F0) and merges them into a single excitation signal. + + Args: + sample_rate (int): Sampling rate in Hz. + harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0. + sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003. + voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0. + """ + + def __init__( + self, + sampling_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + add_noise_std: float = 0.003, + voiced_threshold: float = 0, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGenerator( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x: torch.Tensor): + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + return sine_merge, None, None + + +class HiFiGANMRFGenerator(torch.nn.Module): + """ + HiFi-GAN generator with Multi-Receptive Field (MRF) blocks. + + This generator takes an input feature sequence and fundamental frequency (F0) + as input and generates an audio waveform. It utilizes transposed convolutions + for upsampling and MRF blocks for feature refinement. It can also condition + on global conditioning features. + + Args: + in_channel (int): Number of input channels. + upsample_initial_channel (int): Number of channels after the initial convolution. + upsample_rates (list[int]): List of upsampling rates for the transposed convolutions. + upsample_kernel_sizes (list[int]): List of kernel sizes for the transposed convolutions. 
+ resblock_kernel_sizes (list[int]): List of kernel sizes for the convolutional layers in the MRF blocks. + resblock_dilations (list[list[int]]): List of lists of dilation rates for the MRF blocks. + gin_channels (int): Number of global conditioning input channels (0 if no global conditioning). + sample_rate (int): Sampling rate of the audio. + harmonic_num (int): Number of harmonics to generate. + checkpointing (bool): Whether to use checkpointing to save memory during training (default: False). + """ + + def __init__( + self, + in_channel: int, + upsample_initial_channel: int, + upsample_rates: list[int], + upsample_kernel_sizes: list[int], + resblock_kernel_sizes: list[int], + resblock_dilations: list[list[int]], + gin_channels: int, + sample_rate: int, + harmonic_num: int, + checkpointing: bool = False, + ): + super().__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.checkpointing = checkpointing + + self.f0_upsample = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num) + + self.conv_pre = weight_norm( + torch.nn.Conv1d( + in_channel, upsample_initial_channel, kernel_size=7, stride=1, padding=3 + ) + ) + self.upsamples = torch.nn.ModuleList() + self.noise_convs = torch.nn.ModuleList() + + stride_f0s = [ + math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 + for i in range(len(upsample_rates)) + ] + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + # handling odd upsampling rates + if u % 2 == 0: + # old method + padding = (k - u) // 2 + else: + padding = u // 2 + u % 2 + + self.upsamples.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + kernel_size=k, + stride=u, + padding=padding, + output_padding=u % 2, + ) + ) + ) + """ handling odd upsampling rates + # s k p + # 40 80 20 + # 32 64 16 + # 4 8 2 + # 2 3 1 + # 63 125 31 + # 9 17 4 + # 3 5 1 + # 1 1 0 + """ + stride = stride_f0s[i] + kernel = 1 if stride == 1 else stride * 2 - stride % 2 + padding = 0 if stride == 1 else (kernel - stride) // 2 + + self.noise_convs.append( + torch.nn.Conv1d( + 1, + upsample_initial_channel // (2 ** (i + 1)), + kernel_size=kernel, + stride=stride, + padding=padding, + ) + ) + self.mrfs = torch.nn.ModuleList() + for i in range(len(self.upsamples)): + channel = upsample_initial_channel // (2 ** (i + 1)) + self.mrfs.append( + torch.nn.ModuleList( + [ + MRFBlock(channel, kernel_size=k, dilations=d) + for k, d in zip(resblock_kernel_sizes, resblock_dilations) + ] + ) + ) + self.conv_post = weight_norm( + torch.nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3) + ) + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward( + self, x: torch.Tensor, f0: torch.Tensor, g: Optional[torch.Tensor] = None + ): + f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2) + har_source, _, _ = self.m_source(f0) + har_source = har_source.transpose(-1, -2) + + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for ups, mrf, noise_conv in zip(self.upsamples, self.mrfs, self.noise_convs): + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + + if self.training and self.checkpointing: + x = checkpoint.checkpoint(ups, x, use_reentrant=False) + else: + x = ups(x) + + x += noise_conv(har_source) + + def mrf_sum(x, layers): + return sum(layer(x) for layer in layers) / self.num_kernels + + if self.training and self.checkpointing: + x = 
checkpoint.checkpoint(mrf_sum, x, mrf, use_reentrant=False) + else: + x = mrf_sum(x, mrf) + + x = torch.nn.functional.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.conv_pre) + for up in self.upsamples: + remove_weight_norm(up) + for mrf in self.mrfs: + mrf.remove_weight_norm() + remove_weight_norm(self.conv_post) diff --git a/rvc/lib/algorithm/nsf.py b/rvc/lib/algorithm/generators/hifigan_nsf.py similarity index 53% rename from rvc/lib/algorithm/nsf.py rename to rvc/lib/algorithm/generators/hifigan_nsf.py index 514b5371d..c17b7a6a1 100644 --- a/rvc/lib/algorithm/nsf.py +++ b/rvc/lib/algorithm/generators/hifigan_nsf.py @@ -2,24 +2,26 @@ import torch from torch.nn.utils import remove_weight_norm from torch.nn.utils.parametrizations import weight_norm +import torch.utils.checkpoint as checkpoint from typing import Optional -from rvc.lib.algorithm.generators import SineGenerator +from rvc.lib.algorithm.generators.hifigan import SineGenerator from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock from rvc.lib.algorithm.commons import init_weights - class SourceModuleHnNSF(torch.nn.Module): """ - Source Module for harmonic-plus-noise excitation. + Source Module for generating harmonic and noise components for audio synthesis. + + This module generates a harmonic source signal using sine waves and adds + optional noise. It's often used in neural vocoders as a source of excitation. Args: - sample_rate (int): Sampling rate in Hz. - harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0. - sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1. - add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003. - voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0. - is_half (bool, optional): Whether to use half precision. Defaults to True. + sample_rate (int): Sampling rate of the audio in Hz. + harmonic_num (int, optional): Number of harmonic overtones to generate above the fundamental frequency (F0). Defaults to 0. + sine_amp (float, optional): Amplitude of the sine wave components. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of the additive white Gaussian noise. Defaults to 0.003. + voiced_threshod (float, optional): Threshold for the fundamental frequency (F0) to determine if a frame is voiced. If F0 is below this threshold, it's considered unvoiced. Defaults to 0. """ def __init__( @@ -29,13 +31,11 @@ def __init__( sine_amp: float = 0.1, add_noise_std: float = 0.003, voiced_threshod: float = 0, - is_half: bool = True, ): super(SourceModuleHnNSF, self).__init__() self.sine_amp = sine_amp self.noise_std = add_noise_std - self.is_half = is_half self.l_sin_gen = SineGenerator( sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod @@ -49,22 +49,24 @@ def forward(self, x: torch.Tensor, upsample_factor: int = 1): sine_merge = self.l_tanh(self.l_linear(sine_wavs)) return sine_merge, None, None - -class GeneratorNSF(torch.nn.Module): +class HiFiGANNSFGenerator(torch.nn.Module): """ - Generator for synthesizing audio using the NSF (Neural Source Filter) approach. + Generator module based on the Neural Source Filter (NSF) architecture. + + This generator synthesizes audio by first generating a source excitation signal + (harmonic and noise) and then filtering it through a series of upsampling and + residual blocks. 
Global conditioning can be applied to influence the generation. Args: - initial_channel (int): Number of channels in the initial convolutional layer. - resblock (str): Type of residual block to use (1 or 2). - resblock_kernel_sizes (list): Kernel sizes of the residual blocks. - resblock_dilation_sizes (list): Dilation rates of the residual blocks. - upsample_rates (list): Upsampling rates. - upsample_initial_channel (int): Number of channels in the initial upsampling layer. - upsample_kernel_sizes (list): Kernel sizes of the upsampling layers. - gin_channels (int): Number of channels for the global conditioning input. - sr (int): Sampling rate. - is_half (bool, optional): Whether to use half precision. Defaults to False. + initial_channel (int): Number of input channels to the initial convolutional layer. + resblock_kernel_sizes (list): List of kernel sizes for the residual blocks. + resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size. + upsample_rates (list): List of upsampling factors for each upsampling layer. + upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer. + upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling. + gin_channels (int): Number of input channels for the global conditioning. If 0, no global conditioning is used. + sr (int): Sampling rate of the audio. + checkpointing (bool, optional): Whether to use gradient checkpointing to save memory during training. Defaults to False. """ def __init__( @@ -77,15 +79,16 @@ def __init__( upsample_kernel_sizes: list, gin_channels: int, sr: int, - is_half: bool = False, + checkpointing: bool = False, ): - super(GeneratorNSF, self).__init__() + super(HiFiGANNSFGenerator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) + self.checkpointing = checkpointing self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) self.m_source = SourceModuleHnNSF( - sample_rate=sr, harmonic_num=0, is_half=is_half + sample_rate=sr, harmonic_num=0 ) self.conv_pre = torch.nn.Conv1d( @@ -105,6 +108,13 @@ def __init__( ] for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + # handling odd upsampling rates + if u % 2 == 0: + # old method + padding = (k - u) // 2 + else: + padding = u // 2 + u % 2 + self.ups.append( weight_norm( torch.nn.ConvTranspose1d( @@ -112,18 +122,33 @@ def __init__( channels[i], k, u, - padding=(k - u) // 2, + padding=padding, + output_padding=u % 2, ) ) ) + """ handling odd upsampling rates + # s k p + # 40 80 20 + # 32 64 16 + # 4 8 2 + # 2 3 1 + # 63 125 31 + # 9 17 4 + # 3 5 1 + # 1 1 0 + """ + stride = stride_f0s[i] + kernel = 1 if stride == 1 else stride * 2 - stride % 2 + padding = 0 if stride == 1 else (kernel - stride) // 2 self.noise_convs.append( torch.nn.Conv1d( 1, channels[i], - kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1), - stride=stride_f0s[i], - padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0), + kernel_size=kernel, + stride=stride, + padding=padding, ) ) @@ -144,7 +169,7 @@ def __init__( self.upp = math.prod(upsample_rates) self.lrelu_slope = LRELU_SLOPE - def forward(self, x, f0, g: Optional[torch.Tensor] = None): + def forward(self, x: torch.Tensor, f0: torch.Tensor, g: Optional[torch.Tensor] = None): har_source, _, _ = self.m_source(f0, self.upp) har_source = 
har_source.transpose(1, 2) @@ -155,14 +180,27 @@ def forward(self, x, f0, g: Optional[torch.Tensor] = None): for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): x = torch.nn.functional.leaky_relu(x, self.lrelu_slope) - x = ups(x) + + # Apply upsampling layer + if self.training and self.checkpointing: + x = checkpoint.checkpoint(ups, x, use_reentrant=False) + else: + x = ups(x) + + # Add noise excitation x += noise_convs(har_source) - xs = sum( - self.resblocks[j](x) - for j in range(i * self.num_kernels, (i + 1) * self.num_kernels) - ) - x = xs / self.num_kernels + # Apply residual blocks + def resblock_forward(x, blocks): + return sum(block(x) for block in blocks) / len(blocks) + + blocks = self.resblocks[i * self.num_kernels : (i + 1) * self.num_kernels] + + # Checkpoint or regular computation for ResBlocks + if self.training and self.checkpointing: + x = checkpoint.checkpoint(resblock_forward, x, blocks, use_reentrant=False) + else: + x = resblock_forward(x, blocks) x = torch.nn.functional.leaky_relu(x) x = torch.tanh(self.conv_post(x)) @@ -190,4 +228,4 @@ def __prepare_scriptable__(self): and hook.__class__.__name__ == "WeightNorm" ): remove_weight_norm(l) - return self + return self \ No newline at end of file diff --git a/rvc/lib/algorithm/generators/refinegan.py b/rvc/lib/algorithm/generators/refinegan.py new file mode 100644 index 000000000..0af95eeb3 --- /dev/null +++ b/rvc/lib/algorithm/generators/refinegan.py @@ -0,0 +1,480 @@ +import numpy as np +import torch +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations +import torch.utils.checkpoint as checkpoint + +from rvc.lib.algorithm.commons import get_padding + +class ResBlock(torch.nn.Module): + """ + Residual block with multiple dilated convolutions. + + This block applies a sequence of dilated convolutional layers with Leaky ReLU activation. + It's designed to capture information at different scales due to the varying dilation rates. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int, optional): Kernel size for the convolutional layers. Defaults to 7. + dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers. Defaults to (1, 3, 5). + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2. 
+ """ + + def __init__( + self, + *, + in_channels: int, + out_channels: int, + kernel_size: int = 7, + dilation: tuple[int] = (1, 3, 5), + leaky_relu_slope: float = 0.2, + ): + super(ResBlock, self).__init__() + + self.leaky_relu_slope = leaky_relu_slope + self.in_channels = in_channels + self.out_channels = out_channels + + self.convs1 = torch.nn.ModuleList( + [ + weight_norm( + torch.nn.Conv1d( + in_channels=in_channels if idx == 0 else out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + dilation=d, + padding=get_padding(kernel_size, d), + ) + ) + for idx, d in enumerate(dilation) + ] + ) + self.convs1.apply(self.init_weights) + + self.convs2 = torch.nn.ModuleList( + [ + weight_norm( + torch.nn.Conv1d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + dilation=d, + padding=get_padding(kernel_size, d), + ) + ) + for idx, d in enumerate(dilation) + ] + ) + self.convs2.apply(self.init_weights) + + def forward(self, x: torch.Tensor): + for idx, (c1, c2) in enumerate(zip(self.convs1, self.convs2)): + xt = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope) + xt = c1(xt) + xt = torch.nn.functional.leaky_relu(xt, self.leaky_relu_slope) + xt = c2(xt) + + if idx != 0 or self.in_channels == self.out_channels: + x = xt + x + else: + x = xt + + return x + + def remove_parametrizations(self): + for c1, c2 in zip(self.convs1, self.convs2): + remove_parametrizations(c1) + remove_parametrizations(c2) + + def init_weights(self, m): + if type(m) == torch.nn.Conv1d: + m.weight.data.normal_(0, 0.01) + m.bias.data.fill_(0.0) + +class AdaIN(torch.nn.Module): + """ + Adaptive Instance Normalization layer. + + This layer applies a scaling factor to the input based on a learnable weight. + + Args: + channels (int): Number of input channels. + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation applied after scaling. Defaults to 0.2. + """ + def __init__( + self, + *, + channels: int, + leaky_relu_slope: float = 0.2, + ): + super().__init__() + + self.weight = torch.nn.Parameter(torch.ones(channels)) + self.activation = torch.nn.LeakyReLU(leaky_relu_slope) + + def forward(self, x: torch.Tensor): + gaussian = torch.randn_like(x) * self.weight[None, :, None] + + return self.activation(x + gaussian) + +class ParallelResBlock(torch.nn.Module): + """ + Parallel residual block that applies multiple residual blocks with different kernel sizes in parallel. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (tuple[int], optional): Tuple of kernel sizes for the parallel residual blocks. Defaults to (3, 7, 11). + dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers within the residual blocks. Defaults to (1, 3, 5). + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2. 
+ """ + def __init__( + self, + *, + in_channels: int, + out_channels: int, + kernel_sizes: tuple[int] = (3, 7, 11), + dilation: tuple[int] = (1, 3, 5), + leaky_relu_slope: float = 0.2, + ): + super().__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + + self.input_conv = torch.nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=7, + stride=1, + padding=3, + ) + + self.blocks = torch.nn.ModuleList( + [ + torch.nn.Sequential( + AdaIN(channels=out_channels), + ResBlock( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, + leaky_relu_slope=leaky_relu_slope, + ), + AdaIN(channels=out_channels), + ) + for kernel_size in kernel_sizes + ] + ) + + def forward(self, x: torch.Tensor): + x = self.input_conv(x) + + results = [block(x) for block in self.blocks] + + return torch.mean(torch.stack(results), dim=0) + + def remove_parametrizations(self): + for block in self.blocks: + block[1].remove_parametrizations() + +class SineGenerator(torch.nn.Module): + """ + Definition of sine generator + + Generates sine waveforms with optional harmonics and additive noise. + Can be used to create harmonic noise source for neural vocoders. + + Args: + samp_rate (int): Sampling rate in Hz. + harmonic_num (int): Number of harmonic overtones (default 0). + sine_amp (float): Amplitude of sine-waveform (default 0.1). + noise_std (float): Standard deviation of Gaussian noise (default 0.003). + voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0). + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + ): + super(SineGenerator, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def _f02sine(self, f0_values): + """f0_values: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. 
The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand( + f0_values.shape[0], f0_values.shape[2], device=f0_values.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) + + return sines + + def forward(self, f0): + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) + + sine_waves = self._f02sine(f0_buf) * self.sine_amp + + uv = self._f02uv(f0) + + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + sine_waves = sine_waves * uv + noise * (1 - uv) + return sine_waves, uv, noise + +class SourceModuleHnNSF(torch.nn.Module): + """ + Source Module for generating harmonic and noise signals. + + This module uses a SineGenerator to produce harmonic signals based on the fundamental frequency (F0). + + Args: + sampling_rate (int): Sampling rate of the audio. + harmonic_num (int, optional): Number of harmonics to generate. Defaults to 0. + sine_amp (float, optional): Amplitude of the sine wave. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of the additive noise. Defaults to 0.003. + voiced_threshold (int, optional): F0 threshold for voiced/unvoiced classification. Defaults to 0. + """ + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshold=0, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGenerator( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x: torch.Tensor): + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + return sine_merge, None, None + +class RefineGANGenerator(torch.nn.Module): + """ + RefineGAN generator for audio synthesis. + + This generator uses a combination of downsampling, residual blocks, and parallel residual blocks + to refine an input mel-spectrogram and fundamental frequency (F0) into an audio waveform. + It can also incorporate global conditioning. + + Args: + sample_rate (int, optional): Sampling rate of the audio. Defaults to 44100. + downsample_rates (tuple[int], optional): Downsampling rates for the downsampling blocks. Defaults to (2, 2, 8, 8). + upsample_rates (tuple[int], optional): Upsampling rates for the upsampling blocks. Defaults to (8, 8, 2, 2). + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2. + num_mels (int, optional): Number of mel-frequency bins in the input mel-spectrogram. Defaults to 128. 
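# A short sketch of how an F0 contour becomes the harmonic excitation used by the generator,
# via the SineGenerator / SourceModuleHnNSF classes above (values are illustrative; inside
# RefineGANGenerator the F0 is first upsampled by prod(upsample_rates), so this effectively
# runs at waveform resolution).
import torch
source = SourceModuleHnNSF(sampling_rate=16000, harmonic_num=8)
f0 = torch.full((1, 400, 1), 220.0)   # (batch, frames, 1): a steady 220 Hz contour
f0[:, 200:, :] = 0.0                  # frames at/below voiced_threshold get noise-only excitation
har_source, _, _ = source(f0)         # (1, 400, 1): fundamental + 8 overtones merged by Linear + tanh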
+ start_channels (int, optional): Number of channels in the initial convolutional layer. Defaults to 16. + gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 256. + checkpointing (bool, optional): Whether to use checkpointing for memory efficiency. Defaults to False. + """ + def __init__( + self, + *, + sample_rate: int = 44100, + downsample_rates: tuple[int] = (2, 2, 8, 8), + upsample_rates: tuple[int] = (8, 8, 2, 2), + leaky_relu_slope: float = 0.2, + num_mels: int = 128, + start_channels: int = 16, + gin_channels: int = 256, + checkpointing=False, + ): + super().__init__() + + self.downsample_rates = downsample_rates + self.upsample_rates = upsample_rates + self.leaky_relu_slope = leaky_relu_slope + self.checkpointing = checkpointing + + self.f0_upsample = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num=8) + + # expands + self.source_conv = weight_norm( + torch.nn.Conv1d( + in_channels=1, + out_channels=start_channels, + kernel_size=7, + stride=1, + padding=3, + ) + ) + + channels = start_channels + self.downsample_blocks = torch.nn.ModuleList([]) + for rate in downsample_rates: + new_channels = channels * 2 + + self.downsample_blocks.append( + torch.nn.Sequential( + torch.nn.Upsample(scale_factor=1 / rate, mode="linear"), + ResBlock( + in_channels=channels, + out_channels=new_channels, + kernel_size=7, + dilation=(1, 3, 5), + leaky_relu_slope=leaky_relu_slope, + ), + ) + ) + + channels = new_channels + + self.mel_conv = weight_norm( + torch.nn.Conv1d( + in_channels=num_mels, + out_channels=channels, + kernel_size=7, + stride=1, + padding=3, + ) + ) + + if gin_channels != 0: + self.cond = torch.nn.Conv1d(256, channels, 1) + + channels *= 2 + + self.upsample_blocks = torch.nn.ModuleList([]) + self.upsample_conv_blocks = torch.nn.ModuleList([]) + + for rate in upsample_rates: + new_channels = channels // 2 + + self.upsample_blocks.append( + torch.nn.Upsample(scale_factor=rate, mode="linear") + ) + + self.upsample_conv_blocks.append( + ParallelResBlock( + in_channels=channels + channels // 4, + out_channels=new_channels, + kernel_sizes=(3, 7, 11), + dilation=(1, 3, 5), + leaky_relu_slope=leaky_relu_slope, + ) + ) + + channels = new_channels + + self.conv_post = weight_norm( + torch.nn.Conv1d( + in_channels=channels, + out_channels=1, + kernel_size=7, + stride=1, + padding=3, + ) + ) + + def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None): + f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2) + har_source, _, _ = self.m_source(f0) + har_source = har_source.transpose(-1, -2) + + # expanding pitch source to 16 channels + x = self.source_conv(har_source) + # making a downscaled version to match upscaler stages + downs = [] + for i, block in enumerate(self.downsample_blocks): + x = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope, inplace=True) + downs.append(x) + if self.training and self.checkpointing: + x = checkpoint.checkpoint(block, x, use_reentrant=False) + else: + x = block(x) + + # expanding spectrogram from 192 to 256 channels + mel = self.mel_conv(mel) + + if g is not None: + # adding expanded speaker embedding + x = x + self.cond(g) + x = torch.cat([x, mel], dim=1) + + for ups, res, down in zip( + self.upsample_blocks, + self.upsample_conv_blocks, + reversed(downs), + ): + x = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope, inplace=True) + + if self.training and self.checkpointing: + x = checkpoint.checkpoint(ups, x, 
use_reentrant=False) + x = torch.cat([x, down], dim=1) + x = checkpoint.checkpoint(res, x, use_reentrant=False) + else: + x = ups(x) + x = torch.cat([x, down], dim=1) + x = res(x) + + x = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope, inplace=True) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_parametrizations(self): + remove_parametrizations(self.source_conv) + remove_parametrizations(self.mel_conv) + remove_parametrizations(self.conv_post) + + for block in self.downsample_blocks: + block[1].remove_parametrizations() + + for block in self.upsample_conv_blocks: + block.remove_parametrizations() \ No newline at end of file diff --git a/rvc/lib/algorithm/modules.py b/rvc/lib/algorithm/modules.py index 55454abb6..611c45d38 100644 --- a/rvc/lib/algorithm/modules.py +++ b/rvc/lib/algorithm/modules.py @@ -3,7 +3,8 @@ class WaveNet(torch.nn.Module): - """WaveNet residual blocks as used in WaveGlow. + """ + WaveNet residual blocks as used in WaveGlow. Args: hidden_channels (int): Number of hidden channels. @@ -75,13 +76,6 @@ def __init__( ) def forward(self, x, x_mask, g=None): - """Forward pass. - - Args: - x (torch.Tensor): Input tensor (batch_size, hidden_channels, time_steps). - x_mask (torch.Tensor): Mask tensor (batch_size, 1, time_steps). - g (torch.Tensor, optional): Conditioning tensor (batch_size, gin_channels, time_steps). - """ output = x.clone().zero_() # Apply conditional layer if global conditioning is provided @@ -115,7 +109,6 @@ def forward(self, x, x_mask, g=None): return output * x_mask def remove_weight_norm(self): - """Remove weight normalization from the module.""" if self.gin_channels: torch.nn.utils.remove_weight_norm(self.cond_layer) for layer in self.in_layers: diff --git a/rvc/lib/algorithm/normalization.py b/rvc/lib/algorithm/normalization.py index a2a898cb6..94a29bac9 100644 --- a/rvc/lib/algorithm/normalization.py +++ b/rvc/lib/algorithm/normalization.py @@ -2,7 +2,8 @@ class LayerNorm(torch.nn.Module): - """Layer normalization module. + """ + Layer normalization module. Args: channels (int): Number of channels. @@ -16,12 +17,6 @@ def __init__(self, channels: int, eps: float = 1e-5): self.beta = torch.nn.Parameter(torch.zeros(channels)) def forward(self, x): - """Forward pass. - - Args: - x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps). - - """ # Transpose to (batch_size, time_steps, channels) for layer_norm x = x.transpose(1, -1) x = torch.nn.functional.layer_norm( diff --git a/rvc/lib/algorithm/residuals.py b/rvc/lib/algorithm/residuals.py index 7483d298d..ac151187a 100644 --- a/rvc/lib/algorithm/residuals.py +++ b/rvc/lib/algorithm/residuals.py @@ -65,12 +65,6 @@ def _create_convs(channels: int, kernel_size: int, dilations: Tuple[int]): return layers def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None): - """Forward pass. - - Args: - x (torch.Tensor): Input tensor of shape (batch_size, channels, sequence_length). - x_mask (torch.Tensor, optional): Optional mask to apply to the input and output tensors. - """ for conv1, conv2 in zip(self.convs1, self.convs2): x_residual = x x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) @@ -82,36 +76,29 @@ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None): return apply_mask(x, x_mask) def remove_weight_norm(self): - """ - Removes weight normalization from all convolutional layers in the block. - """ for conv in chain(self.convs1, self.convs2): remove_weight_norm(conv) class Flip(torch.nn.Module): - """Flip module for flow-based models. 
+ """ + Flip module for flow-based models. This module flips the input along the time dimension. """ def forward(self, x, *args, reverse=False, **kwargs): - """Forward pass. - - Args: - x (torch.Tensor): Input tensor. - reverse (bool, optional): Whether to reverse the operation. Defaults to False. - """ x = torch.flip(x, [1]) if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + logdet = torch.zeros(x.size(0), dtype=x.dtype, device=x.device) return x, logdet else: return x class ResidualCouplingBlock(torch.nn.Module): - """Residual Coupling Block for normalizing flow. + """ + Residual Coupling Block for normalizing flow. Args: channels (int): Number of channels in the input. @@ -173,12 +160,10 @@ def forward( return x def remove_weight_norm(self): - """Removes weight normalization from the coupling layers.""" for i in range(self.n_flows): self.flows[i * 2].remove_weight_norm() def __prepare_scriptable__(self): - """Prepares the module for scripting.""" for i in range(self.n_flows): for hook in self.flows[i * 2]._forward_pre_hooks.values(): if ( @@ -191,7 +176,8 @@ def __prepare_scriptable__(self): class ResidualCouplingLayer(torch.nn.Module): - """Residual coupling layer for flow-based models. + """ + Residual coupling layer for flow-based models. Args: channels (int): Number of channels. @@ -247,15 +233,6 @@ def forward( g: Optional[torch.Tensor] = None, reverse: bool = False, ): - """Forward pass. - - Args: - x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps). - x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps). - g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps). - Defaults to None. - reverse (bool, optional): Whether to reverse the operation. Defaults to False. 
- """ x0, x1 = torch.split(x, [self.half_channels] * 2, 1) h = self.pre(x0) * x_mask h = self.enc(h, x_mask, g=g) @@ -277,5 +254,4 @@ def forward( return x def remove_weight_norm(self): - """Remove weight normalization from the module.""" self.enc.remove_weight_norm() diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index e0fd1db2c..9c8ed8253 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ b/rvc/lib/algorithm/synthesizers.py @@ -1,7 +1,9 @@ import torch from typing import Optional -from rvc.lib.algorithm.nsf import GeneratorNSF -from rvc.lib.algorithm.generators import Generator +from rvc.lib.algorithm.generators.hifigan_mrf import HiFiGANMRFGenerator +from rvc.lib.algorithm.generators.hifigan_nsf import HiFiGANNSFGenerator +from rvc.lib.algorithm.generators.hifigan import HiFiGANGenerator +from rvc.lib.algorithm.generators.refinegan import RefineGANGenerator from rvc.lib.algorithm.commons import slice_segments, rand_slice_segments from rvc.lib.algorithm.residuals import ResidualCouplingBlock from rvc.lib.algorithm.encoders import TextEncoder, PosteriorEncoder @@ -57,12 +59,15 @@ def __init__( sr: int, use_f0: bool, text_enc_hidden_dim: int = 768, + vocoder: str = "HiFi-GAN", + randomized: bool = True, + checkpointing: bool = False, **kwargs, ): super().__init__() self.segment_size = segment_size - self.gin_channels = gin_channels self.use_f0 = use_f0 + self.randomized = randomized self.enc_p = TextEncoder( inter_channels, @@ -75,30 +80,60 @@ def __init__( text_enc_hidden_dim, f0=use_f0, ) - + print(f"Using {vocoder} vocoder") if use_f0: - self.dec = GeneratorNSF( - inter_channels, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - is_half=kwargs["is_half"], - ) + if vocoder == "MRF HiFi-GAN": + self.dec = HiFiGANMRFGenerator( + in_channel=inter_channels, + upsample_initial_channel=upsample_initial_channel, + upsample_rates=upsample_rates, + upsample_kernel_sizes=upsample_kernel_sizes, + resblock_kernel_sizes=resblock_kernel_sizes, + resblock_dilations=resblock_dilation_sizes, + gin_channels=gin_channels, + sample_rate=sr, + harmonic_num=8, + checkpointing=checkpointing, + ) + elif vocoder == "RefineGAN": + self.dec = RefineGANGenerator( + sample_rate=sr, + downsample_rates=upsample_rates[::-1], + upsample_rates=upsample_rates, + start_channels=16, + num_mels=inter_channels, + checkpointing=checkpointing, + ) + else: + self.dec = HiFiGANNSFGenerator( + inter_channels, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + checkpointing=checkpointing, + ) else: - self.dec = Generator( - inter_channels, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - ) - + if vocoder == "MRF HiFi-GAN": + print("MRF HiFi-GAN does not support training without pitch guidance.") + self.dec = None + elif vocoder == "RefineGAN": + print("RefineGAN does not support training without pitch guidance.") + self.dec = None + else: + self.dec = HiFiGANGenerator( + inter_channels, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + checkpointing=checkpointing + ) self.enc_q = PosteriorEncoder( spec_channels, inter_channels, @@ -119,13 +154,11 @@ def __init__( 
self.emb_g = torch.nn.Embedding(spk_embed_dim, gin_channels) def _remove_weight_norm_from(self, module): - """Utility to remove weight normalization from a module.""" for hook in module._forward_pre_hooks.values(): if getattr(hook, "__class__", None).__name__ == "WeightNorm": torch.nn.utils.remove_weight_norm(module) def remove_weight_norm(self): - """Removes weight normalization from the model.""" for module in [self.dec, self.flow, self.enc_q]: self._remove_weight_norm_from(module) @@ -143,35 +176,32 @@ def forward( y_lengths: Optional[torch.Tensor] = None, ds: Optional[torch.Tensor] = None, ): - """ - Forward pass of the model. - - Args: - phone (torch.Tensor): Phoneme sequence. - phone_lengths (torch.Tensor): Lengths of the phoneme sequences. - pitch (torch.Tensor, optional): Pitch sequence. - pitchf (torch.Tensor, optional): Fine-grained pitch sequence. - y (torch.Tensor, optional): Target spectrogram. - y_lengths (torch.Tensor, optional): Lengths of the target spectrograms. - ds (torch.Tensor, optional): Speaker embedding. - """ g = self.emb_g(ds).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) if y is not None: z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) - - if self.use_f0 and pitchf is not None: - pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2) - o = self.dec(z_slice, pitchf, g=g) + # regular old training method using random slices + if self.randomized: + z_slice, ids_slice = rand_slice_segments( + z, y_lengths, self.segment_size + ) + if self.use_f0: + pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2) + o = self.dec(z_slice, pitchf, g=g) + else: + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + # future use for finetuning using the entire dataset each pass else: - o = self.dec(z_slice, g=g) - - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) + if self.use_f0: + o = self.dec(z, pitchf, g=g) + else: + o = self.dec(z, g=g) + return o, None, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + else: + return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) @torch.jit.export def infer( diff --git a/rvc/lib/predictors/FCPE.py b/rvc/lib/predictors/FCPE.py index 12f6c346a..9edbf0672 100644 --- a/rvc/lib/predictors/FCPE.py +++ b/rvc/lib/predictors/FCPE.py @@ -141,7 +141,7 @@ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): spec = torch.stft( y, - n_fft_new, + n_fft=n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], diff --git a/rvc/lib/tools/pretrained_selector.py b/rvc/lib/tools/pretrained_selector.py index e982fac50..56c18d263 100644 --- a/rvc/lib/tools/pretrained_selector.py +++ b/rvc/lib/tools/pretrained_selector.py @@ -1,63 +1,22 @@ -def pretrained_selector(pitch_guidance): - if pitch_guidance == True: - return { - "v1": { - 32000: ( - "rvc/models/pretraineds/pretrained_v1/f0G32k.pth", - "rvc/models/pretraineds/pretrained_v1/f0D32k.pth", - ), - 40000: ( - "rvc/models/pretraineds/pretrained_v1/f0G40k.pth", - "rvc/models/pretraineds/pretrained_v1/f0D40k.pth", - ), - 48000: ( - "rvc/models/pretraineds/pretrained_v1/f0G48k.pth", - "rvc/models/pretraineds/pretrained_v1/f0D48k.pth", - ), - }, - "v2": { - 32000: ( - "rvc/models/pretraineds/pretrained_v2/f0G32k.pth", - 
"rvc/models/pretraineds/pretrained_v2/f0D32k.pth", - ), - 40000: ( - "rvc/models/pretraineds/pretrained_v2/f0G40k.pth", - "rvc/models/pretraineds/pretrained_v2/f0D40k.pth", - ), - 48000: ( - "rvc/models/pretraineds/pretrained_v2/f0G48k.pth", - "rvc/models/pretraineds/pretrained_v2/f0D48k.pth", - ), - }, - } - elif pitch_guidance == False: - return { - "v1": { - 32000: ( - "rvc/models/pretraineds/pretrained_v1/G32k.pth", - "rvc/models/pretraineds/pretrained_v1/D32k.pth", - ), - 40000: ( - "rvc/models/pretraineds/pretrained_v1/G40k.pth", - "rvc/models/pretraineds/pretrained_v1/D40k.pth", - ), - 48000: ( - "rvc/models/pretraineds/pretrained_v1/G48k.pth", - "rvc/models/pretraineds/pretrained_v1/D48k.pth", - ), - }, - "v2": { - 32000: ( - "rvc/models/pretraineds/pretrained_v2/G32k.pth", - "rvc/models/pretraineds/pretrained_v2/D32k.pth", - ), - 40000: ( - "rvc/models/pretraineds/pretrained_v2/G40k.pth", - "rvc/models/pretraineds/pretrained_v2/D40k.pth", - ), - 48000: ( - "rvc/models/pretraineds/pretrained_v2/G48k.pth", - "rvc/models/pretraineds/pretrained_v2/D48k.pth", - ), - }, - } +import os + +def pretrained_selector(version, vocoder, pitch_guidance, sample_rate): + base_path = os.path.join("rvc", "models", "pretraineds", f"pretrained_{version}") + f0 = "f0" if pitch_guidance else "" + + if vocoder == "HiFi-GAN": + vocoder_path = "" + elif vocoder == "MRF HiFi-GAN": + vocoder_path = "HiFiGAN_" + elif vocoder == "RefineGAN": + vocoder_path = "RefineGAN_" + else: + vocoder_path = "" + + path_g = os.path.join(base_path, f"{vocoder_path}{f0}G{str(sample_rate)[:2]}k.pth") + path_d = os.path.join(base_path, f"{vocoder_path}{f0}D{str(sample_rate)[:2]}k.pth") + + if os.path.exists(path_g) and os.path.exists(path_d): + return path_g, path_d + else: + return "", "" \ No newline at end of file diff --git a/rvc/lib/zluda.py b/rvc/lib/zluda.py index 482009cc4..43ef3e6dd 100644 --- a/rvc/lib/zluda.py +++ b/rvc/lib/zluda.py @@ -1,33 +1,54 @@ import torch if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): - _torch_stft = torch.stft + class STFT: + def __init__(self): + self.device = "cuda" + self.fourier_bases = {} # Cache for Fourier bases + + def _get_fourier_basis(self, n_fft): + # Check if the basis for this n_fft is already cached + if n_fft in self.fourier_bases: + return self.fourier_bases[n_fft] + fourier_basis = torch.fft.fft(torch.eye(n_fft, device="cpu")).to(self.device) + # stack separated real and imaginary components and convert to torch tensor + cutoff = n_fft // 2 + 1 + fourier_basis = torch.cat([fourier_basis.real[:cutoff], fourier_basis.imag[:cutoff]], dim=0) + # cache the tensor and return + self.fourier_bases[n_fft] = fourier_basis + return fourier_basis + def transform(self, input, n_fft, hop_length, window): + # fetch cached Fourier basis + fourier_basis = self._get_fourier_basis(n_fft) + # apply hann window to Fourier basis + fourier_basis = fourier_basis * window + # pad input to center with reflect + pad_amount = n_fft // 2 + input = torch.nn.functional.pad(input, (pad_amount, pad_amount), mode='reflect') + # separate input into n_fft-sized frames + input_frames = input.unfold(1, n_fft, hop_length).permute(0, 2, 1) + # apply fft to each frame + fourier_transform = torch.matmul(fourier_basis, input_frames) + cutoff = n_fft // 2 + 1 + return torch.complex(fourier_transform[:, :cutoff, :], fourier_transform[:, cutoff:, :]) - def z_stft( - audio: torch.Tensor, - n_fft: int, - hop_length: int = None, - win_length: int = None, - window: torch.Tensor = None, 
- center: bool = True, - pad_mode: str = "reflect", - normalized: bool = False, - onesided: bool = None, - return_complex: bool = None, - ): - sd = audio.device - return _torch_stft( - audio.to("cpu"), - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=window.to("cpu"), - center=center, - pad_mode=pad_mode, - normalized=normalized, - onesided=onesided, - return_complex=return_complex, - ).to(sd) + stft = STFT() + _torch_stft = torch.stft + + def z_stft(input: torch.Tensor, window: torch.Tensor, *args, **kwargs): + # only optimizing a specific call from rvc.train.mel_processing.MultiScaleMelSpectrogramLoss + if (kwargs.get('win_length') == None + and kwargs.get("center") == None + and kwargs.get('return_complex') == True): + # use GPU accelerated calculation + return stft.transform( + input, + kwargs.get("n_fft"), + kwargs.get("hop_length"), + window) + else: + # simply do the operation on CPU + return _torch_stft(input=input.cpu(), window=window.cpu(), *args, **kwargs).to(input.device) def z_jit(f, *_, **__): f.graph = torch._C.Graph() @@ -40,4 +61,4 @@ def z_jit(f, *_, **__): torch.backends.cudnn.enabled = False torch.backends.cuda.enable_flash_sdp(False) torch.backends.cuda.enable_math_sdp(True) - torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) \ No newline at end of file diff --git a/rvc/train/extract/extract.py b/rvc/train/extract/extract.py index 84f80cd0b..288547a93 100644 --- a/rvc/train/extract/extract.py +++ b/rvc/train/extract/extract.py @@ -250,6 +250,7 @@ def run_embedding_extraction( sample_rate = sys.argv[7] embedder_model = sys.argv[8] embedder_model_custom = sys.argv[9] if len(sys.argv) > 9 else None + include_mutes = int(sys.argv[10]) if len(sys.argv) > 10 else 2 # prep wav_path = os.path.join(exp_dir, "sliced_audios_16k") @@ -299,4 +300,4 @@ def run_embedding_extraction( # Run Preparing Files generate_config(version, sample_rate, exp_dir) - generate_filelist(exp_dir, version, sample_rate) + generate_filelist(exp_dir, version, sample_rate, include_mutes) diff --git a/rvc/train/extract/preparing_files.py b/rvc/train/extract/preparing_files.py index e0c6e2f6b..b90692907 100644 --- a/rvc/train/extract/preparing_files.py +++ b/rvc/train/extract/preparing_files.py @@ -15,7 +15,7 @@ def generate_config(rvc_version: str, sample_rate: int, model_path: str): shutil.copyfile(config_path, config_save_path) -def generate_filelist(model_path: str, rvc_version: str, sample_rate: int): +def generate_filelist(model_path: str, rvc_version: str, sample_rate: int, include_mutes: int = 2): gt_wavs_dir = os.path.join(model_path, "sliced_audios") feature_dir = os.path.join(model_path, f"{rvc_version}_extracted") @@ -41,23 +41,21 @@ def generate_filelist(model_path: str, rvc_version: str, sample_rate: int): f"{gt_wavs_dir}/{name}.wav|{feature_dir}/{name}.npy|{f0_dir}/{name}.wav.npy|{f0nsf_dir}/{name}.wav.npy|{sid}" ) - mute_audio_path = os.path.join( - mute_base_path, "sliced_audios", f"mute{sample_rate}.wav" - ) - mute_feature_path = os.path.join( - mute_base_path, f"{rvc_version}_extracted", "mute.npy" - ) - mute_f0_path = os.path.join(mute_base_path, "f0", "mute.wav.npy") - mute_f0nsf_path = os.path.join(mute_base_path, "f0_voiced", "mute.wav.npy") - - # always adding two files - for sid in sids: - options.append( - f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|{sid}" + if include_mutes > 0: + mute_audio_path = os.path.join( + mute_base_path, "sliced_audios", f"mute{sample_rate}.wav" ) 
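# A CPU-only sketch of the idea behind the matmul-based STFT added in rvc/lib/zluda.py above:
# windowing plus DFT of one frame is a single matrix multiply against a cached Fourier basis,
# which is what STFT.transform does per hop on ZLUDA devices (torch.fft.fft(torch.eye(n_fft))
# is symmetric, so its first n_fft // 2 + 1 rows are exactly the one-sided DFT atoms).
import torch
n_fft = 1024
window = torch.hann_window(n_fft)
frame = torch.randn(n_fft)
basis = torch.fft.fft(torch.eye(n_fft))                          # DFT matrix
cutoff = n_fft // 2 + 1
basis = torch.cat([basis.real[:cutoff], basis.imag[:cutoff]])    # stacked real / imaginary rows
out = (basis * window) @ frame                                   # windowed DFT as one matmul
spec = torch.complex(out[:cutoff], out[cutoff:])
assert torch.allclose(spec, torch.fft.rfft(frame * window), atol=1e-2)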
- options.append( - f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|{sid}" + mute_feature_path = os.path.join( + mute_base_path, f"{rvc_version}_extracted", "mute.npy" ) + mute_f0_path = os.path.join(mute_base_path, "f0", "mute.wav.npy") + mute_f0nsf_path = os.path.join(mute_base_path, "f0_voiced", "mute.wav.npy") + + # adding x files per sid + for sid in sids * include_mutes: + options.append( + f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|{sid}" + ) file_path = os.path.join(model_path, "model_info.json") if os.path.exists(file_path): diff --git a/rvc/train/losses.py b/rvc/train/losses.py index 565ee7e8b..21afb9f45 100644 --- a/rvc/train/losses.py +++ b/rvc/train/losses.py @@ -24,9 +24,17 @@ def discriminator_loss(disc_real_outputs, disc_generated_outputs): disc_real_outputs (list of torch.Tensor): List of discriminator outputs for real samples. disc_generated_outputs (list of torch.Tensor): List of discriminator outputs for generated samples. """ - r_losses = [(1 - dr).pow(2).mean() for dr in disc_real_outputs] - g_losses = [dg.pow(2).mean() for dg in disc_generated_outputs] - loss = sum(r_losses) + sum(g_losses) + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1 - dr.float()) ** 2) + g_loss = torch.mean(dg.float() ** 2) + + #r_losses.append(r_loss.item()) + #g_losses.append(g_loss.item()) + loss += r_loss + g_loss + return loss, r_losses, g_losses @@ -37,10 +45,31 @@ def generator_loss(disc_outputs): Args: disc_outputs (list of torch.Tensor): List of discriminator outputs for generated samples. """ - gen_losses = [(1 - dg).pow(2).mean() for dg in disc_outputs] - loss = sum(gen_losses) + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1 - dg.float()) ** 2) + #gen_losses.append(l.item()) + loss += l + return loss, gen_losses +def discriminator_loss_scaled(disc_real, disc_fake, scale=1.0): + loss = 0 + for i, (d_real, d_fake) in enumerate(zip(disc_real, disc_fake)): + real_loss = torch.mean((1 - d_real) ** 2) + fake_loss = torch.mean(d_fake**2) + _loss = real_loss + fake_loss + loss += _loss if i < len(disc_real) / 2 else scale * _loss + return loss, None, None + +def generator_loss_scaled(disc_outputs, scale=1.0): + loss = 0 + for i, d_fake in enumerate(disc_outputs): + d_fake = d_fake.float() + _loss = torch.mean((1 - d_fake) ** 2) + loss += _loss if i < len(disc_outputs) / 2 else scale * _loss + return loss, None, None def discriminator_loss_scaled(disc_real, disc_fake, scale=1.0): """ @@ -98,3 +127,11 @@ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): kl = (kl * z_mask).sum() loss = kl / z_mask.sum() return loss + +MaxPool = torch.nn.MaxPool1d(160) + +def envelope_loss(y, y_g): + loss = 0 + loss += torch.mean(torch.abs(MaxPool( y) - MaxPool( y_g))) + loss += torch.mean(torch.abs(MaxPool(-y) - MaxPool(-y_g))) + return loss \ No newline at end of file diff --git a/rvc/train/mel_processing.py b/rvc/train/mel_processing.py index b3bb74f87..e4d51cbc1 100644 --- a/rvc/train/mel_processing.py +++ b/rvc/train/mel_processing.py @@ -159,66 +159,76 @@ class MultiScaleMelSpectrogramLoss(torch.nn.Module): def __init__( self, sample_rate: int = 24000, - n_mels=[5, 10, 20, 40, 80, 160, 320, 480], + n_mels: list[int] = [5, 10, 20, 40, 80, 160, 320, 480], loss_fn=torch.nn.L1Loss(), ): super().__init__() self.sample_rate = sample_rate self.loss_fn = loss_fn self.log_base = torch.log(torch.tensor(10.0)) - self.stft_params = {} - self.mel_banks = {} - 
- window_lengths = [compute_window_length(mel, sample_rate) for mel in n_mels] - # print(window_lengths) - - for n_mels, window_length in zip(n_mels, window_lengths): - self.stft_params[n_mels] = { - "n_mels": n_mels, - "window_length": window_length, - "hop_length": self.sample_rate // 100, - } - self.mel_banks[n_mels] = torch.from_numpy( - librosa_mel_fn( - sr=self.sample_rate, - n_mels=n_mels, - n_fft=window_length, - fmin=0, - fmax=None, - ) - ) + self.stft_params: list[tuple] = [] + self.hann_window: dict[int, torch.Tensor] = {} + self.mel_banks: dict[int, torch.Tensor] = {} + + self.stft_params = [ + (mel, compute_window_length(mel, sample_rate), self.sample_rate // 100) + for mel in n_mels + ] def mel_spectrogram( self, - wav, - n_mels, - window_length, - hop_length, + wav: torch.Tensor, + n_mels: int, + window_length: int, + hop_length: int, ): + # IDs for caching + dtype_device = str(wav.dtype) + "_" + str(wav.device) + win_dtype_device = str(window_length) + "_" + dtype_device + mel_dtype_device = str(n_mels) + "_" + dtype_device + # caching hann window + if win_dtype_device not in self.hann_window: + self.hann_window[win_dtype_device] = torch.hann_window( + window_length, device=wav.device, dtype=torch.float32 + ) + wav = wav.squeeze(1) # -> torch(B, T) - window = torch.hann_window(window_length).to(wav.device).to(wav.dtype) + stft = torch.stft( wav.float(), n_fft=window_length, hop_length=hop_length, - window=window, + window=self.hann_window[win_dtype_device], return_complex=True, ) # -> torch (B, window_length // 2 + 1, (T - window_length)/hop_length + 1) + magnitude = torch.sqrt(stft.real.pow(2) + stft.imag.pow(2) + 1e-6) - mel_basis = self.mel_banks[n_mels].to( - wav.device - ) # torch(n_mels, window_length // 2 + 1) + + # caching mel filter + if mel_dtype_device not in self.mel_banks: + self.mel_banks[mel_dtype_device] = torch.from_numpy( + librosa_mel_fn( + sr=self.sample_rate, + n_mels=n_mels, + n_fft=window_length, + fmin=0, + fmax=None, + ) + ).to(device=wav.device, dtype=torch.float32) + mel_spectrogram = torch.matmul( - mel_basis, magnitude + self.mel_banks[mel_dtype_device], magnitude ) # torch(B, n_mels, stft.frames) return mel_spectrogram - def forward(self, real, fake): # real: torch(B, 1, T) , fake: torch(B, 1, T) + def forward( + self, real: torch.Tensor, fake: torch.Tensor + ): # real: torch(B, 1, T) , fake: torch(B, 1, T) loss = 0.0 - for p in self.stft_params.values(): - real_mels = self.mel_spectrogram(real, **p) - fake_mels = self.mel_spectrogram(fake, **p) - real_logmels = torch.log(real_mels.clamp(min=1e-5).pow(1)) / self.log_base - fake_logmels = torch.log(fake_mels.clamp(min=1e-5).pow(1)) / self.log_base + for p in self.stft_params: + real_mels = self.mel_spectrogram(real, *p) + fake_mels = self.mel_spectrogram(fake, *p) + real_logmels = torch.log(real_mels.clamp(min=1e-5)) / self.log_base + fake_logmels = torch.log(fake_mels.clamp(min=1e-5)) / self.log_base loss += self.loss_fn(real_logmels, fake_logmels) return loss diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py index c9c865491..edc02f87a 100644 --- a/rvc/train/preprocess/preprocess.py +++ b/rvc/train/preprocess/preprocess.py @@ -11,6 +11,7 @@ import librosa import multiprocessing import noisereduce as nr +import soxr now_directory = os.getcwd() sys.path.append(now_directory) @@ -18,7 +19,6 @@ from rvc.lib.utils import load_audio from rvc.train.preprocess.slicer import Slicer -# Remove colab logs import logging 
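# A minimal usage sketch for the refactored MultiScaleMelSpectrogramLoss above: waveforms are
# passed as (batch, 1, samples) tensors, and the Hann windows and mel filterbanks are now built
# lazily and cached per size / dtype / device instead of being precomputed up front. This is
# presumably what backs the fn_mel_loss term used later in train.py.
import torch
mel_loss = MultiScaleMelSpectrogramLoss(sample_rate=24000)
real = torch.randn(2, 1, 24000)   # one second of reference audio per item
fake = torch.randn(2, 1, 24000)   # generator output of the same length
loss = mel_loss(real, fake)       # L1 between log-mels, summed over all n_mels scales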
logging.getLogger("numba.core.byteflow").setLevel(logging.WARNING) @@ -30,6 +30,7 @@ ALPHA = 0.75 HIGH_PASS_CUTOFF = 48 SAMPLE_RATE_16K = 16000 +RES_TYPE = "soxr_vhq" class PreProcess: @@ -76,7 +77,10 @@ def process_audio_segment( normalized_audio.astype(np.float32), ) audio_16k = librosa.resample( - normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K + normalized_audio, + orig_sr=self.sr, + target_sr=SAMPLE_RATE_16K, + res_type=RES_TYPE, ) wavfile.write( os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{idx1}.wav"), @@ -84,20 +88,60 @@ def process_audio_segment( audio_16k.astype(np.float32), ) + def simple_cut( + self, + audio: np.ndarray, + sid: int, + idx0: int, + chunk_len: float, + overlap_len: float, + ): + chunk_length = int(self.sr * chunk_len) + overlap_length = int(self.sr * overlap_len) + i = 0 + while i < len(audio): + chunk = audio[i : i + chunk_length] + if len(chunk) == chunk_length: + # full SR for training + wavfile.write( + os.path.join( + self.gt_wavs_dir, + f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav", + ), + self.sr, + chunk.astype(np.float32), + ) + # 16KHz for feature extraction + chunk_16k = librosa.resample( + chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type=RES_TYPE + ) + wavfile.write( + os.path.join( + self.wavs16k_dir, + f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav", + ), + SAMPLE_RATE_16K, + chunk_16k.astype(np.float32), + ) + i += chunk_length - overlap_length + def process_audio( self, path: str, idx0: int, sid: int, - cut_preprocess: bool, + cut_preprocess: str, process_effects: bool, noise_reduction: bool, reduction_strength: float, + chunk_len: float, + overlap_len: float, ): audio_length = 0 try: audio = load_audio(path, self.sr) audio_length = librosa.get_duration(y=audio, sr=self.sr) + if process_effects: audio = signal.lfilter(self.b_high, self.a_high, audio) audio = self._normalize_audio(audio) @@ -105,8 +149,20 @@ def process_audio( audio = nr.reduce_noise( y=audio, sr=self.sr, prop_decrease=reduction_strength ) - idx1 = 0 - if cut_preprocess: + if cut_preprocess == "Skip": + # no cutting + self.process_audio_segment( + audio, + sid, + idx0, + 0, + ) + elif cut_preprocess == "Simple": + # simple + self.simple_cut(audio, sid, idx0, chunk_len, overlap_len) + elif cut_preprocess == "Automatic": + idx1 = 0 + # legacy for audio_segment in self.slicer.slice(audio): i = 0 while True: @@ -133,13 +189,7 @@ def process_audio( ) idx1 += 1 break - else: - self.process_audio_segment( - audio, - sid, - idx0, - idx1, - ) + except Exception as error: print(f"Error processing audio: {error}") return audio_length @@ -171,9 +221,16 @@ def save_dataset_duration(file_path, dataset_duration): def process_audio_wrapper(args): - pp, file, cut_preprocess, process_effects, noise_reduction, reduction_strength = ( - args - ) + ( + pp, + file, + cut_preprocess, + process_effects, + noise_reduction, + reduction_strength, + chunk_len, + overlap_len, + ) = args file_path, idx0, sid = file return pp.process_audio( file_path, @@ -183,6 +240,8 @@ def process_audio_wrapper(args): process_effects, noise_reduction, reduction_strength, + chunk_len, + overlap_len, ) @@ -192,10 +251,12 @@ def preprocess_training_set( num_processes: int, exp_dir: str, per: float, - cut_preprocess: bool, + cut_preprocess: str, process_effects: bool, noise_reduction: bool, reduction_strength: float, + chunk_len: float, + overlap_len: float, ): start_time = time.time() pp = PreProcess(sr, exp_dir, per) @@ -232,6 +293,8 @@ def preprocess_training_set( process_effects, 
noise_reduction, reduction_strength, + chunk_len, + overlap_len, ), ) for file in files @@ -260,10 +323,12 @@ def preprocess_training_set( num_processes = multiprocessing.cpu_count() else: num_processes = int(num_processes) - cut_preprocess = strtobool(sys.argv[6]) + cut_preprocess = str(sys.argv[6]) process_effects = strtobool(sys.argv[7]) noise_reduction = strtobool(sys.argv[8]) reduction_strength = float(sys.argv[9]) + chunk_len = float(sys.argv[10]) + overlap_len = float(sys.argv[11]) preprocess_training_set( input_root, @@ -275,4 +340,6 @@ def preprocess_training_set( process_effects, noise_reduction, reduction_strength, + chunk_len, + overlap_len, ) diff --git a/rvc/train/process/extract_model.py b/rvc/train/process/extract_model.py index 864765cb0..33eba8c97 100644 --- a/rvc/train/process/extract_model.py +++ b/rvc/train/process/extract_model.py @@ -33,6 +33,7 @@ def extract_model( version, hps, overtrain_info, + vocoder, ): try: print(f"Saved model '{model_dir}' (epoch {epoch} and step {step})") @@ -105,6 +106,7 @@ def extract_model( opt["author"] = model_author opt["embedder_model"] = embedder_model opt["speakers_id"] = speakers_id + opt["vocoder"] = vocoder torch.save(opt, os.path.join(model_dir_path, pth_file)) diff --git a/rvc/train/train.py b/rvc/train/train.py index e1f892aa2..86a66fcd4 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -6,12 +6,14 @@ import torch import datetime +import math +from collections import deque from distutils.util import strtobool from random import randint, shuffle from time import time as ttime from time import sleep from tqdm import tqdm - +import numpy as np from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from torch.cuda.amp import GradScaler, autocast @@ -42,6 +44,7 @@ feature_loss, generator_loss, kl_loss, + envelope_loss, ) from mel_processing import ( mel_spectrogram_torch, @@ -63,13 +66,14 @@ gpus = sys.argv[7] batch_size = int(sys.argv[8]) sample_rate = int(sys.argv[9]) -pitch_guidance = strtobool(sys.argv[10]) -save_only_latest = strtobool(sys.argv[11]) -save_every_weights = strtobool(sys.argv[12]) -cache_data_in_gpu = strtobool(sys.argv[13]) -overtraining_detector = strtobool(sys.argv[14]) -overtraining_threshold = int(sys.argv[15]) -cleanup = strtobool(sys.argv[16]) +save_only_latest = strtobool(sys.argv[10]) +save_every_weights = strtobool(sys.argv[11]) +cache_data_in_gpu = strtobool(sys.argv[12]) +overtraining_detector = strtobool(sys.argv[13]) +overtraining_threshold = int(sys.argv[14]) +cleanup = strtobool(sys.argv[15]) +vocoder = sys.argv[16] +checkpointing = strtobool(sys.argv[17]) current_dir = os.getcwd() experiment_dir = os.path.join(current_dir, "logs", model_name) @@ -81,6 +85,10 @@ config = HParams(**config) config.data.training_files = os.path.join(experiment_dir, "filelist.txt") +# for Nvidia's CUDA device selection can be done from command line / UI +# for AMD the device selection can only be done from .bat file using HIP_VISIBLE_DEVICES +os.environ["CUDA_VISIBLE_DEVICES"] = gpus.replace("-", ",") + torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = False @@ -94,6 +102,17 @@ lowest_value = {"step": 0, "value": float("inf"), "epoch": 0} training_file_path = os.path.join(experiment_dir, "training_data.json") +avg_losses = { + "gen_loss_queue": deque(maxlen=10), + "disc_loss_queue": deque(maxlen=10), + "disc_loss_50": deque(maxlen=50), + "env_loss_50": deque(maxlen=50), + "fm_loss_50": deque(maxlen=50), + "kl_loss_50": 
deque(maxlen=50), + "mel_loss_50": deque(maxlen=50), + "gen_loss_50": deque(maxlen=50), +} + import logging logging.getLogger("torch").setLevel(logging.ERROR) @@ -194,7 +213,6 @@ def start(): experiment_dir, pretrainG, pretrainD, - pitch_guidance, total_epoch, save_every_weights, config, @@ -281,7 +299,6 @@ def run( experiment_dir, pretrainG, pretrainD, - pitch_guidance, custom_total_epoch, custom_save_every_weights, config, @@ -296,7 +313,6 @@ def run( experiment_dir (str): The directory where experiment logs and checkpoints will be saved. pretrainG (str): Path to the pre-trained generator model. pretrainD (str): Path to the pre-trained discriminator model. - pitch_guidance (bool): Flag indicating whether to use pitch guidance during training. custom_total_epoch (int): The total number of epochs for training. custom_save_every_weights (int): The interval (in epochs) at which to save model weights. config (object): Configuration object containing training parameters. @@ -361,12 +377,16 @@ def run( config.data.filter_length // 2 + 1, config.train.segment_size // config.data.hop_length, **config.model, - use_f0=pitch_guidance == True, # converting 1/0 to True/False + use_f0=True, is_half=config.train.fp16_run and device.type == "cuda", sr=sample_rate, + vocoder=vocoder, + checkpointing=checkpointing, ).to(device) - net_d = MultiPeriodDiscriminator(version, config.model.use_spectral_norm).to(device) + net_d = MultiPeriodDiscriminator( + version, config.model.use_spectral_norm, checkpointing=checkpointing + ).to(device) optim_g = torch.optim.AdamW( net_g.parameters(), @@ -444,8 +464,6 @@ def run( if True == False and os.path.isfile( os.path.join("logs", "reference", f"ref{sample_rate}.wav") ): - import numpy as np - phone = np.load( os.path.join("logs", "reference", f"ref{sample_rate}_feats.npy") ) @@ -463,8 +481,8 @@ def run( reference = ( phone, phone_lengths, - pitch if pitch_guidance else None, - pitchf if pitch_guidance else None, + pitch, + pitchf, sid, ) else: @@ -473,8 +491,8 @@ def run( reference = ( phone.to(device), phone_lengths.to(device), - pitch.to(device) if pitch_guidance else None, - pitchf.to(device) if pitch_guidance else None, + pitch.to(device), + pitchf.to(device), sid.to(device), ) break @@ -539,6 +557,9 @@ def train_and_evaluate( consecutive_increases_gen = 0 consecutive_increases_disc = 0 + epoch_disc_sum = 0.0 + epoch_gen_sum = 0.0 + net_g, net_d = nets optim_g, optim_d = optims train_loader = loaders[0] if loaders is not None else None @@ -583,8 +604,6 @@ def train_and_evaluate( wave_lengths, sid, ) = info - pitch = pitch if pitch_guidance else None - pitchf = pitchf if pitch_guidance else None # Forward pass use_amp = config.train.fp16_run and device.type == "cuda" @@ -604,27 +623,40 @@ def train_and_evaluate( ) y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( - y_d_hat_r, y_d_hat_g - ) + # if vocoder == "HiFi-GAN": + # loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) + # else: + # loss_disc, _, _ = discriminator_loss_scaled(y_d_hat_r, y_d_hat_g) + loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) # Discriminator backward and update + epoch_disc_sum += loss_disc optim_d.zero_grad() scaler.scale(loss_disc).backward() scaler.unscale_(optim_d) - grad_norm_d = commons.clip_grad_value(net_d.parameters(), None) + grad_norm_d = torch.nn.utils.clip_grad_norm_( + net_d.parameters(), max_norm=1000.0 + ) scaler.step(optim_d) + scaler.update() + # if not 
math.isfinite(grad_norm_d): + # print("\nWarning: grad_norm_d is NaN or Inf") # Generator backward and update with autocast(enabled=use_amp): _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) with autocast(enabled=False): loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0 + loss_env = envelope_loss(wave, y_hat) loss_kl = ( kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl ) loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + # if vocoder == "HiFi-GAN": + # loss_gen, _ = generator_loss(y_d_hat_g) + # else: + # loss_gen, _ = generator_loss_scaled(y_d_hat_g) + loss_gen, _ = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_env if loss_gen_all < lowest_value["value"]: lowest_value = { @@ -632,19 +664,67 @@ def train_and_evaluate( "value": loss_gen_all, "epoch": epoch, } - + epoch_gen_sum += loss_gen_all optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value(net_g.parameters(), None) + grad_norm_g = torch.nn.utils.clip_grad_norm_( + net_g.parameters(), max_norm=1000.0 + ) scaler.step(optim_g) scaler.update() + # if not math.isfinite(grad_norm_g): + # print("\n Warning: grad_norm_g is NaN or Inf") global_step += 1 + + # queue for rolling losses over 50 steps + avg_losses["disc_loss_50"].append(loss_disc.detach()) + avg_losses["env_loss_50"].append(loss_env.detach()) + avg_losses["fm_loss_50"].append(loss_fm.detach()) + avg_losses["kl_loss_50"].append(loss_kl.detach()) + avg_losses["mel_loss_50"].append(loss_mel.detach()) + avg_losses["gen_loss_50"].append(loss_gen_all.detach()) + + if rank == 0 and global_step % 50 == 0: + # logging rolling averages + scalar_dict = { + "loss_avg_50/d/total": torch.mean( + torch.stack(list(avg_losses["disc_loss_50"])) + ), + "loss_avg_50/g/env": torch.mean( + torch.stack(list(avg_losses["env_loss_50"])) + ), + "loss_avg_50/g/fm": torch.mean( + torch.stack(list(avg_losses["fm_loss_50"])) + ), + "loss_avg_50/g/kl": torch.mean( + torch.stack(list(avg_losses["kl_loss_50"])) + ), + "loss_avg_50/g/mel": torch.mean( + torch.stack(list(avg_losses["mel_loss_50"])) + ), + "loss_avg_50/g/total": torch.mean( + torch.stack(list(avg_losses["gen_loss_50"])) + ), + } + summarize( + writer=writer, + global_step=global_step, + scalars=scalar_dict, + ) + pbar.update(1) + with torch.no_grad(): + torch.cuda.empty_cache() + # Logging and checkpointing if rank == 0: + + avg_losses["disc_loss_queue"].append(epoch_disc_sum.item() / len(train_loader)) + avg_losses["gen_loss_queue"].append(epoch_gen_sum.item() / len(train_loader)) + # used for tensorboard chart - all/mel mel = spec_to_mel_torch( spec, @@ -677,19 +757,19 @@ def train_and_evaluate( y_hat_mel = y_hat_mel.half() lr = optim_g.param_groups[0]["lr"] - if loss_mel > 75: - loss_mel = 75 - if loss_kl > 9: - loss_kl = 9 + scalar_dict = { "loss/g/total": loss_gen_all, "loss/d/total": loss_disc, "learning_rate": lr, - "grad/norm_d": grad_norm_d, - "grad/norm_g": grad_norm_g, + "grad/norm_d": grad_norm_d.item(), + "grad/norm_g": grad_norm_g.item(), "loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl, + "loss/g/env": loss_env, + "loss_avg_epoch/disc": np.mean(avg_losses["disc_loss_queue"]), + "loss_avg_epoch/gen": np.mean(avg_losses["gen_loss_queue"]), } # commented out # scalar_dict.update({f"loss/g/{i}": v for i, v in enumerate(losses_gen)}) @@ -860,8 +940,7 @@ def train_and_evaluate( 
extract_model( ckpt=ckpt, sr=sample_rate, - pitch_guidance=pitch_guidance - == True, # converting 1/0 to True/False, + pitch_guidance=True, name=model_name, model_dir=m, epoch=epoch, @@ -869,6 +948,7 @@ def train_and_evaluate( version=version, hps=hps, overtrain_info=overtrain_info, + vocoder=vocoder, ) # Clean-up old best epochs for m in model_del: @@ -900,6 +980,9 @@ def train_and_evaluate( if done: os._exit(2333333) + with torch.no_grad(): + torch.cuda.empty_cache() + def check_overtraining(smoothed_loss_history, threshold, epsilon=0.004): """ diff --git a/tabs/train/train.py b/tabs/train/train.py index de88b670c..2dcb15cc9 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -302,30 +302,87 @@ def upload_file(file_path): # Train Tab def train_tab(): - with gr.Row(): - model_name = gr.Dropdown( - label=i18n("Model Name"), - info=i18n("Name of the new model."), - choices=get_models_list(), - value="my-project", - interactive=True, - allow_custom_value=True, - ) - sampling_rate = gr.Radio( - label=i18n("Sampling Rate"), - info=i18n("The sampling rate of the audio files."), - choices=["32000", "40000", "48000"], - value="40000", - interactive=True, - ) - rvc_version = gr.Radio( - label=i18n("Model Architecture"), - info=i18n("Version of the model architecture."), - choices=["v1", "v2"], - value="v2", - interactive=True, - visible=False, - ) + # Model settings section + with gr.Accordion(i18n("Model Settings")): + with gr.Row(): + with gr.Column(): + model_name = gr.Dropdown( + label=i18n("Model Name"), + info=i18n("Name of the new model."), + choices=get_models_list(), + value="my-project", + interactive=True, + allow_custom_value=True, + ) + experimental_options = gr.Checkbox( + label=i18n("Enable Experimental Options"), + info=i18n( + "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models." + ), + value=False, + ) + + with gr.Column(): + sampling_rate = gr.Radio( + label=i18n("Sampling Rate"), + info=i18n("The sampling rate of the audio files."), + choices=["32000", "40000", "48000"], + value="40000", + interactive=True, + ) + vocoder = gr.Radio( + label=i18n("Vocoder"), + info=i18n( + "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only)." + ), + choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], + value="HiFi-GAN", + interactive=True, + visible=False, + ) + rvc_version = gr.Radio( + label=i18n("Model Architecture"), + info=i18n("Version of the model architecture."), + choices=["v1", "v2"], + value="v2", + interactive=True, + visible=False, + ) + with gr.Accordion( + i18n("Advanced Settings"), + open=False, + ): + with gr.Row(): + with gr.Column(): + cpu_cores = gr.Slider( + 1, + min(cpu_count(), 32), # max 32 parallel processes + min(cpu_count(), 32), + step=1, + label=i18n("CPU Cores"), + info=i18n( + "The number of CPU cores to use in the extraction process. The default setting are your cpu cores, which is recommended for most cases." + ), + interactive=True, + ) + + with gr.Column(): + gpu = gr.Textbox( + label=i18n("GPU Number"), + info=i18n( + "Specify the number of GPUs you wish to utilize for extracting by entering them separated by hyphens (-)." 
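# How the hyphen-separated GPU field is consumed: train.py (see the CUDA_VISIBLE_DEVICES change
# earlier in this diff) simply swaps hyphens for commas before device selection, so "0-1-2"
# selects CUDA devices 0, 1 and 2.
gpus = "0-1-2"
print(gpus.replace("-", ","))   # -> "0,1,2"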
+ ), + placeholder=i18n("0 to ∞ separated by -"), + value=str(get_number_of_gpus()), + interactive=True, + ) + gr.Textbox( + label=i18n("GPU Information"), + info=i18n("The GPU information will be displayed here."), + value=get_gpu_info(), + interactive=False, + ) + # Preprocess section with gr.Accordion(i18n("Preprocess")): dataset_path = gr.Dropdown( label=i18n("Dataset Path"), @@ -357,27 +414,38 @@ def train_tab(): refresh = gr.Button(i18n("Refresh")) with gr.Accordion(i18n("Advanced Settings"), open=False): - cpu_cores_preprocess = gr.Slider( - 1, - min(cpu_count(), 32), # max 32 parallel processes - min(cpu_count(), 32), - step=1, - label=i18n("CPU Cores"), + cut_preprocess = gr.Radio( + label=i18n("Audio cutting"), info=i18n( - "The number of CPU cores to use in the preprocess. The default setting are your cpu cores, which is recommended for most cases." + "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it." ), + choices=["Skip", "Simple", "Automatic"], + value="Automatic", interactive=True, ) with gr.Row(): - cut_preprocess = gr.Checkbox( - label=i18n("Audio cutting"), + chunk_len = gr.Slider( + 0.5, + 5.0, + 3.0, + step=0.1, + label=i18n("Chunk length (sec)"), + info=i18n("Length of the audio slice for 'Simple' method."), + interactive=True, + ) + overlap_len = gr.Slider( + 0.0, + 0.4, + 0.3, + step=0.1, + label=i18n("Overlap length (sec)"), info=i18n( - "It's recommended to deactivate this option if your dataset has already been processed." + "Length of the overlap between slices for 'Simple' method." ), - value=True, interactive=True, - visible=True, ) + + with gr.Row(): process_effects = gr.Checkbox( label=i18n("Process effects"), info=i18n( @@ -387,7 +455,6 @@ def train_tab(): interactive=True, visible=True, ) - with gr.Row(): noise_reduction = gr.Checkbox( label=i18n("Noise Reduction"), info=i18n( @@ -397,17 +464,17 @@ def train_tab(): interactive=True, visible=True, ) - clean_strength = gr.Slider( - minimum=0, - maximum=1, - label=i18n("Noise Reduction Strength"), - info=i18n( - "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed." - ), - visible=False, - value=0.5, - interactive=True, - ) + clean_strength = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Noise Reduction Strength"), + info=i18n( + "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed." + ), + visible=False, + value=0.5, + interactive=True, + ) preprocess_output_info = gr.Textbox( label=i18n("Output Information"), info=i18n("The output information will be displayed here."), @@ -424,15 +491,18 @@ def train_tab(): model_name, dataset_path, sampling_rate, - cpu_cores_preprocess, + cpu_cores, cut_preprocess, process_effects, noise_reduction, clean_strength, + chunk_len, + overlap_len, ], outputs=[preprocess_output_info], ) + # Extract section with gr.Accordion(i18n("Extract")): with gr.Row(): f0_method = gr.Radio( @@ -458,7 +528,18 @@ def train_tab(): value="contentvec", interactive=True, ) - + include_mutes = gr.Slider( + 0, + 10, + 2, + step=1, + label=i18n("Silent training files"), + info=i18n( + "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. 
Select 0 if your dataset is clean and already contains segments of pure silence." + ), + value=True, + interactive=True, + ) hop_length = gr.Slider( 1, 512, @@ -491,43 +572,6 @@ def train_tab(): ) move_files_button = gr.Button("Move files to custom embedder folder") - with gr.Accordion( - i18n( - "We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank." - ), - open=False, - ): - with gr.Row(): - with gr.Column(): - cpu_cores_extract = gr.Slider( - 1, - min(cpu_count(), 32), # max 32 parallel processes - min(cpu_count(), 32), - step=1, - label=i18n("CPU Cores"), - info=i18n( - "The number of CPU cores to use in the extraction process. The default setting are your cpu cores, which is recommended for most cases." - ), - interactive=True, - ) - - with gr.Column(): - gpu_extract = gr.Textbox( - label=i18n("GPU Number"), - info=i18n( - "Specify the number of GPUs you wish to utilize for extracting by entering them separated by hyphens (-)." - ), - placeholder=i18n("0 to ∞ separated by -"), - value=str(get_number_of_gpus()), - interactive=True, - ) - gr.Textbox( - label=i18n("GPU Information"), - info=i18n("The GPU information will be displayed here."), - value=get_gpu_info(), - interactive=False, - ) - extract_output_info = gr.Textbox( label=i18n("Output Information"), info=i18n("The output information will be displayed here."), @@ -543,15 +587,17 @@ def train_tab(): rvc_version, f0_method, hop_length, - cpu_cores_extract, - gpu_extract, + cpu_cores, + gpu, sampling_rate, embedder_model, embedder_model_custom, + include_mutes, ], outputs=[extract_output_info], ) + # Training section with gr.Accordion(i18n("Training")): with gr.Row(): batch_size = gr.Slider( @@ -629,15 +675,15 @@ def train_tab(): value=False, interactive=True, ) - pitch_guidance = gr.Checkbox( - label=i18n("Pitch Guidance"), + checkpointing = gr.Checkbox( + label=i18n("Checkpointing"), info=i18n( - "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential." + "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate." ), - value=True, + value=False, interactive=True, ) - with gr.Column(): + with gr.Row(): custom_pretrained = gr.Checkbox( label=i18n("Custom Pretrained"), info=i18n( @@ -646,6 +692,15 @@ def train_tab(): value=False, interactive=True, ) + overtraining_detector = gr.Checkbox( + label=i18n("Overtraining Detector"), + info=i18n( + "Detect overtraining to prevent the model from learning the training data too well and losing the ability to generalize to new data." + ), + value=False, + interactive=True, + ) + with gr.Row(): with gr.Column(visible=False) as pretrained_custom_settings: with gr.Accordion(i18n("Pretrained Custom Settings")): upload_pretrained = gr.File( @@ -674,41 +729,7 @@ def train_tab(): interactive=True, allow_custom_value=True, ) - multiple_gpu = gr.Checkbox( - label=i18n("GPU Settings"), - info=( - i18n( - "Sets advanced GPU settings, recommended for users with better GPU architecture." 
@@ -646,6 +692,15 @@
                     value=False,
                     interactive=True,
                 )
+                overtraining_detector = gr.Checkbox(
+                    label=i18n("Overtraining Detector"),
+                    info=i18n(
+                        "Detect overtraining to prevent the model from learning the training data too well and losing the ability to generalize to new data."
+                    ),
+                    value=False,
+                    interactive=True,
+                )
+            with gr.Row():
             with gr.Column(visible=False) as pretrained_custom_settings:
                 with gr.Accordion(i18n("Pretrained Custom Settings")):
                     upload_pretrained = gr.File(
@@ -674,41 +729,7 @@
                         interactive=True,
                         allow_custom_value=True,
                     )
-                multiple_gpu = gr.Checkbox(
-                    label=i18n("GPU Settings"),
-                    info=(
-                        i18n(
-                            "Sets advanced GPU settings, recommended for users with better GPU architecture."
-                        )
-                    ),
-                    value=False,
-                    interactive=True,
-                )
-                with gr.Column(visible=False) as gpu_custom_settings:
-                    with gr.Accordion(i18n("GPU Settings")):
-                        gpu = gr.Textbox(
-                            label=i18n("GPU Number"),
-                            info=i18n(
-                                "Specify the number of GPUs you wish to utilize for training by entering them separated by hyphens (-)."
-                            ),
-                            placeholder=i18n("0 to ∞ separated by -"),
-                            value=str(get_number_of_gpus()),
-                            interactive=True,
-                        )
-                        gr.Textbox(
-                            label=i18n("GPU Information"),
-                            info=i18n("The GPU information will be displayed here."),
-                            value=get_gpu_info(),
-                            interactive=False,
-                        )
-                overtraining_detector = gr.Checkbox(
-                    label=i18n("Overtraining Detector"),
-                    info=i18n(
-                        "Detect overtraining to prevent the model from learning the training data too well and losing the ability to generalize to new data."
-                    ),
-                    value=False,
-                    interactive=True,
-                )
+
             with gr.Column(visible=False) as overtraining_settings:
                 with gr.Accordion(i18n("Overtraining Detector Settings")):
                     overtraining_threshold = gr.Slider(
@@ -722,15 +743,15 @@
                         ),
                         interactive=True,
                     )
-            index_algorithm = gr.Radio(
-                label=i18n("Index Algorithm"),
-                info=i18n(
-                    "KMeans is a clustering algorithm that divides the dataset into K clusters. This setting is particularly useful for large datasets."
-                ),
-                choices=["Auto", "Faiss", "KMeans"],
-                value="Auto",
-                interactive=True,
-            )
+        index_algorithm = gr.Radio(
+            label=i18n("Index Algorithm"),
+            info=i18n(
+                "KMeans is a clustering algorithm that divides the dataset into K clusters. This setting is particularly useful for large datasets."
+            ),
+            choices=["Auto", "Faiss", "KMeans"],
+            value="Auto",
+            interactive=True,
+        )
 
     def enforce_terms(terms_accepted, *args):
         if not terms_accepted:
@@ -770,7 +791,6 @@ def enforce_terms(terms_accepted, *args):
             sampling_rate,
             batch_size,
             gpu,
-            pitch_guidance,
             overtraining_detector,
             overtraining_threshold,
             pretrained,
@@ -780,6 +800,8 @@ def enforce_terms(terms_accepted, *args):
             custom_pretrained,
             g_pretrained_path,
             d_pretrained_path,
+            vocoder,
+            checkpointing,
         ],
         outputs=[train_output_info],
     )
@@ -798,6 +820,7 @@ def enforce_terms(terms_accepted, *args):
         outputs=[train_output_info],
     )
 
+    # Export Model section
     with gr.Accordion(i18n("Export Model"), open=False):
         if not os.name == "nt":
             gr.Markdown(
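
# The Overtraining Detector and overtraining_threshold wired up above stop
# training once the model keeps fitting the training set without improving any
# further. A rough sketch of the usual early-stopping logic behind such a
# threshold -- the class and field names here are illustrative assumptions, not
# the code this patch actually calls:
class OvertrainingDetector:
    def __init__(self, threshold: int = 50):
        self.threshold = threshold          # epochs tolerated without improvement
        self.best_loss = float("inf")
        self.epochs_since_best = 0

    def should_stop(self, epoch_loss: float) -> bool:
        """Return True once `threshold` consecutive epochs saw no new best loss."""
        if epoch_loss < self.best_loss:
            self.best_loss = epoch_loss
            self.epochs_since_best = 0
        else:
            self.epochs_since_best += 1
        return self.epochs_since_best >= self.threshold
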
@@ -879,57 +902,32 @@ def disable_stop_train_button():
             "__type__": "update",
         }
 
-    def download_prerequisites(version, pitch_guidance):
+    def download_prerequisites(version):
         if version == "v1":
-            if pitch_guidance:
-                gr.Info(
-                    "Checking for v1 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped."
-                )
-                run_prerequisites_script(
-                    pretraineds_v1_f0=True,
-                    pretraineds_v1_nof0=False,
-                    pretraineds_v2_f0=False,
-                    pretraineds_v2_nof0=False,
-                    models=False,
-                    exe=False,
-                )
-            else:
-                gr.Info(
-                    "Checking for v1 prerequisites without pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped."
-                )
-                run_prerequisites_script(
-                    pretraineds_v1_f0=False,
-                    pretraineds_v1_nof0=True,
-                    pretraineds_v2_f0=False,
-                    pretraineds_v2_nof0=False,
-                    models=False,
-                    exe=False,
-                )
+            gr.Info(
+                "Checking for v1 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped."
+            )
+            run_prerequisites_script(
+                pretraineds_v1_f0=True,
+                pretraineds_v1_nof0=False,
+                pretraineds_v2_f0=False,
+                pretraineds_v2_nof0=False,
+                models=False,
+                exe=False,
+            )
         elif version == "v2":
-            if pitch_guidance:
-                gr.Info(
-                    "Checking for v2 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped."
-                )
-                run_prerequisites_script(
-                    pretraineds_v1_f0=False,
-                    pretraineds_v1_nof0=False,
-                    pretraineds_v2_f0=True,
-                    pretraineds_v2_nof0=False,
-                    models=False,
-                    exe=False,
-                )
-            else:
-                gr.Info(
-                    "Checking for v2 prerequisites without pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped."
-                )
-                run_prerequisites_script(
-                    pretraineds_v1_f0=False,
-                    pretraineds_v1_nof0=False,
-                    pretraineds_v2_f0=False,
-                    pretraineds_v2_nof0=True,
-                    models=False,
-                    exe=False,
-                )
+            gr.Info(
+                "Checking for v2 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped."
+            )
+            run_prerequisites_script(
+                pretraineds_v1_f0=False,
+                pretraineds_v1_nof0=False,
+                pretraineds_v2_f0=True,
+                pretraineds_v2_nof0=False,
+                models=False,
+                exe=False,
+            )
+
         gr.Info(
             "Prerequisites check complete. Missing files were downloaded, and you may now start preprocessing."
         )
@@ -939,6 +937,17 @@ def toggle_visible_embedder_custom(embedder_model):
             return {"visible": True, "__type__": "update"}
         return {"visible": False, "__type__": "update"}
 
+    def toggle_experimental(enabled):
+        if enabled:
+            return {
+                "choices": ["32000", "40000", "44100", "48000"],
+                "__type__": "update",
+            }, {"visible": True, "__type__": "update"}
+        return {"choices": ["32000", "40000", "48000"], "__type__": "update"}, {
+            "visible": False,
+            "__type__": "update",
+        }
+
     def update_slider_visibility(noise_reduction):
         return gr.update(visible=noise_reduction)
 
@@ -949,13 +958,13 @@ def update_slider_visibility(noise_reduction):
     )
     rvc_version.change(
        fn=download_prerequisites,
-        inputs=[rvc_version, pitch_guidance],
+        inputs=[rvc_version],
        outputs=[],
    )
-    pitch_guidance.change(
-        fn=download_prerequisites,
-        inputs=[rvc_version, pitch_guidance],
-        outputs=[],
+    experimental_options.change(
+        fn=toggle_experimental,
+        inputs=[experimental_options],
+        outputs=[sampling_rate, vocoder],
    )
    refresh.click(
        fn=refresh_models_and_datasets,
@@ -972,13 +981,11 @@ def update_slider_visibility(noise_reduction):
        inputs=[upload_audio_dataset, dataset_name],
        outputs=[upload_audio_dataset, dataset_path],
    )
-
    f0_method.change(
        fn=toggle_visible_hop_length,
        inputs=[f0_method],
        outputs=[hop_length],
    )
-
    embedder_model.change(
        fn=toggle_visible_embedder_custom,
        inputs=[embedder_model],
@@ -1022,11 +1029,6 @@ def update_slider_visibility(noise_reduction):
        inputs=[overtraining_detector],
        outputs=[overtraining_settings],
    )
-    multiple_gpu.change(
-        fn=toggle_visible,
-        inputs=[multiple_gpu],
-        outputs=[gpu_custom_settings],
-    )
    train_button.click(
        fn=enable_stop_train_button,
        inputs=[],