diff --git a/README.md b/README.md index b42753e..f9b5baa 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,19 @@ -# Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models +# Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language M odels [Audio samples](https://languagecodec.github.io) | Paper [[abs]](https://arxiv.org/abs/2402.12208) [[pdf]](https://arxiv.org/pdf/2402.12208.pdf) +[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/pdf/2402.12208.pdf) +[![demo](https://img.shields.io/badge/Languagecodec-Demo-red)](https://languagecodec.github.io) +[![model](https://img.shields.io/badge/%F0%9F%A4%97%20Languagecodec-Models-blue)](https://huggingface.co/amphion/naturalspeech3_facodec) + + +# 🔥 News +- *2024.04*: We update Languagecodec and release a more powerful checkpoint. +- *2022.02*: We release Languagecodec on arxiv. + +![result](result.png) + ## Installation @@ -20,25 +31,25 @@ pip install -r requirements.txt ```python -from encodec.utils import convert_audio +from languagecodec_encoder.utils import convert_audio import torchaudio import torch -from vocos.pretrained import Vocos +from languagecodec_decoder.pretrained import Vocos device=torch.device('cpu') config_path = "xxx/languagecodec/configs/languagecodec.yaml" model_path = "xxx/xxx.ckpt" audio_outpath = "xxx" -vocos = Vocos.from_pretrained0802(config_path, model_path) -vocos = vocos.to(device) +languagecodec = Vocos.from_pretrained0802(config_path, model_path) +languagecodec = languagecodec.to(device) wav, sr = torchaudio.load(audio_path) wav = convert_audio(wav, sr, 24000, 1) bandwidth_id = torch.tensor([0]) wav=wav.to(device) -features,discrete_code= vocos.encode(wav, bandwidth_id=bandwidth_id) -audio_out = vocos.decode(features, bandwidth_id=bandwidth_id) +features,discrete_code= languagecodec.encode_infer(wav, bandwidth_id=bandwidth_id) +audio_out = languagecodec.decode(features, bandwidth_id=bandwidth_id) torchaudio.save(audio_outpath, audio_out, sample_rate=24000, encoding='PCM_S', bits_per_sample=16) ``` @@ -46,23 +57,23 @@ torchaudio.save(audio_outpath, audio_out, sample_rate=24000, encoding='PCM_S', b ### Part2: Generating discrete codecs ```python -from encodec.utils import convert_audio +from languagecodec_encoder.utils import convert_audio import torchaudio import torch -from vocos.pretrained import Vocos +from languagecodec_decoder.pretrained import Vocos device=torch.device('cpu') config_path = "xxx/languagecodec/configs/languagecodec.yaml" model_path = "xxx/xxx.ckpt" -vocos = Vocos.from_pretrained0802(config_path, model_path) -vocos = vocos.to(device) +languagecodec = Vocos.from_pretrained0802(config_path, model_path) +languagecodec = languagecodec.to(device) wav, sr = torchaudio.load(audio_path) wav = convert_audio(wav, sr, 24000, 1) bandwidth_id = torch.tensor([0]) wav=wav.to(device) -_,discrete_code= vocos.encode(wav, bandwidth_id=bandwidth_id) +_,discrete_code= languagecodec.encode_infer(wav, bandwidth_id=bandwidth_id) print(discrete_code) ``` @@ -71,9 +82,9 @@ print(discrete_code) ### Part3: Audio reconstruction through codecs ```python # audio_tokens [n_q,1,t]/[n_q,t] -features = vocos.codes_to_features(audio_tokens) +features = languagecodec.codes_to_features(audio_tokens) bandwidth_id = torch.tensor([0]) -audio_out = vocos.decode(features, bandwidth_id=bandwidth_id) +audio_out = languagecodec.decode(features, bandwidth_id=bandwidth_id) ``` @@ -81,14 +92,9 @@ audio_out = vocos.decode(features, bandwidth_id=bandwidth_id) ## Pre-trained models -Currently, we have only released the results from our paper, and we plan to release additional checkpoints trained on a larger training dataset within the next two months. - -Notice: We will release a better language-codec checkpoint before 5.15, and further revise the paper. - | Model Name | Dataset | Training Iterations -------------------------------------------------------------------------------------|---------------|--------------------- -| [languagecodec_paper_8nq](https://drive.google.com/file/d/109ectu4NJWFCpmrqc31wdXvkTI6U2nMA/view?usp=drive_link) | 3W Hours | 2.0 M -| [languagecodec_chinese_8nq](https://drive.google.com/file/d/18JpINstfF2YrbFg6nqs3BVn0oxdLsuUm/view?usp=drive_link) | 2W Chinese Hours | 2.0 M +| [languagecodec_paper_8nq](https://drive.google.com/file/d/109ectu4NJWFCpmrqc31wdXvkTI6U2nMA/view?usp=drive_link) | 5W Hours | 2.0 M ## Training @@ -99,7 +105,7 @@ Notice: We will release a better language-codec checkpoint before 5.15, and furt ### Step2: Modifying configuration files ```python -# xxx/languagecodec/configs/languagecodec.yaml +# xxx/languagecodec/configs/languagecodec_mm.yaml # Modify the values of parameters such as batch_size, filelist_path, save_dir, device ``` @@ -109,7 +115,7 @@ training pipeline. ```bash cd xxx/languagecodec -python train.py fit --config xxx/languagecodec/configs/languagecodec.yaml +python train.py fit --config xxx/languagecodec/configs/languagecodec_mm.yaml ``` diff --git a/configs/languagecodec.yaml b/configs/languagecodec_mm.yaml similarity index 66% rename from configs/languagecodec.yaml rename to configs/languagecodec_mm.yaml index 0f4d597..0d889ae 100644 --- a/configs/languagecodec.yaml +++ b/configs/languagecodec_mm.yaml @@ -1,24 +1,24 @@ seed_everything: 4444 data: - class_path: vocos.dataset.VocosDataModule + class_path: languagecodec_decoder.dataset.VocosDataModule init_args: train_params: - filelist_path: xxx/xxx + filelist_path: /home/jovyan/honor/big-disk/speech/code/languagecodec/data/train/languagecodec_ch_en sampling_rate: 24000 num_samples: 24000 batch_size: 100 num_workers: 8 val_params: - filelist_path: xxx/xxx + filelist_path: /home/jovyan/honor/big-disk/speech/code/languagecodec/data/train/languagecodec_large_val sampling_rate: 24000 num_samples: 24000 batch_size: 10 num_workers: 8 model: - class_path: vocos.experiment.VocosEncodecExp + class_path: languagecodec_decoder.experiment.VocosEncodecExp init_args: sample_rate: 24000 initial_learning_rate: 2e-4 @@ -33,27 +33,27 @@ model: evaluate_periodicty: true resume: false - resume_config: xxx/config.yaml - resume_model: xxx/xxxx.ckpt + resume_config: /home/jovyan/honor/big-disk/speech/code/languagecodec/result/train/languagecodec_mm/lightning_logs/version_1/config.yaml + resume_model: /home/jovyan/honor/big-disk/speech/code/languagecodec/result/train/languagecodec_mm/lightning_logs/version_1/checkpoints/vocos_checkpoint_epoch=7_step=1268768_val_loss=2.9373.ckpt feature_extractor: - class_path: vocos.feature_extractors.EncodecFeatures + class_path: languagecodec_decoder.feature_extractors.EncodecFeatures init_args: encodec_model: encodec_24khz bandwidths: [6.6, 6.6, 6.6, 6.6] train_codebooks: true backbone: - class_path: vocos.models.VocosBackbone + class_path: languagecodec_decoder.models.VocosBackbone init_args: input_channels: 128 dim: 384 intermediate_dim: 1152 - num_layers: 8 + num_layers: 12 adanorm_num_embeddings: 4 # len(bandwidths) head: - class_path: vocos.heads.ISTFTHead + class_path: languagecodec_decoder.heads.ISTFTHead init_args: dim: 384 n_fft: 1280 @@ -76,7 +76,7 @@ trainer: filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f} save_top_k: 50 save_last: true - - class_path: vocos.helpers.GradNormCallback + - class_path: languagecodec_decoder.helpers.GradNormCallback # Lightning calculates max_steps across all optimizer steps (rather than number of batches) # This equals to 1M steps per generator and 1M per discriminator @@ -85,5 +85,5 @@ trainer: limit_val_batches: 100 accelerator: gpu strategy: ddp - devices: [4,5,6,7] + devices: [0,1,2,3,4,5,6,7] log_every_n_steps: 1000 diff --git a/encodec/quantization/__pycache__/__init__.cpython-310.pyc b/encodec/quantization/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index d265e40..0000000 Binary files a/encodec/quantization/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/languagecodec_decoder/__init__.py b/languagecodec_decoder/__init__.py new file mode 100644 index 0000000..09c0a5a --- /dev/null +++ b/languagecodec_decoder/__init__.py @@ -0,0 +1,4 @@ +from languagecodec_decoder.pretrained import Vocos + + +__version__ = "0.0.3" diff --git a/languagecodec_decoder/__pycache__/__init__.cpython-310.pyc b/languagecodec_decoder/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..3c14b8e Binary files /dev/null and b/languagecodec_decoder/__pycache__/__init__.cpython-310.pyc differ diff --git a/languagecodec_decoder/__pycache__/__init__.cpython-38.pyc b/languagecodec_decoder/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..9118d83 Binary files /dev/null and b/languagecodec_decoder/__pycache__/__init__.cpython-38.pyc differ diff --git a/vocos/__pycache__/dataset.cpython-310.pyc b/languagecodec_decoder/__pycache__/dataset.cpython-310.pyc similarity index 76% rename from vocos/__pycache__/dataset.cpython-310.pyc rename to languagecodec_decoder/__pycache__/dataset.cpython-310.pyc index 587a708..736de8b 100644 Binary files a/vocos/__pycache__/dataset.cpython-310.pyc and b/languagecodec_decoder/__pycache__/dataset.cpython-310.pyc differ diff --git a/languagecodec_decoder/__pycache__/discriminator_dac.cpython-310.pyc b/languagecodec_decoder/__pycache__/discriminator_dac.cpython-310.pyc new file mode 100644 index 0000000..f35b174 Binary files /dev/null and b/languagecodec_decoder/__pycache__/discriminator_dac.cpython-310.pyc differ diff --git a/vocos/__pycache__/discriminators.cpython-310.pyc b/languagecodec_decoder/__pycache__/discriminators.cpython-310.pyc similarity index 96% rename from vocos/__pycache__/discriminators.cpython-310.pyc rename to languagecodec_decoder/__pycache__/discriminators.cpython-310.pyc index beeb197..a39c0ca 100644 Binary files a/vocos/__pycache__/discriminators.cpython-310.pyc and b/languagecodec_decoder/__pycache__/discriminators.cpython-310.pyc differ diff --git a/languagecodec_decoder/__pycache__/experiment.cpython-310.pyc b/languagecodec_decoder/__pycache__/experiment.cpython-310.pyc new file mode 100644 index 0000000..0756b8a Binary files /dev/null and b/languagecodec_decoder/__pycache__/experiment.cpython-310.pyc differ diff --git a/vocos/__pycache__/feature_extractors.cpython-310.pyc b/languagecodec_decoder/__pycache__/feature_extractors.cpython-310.pyc similarity index 50% rename from vocos/__pycache__/feature_extractors.cpython-310.pyc rename to languagecodec_decoder/__pycache__/feature_extractors.cpython-310.pyc index 1a9e7bb..188b7cb 100644 Binary files a/vocos/__pycache__/feature_extractors.cpython-310.pyc and b/languagecodec_decoder/__pycache__/feature_extractors.cpython-310.pyc differ diff --git a/languagecodec_decoder/__pycache__/feature_extractors.cpython-38.pyc b/languagecodec_decoder/__pycache__/feature_extractors.cpython-38.pyc new file mode 100644 index 0000000..229844c Binary files /dev/null and b/languagecodec_decoder/__pycache__/feature_extractors.cpython-38.pyc differ diff --git a/vocos/__pycache__/heads.cpython-310.pyc b/languagecodec_decoder/__pycache__/heads.cpython-310.pyc similarity index 93% rename from vocos/__pycache__/heads.cpython-310.pyc rename to languagecodec_decoder/__pycache__/heads.cpython-310.pyc index e5d3c33..403fddb 100644 Binary files a/vocos/__pycache__/heads.cpython-310.pyc and b/languagecodec_decoder/__pycache__/heads.cpython-310.pyc differ diff --git a/vocos/__pycache__/helpers.cpython-310.pyc b/languagecodec_decoder/__pycache__/helpers.cpython-310.pyc similarity index 92% rename from vocos/__pycache__/helpers.cpython-310.pyc rename to languagecodec_decoder/__pycache__/helpers.cpython-310.pyc index b01a5d6..de20bd5 100644 Binary files a/vocos/__pycache__/helpers.cpython-310.pyc and b/languagecodec_decoder/__pycache__/helpers.cpython-310.pyc differ diff --git a/vocos/__pycache__/loss.cpython-310.pyc b/languagecodec_decoder/__pycache__/loss.cpython-310.pyc similarity index 56% rename from vocos/__pycache__/loss.cpython-310.pyc rename to languagecodec_decoder/__pycache__/loss.cpython-310.pyc index 9b64885..effa88f 100644 Binary files a/vocos/__pycache__/loss.cpython-310.pyc and b/languagecodec_decoder/__pycache__/loss.cpython-310.pyc differ diff --git a/vocos/__pycache__/models.cpython-310.pyc b/languagecodec_decoder/__pycache__/models.cpython-310.pyc similarity index 83% rename from vocos/__pycache__/models.cpython-310.pyc rename to languagecodec_decoder/__pycache__/models.cpython-310.pyc index dc6764d..11dca97 100644 Binary files a/vocos/__pycache__/models.cpython-310.pyc and b/languagecodec_decoder/__pycache__/models.cpython-310.pyc differ diff --git a/vocos/__pycache__/modules.cpython-310.pyc b/languagecodec_decoder/__pycache__/modules.cpython-310.pyc similarity index 96% rename from vocos/__pycache__/modules.cpython-310.pyc rename to languagecodec_decoder/__pycache__/modules.cpython-310.pyc index 4242ab7..aeaea8b 100644 Binary files a/vocos/__pycache__/modules.cpython-310.pyc and b/languagecodec_decoder/__pycache__/modules.cpython-310.pyc differ diff --git a/languagecodec_decoder/__pycache__/modules.cpython-38.pyc b/languagecodec_decoder/__pycache__/modules.cpython-38.pyc new file mode 100644 index 0000000..ab7c0c5 Binary files /dev/null and b/languagecodec_decoder/__pycache__/modules.cpython-38.pyc differ diff --git a/vocos/__pycache__/pretrained.cpython-310.pyc b/languagecodec_decoder/__pycache__/pretrained.cpython-310.pyc similarity index 71% rename from vocos/__pycache__/pretrained.cpython-310.pyc rename to languagecodec_decoder/__pycache__/pretrained.cpython-310.pyc index 8cbce9f..9c1fac5 100644 Binary files a/vocos/__pycache__/pretrained.cpython-310.pyc and b/languagecodec_decoder/__pycache__/pretrained.cpython-310.pyc differ diff --git a/languagecodec_decoder/__pycache__/pretrained.cpython-38.pyc b/languagecodec_decoder/__pycache__/pretrained.cpython-38.pyc new file mode 100644 index 0000000..865d46b Binary files /dev/null and b/languagecodec_decoder/__pycache__/pretrained.cpython-38.pyc differ diff --git a/vocos/__pycache__/pretrained_model.cpython-310.pyc b/languagecodec_decoder/__pycache__/pretrained_model.cpython-310.pyc similarity index 91% rename from vocos/__pycache__/pretrained_model.cpython-310.pyc rename to languagecodec_decoder/__pycache__/pretrained_model.cpython-310.pyc index ecd2220..af6b381 100644 Binary files a/vocos/__pycache__/pretrained_model.cpython-310.pyc and b/languagecodec_decoder/__pycache__/pretrained_model.cpython-310.pyc differ diff --git a/vocos/__pycache__/spectral_ops.cpython-310.pyc b/languagecodec_decoder/__pycache__/spectral_ops.cpython-310.pyc similarity index 96% rename from vocos/__pycache__/spectral_ops.cpython-310.pyc rename to languagecodec_decoder/__pycache__/spectral_ops.cpython-310.pyc index 9ac91b7..2156819 100644 Binary files a/vocos/__pycache__/spectral_ops.cpython-310.pyc and b/languagecodec_decoder/__pycache__/spectral_ops.cpython-310.pyc differ diff --git a/vocos/dataset.py b/languagecodec_decoder/dataset.py similarity index 95% rename from vocos/dataset.py rename to languagecodec_decoder/dataset.py index 4a8d594..352d1a4 100644 --- a/vocos/dataset.py +++ b/languagecodec_decoder/dataset.py @@ -7,6 +7,7 @@ from torch.utils.data import Dataset, DataLoader import soundfile +# import librosa torch.set_num_threads(1) @@ -54,7 +55,9 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> torch.Tensor: audio_path = self.filelist[index] # y, sr = torchaudio.load(audio_path) + # print(audio_path,"111") y1, sr = soundfile.read(audio_path) + # y1, sr = librosa.load(audio_path,sr=None) y = torch.tensor(y1).float().unsqueeze(0) # if y.size(0) > 1: # # mix to mono @@ -78,4 +81,4 @@ def __getitem__(self, index: int) -> torch.Tensor: # During validation, take always the first segment for determinism y = y[:, : self.num_samples] - return y[0] + return y[0] \ No newline at end of file diff --git a/languagecodec_decoder/discriminator_dac.py b/languagecodec_decoder/discriminator_dac.py new file mode 100644 index 0000000..3b4ca65 --- /dev/null +++ b/languagecodec_decoder/discriminator_dac.py @@ -0,0 +1,249 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +# from audiotools import AudioSignal +# from audiotools import ml +# from audiotools import STFTParams +from einops import rearrange +from torch.nn.utils import weight_norm + +from collections import namedtuple + +STFTParams = namedtuple( + "STFTParams", + ["window_length", "hop_length", "window_type", "match_stride", "padding_type"], +) + +STFTParams.__new__.__defaults__ = (None, None, None, None, None) + + +def WNConv1d(*args, **kwargs): + act = kwargs.pop("act", True) + conv = weight_norm(nn.Conv1d(*args, **kwargs)) + if not act: + return conv + return nn.Sequential(conv, nn.LeakyReLU(0.1)) + + +def WNConv2d(*args, **kwargs): + act = kwargs.pop("act", True) + conv = weight_norm(nn.Conv2d(*args, **kwargs)) + if not act: + return conv + return nn.Sequential(conv, nn.LeakyReLU(0.1)) + + +class MPD(nn.Module): + def __init__(self, period): + super().__init__() + self.period = period + self.convs = nn.ModuleList( + [ + WNConv2d(1, 32, (5, 1), (3, 1), padding=(2, 0)), + WNConv2d(32, 128, (5, 1), (3, 1), padding=(2, 0)), + WNConv2d(128, 512, (5, 1), (3, 1), padding=(2, 0)), + WNConv2d(512, 1024, (5, 1), (3, 1), padding=(2, 0)), + WNConv2d(1024, 1024, (5, 1), 1, padding=(2, 0)), + ] + ) + self.conv_post = WNConv2d( + 1024, 1, kernel_size=(3, 1), padding=(1, 0), act=False + ) + + def pad_to_period(self, x): + t = x.shape[-1] + x = F.pad(x, (0, self.period - t % self.period), mode="reflect") + return x + + def forward(self, x): + fmap = [] + + x = self.pad_to_period(x) + x = rearrange(x, "b c (l p) -> b c l p", p=self.period) + + for layer in self.convs: + x = layer(x) + fmap.append(x) + + x = self.conv_post(x) + fmap.append(x) + + return fmap + + +class MSD(nn.Module): + def __init__(self, rate: int = 1, sample_rate: int = 24000): + super().__init__() + self.convs = nn.ModuleList( + [ + WNConv1d(1, 16, 15, 1, padding=7), + WNConv1d(16, 64, 41, 4, groups=4, padding=20), + WNConv1d(64, 256, 41, 4, groups=16, padding=20), + WNConv1d(256, 1024, 41, 4, groups=64, padding=20), + WNConv1d(1024, 1024, 41, 4, groups=256, padding=20), + WNConv1d(1024, 1024, 5, 1, padding=2), + ] + ) + self.conv_post = WNConv1d(1024, 1, 3, 1, padding=1, act=False) + self.sample_rate = sample_rate + self.rate = rate + + def forward(self, x): + # x = AudioSignal(x, self.sample_rate) + # x.resample(self.sample_rate // self.rate) + # x = x.audio_data + + fmap = [] + + for l in self.convs: + x = l(x) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + + return fmap + + +BANDS = [(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)] + + +class MRD(nn.Module): + def __init__( + self, + window_length: int, + hop_factor: float = 0.25, + sample_rate: int = 24000, + bands: list = BANDS, + ): + """Complex multi-band spectrogram discriminator. + Parameters + ---------- + window_length : int + Window length of STFT. + hop_factor : float, optional + Hop factor of the STFT, defaults to ``0.25 * window_length``. + sample_rate : int, optional + Sampling rate of audio in Hz, by default 24000 + bands : list, optional + Bands to run discriminator over. + """ + super().__init__() + + self.window_length = window_length + self.hop_factor = hop_factor + self.sample_rate = sample_rate + self.stft_params = STFTParams( + window_length=window_length, + hop_length=int(window_length * hop_factor), + match_stride=True, + ) + + n_fft = window_length // 2 + 1 + bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands] + self.bands = bands + self.n_fft = window_length + + ch = 32 + convs = lambda: nn.ModuleList( + [ + WNConv2d(2, ch, (3, 9), (1, 1), padding=(1, 4)), + WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), + WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), + WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), + WNConv2d(ch, ch, (3, 3), (1, 1), padding=(1, 1)), + ] + ) + self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))]) + self.conv_post = WNConv2d(ch, 1, (3, 3), (1, 1), padding=(1, 1), act=False) + + def spectrogram(self, x): + # x = AudioSignal(x, self.sample_rate, stft_params=self.stft_params) + # x = torch.view_as_real(x.stft()) + + # x.squeeze(0).stft(n_fft=1024,win_length=1024,return_complex=True).size() + # breakpoint() + if x.size(0)==1: + # x = torch.view_as_real(x.squeeze(0).stft(n_fft=self.window_length,return_complex=True).unsqueeze(0)) + x = torch.view_as_real(x.squeeze(0).stft(n_fft=self.n_fft,return_complex=True).unsqueeze(0)) + else: + # x = torch.view_as_real(x.squeeze(1).stft(n_fft=self.window_length,return_complex=True).unsqueeze(1)) + x = torch.view_as_real(x.squeeze(1).stft(n_fft=self.n_fft,return_complex=True).unsqueeze(1)) + x = rearrange(x, "b 1 f t c -> (b 1) c t f") + # Split into bands + x_bands = [x[..., b[0] : b[1]] for b in self.bands] + return x_bands + + def forward(self, x): + x_bands = self.spectrogram(x) + fmap = [] + + x = [] + for band, stack in zip(x_bands, self.band_convs): + for layer in stack: + band = layer(band) + fmap.append(band) + x.append(band) + + x = torch.cat(x, dim=-1) + x = self.conv_post(x) + fmap.append(x) + + return fmap + + +# class DACDiscriminator(ml.BaseModel): +class DACDiscriminator(nn.Module): + def __init__( + self, + rates: list = [], + periods: list = [2, 3, 5, 7, 11], + fft_sizes: list = [2048, 1024, 512], + sample_rate: int = 24000, + bands: list = BANDS, + ): + """Discriminator that combines multiple discriminators. + + Parameters + ---------- + rates : list, optional + sampling rates (in Hz) to run MSD at, by default [] + If empty, MSD is not used. + periods : list, optional + periods (of samples) to run MPD at, by default [2, 3, 5, 7, 11] + fft_sizes : list, optional + Window sizes of the FFT to run MRD at, by default [2048, 1024, 512] + sample_rate : int, optional + Sampling rate of audio in Hz, by default 24000 + bands : list, optional + Bands to run MRD at, by default `BANDS` + """ + super().__init__() + discs = [] + discs += [MPD(p) for p in periods] + discs += [MSD(r, sample_rate=sample_rate) for r in rates] + discs += [MRD(f, sample_rate=sample_rate, bands=bands) for f in fft_sizes] + self.discriminators = nn.ModuleList(discs) + + def preprocess(self, y): + # Remove DC offset + y = y - y.mean(dim=-1, keepdims=True) + # Peak normalize the volume of input audio + y = 0.8 * y / (y.abs().max(dim=-1, keepdim=True)[0] + 1e-9) + return y + + def forward(self, x): + x = self.preprocess(x) + fmaps = [d(x) for d in self.discriminators] + return fmaps + + +if __name__ == "__main__": + disc = DACDiscriminator() + x = torch.zeros(1, 1, 24000) + results = disc(x) + breakpoint() + for i, result in enumerate(results): + print(f"disc{i}") + for i, r in enumerate(result): + print(r.shape, r.mean(), r.min(), r.max()) + print("00") diff --git a/vocos/discriminators.py b/languagecodec_decoder/discriminators.py similarity index 100% rename from vocos/discriminators.py rename to languagecodec_decoder/discriminators.py diff --git a/vocos/experiment.py b/languagecodec_decoder/experiment.py similarity index 94% rename from vocos/experiment.py rename to languagecodec_decoder/experiment.py index 41e80dd..166998f 100644 --- a/vocos/experiment.py +++ b/languagecodec_decoder/experiment.py @@ -7,14 +7,16 @@ import transformers import yaml -from vocos.discriminators import MultiPeriodDiscriminator, MultiResolutionDiscriminator -from vocos.feature_extractors import FeatureExtractor -from vocos.heads import FourierHead -from vocos.helpers import plot_spectrogram_to_numpy -from vocos.loss import DiscriminatorLoss, GeneratorLoss, FeatureMatchingLoss, MelSpecReconstructionLoss -from vocos.models import Backbone -from vocos.modules import safe_log -from vocos.pretrained_model import instantiate_class +from languagecodec_decoder.discriminator_dac import DACDiscriminator + +from languagecodec_decoder.discriminators import MultiPeriodDiscriminator, MultiResolutionDiscriminator +from languagecodec_decoder.feature_extractors import FeatureExtractor +from languagecodec_decoder.heads import FourierHead +from languagecodec_decoder.helpers import plot_spectrogram_to_numpy +from languagecodec_decoder.loss import DiscriminatorLoss, GeneratorLoss, FeatureMatchingLoss, MelSpecReconstructionLoss, DACGANLoss +from languagecodec_decoder.models import Backbone +from languagecodec_decoder.modules import safe_log +from languagecodec_decoder.pretrained_model import instantiate_class class VocosExp(pl.LightningModule): @@ -68,6 +70,11 @@ def __init__( self.multiperioddisc = MultiPeriodDiscriminator() self.multiresddisc = MultiResolutionDiscriminator() + + self.dac = DACDiscriminator() + + self.dacdiscriminator = DACGANLoss(self.dac) + self.disc_loss = DiscriminatorLoss() self.gen_loss = GeneratorLoss() self.feat_matching_loss = FeatureMatchingLoss() @@ -117,6 +124,10 @@ def training_step(self, batch, batch_idx, optimizer_idx, **kwargs): if optimizer_idx == 0 and self.train_discriminator: with torch.no_grad(): audio_hat, _ = self(audio_input, **kwargs) + + + loss_dac=self.dacdiscriminator.discriminator_loss(audio_hat.unsqueeze(1),audio_input.unsqueeze(1)) + real_score_mp, gen_score_mp, _, _ = self.multiperioddisc(y=audio_input, y_hat=audio_hat, **kwargs,) real_score_mrd, gen_score_mrd, _, _ = self.multiresddisc(y=audio_input, y_hat=audio_hat, **kwargs,) loss_mp, loss_mp_real, _ = self.disc_loss( @@ -127,17 +138,20 @@ def training_step(self, batch, batch_idx, optimizer_idx, **kwargs): ) loss_mp /= len(loss_mp_real) loss_mrd /= len(loss_mrd_real) - loss = loss_mp + self.hparams.mrd_loss_coeff * loss_mrd + loss = loss_mp + self.hparams.mrd_loss_coeff * loss_mrd + loss_dac self.log("discriminator/total", loss, prog_bar=True) self.log("discriminator/multi_period_loss", loss_mp) self.log("discriminator/multi_res_loss", loss_mrd) + self.log("discriminator/dac", loss_dac) return loss # train generator if optimizer_idx == 1: audio_hat, commit_loss = self(audio_input, **kwargs) if self.train_discriminator: + + loss_dac_1,loss_dac_2 = self.dacdiscriminator.generator_loss(audio_hat.unsqueeze(1),audio_input.unsqueeze(1)) _, gen_score_mp, fmap_rs_mp, fmap_gs_mp = self.multiperioddisc( y=audio_input, y_hat=audio_hat, **kwargs, ) @@ -155,6 +169,8 @@ def training_step(self, batch, batch_idx, optimizer_idx, **kwargs): self.log("generator/multi_res_loss", loss_gen_mrd) self.log("generator/feature_matching_mp", loss_fm_mp) self.log("generator/feature_matching_mrd", loss_fm_mrd) + self.log("generator/loss_dac_1", loss_dac_1) + self.log("generator/loss_dac_2", loss_dac_2) else: loss_gen_mp = loss_gen_mrd = loss_fm_mp = loss_fm_mrd = 0 @@ -166,6 +182,8 @@ def training_step(self, batch, batch_idx, optimizer_idx, **kwargs): + self.hparams.mrd_loss_coeff * loss_fm_mrd + self.mel_loss_coeff * mel_loss + 1000 * commit_loss + + loss_dac_1 + + loss_dac_2 ) self.log("generator/total_loss", loss, prog_bar=True) diff --git a/vocos/feature_extractors.py b/languagecodec_decoder/feature_extractors.py similarity index 86% rename from vocos/feature_extractors.py rename to languagecodec_decoder/feature_extractors.py index 67b83cb..daf4f15 100644 --- a/vocos/feature_extractors.py +++ b/languagecodec_decoder/feature_extractors.py @@ -5,10 +5,10 @@ # from encodec import EncodecModel from torch import nn import math -from vocos.modules import safe_log -from encodec.modules import SEANetEncoder, SEANetDecoder +from languagecodec_decoder.modules import safe_log +from languagecodec_encoder.modules import SEANetEncoder, SEANetDecoder from encodec import EncodecModel -from encodec.quantization import ResidualVectorQuantizer +from languagecodec_encoder.quantization import ResidualVectorQuantizer class FeatureExtractor(nn.Module): @@ -70,7 +70,7 @@ def __init__( dimension=128, channels=1, n_filters=32, ratios=[8, 5, 4, 2], activation='ELU', kernel_size=7, residual_kernel_size=3, last_kernel_size=7, dilation_base=2, true_skip=False, compress=2) - quantizer = ResidualVectorQuantizer(dimension=128, n_q=n_q, bins=2048, kmeans_iters=50, + quantizer = ResidualVectorQuantizer(dimension=128, n_q=n_q, bins=1024, kmeans_iters=50, decay=0.99, kmeans_init=True) if encodec_model == "encodec_24khz": self.encodec = EncodecModel(encoder=encoder, decoder=decoder, quantizer=quantizer, @@ -115,3 +115,16 @@ def forward(self, audio: torch.Tensor, bandwidth_id: torch.Tensor): # embeddings_idxs = codes + offsets.view(-1, 1, 1) # features = torch.nn.functional.embedding(embeddings_idxs, self.codebook_weights).sum(dim=0) # return features.transpose(1, 2) + + def infer(self, audio: torch.Tensor, bandwidth_id: torch.Tensor): + if self.training: + self.encodec.train() + + audio = audio.unsqueeze(1) # audio(16,24000) + emb = self.encodec.encoder(audio) + q_res = self.encodec.quantizer.infer(emb, self.frame_rate, bandwidth=self.bandwidths[bandwidth_id]) + quantized = q_res.quantized + codes = q_res.codes + commit_loss = q_res.penalty # codes(8,16,75),features(16,128,75) + + return quantized, codes, commit_loss diff --git a/vocos/heads.py b/languagecodec_decoder/heads.py similarity index 98% rename from vocos/heads.py rename to languagecodec_decoder/heads.py index 21863f1..e870ec7 100644 --- a/vocos/heads.py +++ b/languagecodec_decoder/heads.py @@ -2,8 +2,8 @@ from torch import nn from torchaudio.functional.functional import _hz_to_mel, _mel_to_hz -from vocos.spectral_ops import IMDCT, ISTFT -from vocos.modules import symexp +from languagecodec_decoder.spectral_ops import IMDCT, ISTFT +from languagecodec_decoder.modules import symexp class FourierHead(nn.Module): diff --git a/vocos/helpers.py b/languagecodec_decoder/helpers.py similarity index 100% rename from vocos/helpers.py rename to languagecodec_decoder/helpers.py diff --git a/vocos/loss.py b/languagecodec_decoder/loss.py similarity index 72% rename from vocos/loss.py rename to languagecodec_decoder/loss.py index e6b0ed5..6b3f0fc 100644 --- a/vocos/loss.py +++ b/languagecodec_decoder/loss.py @@ -4,7 +4,9 @@ import torchaudio from torch import nn -from vocos.modules import safe_log +from languagecodec_decoder.modules import safe_log + +import torch.nn.functional as F class MelSpecReconstructionLoss(nn.Module): @@ -112,3 +114,46 @@ def forward(self, fmap_r: List[List[torch.Tensor]], fmap_g: List[List[torch.Tens loss += torch.mean(torch.abs(rl - gl)) return loss + +class DACGANLoss(nn.Module): + """ + Computes a discriminator loss, given a discriminator on + generated waveforms/spectrograms compared to ground truth + waveforms/spectrograms. Computes the loss for both the + discriminator and the generator in separate functions. + """ + + def __init__(self, discriminator): + super().__init__() + self.discriminator = discriminator + + def forward(self, fake, real): + # d_fake = self.discriminator(fake.audio_data) + # d_real = self.discriminator(real.audio_data) + d_fake = self.discriminator(fake) + d_real = self.discriminator(real) + return d_fake, d_real + + def discriminator_loss(self, fake, real): + d_fake, d_real = self.forward(fake.clone().detach(), real) + + loss_d = 0 + for x_fake, x_real in zip(d_fake, d_real): + loss_d += torch.mean(x_fake[-1] ** 2) + loss_d += torch.mean((1 - x_real[-1]) ** 2) + return loss_d + + def generator_loss(self, fake, real): + d_fake, d_real = self.forward(fake, real) + + loss_g = 0 + for x_fake in d_fake: + loss_g += torch.mean((1 - x_fake[-1]) ** 2) + + loss_feature = 0 + + for i in range(len(d_fake)): + for j in range(len(d_fake[i]) - 1): + loss_feature += F.l1_loss(d_fake[i][j], d_real[i][j].detach()) + return loss_g, loss_feature + diff --git a/vocos/models.py b/languagecodec_decoder/models.py similarity index 98% rename from vocos/models.py rename to languagecodec_decoder/models.py index 886a88a..c62622b 100644 --- a/vocos/models.py +++ b/languagecodec_decoder/models.py @@ -4,7 +4,7 @@ from torch import nn from torch.nn.utils import weight_norm -from vocos.modules import ConvNeXtBlock, ResBlock1, AdaLayerNorm +from languagecodec_decoder.modules import ConvNeXtBlock, ResBlock1, AdaLayerNorm class Backbone(nn.Module): diff --git a/vocos/modules.py b/languagecodec_decoder/modules.py similarity index 100% rename from vocos/modules.py rename to languagecodec_decoder/modules.py diff --git a/vocos/pretrained.py b/languagecodec_decoder/pretrained.py similarity index 95% rename from vocos/pretrained.py rename to languagecodec_decoder/pretrained.py index 507a1aa..c0e233e 100644 --- a/vocos/pretrained.py +++ b/languagecodec_decoder/pretrained.py @@ -5,9 +5,9 @@ import yaml from huggingface_hub import hf_hub_download from torch import nn -from vocos.feature_extractors import FeatureExtractor, EncodecFeatures -from vocos.heads import FourierHead -from vocos.models import Backbone +from languagecodec_decoder.feature_extractors import FeatureExtractor, EncodecFeatures +from languagecodec_decoder.heads import FourierHead +from languagecodec_decoder.models import Backbone def instantiate_class(args: Union[Any, Tuple[Any, ...]], init: Dict[str, Any]) -> Any: @@ -24,6 +24,7 @@ def instantiate_class(args: Union[Any, Tuple[Any, ...]], init: Dict[str, Any]) - if not isinstance(args, tuple): args = (args,) class_module, class_name = init["class_path"].rsplit(".", 1) + # breakpoint() module = __import__(class_module, fromlist=[class_name]) args_class = getattr(module, class_name) return args_class(*args, **kwargs) @@ -182,6 +183,13 @@ def encode(self, audio_input: torch.Tensor, **kwargs: Any) -> torch.Tensor: return features,discrete_codes + # 0818 + @torch.inference_mode() + def encode_infer(self, audio_input: torch.Tensor, **kwargs: Any) -> torch.Tensor: + features, discrete_codes, _ = self.feature_extractor.infer(audio_input, **kwargs) + return features,discrete_codes + + @torch.inference_mode() def decode(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor: """ diff --git a/vocos/pretrained_model.py b/languagecodec_decoder/pretrained_model.py similarity index 96% rename from vocos/pretrained_model.py rename to languagecodec_decoder/pretrained_model.py index d8cfdaf..5453e89 100644 --- a/vocos/pretrained_model.py +++ b/languagecodec_decoder/pretrained_model.py @@ -4,10 +4,10 @@ import yaml from huggingface_hub import hf_hub_download from torch import nn -from vocos.feature_extractors import FeatureExtractor, EncodecFeatures -from vocos.heads import FourierHead -from vocos.models import Backbone -from vocos.discriminators import MultiPeriodDiscriminator, MultiResolutionDiscriminator +from languagecodec_decoder.feature_extractors import FeatureExtractor, EncodecFeatures +from languagecodec_decoder.heads import FourierHead +from languagecodec_decoder.models import Backbone +from languagecodec_decoder.discriminators import MultiPeriodDiscriminator, MultiResolutionDiscriminator def instantiate_class(args: Union[Any, Tuple[Any, ...]], init: Dict[str, Any]) -> Any: diff --git a/vocos/spectral_ops.py b/languagecodec_decoder/spectral_ops.py similarity index 100% rename from vocos/spectral_ops.py rename to languagecodec_decoder/spectral_ops.py diff --git a/encodec/__init__.py b/languagecodec_encoder/__init__.py similarity index 100% rename from encodec/__init__.py rename to languagecodec_encoder/__init__.py diff --git a/languagecodec_encoder/__pycache__/__init__.cpython-310.pyc b/languagecodec_encoder/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..071a1a7 Binary files /dev/null and b/languagecodec_encoder/__pycache__/__init__.cpython-310.pyc differ diff --git a/encodec/__pycache__/__init__.cpython-310.pyc b/languagecodec_encoder/__pycache__/__init__.cpython-38.pyc similarity index 68% rename from encodec/__pycache__/__init__.cpython-310.pyc rename to languagecodec_encoder/__pycache__/__init__.cpython-38.pyc index 7980e1e..fd6ecb4 100644 Binary files a/encodec/__pycache__/__init__.cpython-310.pyc and b/languagecodec_encoder/__pycache__/__init__.cpython-38.pyc differ diff --git a/encodec/__pycache__/distrib.cpython-310.pyc b/languagecodec_encoder/__pycache__/distrib.cpython-310.pyc similarity index 94% rename from encodec/__pycache__/distrib.cpython-310.pyc rename to languagecodec_encoder/__pycache__/distrib.cpython-310.pyc index 92f02a7..2c7488a 100644 Binary files a/encodec/__pycache__/distrib.cpython-310.pyc and b/languagecodec_encoder/__pycache__/distrib.cpython-310.pyc differ diff --git a/languagecodec_encoder/__pycache__/distrib.cpython-38.pyc b/languagecodec_encoder/__pycache__/distrib.cpython-38.pyc new file mode 100644 index 0000000..e3be365 Binary files /dev/null and b/languagecodec_encoder/__pycache__/distrib.cpython-38.pyc differ diff --git a/encodec/__pycache__/model.cpython-310.pyc b/languagecodec_encoder/__pycache__/model.cpython-310.pyc similarity index 96% rename from encodec/__pycache__/model.cpython-310.pyc rename to languagecodec_encoder/__pycache__/model.cpython-310.pyc index 1ccceb9..6aa6ac9 100644 Binary files a/encodec/__pycache__/model.cpython-310.pyc and b/languagecodec_encoder/__pycache__/model.cpython-310.pyc differ diff --git a/languagecodec_encoder/__pycache__/model.cpython-38.pyc b/languagecodec_encoder/__pycache__/model.cpython-38.pyc new file mode 100644 index 0000000..6de05a0 Binary files /dev/null and b/languagecodec_encoder/__pycache__/model.cpython-38.pyc differ diff --git a/encodec/__pycache__/utils.cpython-310.pyc b/languagecodec_encoder/__pycache__/utils.cpython-310.pyc similarity index 91% rename from encodec/__pycache__/utils.cpython-310.pyc rename to languagecodec_encoder/__pycache__/utils.cpython-310.pyc index 0a2cf33..faf9e7f 100644 Binary files a/encodec/__pycache__/utils.cpython-310.pyc and b/languagecodec_encoder/__pycache__/utils.cpython-310.pyc differ diff --git a/languagecodec_encoder/__pycache__/utils.cpython-38.pyc b/languagecodec_encoder/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000..23b71d7 Binary files /dev/null and b/languagecodec_encoder/__pycache__/utils.cpython-38.pyc differ diff --git a/encodec/distrib.py b/languagecodec_encoder/distrib.py similarity index 100% rename from encodec/distrib.py rename to languagecodec_encoder/distrib.py diff --git a/encodec/model.py b/languagecodec_encoder/model.py similarity index 100% rename from encodec/model.py rename to languagecodec_encoder/model.py diff --git a/encodec/modules/__init__.py b/languagecodec_encoder/modules/__init__.py similarity index 100% rename from encodec/modules/__init__.py rename to languagecodec_encoder/modules/__init__.py diff --git a/languagecodec_encoder/modules/__pycache__/__init__.cpython-310.pyc b/languagecodec_encoder/modules/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..bf8dc55 Binary files /dev/null and b/languagecodec_encoder/modules/__pycache__/__init__.cpython-310.pyc differ diff --git a/encodec/modules/__pycache__/__init__.cpython-310.pyc b/languagecodec_encoder/modules/__pycache__/__init__.cpython-38.pyc similarity index 93% rename from encodec/modules/__pycache__/__init__.cpython-310.pyc rename to languagecodec_encoder/modules/__pycache__/__init__.cpython-38.pyc index f11187e..8409379 100644 Binary files a/encodec/modules/__pycache__/__init__.cpython-310.pyc and b/languagecodec_encoder/modules/__pycache__/__init__.cpython-38.pyc differ diff --git a/encodec/modules/__pycache__/conv.cpython-310.pyc b/languagecodec_encoder/modules/__pycache__/conv.cpython-310.pyc similarity index 95% rename from encodec/modules/__pycache__/conv.cpython-310.pyc rename to languagecodec_encoder/modules/__pycache__/conv.cpython-310.pyc index 2206304..466b8d4 100644 Binary files a/encodec/modules/__pycache__/conv.cpython-310.pyc and b/languagecodec_encoder/modules/__pycache__/conv.cpython-310.pyc differ diff --git a/languagecodec_encoder/modules/__pycache__/conv.cpython-38.pyc b/languagecodec_encoder/modules/__pycache__/conv.cpython-38.pyc new file mode 100644 index 0000000..751a4cf Binary files /dev/null and b/languagecodec_encoder/modules/__pycache__/conv.cpython-38.pyc differ diff --git a/encodec/modules/__pycache__/lstm.cpython-310.pyc b/languagecodec_encoder/modules/__pycache__/lstm.cpython-310.pyc similarity index 80% rename from encodec/modules/__pycache__/lstm.cpython-310.pyc rename to languagecodec_encoder/modules/__pycache__/lstm.cpython-310.pyc index 5750497..1b80458 100644 Binary files a/encodec/modules/__pycache__/lstm.cpython-310.pyc and b/languagecodec_encoder/modules/__pycache__/lstm.cpython-310.pyc differ diff --git a/languagecodec_encoder/modules/__pycache__/lstm.cpython-38.pyc b/languagecodec_encoder/modules/__pycache__/lstm.cpython-38.pyc new file mode 100644 index 0000000..1b10457 Binary files /dev/null and b/languagecodec_encoder/modules/__pycache__/lstm.cpython-38.pyc differ diff --git a/encodec/modules/__pycache__/norm.cpython-310.pyc b/languagecodec_encoder/modules/__pycache__/norm.cpython-310.pyc similarity index 62% rename from encodec/modules/__pycache__/norm.cpython-310.pyc rename to languagecodec_encoder/modules/__pycache__/norm.cpython-310.pyc index 0ae8d13..4777e02 100644 Binary files a/encodec/modules/__pycache__/norm.cpython-310.pyc and b/languagecodec_encoder/modules/__pycache__/norm.cpython-310.pyc differ diff --git a/languagecodec_encoder/modules/__pycache__/norm.cpython-38.pyc b/languagecodec_encoder/modules/__pycache__/norm.cpython-38.pyc new file mode 100644 index 0000000..c9a4f2d Binary files /dev/null and b/languagecodec_encoder/modules/__pycache__/norm.cpython-38.pyc differ diff --git a/encodec/modules/__pycache__/seanet.cpython-310.pyc b/languagecodec_encoder/modules/__pycache__/seanet.cpython-310.pyc similarity index 97% rename from encodec/modules/__pycache__/seanet.cpython-310.pyc rename to languagecodec_encoder/modules/__pycache__/seanet.cpython-310.pyc index fd32c56..edcca1a 100644 Binary files a/encodec/modules/__pycache__/seanet.cpython-310.pyc and b/languagecodec_encoder/modules/__pycache__/seanet.cpython-310.pyc differ diff --git a/languagecodec_encoder/modules/__pycache__/seanet.cpython-38.pyc b/languagecodec_encoder/modules/__pycache__/seanet.cpython-38.pyc new file mode 100644 index 0000000..6ba48c2 Binary files /dev/null and b/languagecodec_encoder/modules/__pycache__/seanet.cpython-38.pyc differ diff --git a/encodec/modules/__pycache__/transformer.cpython-310.pyc b/languagecodec_encoder/modules/__pycache__/transformer.cpython-310.pyc similarity index 88% rename from encodec/modules/__pycache__/transformer.cpython-310.pyc rename to languagecodec_encoder/modules/__pycache__/transformer.cpython-310.pyc index a2ad4b6..4d1d56b 100644 Binary files a/encodec/modules/__pycache__/transformer.cpython-310.pyc and b/languagecodec_encoder/modules/__pycache__/transformer.cpython-310.pyc differ diff --git a/languagecodec_encoder/modules/__pycache__/transformer.cpython-38.pyc b/languagecodec_encoder/modules/__pycache__/transformer.cpython-38.pyc new file mode 100644 index 0000000..c92d179 Binary files /dev/null and b/languagecodec_encoder/modules/__pycache__/transformer.cpython-38.pyc differ diff --git a/encodec/modules/conv.py b/languagecodec_encoder/modules/conv.py similarity index 100% rename from encodec/modules/conv.py rename to languagecodec_encoder/modules/conv.py diff --git a/encodec/modules/lstm.py b/languagecodec_encoder/modules/lstm.py similarity index 100% rename from encodec/modules/lstm.py rename to languagecodec_encoder/modules/lstm.py diff --git a/encodec/modules/norm.py b/languagecodec_encoder/modules/norm.py similarity index 100% rename from encodec/modules/norm.py rename to languagecodec_encoder/modules/norm.py diff --git a/encodec/modules/seanet.py b/languagecodec_encoder/modules/seanet.py similarity index 100% rename from encodec/modules/seanet.py rename to languagecodec_encoder/modules/seanet.py diff --git a/encodec/modules/transformer.py b/languagecodec_encoder/modules/transformer.py similarity index 98% rename from encodec/modules/transformer.py rename to languagecodec_encoder/modules/transformer.py index b716527..44b4791 100644 --- a/encodec/modules/transformer.py +++ b/languagecodec_encoder/modules/transformer.py @@ -67,7 +67,7 @@ class StreamingTransformerEncoder(nn.Module): hidden_scale (int): intermediate dimension of FF module is this times the dimension. num_heads (int): number of heads. num_layers (int): number of layers. - max_period (float): maximum period of cosines in the positional embedding. + max_period (float): maxium period of cosines in the positional embedding. past_context (int or None): receptive field for the causal mask, infinite if None. gelu (bool): if true uses GeLUs, otherwise use ReLUs. norm_in (bool): normalize the input. diff --git a/encodec/msstftd.py b/languagecodec_encoder/msstftd.py similarity index 100% rename from encodec/msstftd.py rename to languagecodec_encoder/msstftd.py diff --git a/encodec/quantization/__init__.py b/languagecodec_encoder/quantization/__init__.py similarity index 100% rename from encodec/quantization/__init__.py rename to languagecodec_encoder/quantization/__init__.py diff --git a/languagecodec_encoder/quantization/__pycache__/__init__.cpython-310.pyc b/languagecodec_encoder/quantization/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..5be4c13 Binary files /dev/null and b/languagecodec_encoder/quantization/__pycache__/__init__.cpython-310.pyc differ diff --git a/languagecodec_encoder/quantization/__pycache__/__init__.cpython-38.pyc b/languagecodec_encoder/quantization/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..a12af2f Binary files /dev/null and b/languagecodec_encoder/quantization/__pycache__/__init__.cpython-38.pyc differ diff --git a/encodec/quantization/__pycache__/core_vq.cpython-310.pyc b/languagecodec_encoder/quantization/__pycache__/core_vq.cpython-310.pyc similarity index 60% rename from encodec/quantization/__pycache__/core_vq.cpython-310.pyc rename to languagecodec_encoder/quantization/__pycache__/core_vq.cpython-310.pyc index 6dbcc11..e23e7f7 100644 Binary files a/encodec/quantization/__pycache__/core_vq.cpython-310.pyc and b/languagecodec_encoder/quantization/__pycache__/core_vq.cpython-310.pyc differ diff --git a/languagecodec_encoder/quantization/__pycache__/core_vq.cpython-38.pyc b/languagecodec_encoder/quantization/__pycache__/core_vq.cpython-38.pyc new file mode 100644 index 0000000..a4b50fb Binary files /dev/null and b/languagecodec_encoder/quantization/__pycache__/core_vq.cpython-38.pyc differ diff --git a/encodec/quantization/__pycache__/vq.cpython-310.pyc b/languagecodec_encoder/quantization/__pycache__/vq.cpython-310.pyc similarity index 59% rename from encodec/quantization/__pycache__/vq.cpython-310.pyc rename to languagecodec_encoder/quantization/__pycache__/vq.cpython-310.pyc index a04d98a..9942a9e 100644 Binary files a/encodec/quantization/__pycache__/vq.cpython-310.pyc and b/languagecodec_encoder/quantization/__pycache__/vq.cpython-310.pyc differ diff --git a/languagecodec_encoder/quantization/__pycache__/vq.cpython-38.pyc b/languagecodec_encoder/quantization/__pycache__/vq.cpython-38.pyc new file mode 100644 index 0000000..c139b8c Binary files /dev/null and b/languagecodec_encoder/quantization/__pycache__/vq.cpython-38.pyc differ diff --git a/encodec/quantization/ac.py b/languagecodec_encoder/quantization/ac.py similarity index 100% rename from encodec/quantization/ac.py rename to languagecodec_encoder/quantization/ac.py diff --git a/encodec/quantization/core_vq.py b/languagecodec_encoder/quantization/core_vq.py similarity index 100% rename from encodec/quantization/core_vq.py rename to languagecodec_encoder/quantization/core_vq.py diff --git a/encodec/quantization/vq.py b/languagecodec_encoder/quantization/vq.py similarity index 77% rename from encodec/quantization/vq.py rename to languagecodec_encoder/quantization/vq.py index 82521ed..851b0c1 100644 --- a/encodec/quantization/vq.py +++ b/languagecodec_encoder/quantization/vq.py @@ -56,6 +56,11 @@ def __init__( self.kmeans_init = kmeans_init self.kmeans_iters = kmeans_iters self.threshold_ema_dead_code = threshold_ema_dead_code + + # print(self.bins) + + # breakpoint() + self.vq = LanguageVectorQuantization( dim=self.dimension, codebook_size=self.bins, @@ -91,16 +96,46 @@ def forward(self, x: torch.Tensor, frame_rate: int, bandwidth: tp.Optional[float n_q = self.get_num_quantizers_for_bandwidth(frame_rate, bandwidth) # assert n_q==4 # breakpoint() - nq_choice=[3,4,8] + # nq_choice=[3,4,8] + nq_choice=[3,4,5,6,7,8] if self.training: - choice = int(torch.randint(0, 3, (1,)).item()) + # choice = int(torch.randint(0, 3, (1,)).item()) + choice = int(torch.randint(0, 6, (1,)).item()) # breakpoint() n_q=nq_choice[choice] + # breakpoint() # n_q=8 quantized, codes, commit_loss = self.vq(x, n_q=n_q) bw = torch.tensor(n_q * bw_per_q).to(x) return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss)) + def infer(self, x: torch.Tensor, frame_rate: int, bandwidth: tp.Optional[float] = None) -> QuantizedResult: + """Residual vector quantization on the given input tensor. + Args: + x (torch.Tensor): Input tensor. + frame_rate (int): Sample rate of the input tensor. + bandwidth (float): Target bandwidth. + Returns: + QuantizedResult: + The quantized (or approximately quantized) representation with + the associated bandwidth and any penalty term for the loss. + """ + bw_per_q = self.get_bandwidth_per_quantizer(frame_rate) + n_q = self.get_num_quantizers_for_bandwidth(frame_rate, bandwidth) + # assert n_q==4 + # breakpoint() + # nq_choice=[3,4,8] + nq_choice=[3,4,5,6,7,8] + if self.training: + # choice = int(torch.randint(0, 3, (1,)).item()) + choice = int(torch.randint(0, 6, (1,)).item()) + # breakpoint() + n_q=nq_choice[choice] + n_q=8 + quantized, codes, commit_loss = self.vq(x, n_q=n_q) + bw = torch.tensor(n_q * bw_per_q).to(x) + return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss)) + def get_num_quantizers_for_bandwidth(self, frame_rate: int, bandwidth: tp.Optional[float] = None) -> int: """Return n_q based on specified target bandwidth. """ diff --git a/encodec/utils.py b/languagecodec_encoder/utils.py similarity index 100% rename from encodec/utils.py rename to languagecodec_encoder/utils.py diff --git a/result.png b/result.png new file mode 100644 index 0000000..da29da7 Binary files /dev/null and b/result.png differ diff --git a/vocos/__init__.py b/vocos/__init__.py deleted file mode 100644 index f363437..0000000 --- a/vocos/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from vocos.pretrained import Vocos - - -__version__ = "0.0.3" diff --git a/vocos/__pycache__/__init__.cpython-310.pyc b/vocos/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index dbca3d2..0000000 Binary files a/vocos/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/vocos/__pycache__/experiment.cpython-310.pyc b/vocos/__pycache__/experiment.cpython-310.pyc deleted file mode 100644 index 2211de2..0000000 Binary files a/vocos/__pycache__/experiment.cpython-310.pyc and /dev/null differ