undate

jishengpeng · Feb 20, 2024 · a293e9b · a293e9b
1 parent d1e74a7
commit a293e9b
Show file tree

Hide file tree

Showing 73 changed files with 10,688 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,133 @@
-# languagecodec
-Official code repository of Language-Codec
+# Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models
+
+[Audio samples](https://languagecodec.github.io) |
+Paper [[abs]](https://arxiv.org/abs/2402.12208) [[pdf]](https://arxiv.org/pdf/2402.12208.pdf)
+
+
+## Installation
+
+To use Language-Codec, install it using:
+
+```bash
+conda create -n xxx python=3.8
+conda activate xxx
+pip install -r requirement.txt
+```
+
+## Infer
+
+### Part1: Reconstruct audio from raw wav
+
+```python
+
+from encodec.utils import convert_audio
+import torchaudio
+import torch
+from vocos.pretrained import Vocos
+
+device=torch.device('cpu')
+
+config_path = "xxx/languagecodec/configs/languagecodec.yaml"
+model_path = "xxx/xxx.ckpt"
+audio_path = "xxx"  # input wav to reconstruct
+audio_outpath = "xxx"
+vocos = Vocos.from_pretrained0802(config_path, model_path)
+vocos = vocos.to(device)
+
+wav, sr = torchaudio.load(audio_path)
+wav = convert_audio(wav, sr, 24000, 1) 
+bandwidth_id = torch.tensor([0])
+wav=wav.to(device)
+features,discrete_code= vocos.encode(wav, bandwidth_id=bandwidth_id)
+audio_out = vocos.decode(features, bandwidth_id=bandwidth_id) 
+torchaudio.save(audio_outpath, audio_out, sample_rate=24000, encoding='PCM_S', bits_per_sample=16)
+```
+
+
+### Part2: Generating Discrete Codecs
+```python
+
+from encodec.utils import convert_audio
+import torchaudio
+import torch
+from vocos.pretrained import Vocos
+
+device=torch.device('cpu')
+
+config_path = "xxx/languagecodec/configs/languagecodec.yaml"
+model_path = "xxx/xxx.ckpt"
+audio_path = "xxx"  # input wav to encode
+vocos = Vocos.from_pretrained0802(config_path, model_path)
+vocos = vocos.to(device)
+
+wav, sr = torchaudio.load(audio_path)
+wav = convert_audio(wav, sr, 24000, 1) 
+bandwidth_id = torch.tensor([0])
+wav=wav.to(device)
+_,discrete_code= vocos.encode(wav, bandwidth_id=bandwidth_id)
+print(discrete_code)
+```
+
+
+
+### Part3: Audio reconstruction through codecs
+```python
+# audio_tokens [n_q,1,t]/[n_q,t]
+features = vocos.codes_to_features(audio_tokens)
+bandwidth_id = torch.tensor([0])  
+audio_out = vocos.decode(features, bandwidth_id=bandwidth_id)
+```
+
+
+
+
+## Pre-trained models
+
+Currently, we have only released the results from our paper, and we plan to release additional checkpoints trained on a larger training dataset within the next two months.
+
+| Model Name                                                                          | Dataset       | Training Iterations |
+|-------------------------------------------------------------------------------------|---------------|---------------------|
+| [languagecodec_paper_8nq](https://huggingface.co/charactr/vocos-mel-24khz)         | 3W Hours      | 2.0 M               |
+<!-- | [charactr/vocos-encodec-24khz](https://huggingface.co/charactr/vocos-encodec-24khz) | DNS Challenge | 2.5 M               | 7.9 M       -->
+
+## Training
+
+### Step1: Prepare train dataset
+```python
+# Process the data into a form similar to xxx/languagecodec/data/libritts_testother.txt
+```
+
+### Step2: Modifying configuration files
+```python
+# xxx/languagecodec/configs/languagecodec.yaml
+# Modify the values of parameters such as batch_size, filelist_path, save_dir, device
+```
+
+### Step3: Start training process
+Refer to the [PyTorch Lightning documentation](https://lightning.ai/docs/pytorch/stable/) for details about customizing the
+training pipeline.
+
+```bash
+cd xxx/languagecodec
+python train.py fit --config xxx/languagecodec/configs/languagecodec.yaml
+```
+
+
+
+## Citation
+
+If this code contributes to your research, please cite our work:
+
+```
+@misc{ji2024languagecodec,
+      title={Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models}, 
+      author={Shengpeng Ji and Minghui Fang and Ziyue Jiang and Rongjie Huang and Jialung Zuo and Shulei Wang and Zhou Zhao},
+      year={2024},
+      eprint={2402.12208},
+      archivePrefix={arXiv},
+      primaryClass={eess.AS}
+}
+```
+
+## License
+
+The code in this repository is released under the MIT license as found in the
+[LICENSE](LICENSE) file.
diff --git a/baselines/infer_encodec.py b/baselines/infer_encodec.py
@@ -0,0 +1,77 @@
+from encodec import EncodecModel
+from encodec.utils import convert_audio
+
+import torchaudio
+import torch
+
+import os
+
+import logging
+
+infer_log_path="/home/jovyan/honor/big-disk/speech/code/languagecodec/测试比较encodec_encodec.log"
+
+# Start each run with an empty log file; on a first run there is nothing to delete.
+if os.path.exists(infer_log_path):
+    os.remove(infer_log_path)
+
+# Log line layout (the Chinese labels read: time / log level / log message).
+LOG_FORMAT = "时间: %(asctime)s - 日志等级: %(levelname)s - 日志信息: %(message)s"
+# Configure the root logger: level and output format, written to the log file.
+logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, filename=infer_log_path)
+
+
+# Instantiate a pretrained EnCodec model
+model = EncodecModel.encodec_model_24khz()
+# The number of codebooks used will be determined by the bandwidth selected.
+# E.g. for a bandwidth of 6kbps, `n_q = 8` codebooks are used.
+# Supported bandwidths are 1.5kbps (n_q = 2), 3 kbps (n_q = 4), 6 kbps (n_q = 8) and 12 kbps (n_q =16) and 24kbps (n_q=32).
+# For the 48 kHz model, only 3, 6, 12, and 24 kbps are supported. The number
+# of codebooks for each is half that of the 24 kHz model as the frame rate is twice as much.
+model = model.cuda()
+model.set_target_bandwidth(12.0)
+
+# Load and pre-process the audio waveform
+# wav, sr = torchaudio.load("<PATH_TO_AUDIO_FILE>")
+input_path = "/home/jovyan/honor/big-disk/speech/code/languagecodec/data/infer/lirbitts_vctk_testclean_500"
+out_folder = '/home/jovyan/honor/big-disk/speech/code/languagecodec/result/infer/encodec'
+# os.system("rm -r %s"%(out_folder))
+# os.system("mkdir -p %s"%(out_folder))
+# ll="libritts_testclean500_nq16"
+ll = "debug"
+
+tmptmp=out_folder+"/"+ll
+
+# Recreate the per-run output folder from scratch.
+os.system("rm -r %s"%(tmptmp))
+os.system("mkdir -p %s"%(tmptmp))
+
+# Upper bound on how many utterances from the file list are encoded.
+MAX_FILES = 20
+
+with open(input_path,'r') as fin:
+    # One audio path per line; blank lines are dropped so they are never loaded.
+    x = [line.strip() for line in fin if line.strip()]
+
+# Slicing (rather than range(20)) keeps a list shorter than MAX_FILES from
+# raising IndexError.
+for i, audio_file in enumerate(x[:MAX_FILES]):
+
+    print(i)
+
+    wav, sr = torchaudio.load(audio_file)
+
+    wav = wav.cuda()
+    wav = convert_audio(wav, sr, model.sample_rate, model.channels)
+    wav = wav.unsqueeze(0)
+
+    # Extract discrete codes from EnCodec
+    with torch.no_grad():
+        encoded_frames = model.encode(wav)
+    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)  # [B, n_q, T]
+
+    # When there are more than 100 frames only the first two codebooks are
+    # logged (presumably to keep the log short); otherwise all codes are logged.
+    if(codes.size()[2]>100):
+        logging.info(f"{audio_file}|{codes[:,:2,:]}")
+    else:
+        logging.info(f"{audio_file}|{codes}")
+
+    # frames=[(codes,None)]
+
+    # with torch.no_grad():
+    #     wav_output = model.decode(frames)
+
+    # audio_path = out_folder + '/' + ll + '/' + x[i].split('/')[-1]
+
+    # torchaudio.save(audio_path,wav_output.squeeze(0).cpu(),model.sample_rate)
diff --git a/code/datasplit.py b/code/datasplit.py
@@ -0,0 +1,45 @@
+# Split the librilight data into segments of eight seconds each.
+#
+# For every input file:
+#   * each full 8-second window is written as <id>_<k>.flac (k starts at 1);
+#   * the trailing remainder is written as the next index, but only when it is
+#     strictly longer than one second;
+#   * files of one second or less are skipped entirely.
+
+import os
+import glob
+import torchaudio
+import torch
+
+input_path="/home/jovyan/honor/big-disk/speech/Data/librilight/small/small"
+output_path="/home/jovyan/honor/big-disk/speech/Data/librilight/small/small_split"
+
+SEGMENT_SECONDS = 8  # duration of every full segment
+MIN_SECONDS = 1      # a clip must be strictly longer than this to be saved
+
+flac_files=glob.glob(os.path.join(input_path,"*/*/*.flac"))
+
+# Rebuild the output tree from scratch on every run.
+os.system("rm -r %s"%(output_path))
+os.makedirs(output_path, exist_ok=True)
+
+for src_path in flac_files:
+    wav, sr = torchaudio.load(src_path)
+    total = wav.size()[1]
+    seg_len = sr * SEGMENT_SECONDS
+    min_len = sr * MIN_SECONDS
+
+    # Nothing worth keeping: do not even create the output folder.
+    if total <= min_len:
+        continue
+
+    # The output sub-folder is the directory two levels above the file.
+    name = src_path.split('/')[-3]
+    utt_id = src_path.split('/')[-1].split('.')[0]
+    out_folder = os.path.join(output_path, name)
+    os.makedirs(out_folder, exist_ok=True)
+
+    # Slice along the time axis so every channel stays in its segment
+    # (reshape(-1, seg_len) would turn extra channels into separate mono files).
+    n_full = total // seg_len
+    for k in range(n_full):
+        audio_path = os.path.join(out_folder, f"{utt_id}_{k + 1}.flac")
+        torchaudio.save(audio_path, wav[:, k * seg_len:(k + 1) * seg_len], sample_rate=sr)
+
+    # Trailing remainder (the whole clip when it is shorter than one segment).
+    wav_left = wav[:, n_full * seg_len:]
+    if wav_left.size()[1] > min_len:
+        audio_path = os.path.join(out_folder, f"{utt_id}_{n_full + 1}.flac")
+        torchaudio.save(audio_path, wav_left, sample_rate=sr)
+
+
+
+
diff --git a/code/datasplit2.py b/code/datasplit2.py
@@ -0,0 +1,45 @@
+# Split the DNS clean training recordings into segments of eight seconds each.
+#
+# For every input file:
+#   * each full 8-second window is written as <id>_<k>.wav (k starts at 1);
+#   * the trailing remainder is written as the next index, but only when it is
+#     strictly longer than one second;
+#   * files of one second or less are skipped entirely.
+
+import os
+import glob
+import torchaudio
+import torch
+
+input_path="/home/jovyan/honor/big-disk/speech/Data/DNS/pdns_training_set/raw/clean"
+output_path="/home/jovyan/honor/big-disk/speech/Data/DNS/trainclean_split"
+
+SEGMENT_SECONDS = 8  # duration of every full segment
+MIN_SECONDS = 1      # a clip must be strictly longer than this to be saved
+
+wav_files=glob.glob(os.path.join(input_path,"*/*.wav"))
+
+# The output tree is NOT wiped here; folders are created on demand below and
+# files with the same name are overwritten.
+# os.system("rm -r %s"%(output_path))
+# os.system("mkdir -p %s"%(output_path))
+
+for src_path in wav_files:
+    wav, sr = torchaudio.load(src_path)
+    total = wav.size()[1]
+    seg_len = sr * SEGMENT_SECONDS
+    min_len = sr * MIN_SECONDS
+
+    # Nothing worth keeping: do not even create the output folder.
+    if total <= min_len:
+        continue
+
+    # The output sub-folder is the directory that directly contains the file.
+    name = src_path.split('/')[-2]
+    utt_id = src_path.split('/')[-1].split('.')[0]
+    out_folder = os.path.join(output_path, name)
+    os.makedirs(out_folder, exist_ok=True)
+
+    # Slice along the time axis so every channel stays in its segment
+    # (reshape(-1, seg_len) would turn extra channels into separate mono files).
+    n_full = total // seg_len
+    for k in range(n_full):
+        audio_path = os.path.join(out_folder, f"{utt_id}_{k + 1}.wav")
+        torchaudio.save(audio_path, wav[:, k * seg_len:(k + 1) * seg_len], sample_rate=sr)
+
+    # Trailing remainder (the whole clip when it is shorter than one segment).
+    wav_left = wav[:, n_full * seg_len:]
+    if wav_left.size()[1] > min_len:
+        audio_path = os.path.join(out_folder, f"{utt_id}_{n_full + 1}.wav")
+        torchaudio.save(audio_path, wav_left, sample_rate=sr)
+
+
+
+
diff --git a/code/getdata.py b/code/getdata.py
@@ -0,0 +1,12 @@
+# Download Common Voice.
+
+from datasets import load_dataset
+
+
+print(0)
+
+# Presumably the language configurations the dataset offers (candidates for LANGUAGE) -- confirm against the dataset card:
+# ['ab', 'af', 'am', 'ar', 'as', 'ast', 'az', 'ba', 'bas', 'be', 'bg', 'bn', 'br', 'ca', 'ckb', 'cnh', 'cs', 'cv', 'cy', 'da', 'de', 'dv', 'dyu', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'fy-NL', 'ga-IE', 'gl', 'gn', 'ha', 'he', 'hi', 'hsb', 'hu', 'hy-AM', 'ia', 'id', 'ig', 'is', 'it', 'ja', 'ka', 'kab', 'kk', 'kmr', 'ko', 'ky', 'lg', 'lij', 'lo', 'lt', 'ltg', 'lv', 'mdf', 'mhr', 'mk', 'ml', 'mn', 'mr', 'mrj', 'mt', 'myv', 'nan-tw', 'ne-NP', 'nhi', 'nl', 'nn-NO', 'oc', 'or', 'os', 'pa-IN', 'pl', 'ps', 'pt', 'quy', 'rm-sursilv', 'rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sat', 'sc', 'sk', 'skr', 'sl', 'sq', 'sr', 'sv-SE', 'sw', 'ta', 'te', 'th', 'ti', 'tig', 'tk', 'tok', 'tr', 'tt', 'tw', 'ug', 'uk', 'ur', 'uz', 'vi', 'vot', 'yi', 'yo', 'yue', 'zgh', 'zh-CN', 'zh-HK', 'zh-TW']
+
+DATASET_REPO = "mozilla-foundation/common_voice_16_1"
+LANGUAGE = "hu"
+
+# Fetch the selected language configuration of the dataset.
+ds = load_dataset(DATASET_REPO, LANGUAGE)
+
+# Reference URL kept from the original script:
+# https://huggingface.co/datasets/reach-vb/common_voice_16_1/resolve/main/audio/af/train/af_train_0.tar