Skip to content

Commit

Permalink
undate
Browse files Browse the repository at this point in the history
  • Loading branch information
novateurjsp committed Feb 20, 2024
1 parent d1e74a7 commit a293e9b
Show file tree
Hide file tree
Showing 73 changed files with 10,688 additions and 2 deletions.
135 changes: 133 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,133 @@
# languagecodec
Official code repository of Language-Codec
# Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models

[Audio samples](https://languagecodec.github.io) |
Paper [[abs]](https://arxiv.org/abs/2402.12208) [[pdf]](https://arxiv.org/pdf/2402.12208.pdf)


## Installation

To use Language-Codec, install it using:

```bash
conda create -n xxx python=3.8
conda activate xxx
pip install -r requirement.txt
```

## Infer

### Part1: Reconstruct audio from raw wav

```python

from encodec.utils import convert_audio
import torchaudio
import torch
from vocos.pretrained import Vocos

# Switch to torch.device('cuda') to run on a GPU; every tensor below follows it.
device = torch.device('cpu')

config_path = "xxx/languagecodec/configs/languagecodec.yaml"
model_path = "xxx/xxx.ckpt"
audio_path = "xxx/input.wav"  # wav file to reconstruct
audio_outpath = "xxx"         # where the reconstructed wav is written
vocos = Vocos.from_pretrained0802(config_path, model_path)
vocos = vocos.to(device)

wav, sr = torchaudio.load(audio_path)
# Convert to the 24 kHz, single-channel layout passed to the codec below.
wav = convert_audio(wav, sr, 24000, 1)
wav = wav.to(device)
# Created on the same device as the model so it can be used alongside `wav`.
bandwidth_id = torch.tensor([0], device=device)

# Inference only: no autograd graph is needed, and the result can be saved directly.
with torch.no_grad():
    features, discrete_code = vocos.encode(wav, bandwidth_id=bandwidth_id)
    audio_out = vocos.decode(features, bandwidth_id=bandwidth_id)

# Move the result back to the CPU before writing it to disk.
torchaudio.save(audio_outpath, audio_out.cpu(), sample_rate=24000, encoding='PCM_S', bits_per_sample=16)
```


### Part2: Generating Discrete Codecs
```python

from encodec.utils import convert_audio
import torchaudio
import torch
from vocos.pretrained import Vocos

# Switch to torch.device('cuda') to run on a GPU; every tensor below follows it.
device = torch.device('cpu')

config_path = "xxx/languagecodec/configs/languagecodec.yaml"
model_path = "xxx/xxx.ckpt"
audio_path = "xxx/input.wav"  # wav file to tokenize
vocos = Vocos.from_pretrained0802(config_path, model_path)
vocos = vocos.to(device)

wav, sr = torchaudio.load(audio_path)
# Convert to the 24 kHz, single-channel layout passed to the codec below.
wav = convert_audio(wav, sr, 24000, 1)
wav = wav.to(device)
# Created on the same device as the model so it can be used alongside `wav`.
bandwidth_id = torch.tensor([0], device=device)

# Inference only: no autograd graph is needed to extract the codes.
with torch.no_grad():
    _, discrete_code = vocos.encode(wav, bandwidth_id=bandwidth_id)
print(discrete_code)
```



### Part3: Audio reconstruction through codecs
```python
# Continues from the snippets above: `vocos` is the loaded model and `torch` is imported.
# audio_tokens: discrete codes shaped [n_q, 1, t] or [n_q, t]
# (presumably the `discrete_code` returned by `vocos.encode` in Part2 -- verify the shape).
features = vocos.codes_to_features(audio_tokens)
bandwidth_id = torch.tensor([0])
# Decode the features back into a waveform.
audio_out = vocos.decode(features, bandwidth_id=bandwidth_id)
```




## Pre-trained models

Currently, we have only released the results from our paper, and we plan to release additional checkpoints trained on a larger training dataset within the next two months.

| Model Name | Dataset | Training Iterations |
|------------|---------|---------------------|
| [languagecodec_paper_8nq](https://huggingface.co/charactr/vocos-mel-24khz) | 30,000 Hours | 2.0 M |
<!-- | [charactr/vocos-encodec-24khz](https://huggingface.co/charactr/vocos-encodec-24khz) | DNS Challenge | 2.5 M | 7.9 M -->

## Training

### Step1: Prepare train dataset
```python
# Process the data into a form similar to xxx/languagecodec/data/libritts_testother.txt
```

### Step2: Modifying configuration files
```python
# xxx/languagecodec/configs/languagecodec.yaml
# Modify the values of parameters such as batch_size, filelist_path, save_dir, device
```

### Step3: Start training process
Refer to the [PyTorch Lightning documentation](https://lightning.ai/docs/pytorch/stable/) for details about customizing the
training pipeline.

```bash
cd xxx/languagecodec
python train.py fit --config xxx/languagecodec/configs/languagecodec.yaml
```



## Citation

If this code contributes to your research, please cite our work:

```
@misc{ji2024languagecodec,
title={Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models},
author={Shengpeng Ji and Minghui Fang and Ziyue Jiang and Rongjie Huang and Jialung Zuo and Shulei Wang and Zhou Zhao},
year={2024},
eprint={2402.12208},
archivePrefix={arXiv},
primaryClass={eess.AS}
}
```

## License

The code in this repository is released under the MIT license as found in the
[LICENSE](LICENSE) file.
77 changes: 77 additions & 0 deletions baselines/infer_encodec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from encodec import EncodecModel
from encodec.utils import convert_audio

import torchaudio
import torch

import os

import logging

# Batch-encode a list of audio files with the pretrained 24 kHz EnCodec model and
# write the resulting discrete codes to a log file, one record per input file.

import shutil  # script-local import: replaces the shell `rm -r` / `mkdir -p` calls

infer_log_path="/home/jovyan/honor/big-disk/speech/code/languagecodec/测试比较encodec_encodec.log"

# Start every run with a fresh log. Done through the filesystem API instead of a
# shell string so that paths containing spaces or shell metacharacters are safe.
if os.path.isfile(infer_log_path):
    os.remove(infer_log_path)

# Output format of each log record (runtime string, kept verbatim).
LOG_FORMAT = "时间: %(asctime)s - 日志等级: %(levelname)s - 日志信息: %(message)s"
# Configure the root logger: level and output format.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, filename=infer_log_path)

# Number of entries from the input list that are processed.
MAX_FILES = 20

# Use the GPU when there is one; otherwise fall back to the CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate a pretrained EnCodec model
model = EncodecModel.encodec_model_24khz()
# The number of codebooks used will be determined by the bandwidth selected.
# E.g. for a bandwidth of 6kbps, `n_q = 8` codebooks are used.
# Supported bandwidths are 1.5kbps (n_q = 2), 3 kbps (n_q = 4), 6 kbps (n_q = 8) and 12 kbps (n_q =16) and 24kbps (n_q=32).
# For the 48 kHz model, only 3, 6, 12, and 24 kbps are supported. The number
# of codebooks for each is half that of the 24 kHz model as the frame rate is twice as much.
model = model.to(device)
model.set_target_bandwidth(12.0)

# Text file listing one audio path per line.
input_path = "/home/jovyan/honor/big-disk/speech/code/languagecodec/data/infer/lirbitts_vctk_testclean_500"
out_folder = '/home/jovyan/honor/big-disk/speech/code/languagecodec/result/infer/encodec'
# Name of the sub-folder for this run, e.g. "libritts_testclean500_nq16".
ll = "debug"

tmptmp = out_folder + "/" + ll

# Recreate an empty output folder for this run.
shutil.rmtree(tmptmp, ignore_errors=True)
os.makedirs(tmptmp, exist_ok=True)

with open(input_path, 'r') as fin:
    # Drop surrounding whitespace and skip blank lines, which are not loadable paths.
    x = [line.strip() for line in fin if line.strip()]

# Slicing (rather than indexing up to MAX_FILES) also handles lists shorter than MAX_FILES.
for i, wav_path in enumerate(x[:MAX_FILES]):

    print(i)

    # Load and pre-process the audio waveform
    wav, sr = torchaudio.load(wav_path)

    wav = wav.to(device)
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)
    wav = wav.unsqueeze(0)  # add the batch dimension

    # Extract discrete codes from EnCodec
    with torch.no_grad():
        encoded_frames = model.encode(wav)
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)  # [B, n_q, T]

    # For long utterances log only the first two codebooks to keep records short.
    if codes.size()[2] > 100:
        logging.info(f"{wav_path}|{codes[:,:2,:]}")
    else:
        logging.info(f"{wav_path}|{codes}")

    # Optional: decode the codes back to audio and save it into the run folder.
    # frames = [(codes, None)]
    # with torch.no_grad():
    #     wav_output = model.decode(frames)
    # audio_path = out_folder + '/' + ll + '/' + wav_path.split('/')[-1]
    # torchaudio.save(audio_path, wav_output.squeeze(0).cpu(), model.sample_rate)
45 changes: 45 additions & 0 deletions code/datasplit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Split the Libri-Light recordings into eight-second segments.

import os
import glob
import torchaudio
import torch

input_path="/home/jovyan/honor/big-disk/speech/Data/librilight/small/small"
output_path="/home/jovyan/honor/big-disk/speech/Data/librilight/small/small_split"

# Length of a full segment, and the length a trailing remainder (or a short
# recording) must exceed to be kept, both in seconds.
SEGMENT_SECONDS = 8
MIN_SECONDS = 1


def segment_bounds(num_samples, sr, segment_seconds=SEGMENT_SECONDS, min_seconds=MIN_SECONDS):
    """Return the ``(start, end)`` sample ranges a recording is cut into.

    The recording is divided into consecutive full segments of
    ``sr * segment_seconds`` samples. Whatever is left after the last full
    segment is kept as one extra, shorter range only when it is strictly
    longer than ``sr * min_seconds`` samples. A recording shorter than one
    full segment is therefore kept whole if it is long enough, and dropped
    (empty list) otherwise.

    Args:
        num_samples: Number of samples per channel in the recording.
        sr: Sample rate in Hz.
        segment_seconds: Length of a full segment in seconds.
        min_seconds: Remainders not longer than this many seconds are dropped.

    Returns:
        A list of ``(start, end)`` tuples, ``end`` exclusive, in time order.

    Raises:
        ValueError: If ``sr * segment_seconds`` is not positive.
    """
    segment_len = sr * segment_seconds
    if segment_len <= 0:
        raise ValueError("sr * segment_seconds must be positive")
    full_count = num_samples // segment_len
    bounds = [(k * segment_len, (k + 1) * segment_len) for k in range(full_count)]
    tail_start = full_count * segment_len
    if num_samples - tail_start > sr * min_seconds:
        bounds.append((tail_start, num_samples))
    return bounds


def main():
    """Cut every ``*/*/*.flac`` file under ``input_path`` into segments.

    ``output_path`` is wiped and recreated first. The segments of a file are
    written to ``output_path/<name>/<stem>_<k>.flac`` with ``k`` starting at 1,
    where ``<name>`` is the third-from-last component of the source path and
    ``<stem>`` is the file name up to its first dot.
    """
    import shutil  # function-scope import: only needed to reset the output folder

    aa = glob.glob(os.path.join(input_path, "*/*/*.flac"))

    # Reset the output folder through the filesystem API rather than a shell
    # string, so unusual characters in the path cannot break the command.
    shutil.rmtree(output_path, ignore_errors=True)
    os.makedirs(output_path, exist_ok=True)

    for src in aa:
        wav, sr = torchaudio.load(src)
        bounds = segment_bounds(wav.size()[1], sr)
        if not bounds:
            # Too short to keep; do not create an empty folder for it.
            continue

        name = src.split('/')[-3]
        stem = src.split('/')[-1].split('.')[0]
        # NOTE(review): files with the same stem under one <name> but different
        # middle directories would overwrite each other -- confirm stems are unique.
        out_folder = output_path + "/" + name
        os.makedirs(out_folder, exist_ok=True)

        for seg_no, (start, end) in enumerate(bounds, start=1):
            audio_path = out_folder + "/" + stem + "_" + str(seg_no) + ".flac"
            # Slice along the time axis so every channel stays aligned; reshaping
            # the whole tensor would turn extra channels into extra segments.
            torchaudio.save(audio_path, wav[:, start:end], sample_rate=sr)


if __name__ == "__main__":
    main()


45 changes: 45 additions & 0 deletions code/datasplit2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Split the DNS clean training-set recordings into eight-second segments.

import os
import glob
import torchaudio
import torch

input_path="/home/jovyan/honor/big-disk/speech/Data/DNS/pdns_training_set/raw/clean"
output_path="/home/jovyan/honor/big-disk/speech/Data/DNS/trainclean_split"

# Length of a full segment, and the length a trailing remainder (or a short
# recording) must exceed to be kept, both in seconds.
SEGMENT_SECONDS = 8
MIN_SECONDS = 1


def segment_bounds(num_samples, sr, segment_seconds=SEGMENT_SECONDS, min_seconds=MIN_SECONDS):
    """Return the ``(start, end)`` sample ranges a recording is cut into.

    The recording is divided into consecutive full segments of
    ``sr * segment_seconds`` samples. Whatever is left after the last full
    segment is kept as one extra, shorter range only when it is strictly
    longer than ``sr * min_seconds`` samples. A recording shorter than one
    full segment is therefore kept whole if it is long enough, and dropped
    (empty list) otherwise.

    Args:
        num_samples: Number of samples per channel in the recording.
        sr: Sample rate in Hz.
        segment_seconds: Length of a full segment in seconds.
        min_seconds: Remainders not longer than this many seconds are dropped.

    Returns:
        A list of ``(start, end)`` tuples, ``end`` exclusive, in time order.

    Raises:
        ValueError: If ``sr * segment_seconds`` is not positive.
    """
    segment_len = sr * segment_seconds
    if segment_len <= 0:
        raise ValueError("sr * segment_seconds must be positive")
    full_count = num_samples // segment_len
    bounds = [(k * segment_len, (k + 1) * segment_len) for k in range(full_count)]
    tail_start = full_count * segment_len
    if num_samples - tail_start > sr * min_seconds:
        bounds.append((tail_start, num_samples))
    return bounds


def main():
    """Cut every ``*/*.wav`` file under ``input_path`` into segments.

    Existing content of ``output_path`` is left in place. The segments of a
    file are written to ``output_path/<name>/<stem>_<k>.wav`` with ``k``
    starting at 1, where ``<name>`` is the source file's parent directory and
    ``<stem>`` is the file name up to its first dot.
    """
    aa = glob.glob(os.path.join(input_path, "*/*.wav"))

    for src in aa:
        wav, sr = torchaudio.load(src)
        bounds = segment_bounds(wav.size()[1], sr)
        if not bounds:
            # Too short to keep; do not create an empty folder for it.
            continue

        name = src.split('/')[-2]
        stem = src.split('/')[-1].split('.')[0]
        out_folder = output_path + "/" + name
        os.makedirs(out_folder, exist_ok=True)

        for seg_no, (start, end) in enumerate(bounds, start=1):
            audio_path = out_folder + "/" + stem + "_" + str(seg_no) + ".wav"
            # Slice along the time axis so every channel stays aligned; reshaping
            # the whole tensor would turn extra channels into extra segments.
            torchaudio.save(audio_path, wav[:, start:end], sample_rate=sr)


if __name__ == "__main__":
    main()


12 changes: 12 additions & 0 deletions code/getdata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Download Common Voice

from datasets import load_dataset


# Marker printed before the dataset is requested.
print(0)

# Language codes that can be passed as the second argument
# (presumably the configurations available for this dataset -- verify):
# ['ab', 'af', 'am', 'ar', 'as', 'ast', 'az', 'ba', 'bas', 'be', 'bg', 'bn', 'br', 'ca', 'ckb', 'cnh', 'cs', 'cv', 'cy', 'da', 'de', 'dv', 'dyu', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'fy-NL', 'ga-IE', 'gl', 'gn', 'ha', 'he', 'hi', 'hsb', 'hu', 'hy-AM', 'ia', 'id', 'ig', 'is', 'it', 'ja', 'ka', 'kab', 'kk', 'kmr', 'ko', 'ky', 'lg', 'lij', 'lo', 'lt', 'ltg', 'lv', 'mdf', 'mhr', 'mk', 'ml', 'mn', 'mr', 'mrj', 'mt', 'myv', 'nan-tw', 'ne-NP', 'nhi', 'nl', 'nn-NO', 'oc', 'or', 'os', 'pa-IN', 'pl', 'ps', 'pt', 'quy', 'rm-sursilv', 'rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sat', 'sc', 'sk', 'skr', 'sl', 'sq', 'sr', 'sv-SE', 'sw', 'ta', 'te', 'th', 'ti', 'tig', 'tk', 'tok', 'tr', 'tt', 'tw', 'ug', 'uk', 'ur', 'uz', 'vi', 'vot', 'yi', 'yo', 'yue', 'zgh', 'zh-CN', 'zh-HK', 'zh-TW']
COMMON_VOICE_REPO = "mozilla-foundation/common_voice_16_1"
LANGUAGE = "hu"

ds = load_dataset(COMMON_VOICE_REPO, LANGUAGE)

# Related archive URL kept for reference:
# https://huggingface.co/datasets/reach-vb/common_voice_16_1/resolve/main/audio/af/train/af_train_0.tar
Loading

0 comments on commit a293e9b

Please sign in to comment.