-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d1e74a7
commit a293e9b
Showing
73 changed files
with
10,688 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,133 @@ | ||
# languagecodec | ||
Official code repository of Language-Codec | ||
# Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models | ||
|
||
[Audio samples](https://languagecodec.github.io) | | ||
Paper [[abs]](https://arxiv.org/abs/2402.12208) [[pdf]](https://arxiv.org/pdf/2402.12208.pdf) | ||
|
||
|
||
## Installation | ||
|
||
To use Language-Codec, install it using: | ||
|
||
```bash | ||
conda create -n xxx python=3.8 | ||
conda activate xxx | ||
pip install -r requirement.txt | ||
``` | ||
|
||
## Infer | ||
|
||
### Part1: Reconstruct audio from raw wav | ||
|
||
```python | ||
|
||
from encodec.utils import convert_audio | ||
import torchaudio | ||
import torch | ||
from vocos.pretrained import Vocos | ||
|
||
device=torch.device('cpu') | ||
|
||
config_path = "xxx/languagecodec/configs/languagecodec.yaml" | ||
model_path = "xxx/xxx.ckpt" | ||
audio_path = "xxx" | ||
audio_outpath = "xxx" | ||
vocos = Vocos.from_pretrained0802(config_path, model_path) | ||
vocos = vocos.to(device) | ||
|
||
wav, sr = torchaudio.load(audio_path) | ||
wav = convert_audio(wav, sr, 24000, 1) | ||
bandwidth_id = torch.tensor([0]) | ||
wav=wav.to(device) | ||
features,discrete_code= vocos.encode(wav, bandwidth_id=bandwidth_id) | ||
audio_out = vocos.decode(features, bandwidth_id=bandwidth_id) | ||
torchaudio.save(audio_outpath, audio_out, sample_rate=24000, encoding='PCM_S', bits_per_sample=16) | ||
``` | ||
|
||
|
||
### Part2: Generating Discrete Codecs | ||
```python | ||
|
||
from encodec.utils import convert_audio | ||
import torchaudio | ||
import torch | ||
from vocos.pretrained import Vocos | ||
|
||
device=torch.device('cpu') | ||
|
||
config_path = "xxx/languagecodec/configs/languagecodec.yaml" | ||
model_path = "xxx/xxx.ckpt" | ||
audio_path = "xxx" | ||
vocos = Vocos.from_pretrained0802(config_path, model_path) | ||
vocos = vocos.to(device) | ||
|
||
wav, sr = torchaudio.load(audio_path) | ||
wav = convert_audio(wav, sr, 24000, 1) | ||
bandwidth_id = torch.tensor([0]) | ||
wav=wav.to(device) | ||
_,discrete_code= vocos.encode(wav, bandwidth_id=bandwidth_id) | ||
print(discrete_code) | ||
``` | ||
|
||
|
||
|
||
### Part3: Audio reconstruction through codecs | ||
```python | ||
# audio_tokens [n_q,1,t]/[n_q,t] | ||
features = vocos.codes_to_features(audio_tokens) | ||
bandwidth_id = torch.tensor([0]) | ||
audio_out = vocos.decode(features, bandwidth_id=bandwidth_id) | ||
``` | ||
|
||
|
||
|
||
|
||
## Pre-trained models | ||
|
||
Currently, we have only released the results from our paper, and we plan to release additional checkpoints trained on a larger training dataset within the next two months. | ||
|
||
| Model Name | Dataset | Training Iterations | ||
-------------------------------------------------------------------------------------|---------------|--------------------- | ||
| [languagecodec_paper_8nq](https://huggingface.co/charactr/vocos-mel-24khz) | 3W Hours | 2.0 M | ||
<!-- | [charactr/vocos-encodec-24khz](https://huggingface.co/charactr/vocos-encodec-24khz) | DNS Challenge | 2.5 M | 7.9 M --> | ||
|
||
## Training | ||
|
||
### Step1: Prepare train dataset | ||
```python | ||
# Process the data into a form similar to xxx/languagecodec/data/libritts_testother.txt | ||
``` | ||
|
||
### Step2: Modifying configuration files | ||
```python | ||
# xxx/languagecodec/configs/languagecodec.yaml | ||
# Modify the values of parameters such as batch_size, filelist_path, save_dir, device | ||
``` | ||
|
||
### Step3: Start training process | ||
Refer to [PyTorch Lightning documentation](https://lightning.ai/docs/pytorch/stable/) for details about customizing the | ||
training pipeline. | ||
|
||
```bash | ||
cd xxx/languagecodec | ||
python train.py fit --config xxx/languagecodec/configs/languagecodec.yaml | ||
``` | ||
|
||
|
||
|
||
## Citation | ||
|
||
If this code contributes to your research, please cite our work: | ||
|
||
``` | ||
@misc{ji2024languagecodec, | ||
title={Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models}, | ||
author={Shengpeng Ji and Minghui Fang and Ziyue Jiang and Rongjie Huang and Jialung Zuo and Shulei Wang and Zhou Zhao}, | ||
year={2024}, | ||
eprint={2402.12208}, | ||
archivePrefix={arXiv}, | ||
primaryClass={eess.AS} | ||
} | ||
``` | ||
|
||
## License | ||
|
||
The code in this repository is released under the MIT license as found in the | ||
[LICENSE](LICENSE) file. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
"""Log the discrete codes produced by the pretrained 24 kHz EnCodec model.

Reads a file list (one audio path per line), encodes at most the first 20
entries on the GPU, and writes the resulting code tensors to a log file.
A CUDA device is required because both the model and the waveforms are
moved to the GPU.
"""
from encodec import EncodecModel
from encodec.utils import convert_audio

import torchaudio
import torch

import os
import shutil

import logging

infer_log_path = "/home/jovyan/honor/big-disk/speech/code/languagecodec/测试比较encodec_encodec.log"

# Start every run with a fresh log. The file is removed through the os API
# instead of a shell string so the path is never interpreted by a shell.
if os.path.isfile(infer_log_path):
    os.remove(infer_log_path)

# Output format of each log record.
LOG_FORMAT = "时间: %(asctime)s - 日志等级: %(levelname)s - 日志信息: %(message)s"
# Configure the root logger: level and output format, written to the log file.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, filename=infer_log_path)

# Instantiate a pretrained EnCodec model.
model = EncodecModel.encodec_model_24khz()
# The number of codebooks used will be determined by the bandwidth selected.
# E.g. for a bandwidth of 6kbps, `n_q = 8` codebooks are used.
# Supported bandwidths are 1.5kbps (n_q = 2), 3 kbps (n_q = 4), 6 kbps (n_q = 8)
# and 12 kbps (n_q = 16) and 24kbps (n_q = 32).
# For the 48 kHz model, only 3, 6, 12, and 24 kbps are supported. The number
# of codebooks for each is half that of the 24 kHz model as the frame rate is
# twice as much.
model = model.cuda()
model.set_target_bandwidth(12.0)

# File list with one audio path per line, and the root folder for results.
input_path = "/home/jovyan/honor/big-disk/speech/code/languagecodec/data/infer/lirbitts_vctk_testclean_500"
out_folder = '/home/jovyan/honor/big-disk/speech/code/languagecodec/result/infer/encodec'
# ll = "libritts_testclean500_nq16"
ll = "debug"

tmptmp = out_folder + "/" + ll

# Recreate the (empty) per-run output directory.
shutil.rmtree(tmptmp, ignore_errors=True)
os.makedirs(tmptmp, exist_ok=True)

with open(input_path, 'r') as fin:
    # Skip blank lines so an empty entry is never handed to torchaudio.load.
    x = [line.strip() for line in fin if line.strip()]

# Only the first 20 files are processed; slicing (rather than range(20))
# keeps the script from raising IndexError on a shorter file list.
for i, audio_file in enumerate(x[:20]):
    print(i)

    # Load and pre-process the audio waveform.
    wav, sr = torchaudio.load(audio_file)
    wav = wav.cuda()
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)
    wav = wav.unsqueeze(0)

    # Extract discrete codes from EnCodec.
    with torch.no_grad():
        encoded_frames = model.encode(wav)
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)  # [B, n_q, T]

    # For sequences longer than 100 frames only the first two codebooks are
    # logged, to keep the log readable.
    if codes.size()[2] > 100:
        logging.info(f"{audio_file}|{codes[:,:2,:]}")
    else:
        logging.info(f"{audio_file}|{codes}")

    # Decoding back to audio is currently disabled:
    # frames = [(codes, None)]
    # with torch.no_grad():
    #     wav_output = model.decode(frames)
    # audio_path = out_folder + '/' + ll + '/' + audio_file.split('/')[-1]
    # torchaudio.save(audio_path, wav_output.squeeze(0).cpu(), model.sample_rate)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Split the LibriLight recordings into segments of eight seconds each.

import os
import glob
import shutil
import torchaudio
import torch

input_path = "/home/jovyan/honor/big-disk/speech/Data/librilight/small/small"
output_path = "/home/jovyan/honor/big-disk/speech/Data/librilight/small/small_split"

SEGMENT_SECONDS = 8


def _segment_bounds(num_samples, sr, seconds=SEGMENT_SECONDS):
    """Return the (start, end) sample ranges a recording is cut into.

    The recording is cut into as many full ``seconds``-long segments as fit.
    The remainder is kept as one extra, shorter segment only when it is
    longer than one second (``sr`` samples); shorter remainders, and whole
    recordings of at most one second, are dropped.
    """
    seg_len = sr * seconds
    n_full = num_samples // seg_len
    bounds = [(k * seg_len, (k + 1) * seg_len) for k in range(n_full)]
    tail_start = n_full * seg_len
    if num_samples - tail_start > sr:
        bounds.append((tail_start, num_samples))
    return bounds


aa = glob.glob(os.path.join(input_path, "*/*/*.flac"))

# Recreate the output directory from scratch, without going through a shell.
shutil.rmtree(output_path, ignore_errors=True)
os.makedirs(output_path, exist_ok=True)

for flac_path in aa:
    wav, sr = torchaudio.load(flac_path)

    path_parts = flac_path.split('/')
    # Third path component from the end names the output sub-folder.
    name = path_parts[-3]
    # File name up to its first dot.
    utt_id = path_parts[-1].split('.')[0]
    out_folder = output_path + "/" + name

    # Segments are sliced along the time axis so every segment keeps all
    # channels of the source (for mono input this matches a row-wise reshape).
    for seg_idx, (start, end) in enumerate(_segment_bounds(wav.size()[1], sr), start=1):
        os.makedirs(out_folder, exist_ok=True)
        audio_path = out_folder + "/" + utt_id + "_" + str(seg_idx) + ".flac"
        torchaudio.save(audio_path, wav[:, start:end], sample_rate=sr)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Split the DNS clean training recordings into segments of eight seconds each.

import os
import glob
import torchaudio
import torch

input_path = "/home/jovyan/honor/big-disk/speech/Data/DNS/pdns_training_set/raw/clean"
output_path = "/home/jovyan/honor/big-disk/speech/Data/DNS/trainclean_split"

SEGMENT_SECONDS = 8


def _segment_bounds(num_samples, sr, seconds=SEGMENT_SECONDS):
    """Return the (start, end) sample ranges a recording is cut into.

    The recording is cut into as many full ``seconds``-long segments as fit.
    The remainder is kept as one extra, shorter segment only when it is
    longer than one second (``sr`` samples); shorter remainders, and whole
    recordings of at most one second, are dropped.
    """
    seg_len = sr * seconds
    n_full = num_samples // seg_len
    bounds = [(k * seg_len, (k + 1) * seg_len) for k in range(n_full)]
    tail_start = n_full * seg_len
    if num_samples - tail_start > sr:
        bounds.append((tail_start, num_samples))
    return bounds


aa = glob.glob(os.path.join(input_path, "*/*.wav"))

# The output directory is intentionally NOT wiped here; existing files with
# the same name are overwritten, everything else is left in place.

for wav_path in aa:
    wav, sr = torchaudio.load(wav_path)

    path_parts = wav_path.split('/')
    # Parent directory of the file names the output sub-folder.
    name = path_parts[-2]
    # File name up to its first dot.
    utt_id = path_parts[-1].split('.')[0]
    out_folder = output_path + "/" + name

    # Segments are sliced along the time axis so every segment keeps all
    # channels of the source (for mono input this matches a row-wise reshape).
    for seg_idx, (start, end) in enumerate(_segment_bounds(wav.size()[1], sr), start=1):
        os.makedirs(out_folder, exist_ok=True)
        audio_path = out_folder + "/" + utt_id + "_" + str(seg_idx) + ".wav"
        torchaudio.save(audio_path, wav[:, start:end], sample_rate=sr)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Download one language subset of Common Voice through Hugging Face `datasets`.

from datasets import load_dataset

# Dataset repository and the language configuration to fetch.
# Available configurations:
# ['ab', 'af', 'am', 'ar', 'as', 'ast', 'az', 'ba', 'bas', 'be', 'bg', 'bn', 'br', 'ca', 'ckb', 'cnh', 'cs', 'cv', 'cy', 'da', 'de', 'dv', 'dyu', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'fy-NL', 'ga-IE', 'gl', 'gn', 'ha', 'he', 'hi', 'hsb', 'hu', 'hy-AM', 'ia', 'id', 'ig', 'is', 'it', 'ja', 'ka', 'kab', 'kk', 'kmr', 'ko', 'ky', 'lg', 'lij', 'lo', 'lt', 'ltg', 'lv', 'mdf', 'mhr', 'mk', 'ml', 'mn', 'mr', 'mrj', 'mt', 'myv', 'nan-tw', 'ne-NP', 'nhi', 'nl', 'nn-NO', 'oc', 'or', 'os', 'pa-IN', 'pl', 'ps', 'pt', 'quy', 'rm-sursilv', 'rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sat', 'sc', 'sk', 'skr', 'sl', 'sq', 'sr', 'sv-SE', 'sw', 'ta', 'te', 'th', 'ti', 'tig', 'tk', 'tok', 'tr', 'tt', 'tw', 'ug', 'uk', 'ur', 'uz', 'vi', 'vot', 'yi', 'yo', 'yue', 'zgh', 'zh-CN', 'zh-HK', 'zh-TW']
DATASET_REPO = "mozilla-foundation/common_voice_16_1"
LANGUAGE = "hu"

# Progress marker printed before the download starts.
print(0)

ds = load_dataset(DATASET_REPO, LANGUAGE)

# Example of a direct archive URL:
# https://huggingface.co/datasets/reach-vb/common_voice_16_1/resolve/main/audio/af/train/af_train_0.tar
Oops, something went wrong.