-
Notifications
You must be signed in to change notification settings - Fork 117
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Adding some devtools. * Adding delay calculation * Chunked inference for codec. * Version bump * Removing prod.yml, updating to recent main. * Turning padding off only when chunking codes. * Updating README, removing unused things. * Missed a padding. * Adding some checks to make sure pads are the same. * Factoring out latent dim, backwards compatible. * Adding latent dim, and the 44khz 16kbps model config. * Ran pre-commit. * Chunked vs unchunked inference. * Fixing padding stuff. * n quantizers back in encode * don't load unsupported versions * correct docstring * bitrate config + 16kbps models * update audiotools dep * fix argbind issue * minor correction * bump version * change model path * update audiotools deps --------- Co-authored-by: prem <[email protected]> Co-authored-by: Ishaan Kumar <[email protected]>
- Loading branch information
1 parent
0202b4a
commit c7cfc5d
Showing
17 changed files
with
703 additions
and
337 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
ARG IMAGE=pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime | ||
ARG GITHUB_TOKEN=none | ||
|
||
FROM $IMAGE | ||
|
||
RUN echo machine github.com login ${GITHUB_TOKEN} > ~/.netrc | ||
|
||
COPY requirements.txt /requirements.txt | ||
|
||
RUN apt update && apt install -y git | ||
|
||
# install the package | ||
RUN pip install --upgrade -r /requirements.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
# Model setup | ||
DAC.sample_rate: 44100 | ||
DAC.encoder_dim: 64 | ||
DAC.encoder_rates: [2, 4, 8, 8] | ||
DAC.latent_dim: 128 | ||
DAC.decoder_dim: 1536 | ||
DAC.decoder_rates: [8, 8, 4, 2] | ||
|
||
# Quantization | ||
DAC.n_codebooks: 18 # Max bitrate of 16kbps | ||
DAC.codebook_size: 1024 | ||
DAC.codebook_dim: 8 | ||
DAC.quantizer_dropout: 0.5 | ||
|
||
# Discriminator | ||
Discriminator.sample_rate: 44100 | ||
Discriminator.rates: [] | ||
Discriminator.periods: [2, 3, 5, 7, 11] | ||
Discriminator.fft_sizes: [2048, 1024, 512] | ||
Discriminator.bands: | ||
- [0.0, 0.1] | ||
- [0.1, 0.25] | ||
- [0.25, 0.5] | ||
- [0.5, 0.75] | ||
- [0.75, 1.0] | ||
|
||
# Optimization | ||
AdamW.betas: [0.8, 0.99] | ||
AdamW.lr: 0.0001 | ||
ExponentialLR.gamma: 0.999996 | ||
|
||
amp: false | ||
val_batch_size: 100 | ||
device: cuda | ||
num_iters: 400000 | ||
save_iters: [10000, 50000, 100000, 200000] | ||
valid_freq: 1000 | ||
sample_freq: 10000 | ||
num_workers: 32 | ||
val_idx: [0, 1, 2, 3, 4, 5, 6, 7] | ||
seed: 0 | ||
lambdas: | ||
mel/loss: 15.0 | ||
adv/feat_loss: 2.0 | ||
adv/gen_loss: 1.0 | ||
vq/commitment_loss: 0.25 | ||
vq/codebook_loss: 1.0 | ||
|
||
VolumeNorm.db: [const, -16] | ||
|
||
# Transforms | ||
build_transform.preprocess: | ||
- Identity | ||
build_transform.augment_prob: 0.0 | ||
build_transform.augment: | ||
- Identity | ||
build_transform.postprocess: | ||
- VolumeNorm | ||
- RescaleAudio | ||
- ShiftPhase | ||
|
||
# Loss setup | ||
MultiScaleSTFTLoss.window_lengths: [2048, 512] | ||
MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] | ||
MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048] | ||
MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] | ||
MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] | ||
MelSpectrogramLoss.pow: 1.0 | ||
MelSpectrogramLoss.clamp_eps: 1.0e-5 | ||
MelSpectrogramLoss.mag_weight: 0.0 | ||
|
||
# Data | ||
batch_size: 72 | ||
train/AudioDataset.duration: 0.38 | ||
train/AudioDataset.n_examples: 10000000 | ||
|
||
val/AudioDataset.duration: 5.0 | ||
val/build_transform.augment_prob: 1.0 | ||
val/AudioDataset.n_examples: 250 | ||
|
||
test/AudioDataset.duration: 10.0 | ||
test/build_transform.augment_prob: 1.0 | ||
test/AudioDataset.n_examples: 1000 | ||
|
||
AudioLoader.shuffle: true | ||
AudioDataset.without_replacement: true | ||
|
||
train/build_dataset.folders: | ||
speech_fb: | ||
- /data/daps/train | ||
speech_hq: | ||
- /data/vctk | ||
- /data/vocalset | ||
- /data/read_speech | ||
- /data/french_speech | ||
speech_uq: | ||
- /data/emotional_speech/ | ||
- /data/common_voice/ | ||
- /data/german_speech/ | ||
- /data/russian_speech/ | ||
- /data/spanish_speech/ | ||
music_hq: | ||
- /data/musdb/train | ||
music_uq: | ||
- /data/jamendo | ||
general: | ||
- /data/audioset/data/unbalanced_train_segments/ | ||
- /data/audioset/data/balanced_train_segments/ | ||
|
||
val/build_dataset.folders: | ||
speech_hq: | ||
- /data/daps/val | ||
music_hq: | ||
- /data/musdb/test | ||
general: | ||
- /data/audioset/data/eval_segments/ | ||
|
||
test/build_dataset.folders: | ||
speech_hq: | ||
- /data/daps/test | ||
music_hq: | ||
- /data/musdb/test | ||
general: | ||
- /data/audioset/data/eval_segments/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from .base import CodecMixin | ||
from .base import DACFile | ||
from .dac import DAC | ||
from .discriminator import Discriminator |
Oops, something went wrong.