diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7d54908 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +*.pth +*.wav +*.dac +tests/ +runs/ diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..e69de29 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a7f37b9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,176 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/env.sh +venv/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Files created by experiments +output/ +snapshot/ +*.m4a +*.wav +notebooks/scratch.ipynb +notebooks/inspect.ipynb +notebooks/effects.ipynb +notebooks/*.ipynb +notebooks/*.gif +notebooks/*.wav +notebooks/*.mp4 +*runs/ +boards/ +samples/ +*.ipynb + +results.json +metrics.csv +mprofile_* +mem.png + +results/ +mprofile* +*.png +# do not ignore the test wav file +!tests/audio/short_test_audio.wav +!tests/audio/output.wav +*/.DS_Store +.DS_Store +env.sh +_codebraid/ +**/*.html +**/*.exec.md +flagged/ +log.txt +ckpt/ +.syncthing* +tests/assets/ +archived/ + +*_remote_module_* +*.zip +*.pth +encoded_out/ +recon/ +recons/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..eca3cf9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +repos: +- repo: https://github.com/asottile/reorder_python_imports + rev: v2.5.0 + hooks: + - id: reorder-python-imports +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + language_version: python3 +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..d59d183 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + +COPY . /app +WORKDIR /app + +RUN apt update && apt install -y git +# install the package +RUN pip install . + +# cache the model +RUN python3 -m dac download diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8356bd6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023-present, Descript + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..6f72a0d --- /dev/null +++ b/README.md @@ -0,0 +1,95 @@ +# Descript Audio Codec (.dac) + + + + +This repository contains training and inference scripts +for the Descript Audio Codec (.dac), a high fidelity general +neural audio codec. 
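The CLI commands documented below cover file-based compression and reconstruction. The utilities added later in this patch (`dac.utils.load_model` and the `CodecMixin.reconstruct` helper) also support a purely programmatic round-trip; the following is a minimal sketch, assuming the pretrained weights can be downloaded and using illustrative file names:

```
import dac
from dac.utils import load_model

# Load the pretrained generator (weights are fetched and cached on first use).
model = load_model(dac.__model_version__)
model.to("cuda")  # or "cpu" if no GPU is available

# Round-trip a file through the codec; `reconstruct` returns an audiotools AudioSignal.
recons = model.reconstruct("input.wav")
recons.write("output.wav")
```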
+ + +## Usage + +### Installation +``` +git clone https://github.com/descriptinc/descript-audio-codec +cd descript-audio-codec +pip install . +``` + +### Compress audio +``` +python3 -m dac encode /path/to/input --output /path/to/output/codes +``` + +This command will create `.dac` files with the same name as the input files. +It will also preserve the directory structure relative to input root and +re-create it in the output directory. Please use `python -m dac encode --help` +for more options. + +### Reconstruct audio from compressed codes +``` +python3 -m dac decode /path/to/output/codes --output /path/to/reconstructed_input +``` + +This command will create `.wav` files with the same name as the input files. +It will also preserve the directory structure relative to input root and +re-create it in the output directory. Please use `python -m dac decode --help` +for more options. + +### Docker image +We provide a Dockerfile to build a docker image with all the necessary +dependencies. +1. Building the image. + ``` + docker build -t dac . + ``` +2. Using the image. + + Usage on CPU: + ``` + docker run dac <command> + ``` + + Usage on GPU: + ``` + docker run --gpus=all dac <command> + ``` + + `<command>` can be one of the compression and reconstruction commands listed + above. For example, if you want to run compression, + + ``` + docker run --gpus=all dac python3 -m dac encode ... + ``` + + +## Training +The baseline model configuration can be trained using the following commands. + +### Pre-requisites +Please install the correct dependencies +``` +pip install -e ".[dev]" +``` + + +### Single GPU training +``` +export CUDA_VISIBLE_DEVICES=0 +python scripts/train.py --args.load conf/ablations/baseline.yml --save_path runs/baseline/ +``` + +### Multi GPU training +``` +export CUDA_VISIBLE_DEVICES=0,1 +torchrun --nproc_per_node gpu scripts/train.py --args.load conf/ablations/baseline.yml --save_path runs/baseline/ +``` + +## Testing +We provide two test scripts to test CLI + training functionality. Please +make sure that the training pre-requisites are satisfied before launching these +tests.
To launch these tests please run +``` +python -m pytest tests +``` diff --git a/conf/1gpu.yml b/conf/1gpu.yml new file mode 100644 index 0000000..ce87802 --- /dev/null +++ b/conf/1gpu.yml @@ -0,0 +1,6 @@ +$include: + - conf/base.yml + +batch_size: 12 +val_batch_size: 12 +num_workers: 4 diff --git a/conf/ablations/baseline.yml b/conf/ablations/baseline.yml new file mode 100644 index 0000000..1510ce2 --- /dev/null +++ b/conf/ablations/baseline.yml @@ -0,0 +1,3 @@ +$include: + - conf/base.yml + - conf/1gpu.yml diff --git a/conf/ablations/diff-mb.yml b/conf/ablations/diff-mb.yml new file mode 100644 index 0000000..afa758d --- /dev/null +++ b/conf/ablations/diff-mb.yml @@ -0,0 +1,22 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +Discriminator.sample_rate: 44100 +Discriminator.fft_sizes: [2048, 1024, 512] +Discriminator.bands: + - [0.0, 0.05] + - [0.05, 0.1] + - [0.1, 0.25] + - [0.25, 0.5] + - [0.5, 1.0] + + +# re-weight lambdas to make up for +# lost discriminators vs baseline +lambdas: + mel/loss: 15.0 + adv/feat_loss: 5.0 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 diff --git a/conf/ablations/equal-mb.yml b/conf/ablations/equal-mb.yml new file mode 100644 index 0000000..2c091ac --- /dev/null +++ b/conf/ablations/equal-mb.yml @@ -0,0 +1,22 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +Discriminator.sample_rate: 44100 +Discriminator.fft_sizes: [2048, 1024, 512] +Discriminator.bands: + - [0.0, 0.2] + - [0.2, 0.4] + - [0.4, 0.6] + - [0.6, 0.8] + - [0.8, 1.0] + + +# re-weight lambdas to make up for +# lost discriminators vs baseline +lambdas: + mel/loss: 15.0 + adv/feat_loss: 5.0 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 diff --git a/conf/ablations/no-adv.yml b/conf/ablations/no-adv.yml new file mode 100644 index 0000000..75e271b --- /dev/null +++ b/conf/ablations/no-adv.yml @@ -0,0 +1,9 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +lambdas: + mel/loss: 1.0 + waveform/loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 diff --git a/conf/ablations/no-data-balance.yml b/conf/ablations/no-data-balance.yml new file mode 100644 index 0000000..a88f392 --- /dev/null +++ b/conf/ablations/no-data-balance.yml @@ -0,0 +1,22 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +train/build_dataset.folders: + speech: + - /data/daps/train + - /data/vctk + - /data/vocalset + - /data/read_speech + - /data/french_speech + - /data/emotional_speech/ + - /data/common_voice/ + - /data/german_speech/ + - /data/russian_speech/ + - /data/spanish_speech/ + music: + - /data/musdb/train + - /data/jamendo + general: + - /data/audioset/data/unbalanced_train_segments/ + - /data/audioset/data/balanced_train_segments/ diff --git a/conf/ablations/no-low-hop.yml b/conf/ablations/no-low-hop.yml new file mode 100644 index 0000000..abde923 --- /dev/null +++ b/conf/ablations/no-low-hop.yml @@ -0,0 +1,18 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +MelSpectrogramLoss.n_mels: [80] +MelSpectrogramLoss.window_lengths: [512] +MelSpectrogramLoss.mel_fmin: [0] +MelSpectrogramLoss.mel_fmax: [null] +MelSpectrogramLoss.pow: 1.0 +MelSpectrogramLoss.clamp_eps: 1.0e-5 +MelSpectrogramLoss.mag_weight: 0.0 + +lambdas: + mel/loss: 100.0 + adv/feat_loss: 2.0 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 diff --git a/conf/ablations/no-mb.yml b/conf/ablations/no-mb.yml new file mode 100644 index 0000000..3aa0015 --- /dev/null +++ b/conf/ablations/no-mb.yml @@ -0,0 +1,17 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + 
+Discriminator.sample_rate: 44100 +Discriminator.fft_sizes: [2048, 1024, 512] +Discriminator.bands: + - [0.0, 1.0] + +# re-weight lambdas to make up for +# lost discriminators vs baseline +lambdas: + mel/loss: 15.0 + adv/feat_loss: 5.0 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 diff --git a/conf/ablations/no-mpd-msd.yml b/conf/ablations/no-mpd-msd.yml new file mode 100644 index 0000000..9059b82 --- /dev/null +++ b/conf/ablations/no-mpd-msd.yml @@ -0,0 +1,21 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +Discriminator.sample_rate: 44100 +Discriminator.rates: [] +Discriminator.periods: [] +Discriminator.fft_sizes: [2048, 1024, 512] +Discriminator.bands: + - [0.0, 0.1] + - [0.1, 0.25] + - [0.25, 0.5] + - [0.5, 0.75] + - [0.75, 1.0] + +lambdas: + mel/loss: 15.0 + adv/feat_loss: 2.66 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 diff --git a/conf/ablations/no-mpd.yml b/conf/ablations/no-mpd.yml new file mode 100644 index 0000000..0e4dc93 --- /dev/null +++ b/conf/ablations/no-mpd.yml @@ -0,0 +1,21 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +Discriminator.sample_rate: 44100 +Discriminator.rates: [1] +Discriminator.periods: [] +Discriminator.fft_sizes: [2048, 1024, 512] +Discriminator.bands: + - [0.0, 0.1] + - [0.1, 0.25] + - [0.25, 0.5] + - [0.5, 0.75] + - [0.75, 1.0] + +lambdas: + mel/loss: 15.0 + adv/feat_loss: 2.5 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 diff --git a/conf/ablations/only-speech.yml b/conf/ablations/only-speech.yml new file mode 100644 index 0000000..c2bbc0d --- /dev/null +++ b/conf/ablations/only-speech.yml @@ -0,0 +1,22 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +train/build_dataset.folders: + speech_fb: + - /data/daps/train + speech_hq: + - /data/vctk + - /data/vocalset + - /data/read_speech + - /data/french_speech + speech_uq: + - /data/emotional_speech/ + - /data/common_voice/ + - /data/german_speech/ + - /data/russian_speech/ + - /data/spanish_speech/ + +val/build_dataset.folders: + speech_hq: + - /data/daps/val diff --git a/conf/base.yml b/conf/base.yml new file mode 100644 index 0000000..746e6f7 --- /dev/null +++ b/conf/base.yml @@ -0,0 +1,123 @@ +# Model setup +DAC.sample_rate: 44100 +DAC.encoder_dim: 64 +DAC.encoder_rates: [2, 4, 8, 8] +DAC.decoder_dim: 1536 +DAC.decoder_rates: [8, 8, 4, 2] + +# Quantization +DAC.n_codebooks: 9 +DAC.codebook_size: 1024 +DAC.codebook_dim: 8 +DAC.quantizer_dropout: 1.0 + +# Discriminator +Discriminator.sample_rate: 44100 +Discriminator.rates: [] +Discriminator.periods: [2, 3, 5, 7, 11] +Discriminator.fft_sizes: [2048, 1024, 512] +Discriminator.bands: + - [0.0, 0.1] + - [0.1, 0.25] + - [0.25, 0.5] + - [0.5, 0.75] + - [0.75, 1.0] + +# Optimization +AdamW.betas: [0.8, 0.99] +AdamW.lr: 0.0001 +ExponentialLR.gamma: 0.999996 + +amp: false +val_batch_size: 100 +device: cuda +num_iters: 250000 +save_iters: [10000, 50000, 100000, 200000] +valid_freq: 1000 +sample_freq: 10000 +num_workers: 32 +val_idx: [0, 1, 2, 3, 4, 5, 6, 7] +seed: 0 +lambdas: + mel/loss: 15.0 + adv/feat_loss: 2.0 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 + +VolumeNorm.db: [const, -16] + +# Transforms +build_transform.preprocess: + - Identity +build_transform.augment_prob: 0.0 +build_transform.augment: + - Identity +build_transform.postprocess: + - VolumeNorm + - RescaleAudio + - ShiftPhase + +# Loss setup +MultiScaleSTFTLoss.window_lengths: [2048, 512] +MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] +MelSpectrogramLoss.window_lengths: 
[32, 64, 128, 256, 512, 1024, 2048] +MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] +MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] +MelSpectrogramLoss.pow: 1.0 +MelSpectrogramLoss.clamp_eps: 1.0e-5 +MelSpectrogramLoss.mag_weight: 0.0 + +# Data +batch_size: 72 +train/AudioDataset.duration: 0.38 +train/AudioDataset.n_examples: 10000000 + +val/AudioDataset.duration: 5.0 +val/build_transform.augment_prob: 1.0 +val/AudioDataset.n_examples: 250 + +test/AudioDataset.duration: 10.0 +test/build_transform.augment_prob: 1.0 +test/AudioDataset.n_examples: 1000 + +AudioLoader.shuffle: true +AudioDataset.without_replacement: true + +train/build_dataset.folders: + speech_fb: + - /data/daps/train + speech_hq: + - /data/vctk + - /data/vocalset + - /data/read_speech + - /data/french_speech + speech_uq: + - /data/emotional_speech/ + - /data/common_voice/ + - /data/german_speech/ + - /data/russian_speech/ + - /data/spanish_speech/ + music_hq: + - /data/musdb/train + music_uq: + - /data/jamendo + general: + - /data/audioset/data/unbalanced_train_segments/ + - /data/audioset/data/balanced_train_segments/ + +val/build_dataset.folders: + speech_hq: + - /data/daps/val + music_hq: + - /data/musdb/test + general: + - /data/audioset/data/eval_segments/ + +test/build_dataset.folders: + speech_hq: + - /data/daps/test + music_hq: + - /data/musdb/test + general: + - /data/audioset/data/eval_segments/ diff --git a/conf/downsampling/1024x.yml b/conf/downsampling/1024x.yml new file mode 100644 index 0000000..2719f9b --- /dev/null +++ b/conf/downsampling/1024x.yml @@ -0,0 +1,16 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +# Model setup +DAC.sample_rate: 44100 +DAC.encoder_dim: 64 +DAC.encoder_rates: [2, 8, 8, 8] +DAC.decoder_dim: 1536 +DAC.decoder_rates: [8, 4, 4, 2, 2, 2] + +# Quantization +DAC.n_codebooks: 19 +DAC.codebook_size: 1024 +DAC.codebook_dim: 8 +DAC.quantizer_dropout: 1.0 diff --git a/conf/downsampling/128x.yml b/conf/downsampling/128x.yml new file mode 100644 index 0000000..cf7d5a4 --- /dev/null +++ b/conf/downsampling/128x.yml @@ -0,0 +1,16 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +# Model setup +DAC.sample_rate: 44100 +DAC.encoder_dim: 64 +DAC.encoder_rates: [2, 4, 4, 4] +DAC.decoder_dim: 1536 +DAC.decoder_rates: [4, 4, 2, 2, 2, 1] + +# Quantization +DAC.n_codebooks: 2 +DAC.codebook_size: 1024 +DAC.codebook_dim: 8 +DAC.quantizer_dropout: 1.0 diff --git a/conf/downsampling/1536x.yml b/conf/downsampling/1536x.yml new file mode 100644 index 0000000..fa695b1 --- /dev/null +++ b/conf/downsampling/1536x.yml @@ -0,0 +1,16 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +# Model setup +DAC.sample_rate: 44100 +DAC.encoder_dim: 96 +DAC.encoder_rates: [2, 8, 8, 12] +DAC.decoder_dim: 1536 +DAC.decoder_rates: [12, 4, 4, 2, 2, 2] + +# Quantization +DAC.n_codebooks: 28 +DAC.codebook_size: 1024 +DAC.codebook_dim: 8 +DAC.quantizer_dropout: 1.0 diff --git a/conf/downsampling/768x.yml b/conf/downsampling/768x.yml new file mode 100644 index 0000000..8100545 --- /dev/null +++ b/conf/downsampling/768x.yml @@ -0,0 +1,16 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +# Model setup +DAC.sample_rate: 44100 +DAC.encoder_dim: 64 +DAC.encoder_rates: [2, 6, 8, 8] +DAC.decoder_dim: 1536 +DAC.decoder_rates: [6, 4, 4, 2, 2, 2] + +# Quantization +DAC.n_codebooks: 14 +DAC.codebook_size: 1024 +DAC.codebook_dim: 8 +DAC.quantizer_dropout: 1.0 diff --git a/conf/quantizer/24kbps.yml b/conf/quantizer/24kbps.yml new file mode 100644 index 0000000..1b2f26a --- /dev/null +++ 
b/conf/quantizer/24kbps.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.n_codebooks: 28 diff --git a/conf/quantizer/256d.yml b/conf/quantizer/256d.yml new file mode 100644 index 0000000..2d958f8 --- /dev/null +++ b/conf/quantizer/256d.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.codebook_dim: 256 diff --git a/conf/quantizer/2d.yml b/conf/quantizer/2d.yml new file mode 100644 index 0000000..aae678e --- /dev/null +++ b/conf/quantizer/2d.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.codebook_dim: 2 diff --git a/conf/quantizer/32d.yml b/conf/quantizer/32d.yml new file mode 100644 index 0000000..24ba180 --- /dev/null +++ b/conf/quantizer/32d.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.codebook_dim: 32 diff --git a/conf/quantizer/4d.yml b/conf/quantizer/4d.yml new file mode 100644 index 0000000..48d5287 --- /dev/null +++ b/conf/quantizer/4d.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.codebook_dim: 4 diff --git a/conf/quantizer/512d.yml b/conf/quantizer/512d.yml new file mode 100644 index 0000000..2a9d9ae --- /dev/null +++ b/conf/quantizer/512d.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.codebook_dim: 512 diff --git a/conf/quantizer/dropout-0.0.yml b/conf/quantizer/dropout-0.0.yml new file mode 100644 index 0000000..93a6577 --- /dev/null +++ b/conf/quantizer/dropout-0.0.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.quantizer_dropout: 0.0 diff --git a/conf/quantizer/dropout-0.25.yml b/conf/quantizer/dropout-0.25.yml new file mode 100644 index 0000000..d0c1ff4 --- /dev/null +++ b/conf/quantizer/dropout-0.25.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.quantizer_dropout: 0.25 diff --git a/conf/quantizer/dropout-0.5.yml b/conf/quantizer/dropout-0.5.yml new file mode 100644 index 0000000..f6682b3 --- /dev/null +++ b/conf/quantizer/dropout-0.5.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.quantizer_dropout: 0.5 diff --git a/conf/size/medium.yml b/conf/size/medium.yml new file mode 100644 index 0000000..5751dec --- /dev/null +++ b/conf/size/medium.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.decoder_dim: 1024 diff --git a/conf/size/small.yml b/conf/size/small.yml new file mode 100644 index 0000000..d67649b --- /dev/null +++ b/conf/size/small.yml @@ -0,0 +1,5 @@ +$include: + - conf/base.yml + - conf/1gpu.yml + +DAC.decoder_dim: 512 diff --git a/dac/__init__.py b/dac/__init__.py new file mode 100644 index 0000000..bb6e173 --- /dev/null +++ b/dac/__init__.py @@ -0,0 +1,10 @@ +__version__ = "0.0.1" +__model_version__ = "0.0.1" +import audiotools + +audiotools.ml.BaseModel.INTERN += ["dac.**"] +audiotools.ml.BaseModel.EXTERN += ["einops"] + + +from . import nn +from . import model diff --git a/dac/__main__.py b/dac/__main__.py new file mode 100644 index 0000000..338b238 --- /dev/null +++ b/dac/__main__.py @@ -0,0 +1,37 @@ +import sys + +import argbind + +from dac.utils import ensure_default_model as download +from dac.utils.decode import decode +from dac.utils.encode import encode + +STAGES = ["encode", "decode", "download"] + + +def run(stage: str): + """Run stages. + + Parameters + ---------- + stage : str + Stage to run + """ + if stage not in STAGES: + raise ValueError(f"Unknown command: {stage}. 
Allowed commands are {STAGES}") + stage_fn = globals()[stage] + + if stage == "download": + stage_fn() + return + + + stage_fn() + + +if __name__ == "__main__": + group = sys.argv.pop(1) + args = argbind.parse_args(group=group) + + with argbind.scope(args): + run(group) diff --git a/dac/compare/__init__.py b/dac/compare/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dac/compare/encodec.py b/dac/compare/encodec.py new file mode 100644 index 0000000..42877de --- /dev/null +++ b/dac/compare/encodec.py @@ -0,0 +1,54 @@ +import torch +from audiotools import AudioSignal +from audiotools.ml import BaseModel +from encodec import EncodecModel + + +class Encodec(BaseModel): + def __init__(self, sample_rate: int = 24000, bandwidth: float = 24.0): + super().__init__() + + if sample_rate == 24000: + self.model = EncodecModel.encodec_model_24khz() + else: + self.model = EncodecModel.encodec_model_48khz() + self.model.set_target_bandwidth(bandwidth) + self.sample_rate = 44100 + + def forward( + self, + audio_data: torch.Tensor, + sample_rate: int = 44100, + n_quantizers: int = None, + ): + signal = AudioSignal(audio_data, sample_rate) + signal.resample(self.model.sample_rate) + recons = self.model(signal.audio_data) + recons = AudioSignal(recons, self.model.sample_rate) + recons.resample(sample_rate) + return {"audio": recons.audio_data} + + +if __name__ == "__main__": + import numpy as np + from functools import partial + + model = Encodec() + + for n, m in model.named_modules(): + o = m.extra_repr() + p = sum([np.prod(p.size()) for p in m.parameters()]) + fn = lambda o, p: o + f" {p/1e6:<.3f}M params." + setattr(m, "extra_repr", partial(fn, o=o, p=p)) + print(model) + print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()])) + + length = 88200 * 2 + x = torch.randn(1, 1, length).to(model.device) + x.requires_grad_(True) + x.retain_grad() + + # Make a forward pass + out = model(x)["audio"] + + print(x.shape, out.shape) diff --git a/dac/model/__init__.py b/dac/model/__init__.py new file mode 100644 index 0000000..94304fd --- /dev/null +++ b/dac/model/__init__.py @@ -0,0 +1,3 @@ +from .base import CodecMixin +from .dac import DAC +from .discriminator import Discriminator diff --git a/dac/model/base.py b/dac/model/base.py new file mode 100644 index 0000000..9da8a9b --- /dev/null +++ b/dac/model/base.py @@ -0,0 +1,116 @@ +import math +from pathlib import Path +from typing import Union + +import torch +import tqdm +from audiotools import AudioSignal + + +class CodecMixin: + EXT = ".dac" + + @torch.no_grad() + def reconstruct( + self, + audio_path_or_signal: Union[str, Path, AudioSignal], + overlap_win_duration: float = 5.0, + overlap_hop_ratio: float = 0.5, + verbose: bool = False, + normalize_db: float = -16, + match_input_db: bool = False, + mono: bool = False, + **kwargs, + ): + """Reconstructs an audio signal from a file or AudioSignal object. + This function decomposes the audio signal into overlapping windows + and reconstructs them one by one. The overlapping windows are then + overlap-and-added together to form the final output. 
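For example, with the defaults ``overlap_win_duration=5.0`` and ``overlap_hop_ratio=0.5``, a long input is split into 5-second windows hopped every 2.5 seconds, each window is passed through ``forward``, and the decoded windows are overlap-added and truncated back to the original length. Inputs shorter than one window are processed in a single pass.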
+ + Parameters + ---------- + audio_path_or_signal : Union[str, Path, AudioSignal] + audio signal to reconstruct + overlap_win_duration : float, optional + overlap window duration in seconds, by default 5.0 + overlap_hop_ratio : float, optional + overlap hop ratio, by default 0.5 + verbose : bool, optional + by default False + normalize_db : float, optional + normalize db, by default -16 + match_input_db : bool, optional + set True to match input db, by default False + mono : bool, optional + set True to convert to mono, by default False + Returns + ------- + AudioSignal + reconstructed audio signal + """ + self.eval() + audio_signal = audio_path_or_signal + if isinstance(audio_signal, (str, Path)): + audio_signal = AudioSignal.load_from_file_with_ffmpeg(str(audio_signal)) + + if mono: + audio_signal = audio_signal.to_mono() + + audio_signal = audio_signal.clone() + audio_signal = audio_signal.ffmpeg_resample(self.sample_rate) + + original_length = audio_signal.signal_length + input_db = audio_signal.ffmpeg_loudness() + + # Fix overlap window so that it's divisible by 4 in # of samples + sr = audio_signal.sample_rate + overlap_win_duration = ((overlap_win_duration * sr) // 4) * 4 + overlap_win_duration = overlap_win_duration / sr + + if normalize_db is not None: + audio_signal.normalize(normalize_db) + audio_signal.ensure_max_of_audio() + overlap_hop_duration = overlap_win_duration * overlap_hop_ratio + do_overlap_and_add = audio_signal.signal_duration > overlap_win_duration + + nb, nac, nt = audio_signal.audio_data.shape + audio_signal.audio_data = audio_signal.audio_data.reshape(nb * nac, 1, nt) + + if do_overlap_and_add: + pad_length = ( + math.ceil(audio_signal.signal_duration / overlap_win_duration) + * overlap_win_duration + ) + audio_signal.zero_pad_to(int(pad_length * sr)) + audio_signal = audio_signal.collect_windows( + overlap_win_duration, overlap_hop_duration + ) + + range_fn = range if not verbose else tqdm.trange + for i in range_fn(audio_signal.batch_size): + signal_from_batch = AudioSignal( + audio_signal.audio_data[i, ...], audio_signal.sample_rate + ) + signal_from_batch.to(self.device) + _output = self.forward( + signal_from_batch.audio_data, signal_from_batch.sample_rate, **kwargs + ) + + _output = _output["audio"].detach() + _output_signal = AudioSignal(_output, self.sample_rate).to(self.device) + audio_signal.audio_data[i] = _output_signal.audio_data.cpu() + + recons = audio_signal + recons._loudness = None + recons.stft_data = None + + if do_overlap_and_add: + recons = recons.overlap_and_add(overlap_hop_duration) + recons.audio_data = recons.audio_data.reshape(nb, nac, -1) + + if match_input_db: + recons.ffmpeg_loudness() + recons = recons.normalize(input_db) + + recons.truncate_samples(original_length) + return recons diff --git a/dac/model/dac.py b/dac/model/dac.py new file mode 100644 index 0000000..f7fd762 --- /dev/null +++ b/dac/model/dac.py @@ -0,0 +1,344 @@ +import math +from typing import List +from typing import Union + +import numpy as np +import torch +from audiotools import AudioSignal +from audiotools.ml import BaseModel +from torch import nn + +from .base import CodecMixin +from dac.nn.layers import Snake1d +from dac.nn.layers import WNConv1d +from dac.nn.layers import WNConvTranspose1d +from dac.nn.quantize import ResidualVectorQuantize + + +def init_weights(m): + if isinstance(m, nn.Conv1d): + nn.init.trunc_normal_(m.weight, std=0.02) + nn.init.constant_(m.bias, 0) + + +class ResidualUnit(nn.Module): + def __init__(self, dim: int = 16, dilation: int = 
1): + super().__init__() + pad = ((7 - 1) * dilation) // 2 + self.block = nn.Sequential( + Snake1d(dim), + WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad), + Snake1d(dim), + WNConv1d(dim, dim, kernel_size=1), + ) + + def forward(self, x): + return x + self.block(x) + + +class EncoderBlock(nn.Module): + def __init__(self, dim: int = 16, stride: int = 1): + super().__init__() + self.block = nn.Sequential( + ResidualUnit(dim // 2, dilation=1), + ResidualUnit(dim // 2, dilation=3), + ResidualUnit(dim // 2, dilation=9), + Snake1d(dim // 2), + WNConv1d( + dim // 2, + dim, + kernel_size=2 * stride, + stride=stride, + padding=stride // 2, + ), + ) + + def forward(self, x): + return self.block(x) + + +class Encoder(nn.Module): + def __init__( + self, + d_model: int = 64, + strides: list = [2, 4, 8, 8], + ): + super().__init__() + # Create first convolution + self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3)] + + # Create EncoderBlocks that double channels as they downsample by `stride` + for stride in strides: + d_model *= 2 + self.block += [EncoderBlock(d_model, stride=stride)] + + # Create last convolution + self.block += [ + Snake1d(d_model), + WNConv1d(d_model, d_model, kernel_size=3, padding=1), + ] + + # Wrap black into nn.Sequential + self.block = nn.Sequential(*self.block) + self.enc_dim = d_model + + def forward(self, x): + return self.block(x) + + +class DecoderBlock(nn.Module): + def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1): + super().__init__() + self.block = nn.Sequential( + Snake1d(input_dim), + WNConvTranspose1d( + input_dim, + output_dim, + kernel_size=2 * stride, + stride=stride, + padding=stride // 2, + ), + ResidualUnit(output_dim, dilation=1), + ResidualUnit(output_dim, dilation=3), + ResidualUnit(output_dim, dilation=9), + ) + + def forward(self, x): + return self.block(x) + + +class Decoder(nn.Module): + def __init__( + self, + input_channel, + channels, + rates, + d_out: int = 1, + ): + super().__init__() + + # Add first conv layer + layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)] + + # Add upsampling + MRF blocks + for i, stride in enumerate(rates): + input_dim = channels // 2**i + output_dim = channels // 2 ** (i + 1) + layers += [DecoderBlock(input_dim, output_dim, stride)] + + # Add final conv layer + layers += [ + Snake1d(output_dim), + WNConv1d(output_dim, d_out, kernel_size=7, padding=3), + nn.Tanh(), + ] + + self.model = nn.Sequential(*layers) + + def forward(self, x): + return self.model(x) + + +class DAC(BaseModel, CodecMixin): + def __init__( + self, + encoder_dim: int = 64, + encoder_rates: List[int] = [2, 4, 8, 8], + decoder_dim: int = 1536, + decoder_rates: List[int] = [8, 8, 4, 2], + n_codebooks: int = 9, + codebook_size: int = 1024, + codebook_dim: Union[int, list] = 8, + quantizer_dropout: bool = False, + sample_rate: int = 44100, + ): + super().__init__() + + self.encoder_dim = encoder_dim + self.encoder_rates = encoder_rates + self.decoder_dim = decoder_dim + self.decoder_rates = decoder_rates + self.sample_rate = sample_rate + + self.hop_length = np.prod(decoder_rates) + self.encoder = Encoder(encoder_dim, encoder_rates) + + self.n_codebooks = n_codebooks + self.codebook_size = codebook_size + self.codebook_dim = codebook_dim + + self.quantizer = ResidualVectorQuantize( + self.encoder.enc_dim, + n_codebooks=n_codebooks, + codebook_size=codebook_size, + codebook_dim=codebook_dim, + quantizer_dropout=quantizer_dropout, + ) + + self.decoder = Decoder( + self.encoder.enc_dim, 
+ decoder_dim, + decoder_rates, + ) + self.sample_rate = sample_rate + self.apply(init_weights) + + def preprocess(self, audio_data, sample_rate): + if sample_rate is None: + sample_rate = self.sample_rate + assert sample_rate == self.sample_rate + + length = audio_data.shape[-1] + right_pad = math.ceil(length / self.hop_length) * self.hop_length - length + audio_data = nn.functional.pad(audio_data, (0, right_pad)) + return audio_data, length + + def encode( + self, + audio_data: torch.Tensor, + sample_rate: int = None, + n_quantizers: int = None, + ): + """Encode given audio data and return quantized latent codes + + Parameters + ---------- + audio_data : Tensor[B x 1 x T] + Audio data to encode + sample_rate : int, optional + Sample rate of audio data in Hz, by default None + If None, defaults to `self.sample_rate` + n_quantizers : int, optional + Number of quantizers to use, by default None + If None, all quantizers are used. + + Returns + ------- + dict + A dictionary with the following keys: + "z" : Tensor[B x D x T] + Quantized continuous representation of input + "codes" : Tensor[B x N x T] + Codebook indices for each codebook + (quantized discrete representation of input) + "latents" : Tensor[B x N*D x T] + Projected latents (continuous representation of input before quantization) + "vq/commitment_loss" : Tensor[1] + Commitment loss to train encoder to predict vectors closer to codebook + entries + "vq/codebook_loss" : Tensor[1] + Codebook loss to update the codebook + "length" : int + Number of samples in input audio + """ + out = {} + audio_data, length = self.preprocess(audio_data, sample_rate) + out["length"] = length + + out["z"] = self.encoder(audio_data) + out.update(self.quantizer(out["z"], n_quantizers)) + return out + + def decode(self, z: torch.Tensor, length: int = None): + """Decode given latent codes and return audio data + + Parameters + ---------- + z : Tensor[B x D x T] + Quantized continuous representation of input + length : int, optional + Number of samples in output audio, by default None + + Returns + ------- + dict + A dictionary with the following keys: + "audio" : Tensor[B x 1 x length] + Decoded audio data. + """ + out = {} + x = self.decoder(z) + out["audio"] = x[..., :length] + return out + + def forward( + self, + audio_data: torch.Tensor, + sample_rate: int = None, + n_quantizers: int = None, + ): + """Model forward pass + + Parameters + ---------- + audio_data : Tensor[B x 1 x T] + Audio data to encode + sample_rate : int, optional + Sample rate of audio data in Hz, by default None + If None, defaults to `self.sample_rate` + n_quantizers : int, optional + Number of quantizers to use, by default None. + If None, all quantizers are used. + + Returns + ------- + dict + A dictionary with the following keys: + "z" : Tensor[B x D x T] + Quantized continuous representation of input + "codes" : Tensor[B x N x T] + Codebook indices for each codebook + (quantized discrete representation of input) + "latents" : Tensor[B x N*D x T] + Projected latents (continuous representation of input before quantization) + "vq/commitment_loss" : Tensor[1] + Commitment loss to train encoder to predict vectors closer to codebook + entries + "vq/codebook_loss" : Tensor[1] + Codebook loss to update the codebook + "length" : int + Number of samples in input audio + "audio" : Tensor[B x 1 x length] + Decoded audio data. 
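Note that ``preprocess`` right-pads the input to a multiple of the model hop length (``np.prod(decoder_rates)``, i.e. 512 samples for the default rates) and ``decode`` truncates the output back to ``length`` samples.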
+ """ + out = {} + out.update(self.encode(audio_data, sample_rate, n_quantizers)) + out.update(self.decode(out["z"], out["length"])) + return out + + +if __name__ == "__main__": + import numpy as np + from functools import partial + + model = DAC() + + for n, m in model.named_modules(): + o = m.extra_repr() + p = sum([np.prod(p.size()) for p in m.parameters()]) + fn = lambda o, p: o + f" {p/1e6:<.3f}M params." + setattr(m, "extra_repr", partial(fn, o=o, p=p)) + print(model) + print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()])) + + length = 88200 * 2 + x = torch.randn(1, 1, length).to(model.device) + x.requires_grad_(True) + x.retain_grad() + + # Make a forward pass + out = model(x)["audio"] + + # Create gradient variable + grad = torch.zeros_like(out) + grad[:, :, grad.shape[-1] // 2] = 1 + + # Make a backward pass + out.backward(grad) + + # Check non-zero values + gradmap = x.grad.squeeze(0) + gradmap = (gradmap != 0).sum(0) # sum across features + rf = (gradmap != 0).sum() + + print(f"Receptive field: {rf.item()}") diff --git a/dac/model/discriminator.py b/dac/model/discriminator.py new file mode 100644 index 0000000..09c79d1 --- /dev/null +++ b/dac/model/discriminator.py @@ -0,0 +1,228 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from audiotools import AudioSignal +from audiotools import ml +from audiotools import STFTParams +from einops import rearrange +from torch.nn.utils import weight_norm + + +def WNConv1d(*args, **kwargs): + act = kwargs.pop("act", True) + conv = weight_norm(nn.Conv1d(*args, **kwargs)) + if not act: + return conv + return nn.Sequential(conv, nn.LeakyReLU(0.1)) + + +def WNConv2d(*args, **kwargs): + act = kwargs.pop("act", True) + conv = weight_norm(nn.Conv2d(*args, **kwargs)) + if not act: + return conv + return nn.Sequential(conv, nn.LeakyReLU(0.1)) + + +class MPD(nn.Module): + def __init__(self, period): + super().__init__() + self.period = period + self.convs = nn.ModuleList( + [ + WNConv2d(1, 32, (5, 1), (3, 1), padding=(2, 0)), + WNConv2d(32, 128, (5, 1), (3, 1), padding=(2, 0)), + WNConv2d(128, 512, (5, 1), (3, 1), padding=(2, 0)), + WNConv2d(512, 1024, (5, 1), (3, 1), padding=(2, 0)), + WNConv2d(1024, 1024, (5, 1), 1, padding=(2, 0)), + ] + ) + self.conv_post = WNConv2d( + 1024, 1, kernel_size=(3, 1), padding=(1, 0), act=False + ) + + def pad_to_period(self, x): + t = x.shape[-1] + x = F.pad(x, (0, self.period - t % self.period), mode="reflect") + return x + + def forward(self, x): + fmap = [] + + x = self.pad_to_period(x) + x = rearrange(x, "b c (l p) -> b c l p", p=self.period) + + for layer in self.convs: + x = layer(x) + fmap.append(x) + + x = self.conv_post(x) + fmap.append(x) + + return fmap + + +class MSD(nn.Module): + def __init__(self, rate: int = 1, sample_rate: int = 44100): + super().__init__() + self.convs = nn.ModuleList( + [ + WNConv1d(1, 16, 15, 1, padding=7), + WNConv1d(16, 64, 41, 4, groups=4, padding=20), + WNConv1d(64, 256, 41, 4, groups=16, padding=20), + WNConv1d(256, 1024, 41, 4, groups=64, padding=20), + WNConv1d(1024, 1024, 41, 4, groups=256, padding=20), + WNConv1d(1024, 1024, 5, 1, padding=2), + ] + ) + self.conv_post = WNConv1d(1024, 1, 3, 1, padding=1, act=False) + self.sample_rate = sample_rate + self.rate = rate + + def forward(self, x): + x = AudioSignal(x, self.sample_rate) + x.resample(self.sample_rate // self.rate) + x = x.audio_data + + fmap = [] + + for l in self.convs: + x = l(x) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + + return fmap + + 
+BANDS = [(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)] + + +class MRD(nn.Module): + def __init__( + self, + window_length: int, + hop_factor: float = 0.25, + sample_rate: int = 44100, + bands: list = BANDS, + ): + """Complex multi-band spectrogram discriminator. + Parameters + ---------- + window_length : int + Window length of STFT. + hop_factor : float, optional + Hop factor of the STFT, defaults to ``0.25 * window_length``. + sample_rate : int, optional + Sampling rate of audio in Hz, by default 44100 + bands : list, optional + Bands to run discriminator over. + """ + super().__init__() + + self.window_length = window_length + self.hop_factor = hop_factor + self.sample_rate = sample_rate + self.stft_params = STFTParams( + window_length=window_length, + hop_length=int(window_length * hop_factor), + match_stride=True, + ) + + n_fft = window_length // 2 + 1 + bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands] + self.bands = bands + + ch = 32 + convs = lambda: nn.ModuleList( + [ + WNConv2d(2, ch, (3, 9), (1, 1), padding=(1, 4)), + WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), + WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), + WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)), + WNConv2d(ch, ch, (3, 3), (1, 1), padding=(1, 1)), + ] + ) + self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))]) + self.conv_post = WNConv2d(ch, 1, (3, 3), (1, 1), padding=(1, 1), act=False) + + def spectrogram(self, x): + x = AudioSignal(x, self.sample_rate, stft_params=self.stft_params) + x = torch.view_as_real(x.stft()) + x = rearrange(x, "b 1 f t c -> (b 1) c t f") + # Split into bands + x_bands = [x[..., b[0] : b[1]] for b in self.bands] + return x_bands + + def forward(self, x): + x_bands = self.spectrogram(x) + fmap = [] + + x = [] + for band, stack in zip(x_bands, self.band_convs): + for layer in stack: + band = layer(band) + fmap.append(band) + x.append(band) + + x = torch.cat(x, dim=-1) + x = self.conv_post(x) + fmap.append(x) + + return fmap + + +class Discriminator(ml.BaseModel): + def __init__( + self, + rates: list = [], + periods: list = [2, 3, 5, 7, 11], + fft_sizes: list = [2048, 1024, 512], + sample_rate: int = 44100, + bands: list = BANDS, + ): + """Discriminator that combines multiple discriminators. + + Parameters + ---------- + rates : list, optional + sampling rates (in Hz) to run MSD at, by default [] + If empty, MSD is not used. 
+ periods : list, optional + periods (of samples) to run MPD at, by default [2, 3, 5, 7, 11] + fft_sizes : list, optional + Window sizes of the FFT to run MRD at, by default [2048, 1024, 512] + sample_rate : int, optional + Sampling rate of audio in Hz, by default 44100 + bands : list, optional + Bands to run MRD at, by default `BANDS` + """ + super().__init__() + discs = [] + discs += [MPD(p) for p in periods] + discs += [MSD(r, sample_rate=sample_rate) for r in rates] + discs += [MRD(f, sample_rate=sample_rate, bands=bands) for f in fft_sizes] + self.discriminators = nn.ModuleList(discs) + + def preprocess(self, y): + # Remove DC offset + y = y - y.mean(dim=-1, keepdims=True) + # Peak normalize the volume of input audio + y = 0.8 * y / (y.abs().max(dim=-1, keepdim=True)[0] + 1e-9) + return y + + def forward(self, x): + x = self.preprocess(x) + fmaps = [d(x) for d in self.discriminators] + return fmaps + + +if __name__ == "__main__": + disc = Discriminator() + x = torch.zeros(1, 1, 44100) + results = disc(x) + for i, result in enumerate(results): + print(f"disc{i}") + for i, r in enumerate(result): + print(r.shape, r.mean(), r.min(), r.max()) + print() diff --git a/dac/nn/__init__.py b/dac/nn/__init__.py new file mode 100644 index 0000000..6718c8b --- /dev/null +++ b/dac/nn/__init__.py @@ -0,0 +1,3 @@ +from . import layers +from . import loss +from . import quantize diff --git a/dac/nn/layers.py b/dac/nn/layers.py new file mode 100644 index 0000000..44fbc29 --- /dev/null +++ b/dac/nn/layers.py @@ -0,0 +1,33 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torch.nn.utils import weight_norm + + +def WNConv1d(*args, **kwargs): + return weight_norm(nn.Conv1d(*args, **kwargs)) + + +def WNConvTranspose1d(*args, **kwargs): + return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) + + +# Scripting this brings model speed up 1.4x +@torch.jit.script +def snake(x, alpha): + shape = x.shape + x = x.reshape(shape[0], shape[1], -1) + x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) + x = x.reshape(shape) + return x + + +class Snake1d(nn.Module): + def __init__(self, channels): + super().__init__() + self.alpha = nn.Parameter(torch.ones(1, channels, 1)) + + def forward(self, x): + return snake(x, self.alpha) diff --git a/dac/nn/loss.py b/dac/nn/loss.py new file mode 100644 index 0000000..9bb3dd6 --- /dev/null +++ b/dac/nn/loss.py @@ -0,0 +1,368 @@ +import typing +from typing import List + +import torch +import torch.nn.functional as F +from audiotools import AudioSignal +from audiotools import STFTParams +from torch import nn + + +class L1Loss(nn.L1Loss): + """L1 Loss between AudioSignals. Defaults + to comparing ``audio_data``, but any + attribute of an AudioSignal can be used. + + Parameters + ---------- + attribute : str, optional + Attribute of signal to compare, defaults to ``audio_data``. + weight : float, optional + Weight of this loss, defaults to 1.0. 
+ + Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/distance.py + """ + + def __init__(self, attribute: str = "audio_data", weight: float = 1.0, **kwargs): + self.attribute = attribute + self.weight = weight + super().__init__(**kwargs) + + def forward(self, x: AudioSignal, y: AudioSignal): + """ + Parameters + ---------- + x : AudioSignal + Estimate AudioSignal + y : AudioSignal + Reference AudioSignal + + Returns + ------- + torch.Tensor + L1 loss between AudioSignal attributes. + """ + if isinstance(x, AudioSignal): + x = getattr(x, self.attribute) + y = getattr(y, self.attribute) + return super().forward(x, y) + + +class SISDRLoss(nn.Module): + """ + Computes the Scale-Invariant Source-to-Distortion Ratio between a batch + of estimated and reference audio signals or aligned features. + + Parameters + ---------- + scaling : int, optional + Whether to use scale-invariant (True) or + signal-to-noise ratio (False), by default True + reduction : str, optional + How to reduce across the batch (either 'mean', + 'sum', or none).], by default ' mean' + zero_mean : int, optional + Zero mean the references and estimates before + computing the loss, by default True + clip_min : int, optional + The minimum possible loss value. Helps network + to not focus on making already good examples better, by default None + weight : float, optional + Weight of this loss, defaults to 1.0. + + Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/distance.py + """ + + def __init__( + self, + scaling: int = True, + reduction: str = "mean", + zero_mean: int = True, + clip_min: int = None, + weight: float = 1.0, + ): + self.scaling = scaling + self.reduction = reduction + self.zero_mean = zero_mean + self.clip_min = clip_min + self.weight = weight + super().__init__() + + def forward(self, x: AudioSignal, y: AudioSignal): + eps = 1e-8 + # nb, nc, nt + if isinstance(x, AudioSignal): + references = x.audio_data + estimates = y.audio_data + else: + references = x + estimates = y + + nb = references.shape[0] + references = references.reshape(nb, 1, -1).permute(0, 2, 1) + estimates = estimates.reshape(nb, 1, -1).permute(0, 2, 1) + + # samples now on axis 1 + if self.zero_mean: + mean_reference = references.mean(dim=1, keepdim=True) + mean_estimate = estimates.mean(dim=1, keepdim=True) + else: + mean_reference = 0 + mean_estimate = 0 + + _references = references - mean_reference + _estimates = estimates - mean_estimate + + references_projection = (_references**2).sum(dim=-2) + eps + references_on_estimates = (_estimates * _references).sum(dim=-2) + eps + + scale = ( + (references_on_estimates / references_projection).unsqueeze(1) + if self.scaling + else 1 + ) + + e_true = scale * _references + e_res = _estimates - e_true + + signal = (e_true**2).sum(dim=1) + noise = (e_res**2).sum(dim=1) + sdr = -10 * torch.log10(signal / noise + eps) + + if self.clip_min is not None: + sdr = torch.clamp(sdr, min=self.clip_min) + + if self.reduction == "mean": + sdr = sdr.mean() + elif self.reduction == "sum": + sdr = sdr.sum() + return sdr + + +class MultiScaleSTFTLoss(nn.Module): + """Computes the multi-scale STFT loss from [1]. 
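Concretely, for each window length the loss adds ``log_weight`` times the ``loss_fn`` distance (L1 by default) between log-magnitude spectrograms (magnitudes are clamped at ``clamp_eps`` and raised to ``pow`` before ``log10``) and ``mag_weight`` times the same distance between the raw magnitudes.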
+ + Parameters + ---------- + window_lengths : List[int], optional + Length of each window of each STFT, by default [2048, 512] + loss_fn : typing.Callable, optional + How to compare each loss, by default nn.L1Loss() + clamp_eps : float, optional + Clamp on the log magnitude, below, by default 1e-5 + mag_weight : float, optional + Weight of raw magnitude portion of loss, by default 1.0 + log_weight : float, optional + Weight of log magnitude portion of loss, by default 1.0 + pow : float, optional + Power to raise magnitude to before taking log, by default 2.0 + weight : float, optional + Weight of this loss, by default 1.0 + match_stride : bool, optional + Whether to match the stride of convolutional layers, by default False + + References + ---------- + + 1. Engel, Jesse, Chenjie Gu, and Adam Roberts. + "DDSP: Differentiable Digital Signal Processing." + International Conference on Learning Representations. 2019. + + Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py + """ + + def __init__( + self, + window_lengths: List[int] = [2048, 512], + loss_fn: typing.Callable = nn.L1Loss(), + clamp_eps: float = 1e-5, + mag_weight: float = 1.0, + log_weight: float = 1.0, + pow: float = 2.0, + weight: float = 1.0, + match_stride: bool = False, + window_type: str = None, + ): + super().__init__() + self.stft_params = [ + STFTParams( + window_length=w, + hop_length=w // 4, + match_stride=match_stride, + window_type=window_type, + ) + for w in window_lengths + ] + self.loss_fn = loss_fn + self.log_weight = log_weight + self.mag_weight = mag_weight + self.clamp_eps = clamp_eps + self.weight = weight + self.pow = pow + + def forward(self, x: AudioSignal, y: AudioSignal): + """Computes multi-scale STFT between an estimate and a reference + signal. + + Parameters + ---------- + x : AudioSignal + Estimate signal + y : AudioSignal + Reference signal + + Returns + ------- + torch.Tensor + Multi-scale STFT loss. + """ + loss = 0.0 + for s in self.stft_params: + x.stft(s.window_length, s.hop_length, s.window_type) + y.stft(s.window_length, s.hop_length, s.window_type) + loss += self.log_weight * self.loss_fn( + x.magnitude.clamp(self.clamp_eps).pow(self.pow).log10(), + y.magnitude.clamp(self.clamp_eps).pow(self.pow).log10(), + ) + loss += self.mag_weight * self.loss_fn(x.magnitude, y.magnitude) + return loss + + +class MelSpectrogramLoss(nn.Module): + """Compute distance between mel spectrograms. Can be used + in a multi-scale way. 
+ + Parameters + ---------- + n_mels : List[int] + Number of mels per STFT, by default [150, 80], + window_lengths : List[int], optional + Length of each window of each STFT, by default [2048, 512] + loss_fn : typing.Callable, optional + How to compare each loss, by default nn.L1Loss() + clamp_eps : float, optional + Clamp on the log magnitude, below, by default 1e-5 + mag_weight : float, optional + Weight of raw magnitude portion of loss, by default 1.0 + log_weight : float, optional + Weight of log magnitude portion of loss, by default 1.0 + pow : float, optional + Power to raise magnitude to before taking log, by default 2.0 + weight : float, optional + Weight of this loss, by default 1.0 + match_stride : bool, optional + Whether to match the stride of convolutional layers, by default False + + Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py + """ + + def __init__( + self, + n_mels: List[int] = [150, 80], + window_lengths: List[int] = [2048, 512], + loss_fn: typing.Callable = nn.L1Loss(), + clamp_eps: float = 1e-5, + mag_weight: float = 1.0, + log_weight: float = 1.0, + pow: float = 2.0, + weight: float = 1.0, + match_stride: bool = False, + mel_fmin: List[float] = [0.0, 0.0], + mel_fmax: List[float] = [None, None], + window_type: str = None, + ): + super().__init__() + self.stft_params = [ + STFTParams( + window_length=w, + hop_length=w // 4, + match_stride=match_stride, + window_type=window_type, + ) + for w in window_lengths + ] + self.n_mels = n_mels + self.loss_fn = loss_fn + self.clamp_eps = clamp_eps + self.log_weight = log_weight + self.mag_weight = mag_weight + self.weight = weight + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.pow = pow + + def forward(self, x: AudioSignal, y: AudioSignal): + """Computes mel loss between an estimate and a reference + signal. + + Parameters + ---------- + x : AudioSignal + Estimate signal + y : AudioSignal + Reference signal + + Returns + ------- + torch.Tensor + Mel loss. + """ + loss = 0.0 + for n_mels, fmin, fmax, s in zip( + self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params + ): + kwargs = { + "window_length": s.window_length, + "hop_length": s.hop_length, + "window_type": s.window_type, + } + x_mels = x.mel_spectrogram(n_mels, mel_fmin=fmin, mel_fmax=fmax, **kwargs) + y_mels = y.mel_spectrogram(n_mels, mel_fmin=fmin, mel_fmax=fmax, **kwargs) + + loss += self.log_weight * self.loss_fn( + x_mels.clamp(self.clamp_eps).pow(self.pow).log10(), + y_mels.clamp(self.clamp_eps).pow(self.pow).log10(), + ) + loss += self.mag_weight * self.loss_fn(x_mels, y_mels) + return loss + + +class GANLoss(nn.Module): + """ + Computes a discriminator loss, given a discriminator on + generated waveforms/spectrograms compared to ground truth + waveforms/spectrograms. Computes the loss for both the + discriminator and the generator in separate functions. 
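A rough usage sketch (instance name chosen for illustration): ``loss_d = gan_loss.discriminator_loss(fake, real)`` gives the term used to update the discriminator, while ``loss_g, loss_feature = gan_loss.generator_loss(fake, real)`` gives the adversarial and feature-matching terms for the generator; ``fake`` and ``real`` are AudioSignals.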
+ """ + + def __init__(self, discriminator): + super().__init__() + self.discriminator = discriminator + + def forward(self, fake, real): + d_fake = self.discriminator(fake.audio_data) + d_real = self.discriminator(real.audio_data) + return d_fake, d_real + + def discriminator_loss(self, fake, real): + d_fake, d_real = self.forward(fake.clone().detach(), real) + + loss_d = 0 + for x_fake, x_real in zip(d_fake, d_real): + loss_d += torch.mean(x_fake[-1] ** 2) + loss_d += torch.mean((1 - x_real[-1]) ** 2) + return loss_d + + def generator_loss(self, fake, real): + d_fake, d_real = self.forward(fake, real) + + loss_g = 0 + for x_fake in d_fake: + loss_g += torch.mean((1 - x_fake[-1]) ** 2) + + loss_feature = 0 + + for i in range(len(d_fake)): + for j in range(len(d_fake[i]) - 1): + loss_feature += F.l1_loss(d_fake[i][j], d_real[i][j].detach()) + return loss_g, loss_feature diff --git a/dac/nn/quantize.py b/dac/nn/quantize.py new file mode 100644 index 0000000..0a2f438 --- /dev/null +++ b/dac/nn/quantize.py @@ -0,0 +1,262 @@ +from typing import Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from torch.nn.utils import weight_norm + +from dac.nn.layers import WNConv1d + + +class VectorQuantize(nn.Module): + """ + Implementation of VQ similar to Karpathy's repo: + https://github.com/karpathy/deep-vector-quantization + Additionally uses following tricks from Improved VQGAN + (https://arxiv.org/pdf/2110.04627.pdf): + 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space + for improved codebook usage + 2. l2-normalized codes: Converts euclidean distance to cosine similarity which + improves training stability + """ + + def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int): + super().__init__() + self.codebook_size = codebook_size + self.codebook_dim = codebook_dim + + self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1) + self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1) + self.codebook = nn.Embedding(codebook_size, codebook_dim) + + def forward(self, z): + """Quantized the input tensor using a fixed codebook and returns + the corresponding codebook vectors + + Parameters + ---------- + z : Tensor[B x D x T] + + Returns + ------- + Tensor[B x D x T] + Quantized continuous representation of input + Tensor[1] + Commitment loss to train encoder to predict vectors closer to codebook + entries + Tensor[1] + Codebook loss to update the codebook + Tensor[B x T] + Codebook indices (quantized discrete representation of input) + Tensor[B x D x T] + Projected latents (continuous representation of input before quantization) + """ + + # Factorized codes (ViT-VQGAN) Project input into low-dimensional space + z_e = self.in_proj(z) # z_e : (B x D x T) + z_q, indices = self.decode_latents(z_e) + + commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2]) + codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2]) + + z_q = ( + z_e + (z_q - z_e).detach() + ) # noop in forward pass, straight-through gradient estimator in backward pass + + z_q = self.out_proj(z_q) + + return z_q, commitment_loss, codebook_loss, indices, z_e + + def embed_code(self, embed_id): + return F.embedding(embed_id, self.codebook.weight) + + def decode_code(self, embed_id): + return self.embed_code(embed_id).transpose(1, 2) + + def decode_latents(self, latents): + encodings = rearrange(latents, "b d t -> (b t) d") + codebook = self.codebook.weight # 
codebook: (N x D) + + # L2 normalize encodings and codebook (ViT-VQGAN) + encodings = F.normalize(encodings) + codebook = F.normalize(codebook) + + # Compute euclidean distance with codebook + dist = ( + encodings.pow(2).sum(1, keepdim=True) + - 2 * encodings @ codebook.t() + + codebook.pow(2).sum(1, keepdim=True).t() + ) + indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0)) + z_q = self.decode_code(indices) + return z_q, indices + + +class ResidualVectorQuantize(nn.Module): + """ + Introduced in SoundStream: An end2end neural audio codec + https://arxiv.org/abs/2107.03312 + """ + + def __init__( + self, + input_dim: int = 512, + n_codebooks: int = 9, + codebook_size: int = 1024, + codebook_dim: Union[int, list] = 8, + quantizer_dropout: float = 0.0, + ): + super().__init__() + if isinstance(codebook_dim, int): + codebook_dim = [codebook_dim for _ in range(n_codebooks)] + + self.n_codebooks = n_codebooks + self.codebook_dim = codebook_dim + self.codebook_size = codebook_size + + self.quantizers = nn.ModuleList( + [ + VectorQuantize(input_dim, codebook_size, codebook_dim[i]) + for i in range(n_codebooks) + ] + ) + self.quantizer_dropout = quantizer_dropout + + def forward(self, z, n_quantizers: int = None): + """Quantized the input tensor using a fixed set of `n` codebooks and returns + the corresponding codebook vectors + Parameters + ---------- + z : Tensor[B x D x T] + n_quantizers : int, optional + No. of quantizers to use + (n_quantizers < self.n_codebooks ex: for quantizer dropout) + Note: if `self.quantizer_dropout` is True, this argument is ignored + when in training mode, and a random number of quantizers is used. + Returns + ------- + dict + A dictionary with the following keys: + + "z" : Tensor[B x D x T] + Quantized continuous representation of input + "codes" : Tensor[B x N x T] + Codebook indices for each codebook + (quantized discrete representation of input) + "latents" : Tensor[B x N*D x T] + Projected latents (continuous representation of input before quantization) + "vq/commitment_loss" : Tensor[1] + Commitment loss to train encoder to predict vectors closer to codebook + entries + "vq/codebook_loss" : Tensor[1] + Codebook loss to update the codebook + """ + z_q = 0 + residual = z + commitment_loss = 0 + codebook_loss = 0 + + codebook_indices = [] + latents = [] + + if n_quantizers is None: + n_quantizers = self.n_codebooks + if self.training: + n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1 + dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],)) + n_dropout = int(z.shape[0] * self.quantizer_dropout) + n_quantizers[:n_dropout] = dropout[:n_dropout] + n_quantizers = n_quantizers.to(z.device) + + for i, quantizer in enumerate(self.quantizers): + z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer( + residual + ) + + # Create mask to apply quantizer dropout + mask = ( + torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers + ) + z_q = z_q + z_q_i * mask[:, None, None] + residual = residual - z_q_i + + # Sum losses + commitment_loss += (commitment_loss_i * mask).mean() + codebook_loss += (codebook_loss_i * mask).mean() + + codebook_indices.append(indices_i) + latents.append(z_e_i) + + return { + "z": z_q, + "codes": torch.stack(codebook_indices, dim=1), + "latents": torch.cat(latents, dim=1), + "vq/commitment_loss": commitment_loss, + "vq/codebook_loss": codebook_loss, + } + + def from_codes(self, codes: torch.Tensor): + """Given the quantized codes, reconstruct the continuous 
representation + Parameters + ---------- + codes : Tensor[B x N x T] + Quantized discrete representation of input + Returns + ------- + Tensor[B x D x T] + Quantized continuous representation of input + """ + z_q = 0.0 + z_p = [] + n_codebooks = codes.shape[1] + for i in range(n_codebooks): + z_p_i = self.quantizers[i].decode_code(codes[:, i, :]) + z_p.append(z_p_i) + + z_q_i = self.quantizers[i].out_proj(z_p_i) + z_q = z_q + z_q_i + return z_q, torch.cat(z_p, dim=1), codes + + def from_latents(self, latents: torch.Tensor): + """Given the unquantized latents, reconstruct the + continuous representation after quantization. + + Parameters + ---------- + latents : Tensor[B x N x T] + Continuous representation of input after projection + + Returns + ------- + Tensor[B x D x T] + Quantized representation of full-projected space + Tensor[B x D x T] + Quantized representation of latent space + """ + z_q = 0 + z_p = [] + codes = [] + dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers]) + + n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[ + 0 + ] + for i in range(n_codebooks): + j, k = dims[i], dims[i + 1] + z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :]) + z_p.append(z_p_i) + codes.append(codes_i) + + z_q_i = self.quantizers[i].out_proj(z_p_i) + z_q = z_q + z_q_i + + return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1) + + +if __name__ == "__main__": + rvq = ResidualVectorQuantize(quantizer_dropout=True) + x = torch.randn(16, 512, 80) + y = rvq(x) + print(y["latents"].shape) diff --git a/dac/utils/__init__.py b/dac/utils/__init__.py new file mode 100644 index 0000000..7ee945d --- /dev/null +++ b/dac/utils/__init__.py @@ -0,0 +1,53 @@ +from pathlib import Path + +from audiotools import ml + +import dac + + +DAC = dac.model.DAC +Accelerator = ml.Accelerator + + +def ensure_default_model(tag: str = dac.__model_version__): + """ + Function that downloads the weights file from URL if a local cache is not + found. + + Args: + tag (str): The tag of the model to download. + """ + download_link = f"https://github.com/descriptinc/descript-audio-codec/releases/download/{tag}/weights.pth" + local_path = Path.home() / ".cache" / "descript" / tag / "dac" / f"weights.pth" + if not local_path.exists(): + local_path.parent.mkdir(parents=True, exist_ok=True) + + # Download the model + import requests + + response = requests.get(download_link) + + if response.status_code != 200: + raise ValueError( + f"Could not download model. 
Received response code {response.status_code}" + ) + local_path.write_bytes(response.content) + + # return the path required by audiotools to load the model + return local_path.parent.parent + + +def load_model( + tag: str, + load_path: str = "", +): + if not load_path: + load_path = ensure_default_model(tag) + kwargs = { + "folder": load_path, + "map_location": "cpu", + "package": False, + } + print(f"Loading weights from {kwargs['folder']}") + generator, _ = DAC.load_from_folder(**kwargs) + return generator diff --git a/dac/utils/decode.py b/dac/utils/decode.py new file mode 100644 index 0000000..02cd5d9 --- /dev/null +++ b/dac/utils/decode.py @@ -0,0 +1,147 @@ +import warnings +from pathlib import Path + +import argbind +import numpy as np +import torch +from audiotools import AudioSignal +from tqdm import tqdm + +import dac +from dac.utils import load_model + +warnings.filterwarnings("ignore", category=UserWarning) + + +@torch.no_grad() +@torch.inference_mode() +def process( + artifacts: dict, + device: str, + generator: torch.nn.Module, + preserve_sample_rate: bool, +) -> AudioSignal: + """Decode encoded audio. The `artifacts` contain codes from chunked windows + of the original audio signal. The codes are decoded one by one and windows are trimmed and concatenated together to form the final output. + + Parameters + ---------- + artifacts : dict + Dictionary of artifacts with the following keys: + - codes: the quantized codes + - metadata: dictionary with following keys + - original_db: the loudness of the input signal + - overlap_hop_duration: the hop duration of the overlap window + - original_length: the original length of the input signal + - is_overlap: whether the input signal was overlapped + - batch_size: the batch size of the input signal + - channels: the number of channels of the input signal + - original_sr: the original sample rate of the input signal + device : str + Device to use + generator : torch.nn.Module + Generator to decode with. + preserve_sample_rate : bool + If True, return audio will have the same sample rate as the original + encoded audio. If False, return audio will have the sample rate of the + generator. 
+ + Returns + ------- + AudioSignal + """ + if isinstance(generator, torch.nn.DataParallel): + generator = generator.module + audio_signal = AudioSignal(artifacts["codes"], generator.sample_rate) + metadata = artifacts["metadata"] + + # Decode chunks + output = [] + for i in range(audio_signal.batch_size): + signal_from_batch = AudioSignal( + audio_signal.audio_data[i, ...], audio_signal.sample_rate, device=device + ) + z_q = generator.quantizer.from_codes(signal_from_batch.audio_data)[0] + audio = generator.decode(z_q)["audio"].cpu() + output.append(audio) + + output = torch.cat(output, dim=0) + output_signal = AudioSignal(output, generator.sample_rate) + + # Overlap and add + if metadata["is_overlap"]: + boundary = int(metadata["overlap_hop_duration"] * generator.sample_rate / 2) + # remove window overlap + output_signal.trim(boundary, boundary) + output_signal.audio_data = output_signal.audio_data.reshape( + metadata["batch_size"], metadata["channels"], -1 + ) + # remove padding + output_signal.trim(boundary, boundary) + + # Restore loudness and truncate to original length + output_signal.ffmpeg_loudness() + output_signal = output_signal.normalize(metadata["original_db"]) + output_signal.truncate_samples(metadata["original_length"]) + + if preserve_sample_rate: + output_signal = output_signal.ffmpeg_resample(metadata["original_sr"]) + + return output_signal.to("cpu") + + +@argbind.bind(group="decode", positional=True, without_prefix=True) +@torch.inference_mode() +@torch.no_grad() +def decode( + input: str, + output: str = "", + weights_path: str = "", + model_tag: str = dac.__model_version__, + preserve_sample_rate: bool = False, + device: str = "cuda", +): + generator = load_model( + tag=model_tag, + load_path=weights_path, + ) + generator.to(device) + generator.eval() + + # Find all .dac files in input directory + _input = Path(input) + input_files = list(_input.glob("**/*.dac")) + + # If input is a .dac file, add it to the list + if _input.suffix == ".dac": + input_files.append(_input) + + # Create output directory + output = Path(output) + output.mkdir(parents=True, exist_ok=True) + + for i in tqdm(range(len(input_files)), desc=f"Decoding files"): + # Load file + artifacts = np.load(input_files[i], allow_pickle=True)[()] + + # Reconstruct audio from codes + recons = process(artifacts, device, generator, preserve_sample_rate) + + # Compute output path + relative_path = input_files[i].relative_to(input) + output_dir = output / relative_path.parent + if not relative_path.name: + output_dir = output + relative_path = input_files[i] + output_name = relative_path.with_suffix(".wav").name + output_path = output_dir / output_name + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Write to file + recons.write(output_path) + + +if __name__ == "__main__": + args = argbind.parse_args() + with argbind.scope(args): + decode() diff --git a/dac/utils/encode.py b/dac/utils/encode.py new file mode 100644 index 0000000..ac904b8 --- /dev/null +++ b/dac/utils/encode.py @@ -0,0 +1,171 @@ +import math +import warnings +from pathlib import Path + +import argbind +import numpy as np +import torch +from audiotools import AudioSignal +from audiotools.core import util +from tqdm import tqdm + +import dac +from dac.utils import load_model + +warnings.filterwarnings("ignore", category=UserWarning) + + +@torch.no_grad() +@torch.inference_mode() +def process( + signal: AudioSignal, device: str, generator: torch.nn.Module, **kwargs +) -> dict: + """Encode an audio signal. 
The signal is chunked into overlapping windows + and encoded one by one. + + Parameters + ---------- + signal : AudioSignal + Input signal to encode + device : str + Device to use + generator : torch.nn.Module + Generator to encode with + + Returns + ------- + dict + Dictionary of artifacts with the following keys: + - codes: the quantized codes + - metadata: dictionary with following keys + - original_db: the loudness of the input signal + - overlap_hop_duration: the hop duration of the overlap window + - original_length: the original length of the input signal + - is_overlap: whether the input signal was overlapped + - batch_size: the batch size of the input signal + - channels: the number of channels of the input signal + - original_sr: the original sample rate of the input signal + + """ + if isinstance(generator, torch.nn.DataParallel): + generator = generator.module + + original_sr = signal.sample_rate + + # Resample input + audio_signal = signal.ffmpeg_resample(generator.sample_rate) + + original_length = audio_signal.signal_length + input_db = audio_signal.ffmpeg_loudness() + + # Set variables + sr = audio_signal.sample_rate + overlap_win_duration = 5.0 + overlap_hop_ratio = 0.5 + + # Fix overlap window so that it's divisible by 4 in # of samples + overlap_win_duration = ((overlap_win_duration * sr) // 4) * 4 + overlap_win_duration = overlap_win_duration / sr + overlap_hop_duration = overlap_win_duration * overlap_hop_ratio + do_overlap_and_add = audio_signal.signal_duration > overlap_win_duration + + # TODO (eeishaan): Remove this when correct caching logic is implemented and + # overlap of codes is minimal + do_overlap_and_add = False + + # Sanitize input + audio_signal.normalize(-16) + audio_signal.ensure_max_of_audio() + + nb, nac, nt = audio_signal.audio_data.shape + audio_signal.audio_data = audio_signal.audio_data.reshape(nb * nac, 1, nt) + + if do_overlap_and_add: + pad_length = ( + math.ceil(audio_signal.signal_duration / overlap_win_duration) + * overlap_win_duration + ) + audio_signal.zero_pad_to(int(pad_length * sr)) + audio_signal = audio_signal.collect_windows( + overlap_win_duration, overlap_hop_duration + ) + + codebook_indices = [] + for i in range(audio_signal.batch_size): + signal_from_batch = AudioSignal( + audio_signal.audio_data[i, ...], audio_signal.sample_rate + ) + signal_from_batch.to(device) + codes = generator.encode( + signal_from_batch.audio_data, signal_from_batch.sample_rate, **kwargs + )["codes"].cpu() + codebook_indices.append(codes) + + codebook_indices = torch.cat(codebook_indices, dim=0) + + return { + "codes": codebook_indices.numpy(), + "metadata": { + "original_db": input_db, + "overlap_hop_duration": overlap_hop_duration, + "original_length": original_length, + "is_overlap": do_overlap_and_add, + "batch_size": nb, + "channels": nac, + "original_sr": original_sr, + }, + } + + +@argbind.bind(group="encode", positional=True, without_prefix=True) +@torch.inference_mode() +@torch.no_grad() +def encode( + input: str, + output: str = "", + weights_path: str = "", + model_tag: str = dac.__model_version__, + n_quantizers: int = None, + device: str = "cuda", +): + generator = load_model( + tag=model_tag, + load_path=weights_path, + ) + generator.to(device) + generator.eval() + kwargs = {"n_quantizers": n_quantizers} + + # Find all audio files in input path + input = Path(input) + audio_files = util.find_audio(input) + + output = Path(output) + output.mkdir(parents=True, exist_ok=True) + + for i in tqdm(range(len(audio_files)), desc="Encoding files"): 
+ # Load file + signal = AudioSignal(audio_files[i]) + + # Encode audio to .dac format + artifacts = process(signal, device, generator, **kwargs) + + # Compute output path + relative_path = audio_files[i].relative_to(input) + output_dir = output / relative_path.parent + if not relative_path.name: + output_dir = output + relative_path = audio_files[i] + output_name = relative_path.with_suffix(".dac").name + output_path = output_dir / output_name + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Write to file + with open(output_path, "wb") as f: + np.save(f, artifacts) + + +if __name__ == "__main__": + args = argbind.parse_args() + with argbind.scope(args): + encode() diff --git a/scripts/compute_entropy.py b/scripts/compute_entropy.py new file mode 100644 index 0000000..a065cfd --- /dev/null +++ b/scripts/compute_entropy.py @@ -0,0 +1,50 @@ +import argbind +import audiotools as at +import numpy as np +import torch +import tqdm + +import dac + + +@argbind.bind(without_prefix=True, positional=True) +def main( + folder: str, + model_path: str, + n_samples: int = 1024, + device: str = "cuda", +): + files = at.util.find_audio(folder)[:n_samples] + signals = [ + at.AudioSignal.salient_excerpt(f, loudness_cutoff=-20, duration=1.0) + for f in files + ] + + with torch.no_grad(): + model = dac.model.DAC.load(model_path).to(device) + model.eval() + + codes = [] + for x in tqdm.tqdm(signals): + x = x.to(model.device) + o = model.encode(x.audio_data, x.sample_rate) + codes.append(o["codes"].cpu()) + + codes = torch.cat(codes, dim=-1) + entropy = [] + + for i in range(codes.shape[1]): + codes_ = codes[0, i, :] + counts = torch.bincount(codes_) + counts = (counts / counts.sum()).clamp(1e-10) + entropy.append(-(counts * counts.log()).sum().item() * np.log2(np.e)) + + pct = sum(entropy) / (10 * len(entropy)) + print(f"Entropy for each codebook: {entropy}") + print(f"Effective percentage: {pct * 100}%") + + +if __name__ == "__main__": + args = argbind.parse_args() + with argbind.scope(args): + main() diff --git a/scripts/evaluate.py b/scripts/evaluate.py new file mode 100644 index 0000000..07cdc3c --- /dev/null +++ b/scripts/evaluate.py @@ -0,0 +1,105 @@ +import csv +import multiprocessing as mp +from concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass +from pathlib import Path + +import argbind +import torch +from audiotools import AudioSignal +from audiotools import metrics +from audiotools.core import util +from audiotools.ml.decorators import Tracker +from train import losses + + +@dataclass +class State: + stft_loss: losses.MultiScaleSTFTLoss + mel_loss: losses.MelSpectrogramLoss + waveform_loss: losses.L1Loss + sisdr_loss: losses.SISDRLoss + + +def get_metrics(signal_path, recons_path, state): + output = {} + signal = AudioSignal(signal_path) + recons = AudioSignal(recons_path) + for sr in [22050, 44100]: + x = signal.clone().resample(sr) + y = recons.clone().resample(sr) + k = "22k" if sr == 22050 else "44k" + output.update( + { + f"mel-{k}": state.mel_loss(x, y), + f"stft-{k}": state.stft_loss(x, y), + f"waveform-{k}": state.waveform_loss(x, y), + f"sisdr-{k}": state.sisdr_loss(x, y), + f"visqol-audio-{k}": metrics.quality.visqol(x, y), + f"visqol-speech-{k}": metrics.quality.visqol(x, y, "speech"), + } + ) + output["path"] = signal.path_to_file + output.update(signal.metadata) + return output + + +@argbind.bind(without_prefix=True) +@torch.no_grad() +def evaluate( + input: str = "samples/input", + output: str = "samples/output", + n_proc: int = 50, +): + 
tracker = Tracker() + + waveform_loss = losses.L1Loss() + stft_loss = losses.MultiScaleSTFTLoss() + mel_loss = losses.MelSpectrogramLoss() + sisdr_loss = losses.SISDRLoss() + + state = State( + waveform_loss=waveform_loss, + stft_loss=stft_loss, + mel_loss=mel_loss, + sisdr_loss=sisdr_loss, + ) + + audio_files = util.find_audio(input) + output = Path(output) + output.mkdir(parents=True, exist_ok=True) + + @tracker.track("metrics", len(audio_files)) + def record(future, writer): + o = future.result() + for k, v in o.items(): + if torch.is_tensor(v): + o[k] = v.item() + writer.writerow(o) + o.pop("path") + return o + + futures = [] + with tracker.live: + with open(output / "metrics.csv", "w") as csvfile: + with ProcessPoolExecutor(n_proc, mp.get_context("fork")) as pool: + for i in range(len(audio_files)): + future = pool.submit( + get_metrics, audio_files[i], output / audio_files[i].name, state + ) + futures.append(future) + + keys = list(futures[0].result().keys()) + writer = csv.DictWriter(csvfile, fieldnames=keys) + writer.writeheader() + + for future in futures: + record(future, writer) + + tracker.done("test", f"N={len(audio_files)}") + + +if __name__ == "__main__": + args = argbind.parse_args() + with argbind.scope(args): + evaluate() diff --git a/scripts/get_samples.py b/scripts/get_samples.py new file mode 100644 index 0000000..b3adad0 --- /dev/null +++ b/scripts/get_samples.py @@ -0,0 +1,95 @@ +from pathlib import Path + +import argbind +import torch +from audiotools import AudioSignal +from audiotools.core import util +from audiotools.ml.decorators import Tracker +from train import Accelerator +from train import DAC + +from dac.compare.encodec import Encodec + +Encodec = argbind.bind(Encodec) + + +def load_state( + accel: Accelerator, + tracker: Tracker, + save_path: str, + tag: str = "latest", + load_weights: bool = False, + model_type: str = "dac", + bandwidth: float = 24.0, +): + kwargs = { + "folder": f"{save_path}/{tag}", + "map_location": "cpu", + "package": not load_weights, + } + tracker.print(f"Resuming from {str(Path('.').absolute())}/{kwargs['folder']}") + + if model_type == "dac": + generator, _ = DAC.load_from_folder(**kwargs) + elif model_type == "encodec": + generator = Encodec(bandwidth=bandwidth) + + generator = accel.prepare_model(generator) + return generator + + +@torch.no_grad() +def process(signal, accel, generator, **kwargs): + signal = signal.to(accel.device) + recons = generator(signal.audio_data, signal.sample_rate, **kwargs)["audio"] + recons = AudioSignal(recons, signal.sample_rate) + recons = recons.normalize(signal.loudness()) + return recons.cpu() + + +@argbind.bind(without_prefix=True) +@torch.no_grad() +def get_samples( + accel, + path: str = "ckpt", + input: str = "samples/input", + output: str = "samples/output", + model_type: str = "dac", + model_tag: str = "latest", + bandwidth: float = 24.0, + n_quantizers: int = None, +): + tracker = Tracker(log_file=f"{path}/eval.txt", rank=accel.local_rank) + generator = load_state( + accel, + tracker, + save_path=path, + model_type=model_type, + bandwidth=bandwidth, + tag=model_tag, + ) + generator.eval() + kwargs = {"n_quantizers": n_quantizers} if model_type == "dac" else {} + + audio_files = util.find_audio(input) + + global process + process = tracker.track("process", len(audio_files))(process) + + output = Path(output) + output.mkdir(parents=True, exist_ok=True) + + with tracker.live: + for i in range(len(audio_files)): + signal = AudioSignal(audio_files[i]) + recons = process(signal, accel, 
generator, **kwargs) + recons.write(output / audio_files[i].name) + + tracker.done("test", f"N={len(audio_files)}") + + +if __name__ == "__main__": + args = argbind.parse_args() + with argbind.scope(args): + with Accelerator() as accel: + get_samples(accel) diff --git a/scripts/organize_daps.py b/scripts/organize_daps.py new file mode 100644 index 0000000..13a43db --- /dev/null +++ b/scripts/organize_daps.py @@ -0,0 +1,97 @@ +import os +import pathlib +import shutil +from collections import defaultdict +from typing import Tuple + +import argbind +import numpy as np +import tqdm +from audiotools import util + + +@argbind.bind() +def split( + audio_files, ratio: Tuple[float, float, float] = (0.8, 0.1, 0.1), seed: int = 0 +): + assert sum(ratio) == 1.0 + util.seed(seed) + + idx = np.arange(len(audio_files)) + np.random.shuffle(idx) + + b = np.cumsum([0] + list(ratio)) * len(idx) + b = [int(_b) for _b in b] + train_idx = idx[b[0] : b[1]] + val_idx = idx[b[1] : b[2]] + test_idx = idx[b[2] :] + + audio_files = np.array(audio_files) + train_files = audio_files[train_idx] + val_files = audio_files[val_idx] + test_files = audio_files[test_idx] + + return train_files, val_files, test_files + + +def assign(val_split, test_split): + def _assign(value): + if value in val_split: + return "val" + if value in test_split: + return "test" + return "train" + + return _assign + + +DAPS_VAL = ["f2", "m2"] +DAPS_TEST = ["f10", "m10"] + + +@argbind.bind(without_prefix=True) +def process( + dataset: str = "daps", + daps_subset: str = "", +): + get_split = None + get_value = lambda path: path + + data_path = pathlib.Path("/data") + dataset_path = data_path / dataset + audio_files = util.find_audio(dataset_path) + + if dataset == "daps": + get_split = assign(DAPS_VAL, DAPS_TEST) + get_value = lambda path: (str(path).split("/")[-1].split("_", maxsplit=4)[0]) + audio_files = [ + x + for x in util.find_audio(dataset_path) + if daps_subset in str(x) and "breaths" not in str(x) + ] + + if get_split is None: + _, val, test = split(audio_files) + get_split = assign(val, test) + + splits = defaultdict(list) + for x in audio_files: + _split = get_split(get_value(x)) + splits[_split].append(x) + + with util.chdir(dataset_path): + for k, v in splits.items(): + v = sorted(v) + print(f"Processing {k} in {dataset_path} of length {len(v)}") + for _v in tqdm.tqdm(v): + tgt_path = pathlib.Path( + str(_v).replace(str(dataset_path), str(dataset_path / k)) + ) + tgt_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copyfile(_v, tgt_path) + + +if __name__ == "__main__": + args = argbind.parse_args() + with argbind.scope(args): + process() diff --git a/scripts/save_test_set.py b/scripts/save_test_set.py new file mode 100644 index 0000000..93fd1b1 --- /dev/null +++ b/scripts/save_test_set.py @@ -0,0 +1,55 @@ +import csv +from pathlib import Path + +import argbind +import torch +from audiotools.core import util +from audiotools.ml.decorators import Tracker +from train import Accelerator + +import scripts.train as train + + +@torch.no_grad() +def process(batch, accel, test_data): + batch = util.prepare_batch(batch, accel.device) + signal = test_data.transform(batch["signal"].clone(), **batch["transform_args"]) + return signal.cpu() + + +@argbind.bind(without_prefix=True) +@torch.no_grad() +def save_test_set(args, accel, sample_rate: int = 44100, output: str = "samples/input"): + tracker = Tracker() + with argbind.scope(args, "test"): + test_data = train.build_dataset(sample_rate) + + global process + process = 
tracker.track("process", len(test_data))(process) + + output = Path(output) + output.mkdir(parents=True, exist_ok=True) + (output.parent / "input").mkdir(parents=True, exist_ok=True) + with open(output / "metadata.csv", "w") as csvfile: + keys = ["path", "original"] + writer = csv.DictWriter(csvfile, fieldnames=keys) + writer.writeheader() + + with tracker.live: + for i in range(len(test_data)): + signal = process(test_data[i], accel, test_data) + input_path = output.parent / "input" / f"sample_{i}.wav" + metadata = { + "path": str(input_path), + "original": str(signal.path_to_input_file), + } + writer.writerow(metadata) + signal.write(input_path) + tracker.done("test", f"N={len(test_data)}") + + +if __name__ == "__main__": + args = argbind.parse_args() + with argbind.scope(args): + with Accelerator() as accel: + save_test_set(args, accel) diff --git a/scripts/train.py b/scripts/train.py new file mode 100644 index 0000000..57e2e41 --- /dev/null +++ b/scripts/train.py @@ -0,0 +1,436 @@ +import os +import sys +import warnings +from dataclasses import dataclass +from pathlib import Path + +import argbind +import torch +from audiotools import AudioSignal +from audiotools import ml +from audiotools.core import util +from audiotools.data import transforms +from audiotools.data.datasets import AudioDataset +from audiotools.data.datasets import AudioLoader +from audiotools.data.datasets import ConcatDataset +from audiotools.ml.decorators import timer +from audiotools.ml.decorators import Tracker +from audiotools.ml.decorators import when +from torch.utils.tensorboard import SummaryWriter + +import dac + +warnings.filterwarnings("ignore", category=UserWarning) + +# Enable cudnn autotuner to speed up training +# (can be altered by the funcs.seed function) +torch.backends.cudnn.benchmark = bool(int(os.getenv("CUDNN_BENCHMARK", 1))) +# Uncomment to trade memory for speed. 
+ +# Optimizers +AdamW = argbind.bind(torch.optim.AdamW, "generator", "discriminator") +Accelerator = argbind.bind(ml.Accelerator, without_prefix=True) + + +@argbind.bind("generator", "discriminator") +def ExponentialLR(optimizer, gamma: float = 1.0): + return torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma) + + +# Models +DAC = argbind.bind(dac.model.DAC) +Discriminator = argbind.bind(dac.model.Discriminator) + +# Data +AudioDataset = argbind.bind(AudioDataset, "train", "val") +AudioLoader = argbind.bind(AudioLoader, "train", "val") + +# Transforms +filter_fn = lambda fn: hasattr(fn, "transform") and fn.__qualname__ not in [ + "BaseTransform", + "Compose", + "Choose", +] +tfm = argbind.bind_module(transforms, "train", "val", filter_fn=filter_fn) + +# Loss +filter_fn = lambda fn: hasattr(fn, "forward") and "Loss" in fn.__name__ +losses = argbind.bind_module(dac.nn.loss, filter_fn=filter_fn) + + +def get_infinite_loader(dataloader): + while True: + for batch in dataloader: + yield batch + + +@argbind.bind("train", "val") +def build_transform( + augment_prob: float = 1.0, + preprocess: list = ["Identity"], + augment: list = ["Identity"], + postprocess: list = ["Identity"], +): + to_tfm = lambda l: [getattr(tfm, x)() for x in l] + preprocess = transforms.Compose(*to_tfm(preprocess), name="preprocess") + augment = transforms.Compose(*to_tfm(augment), name="augment", prob=augment_prob) + postprocess = transforms.Compose(*to_tfm(postprocess), name="postprocess") + transform = transforms.Compose(preprocess, augment, postprocess) + return transform + + +@argbind.bind("train", "val", "test") +def build_dataset( + sample_rate: int, + folders: dict = None, +): + # Give one loader per key/value of dictionary, where + # value is a list of folders. Create a dataset for each one. + # Concatenate the datasets with ConcatDataset, which + # cycles through them. 
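    # For example, a config passed via `--args.load conf.yml` could contain
    # (hypothetical paths; the key names mirror the fixture built in
    # tests/test_train.py):
    #
    #   train/build_dataset.folders:
    #     speech: [/data/train/speech]
    #     music: [/data/train/music]
    #     env: [/data/train/env]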
+ datasets = [] + for _, v in folders.items(): + loader = AudioLoader(sources=v) + transform = build_transform() + dataset = AudioDataset(loader, sample_rate, transform=transform) + datasets.append(dataset) + + dataset = ConcatDataset(datasets) + dataset.transform = transform + return dataset + + +@dataclass +class State: + generator: DAC + optimizer_g: AdamW + scheduler_g: ExponentialLR + + discriminator: Discriminator + optimizer_d: AdamW + scheduler_d: ExponentialLR + + stft_loss: losses.MultiScaleSTFTLoss + mel_loss: losses.MelSpectrogramLoss + gan_loss: losses.GANLoss + waveform_loss: losses.L1Loss + + train_data: AudioDataset + val_data: AudioDataset + + tracker: Tracker + + +@argbind.bind(without_prefix=True) +def load( + args, + accel: ml.Accelerator, + tracker: Tracker, + save_path: str, + resume: bool = False, + tag: str = "latest", + load_weights: bool = False, +): + generator, g_extra = None, {} + discriminator, d_extra = None, {} + + if resume: + kwargs = { + "folder": f"{save_path}/{tag}", + "map_location": "cpu", + "package": not load_weights, + } + tracker.print(f"Resuming from {str(Path('.').absolute())}/{kwargs['folder']}") + if (Path(kwargs["folder"]) / "dac").exists(): + generator, g_extra = DAC.load_from_folder(**kwargs) + if (Path(kwargs["folder"]) / "discriminator").exists(): + discriminator, d_extra = Discriminator.load_from_folder(**kwargs) + + generator = DAC() if generator is None else generator + discriminator = Discriminator() if discriminator is None else discriminator + + generator = accel.prepare_model(generator) + discriminator = accel.prepare_model(discriminator) + + with argbind.scope(args, "generator"): + optimizer_g = AdamW(generator.parameters(), use_zero=accel.use_ddp) + scheduler_g = ExponentialLR(optimizer_g) + with argbind.scope(args, "discriminator"): + optimizer_d = AdamW(discriminator.parameters(), use_zero=accel.use_ddp) + scheduler_d = ExponentialLR(optimizer_d) + + if "optimizer.pth" in g_extra: + optimizer_g.load_state_dict(g_extra["optimizer.pth"]) + if "scheduler.pth" in g_extra: + scheduler_g.load_state_dict(g_extra["scheduler.pth"]) + if "tracker.pth" in g_extra: + tracker.load_state_dict(g_extra["tracker.pth"]) + + if "optimizer.pth" in d_extra: + optimizer_d.load_state_dict(d_extra["optimizer.pth"]) + if "scheduler.pth" in d_extra: + scheduler_d.load_state_dict(d_extra["scheduler.pth"]) + + sample_rate = accel.unwrap(generator).sample_rate + with argbind.scope(args, "train"): + train_data = build_dataset(sample_rate) + with argbind.scope(args, "val"): + val_data = build_dataset(sample_rate) + + waveform_loss = losses.L1Loss() + stft_loss = losses.MultiScaleSTFTLoss() + mel_loss = losses.MelSpectrogramLoss() + gan_loss = losses.GANLoss(discriminator) + + return State( + generator=generator, + optimizer_g=optimizer_g, + scheduler_g=scheduler_g, + discriminator=discriminator, + optimizer_d=optimizer_d, + scheduler_d=scheduler_d, + waveform_loss=waveform_loss, + stft_loss=stft_loss, + mel_loss=mel_loss, + gan_loss=gan_loss, + tracker=tracker, + train_data=train_data, + val_data=val_data, + ) + + +@timer() +@torch.no_grad() +def val_loop(batch, state, accel): + state.generator.eval() + batch = util.prepare_batch(batch, accel.device) + signal = state.val_data.transform( + batch["signal"].clone(), **batch["transform_args"] + ) + + recons = state.generator(signal.audio_data, signal.sample_rate)["audio"] + recons = AudioSignal(recons, signal.sample_rate) + + return { + "loss": state.mel_loss(recons, signal), + "mel/loss": state.mel_loss(recons, 
signal), + "stft/loss": state.stft_loss(recons, signal), + "waveform/loss": state.waveform_loss(recons, signal), + } + + +@timer() +def train_loop(state, batch, accel, lambdas): + state.generator.train() + state.discriminator.train() + output = {} + + batch = util.prepare_batch(batch, accel.device) + with torch.no_grad(): + signal = state.train_data.transform( + batch["signal"].clone(), **batch["transform_args"] + ) + + with accel.autocast(): + out = state.generator(signal.audio_data, signal.sample_rate) + recons = AudioSignal(out["audio"], signal.sample_rate) + commitment_loss = out["vq/commitment_loss"] + codebook_loss = out["vq/codebook_loss"] + + with accel.autocast(): + output["adv/disc_loss"] = state.gan_loss.discriminator_loss(recons, signal) + + state.optimizer_d.zero_grad() + accel.backward(output["adv/disc_loss"]) + accel.scaler.unscale_(state.optimizer_d) + output["other/grad_norm_d"] = torch.nn.utils.clip_grad_norm_( + state.discriminator.parameters(), 10.0 + ) + accel.step(state.optimizer_d) + state.scheduler_d.step() + + with accel.autocast(): + output["stft/loss"] = state.stft_loss(recons, signal) + output["mel/loss"] = state.mel_loss(recons, signal) + output["waveform/loss"] = state.waveform_loss(recons, signal) + ( + output["adv/gen_loss"], + output["adv/feat_loss"], + ) = state.gan_loss.generator_loss(recons, signal) + output["vq/commitment_loss"] = commitment_loss + output["vq/codebook_loss"] = codebook_loss + output["loss"] = sum([v * output[k] for k, v in lambdas.items() if k in output]) + + state.optimizer_g.zero_grad() + accel.backward(output["loss"]) + accel.scaler.unscale_(state.optimizer_g) + output["other/grad_norm"] = torch.nn.utils.clip_grad_norm_( + state.generator.parameters(), 1e3 + ) + accel.step(state.optimizer_g) + state.scheduler_g.step() + accel.update() + + output["other/learning_rate"] = state.optimizer_g.param_groups[0]["lr"] + output["other/batch_size"] = signal.batch_size * accel.world_size + + return {k: v for k, v in sorted(output.items())} + + +def checkpoint(state, save_iters, save_path): + metadata = {"logs": state.tracker.history} + + tags = ["latest"] + state.tracker.print(f"Saving to {str(Path('.').absolute())}") + if state.tracker.is_best("val", "mel/loss"): + state.tracker.print(f"Best generator so far") + tags.append("best") + if state.tracker.step in save_iters: + tags.append(f"{state.tracker.step // 1000}k") + + for tag in tags: + generator_extra = { + "optimizer.pth": state.optimizer_g.state_dict(), + "scheduler.pth": state.scheduler_g.state_dict(), + "tracker.pth": state.tracker.state_dict(), + "metadata.pth": metadata, + } + accel.unwrap(state.generator).metadata = metadata + accel.unwrap(state.generator).save_to_folder( + f"{save_path}/{tag}", generator_extra + ) + discriminator_extra = { + "optimizer.pth": state.optimizer_d.state_dict(), + "scheduler.pth": state.scheduler_d.state_dict(), + } + accel.unwrap(state.discriminator).save_to_folder( + f"{save_path}/{tag}", discriminator_extra + ) + + +@torch.no_grad() +def save_samples(state, val_idx, writer): + state.tracker.print("Saving audio samples to TensorBoard") + state.generator.eval() + + samples = [state.val_data[idx] for idx in val_idx] + batch = state.val_data.collate(samples) + batch = util.prepare_batch(batch, accel.device) + signal = state.train_data.transform( + batch["signal"].clone(), **batch["transform_args"] + ) + + recons = state.generator(signal.audio_data, signal.sample_rate)["audio"] + recons = AudioSignal(recons, signal.sample_rate) + + audio_dict = {"recons": 
recons} + if state.tracker.step == 0: + audio_dict["signal"] = signal + + for k, v in audio_dict.items(): + for nb in range(v.batch_size): + v[nb].cpu().write_audio_to_tb( + f"{k}/sample_{nb}.wav", writer, state.tracker.step + ) + + +def validate(state, val_dataloader, accel): + for batch in val_dataloader: + output = val_loop(batch, state, accel) + # Consolidate state dicts if using ZeroRedundancyOptimizer + if hasattr(state.optimizer_g, "consolidate_state_dict"): + state.optimizer_g.consolidate_state_dict() + state.optimizer_d.consolidate_state_dict() + return output + + +@argbind.bind(without_prefix=True) +def train( + args, + accel: ml.Accelerator, + seed: int = 0, + save_path: str = "ckpt", + num_iters: int = 250000, + save_iters: list = [10000, 50000, 100000, 200000], + sample_freq: int = 10000, + valid_freq: int = 1000, + batch_size: int = 12, + val_batch_size: int = 10, + num_workers: int = 8, + val_idx: list = [0, 1, 2, 3, 4, 5, 6, 7], + lambdas: dict = { + "mel/loss": 100.0, + "adv/feat_loss": 2.0, + "adv/gen_loss": 1.0, + "vq/commitment_loss": 0.25, + "vq/codebook_loss": 1.0, + }, +): + util.seed(seed) + Path(save_path).mkdir(exist_ok=True, parents=True) + writer = ( + SummaryWriter(log_dir=f"{save_path}/logs") if accel.local_rank == 0 else None + ) + tracker = Tracker( + writer=writer, log_file=f"{save_path}/log.txt", rank=accel.local_rank + ) + + state = load(args, accel, tracker, save_path) + train_dataloader = accel.prepare_dataloader( + state.train_data, + start_idx=state.tracker.step * batch_size, + num_workers=num_workers, + batch_size=batch_size, + collate_fn=state.train_data.collate, + ) + train_dataloader = get_infinite_loader(train_dataloader) + val_dataloader = accel.prepare_dataloader( + state.val_data, + start_idx=0, + num_workers=num_workers, + batch_size=val_batch_size, + collate_fn=state.val_data.collate, + persistent_workers=True, + ) + + # Wrap the functions so that they neatly track in TensorBoard + progress bars + # and only run when specific conditions are met. + global train_loop, val_loop, validate, save_samples, checkpoint + train_loop = tracker.log("train", "value", history=False)( + tracker.track("train", num_iters, completed=state.tracker.step)(train_loop) + ) + val_loop = tracker.track("val", len(val_dataloader))(val_loop) + validate = tracker.log("val", "mean")(validate) + + # These functions run only on the 0-rank process + save_samples = when(lambda: accel.local_rank == 0)(save_samples) + checkpoint = when(lambda: accel.local_rank == 0)(checkpoint) + + with tracker.live: + for tracker.step, batch in enumerate(train_dataloader, start=tracker.step): + train_loop(state, batch, accel, lambdas) + + last_iter = ( + tracker.step == num_iters - 1 if num_iters is not None else False + ) + if tracker.step % sample_freq == 0 or last_iter: + save_samples(state, val_idx, writer) + + if tracker.step % valid_freq == 0 or last_iter: + validate(state, val_dataloader, accel) + checkpoint(state, save_iters, save_path) + # Reset validation progress bar, print summary since last validation. 
+ tracker.done("val", f"Iteration {tracker.step}") + + if last_iter: + break + + +if __name__ == "__main__": + args = argbind.parse_args() + args["args.debug"] = int(os.getenv("LOCAL_RANK", 0)) == 0 + with argbind.scope(args): + with Accelerator() as accel: + if accel.local_rank != 0: + sys.tracebacklimit = 0 + train(args, accel) diff --git a/scripts/train_no_adv.py b/scripts/train_no_adv.py new file mode 100644 index 0000000..632d2fe --- /dev/null +++ b/scripts/train_no_adv.py @@ -0,0 +1,388 @@ +# Train without adversarial loss +import os +import sys +import warnings +from dataclasses import dataclass +from pathlib import Path + +import argbind +import torch +from audiotools import AudioSignal +from audiotools import ml +from audiotools.core import util +from audiotools.data import transforms +from audiotools.data.datasets import AudioDataset +from audiotools.data.datasets import AudioLoader +from audiotools.data.datasets import ConcatDataset +from audiotools.ml.decorators import timer +from audiotools.ml.decorators import Tracker +from audiotools.ml.decorators import when +from torch.utils.tensorboard import SummaryWriter + +import dac + +warnings.filterwarnings("ignore", category=UserWarning) + +# Enable cudnn autotuner to speed up training +# (can be altered by the funcs.seed function) +torch.backends.cudnn.benchmark = bool(int(os.getenv("CUDNN_BENCHMARK", 1))) +# Uncomment to trade memory for speed. + +# Optimizers +AdamW = argbind.bind(torch.optim.AdamW, "generator") +Accelerator = argbind.bind(ml.Accelerator, without_prefix=True) + + +@argbind.bind("generator") +def ExponentialLR(optimizer, gamma: float = 1.0): + return torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma) + + +# Models +DAC = argbind.bind(dac.model.DAC) + +# Data +AudioDataset = argbind.bind(AudioDataset, "train", "val") +AudioLoader = argbind.bind(AudioLoader, "train", "val") + +# Transforms +filter_fn = lambda fn: hasattr(fn, "transform") and fn.__qualname__ not in [ + "BaseTransform", + "Compose", + "Choose", +] +tfm = argbind.bind_module(transforms, "train", "val", filter_fn=filter_fn) + +# Loss +filter_fn = lambda fn: hasattr(fn, "forward") and "Loss" in fn.__name__ +losses = argbind.bind_module(dac.nn.loss, filter_fn=filter_fn) + + +def get_infinite_loader(dataloader): + while True: + for batch in dataloader: + yield batch + + +@argbind.bind("train", "val") +def build_transform( + augment_prob: float = 1.0, + preprocess: list = ["Identity"], + augment: list = ["Identity"], + postprocess: list = ["Identity"], +): + to_tfm = lambda l: [getattr(tfm, x)() for x in l] + preprocess = transforms.Compose(*to_tfm(preprocess), name="preprocess") + augment = transforms.Compose(*to_tfm(augment), name="augment", prob=augment_prob) + postprocess = transforms.Compose(*to_tfm(postprocess), name="postprocess") + transform = transforms.Compose(preprocess, augment, postprocess) + return transform + + +@argbind.bind("train", "val", "test") +def build_dataset( + sample_rate: int, + folders: dict = None, +): + # Give one loader per key/value of dictionary, where + # value is a list of folders. Create a dataset for each one. + # Concatenate the datasets with ConcatDataset, which + # cycles through them. 
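    # Note: the `transform` attached to the ConcatDataset below is the one built
    # in the final loop iteration; since every call to build_transform() here runs
    # under the same argbind scope, the instances should be interchangeable.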
+ datasets = [] + for _, v in folders.items(): + loader = AudioLoader(sources=v) + transform = build_transform() + dataset = AudioDataset(loader, sample_rate, transform=transform) + datasets.append(dataset) + + dataset = ConcatDataset(datasets) + dataset.transform = transform + return dataset + + +@dataclass +class State: + generator: DAC + optimizer_g: AdamW + scheduler_g: ExponentialLR + + stft_loss: losses.MultiScaleSTFTLoss + mel_loss: losses.MelSpectrogramLoss + waveform_loss: losses.L1Loss + + train_data: AudioDataset + val_data: AudioDataset + + tracker: Tracker + + +@argbind.bind(without_prefix=True) +def load( + args, + accel: ml.Accelerator, + tracker: Tracker, + save_path: str, + resume: bool = False, + tag: str = "latest", + load_weights: bool = False, +): + generator, g_extra = None, {} + + if resume: + kwargs = { + "folder": f"{save_path}/{tag}", + "map_location": "cpu", + "package": not load_weights, + } + tracker.print(f"Resuming from {str(Path('.').absolute())}/{kwargs['folder']}") + if (Path(kwargs["folder"]) / "dac").exists(): + generator, g_extra = DAC.load_from_folder(**kwargs) + + generator = DAC() if generator is None else generator + generator = accel.prepare_model(generator) + + with argbind.scope(args, "generator"): + optimizer_g = AdamW(generator.parameters(), use_zero=accel.use_ddp) + scheduler_g = ExponentialLR(optimizer_g) + + if "optimizer.pth" in g_extra: + optimizer_g.load_state_dict(g_extra["optimizer.pth"]) + if "scheduler.pth" in g_extra: + scheduler_g.load_state_dict(g_extra["scheduler.pth"]) + if "tracker.pth" in g_extra: + tracker.load_state_dict(g_extra["tracker.pth"]) + + sample_rate = accel.unwrap(generator).sample_rate + with argbind.scope(args, "train"): + train_data = build_dataset(sample_rate) + with argbind.scope(args, "val"): + val_data = build_dataset(sample_rate) + + waveform_loss = losses.L1Loss() + stft_loss = losses.MultiScaleSTFTLoss() + mel_loss = losses.MelSpectrogramLoss() + + return State( + generator=generator, + optimizer_g=optimizer_g, + scheduler_g=scheduler_g, + waveform_loss=waveform_loss, + stft_loss=stft_loss, + mel_loss=mel_loss, + tracker=tracker, + train_data=train_data, + val_data=val_data, + ) + + +@timer() +@torch.no_grad() +def val_loop(batch, state, accel): + state.generator.eval() + batch = util.prepare_batch(batch, accel.device) + signal = state.val_data.transform( + batch["signal"].clone(), **batch["transform_args"] + ) + + recons = state.generator(signal.audio_data, signal.sample_rate)["audio"] + recons = AudioSignal(recons, signal.sample_rate) + + return { + "loss": state.mel_loss(recons, signal), + "mel/loss": state.mel_loss(recons, signal), + "stft/loss": state.stft_loss(recons, signal), + "waveform/loss": state.waveform_loss(recons, signal), + } + + +@timer() +def train_loop(state, batch, accel, lambdas): + state.generator.train() + output = {} + + batch = util.prepare_batch(batch, accel.device) + with torch.no_grad(): + signal = state.train_data.transform( + batch["signal"].clone(), **batch["transform_args"] + ) + + with accel.autocast(): + out = state.generator(signal.audio_data, signal.sample_rate) + recons = AudioSignal(out["audio"], signal.sample_rate) + commitment_loss = out["vq/commitment_loss"] + codebook_loss = out["vq/codebook_loss"] + + with accel.autocast(): + output["stft/loss"] = state.stft_loss(recons, signal) + output["mel/loss"] = state.mel_loss(recons, signal) + output["waveform/loss"] = state.waveform_loss(recons, signal) + output["vq/commitment_loss"] = commitment_loss + 
output["vq/codebook_loss"] = codebook_loss + output["loss"] = sum([v * output[k] for k, v in lambdas.items() if k in output]) + + state.optimizer_g.zero_grad() + accel.backward(output["loss"]) + accel.scaler.unscale_(state.optimizer_g) + output["other/grad_norm"] = torch.nn.utils.clip_grad_norm_( + state.generator.parameters(), 1e3 + ) + accel.step(state.optimizer_g) + state.scheduler_g.step() + accel.update() + + output["other/learning_rate"] = state.optimizer_g.param_groups[0]["lr"] + output["other/batch_size"] = signal.batch_size * accel.world_size + + return {k: v for k, v in sorted(output.items())} + + +def checkpoint(state, save_iters, save_path): + metadata = {"logs": state.tracker.history} + + tags = ["latest"] + state.tracker.print(f"Saving to {str(Path('.').absolute())}") + if state.tracker.is_best("val", "mel/loss"): + state.tracker.print(f"Best generator so far") + tags.append("best") + if state.tracker.step in save_iters: + tags.append(f"{state.tracker.step // 1000}k") + + for tag in tags: + generator_extra = { + "optimizer.pth": state.optimizer_g.state_dict(), + "scheduler.pth": state.scheduler_g.state_dict(), + "tracker.pth": state.tracker.state_dict(), + "metadata.pth": metadata, + } + accel.unwrap(state.generator).metadata = metadata + accel.unwrap(state.generator).save_to_folder( + f"{save_path}/{tag}", generator_extra + ) + + +@torch.no_grad() +def save_samples(state, val_idx, writer): + state.tracker.print("Saving audio samples to TensorBoard") + state.generator.eval() + + samples = [state.val_data[idx] for idx in val_idx] + batch = state.val_data.collate(samples) + batch = util.prepare_batch(batch, accel.device) + signal = state.train_data.transform( + batch["signal"].clone(), **batch["transform_args"] + ) + + recons = state.generator(signal.audio_data, signal.sample_rate)["audio"] + recons = AudioSignal(recons, signal.sample_rate) + + audio_dict = {"recons": recons} + if state.tracker.step == 0: + audio_dict["signal"] = signal + + for k, v in audio_dict.items(): + for nb in range(v.batch_size): + v[nb].cpu().write_audio_to_tb( + f"{k}/sample_{nb}.wav", writer, state.tracker.step + ) + + +def validate(state, val_dataloader, accel): + for batch in val_dataloader: + output = val_loop(batch, state, accel) + # Consolidate state dicts if using ZeroRedundancyOptimizer + if hasattr(state.optimizer_g, "consolidate_state_dict"): + state.optimizer_g.consolidate_state_dict() + state.optimizer_d.consolidate_state_dict() + return output + + +@argbind.bind(without_prefix=True) +def train( + args, + accel: ml.Accelerator, + seed: int = 0, + save_path: str = "ckpt", + num_iters: int = 250000, + save_iters: list = [10000, 50000, 100000, 200000], + sample_freq: int = 10000, + valid_freq: int = 1000, + batch_size: int = 12, + val_batch_size: int = 10, + num_workers: int = 8, + val_idx: list = [0, 1, 2, 3, 4, 5, 6, 7], + lambdas: dict = { + "mel/loss": 100.0, + "adv/feat_loss": 2.0, + "adv/gen_loss": 1.0, + "vq/commitment_loss": 0.25, + "vq/codebook_loss": 1.0, + }, +): + util.seed(seed) + Path(save_path).mkdir(exist_ok=True, parents=True) + writer = ( + SummaryWriter(log_dir=f"{save_path}/logs") if accel.local_rank == 0 else None + ) + tracker = Tracker( + writer=writer, log_file=f"{save_path}/log.txt", rank=accel.local_rank + ) + + state = load(args, accel, tracker, save_path) + train_dataloader = accel.prepare_dataloader( + state.train_data, + start_idx=state.tracker.step * batch_size, + num_workers=num_workers, + batch_size=batch_size, + collate_fn=state.train_data.collate, + ) + 
train_dataloader = get_infinite_loader(train_dataloader) + val_dataloader = accel.prepare_dataloader( + state.val_data, + start_idx=0, + num_workers=num_workers, + batch_size=val_batch_size, + collate_fn=state.val_data.collate, + persistent_workers=True, + ) + + # Wrap the functions so that they neatly track in TensorBoard + progress bars + # and only run when specific conditions are met. + global train_loop, val_loop, validate, save_samples, checkpoint + train_loop = tracker.log("train", "value", history=False)( + tracker.track("train", num_iters, completed=state.tracker.step)(train_loop) + ) + val_loop = tracker.track("val", len(val_dataloader))(val_loop) + validate = tracker.log("val", "mean")(validate) + + # These functions run only on the 0-rank process + save_samples = when(lambda: accel.local_rank == 0)(save_samples) + checkpoint = when(lambda: accel.local_rank == 0)(checkpoint) + + with tracker.live: + for tracker.step, batch in enumerate(train_dataloader, start=tracker.step): + train_loop(state, batch, accel, lambdas) + + last_iter = ( + tracker.step == num_iters - 1 if num_iters is not None else False + ) + if tracker.step % sample_freq == 0 or last_iter: + save_samples(state, val_idx, writer) + + if tracker.step % valid_freq == 0 or last_iter: + validate(state, val_dataloader, accel) + checkpoint(state, save_iters, save_path) + # Reset validation progress bar, print summary since last validation. + tracker.done("val", f"Iteration {tracker.step}") + + if last_iter: + break + + +if __name__ == "__main__": + args = argbind.parse_args() + args["args.debug"] = int(os.getenv("LOCAL_RANK", 0)) == 0 + with argbind.scope(args): + with Accelerator() as accel: + if accel.local_rank != 0: + sys.tracebacklimit = 0 + train(args, accel) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..bcab119 --- /dev/null +++ b/setup.py @@ -0,0 +1,56 @@ +from setuptools import find_packages +from setuptools import setup + +with open("README.md") as f: + long_description = f.read() + +setup( + name="dac", + version="0.0.1", + classifiers=[ + "Intended Audience :: Developers", + "Natural Language :: English", + "Programming Language :: Python :: 3.7", + "Topic :: Artistic Software", + "Topic :: Multimedia", + "Topic :: Multimedia :: Sound/Audio", + "Topic :: Multimedia :: Sound/Audio :: Editors", + "Topic :: Software Development :: Libraries", + ], + description="A high-quality general neural audio codec.", + long_description=long_description, + long_description_content_type="text/markdown", + author="Prem Seetharaman, Rithesh Kumar", + author_email="prem@descript.com", + url="https://github.com/descriptinc/descript-audio-codec", + license="MIT", + packages=find_packages(), + keywords=["audio", "compression", "machine learning"], + install_requires=[ + "argbind>=0.3.7", + "audiotools @ git+https://github.com/descriptinc/audiotools.git@0.7.0", + "einops", + "numpy", + "torch", + "torchaudio", + "tqdm", + ], + extras_require={ + "dev": [ + "pytest", + "pytest-cov", + "pynvml", + "psutil", + "pandas", + "onnx", + "onnx-simplifier", + "seaborn", + "jupyterlab", + "pandas", + "watchdog", + "pesq", + "tabulate", + "encodec", + ], + }, +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..fa3def9 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,53 @@ +""" +Tests for CLI. 
+""" +import subprocess +from pathlib import Path + +import argbind +import numpy as np +from audiotools import AudioSignal + +from dac.__main__ import run + + +def setup_module(module): + data_dir = Path(__file__).parent / "assets" + data_dir.mkdir(exist_ok=True, parents=True) + input_dir = data_dir / "input" + input_dir.mkdir(exist_ok=True, parents=True) + + for i in range(5): + signal = AudioSignal(np.random.randn(1000), 44_100) + signal.write(input_dir / f"sample_{i}.wav") + return input_dir + + +def teardown_module(module): + repo_root = Path(__file__).parent.parent + subprocess.check_output(["rm", "-rf", f"{repo_root}/tests/assets"]) + + +def test_reconstruction(): + # Test encoding + input_dir = Path(__file__).parent / "assets" / "input" + output_dir = input_dir.parent / "encoded_output" + args = { + "input": str(input_dir), + "output": str(output_dir), + } + with argbind.scope(args): + run("encode") + + # Test decoding + input_dir = output_dir + output_dir = input_dir.parent / "decoded_output" + args = { + "input": str(input_dir), + "output": str(output_dir), + } + with argbind.scope(args): + run("decode") + + +# CUDA_VISIBLE_DEVICES=0 python -m pytest tests/test_cli.py -s diff --git a/tests/test_train.py b/tests/test_train.py new file mode 100644 index 0000000..addb8ae --- /dev/null +++ b/tests/test_train.py @@ -0,0 +1,95 @@ +""" +Tests for CLI. +""" +import os +import shlex +import subprocess +from pathlib import Path + +import argbind +import numpy as np +from audiotools import AudioSignal + +from dac.__main__ import run + + +def make_fake_data(data_dir=Path(__file__).parent / "assets"): + data_dir.mkdir(exist_ok=True, parents=True) + input_dir = data_dir / "input" + input_dir.mkdir(exist_ok=True, parents=True) + + for i in range(100): + signal = AudioSignal(np.random.randn(44_100 * 5), 44_100) + signal.write(input_dir / f"sample_{i}.wav") + return input_dir + + +def make_fake_data_tree(): + data_dir = Path(__file__).parent / "assets" + + for relative_dir in [ + "train/speech", + "train/music", + "train/env", + "val/speech", + "val/music", + "val/env", + "test/speech", + "test/music", + "test/env", + ]: + leaf_dir = data_dir / relative_dir + leaf_dir.mkdir(exist_ok=True, parents=True) + make_fake_data(leaf_dir) + return { + split: { + key: [str(data_dir / f"{split}/{key}")] + for key in ["speech", "music", "env"] + } + for split in ["train", "val", "test"] + } + + +def setup_module(module): + # Make fake dataset dir + input_datasets = make_fake_data_tree() + repo_root = Path(__file__).parent.parent + + # Load baseline conf and modify it for testing + conf = argbind.load_args(repo_root / "conf" / "ablations" / "baseline.yml") + + for key in ["train", "val", "test"]: + conf[f"{key}/build_dataset.folders"] = input_datasets[key] + conf["num_iters"] = 1 + conf["val/AudioDataset.n_examples"] = 1 + conf["val_idx"] = [0] + conf["val_batch_size"] = 1 + + argbind.dump_args(conf, Path(__file__).parent / "assets" / "conf.yml") + + +def teardown_module(module): + repo_root = Path(__file__).parent.parent + # Remove fake dataset dir + subprocess.check_output(["rm", "-rf", f"{repo_root}/tests/assets"]) + subprocess.check_output(["rm", "-rf", f"{repo_root}/tests/runs"]) + + +def test_single_gpu_train(): + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = "0" + repo_root = Path(__file__).parent.parent + args = shlex.split( + f"python {repo_root}/scripts/train.py --args.load {repo_root}/tests/assets/conf.yml --save_path {repo_root}/tests/runs/baseline" + ) + subprocess.check_output(args, 
env=env) + + +def test_multi_gpu_train(): + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = "0,1" + repo_root = Path(__file__).parent.parent + args = shlex.split( + f"torchrun --nproc_per_node gpu {repo_root}/scripts/train.py --args.load {repo_root}/tests/assets/conf.yml --save_path {repo_root}/tests/runs/baseline_multigpu" + ) + subprocess.check_output(args, env=env)
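# For reference, the encode/decode CLIs exercised by these tests reduce to the
# round trip sketched below. This is an editorial sketch, not code from the
# repository: it assumes a CUDA device, a mono input file, and placeholder paths
# ("input.wav", "output.wav"), and it skips the loudness, windowing, and
# resampling bookkeeping that dac/utils/encode.py and decode.py perform.
import torch
from audiotools import AudioSignal

import dac
from dac.utils import load_model

generator = load_model(tag=dac.__model_version__)
generator.to("cuda")
generator.eval()

with torch.no_grad():
    # Resample to the model's sample rate, as encode.py does
    signal = AudioSignal("input.wav").ffmpeg_resample(generator.sample_rate)
    signal.to("cuda")

    # Quantized codes, shaped [B x n_codebooks x T]
    codes = generator.encode(signal.audio_data, signal.sample_rate)["codes"]

    # Map codes back to the continuous latent and decode, as decode.py does
    z_q = generator.quantizer.from_codes(codes)[0]
    recons = AudioSignal(generator.decode(z_q)["audio"].cpu(), generator.sample_rate)
    recons.write("output.wav")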