tools: use pytorch/nnAudio for data preparation's CQT
Reduce the data preparation CQT step duration using nnAudio's
implementation.

$ time python3 -m tools.extract_csi_features data/covers80_testset

before:
real    1m31,340s
user    17m35,827s
sys     0m10,517s

after (with cuda):
real    0m10,519s
user    1m33,332s
sys     0m10,304s
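
For reference, the core of the speed-up is nnAudio's GPU-capable CQT. Below is a minimal standalone sketch (not part of this commit) of that call, using the same parameters as the new worker (16 kHz input, 96 bins, hop 640, fmin 32 Hz); the input file name is hypothetical:

import torch
import torchaudio
from nnAudio.features.cqt import CQT

device = "cuda" if torch.cuda.is_available() else "cpu"

# load a hypothetical 16 kHz wav file
signal, sr = torchaudio.load("example_16khz.wav")
signal = signal.to(device)

# same CQT settings as tools/extract_csi_features.py
cqt_layer = CQT(sr=16000, hop_length=640, n_bins=96, fmin=32, verbose=False).to(device)
spec = cqt_layer(signal)  # shape: (channels, n_bins, frames)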

A new device parameter is added to data/covers80_testset/hparams.yaml to
select the device, similar to the train parameter. This is not ideal, but
when I tried auto-detecting the device, I hit this error:
> Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
It can be improved later.

The librosa implementation is kept as it is faster when the device is
set to "cpu".
samuel-gauthier authored and alanngnet committed Apr 3, 2024
1 parent 21d2237 commit 903a62e
Showing 4 changed files with 61 additions and 7 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -10,9 +10,10 @@ See https://ar5iv.labs.arxiv.org/html/2306.09025 for the July 2023 research paper
    1. Apple computer with an Apple M-series chip
    2. Other computer with an Nvidia GPU (including free cloud options like Google Colab)
 2. python3 (minimum version 3.10, tested on 3.11) with these libraries:
-   1. torch (compiled with CUDA or MPS or other GPU support as appropriate for your hardware)
+   1. torch torchaudio (compiled with CUDA or MPS or other GPU support as appropriate for your hardware)
    2. librosa
-   3. Optional: tensorboard
+   3. nnAudio
+   4. Optional: tensorboard
 3. sox
 
 # Usage
1 change: 1 addition & 0 deletions data/covers80/hparams.yaml
@@ -6,6 +6,7 @@
    "name": "cqt_with_asr_noise",
    "noise_path": "dataset/asr_as_noise/dataset.txt"
 }
+"device": "mps"
 "train-sample_data_split": 0.1 # percent of training data to reserve for validation aka "train-sample"
 "train-sample_unseen": 0.02 # percent of song_ids from training data to reserve exclusively for validation aka "train-sample"
 "test_data_split": 0.1 # percent of training data to reserve for test aka "dev"
1 change: 1 addition & 0 deletions data/covers80_testset/hparams.yaml
@@ -1,3 +1,4 @@
+"device": "mps"
 "aug_speed_mode": [1.0]
 "train-sample_data_split": 0 # percent of training data to reserve for validation aka "train-sample"
 "train-sample_unseen": 0 # percent of song_ids from training data to reserve exclusively for validation aka "train-sample"
61 changes: 56 additions & 5 deletions tools/extract_csi_features.py
@@ -12,8 +12,10 @@
 from concurrent.futures import ProcessPoolExecutor
 
 import librosa
+from nnAudio.features.cqt import CQT, CQT2010v2
 import numpy as np
 import torch
+import torchaudio
 
 from src.cqt import PyCqt
 from src.dataset import SignalAug
@@ -148,7 +150,7 @@ def _speed_aug_parallel(init_path, aug_speed_lst, aug_path, sp_dir) -> None:
 
 # instead of original serial function,
 # leverage multiple CPU cores to run multiple CQT extractions in parallel
-def _extract_cqt_worker(args):
+def _extract_cqt_worker_librosa(args):
     """worker function for _extract_cqt_parallel"""
     line, cqt_dir = args
     wav_path = line["wav"]
@@ -168,18 +170,67 @@ def _extract_cqt_worker(args):
     return line
 
 
-def _extract_cqt_parallel(init_path, out_path, cqt_dir) -> None:
+def _extract_cqt_worker_torchaudio(args):
+    line, cqt_dir, device = args
+    wav_path = line["wav"]
+    feat_path = os.path.join(cqt_dir, "{}.cqt.npy".format(line["utt"]))
+
+    # CQT seems faster on mps, and CQT2010v2 faster on cuda
+    if device == "mps":
+        transform = CQT
+    elif device == "cuda":
+        transform = CQT2010v2
+
+    if not os.path.exists(feat_path):
+        signal, sr = torchaudio.load(wav_path)
+        signal = signal.to(device)
+        signal = (
+            signal
+            / torch.max(torch.tensor(0.001).to(device), torch.max(torch.abs(signal)))
+            * 0.999
+        )
+        signal = transform(16000, hop_length=640, n_bins=96, fmin=32, verbose=False).to(
+            device
+        )(signal)
+        signal = signal + 1e-9
+        signal = signal.squeeze(0)
+
+        # Add contrast
+        ref_value_log10 = torch.log10(torch.max(signal))
+        signal = 20 * torch.log10(signal) - 20 * ref_value_log10
+
+        signal = torch.swapaxes(signal, 0, 1)
+        cqt = signal.numpy(force=True)
+        np.save(feat_path, cqt)
+        feat_len = len(cqt)
+    else:
+        feat_len = len(np.load(feat_path))
+    line["feat"] = feat_path
+    line["feat_len"] = feat_len
+    return line
+
+
+def worker(args):
+    line, cqt_dir, device = args
+
+    if device in ("mps", "cuda"):
+        return _extract_cqt_worker_torchaudio(args)
+
+    return _extract_cqt_worker_librosa((line, cqt_dir))
+
+
+def _extract_cqt_parallel(init_path, out_path, cqt_dir, device) -> None:
     logging.info("Extract CQT features")
     os.makedirs(cqt_dir, exist_ok=True)
     dump_lines = []
 
     with ProcessPoolExecutor() as executor:
         worker_args = [
-            (line_to_dict(line), cqt_dir)
+            (line_to_dict(line), cqt_dir, device)
             for line in read_lines(init_path, log=False)
         ]
 
-        for result in executor.map(_extract_cqt_worker, worker_args):
+        for result in executor.map(worker, worker_args):
             dump_lines.append(dict_to_line(result))
             if len(dump_lines) % 1000 == 0:
                 logging.info(
@@ -576,7 +627,7 @@ def _generate_csi_features(hp, feat_dir, start_stage, end_stage) -> None:
         # Got it to run w/o errors but output wasn't quite right,
         # and speed was 7 min 28 s for covers80,
         # compared to 2 min 5 s with CPU-based _extract_cqt_parallel()
-        _extract_cqt_parallel(sp_aug_path, full_path, cqt_dir)
+        _extract_cqt_parallel(sp_aug_path, full_path, cqt_dir, hp["device"])
 
         # noise augmentation was default off for CoverHunter
         hp_noise = hp.get("add_noise", None)
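
Usage note (not part of this commit): after setting the new "device" key in hparams.yaml to "cuda" (or "mps" on Apple hardware), the same command shown in the commit message above takes the nnAudio path; setting it to "cpu" keeps the original librosa implementation.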
