tools: use pytorch/nnAudio for data preparation's CQT
Reduce the data preparation CQT step duration using nnAudio's
implementation.

$ time python3 -m tools.extract_csi_features data/covers80_testset

before:
real    1m31,340s
user    17m35,827s
sys     0m10,517s

after (with cuda):
real    0m10,519s
user    1m33,332s
sys     0m10,304s
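
For reference, the core of the speed-up is nnAudio's GPU-capable CQT. Below is a minimal standalone sketch (not part of this commit) of that call, using the same parameters as the new worker (16 kHz input, 96 bins, hop 640, fmin 32 Hz); the input file name is hypothetical:

import torch
import torchaudio
from nnAudio.features.cqt import CQT

device = "cuda" if torch.cuda.is_available() else "cpu"

# load a hypothetical 16 kHz wav file
signal, sr = torchaudio.load("example_16khz.wav")
signal = signal.to(device)

# same CQT settings as tools/extract_csi_features.py
cqt_layer = CQT(sr=16000, hop_length=640, n_bins=96, fmin=32, verbose=False).to(device)
spec = cqt_layer(signal)  # shape: (channels, n_bins, frames)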

A new device parameter is added to data/covers80_testset/hparams.yaml to
select the device, similar to the train parameter. This is not ideal, but
when I tried auto-detecting the device, I hit this error:
> Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
It can be improved later.

The librosa implementation is kept as it is faster when the device is
set to "cpu".
samuel-gauthier authored and alanngnet committed Apr 3, 2024
1 parent 21d2237 commit 903a62e
Showing 4 changed files with 61 additions and 7 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -10,9 +10,10 @@ See https://ar5iv.labs.arxiv.org/html/2306.09025 for the July 2023 research paper
    1. Apple computer with an Apple M-series chip
    2. Other computer with an Nvidia GPU (including free cloud options like Google Colab)
 2. python3 (minimum version 3.10, tested on 3.11) with these libraries:
-   1. torch (compiled with CUDA or MPS or other GPU support as appropriate for your hardware)
+   1. torch torchaudio (compiled with CUDA or MPS or other GPU support as appropriate for your hardware)
    2. librosa
-   3. Optional: tensorboard
+   3. nnAudio
+   4. Optional: tensorboard
 3. sox
 
 # Usage
1 change: 1 addition & 0 deletions data/covers80/hparams.yaml
@@ -6,6 +6,7 @@
    "name": "cqt_with_asr_noise",
    "noise_path": "dataset/asr_as_noise/dataset.txt"
 }
+"device": "mps"
 "train-sample_data_split": 0.1 # percent of training data to reserve for validation aka "train-sample"
 "train-sample_unseen": 0.02 # percent of song_ids from training data to reserve exclusively for validation aka "train-sample"
 "test_data_split": 0.1 # percent of training data to reserve for test aka "dev"
1 change: 1 addition & 0 deletions data/covers80_testset/hparams.yaml
@@ -1,3 +1,4 @@
+"device": "mps"
 "aug_speed_mode": [1.0]
 "train-sample_data_split": 0 # percent of training data to reserve for validation aka "train-sample"
 "train-sample_unseen": 0 # percent of song_ids from training data to reserve exclusively for validation aka "train-sample"
61 changes: 56 additions & 5 deletions tools/extract_csi_features.py
@@ -12,8 +12,10 @@
 from concurrent.futures import ProcessPoolExecutor
 
 import librosa
+from nnAudio.features.cqt import CQT, CQT2010v2
 import numpy as np
 import torch
+import torchaudio
 
 from src.cqt import PyCqt
 from src.dataset import SignalAug
@@ -148,7 +150,7 @@ def _speed_aug_parallel(init_path, aug_speed_lst, aug_path, sp_dir) -> None:
 
 # instead of original serial function,
 # leverage multiple CPU cores to run multiple CQT extractions in parallel
-def _extract_cqt_worker(args):
+def _extract_cqt_worker_librosa(args):
     """worker function for _extract_cqt_parallel"""
     line, cqt_dir = args
     wav_path = line["wav"]
@@ -168,18 +170,67 @@ def _extract_cqt_worker(args):
     return line
 
 
-def _extract_cqt_parallel(init_path, out_path, cqt_dir) -> None:
+def _extract_cqt_worker_torchaudio(args):
+    line, cqt_dir, device = args
+    wav_path = line["wav"]
+    feat_path = os.path.join(cqt_dir, "{}.cqt.npy".format(line["utt"]))
+
+    # CQT seems faster on mps, and CQT2010v2 faster on cuda
+    if device == "mps":
+        transform = CQT
+    elif device == "cuda":
+        transform = CQT2010v2
+
+    if not os.path.exists(feat_path):
+        signal, sr = torchaudio.load(wav_path)
+        signal = signal.to(device)
+        signal = (
+            signal
+            / torch.max(torch.tensor(0.001).to(device), torch.max(torch.abs(signal)))
+            * 0.999
+        )
+        signal = transform(16000, hop_length=640, n_bins=96, fmin=32, verbose=False).to(
+            device
+        )(signal)
+        signal = signal + 1e-9
+        signal = signal.squeeze(0)
+
+        # Add contrast
+        ref_value_log10 = torch.log10(torch.max(signal))
+        signal = 20 * torch.log10(signal) - 20 * ref_value_log10
+
+        signal = torch.swapaxes(signal, 0, 1)
+        cqt = signal.numpy(force=True)
+        np.save(feat_path, cqt)
+        feat_len = len(cqt)
+    else:
+        feat_len = len(np.load(feat_path))
+    line["feat"] = feat_path
+    line["feat_len"] = feat_len
+    return line
+
+
+def worker(args):
+    line, cqt_dir, device = args
+
+    if device in ("mps", "cuda"):
+        return _extract_cqt_worker_torchaudio(args)
+
+    return _extract_cqt_worker_librosa((line, cqt_dir))
+
+
+def _extract_cqt_parallel(init_path, out_path, cqt_dir, device) -> None:
     logging.info("Extract CQT features")
     os.makedirs(cqt_dir, exist_ok=True)
     dump_lines = []
 
     with ProcessPoolExecutor() as executor:
         worker_args = [
-            (line_to_dict(line), cqt_dir)
+            (line_to_dict(line), cqt_dir, device)
             for line in read_lines(init_path, log=False)
         ]
 
-        for result in executor.map(_extract_cqt_worker, worker_args):
+        for result in executor.map(worker, worker_args):
             dump_lines.append(dict_to_line(result))
             if len(dump_lines) % 1000 == 0:
                 logging.info(
@@ -576,7 +627,7 @@ def _generate_csi_features(hp, feat_dir, start_stage, end_stage) -> None:
         # Got it to run w/o errors but output wasn't quite right,
         # and speed was 7 min 28 s for covers80,
         # compared to 2 min 5 s with CPU-based _extract_cqt_parallel()
-        _extract_cqt_parallel(sp_aug_path, full_path, cqt_dir)
+        _extract_cqt_parallel(sp_aug_path, full_path, cqt_dir, hp["device"])
 
         # noise augmentation was default off for CoverHunter
         hp_noise = hp.get("add_noise", None)
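
Usage note (not part of this commit): after setting the new "device" key in hparams.yaml to "cuda" (or "mps" on Apple hardware), the same command shown in the commit message above takes the nnAudio path; setting it to "cpu" keeps the original librosa implementation.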
