Commit 5154043

SincNet base working

vikigenius committed Apr 2, 2019
1 parent a8c8a63 commit 5154043
Showing 12 changed files with 276 additions and 116 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -91,4 +91,5 @@ target/

 # Special
-sync.sh
 
+casync.sh
+gsync.sh
2 changes: 2 additions & 0 deletions environment.yml
@@ -89,6 +89,7 @@ dependencies:
   - openblas=0.3.5=h9ac9557_1001
   - openh264=1.8.0=hdbcaa40_1000
   - openssl=1.1.1b=h7b6447c_1
+  - pandas=0.24.2=py37he6710b0_0
   - pandoc=2.2.3.2=0
   - pandocfilters=1.4.2=py37_1
   - parso=0.3.4=py37_0
@@ -112,6 +113,7 @@ dependencies:
   - python=3.7.2=h0371630_0
   - python-dateutil=2.8.0=py_0
   - pytorch=1.0.1=py3.7_cuda10.0.130_cudnn7.4.2_2
+  - pytz=2018.9=py37_0
   - pyyaml=3.13=py37h14c3975_0
   - pyzmq=18.0.0=py37he6710b0_0
   - qt=5.6.3=h8bf5577_3
7 changes: 7 additions & 0 deletions nsi.py
@@ -4,6 +4,9 @@
 import logging.config
 import click
 import collections
+import random
+import numpy as np
+import torch
 from src.utils.params import Params
 from src.features.build_features import featuregen
 from src.models.train_model import train
@@ -22,6 +25,10 @@ def main(ctx, config):
     with open('logging_config.yml') as fp:
         log_cfg = yaml.safe_load(fp)
     logging.config.dictConfig(log_cfg)
+    random.seed(1037)
+    np.random.seed(99999)
+    torch.manual_seed(1504)
+    torch.cuda.manual_seed(1610)
 
 
 main.add_command(featuregen)
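The seeding block above pins Python's, NumPy's, and PyTorch's RNGs so runs are repeatable. For stricter determinism on GPU, the cuDNN flags are usually pinned as well; a minimal sketch under that assumption (the set_seed helper is illustrative, not code from this repo):

import random
import numpy as np
import torch

def set_seed(seed: int):
    # Seed every RNG the training loop touches
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade speed for reproducibility in cuDNN's convolution search
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False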
36 changes: 22 additions & 14 deletions settings.yml
@@ -13,23 +13,20 @@ defaults:
   num_classes: 1251
   num_workers: 4
 hparams:
-  window_size: 25
-  window_shift: 10
+  win_size: 25
+  hop_len: 10
+  window_type: 'hann'
   num_workers: 1
-  sample_freq: 16000
-  l2_coeff: 0.0001
+  sf: 16000
   num_classes: 1251
-  batch_size: 128
-  learning_rate: 0.001
+  batch_size: 64
   epochs: 100
   adam_eps: 0.001
   sched_decay: 0.63095
   val_step: 100000
   val_start: 99999
   preprocess: True
-  duration: 1.0
-  overlap: 0.05
+  duration: 3.0
+  overlap: 0.5
 cnn:
   sf: 16000
   input_dim: 16000
@@ -53,8 +50,19 @@
   act_funs: ['relu', 'softmax']
   drop_probs: [0.0, 0.0, 0.0]
 optimizer:
-  name: 'rmsprop'
-  params:
-    learning_rate: 0.001
-    alpha: 0.95
-    weight_decay: 0.0
+  rmsprop:
+    params:
+      learning_rate: 0.001
+      alpha: 0.95
+      weight_decay: 0.0
+  sgd:
+    params:
+      init_lr: 1.0e-2
+      fin_lr: 1.0e-8
+      decay_schedule: 'logarithmic'
+      last_epoch: 30
+      weight_decay: 5.0e-4
+  adam:
+    params:
+      learning_rate: 0.001
+      weight_decay: 5.0e-4
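The reworked optimizer block keys each optimizer by name, so a trainer can pick one sub-block and dispatch on it. A sketch of how such a block might be consumed (build_optimizer and the name-to-class mapping are assumptions for illustration, not code from this commit; note the config's learning_rate has to be mapped onto torch.optim's lr argument):

import torch

def build_optimizer(name, cfg, model_params):
    # Instantiate a torch.optim optimizer from a settings.yml sub-block
    if name == 'rmsprop':
        return torch.optim.RMSprop(model_params, lr=cfg['learning_rate'],
                                   alpha=cfg['alpha'],
                                   weight_decay=cfg['weight_decay'])
    if name == 'adam':
        return torch.optim.Adam(model_params, lr=cfg['learning_rate'],
                                weight_decay=cfg['weight_decay'])
    if name == 'sgd':
        # init_lr is the starting rate; the logarithmic decay toward
        # fin_lr would live in a separate LR scheduler.
        return torch.optim.SGD(model_params, lr=cfg['init_lr'],
                               weight_decay=cfg['weight_decay'])
    raise ValueError(f'Unknown optimizer: {name}')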
22 changes: 16 additions & 6 deletions src/data/dataset.py
@@ -14,20 +14,30 @@ class SInfo:
     path: str = ''
 
 
-class Spectrogram(Dataset):
-    def __init__(self, map_file: str):
+class CelebSpeech(Dataset):
+    def __init__(self, map_file: str, tdur=None):
+        """
+        Raw speech dataset.
+        Args:
+            map_file: path to a pickled list of SInfo records
+            tdur: duration in seconds of the random chunk drawn
+                from each utterance; the full signal is loaded when None
+        """
         with open(map_file, 'rb') as f:
             self.spec_list = pickle.load(f)
+        self.tdur = tdur
+        self.processor = ProcessedRaw(16000.0, preprocess=False)
 
     def __getitem__(self, idx):
         sinfo = self.spec_list[idx]
-        sgram = np.load(sinfo.path)
-        sgram -= np.mean(sgram, 1, keepdims=True)
-        sgram /= np.std(sgram, 1, keepdims=True)
+        path = sinfo.path
+        if self.tdur:
+            raw = self.processor.load_sample(path, self.tdur)
+        else:
+            raw = self.processor.load(path)
         return {
             'cid': sinfo.cid,
             'gid': sinfo.gid,
-            'sgram': sgram
+            'raw': raw,
         }
 
     def __len__(self):
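A usage sketch for the reworked dataset (the map-file path and batch size are illustrative; each item's 'raw' field is a NumPy array that the default collate function stacks into a batch tensor):

from torch.utils.data import DataLoader

ds = CelebSpeech('data/interim/train_map.pkl', tdur=3.0)  # hypothetical path
loader = DataLoader(ds, batch_size=64, shuffle=True, num_workers=4)
batch = next(iter(loader))
# At 16 kHz and tdur=3.0 each chunk is 3.0 * 16000 = 48000 samples,
# so batch['raw'] has shape (64, 48000) and batch['cid'] holds the labels.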
52 changes: 39 additions & 13 deletions src/features/raw.py
@@ -1,16 +1,34 @@
 #!/usr/bin/env python
 import librosa
+import scipy.signal
 import numpy as np
 
 
 class ProcessedRaw(object):
-    def __init__(self, sf, cwlen, cwshift=None, max_chunks=None):
+    def __init__(self, sf, preprocess=True):
         self.sf = sf
-        self.cwlen = cwlen
-        self.cwshift = cwshift
-        self.max_chunks = max_chunks
+        self.do_process = preprocess
+        if self.sf == 16000:
+            self.dc_alpha = 0.99
+        elif self.sf == 8000:
+            self.dc_alpha = 0.999
+        else:
+            raise ValueError('Only 16 kHz and 8 kHz are supported')
+        self.pe_alpha = 0.97
 
+    def _preprocess(self, signal):
+        # Remove the DC component and add a small dither
+        signal = scipy.signal.lfilter([1, -1], [1, -self.dc_alpha], signal)
+        dither = np.random.random_sample(
+            signal.shape) + np.random.random_sample(
+                signal.shape) - 1
+        spow = np.std(signal)
+        signal = signal + 1e-6*spow*dither
+        # Pre-emphasis: z[n] = y[n] - pe_alpha * y[n-1]
+        signal = scipy.signal.lfilter([1, -self.pe_alpha], 1, signal)
+        return signal
 
-    def _get_chunks(self, signal):
+    def _get_chunks(self, signal, cwlen, cwshift, max_chunks):
-        wlen = int(self.cwlen*self.sf)
+        wlen = int(cwlen*self.sf)
         slen = signal.shape[0]
-        wshift = int(self.cwshift*self.sf)
+        wshift = int(cwshift*self.sf)
@@ -22,29 +40,37 @@
-        sig_arr = np.zeros((self.max_chunks, wlen))
+        sig_arr = np.zeros((max_chunks, wlen))
         count_fr = 0
 
-        while end_samp < slen and count_fr < self.max_chunks:
+        while end_samp < slen and count_fr < max_chunks:
             sig_arr[count_fr, :] = signal[beg_samp:end_samp]
             beg_samp = beg_samp+wshift
             end_samp = beg_samp+wlen
             count_fr += 1
 
         return sig_arr[:count_fr, :]
 
-    def _get_sample(self, signal):
+    def _get_sample(self, signal, cwlen):
         # Get a random chunk
-        wlen = int(self.cwlen*self.sf)
+        wlen = int(cwlen*self.sf)
         slen = signal.shape[0]
         offs = np.random.randint(slen - wlen)
         raw_sample = signal[offs:offs+wlen]
         return raw_sample
 
     def load(self, path: str):
         signal, _ = librosa.load(path, sr=self.sf)
         return signal
 
+    def load_sample(self, path: str, cwlen, normalize=False):
+        signal = self.load(path)
+        signal = self._get_sample(signal, cwlen)
+        if normalize:
+            # Peak-normalize the chunk
+            signal /= np.max(np.abs(signal))
+        if self.do_process:
+            signal = self._preprocess(signal)
+        return signal
 
-        if self.cwshift:
-            return self._get_chunks(signal)
-        else:
-            return self._get_sample(signal)
+    def load_chunks(self, path: str, cwlen, cwshift,
+                    max_chunks, normalize=False):
+        signal = self.load(path)
+        return self._get_chunks(signal, cwlen, cwshift, max_chunks)
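The two lfilter calls implement a DC-blocking filter, y[n] = x[n] - x[n-1] + dc_alpha * y[n-1], followed by pre-emphasis, z[n] = y[n] - pe_alpha * y[n-1]. A quick sanity check on a toy signal (values are illustrative):

import numpy as np
import scipy.signal

sf = 16000
t = np.arange(sf) / sf
x = 0.5 + 0.1 * np.sin(2 * np.pi * 100 * t)  # 100 Hz tone on a DC offset

y = scipy.signal.lfilter([1, -1], [1, -0.99], x)  # DC removal (16 kHz alpha)
z = scipy.signal.lfilter([1, -0.97], 1, y)        # pre-emphasis
print(abs(np.mean(y[sf // 2:])))  # ~0: the offset is gone in steady state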
21 changes: 21 additions & 0 deletions src/features/speech.py
@@ -0,0 +1,21 @@
#!/usr/bin/env python
import librosa
import numpy as np


class SpeechFeatures(object):
    def __init__(self, cwlen, cwshift, sf):
        self.cwlen = cwlen
        self.cwshift = cwshift
        self.sf = sf

    def _preprocess(self):
        pass

    def _load(self, afile):
        signal, _ = librosa.load(afile, sr=self.sf)
        return signal

    def load_raw(self, preprocess=True):
        pass

    def load_sgram(self, preprocess=False):
        pass
26 changes: 0 additions & 26 deletions src/models/resnet.py

This file was deleted.

10 changes: 8 additions & 2 deletions src/models/resnet_base.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import torch.nn as nn
+import torch.nn.functional as F
 import torch.utils.model_zoo as model_zoo


@@ -119,7 +120,6 @@ def __init__(self, block, layers, num_classes=1000,
         self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
         self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
         self.fc1 = nn.Linear(9, 1)
-        self.avgpool = nn.AvgPool2d((1, 10), stride=1)
         self.fc2 = nn.Linear(512 * block.expansion, num_classes)
 
         for m in self.modules():
@@ -172,7 +172,13 @@ def forward(self, x):
         x = x.permute(0, 1, 3, 2)
         x = self.fc1(x)
         x = x.permute(0, 1, 3, 2)
-        x = self.avgpool(x)
+
+        # Average-pool across the full (variable) time width
+        width = x.size(3)
+        x = F.avg_pool2d(x, (1, width))
         x = x.view(x.size(0), -1)
         x = self.fc2(x)

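Swapping the fixed AvgPool2d((1, 10)) for F.avg_pool2d over the runtime width lets the classifier head accept spectrograms of any length. A small check of the idea (shapes are illustrative); nn.AdaptiveAvgPool2d((1, 1)) is the built-in equivalent:

import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.randn(8, 512, 1, 13)          # (batch, channels, 1, time)
a = F.avg_pool2d(x, (1, x.size(3)))     # pool over whatever width arrives
b = nn.AdaptiveAvgPool2d((1, 1))(x)     # built-in equivalent
print(torch.allclose(a, b))             # True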
52 changes: 52 additions & 0 deletions src/models/specnet.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python
from torch import nn
import torch
from src.models.resnet_base import resnet50
from src.models.ops import CELoss
from src.utils.math_utils import nextpow2
from src.utils import torch_utils


class SpecNet(nn.Module):
    def __init__(self, num_classes, sf, win_size, hop_len,
                 window=torch.hamming_window):
        super().__init__()
        self.num_classes = num_classes
        self.base = resnet50(num_classes=self.num_classes)
        self.criterion = nn.CrossEntropyLoss()
        self.loss_obj = CELoss
        self.sf = sf
        # win_size and hop_len are given in milliseconds
        self.win_length = round(1e-3*win_size*self.sf)
        self.hop_length = round(1e-3*hop_len*self.sf)
        self.n_fft = 2**nextpow2(self.win_length)
        self.hop_len = hop_len

        self.window = window(self.win_length, device=torch_utils.device)

    def spectrogram(self, signal: torch.Tensor):
        window = self.window
        spec = torch.stft(signal, self.n_fft, hop_length=self.hop_length,
                          win_length=self.win_length, window=window)
        mag_spec = spec.pow(2).sum(-1)  # power spectrogram (re^2 + im^2)
        if mag_spec.size(1) != 257:  # Debug
            raise RuntimeError(
                f'Expected SPEC size 257, got {mag_spec.size(1)}')
        # Normalize each frequency bin across time
        spec_mean = mag_spec.mean(2, keepdim=True)
        spec_std = mag_spec.std(2, keepdim=True)
        mag_spec -= spec_mean
        mag_spec /= spec_std
        return mag_spec.to(torch.float)

    def forward(self, batch):
        signal = batch['raw']
        spec = self.spectrogram(signal).unsqueeze(1)
        return self.base(spec)

    def loss(self, model_outs, batch):
        # Gender ID for the 2-class task, celebrity ID otherwise
        if self.num_classes == 2:
            target = batch['gid']
        else:
            target = batch['cid']
        loss = self.criterion(model_outs, target)
        metric = CELoss(loss, 1)
        return metric, loss
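With the defaults in settings.yml (sf: 16000, win_size: 25 ms, hop_len: 10 ms), win_length is 400 samples and n_fft rounds up to 512, so the one-sided STFT produces n_fft // 2 + 1 = 257 frequency bins, which is exactly what the debug check asserts. A standalone sketch, assuming the torch 1.0-era stft API used by this commit (no return_complex; the last dimension holds real and imaginary parts):

import torch

sf = 16000
win_length = round(1e-3 * 25 * sf)  # 400 samples for a 25 ms window
hop_length = round(1e-3 * 10 * sf)  # 160 samples for a 10 ms hop
n_fft = 512                         # 2**nextpow2(400)

signal = torch.randn(4, 3 * sf)     # a batch of 3-second chunks
spec = torch.stft(signal, n_fft, hop_length=hop_length,
                  win_length=win_length,
                  window=torch.hamming_window(win_length))
print(spec.shape)  # (4, 257, n_frames, 2): 257 = n_fft // 2 + 1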