-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathlosses.py
101 lines (86 loc) · 5.59 KB
/
losses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import torch
import torch.nn.functional as F
from model import Audio2Mel
def total_loss(fmap_real, logits_fake, fmap_fake, input_wav, output_wav, sample_rate=24000):
    r"""Compute the individual generator loss terms of the EnCodec model.

    The terms are returned separately (not summed) so that the caller can
    combine them, e.g. with a loss balancer:

        Loss = \lambda_t * L_t + \lambda_f * L_f + \lambda_g * L_g + \lambda_feat * L_feat

    Weights noted alongside the original code:
    \lambda_t = 0.1 | \lambda_f = 1 | \lambda_g = 3 | \lambda_feat = 3

    Definitions, with K = len(fmap_real) sub-discriminators and
    L = len(fmap_real[0]) feature maps per sub-discriminator:

        L_t    = mean |x - \hat x|
        L_f    = \sum_{i=5}^{11} L1(mel_i(x), mel_i(\hat x)) + L2(mel_i(x), mel_i(\hat x))
        L_g    = (1 / K) \sum_k mean(max(0, 1 - D_k(\hat x)))
        L_feat = (1 / (K L)) \sum_{k,l} mean|D_k^l(x) - D_k^l(\hat x)| / mean|D_k^l(x)|

    Args:
        fmap_real (list): Feature maps of the discriminator for the real
            audio; fmap_real[k][l] is D_k^l(x).
        logits_fake (list): Output of every sub-discriminator for the
            generated audio; logits_fake[k] is D_k(\hat x). Expected to have
            the same length as fmap_real.
        fmap_fake (list): Feature maps of the discriminator for the generated
            audio; fmap_fake[k][l] is D_k^l(\hat x).
        input_wav (tensor): Ground-truth audio fed to the generator.
        output_wav (tensor): Audio produced by the generator.
        sample_rate (int, optional): Sampling rate passed to the mel
            transform. Defaults to 24000.

    Returns:
        dict: The four loss terms under the keys 'l_t', 'l_f', 'l_g' and
        'l_feat'. 'l_t' is the scalar returned by the L1 loss; the other
        three are accumulated on a one-element tensor.
    """
    device = input_wav.device
    relu = torch.nn.ReLU()
    l1_loss = torch.nn.L1Loss(reduction='mean')
    l2_loss = torch.nn.MSELoss(reduction='mean')

    # Accumulators for the terms that are built up over several scales /
    # discriminators. They live on the same device as the audio.
    l_f = torch.tensor([0.0], device=device, requires_grad=True)
    l_g = torch.tensor([0.0], device=device, requires_grad=True)
    l_feat = torch.tensor([0.0], device=device, requires_grad=True)

    # Time-domain loss: L1 distance between target and reconstructed audio.
    l_t = l1_loss(input_wav, output_wav)

    # Frequency-domain loss over several time scales: window length 2^i,
    # hop length 2^i / 4, for i in [5, 11]; L1 and L2 are combined.
    for exponent in range(5, 12):
        window = 2 ** exponent
        to_mel = Audio2Mel(
            n_fft=window,
            win_length=window,
            hop_length=window // 4,
            n_mel_channels=64,
            sampling_rate=sample_rate,
            device=device,
        )
        # Each spectrogram is computed once and shared by both distances
        # (assumes Audio2Mel is a deterministic transform - TODO confirm).
        mel_real = to_mel(input_wav)
        mel_fake = to_mel(output_wav)
        l_f = l_f + l1_loss(mel_real, mel_fake) + l2_loss(mel_real, mel_fake)

    # Adversarial (hinge) loss and relative feature-matching loss.
    num_discriminators = len(fmap_real)
    for k in range(num_discriminators):
        l_g = l_g + torch.mean(relu(1 - logits_fake[k]))
        for layer, real_feat in enumerate(fmap_real[k]):
            fake_feat = fmap_fake[k][layer]
            # NOTE(review): real_feat is not detached here, so gradients also
            # reach the graph that produced fmap_real - confirm this is intended.
            l_feat = l_feat + l1_loss(real_feat, fake_feat) / torch.mean(torch.abs(real_feat))

    # Normalise each sum exactly once: K * L feature maps, K discriminators.
    # (Dividing l_g both inside the loop and here would yield sum / K^2.)
    l_feat = l_feat / (num_discriminators * len(fmap_real[0]))
    l_g = l_g / num_discriminators

    return {
        'l_t': l_t,
        'l_f': l_f,
        'l_g': l_g,
        'l_feat': l_feat,
    }
def _hinge_disc_loss(logits_real, logits_fake, device):
    """Return the hinge discriminator loss averaged over one group of sub-discriminators."""
    relu = torch.nn.ReLU()
    loss = torch.tensor([0.0], device=device, requires_grad=True)
    for idx, real in enumerate(logits_real):
        loss = loss + torch.mean(relu(1 - real)) + torch.mean(relu(1 + logits_fake[idx]))
    return loss / len(logits_real)


def disc_loss(logits_real, logits_fake, logits_real_p=None, logits_fake_p=None, logits_real_s=None, logits_fake_s=None):
    r"""Compute the hinge loss of the discriminator.

    For one group of K sub-discriminators:

        l_d = (1 / K) \sum_k mean(max(0, 1 - D_k(x))) + mean(max(0, 1 + D_k(\hat x)))

    Args:
        logits_real (List[torch.Tensor]): Sub-discriminator outputs for the
            real audio, e.g. disc_model(input_wav)[0].
        logits_fake (List[torch.Tensor]): Sub-discriminator outputs for the
            generated audio, e.g. disc_model(model(input_wav)[0])[0].
        logits_real_p (List[torch.Tensor], optional): Real-audio outputs of a
            second discriminator group (presumably a multi-period
            discriminator - verify against the caller).
        logits_fake_p (List[torch.Tensor], optional): Generated-audio outputs
            matching logits_real_p.
        logits_real_s (List[torch.Tensor], optional): Real-audio outputs of a
            third discriminator group (presumably a multi-scale
            discriminator - verify against the caller).
        logits_fake_s (List[torch.Tensor], optional): Generated-audio outputs
            matching logits_real_s.

    Returns:
        torch.Tensor: One-element tensor. If logits_real_p is None this is
        the loss of the first group only (the "_s" arguments are ignored in
        that case). Otherwise it is the mean of the per-group losses over
        the groups that were supplied.
    """
    # All accumulators are created on the device of the first group's logits.
    device = logits_real[0].device

    loss_stft = _hinge_disc_loss(logits_real, logits_fake, device)
    if logits_real_p is None:
        return loss_stft

    total = loss_stft + _hinge_disc_loss(logits_real_p, logits_fake_p, device)
    num_groups = 2
    # The third group is optional: average only over what was provided
    # instead of failing on len(None).
    if logits_real_s is not None:
        total = total + _hinge_disc_loss(logits_real_s, logits_fake_s, device)
        num_groups = 3
    return total / num_groups