hparams.py
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
hparams = tf.contrib.training.HParams(
name = "Tacotron-2",
# Tacotron hyperparameters
cleaners = 'korean_cleaners', # 'korean_cleaners' or 'english_cleaners'
skip_path_filter = False, # Whether to filter unusable items out of the npz files. Needed because data shorter than the receptive_field must be filtered out.
use_lws = False,
# Audio
sample_rate = 24000,
# The shift can be specified by either hop_size (takes precedence) or frame_shift_ms
hop_size = 300, # frame_shift_ms = 12.5ms
fft_size=2048, # n_fft. Commonly 1024; Tacotron uses 2048.
win_size = 1200, # 50ms
num_mels=80,
#Spectrogram pre-emphasis (lfilter): reduces spectrogram noise and helps model certainty; also allows better G&L (Griffin-Lim) phase reconstruction
preemphasize = True, #whether to apply filter
preemphasis = 0.97,
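# Illustrative note (an assumption, following common implementations): the pre-emphasis
# filter is y[t] = x[t] - preemphasis * x[t-1], e.g. scipy.signal.lfilter([1, -0.97], [1], wav).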
min_level_db = -100,
ref_level_db = 20,
signal_normalization = True, #Whether to normalize mel spectrograms to some predefined range (following below parameters)
allow_clipping_in_normalization = True, #Only relevant if mel_normalization = True
symmetric_mels = True, #Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2, faster and cleaner convergence)
max_abs_value = 4., #max absolute value of data. If symmetric, data will be [-max, max] else [0, max] (Must not be too big to avoid gradient explosion, not too small for fast convergence)
rescaling=True,
rescaling_max=0.999,
trim_silence = True, #Whether to clip silence in Audio (at beginning and end of audio only, not the middle)
#M-AILABS (and other datasets) trim params (these parameters are usually correct for any data, but they definitely must be tuned for specific speakers)
trim_fft_size = 512,
trim_hop_size = 128,
trim_top_db = 23,
clip_mels_length = True, #For cases of OOM (Not really recommended, only use if facing unsolvable OOM errors, also consider clipping your samples to smaller chunks)
max_mel_frames = 1000, #Only relevant when clip_mels_length = True; please only use after trying outputs_per_step=3 and still getting OOM errors.
l2_regularization_strength = 0, # Coefficient in the L2 regularization.
sample_size = 9000, # Concatenate and cut audio samples to this many samples
silence_threshold = 0, # Volume threshold below which to trim the start and the end from the training set samples. e.g. 2
filter_width = 3,
gc_channels = 32, # Dimension of the global condition vector. Setting this tells the model to apply global conditioning (e.g., speaker identity).
input_type="raw", # One of 'mulaw-quantize', 'mulaw', 'raw'; 'mulaw' and 'raw' are the two scalar-input types
scalar_input = True, # Must be consistent with input_type.
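# Illustrative note (an assumption, standard mu-law companding): 'mulaw-quantize' maps
# x in [-1, 1] to sign(x) * log(1 + mu*|x|) / log(1 + mu) with mu = quantization_channels - 1,
# then discretizes into quantization_channels integer bins; 'raw' keeps float samples.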
dilations = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
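# Illustrative note (an assumption, the standard WaveNet receptive-field formula):
# receptive_field = (filter_width - 1) * sum(dilations) + filter_width
#                 = 2 * 2046 + 3 = 4095 samples (~0.17 s at 24 kHz).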
residual_channels = 128,
dilation_channels = 256,
quantization_channels = 256,
out_channels = 30, # Must be a multiple of 3 because discretized_mix_logistic_loss is used (weight, mean, log-scale per component; 30 = 3 * 10 mixtures)
skip_channels = 128,
use_biases = True,
upsample_type = 'SubPixel', # 'SubPixel', None
upsample_factor=[12,25], # np.prod(upsample_factor) must equal hop_size (12 * 25 = 300; checked in the sketch after the HParams block)
# wavenet training hp
wavenet_batch_size = 2, # 16 causes OOM. The WaveNet batch size must be fixed.
store_metadata = False,
num_steps = 1000000, # Number of training steps
#Learning rate schedule
wavenet_learning_rate = 1e-3, #wavenet initial learning rate
wavenet_decay_rate = 0.5, #Only used with 'exponential' scheme. Defines the decay rate.
wavenet_decay_steps = 300000, #Only used with 'exponential' scheme. Defines the decay steps.
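# Illustrative note (an assumption, standard exponential decay):
# lr = wavenet_learning_rate * wavenet_decay_rate ** (step / wavenet_decay_steps).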
#Regularization parameters
wavenet_clip_gradients = True, #Whether to clip the gradients during wavenet training.
# When summing the residual/skip outputs:
legacy = True, #Whether to use legacy mode: Multiply all skip outputs but the first one by sqrt(0.5) (True for more early training stability, especially for large models)
# Inside a residual block: x = (x + residual) * np.sqrt(0.5)
residual_legacy = True, #Whether to scale residual blocks outputs by a factor of sqrt(0.5) (True for input variance preservation early in training and better overall stability)
wavenet_dropout = 0.05,
optimizer = 'adam',
momentum = 0.9, # Momentum used by the sgd or rmsprop optimizer. Ignored by the adam optimizer.
max_checkpoints = 3, # Maximum number of checkpoints to keep.
####################################
####################################
####################################
# TACOTRON HYPERPARAMETERS
# Training
adam_beta1 = 0.9,
adam_beta2 = 0.999,
#Learning rate schedule
tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
tacotron_start_decay = 40000, #Step at which learning decay starts
tacotron_decay_steps = 18000, #Determines the learning rate decay slope (UNDER TEST)
tacotron_decay_rate = 0.5, #learning rate decay rate (UNDER TEST)
tacotron_initial_learning_rate = 1e-3, #starting learning rate
tacotron_final_learning_rate = 1e-4, #minimal learning rate
initial_data_greedy = True,
initial_phase_step = 8000, # Before this step, the same number of examples is drawn from each directory in data_dirs; afterwards, sampling follows the weight ratios, i.e., it is affected by 'main_data_greedy_factor' below.
main_data_greedy_factor = 0,
main_data = [''], # Data in these directories gets 'main_data_greedy_factor' extra weight.
prioritize_loss = False,
# Model
model_type = 'multi-speaker', # [single, multi-speaker]
speaker_embedding_size = 16,
embedding_size = 512, # Embedding dim for each input token (e.g., the jamo 'ᄀ', 'ᄂ', 'ᅡ')
dropout_prob = 0.5,
reduction_factor = 2, # A smaller reduction_factor needs more decoder iterations and therefore more memory.
# Encoder
enc_conv_num_layers = 3,
enc_conv_kernel_size = 5,
enc_conv_channels = 512,
tacotron_zoneout_rate = 0.1,
encoder_lstm_units = 256,
attention_type = 'bah_mon_norm', # 'loc_sen', 'bah_mon_norm'
attention_size = 128,
#Attention mechanism
smoothing = False, #Whether to smooth the attention normalization function
attention_dim = 128, #dimension of attention space
attention_filters = 32, #number of attention convolution filters
attention_kernel = (31, ), #kernel size of attention convolution
cumulative_weights = True, #Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True)
#Attention synthesis constraints
#"Monotonic" constraint forces the model to only look at the forwards attention_win_size steps.
#"Window" allows the model to look at attention_win_size neighbors, both forward and backward steps.
synthesis_constraint = False, #Whether to use attention windows constraints in synthesis only (Useful for long utterances synthesis)
synthesis_constraint_type = 'window', #can be in ('window', 'monotonic').
attention_win_size = 7, #Size of the window. The current step does not count. If the mode is 'window' and attention_win_size is odd, the extra step goes to the backward part of the window.
#Loss params
mask_encoder = True, #whether to mask encoder padding while computing location sensitive attention. Set to True for better prosody but slower convergence.
#Decoder
prenet_layers = [256, 256], #number of layers and number of units of prenet
decoder_layers = 2, #number of decoder lstm layers
decoder_lstm_units = 1024, #number of decoder lstm units on each layer
dec_prenet_sizes = [256, 256], #number of layers and number of units of prenet
#Residual postnet
postnet_num_layers = 5, #number of postnet convolutional layers
postnet_kernel_size = (5, ), #size of postnet convolution filters for each layer
postnet_channels = 512, #number of postnet convolution filters for each layer
# Post-network (CBHG) that predicts the linear spectrogram from the mel spectrogram
post_bank_size = 8,
post_bank_channel_size = 128,
post_maxpool_width = 2,
post_highway_depth = 4,
post_rnn_size = 128,
post_proj_sizes = [256, 80], # num_mels=80
post_proj_width = 3,
tacotron_reg_weight = 1e-6, #regularization weight (for L2 regularization)
inference_prenet_dropout = True,
# Eval
min_tokens = 30, # Originally 50; 30 works well for Korean. When the text is split into tokens, it must be at least this long to be used for training.
min_n_frame = 30*5, # min_n_frame = reduction_factor * min_iters; min_n_frame is set as a multiple of the reduction_factor.
max_n_frame = 200*5,
skip_inadequate = False,
griffin_lim_iters = 60,
power = 1.5,
)
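# A minimal sanity-check sketch (an added illustration, not part of the original file),
# assuming the relations stated in the comments above hold for the default values.
assert np.prod(hparams.upsample_factor) == hparams.hop_size, \
    'np.prod(upsample_factor) must equal hop_size (local conditioning upsamples mel frames to samples)'
assert hparams.win_size <= hparams.fft_size, 'the analysis window must fit inside the FFT'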
if hparams.use_lws:
# Does not work if fft_size is not multiple of hop_size!!
# sample_rate = 20480, hop_size = 256 = 12.5ms. fft_size determines the window size; 2048 samples correspond to 2048/20480 = 0.1 s = 100ms
hparams.sample_rate = 20480
# The shift can be specified by either hop_size (takes precedence) or frame_shift_ms
hparams.hop_size = 256 # frame_shift_ms = 12.5ms
hparams.frame_shift_ms=None # hop_size= sample_rate * frame_shift_ms / 1000
hparams.fft_size=2048 # Commonly 1024; Tacotron uses 2048 ==> output size = 1025 frequency bins
hparams.win_size = None # Derived by lws: 256 * 4 = 1024 samples --> 50ms
else:
# Derive these consistently from the parameters defined above.
hparams.num_freq = int(hparams.fft_size/2 + 1)
hparams.frame_shift_ms = hparams.hop_size * 1000.0/ hparams.sample_rate # hop_size= sample_rate * frame_shift_ms / 1000
hparams.frame_length_ms = hparams.win_size * 1000.0/ hparams.sample_rate
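# A hedged sketch (an added illustration) of how the normalization hparams above are
# typically combined, following Rayhane-mamah's Tacotron-2 audio code; the repo's own
# audio module is the authoritative implementation. S is a mel spectrogram in dB,
# already shifted by ref_level_db.
def _normalize_example(S):
    if hparams.symmetric_mels:
        out = (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
        if hparams.allow_clipping_in_normalization:
            out = np.clip(out, -hparams.max_abs_value, hparams.max_abs_value)  # values land in [-4, 4]
    else:
        out = hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
        if hparams.allow_clipping_in_normalization:
            out = np.clip(out, 0, hparams.max_abs_value)  # values land in [0, 4]
    return out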
def hparams_debug_string():
values = hparams.values()
hp = [' %s: %s' % (name, values[name]) for name in sorted(values)]
return 'Hyperparameters:\n' + '\n'.join(hp)
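# A hedged sketch (an added illustration) of the Tacotron learning-rate schedule the
# decay hparams above describe: exponential decay starting at tacotron_start_decay,
# floored at tacotron_final_learning_rate. The training code is authoritative.
def _tacotron_lr_example(global_step):
    decayed = hparams.tacotron_initial_learning_rate * hparams.tacotron_decay_rate ** (
        max(0, global_step - hparams.tacotron_start_decay) / hparams.tacotron_decay_steps)
    return max(decayed, hparams.tacotron_final_learning_rate)
if __name__ == '__main__':
    print(hparams_debug_string())       # dumps all hyperparameters in sorted order
    print(_tacotron_lr_example(100000)) # e.g., the floor 1e-4 by step 100k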