forked from seorim0/DCCRN-with-various-loss-functions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrainer.py
190 lines (159 loc) · 7.95 KB
/
trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
"""
Interface for train
"""
import os
import time
import torch
import shutil
import numpy as np
import config as cfg
from run import model_train, model_validate, model_test
from dataloader import create_dataloader, create_dataloader_for_test
from model import DCCRN, DCUNET, DCCRN_direct, DCCRN_no_skip
from write_on_tensorboard import Writer
###############################################################################
# Helper function definition #
###############################################################################
# Write training related parameters into the log file.
def write_status_to_log_file(fp, total_parameters):
fp.write('adsfasdfsdfds')
fp.write('%d-%d-%d %d:%d:%d\n' %
(time.localtime().tm_year, time.localtime().tm_mon,
time.localtime().tm_mday, time.localtime().tm_hour,
time.localtime().tm_min, time.localtime().tm_sec))
fp.write('mode : %s_%s\n' % (cfg.mode, cfg.info))
fp.write('learning rate : %g\n' % cfg.learning_rate)
fp.write('total params : %d (%.2f M, %.2f MBytes)\n' %
(total_parameters,
total_parameters / 1000000.0,
total_parameters * 4.0 / 1000000.0))
# Calculate the size of total network.
def calculate_total_params(our_model):
total_parameters = 0
for variable in our_model.parameters():
shape = variable.size()
variable_parameters = 1
for dim in shape:
variable_parameters *= dim
total_parameters += variable_parameters
return total_parameters
###############################################################################
# Parameter Initialization #
###############################################################################
print('***********************************************************')
print('* Python library for DNN-based speech enhancement *')
print('* using Pytorch API *')
print('***********************************************************')
# Set device
DEVICE = torch.device("cuda")
# Set model
if cfg.mode == 'DCCRN':
model = DCCRN(rnn_units=cfg.rnn_units, masking_mode=cfg.masking_mode, use_clstm=cfg.use_clstm,
kernel_num=cfg.kernel_num).to(DEVICE)
elif cfg.mode == 'DCUNET':
model = DCUNET(masking_mode=cfg.masking_mode, kernel_num=cfg.kernel_num).to(DEVICE)
elif cfg.mode == 'DCCRN_direct':
model = DCCRN_direct(rnn_units=cfg.rnn_units, use_clstm=cfg.use_clstm, kernel_num=cfg.kernel_num).to(DEVICE)
###############################################################################
# Set optimizer and learning rate #
###############################################################################
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)
total_params = calculate_total_params(model)
###############################################################################
# Confirm model information #
###############################################################################
print('%d-%d-%d %d:%d:%d\n' %
(time.localtime().tm_year, time.localtime().tm_mon,
time.localtime().tm_mday, time.localtime().tm_hour,
time.localtime().tm_min, time.localtime().tm_sec))
print('mode : %s_%s\n' % (cfg.mode, cfg.info))
print('learning rate : %g\n' % cfg.learning_rate)
print('total params : %d (%.2f M, %.2f MBytes)\n' %
(total_params,
total_params / 1000000.0,
total_params * 4.0 / 1000000.0))
###############################################################################
# Create Dataloader #
###############################################################################
# Set device
DEVICE = torch.device("cuda")
train_loader = create_dataloader(mode='train')
validation_loader = create_dataloader(mode='valid')
###############################################################################
# Set a log file to store progress. #
# Set a hps file to store hyper-parameters information. #
###############################################################################
# Load the checkpoint
if cfg.chkpt_path is not None:
print('Resuming from checkpoint: %s' % cfg.chkpt_path)
# Set a log file to store progress.
dir_to_save = cfg.job_dir + cfg.chkpt_model
dir_to_logs = cfg.logs_dir + cfg.chkpt_model
checkpoint = torch.load(cfg.chkpt_path)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
epoch_start_idx = checkpoint['epoch'] + 1
mse_vali_total = np.load(str(dir_to_save + '/mse_vali_total.npy'))
if len(mse_vali_total) < cfg.max_epochs:
plus = cfg.max_epochs - len(mse_vali_total)
mse_vali_total = np.concatenate((mse_vali_total, np.zeros(plus)), 0)
else:
print('Starting new training run')
epoch_start_idx = 1
mse_vali_total = np.zeros(cfg.max_epochs)
# Set a log file to store progress.
dir_to_save = str(cfg.job_dir) + '%d.%d' % (time.localtime().tm_mon, time.localtime().tm_mday) \
+ '_%s' % cfg.mode + '_%s' % cfg.info
dir_to_logs = str(cfg.logs_dir) + '%d.%d' % (time.localtime().tm_mon, time.localtime().tm_mday) \
+ '_%s' % cfg.mode + '_%s' % cfg.info
if not os.path.exists(dir_to_save):
os.mkdir(dir_to_save)
os.mkdir(dir_to_logs)
log_fname = str(dir_to_save + '/log.txt')
if not os.path.exists(log_fname):
fp = open(log_fname, 'w')
write_status_to_log_file(fp, total_params)
else:
fp = open(log_fname, 'a')
# Set a hps file to store hyper-parameters information.
hps_fname = str(dir_to_save + '/hp_str.txt')
fp_h = open(hps_fname, 'w')
with open('config.py', 'r') as f:
hp_str = ''.join(f.readlines())
fp_h.write(hp_str)
fp_h.close()
###############################################################################
###############################################################################
# Main program start !! #
###############################################################################
###############################################################################
# Writer initialize
writer = Writer(dir_to_logs)
###############################################################################
# Train #
###############################################################################
for epoch in range(epoch_start_idx, cfg.max_epochs + 1):
start_time = time.time()
train_loss = model_train(model, optimizer, train_loader, epoch, DEVICE)
vali_loss, vali_pesq, vali_stoi = model_validate(model, validation_loader,
dir_to_save, writer, epoch, DEVICE)
mse_vali_total[epoch - 1] = vali_loss
np.save(str(dir_to_save + '/mse_vali_total.npy'), mse_vali_total)
# write the loss on tensorboard
writer.log_loss(train_loss, vali_loss, epoch)
# save checkpoint file to resume training
save_path = str(dir_to_save + '/' + ('chkpt_%d.pt' % epoch))
torch.save({
'model': model.state_dict(),
'optimizer': optimizer.state_dict(),
'epoch': epoch
}, save_path)
print('Epoch [{}] | {:.6f} | {:.6} | {:.6} | {:.6} takes {:.2f} seconds'
.format(epoch, train_loss, vali_loss, vali_pesq, vali_stoi, time.time() - start_time))
fp.write('Epoch [{}] | {:.6f} | {:.6f} | {:.6f} | {:.6f} takes {:.2f} seconds\n'
.format(epoch, train_loss, vali_loss, vali_pesq, vali_stoi, time.time() - start_time))
print('Training has been finished.')
# Copy optimum model that has minimum MSE.
print('Save optimum models...')
min_index = np.argmin(mse_vali_total)
print('Minimum validation loss is at '+str(min_index+1)+'.')