@@ -49,6 +49,12 @@ def preprocess(model, in_dir, out_dir, text, audio_filename, mel_filename,
49
49
model .make_generation_fast_ ()
50
50
51
51
mel_org = np .load (join (in_dir , mel_filename ))
52
+ # zero pad
53
+ b_pad = r # imitates initial state
54
+ e_pad = r - len (mel_org ) % r if len (mel_org ) % r > 0 else 0
55
+ mel_org = np .pad (mel_org , [(b_pad , e_pad ), (0 , 0 )],
56
+ mode = "constant" , constant_values = 0 )
57
+
52
58
mel = Variable (torch .from_numpy (mel_org )).unsqueeze (0 ).contiguous ()
53
59
54
60
# Downsample mel spectrogram
@@ -78,10 +84,10 @@ def preprocess(model, in_dir, out_dir, text, audio_filename, mel_filename,
78
84
frame_positions = frame_positions , speaker_ids = speaker_ids )
79
85
80
86
mel_output = mel_outputs [0 ].data .cpu ().numpy ()
81
-
82
87
# **Time resolution adjustment**
83
- # remove beginning audio used for first mel prediction
84
- wav = np .load (join (in_dir , audio_filename ))[hparams .hop_size * downsample_step :]
88
+ mel_output = mel_output [:- (b_pad + e_pad )]
89
+
90
+ wav = np .load (join (in_dir , audio_filename ))
85
91
assert len (wav ) % hparams .hop_size == 0
86
92
87
93
# Coarse upsample just for convenience
@@ -102,8 +108,6 @@ def preprocess(model, in_dir, out_dir, text, audio_filename, mel_filename,
102
108
timesteps = len (wav )
103
109
104
110
# save
105
- np .save (join (out_dir , audio_filename ), wav .astype (np .int16 ),
106
- allow_pickle = False )
107
111
np .save (join (out_dir , mel_filename ), mel_output .astype (np .float32 ),
108
112
allow_pickle = False )
109
113
0 commit comments