ming024 · Daniel-Chin · Oct 18, 2023 · Oct 18, 2023 · Oct 18, 2023
diff --git a/synthesize.py b/synthesize.py
@@ -84,7 +84,10 @@ def preprocess_mandarin(text, preprocess_config):
     return np.array(sequence)
 
 
-def synthesize(model, step, configs, vocoder, batchs, control_values):
+def synthesize(
+    model, step, configs, vocoder, batchs, control_values, 
+    do_plot_spectrogram=True, 
+):
     preprocess_config, model_config, train_config = configs
     pitch_control, energy_control, duration_control = control_values
 
@@ -105,6 +108,7 @@ def synthesize(model, step, configs, vocoder, batchs, control_values):
                 model_config,
                 preprocess_config,
                 train_config["path"]["result_path"],
+                do_plot_spectrogram, 
             )
 
 

diff --git a/utils/model.py b/utils/model.py
@@ -7,6 +7,7 @@
 import hifigan
 from model import FastSpeech2, ScheduledOptim
 
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 def get_model(args, configs, device, train=False):
     (preprocess_config, model_config, train_config) = configs
@@ -17,7 +18,7 @@ def get_model(args, configs, device, train=False):
             train_config["path"]["ckpt_path"],
             "{}.pth.tar".format(args.restore_step),
         )
-        ckpt = torch.load(ckpt_path)
+        ckpt = torch.load(ckpt_path, map_location=device)
         model.load_state_dict(ckpt["model"])
 
     if train:
@@ -60,7 +61,7 @@ def get_vocoder(config, device):
         config = hifigan.AttrDict(config)
         vocoder = hifigan.Generator(config)
         if speaker == "LJSpeech":
-            ckpt = torch.load("hifigan/generator_LJSpeech.pth.tar")
+            ckpt = torch.load("hifigan/generator_LJSpeech.pth.tar", map_location=device)
         elif speaker == "universal":
             ckpt = torch.load("hifigan/generator_universal.pth.tar")
         vocoder.load_state_dict(ckpt["generator"])

diff --git a/utils/tools.py b/utils/tools.py
@@ -160,8 +160,14 @@ def synth_one_sample(targets, predictions, vocoder, model_config, preprocess_con
 
     return fig, wav_reconstruction, wav_prediction, basename
 
+def legalize_filename(filename: str):  # alphanumeric characters only, at most 30 of them
+    return ''.join(ch for ch in filename if ch.isalnum())[:30]
 
-def synth_samples(targets, predictions, vocoder, model_config, preprocess_config, path):
+def synth_samples(
+    targets, predictions, vocoder, model_config, 
+    preprocess_config, path, 
+    do_plot_spectrogram=True, 
+):
 
     basenames = targets[0]
     for i in range(len(predictions[0])):
@@ -187,15 +193,16 @@ def synth_samples(targets, predictions, vocoder, model_config, preprocess_config
             stats = json.load(f)
             stats = stats["pitch"] + stats["energy"][:2]
 
-        fig = plot_mel(
-            [
-                (mel_prediction.cpu().numpy(), pitch, energy),
-            ],
-            stats,
-            ["Synthetized Spectrogram"],
-        )
-        plt.savefig(os.path.join(path, "{}.png".format(basename)))
-        plt.close()
+        if do_plot_spectrogram:
+            fig = plot_mel(
+                [
+                    (mel_prediction.cpu().numpy(), pitch, energy),
+                ],
+                stats,
+                ["Synthetized Spectrogram"],
+            )
+            plt.savefig(os.path.join(path, "{}.png".format(legalize_filename(basename))))
+            plt.close()
 
     from .model import vocoder_infer
 
@@ -207,7 +214,8 @@ def synth_samples(targets, predictions, vocoder, model_config, preprocess_config
 
     sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"]
     for wav, basename in zip(wav_predictions, basenames):
-        wavfile.write(os.path.join(path, "{}.wav".format(basename)), sampling_rate, wav)
+        wavfile.write(os.path.join(
+            path, "{}.wav".format(legalize_filename(basename))), sampling_rate, wav)
 
 
 def plot_mel(data, stats, titles):