diff --git a/crates/voicevox_core/src/engine/model.rs b/crates/voicevox_core/src/engine/model.rs
index 4c7d5c8f8..3a46b14d4 100644
--- a/crates/voicevox_core/src/engine/model.rs
+++ b/crates/voicevox_core/src/engine/model.rs
@@ -84,10 +84,10 @@ pub struct AudioQuery {
     pub pause_length_scale: (),
     /// \[読み取り専用\] AquesTalk風記法。
     ///
-    /// [`Synthesizer::audio_query`]が返すもののみ`Some`となる。入力としてのAudioQueryでは無視され
+    /// [`Synthesizer::create_audio_query`]が返すもののみ`Some`となる。入力としてのAudioQueryでは無視され
    /// る。
     ///
-    /// [`Synthesizer::audio_query`]: crate::blocking::Synthesizer::audio_query
+    /// [`Synthesizer::create_audio_query`]: crate::blocking::Synthesizer::create_audio_query
     pub kana: Option<String>,
 }

diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_models.py b/crates/voicevox_core_python_api/python/voicevox_core/_models.py
index 9af47148a..dc32558fb 100644
--- a/crates/voicevox_core_python_api/python/voicevox_core/_models.py
+++ b/crates/voicevox_core_python_api/python/voicevox_core/_models.py
@@ -218,8 +218,8 @@ class AudioQuery:
     """
     [読み取り専用] AquesTalk風記法。

-    :func:`Synthesizer.audio_query` が返すもののみ ``str`` となる。入力としてのAudioQueryでは無視さ
-    れる。
+    :func:`Synthesizer.create_audio_query` が返すもののみ ``str`` となる。入力として
+    のAudioQueryでは無視される。
     """

diff --git a/docs/guide/user/usage.md b/docs/guide/user/usage.md
index 26ed50810..5038b31b4 100644
--- a/docs/guide/user/usage.md
+++ b/docs/guide/user/usage.md
@@ -120,7 +120,7 @@ with VoiceModelFile.open("model/0.vvm") as model:
 ```python
 text = "サンプル音声です"
 style_id = 0
-audio_query = synthesizer.audio_query(text, style_id)
+audio_query = synthesizer.create_audio_query(text, style_id)
 pprint(audio_query)
 ```

diff --git a/example/python/run-asyncio.py b/example/python/run-asyncio.py
index 1880f81ae..0d7460f90 100644
--- a/example/python/run-asyncio.py
+++ b/example/python/run-asyncio.py
@@ -6,12 +6,76 @@
+import dataclasses
 import logging
 from argparse import ArgumentParser
 from pathlib import Path
-from typing import Tuple

 from voicevox_core import AccelerationMode, AudioQuery
 from voicevox_core.asyncio import Onnxruntime, OpenJtalk, Synthesizer, VoiceModelFile


+@dataclasses.dataclass
+class Args:
+    mode: AccelerationMode
+    vvm: Path
+    onnxruntime: str
+    dict_dir: Path
+    text: str
+    out: Path
+    style_id: int
+
+    @staticmethod
+    def parse_args() -> "Args":
+        argparser = ArgumentParser()
+        argparser.add_argument(
+            "--mode",
+            default="AUTO",
+            type=AccelerationMode,
+            help='モード ("AUTO", "CPU", "GPU")',
+        )
+        argparser.add_argument(
+            "vvm",
+            type=Path,
+            help="vvmファイルへのパス",
+        )
+        argparser.add_argument(
+            "--onnxruntime",
+            default=Onnxruntime.LIB_VERSIONED_FILENAME,
+            help="ONNX Runtimeのライブラリのfilename",
+        )
+        argparser.add_argument(
+            "--dict-dir",
+            default="./open_jtalk_dic_utf_8-1.11",
+            type=Path,
+            help="Open JTalkの辞書ディレクトリ",
+        )
+        argparser.add_argument(
+            "--text",
+            default="この音声は、ボイスボックスを使用して、出力されています。",
+            help="読み上げさせたい文章",
+        )
+        argparser.add_argument(
+            "--out",
+            default="./output.wav",
+            type=Path,
+            help="出力wavファイルのパス",
+        )
+        argparser.add_argument(
+            "--style-id",
+            default=0,
+            type=int,
+            help="話者IDを指定",
+        )
+        args = argparser.parse_args()
+        return Args(
+            args.mode,
+            args.vvm,
+            args.onnxruntime,
+            args.dict_dir,
+            args.text,
+            args.out,
+            args.style_id,
+        )
+
+
 async def main() -> None:
     logging.basicConfig(format="[%(levelname)s] %(name)s: %(message)s")
     logger = logging.getLogger(__name__)
@@ -19,97 +82,33 @@ async def main() -> None:
     logging.getLogger("voicevox_core_python_api").setLevel("DEBUG")
     logging.getLogger("voicevox_core").setLevel("DEBUG")

-    (
-        acceleration_mode,
-        vvm_path,
-        onnxruntime_filename,
-        open_jtalk_dict_dir,
-        text,
-        out,
-        style_id,
-    ) = parse_args()
+    args = Args.parse_args()

-    logger.info("%s", f"Loading ONNX Runtime ({onnxruntime_filename=})")
-    onnxruntime = await Onnxruntime.load_once(filename=onnxruntime_filename)
+    logger.info("%s", f"Loading ONNX Runtime ({args.onnxruntime=})")
+    onnxruntime = await Onnxruntime.load_once(filename=args.onnxruntime)
     logger.debug("%s", f"{onnxruntime.supported_devices()=}")

-    logger.info("%s", f"Initializing ({acceleration_mode=}, {open_jtalk_dict_dir=})")
+    logger.info("%s", f"Initializing ({args.mode=}, {args.dict_dir=})")
     synthesizer = Synthesizer(
-        onnxruntime,
-        await OpenJtalk.new(open_jtalk_dict_dir),
-        acceleration_mode=acceleration_mode,
+        onnxruntime, await OpenJtalk.new(args.dict_dir), acceleration_mode=args.mode
     )
     logger.debug("%s", f"{synthesizer.metas=}")
     logger.debug("%s", f"{synthesizer.is_gpu_mode=}")

-    logger.info("%s", f"Loading `{vvm_path}`")
-    async with await VoiceModelFile.open(vvm_path) as model:
+    logger.info("%s", f"Loading `{args.vvm}`")
+    async with await VoiceModelFile.open(args.vvm) as model:
         await synthesizer.load_voice_model(model)

-    logger.info("%s", f"Creating an AudioQuery from {text!r}")
-    audio_query = await synthesizer.create_audio_query(text, style_id)
+    logger.info("%s", f"Creating an AudioQuery from {args.text!r}")
+    audio_query = await synthesizer.create_audio_query(args.text, args.style_id)

     logger.info("%s", f"Synthesizing with {display_as_json(audio_query)}")
-    wav = await synthesizer.synthesis(audio_query, style_id)
-
-    out.write_bytes(wav)
-    logger.info("%s", f"Wrote `{out}`")
+    wav = await synthesizer.synthesis(audio_query, args.style_id)

-
-def parse_args() -> Tuple[AccelerationMode, Path, str, Path, str, Path, int]:
-    argparser = ArgumentParser()
-    argparser.add_argument(
-        "--mode",
-        default="AUTO",
-        type=AccelerationMode,
-        help='モード ("AUTO", "CPU", "GPU")',
-    )
-    argparser.add_argument(
-        "vvm",
-        type=Path,
-        help="vvmファイルへのパス",
-    )
-    argparser.add_argument(
-        "--onnxruntime",
-        default=Onnxruntime.LIB_VERSIONED_FILENAME,
-        help="ONNX Runtimeのライブラリのfilename",
-    )
-    argparser.add_argument(
-        "--dict-dir",
-        default="./open_jtalk_dic_utf_8-1.11",
-        type=Path,
-        help="Open JTalkの辞書ディレクトリ",
-    )
-    argparser.add_argument(
-        "--text",
-        default="この音声は、ボイスボックスを使用して、出力されています。",
-        help="読み上げさせたい文章",
-    )
-    argparser.add_argument(
-        "--out",
-        default="./output.wav",
-        type=Path,
-        help="出力wavファイルのパス",
-    )
-    argparser.add_argument(
-        "--style-id",
-        default=0,
-        type=int,
-        help="話者IDを指定",
-    )
-    args = argparser.parse_args()
-    # FIXME: 流石に多くなってきたので、`dataclass`化する
-    return (
-        args.mode,
-        args.vvm,
-        args.onnxruntime,
-        args.dict_dir,
-        args.text,
-        args.out,
-        args.style_id,
-    )
+    args.out.write_bytes(wav)
+    logger.info("%s", f"Wrote `{args.out}`")


 def display_as_json(audio_query: AudioQuery) -> str:

diff --git a/example/python/run.py b/example/python/run.py
index f5967daf7..4a22e709c 100644
--- a/example/python/run.py
+++ b/example/python/run.py
@@ -3,12 +3,83 @@
+import dataclasses
 import logging
 from argparse import ArgumentParser
 from pathlib import Path
-from typing import Tuple

 from voicevox_core import AccelerationMode, AudioQuery, wav_from_s16le
 from voicevox_core.blocking import Onnxruntime, OpenJtalk, Synthesizer, VoiceModelFile


+@dataclasses.dataclass
+class Args:
+    mode: AccelerationMode
+    vvm: Path
+    onnxruntime: str
+    dict_dir: Path
+    text: str
+    out: Path
+    style_id: int
+    streaming: bool
+
+    @staticmethod
+    def parse_args() -> "Args":
+        argparser = ArgumentParser()
+        argparser.add_argument(
+            "--mode",
+            default="AUTO",
+            type=AccelerationMode,
+            help='モード ("AUTO", "CPU", "GPU")',
+        )
+        argparser.add_argument(
+            "vvm",
+            type=Path,
+            help="vvmファイルへのパス",
+        )
+        argparser.add_argument(
+            "--onnxruntime",
+            default=Onnxruntime.LIB_VERSIONED_FILENAME,
+            help="ONNX Runtimeのライブラリのfilename",
+        )
+        argparser.add_argument(
+            "--dict-dir",
+            default="./open_jtalk_dic_utf_8-1.11",
+            type=Path,
+            help="Open JTalkの辞書ディレクトリ",
+        )
+        argparser.add_argument(
+            "--text",
+            default="この音声は、ボイスボックスを使用して、出力されています。",
+            help="読み上げさせたい文章",
+        )
+        argparser.add_argument(
+            "--out",
+            default="./output.wav",
+            type=Path,
+            help="出力wavファイルのパス",
+        )
+        argparser.add_argument(
+            "--style-id",
+            default=0,
+            type=int,
+            help="話者IDを指定",
+        )
+        argparser.add_argument(
+            "--streaming",
+            action="store_true",
+            help="ストリーミング生成",
+        )
+        args = argparser.parse_args()
+        return Args(
+            args.mode,
+            args.vvm,
+            args.onnxruntime,
+            args.dict_dir,
+            args.text,
+            args.out,
+            args.style_id,
+            args.streaming,
+        )
+
+
 def main() -> None:
     logging.basicConfig(format="[%(levelname)s] %(name)s: %(message)s")
     logger = logging.getLogger(__name__)
@@ -16,44 +86,33 @@ def main() -> None:
     logging.getLogger("voicevox_core_python_api").setLevel("DEBUG")
     logging.getLogger("voicevox_core").setLevel("DEBUG")

-    (
-        acceleration_mode,
-        vvm_path,
-        onnxruntime_filename,
-        open_jtalk_dict_dir,
-        text,
-        out,
-        style_id,
-        streaming,
-    ) = parse_args()
+    args = Args.parse_args()

-    logger.info("%s", f"Loading ONNX Runtime ({onnxruntime_filename=})")
-    onnxruntime = Onnxruntime.load_once(filename=onnxruntime_filename)
+    logger.info("%s", f"Loading ONNX Runtime ({args.onnxruntime=})")
+    onnxruntime = Onnxruntime.load_once(filename=args.onnxruntime)
     logger.debug("%s", f"{onnxruntime.supported_devices()=}")

-    logger.info("%s", f"Initializing ({acceleration_mode=}, {open_jtalk_dict_dir=})")
+    logger.info("%s", f"Initializing ({args.mode=}, {args.dict_dir=})")
     synthesizer = Synthesizer(
-        onnxruntime,
-        OpenJtalk(open_jtalk_dict_dir),
-        acceleration_mode=acceleration_mode,
+        onnxruntime, OpenJtalk(args.dict_dir), acceleration_mode=args.mode
     )
     logger.debug("%s", f"{synthesizer.metas=}")
     logger.debug("%s", f"{synthesizer.is_gpu_mode=}")

-    logger.info("%s", f"Loading `{vvm_path}`")
-    with VoiceModelFile.open(vvm_path) as model:
+    logger.info("%s", f"Loading `{args.vvm}`")
+    with VoiceModelFile.open(args.vvm) as model:
         synthesizer.load_voice_model(model)

-    logger.info("%s", f"Creating an AudioQuery from {text!r}")
-    audio_query = synthesizer.create_audio_query(text, style_id)
+    logger.info("%s", f"Creating an AudioQuery from {args.text!r}")
+    audio_query = synthesizer.create_audio_query(args.text, args.style_id)

     logger.info("%s", f"Synthesizing with {display_as_json(audio_query)}")
-    if streaming:
+    if args.streaming:
         logger.info("%s", "In streaming mode")
         chunk_sec = 1.0
-        audio_feature = synthesizer.precompute_render(audio_query, style_id)
+        audio_feature = synthesizer.precompute_render(audio_query, args.style_id)
         chunk_frames = int(audio_feature.frame_rate * chunk_sec)
         pcm = b""
         for i in range(0, audio_feature.frame_length, chunk_frames):
@@ -67,70 +126,10 @@ def main() -> None:
         )

     else:
-        wav = synthesizer.synthesis(audio_query, style_id)
+        wav = synthesizer.synthesis(audio_query, args.style_id)

-    out.write_bytes(wav)
-    logger.info("%s", f"Wrote `{out}`")
-
-
-def parse_args() -> Tuple[AccelerationMode, Path, str, Path, str, Path, int, bool]:
-    argparser = ArgumentParser()
-    argparser.add_argument(
-        "--mode",
-        default="AUTO",
-        type=AccelerationMode,
-        help='モード ("AUTO", "CPU", "GPU")',
-    )
-    argparser.add_argument(
-        "vvm",
-        type=Path,
-        help="vvmファイルへのパス",
-    )
-    argparser.add_argument(
-        "--onnxruntime",
-        default=Onnxruntime.LIB_VERSIONED_FILENAME,
-        help="ONNX Runtimeのライブラリのfilename",
-    )
-    argparser.add_argument(
-        "--dict-dir",
-        default="./open_jtalk_dic_utf_8-1.11",
-        type=Path,
-        help="Open JTalkの辞書ディレクトリ",
-    )
-    argparser.add_argument(
-        "--text",
-        default="この音声は、ボイスボックスを使用して、出力されています。",
-        help="読み上げさせたい文章",
-    )
-    argparser.add_argument(
-        "--out",
-        default="./output.wav",
-        type=Path,
-        help="出力wavファイルのパス",
-    )
-    argparser.add_argument(
-        "--style-id",
-        default=0,
-        type=int,
-        help="話者IDを指定",
-    )
-    argparser.add_argument(
-        "--streaming",
-        action="store_true",
-        help="ストリーミング生成",
-    )
-    args = argparser.parse_args()
-    # FIXME: 流石に多くなってきたので、`dataclass`化する
-    return (
-        args.mode,
-        args.vvm,
-        args.onnxruntime,
-        args.dict_dir,
-        args.text,
-        args.out,
-        args.style_id,
-        args.streaming,
-    )
+    args.out.write_bytes(wav)
+    logger.info("%s", f"Wrote `{args.out}`")


 def display_as_json(audio_query: AudioQuery) -> str:
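For quick local verification of the `audio_query` → `create_audio_query` rename, here is a minimal end-to-end sketch using the blocking API touched by this diff. It is illustrative only, not part of the change: the dictionary directory, the `model/0.vvm` path, and style ID `0` are assumptions borrowed from the examples' argparse defaults and docs/guide/user/usage.md.

```python
from pathlib import Path

from voicevox_core.blocking import Onnxruntime, OpenJtalk, Synthesizer, VoiceModelFile

# Load the ONNX Runtime shared library under its default, versioned filename.
onnxruntime = Onnxruntime.load_once(filename=Onnxruntime.LIB_VERSIONED_FILENAME)

# Assumed paths, taken from the examples' defaults.
synthesizer = Synthesizer(onnxruntime, OpenJtalk("./open_jtalk_dic_utf_8-1.11"))
with VoiceModelFile.open("model/0.vvm") as model:
    synthesizer.load_voice_model(model)

# The renamed entry point: per the doc comments in this diff, only AudioQuery
# values returned by `create_audio_query` carry the read-only `kana` field.
audio_query = synthesizer.create_audio_query("サンプル音声です", 0)
print(audio_query.kana)

wav = synthesizer.synthesis(audio_query, 0)
Path("output.wav").write_bytes(wav)
```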