From 378b51e94da13974b59849cc47e3f9156c8bd678 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sat, 24 Aug 2024 18:17:26 +0900 Subject: [PATCH] =?UTF-8?q?=E8=BF=BD=E5=8A=A0=EF=BC=9A=20=E3=82=BD?= =?UTF-8?q?=E3=83=B3=E3=82=B0=E3=81=A7=E3=83=8E=E3=83=BC=E3=83=88=E3=81=94?= =?UTF-8?q?=E3=81=A8=E3=81=AB=20ID=20=E3=82=92=E6=8C=81=E3=81=9F=E3=81=9B?= =?UTF-8?q?=E3=80=81=E9=9F=B3=E7=B4=A0=E3=81=A8=E5=AF=BE=E5=BF=9C=E3=81=A5?= =?UTF-8?q?=E3=81=91=E3=82=8B=20(#1460)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * stash * 歌声合成系のe2eテストを追加 * ノートごとにIDを持たせ、音素と対応づ * ID付与したスコアでのテストを追加 --- ...\343\202\222\347\242\272\350\252\215.json" | 24 ++++ .../test_post_sing_frame_audio_query_200.json | 8 ++ ...t_post_sing_old_frame_audio_query_200.json | 105 ++++++++++++++++++ .../test_sing_frame_audio_query.py | 18 +++ ...thesize_wave_from_score_output[query].json | 13 +++ voicevox_engine/tts_pipeline/model.py | 5 + voicevox_engine/tts_pipeline/tts_engine.py | 25 ++++- 7 files changed, 196 insertions(+), 2 deletions(-) create mode 100644 test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json diff --git "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" index d3d616be7..4001df4f2 100644 --- "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" +++ "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" @@ -423,6 +423,18 @@ "title": "Frame Length", "type": "integer" }, + "note_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "音符のID", + "title": "Note Id" + }, "phoneme": { "description": "音素", "title": "Phoneme", @@ -616,6 +628,18 @@ "title": "Frame Length", "type": "integer" }, + "id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "ID", + "title": "Id" + }, "key": { "description": "音階", "title": "Key", diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json index 4914aa6d9..a24153125 100644 --- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json @@ -33,34 +33,42 @@ "phonemes": [ { "frame_length": 2, + "note_id": "a", "phoneme": "pau" }, { "frame_length": 8, + "note_id": "b", "phoneme": "t" }, { "frame_length": 2, + "note_id": "b", "phoneme": "e" }, { "frame_length": 1, + "note_id": "c", "phoneme": "s" }, { "frame_length": 1, + "note_id": "c", "phoneme": "u" }, { "frame_length": 2, + "note_id": "d", "phoneme": "t" }, { "frame_length": 1, + "note_id": "d", "phoneme": "o" }, { "frame_length": 10, + "note_id": "e", "phoneme": "pau" } ], diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json new file mode 100644 index 000000000..ed60aa6e1 --- /dev/null +++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json @@ -0,0 +1,105 @@ +{ + "f0": [ + 0.0, + 0.0, + 46.64, + 46.64, + 46.64, + 46.64, + 46.64, + 46.64, + 46.64, + 46.64, + 46.4, + 46.4, + 46.62, + 46.67, + 83.1, + 83.1, + 82.97, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "outputSamplingRate": 24000, + "outputStereo": false, + "phonemes": [ + { + "frame_length": 2, + "note_id": null, + "phoneme": "pau" + }, + { + "frame_length": 8, + "note_id": null, + "phoneme": "t" + }, + { + "frame_length": 2, + "note_id": null, + "phoneme": "e" + }, + { + "frame_length": 1, + "note_id": null, + "phoneme": "s" + }, + { + "frame_length": 1, + "note_id": null, + "phoneme": "u" + }, + { + "frame_length": 2, + "note_id": null, + "phoneme": "t" + }, + { + "frame_length": 1, + "note_id": null, + "phoneme": "o" + }, + { + "frame_length": 10, + "note_id": null, + "phoneme": "pau" + } + ], + "volume": [ + 0.0, + 0.0, + 0.33, + 0.33, + 0.33, + 0.33, + 0.33, + 0.33, + 0.33, + 0.33, + 0.13, + 0.13, + 0.32, + 0.36, + 0.79, + 0.79, + 0.64, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "volumeScale": 1.0 +} diff --git a/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py b/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py index 31ec684bd..908108c10 100644 --- a/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py +++ b/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py @@ -11,6 +11,24 @@ def test_post_sing_frame_audio_query_200( client: TestClient, snapshot_json: SnapshotAssertion ) -> None: + score = { + "notes": [ + {"id": "a", "key": None, "frame_length": 10, "lyric": ""}, + {"id": "b", "key": 30, "frame_length": 3, "lyric": "て"}, + {"id": "c", "key": 30, "frame_length": 3, "lyric": "す"}, + {"id": "d", "key": 40, "frame_length": 1, "lyric": "と"}, + {"id": "e", "key": None, "frame_length": 10, "lyric": ""}, + ] + } + response = client.post("/sing_frame_audio_query", params={"speaker": 0}, json=score) + assert response.status_code == 200 + assert snapshot_json == round_floats(response.json(), 2) + + +def test_post_sing_old_frame_audio_query_200( + client: TestClient, snapshot_json: SnapshotAssertion +) -> None: + """古いバージョンの楽譜でもエラーなく合成できる""" score = { "notes": [ {"key": None, "frame_length": 10, "lyric": ""}, diff --git a/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json b/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json index ed97c822c..fa6baf1d5 100644 --- a/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json +++ b/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json @@ -2,54 +2,67 @@ [ { "frame_length": 4, + "note_id": null, "phoneme": "pau" }, { "frame_length": 6, + "note_id": null, "phoneme": "d" }, { "frame_length": 4, + "note_id": null, "phoneme": "o" }, { "frame_length": 8, + "note_id": null, "phoneme": "r" }, { "frame_length": 13, + "note_id": null, "phoneme": "e" }, { "frame_length": 4, + "note_id": null, "phoneme": "m" }, { "frame_length": 21, + "note_id": null, "phoneme": "i" }, { "frame_length": 3, + "note_id": null, "phoneme": "pau" }, { "frame_length": 2, + "note_id": null, "phoneme": "f" }, { "frame_length": 6, + "note_id": null, "phoneme": "a" }, { "frame_length": 6, + "note_id": null, "phoneme": "s" }, { "frame_length": 17, + "note_id": null, "phoneme": "o" }, { "frame_length": 10, + "note_id": null, "phoneme": "pau" } ], diff --git a/voicevox_engine/tts_pipeline/model.py b/voicevox_engine/tts_pipeline/model.py index dde8b7631..542b859a4 100644 --- a/voicevox_engine/tts_pipeline/model.py +++ b/voicevox_engine/tts_pipeline/model.py @@ -5,10 +5,13 @@ """ from enum import Enum +from typing import NewType from pydantic import BaseModel, ConfigDict, Field from pydantic.json_schema import SkipJsonSchema +NoteId = NewType("NoteId", str) + class Mora(BaseModel): """ @@ -63,6 +66,7 @@ class Note(BaseModel): 音符ごとの情報 """ + id: NoteId | None = Field(default=None, description="ID") key: int | SkipJsonSchema[None] = Field(default=None, description="音階") frame_length: int = Field(description="音符のフレーム長") lyric: str = Field(description="音符の歌詞") @@ -83,6 +87,7 @@ class FramePhoneme(BaseModel): phoneme: str = Field(description="音素") frame_length: int = Field(description="音素のフレーム長") + note_id: NoteId | None = Field(default=None, description="音符のID") class FrameAudioQuery(BaseModel): diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 664e11b95..ea0f250ff 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -17,7 +17,15 @@ from ..metas.Metas import StyleId from ..model import AudioQuery from .kana_converter import parse_kana -from .model import AccentPhrase, FrameAudioQuery, FramePhoneme, Mora, Note, Score +from .model import ( + AccentPhrase, + FrameAudioQuery, + FramePhoneme, + Mora, + Note, + NoteId, + Score, +) from .mora_mapping import mora_kana_to_mora_phonemes, mora_phonemes_to_mora_kana from .phoneme import Phoneme from .text_analyzer import text_to_accent_phrases @@ -313,6 +321,7 @@ def _notes_to_keys_and_phonemes( NDArray[np.int64], NDArray[np.int64], NDArray[np.int64], + list[NoteId | None], ]: """ ノート単位の長さ・モーラ情報や、音素列・音素ごとのキー列を作成する @@ -332,6 +341,8 @@ def _notes_to_keys_and_phonemes( 音素列 phoneme_keys : NDArray[np.int64] 音素ごとのキー列 + phoneme_note_ids : list[NoteId] + 音素ごとのノートID列 """ note_lengths: list[int] = [] @@ -339,6 +350,7 @@ def _notes_to_keys_and_phonemes( note_vowels: list[int] = [] phonemes: list[int] = [] phoneme_keys: list[int] = [] + phoneme_note_ids: list[NoteId | None] = [] for note in notes: if note.lyric == "": @@ -350,6 +362,7 @@ def _notes_to_keys_and_phonemes( note_vowels.append(0) # pau phonemes.append(0) # pau phoneme_keys.append(-1) + phoneme_note_ids.append(note.id) else: if note.key is None: msg = "keyがnullの場合、lyricは空文字列である必要があります。" @@ -378,8 +391,10 @@ def _notes_to_keys_and_phonemes( if consonant_id != -1: phonemes.append(consonant_id) phoneme_keys.append(note.key) + phoneme_note_ids.append(note.id) phonemes.append(vowel_id) phoneme_keys.append(note.key) + phoneme_note_ids.append(note.id) # 各データをnumpy配列に変換する note_lengths_array = np.array(note_lengths, dtype=np.int64) @@ -394,6 +409,7 @@ def _notes_to_keys_and_phonemes( note_vowels_array, phonemes_array, phoneme_keys_array, + phoneme_note_ids, ) @@ -599,6 +615,7 @@ def create_sing_phoneme_and_f0_and_volume( note_vowels_array, phonemes_array, phoneme_keys_array, + phoneme_note_ids, ) = _notes_to_keys_and_phonemes(notes) # コアを用いて子音長を生成する @@ -628,8 +645,11 @@ def create_sing_phoneme_and_f0_and_volume( FramePhoneme( phoneme=Phoneme._PHONEME_LIST[phoneme_id], frame_length=phoneme_duration, + note_id=phoneme_note_id, + ) + for phoneme_id, phoneme_duration, phoneme_note_id in zip( + phonemes_array, phoneme_lengths, phoneme_note_ids ) - for phoneme_id, phoneme_duration in zip(phonemes_array, phoneme_lengths) ] return phoneme_data_list, f0s.tolist(), volumes.tolist() @@ -650,6 +670,7 @@ def create_sing_volume_from_phoneme_and_f0( _, phonemes_array_from_notes, phoneme_keys_array, + _, ) = _notes_to_keys_and_phonemes(notes) phonemes_array = np.array(