From 378b51e94da13974b59849cc47e3f9156c8bd678 Mon Sep 17 00:00:00 2001
From: Hiroshiba <hihokaruta@gmail.com>
Date: Sat, 24 Aug 2024 18:17:26 +0900
Subject: [PATCH] =?UTF-8?q?=E8=BF=BD=E5=8A=A0=EF=BC=9A=20=E3=82=BD?=
 =?UTF-8?q?=E3=83=B3=E3=82=B0=E3=81=A7=E3=83=8E=E3=83=BC=E3=83=88=E3=81=94?=
 =?UTF-8?q?=E3=81=A8=E3=81=AB=20ID=20=E3=82=92=E6=8C=81=E3=81=9F=E3=81=9B?=
 =?UTF-8?q?=E3=80=81=E9=9F=B3=E7=B4=A0=E3=81=A8=E5=AF=BE=E5=BF=9C=E3=81=A5?=
 =?UTF-8?q?=E3=81=91=E3=82=8B=20(#1460)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* stash

* 歌声合成系のe2eテストを追加

* ノートごとにIDを持たせ、音素と対応づける

* ID付与したスコアでのテストを追加
---
 ...\343\202\222\347\242\272\350\252\215.json" |  24 ++++
 .../test_post_sing_frame_audio_query_200.json |   8 ++
 ...t_post_sing_old_frame_audio_query_200.json | 105 ++++++++++++++++++
 .../test_sing_frame_audio_query.py            |  18 +++
 ...thesize_wave_from_score_output[query].json |  13 +++
 voicevox_engine/tts_pipeline/model.py         |   5 +
 voicevox_engine/tts_pipeline/tts_engine.py    |  25 ++++-
 7 files changed, 196 insertions(+), 2 deletions(-)
 create mode 100644 test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json

diff --git "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json" "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json"
index d3d616be7..4001df4f2 100644
--- "a/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json"
+++ "b/test/e2e/__snapshots__/test_openapi/test_OpenAPI\343\201\256\345\275\242\343\201\214\345\244\211\343\202\217\343\201\243\343\201\246\343\201\204\343\201\252\343\201\204\343\201\223\343\201\250\343\202\222\347\242\272\350\252\215.json"
@@ -423,6 +423,18 @@
             "title": "Frame Length",
             "type": "integer"
           },
+          "note_id": {
+            "anyOf": [
+              {
+                "type": "string"
+              },
+              {
+                "type": "null"
+              }
+            ],
+            "description": "音符のID",
+            "title": "Note Id"
+          },
           "phoneme": {
             "description": "音素",
             "title": "Phoneme",
@@ -616,6 +628,18 @@
             "title": "Frame Length",
             "type": "integer"
           },
+          "id": {
+            "anyOf": [
+              {
+                "type": "string"
+              },
+              {
+                "type": "null"
+              }
+            ],
+            "description": "ID",
+            "title": "Id"
+          },
           "key": {
             "description": "音階",
             "title": "Key",
diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json
index 4914aa6d9..a24153125 100644
--- a/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json
+++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json
@@ -33,34 +33,42 @@
   "phonemes": [
     {
       "frame_length": 2,
+      "note_id": "a",
       "phoneme": "pau"
     },
     {
       "frame_length": 8,
+      "note_id": "b",
       "phoneme": "t"
     },
     {
       "frame_length": 2,
+      "note_id": "b",
       "phoneme": "e"
     },
     {
       "frame_length": 1,
+      "note_id": "c",
       "phoneme": "s"
     },
     {
       "frame_length": 1,
+      "note_id": "c",
       "phoneme": "u"
     },
     {
       "frame_length": 2,
+      "note_id": "d",
       "phoneme": "t"
     },
     {
       "frame_length": 1,
+      "note_id": "d",
       "phoneme": "o"
     },
     {
       "frame_length": 10,
+      "note_id": "e",
       "phoneme": "pau"
     }
   ],
diff --git a/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json b/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json
new file mode 100644
index 000000000..ed60aa6e1
--- /dev/null
+++ b/test/e2e/single_api/tts_pipeline/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json
@@ -0,0 +1,105 @@
+{
+  "f0": [
+    0.0,
+    0.0,
+    46.64,
+    46.64,
+    46.64,
+    46.64,
+    46.64,
+    46.64,
+    46.64,
+    46.64,
+    46.4,
+    46.4,
+    46.62,
+    46.67,
+    83.1,
+    83.1,
+    82.97,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0
+  ],
+  "outputSamplingRate": 24000,
+  "outputStereo": false,
+  "phonemes": [
+    {
+      "frame_length": 2,
+      "note_id": null,
+      "phoneme": "pau"
+    },
+    {
+      "frame_length": 8,
+      "note_id": null,
+      "phoneme": "t"
+    },
+    {
+      "frame_length": 2,
+      "note_id": null,
+      "phoneme": "e"
+    },
+    {
+      "frame_length": 1,
+      "note_id": null,
+      "phoneme": "s"
+    },
+    {
+      "frame_length": 1,
+      "note_id": null,
+      "phoneme": "u"
+    },
+    {
+      "frame_length": 2,
+      "note_id": null,
+      "phoneme": "t"
+    },
+    {
+      "frame_length": 1,
+      "note_id": null,
+      "phoneme": "o"
+    },
+    {
+      "frame_length": 10,
+      "note_id": null,
+      "phoneme": "pau"
+    }
+  ],
+  "volume": [
+    0.0,
+    0.0,
+    0.33,
+    0.33,
+    0.33,
+    0.33,
+    0.33,
+    0.33,
+    0.33,
+    0.33,
+    0.13,
+    0.13,
+    0.32,
+    0.36,
+    0.79,
+    0.79,
+    0.64,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0,
+    0.0
+  ],
+  "volumeScale": 1.0
+}
diff --git a/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py b/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py
index 31ec684bd..908108c10 100644
--- a/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py
+++ b/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py
@@ -11,6 +11,24 @@
 def test_post_sing_frame_audio_query_200(
     client: TestClient, snapshot_json: SnapshotAssertion
 ) -> None:
+    score = {
+        "notes": [
+            {"id": "a", "key": None, "frame_length": 10, "lyric": ""},
+            {"id": "b", "key": 30, "frame_length": 3, "lyric": "て"},
+            {"id": "c", "key": 30, "frame_length": 3, "lyric": "す"},
+            {"id": "d", "key": 40, "frame_length": 1, "lyric": "と"},
+            {"id": "e", "key": None, "frame_length": 10, "lyric": ""},
+        ]
+    }
+    response = client.post("/sing_frame_audio_query", params={"speaker": 0}, json=score)
+    assert response.status_code == 200
+    assert snapshot_json == round_floats(response.json(), 2)
+
+
+def test_post_sing_old_frame_audio_query_200(
+    client: TestClient, snapshot_json: SnapshotAssertion
+) -> None:
+    """古いバージョンの楽譜でもエラーなく合成できる"""
     score = {
         "notes": [
             {"key": None, "frame_length": 10, "lyric": ""},
diff --git a/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json b/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json
index ed97c822c..fa6baf1d5 100644
--- a/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json
+++ b/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json
@@ -2,54 +2,67 @@
   [
     {
       "frame_length": 4,
+      "note_id": null,
       "phoneme": "pau"
     },
     {
       "frame_length": 6,
+      "note_id": null,
       "phoneme": "d"
     },
     {
       "frame_length": 4,
+      "note_id": null,
       "phoneme": "o"
     },
     {
       "frame_length": 8,
+      "note_id": null,
       "phoneme": "r"
     },
     {
       "frame_length": 13,
+      "note_id": null,
       "phoneme": "e"
     },
     {
       "frame_length": 4,
+      "note_id": null,
       "phoneme": "m"
     },
     {
       "frame_length": 21,
+      "note_id": null,
       "phoneme": "i"
     },
     {
       "frame_length": 3,
+      "note_id": null,
       "phoneme": "pau"
     },
     {
       "frame_length": 2,
+      "note_id": null,
       "phoneme": "f"
     },
     {
       "frame_length": 6,
+      "note_id": null,
       "phoneme": "a"
     },
     {
       "frame_length": 6,
+      "note_id": null,
       "phoneme": "s"
     },
     {
       "frame_length": 17,
+      "note_id": null,
       "phoneme": "o"
     },
     {
       "frame_length": 10,
+      "note_id": null,
       "phoneme": "pau"
     }
   ],
diff --git a/voicevox_engine/tts_pipeline/model.py b/voicevox_engine/tts_pipeline/model.py
index dde8b7631..542b859a4 100644
--- a/voicevox_engine/tts_pipeline/model.py
+++ b/voicevox_engine/tts_pipeline/model.py
@@ -5,10 +5,13 @@
 """
 
 from enum import Enum
+from typing import NewType
 
 from pydantic import BaseModel, ConfigDict, Field
 from pydantic.json_schema import SkipJsonSchema
 
+NoteId = NewType("NoteId", str)
+
 
 class Mora(BaseModel):
     """
@@ -63,6 +66,7 @@ class Note(BaseModel):
     音符ごとの情報
     """
 
+    id: NoteId | None = Field(default=None, description="ID")
     key: int | SkipJsonSchema[None] = Field(default=None, description="音階")
     frame_length: int = Field(description="音符のフレーム長")
     lyric: str = Field(description="音符の歌詞")
@@ -83,6 +87,7 @@ class FramePhoneme(BaseModel):
 
     phoneme: str = Field(description="音素")
     frame_length: int = Field(description="音素のフレーム長")
+    note_id: NoteId | None = Field(default=None, description="音符のID")
 
 
 class FrameAudioQuery(BaseModel):
diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py
index 664e11b95..ea0f250ff 100644
--- a/voicevox_engine/tts_pipeline/tts_engine.py
+++ b/voicevox_engine/tts_pipeline/tts_engine.py
@@ -17,7 +17,15 @@
 from ..metas.Metas import StyleId
 from ..model import AudioQuery
 from .kana_converter import parse_kana
-from .model import AccentPhrase, FrameAudioQuery, FramePhoneme, Mora, Note, Score
+from .model import (
+    AccentPhrase,
+    FrameAudioQuery,
+    FramePhoneme,
+    Mora,
+    Note,
+    NoteId,
+    Score,
+)
 from .mora_mapping import mora_kana_to_mora_phonemes, mora_phonemes_to_mora_kana
 from .phoneme import Phoneme
 from .text_analyzer import text_to_accent_phrases
@@ -313,6 +321,7 @@ def _notes_to_keys_and_phonemes(
     NDArray[np.int64],
     NDArray[np.int64],
     NDArray[np.int64],
+    list[NoteId | None],
 ]:
     """
     ノート単位の長さ・モーラ情報や、音素列・音素ごとのキー列を作成する
@@ -332,6 +341,8 @@ def _notes_to_keys_and_phonemes(
         音素列
     phoneme_keys : NDArray[np.int64]
         音素ごとのキー列
+    phoneme_note_ids : list[NoteId | None]
+        音素ごとのノートID列
     """
 
     note_lengths: list[int] = []
@@ -339,6 +350,7 @@ def _notes_to_keys_and_phonemes(
     note_vowels: list[int] = []
     phonemes: list[int] = []
     phoneme_keys: list[int] = []
+    phoneme_note_ids: list[NoteId | None] = []
 
     for note in notes:
         if note.lyric == "":
@@ -350,6 +362,7 @@ def _notes_to_keys_and_phonemes(
             note_vowels.append(0)  # pau
             phonemes.append(0)  # pau
             phoneme_keys.append(-1)
+            phoneme_note_ids.append(note.id)
         else:
             if note.key is None:
                 msg = "keyがnullの場合、lyricは空文字列である必要があります。"
@@ -378,8 +391,10 @@ def _notes_to_keys_and_phonemes(
             if consonant_id != -1:
                 phonemes.append(consonant_id)
                 phoneme_keys.append(note.key)
+                phoneme_note_ids.append(note.id)
             phonemes.append(vowel_id)
             phoneme_keys.append(note.key)
+            phoneme_note_ids.append(note.id)
 
     # 各データをnumpy配列に変換する
     note_lengths_array = np.array(note_lengths, dtype=np.int64)
@@ -394,6 +409,7 @@ def _notes_to_keys_and_phonemes(
         note_vowels_array,
         phonemes_array,
         phoneme_keys_array,
+        phoneme_note_ids,
     )
 
 
@@ -599,6 +615,7 @@ def create_sing_phoneme_and_f0_and_volume(
             note_vowels_array,
             phonemes_array,
             phoneme_keys_array,
+            phoneme_note_ids,
         ) = _notes_to_keys_and_phonemes(notes)
 
         # コアを用いて子音長を生成する
@@ -628,8 +645,11 @@ def create_sing_phoneme_and_f0_and_volume(
             FramePhoneme(
                 phoneme=Phoneme._PHONEME_LIST[phoneme_id],
                 frame_length=phoneme_duration,
+                note_id=phoneme_note_id,
+            )
+            for phoneme_id, phoneme_duration, phoneme_note_id in zip(
+                phonemes_array, phoneme_lengths, phoneme_note_ids
             )
-            for phoneme_id, phoneme_duration in zip(phonemes_array, phoneme_lengths)
         ]
 
         return phoneme_data_list, f0s.tolist(), volumes.tolist()
@@ -650,6 +670,7 @@ def create_sing_volume_from_phoneme_and_f0(
             _,
             phonemes_array_from_notes,
             phoneme_keys_array,
+            _,
         ) = _notes_to_keys_and_phonemes(notes)
 
         phonemes_array = np.array(