diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml index 49896b0de..23425324e 100644 --- a/.github/workflows/build_and_deploy.yml +++ b/.github/workflows/build_and_deploy.yml @@ -222,7 +222,7 @@ jobs: plain-cdylib) linking=load-onnxruntime ;; ios-xcframework) linking=link-onnxruntime ;; esac - cargo build -p voicevox_core_c_api -vv --features "$linking" --target ${{ matrix.target }} --release + cargo build -p voicevox_core_c_api -v --features "$linking" --target ${{ matrix.target }} --release env: RUSTFLAGS: -C panic=abort - name: build voicevox_core_python_api @@ -237,7 +237,7 @@ jobs: echo "whl=$(find ./target/wheels -type f)" >> "$GITHUB_OUTPUT" - name: build voicevox_core_java_api if: contains(matrix.target, 'android') - run: cargo build -p voicevox_core_java_api -vv --target ${{ matrix.target }} --release + run: cargo build -p voicevox_core_java_api -v --target ${{ matrix.target }} --release - name: Organize artifact run: | mkdir -p "artifact/${{ env.ASSET_NAME }}" diff --git a/.github/workflows/build_and_deploy_downloader.yml b/.github/workflows/build_and_deploy_downloader.yml index 256e17bd6..6295c05bf 100644 --- a/.github/workflows/build_and_deploy_downloader.yml +++ b/.github/workflows/build_and_deploy_downloader.yml @@ -78,7 +78,7 @@ jobs: targets: ${{ matrix.target }} - name: Build downloader - run: cargo build -vv --release -p downloader --target ${{ matrix.target }} + run: cargo build -v --release -p downloader --target ${{ matrix.target }} - name: Rename the binary run: | diff --git a/.github/workflows/download_test.yml b/.github/workflows/download_test.yml index ebdd25813..3ff8f71dd 100644 --- a/.github/workflows/download_test.yml +++ b/.github/workflows/download_test.yml @@ -33,7 +33,7 @@ jobs: include: - name: 通常ダウンロード os: windows-latest - download_command: cargo run -vv -p downloader -- # バージョン指定のために -- が必要 + download_command: cargo run -v -p downloader -- # バージョン指定のために -- が必要 download_dir: voicevox_core 
check_items: | voicevox_core.dll @@ -51,7 +51,7 @@ jobs: *curand* - name: CpuArch指定 os: windows-latest - download_command: cargo run -vv -p downloader -- --cpu-arch x86 + download_command: cargo run -v -p downloader -- --cpu-arch x86 download_dir: voicevox_core check_items: | voicevox_core.dll @@ -68,7 +68,7 @@ jobs: *curand* - name: output先指定ダウンロード os: windows-latest - download_command: cargo run -vv -p downloader -- -o other_output + download_command: cargo run -v -p downloader -- -o other_output download_dir: other_output check_items: | voicevox_core.dll @@ -85,7 +85,7 @@ jobs: *curand* - name: Min option確認 os: windows-latest - download_command: cargo run -vv -p downloader -- --min + download_command: cargo run -v -p downloader -- --min download_dir: voicevox_core check_items: | voicevox_core.dll @@ -102,7 +102,7 @@ jobs: open_jtalk_dic_utf_8-1.11 - name: DirectML option確認 os: windows-latest - download_command: cargo run -vv -p downloader -- --devices directml + download_command: cargo run -v -p downloader -- --devices directml download_dir: voicevox_core check_items: | voicevox_core.dll @@ -121,7 +121,7 @@ jobs: *curand* - name: DirectMLかつMin option確認 os: windows-latest - download_command: cargo run -vv -p downloader -- --devices directml --min + download_command: cargo run -v -p downloader -- --devices directml --min download_dir: voicevox_core check_items: | voicevox_core.dll @@ -139,7 +139,7 @@ jobs: open_jtalk_dic_utf_8-1.11 - name: cuda option確認 os: windows-latest - download_command: cargo run -vv -p downloader -- --devices cuda + download_command: cargo run -v -p downloader -- --devices cuda download_dir: voicevox_core check_items: | voicevox_core.dll @@ -161,7 +161,7 @@ jobs: *directml* - name: cudaかつmin option確認 os: windows-latest - download_command: cargo run -vv -p downloader -- --devices cuda --min + download_command: cargo run -v -p downloader -- --devices cuda --min download_dir: voicevox_core check_items: | voicevox_core.dll diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index f4c4d6ab1..936e3aebe 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -72,10 +72,10 @@ jobs: with: python-version: "3.8" - uses: Swatinem/rust-cache@v2 - - run: cargo clippy -vv --tests -- -D clippy::all -D warnings --no-deps - - run: cargo clippy -vv -- -D clippy::all -D warnings --no-deps - - run: cargo clippy -vv -p voicevox_core -p voicevox_core_c_api --features link-onnxruntime --tests -- -D clippy::all -D warnings --no-deps - - run: cargo clippy -vv -p voicevox_core -p voicevox_core_c_api --features link-onnxruntime -- -D clippy::all -D warnings --no-deps + - run: cargo clippy -v --tests -- -D clippy::all -D warnings --no-deps + - run: cargo clippy -v -- -D clippy::all -D warnings --no-deps + - run: cargo clippy -v -p voicevox_core -p voicevox_core_c_api --features link-onnxruntime --tests -- -D clippy::all -D warnings --no-deps + - run: cargo clippy -v -p voicevox_core -p voicevox_core_c_api --features link-onnxruntime -- -D clippy::all -D warnings --no-deps - run: cargo fmt -- --check rust-unit-test: @@ -88,9 +88,9 @@ jobs: with: key: "cargo-unit-test-cache" - name: Run cargo unit test - run: RUST_BACKTRACE=full cargo test --lib --bins -vv -- --include-ignored + run: RUST_BACKTRACE=full cargo test --lib --bins -v -- --include-ignored - name: Run cargo documentation test - run: RUST_BACKTRACE=full cargo test --doc -vv + run: RUST_BACKTRACE=full cargo test --doc -v rust-integration-test-strategy-matrix: # 実行対象の条件をフィルタリングする runs-on: ubuntu-latest @@ -135,7 +135,7 @@ jobs: with: key: "cargo-integration-test-cache-${{ matrix.os }}" - name: Run cargo integration test (load-onnxruntime) - run: RUST_BACKTRACE=full cargo test --test "*" -vv -- --include-ignored + run: RUST_BACKTRACE=full cargo test --test "*" -v -- --include-ignored c-header: runs-on: ubuntu-latest @@ -193,7 +193,7 @@ jobs: - name: Install cargo-binstall uses: taiki-e/install-action@cargo-binstall - name: 
build voicevox_core_c_api - run: cargo build -p voicevox_core_c_api --features load-onnxruntime -vv + run: cargo build -p voicevox_core_c_api --features load-onnxruntime -v - name: 必要なfileをunix用exampleのディレクトリに移動させる run: | mkdir -p example/cpp/unix/voicevox_core/ @@ -237,7 +237,7 @@ jobs: - name: Install cargo-binstall uses: taiki-e/install-action@cargo-binstall - name: build voicevox_core_c_api - run: cargo build -p voicevox_core_c_api --features load-onnxruntime -vv + run: cargo build -p voicevox_core_c_api --features load-onnxruntime -v - name: 必要なfileをexampleのディレクトリに移動させる shell: bash run: | @@ -281,7 +281,7 @@ jobs: - run: | pip install --upgrade poetry poetry install --with dev --with test - - run: cargo build -p test_util -vv # build scriptにより/crates/test_util/data/の生成 + - run: cargo build -p test_util -v # build scriptにより/crates/test_util/data/の生成 - run: poetry run maturin build --locked - run: poetry run maturin develop --locked - name: pytestを実行 @@ -311,8 +311,8 @@ jobs: distribution: "adopt" - name: Build run: | - cargo build -p voicevox_core_java_api -vv - cargo build -p test_util -vv # build scriptにより/crates/test_util/data/の生成 + cargo build -p voicevox_core_java_api -v + cargo build -p test_util -v # build scriptにより/crates/test_util/data/の生成 - name: 必要なDLLをコピーしてテストを実行 working-directory: crates/voicevox_core_java_api run: | diff --git a/Cargo.lock b/Cargo.lock index 531efc631..2c987cab4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4272,7 +4272,7 @@ checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "voicevox-ort" version = "2.0.0-rc.4" -source = "git+https://github.com/VOICEVOX/ort.git?rev=3ecf05d66e2e04435fde3c8200e5208ce2707eb7#3ecf05d66e2e04435fde3c8200e5208ce2707eb7" +source = "git+https://github.com/VOICEVOX/ort.git?rev=17f741301db0bb08da0eafe8a338e5efd8a4b5df#17f741301db0bb08da0eafe8a338e5efd8a4b5df" dependencies = [ "anyhow", "half", @@ -4289,7 +4289,7 @@ dependencies = [ [[package]] name = 
"voicevox-ort-sys" version = "2.0.0-rc.4" -source = "git+https://github.com/VOICEVOX/ort.git?rev=3ecf05d66e2e04435fde3c8200e5208ce2707eb7#3ecf05d66e2e04435fde3c8200e5208ce2707eb7" +source = "git+https://github.com/VOICEVOX/ort.git?rev=17f741301db0bb08da0eafe8a338e5efd8a4b5df#17f741301db0bb08da0eafe8a338e5efd8a4b5df" dependencies = [ "flate2", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index 932e2b1d8..744121194 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -99,7 +99,7 @@ zip = "0.6.3" [workspace.dependencies.voicevox-ort] git = "https://github.com/VOICEVOX/ort.git" -rev = "3ecf05d66e2e04435fde3c8200e5208ce2707eb7" +rev = "17f741301db0bb08da0eafe8a338e5efd8a4b5df" [workspace.dependencies.open_jtalk] git = "https://github.com/VOICEVOX/open_jtalk-rs.git" diff --git a/README.md b/README.md index 8f4f2749d..c766f23c8 100644 --- a/README.md +++ b/README.md @@ -139,8 +139,7 @@ Issue 側で取り組み始めたことを伝えるか、最初に Draft プル ### Rust 以外の言語の API に関する方針 -VOICEVOX CORE の主要機能は Rust で実装されることを前提としており、他の言語のラッパーでのみの機能追加はしない方針としています。これは機能の一貫性を保つための方針です。 -各言語の特性に応じた追加実装(例えば、Python での `style_id` の [`NewType`](https://docs.python.org/ja/3/library/typing.html#newtype) 化など)は許容されます。 +[APIデザイン ガイドライン](./docs/guide/dev/api-design.md)をご覧ください。 ## コアライブラリのビルド diff --git a/crates/test_util/build.rs b/crates/test_util/build.rs index 3cdc88d78..700321bd3 100644 --- a/crates/test_util/build.rs +++ b/crates/test_util/build.rs @@ -202,6 +202,35 @@ fn generate_example_data_json(dist: &Path) -> anyhow::Result<()> { phoneme.to_vec() }, }, + intermediate: typing::IntermediateExampleData { + f0_length: 69, + phoneme_size: 45, + feature_dim: 80, + margin_width: 14, + f0_vector: { + let mut f0 = [0.; 69]; + f0[9..24].fill(5.905218); + f0[37..60].fill(5.565851); + f0.to_vec() + }, + phoneme_vector: { + let mut phoneme = [0.; 45 * 69]; + let mut set_one = |index, range| { + for i in range { + phoneme[(i * 45 + index) as usize] = 1.; + } + }; + set_one(0, 0..9); + set_one(37, 9..13); + set_one(14, 13..24); + 
set_one(35, 24..30); + set_one(6, 30..37); + set_one(37, 37..45); + set_one(30, 45..60); + set_one(0, 60..69); + phoneme.to_vec() + }, + }, }; fs_err::write( diff --git a/crates/test_util/compatible_engine.h b/crates/test_util/compatible_engine.h index 254fd8161..2cd58b971 100644 --- a/crates/test_util/compatible_engine.h +++ b/crates/test_util/compatible_engine.h @@ -25,4 +25,11 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list, bool decode_forward(int64_t length, int64_t phoneme_size, float *f0, float *phoneme, int64_t *speaker_id, float *output); +bool generate_full_intermediate(int64_t length, int64_t phoneme_size, + float *f0, float *phoneme, int64_t *speaker_id, + float *output); + +bool render_audio_segment(int64_t length, int64_t margin_width, int64_t feature_size, + float *audio_feature, int64_t *speaker_id, float *output); + const char *last_error_message(); diff --git a/crates/test_util/src/typing.rs b/crates/test_util/src/typing.rs index 1d10c9cb9..ed0b6b40c 100644 --- a/crates/test_util/src/typing.rs +++ b/crates/test_util/src/typing.rs @@ -31,6 +31,16 @@ pub struct DecodeExampleData { pub phoneme_vector: Vec, } +#[derive(Debug, Serialize, Deserialize)] +pub struct IntermediateExampleData { + pub f0_length: i64, + pub phoneme_size: i64, + pub feature_dim: i64, + pub margin_width: i64, + pub f0_vector: Vec, + pub phoneme_vector: Vec, +} + #[derive(Debug, Serialize, Deserialize)] pub struct ExampleData { pub speaker_id: i64, @@ -38,4 +48,5 @@ pub struct ExampleData { pub duration: DurationExampleData, pub intonation: IntonationExampleData, pub decode: DecodeExampleData, + pub intermediate: IntermediateExampleData, } diff --git a/crates/voicevox_core/src/engine/model.rs b/crates/voicevox_core/src/engine/model.rs index 20203feb3..228c0bbd8 100644 --- a/crates/voicevox_core/src/engine/model.rs +++ b/crates/voicevox_core/src/engine/model.rs @@ -1,4 +1,7 @@ -use serde::{Deserialize, Serialize}; +use std::fmt; + +use 
duplicate::duplicate_item; +use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; /* 各フィールドのjsonフィールド名はsnake_caseとする*/ @@ -64,6 +67,21 @@ pub struct AudioQuery { pub output_sampling_rate: u32, /// 音声データをステレオ出力するか否か。 pub output_stereo: bool, + // TODO: VOICEVOX/voicevox_engine#1308 を実装する + /// 句読点などの無音時間。`null`のときは無視される。デフォルト値は`null`。 + #[serde( + default, + deserialize_with = "deserialize_pause_length", + serialize_with = "serialize_pause_length" + )] + pub pause_length: (), + /// 読点などの無音時間(倍率)。デフォルト値は`1`。 + #[serde( + default, + deserialize_with = "deserialize_pause_length_scale", + serialize_with = "serialize_pause_length_scale" + )] + pub pause_length_scale: (), /// \[読み取り専用\] AquesTalk風記法。 /// /// [`Synthesizer::audio_query`]が返すもののみ`Some`となる。入力としてのAudioQueryでは無視され @@ -73,6 +91,87 @@ pub struct AudioQuery { pub kana: Option, } +fn deserialize_pause_length<'de, D>(deserializer: D) -> Result<(), D::Error> +where + D: Deserializer<'de>, +{ + return deserializer.deserialize_any(Visitor); + + struct Visitor; + + impl<'de> de::Visitor<'de> for Visitor { + type Value = (); + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("`null`") + } + + #[duplicate_item( + method T; + [ visit_i64 ] [ i64 ]; + [ visit_u64 ] [ u64 ]; + [ visit_f64 ] [ f64 ]; + )] + fn method(self, _: T) -> Result + where + E: de::Error, + { + Err(E::custom("currently `pause_length` must be `null`")) + } + + fn visit_unit(self) -> Result { + Ok(()) + } + } +} + +fn serialize_pause_length(_: &(), serializer: S) -> Result +where + S: Serializer, +{ + serializer.serialize_unit() +} + +fn deserialize_pause_length_scale<'de, D>(deserializer: D) -> Result<(), D::Error> +where + D: Deserializer<'de>, +{ + return deserializer.deserialize_any(Visitor); + + struct Visitor; + + impl<'de> de::Visitor<'de> for Visitor { + type Value = (); + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("`1.`") + } + + 
#[duplicate_item( + method T ONE; + [ visit_i64 ] [ i64 ] [ 1 ]; + [ visit_u64 ] [ u64 ] [ 1 ]; + [ visit_f64 ] [ f64 ] [ 1. ]; + )] + fn method(self, v: T) -> Result + where + E: de::Error, + { + if v != ONE { + return Err(E::custom("currently `pause_length_scale` must be `1.`")); + } + Ok(()) + } + } +} + +fn serialize_pause_length_scale(_: &(), serializer: S) -> Result +where + S: Serializer, +{ + (1.).serialize(serializer) +} + impl AudioQuery { pub(crate) fn with_kana(self, kana: Option) -> Self { Self { kana, ..self } @@ -99,6 +198,8 @@ mod tests { post_phoneme_length: 0.0, output_sampling_rate: 0, output_stereo: false, + pause_length: (), + pause_length_scale: (), kana: None, }; let val = serde_json::to_value(audio_query_model).unwrap(); @@ -152,4 +253,42 @@ mod tests { }))?; Ok(()) } + + // TODO: 型的に自明になったらこのテストは削除する + #[rstest] + fn it_denies_non_null_for_pause_length() { + serde_json::from_value::(json!({ + "accent_phrases": [], + "speed_scale": 1.0, + "pitch_scale": 0.0, + "intonation_scale": 1.0, + "volume_scale": 1.0, + "pre_phoneme_length": 0.1, + "post_phoneme_length": 0.1, + "output_sampling_rate": 24000, + "output_stereo": false, + "pause_length": "aaaaa" + })) + .map(|_| ()) + .unwrap_err(); + } + + // TODO: 型的に自明になったらこのテストは削除する + #[rstest] + fn it_denies_non_float_for_pause_length_scale() { + serde_json::from_value::(json!({ + "accent_phrases": [], + "speed_scale": 1.0, + "pitch_scale": 0.0, + "intonation_scale": 1.0, + "volume_scale": 1.0, + "pre_phoneme_length": 0.1, + "post_phoneme_length": 0.1, + "output_sampling_rate": 24000, + "output_stereo": false, + "pause_length_scale": "aaaaa", + })) + .map(|_| ()) + .unwrap_err(); + } } diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 17b1b2f9c..5a8a06a03 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -100,6 +100,7 @@ mod inner { use std::{ io::{Cursor, Write as _}, marker::PhantomData, + 
ops::Range, sync::Arc, }; use tracing::info; @@ -127,6 +128,30 @@ mod inner { use super::{AccelerationMode, AsyncForOnnxruntime, InitializeOptions, TtsOptions}; const DEFAULT_SAMPLING_RATE: u32 = 24000; + /// 音が途切れてしまうのを避けるworkaround処理のためのパディング幅(フレーム数) + const PADDING_FRAME_LENGTH: usize = 38; // (0.4秒 * 24000Hz / 256.0).round() + /// 音声生成の際、音声特徴量の前後に確保すべきマージン幅(フレーム数) + /// モデルの受容野から計算される + const MARGIN: usize = 14; + /// 指定した音声区間に対応する特徴量を両端にマージンを追加した上で切り出す + fn crop_with_margin(audio: &AudioFeature, range: Range) -> ndarray::ArrayView2<'_, f32> { + if range.start > audio.frame_length || range.end > audio.frame_length { + panic!( + "{range:?} is out of range for audio feature of length {frame_length}", + frame_length = audio.frame_length, + ); + } + if range.start > range.end { + panic!("{range:?} is invalid because start > end",); + } + let range = range.start..range.end + 2 * MARGIN; + audio.internal_state.slice(ndarray::s![range, ..]) + } + /// 追加した安全マージンを生成音声から取り除く + fn trim_margin_from_wave(wave_with_margin: ndarray::Array1) -> ndarray::Array1 { + let len = wave_with_margin.len(); + wave_with_margin.slice_move(ndarray::s![MARGIN * 256..len - MARGIN * 256]) + } /// 音声の中間表現。 pub struct AudioFeature { @@ -138,8 +163,6 @@ mod inner { pub frame_length: usize, /// フレームレート。全体の秒数は`frame_length / frame_rate`で表せる。 pub frame_rate: f64, - /// workaroundとして付け足されているパディング長。 - padding_frame_length: usize, /// 生成時に利用したクエリ。 audio_query: AudioQuery, } @@ -248,9 +271,11 @@ mod inner { &self, model: &voice_model::Inner, ) -> crate::Result<()> { - let model_bytes = &model.read_inference_models().await?; - // TODO: 重い操作なので、asyncにする - self.status.insert_model(model.header(), model_bytes) + let model_bytes = model.read_inference_models().await?; + + let status = self.status.clone(); + let header = model.header().clone(); + A::unblock(move || status.insert_model(&header, &model_bytes)).await } pub(super) fn unload_voice_model(&self, voice_model_id: VoiceModelId) -> Result<()> { @@ 
-373,28 +398,12 @@ mod inner { } } - // 音が途切れてしまうのを避けるworkaround処理が入っている - // NOTE: `render()`内でこのpaddingを取り除くために、padding_frame_lengthにpadding長を保持している。 - // TODO: 改善したらここのpadding処理を取り除く - const PADDING_SIZE: f64 = 0.4; - let padding_size = - ((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize; - let start_and_end_padding_size = 2 * padding_size; - let length_with_padding = f0.len() + start_and_end_padding_size; - let f0_with_padding = make_f0_with_padding(&f0, length_with_padding, padding_size); - let phoneme_with_padding = make_phoneme_with_padding( - phoneme.as_flattened(), - OjtPhoneme::num_phoneme(), - length_with_padding, - padding_size, - ); - let spec = self .generate_full_intermediate( - f0_with_padding.len(), + f0.len(), OjtPhoneme::num_phoneme(), - &f0_with_padding, - &phoneme_with_padding, + &f0, + phoneme.as_flattened(), style_id, ) .await?; @@ -403,7 +412,6 @@ mod inner { style_id, frame_length: f0.len(), frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0, - padding_frame_length: padding_size, audio_query: audio_query.clone(), }); @@ -455,46 +463,6 @@ mod inner { pitch, } } - - fn make_f0_with_padding( - f0_slice: &[f32], - length_with_padding: usize, - padding_size: usize, - ) -> Vec { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let mut f0_with_padding = Vec::with_capacity(length_with_padding); - let padding = vec![0.0; padding_size]; - f0_with_padding.extend_from_slice(&padding); - f0_with_padding.extend_from_slice(f0_slice); - f0_with_padding.extend_from_slice(&padding); - f0_with_padding - } - - fn make_phoneme_with_padding( - phoneme_slice: &[f32], - phoneme_size: usize, - length_with_padding: usize, - padding_size: usize, - ) -> Vec { - // 音が途切れてしまうのを避けるworkaround処理 - // 改善したらこの関数を削除する - let mut padding_phoneme = vec![0.0; phoneme_size]; - padding_phoneme[0] = 1.0; - let padding_phoneme_len = padding_phoneme.len(); - let padding_phonemes: Vec = padding_phoneme - .into_iter() - .cycle() - .take(padding_phoneme_len * 
padding_size) - .collect(); - let mut phoneme_with_padding = - Vec::with_capacity(phoneme_size * length_with_padding); - phoneme_with_padding.extend_from_slice(&padding_phonemes); - phoneme_with_padding.extend_from_slice(phoneme_slice); - phoneme_with_padding.extend_from_slice(&padding_phonemes); - - phoneme_with_padding - } } pub(super) async fn render( @@ -504,41 +472,20 @@ mod inner { end: usize, ) -> Result> { // TODO: 44.1kHzなどの対応 - const MARGIN: usize = 14; // 使われているHifiGANのreceptive fieldから計算される安全マージン - use std::cmp::min; - // 実態(workaround paddingを含まない)上での区間 - let clipped_start = min(start, audio.frame_length); - let clipped_end = min(end, audio.frame_length); - // 指定領域が空の区間だった場合、ONNXRuntimeに渡す前に早期リターン - if (clipped_start..clipped_end).is_empty() { + if (start..end).is_empty() { + // 指定区間が空のときは早期リターン return Ok(vec![]); } - // マージンがデータからはみ出さないことを保証 - // cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291 - if MARGIN > audio.padding_frame_length + clipped_start - || MARGIN > audio.padding_frame_length + (audio.frame_length - clipped_end) - { - unreachable!("Validation error: Too short padding for input, please report this issue on GitHub."); - } - let left_margin = MARGIN; - let right_margin = MARGIN; - // 安全マージンを追加したデータ上での区間 - let slice_start = audio.padding_frame_length + clipped_start - left_margin; - let slice_end = audio.padding_frame_length + clipped_end + right_margin; - let segment = audio - .internal_state - .slice(ndarray::s![slice_start..slice_end, ..]); + let spec_segment = crop_with_margin(audio, start..end); let wave_with_margin = self - .render_audio_segment(segment.into_owned(), audio.style_id) + .render_audio_segment(spec_segment.to_owned(), audio.style_id) .await?; - // 変換前に追加した安全マージンを生成音声から取り除く - let wave = wave_with_margin - .slice(ndarray::s![ - left_margin * 256..wave_with_margin.len() - right_margin * 256 - ]) - .into_owned() - .into_raw_vec(); - return Ok(to_s16le_pcm(&wave, &audio.audio_query)); + let wave = 
trim_margin_from_wave(wave_with_margin); + return Ok(to_s16le_pcm( + wave.as_slice() + .expect("`trim_margin_from_wave` should just trim an array"), + &audio.audio_query, + )); fn to_s16le_pcm( wave: &[f32], @@ -997,6 +944,10 @@ mod inner { Ok(output.into_raw_vec()) } + /// モデル`generate_full_intermediate`の実行と、その前後の処理を行う。 + /// + /// 無音パディングを付加して音声特徴量を計算し、マージン込みの音声特徴量を返す。 + /// /// CPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。 fn generate_full_intermediate( &self, @@ -1008,17 +959,69 @@ mod inner { ) -> Result> { let (model_id, inner_voice_id) = self.ids_for::(style_id)?; - let GenerateFullIntermediateOutput { spec } = self.run_session( + // 音が途切れてしまうのを避けるworkaround処理が入っている + // TODO: 改善したらここのpadding処理を取り除く + let start_and_end_padding_size = 2 * PADDING_FRAME_LENGTH; + let length_with_padding = f0.len() + start_and_end_padding_size; + let f0_with_padding = make_f0_with_padding(f0, PADDING_FRAME_LENGTH); + let phoneme_with_padding = make_phoneme_with_padding( + phoneme_vector.into_shape([length, phoneme_size]).unwrap(), + PADDING_FRAME_LENGTH, + ); + + let GenerateFullIntermediateOutput { + spec: spec_with_padding, + } = self.run_session( model_id, GenerateFullIntermediateInput { - f0: f0.into_shape([length, 1]).unwrap(), - phoneme: phoneme_vector.into_shape([length, phoneme_size]).unwrap(), + f0: f0_with_padding + .into_shape([length_with_padding, 1]) + .unwrap(), + phoneme: phoneme_with_padding, speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), }, )?; - Ok(spec) + + // マージンがデータからはみ出さないことを保証 + // cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291 + if MARGIN > PADDING_FRAME_LENGTH { + unreachable!("Validation error: Too short padding for input, please report this issue on GitHub."); + } + // マージン分を両端に残して音声特徴量を返す + return Ok(spec_with_padding + .slice(ndarray::s![ + PADDING_FRAME_LENGTH - MARGIN + ..spec_with_padding.nrows() - PADDING_FRAME_LENGTH + MARGIN, + .. 
+ ]) + .to_owned()); + + fn make_f0_with_padding( + f0_slice: ndarray::Array1, + padding_size: usize, + ) -> ndarray::Array1 { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let padding = ndarray::Array1::::zeros(padding_size); + ndarray::concatenate![ndarray::Axis(0), padding, f0_slice, padding] + } + + fn make_phoneme_with_padding( + phoneme_slice: ndarray::Array2, + padding_size: usize, + ) -> ndarray::Array2 { + // 音が途切れてしまうのを避けるworkaround処理 + // 改善したらこの関数を削除する + let mut padding = + ndarray::Array2::::zeros((padding_size, phoneme_slice.ncols())); + padding + .slice_mut(ndarray::s![.., 0]) + .assign(&ndarray::arr0(1.0)); + ndarray::concatenate![ndarray::Axis(0), padding, phoneme_slice, padding] + } } + /// 与えられた音声特徴量で音声生成。 /// CPU/GPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。 fn render_audio_segment( &self, @@ -1047,8 +1050,9 @@ mod inner { phoneme_vector, style_id, )?; - let output = self.render_audio_segment(intermediate, style_id)?; - Ok(output.into_raw_vec()) + let output_with_margin = self.render_audio_segment(intermediate, style_id)?; + let output = trim_margin_from_wave(output_with_margin); + Ok(output.to_vec()) } } @@ -1183,6 +1187,8 @@ mod inner { post_phoneme_length: 0.1, output_sampling_rate: DEFAULT_SAMPLING_RATE, output_stereo: false, + pause_length: (), + pause_length_scale: (), kana: Some(kana), } } diff --git a/crates/voicevox_core/src/voice_model.rs b/crates/voicevox_core/src/voice_model.rs index e4d1b3b5a..2f46d887b 100644 --- a/crates/voicevox_core/src/voice_model.rs +++ b/crates/voicevox_core/src/voice_model.rs @@ -5,6 +5,7 @@ use std::{ collections::HashMap, path::{Path, PathBuf}, + sync::Arc, }; use anyhow::{anyhow, Context as _}; @@ -62,7 +63,7 @@ impl VoiceModelId { #[self_referencing] pub(crate) struct Inner { - header: VoiceModelHeader, + header: Arc, #[borrows(header)] #[not_covariant] @@ -125,11 +126,12 @@ impl Inner { ) })?; - let header = VoiceModelHeader::new(manifest, metas, path)?; + let header = 
VoiceModelHeader::new(manifest, metas, path)?.into(); InnerTryBuilder { header, - inference_model_entries_builder: |VoiceModelHeader { manifest, .. }| { + inference_model_entries_builder: |header| { + let VoiceModelHeader { manifest, .. } = &**header; manifest .domains() .each_ref() @@ -183,7 +185,7 @@ impl Inner { &self.borrow_header().metas } - pub(crate) fn header(&self) -> &VoiceModelHeader { + pub(crate) fn header(&self) -> &Arc { self.borrow_header() } diff --git a/crates/voicevox_core_c_api/Cargo.toml b/crates/voicevox_core_c_api/Cargo.toml index c9600c7a6..4c4001b4d 100644 --- a/crates/voicevox_core_c_api/Cargo.toml +++ b/crates/voicevox_core_c_api/Cargo.toml @@ -30,6 +30,7 @@ easy-ext.workspace = true educe.workspace = true itertools.workspace = true libc.workspace = true +ndarray.workspace = true parking_lot = { workspace = true, features = ["arc_lock"] } process_path.workspace = true ref-cast.workspace = true diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs index f9f8bf865..a9bc9963f 100644 --- a/crates/voicevox_core_c_api/src/compatible_engine.rs +++ b/crates/voicevox_core_c_api/src/compatible_engine.rs @@ -234,22 +234,31 @@ pub extern "C" fn supported_devices() -> *const c_char { }); } -// SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない -#[unsafe(no_mangle)] -pub extern "C" fn yukarin_s_forward( +/// # Safety +/// +/// - `phoneme_list`はRustの`&[i64; length as usize]`として解釈できなければならない。 +/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。 +/// - `output`はRustの`&mut [f32; length as usize]`として解釈できなければならない。 +#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない +pub unsafe extern "C" fn yukarin_s_forward( length: i64, phoneme_list: *mut i64, speaker_id: *mut i64, output: *mut f32, ) -> bool { init_logger_once(); + assert_aligned(phoneme_list); + assert_aligned(speaker_id); + assert_aligned(output); let synthesizer = &*lock_synthesizer(); let result = 
ensure_initialized!(synthesizer).predict_duration( + // SAFETY: The safety contract must be upheld by the caller. unsafe { std::slice::from_raw_parts_mut(phoneme_list, length as usize) }, StyleId::new(unsafe { *speaker_id as u32 }), ); match result { Ok(output_vec) => { + // SAFETY: The safety contract must be upheld by the caller. let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) }; output_slice.clone_from_slice(&output_vec); true @@ -261,9 +270,18 @@ pub extern "C" fn yukarin_s_forward( } } -// SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない -#[unsafe(no_mangle)] -pub extern "C" fn yukarin_sa_forward( +/// # Safety +/// +/// - `vowel_phoneme_list`はRustの`&[i64; length as usize]`として解釈できなければならない。 +/// - `consonant_phoneme_list`はRustの`&[i64; length as usize]`として解釈できなければならない。 +/// - `start_accent_list`はRustの`&[i64; length as usize]`として解釈できなければならない。 +/// - `end_accent_list`はRustの`&[i64; length as usize]`として解釈できなければならない。 +/// - `start_accent_phrase_list`はRustの`&[i64; length as usize]`として解釈できなければならない。 +/// - `end_accent_phrase_list`はRustの`&[i64; length as usize]`として解釈できなければならない。 +/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。 +/// - `output`はRustの`&mut [f32; length as usize]`として解釈できなければならない。 +#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない +pub unsafe extern "C" fn yukarin_sa_forward( length: i64, vowel_phoneme_list: *mut i64, consonant_phoneme_list: *mut i64, @@ -275,9 +293,18 @@ pub extern "C" fn yukarin_sa_forward( output: *mut f32, ) -> bool { init_logger_once(); + assert_aligned(vowel_phoneme_list); + assert_aligned(consonant_phoneme_list); + assert_aligned(start_accent_list); + assert_aligned(end_accent_list); + assert_aligned(start_accent_phrase_list); + assert_aligned(end_accent_phrase_list); + assert_aligned(speaker_id); + assert_aligned(output); let synthesizer = &*lock_synthesizer(); let result = ensure_initialized!(synthesizer).predict_intonation( length as usize, + // SAFETY: 
The safety contract must be upheld by the caller. unsafe { std::slice::from_raw_parts(vowel_phoneme_list, length as usize) }, unsafe { std::slice::from_raw_parts(consonant_phoneme_list, length as usize) }, unsafe { std::slice::from_raw_parts(start_accent_list, length as usize) }, @@ -288,6 +315,7 @@ pub extern "C" fn yukarin_sa_forward( ); match result { Ok(output_vec) => { + // SAFETY: The safety contract must be upheld by the caller. let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length as usize) }; output_slice.clone_from_slice(&output_vec); true @@ -299,9 +327,14 @@ pub extern "C" fn yukarin_sa_forward( } } -// SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない -#[unsafe(no_mangle)] -pub extern "C" fn decode_forward( +/// # Safety +/// +/// - `f0`はRustの`&[f32; length as usize]`として解釈できなければならない。 +/// - `phoneme`はRustの`&[f32; phoneme_size * length as usize]`として解釈できなければならない。 +/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。 +/// - `output`はRustの`&mut [f32; length as usize * 256]`として解釈できなければならない。 +#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない +pub unsafe extern "C" fn decode_forward( length: i64, phoneme_size: i64, f0: *mut f32, @@ -310,18 +343,24 @@ pub extern "C" fn decode_forward( output: *mut f32, ) -> bool { init_logger_once(); + assert_aligned(f0); + assert_aligned(phoneme); + assert_aligned(speaker_id); + assert_aligned(output); let length = length as usize; let phoneme_size = phoneme_size as usize; let synthesizer = &*lock_synthesizer(); let result = ensure_initialized!(synthesizer).decode( length, phoneme_size, + // SAFETY: The safety contract must be upheld by the caller. unsafe { std::slice::from_raw_parts(f0, length) }, unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) }, StyleId::new(unsafe { *speaker_id as u32 }), ); match result { Ok(output_vec) => { + // SAFETY: The safety contract must be upheld by the caller. 
let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length * 256) }; output_slice.clone_from_slice(&output_vec); true @@ -332,3 +371,116 @@ pub extern "C" fn decode_forward( } } } + +/// # Safety +/// +/// - `f0`はRustの`&[f32; length as usize]`として解釈できなければならない。 +/// - `phoneme`はRustの`&[f32; phoneme_size * length as usize]`として解釈できなければならない。 +/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。 +/// - `output`はRustの`&mut [MaybeUninit<f32>; ((length + 2 * 14) * 80) as usize]`として解釈できなければならない。 +#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない +pub unsafe extern "C" fn generate_full_intermediate( + length: i64, + phoneme_size: i64, + f0: *mut f32, + phoneme: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + init_logger_once(); + assert_aligned(f0); + assert_aligned(phoneme); + assert_aligned(speaker_id); + assert_aligned(output); + let length = length as usize; + let phoneme_size = phoneme_size as usize; + const MARGIN_WIDTH: usize = 14; + const FEATURE_SIZE: usize = 80; + let synthesizer = &*lock_synthesizer(); + let result = ensure_initialized!(synthesizer).generate_full_intermediate( + length, + phoneme_size, + // SAFETY: The safety contract must be upheld by the caller. + unsafe { std::slice::from_raw_parts(f0, length) }, + unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) }, + StyleId::new(unsafe { *speaker_id as u32 }), + ); + match result { + Ok(output_arr) => { + let output_len = (length + 2 * MARGIN_WIDTH) * FEATURE_SIZE; + if output_arr.len() != output_len { + panic!("expected {}, got {}", output_len, output_arr.len()); + } + let output_arr = output_arr.as_standard_layout(); + // SAFETY: The safety contract must be upheld by the caller.
+ unsafe { + output_arr + .as_ptr() + .copy_to_nonoverlapping(output, output_len); + } + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +/// # Safety +/// +/// - `audio_feature`はRustの`&[f32; (length * feature_size) as usize]`として解釈できなければならない。 +/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。 +/// - `output`はRustの`&mut [MaybeUninit<f32>; length as usize * 256]`として解釈できなければならない。 +#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない +pub unsafe extern "C" fn render_audio_segment( + length: i64, + _margin_width: i64, + feature_size: i64, + audio_feature: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + init_logger_once(); + assert_aligned(audio_feature); + assert_aligned(speaker_id); + assert_aligned(output); + let length = length as usize; + let feature_size = feature_size as usize; + let synthesizer = &*lock_synthesizer(); + let result = ensure_initialized!(synthesizer).render_audio_segment( + // SAFETY: The safety contract must be upheld by the caller. + unsafe { + ndarray::ArrayView2::from_shape_ptr([length, feature_size], audio_feature).to_owned() + }, + StyleId::new(unsafe { *speaker_id as u32 }), + ); + match result { + Ok(output_arr) => { + let output_len = length * 256; + if output_arr.len() != output_len { + panic!("expected {}, got {}", output_len, output_arr.len()); + } + let output_arr = output_arr.as_standard_layout(); + // SAFETY: The safety contract must be upheld by the caller.
+ unsafe { + output_arr + .as_ptr() + .copy_to_nonoverlapping(output, output_len); + } + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +#[track_caller] +fn assert_aligned(ptr: *mut impl Sized) { + assert!( + ptr.is_aligned(), + "all of the pointers passed to this library **must** be aligned", + ); +} diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs index 581edece5..6e31f557e 100644 --- a/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs +++ b/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs @@ -1,7 +1,7 @@ // エンジンを起動してyukarin_s・yukarin_sa・decodeの推論を行う -use std::ffi::CStr; use std::sync::LazyLock; +use std::{cmp::min, ffi::CStr}; use assert_cmd::assert::AssertResult; use libloading::Library; @@ -83,12 +83,86 @@ impl assert_cdylib::TestCase for TestCase { wave }; + // 中間生成物を経由した場合の生成音声 + let wave2 = { + let length_with_margin = + EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width; + let mut audio_feature = + vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize]; + let mut wave = vec![0.; 256 * length_with_margin as usize]; + assert!(lib.generate_full_intermediate( + EXAMPLE_DATA.intermediate.f0_length, + EXAMPLE_DATA.intermediate.phoneme_size, + EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32, + EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32, + &mut { EXAMPLE_DATA.speaker_id } as *mut i64, + audio_feature.as_mut_ptr(), + )); + assert!(lib.render_audio_segment( + length_with_margin, + EXAMPLE_DATA.intermediate.margin_width, + EXAMPLE_DATA.intermediate.feature_dim, + audio_feature.as_ptr() as *mut f32, + &mut { EXAMPLE_DATA.speaker_id } as *mut i64, + wave.as_mut_ptr(), + )); + wave[256 * EXAMPLE_DATA.intermediate.margin_width as usize + ..wave.len() - 256 * EXAMPLE_DATA.intermediate.margin_width as usize] + .to_vec() + 
}; + + // 中間生成物を経由し、さらにチャンクごとに変換した場合の生成音声 + let wave3 = { + let length_with_margin = + EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width; + let mut audio_feature = + vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize]; + let mut wave = vec![0.; 256 * EXAMPLE_DATA.intermediate.f0_length as usize]; + assert!(lib.generate_full_intermediate( + EXAMPLE_DATA.intermediate.f0_length, + EXAMPLE_DATA.intermediate.phoneme_size, + EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32, + EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32, + &mut { EXAMPLE_DATA.speaker_id } as *mut i64, + audio_feature.as_mut_ptr(), + )); + let full_length = EXAMPLE_DATA.intermediate.f0_length as usize; + let pitch = EXAMPLE_DATA.intermediate.feature_dim as usize; + for render_start in (0..full_length).step_by(10) { + // render_start .. render_end の音声を取得する + let render_end = min(render_start + 10, full_length); + let slice_start = render_start; + let slice_end = render_end + 2 * EXAMPLE_DATA.intermediate.margin_width as usize; + let feature_segment = &audio_feature[slice_start * pitch..slice_end * pitch]; + let slice_length = slice_end - slice_start; + let mut wave_segment_with_margin = vec![0.; 256 * slice_length]; + assert!(lib.render_audio_segment( + slice_length as i64, + EXAMPLE_DATA.intermediate.margin_width, + pitch as i64, + feature_segment.as_ptr() as *mut f32, + &mut { EXAMPLE_DATA.speaker_id } as *mut i64, + wave_segment_with_margin.as_mut_ptr(), + )); + let wave_segment = &wave_segment_with_margin[256 + * EXAMPLE_DATA.intermediate.margin_width as usize + ..wave_segment_with_margin.len() + - 256 * EXAMPLE_DATA.intermediate.margin_width as usize]; + wave[render_start * 256..render_end * 256].clone_from_slice(wave_segment); + } + wave + }; + std::assert_eq!(SNAPSHOTS.metas, metas_json); float_assert::close_l1(&phoneme_length, &EXAMPLE_DATA.duration.result, 0.01); float_assert::close_l1(&intonation_list, 
&EXAMPLE_DATA.intonation.result, 0.01); assert!(wave.iter().copied().all(f32::is_normal)); + assert!(wave2.iter().copied().all(f32::is_normal)); + assert!(wave3.iter().copied().all(f32::is_normal)); + float_assert::close_l1(&wave2, &wave, 0.001); + float_assert::close_l1(&wave3, &wave, 0.001); lib.finalize(); Ok(()) diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AccelerationMode.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AccelerationMode.java new file mode 100644 index 000000000..836d32eab --- /dev/null +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AccelerationMode.java @@ -0,0 +1,11 @@ +package jp.hiroshiba.voicevoxcore; + +/** ハードウェアアクセラレーションモード。 */ +public enum AccelerationMode { + /** 実行環境に合わせて自動的に選択する。 */ + AUTO, + /** CPUに設定する。 */ + CPU, + /** GPUに設定する。 */ + GPU, +} diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java index c03accf2f..afc735034 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java @@ -55,10 +55,22 @@ public class AudioQuery { @Expose public boolean outputStereo; + /** 句読点などの無音時間。{@code null}のときは無視される。デフォルト値は{@code null}。 */ + @SerializedName("pause_length") + @Expose + @Nullable + public Double pauseLength; + + /** 読点などの無音時間(倍率)。デフォルト値は{@code 1.}。 */ + @SerializedName("pause_length_scale") + @Expose + public double pauseLengthScale; + /** * [読み取り専用] AquesTalk風記法。 * - *

{@link Synthesizer#createAudioQuery} が返すもののみ String となる。入力としてのAudioQueryでは無視される。 + *

{@link jp.hiroshiba.voicevoxcore.blocking.Synthesizer#createAudioQuery} が返すもののみ String + * となる。入力としてのAudioQueryでは無視される。 */ @SerializedName("kana") @Expose @@ -74,6 +86,8 @@ public AudioQuery() { this.prePhonemeLength = 0.1; this.postPhonemeLength = 0.1; this.outputSamplingRate = 24000; + this.pauseLength = null; + this.pauseLengthScale = 1.0; this.kana = null; } } diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/GlobalInfo.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/GlobalInfo.java index 496c2ccc4..e8214a480 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/GlobalInfo.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/GlobalInfo.java @@ -3,9 +3,14 @@ import com.google.gson.annotations.Expose; import com.google.gson.annotations.SerializedName; import jakarta.annotation.Nonnull; +import jp.hiroshiba.voicevoxcore.internal.Dll; /** VOICEVOX CORE自体の情報。 */ -public class GlobalInfo extends Dll { +public class GlobalInfo { + static { + Dll.loadLibrary(); + } + /** * ライブラリのバージョン。 * @@ -19,6 +24,7 @@ public static String getVersion() { @Nonnull private static native String rsGetVersion(); + // FIXME: dead code @Nonnull private static native String rsGetSupportedDevicesJson(); diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/SpeakerMeta.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/SpeakerMeta.java new file mode 100644 index 000000000..c43d32a8e --- /dev/null +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/SpeakerMeta.java @@ -0,0 +1,53 @@ +package jp.hiroshiba.voicevoxcore; + +import com.google.gson.annotations.Expose; +import com.google.gson.annotations.SerializedName; +import jakarta.annotation.Nonnull; +import jakarta.annotation.Nullable; + +/** 話者(speaker)のメタ情報。 */ +public class SpeakerMeta { + /** 話者名。 */ + 
@SerializedName("name") + @Expose + @Nonnull + public final String name; + + /** 話者に属するスタイル。 */ + @SerializedName("styles") + @Expose + @Nonnull + public final StyleMeta[] styles; + + /** 話者のUUID。 */ + @SerializedName("speaker_uuid") + @Expose + @Nonnull + public final String speakerUuid; + + /** 話者のバージョン。 */ + @SerializedName("version") + @Expose + @Nonnull + public final String version; + + /** + * 話者の順番。 + * + *

{@code SpeakerMeta}の列は、この値に対して昇順に並んでいるべきである。 + */ + @SerializedName("order") + @Expose + @Nullable + public final Integer order; + + private SpeakerMeta() { + // GSONからコンストラクトするため、このメソッドは呼ばれることは無い。 + // このメソッドは@Nonnullを満たすために必要。 + this.name = ""; + this.styles = new StyleMeta[0]; + this.speakerUuid = ""; + this.version = ""; + this.order = null; + } +} diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/StyleMeta.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/StyleMeta.java new file mode 100644 index 000000000..fc63530a4 --- /dev/null +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/StyleMeta.java @@ -0,0 +1,43 @@ +package jp.hiroshiba.voicevoxcore; + +import com.google.gson.annotations.Expose; +import com.google.gson.annotations.SerializedName; +import jakarta.annotation.Nonnull; +import jakarta.annotation.Nullable; + +/** スタイル(style)のメタ情報。 */ +public class StyleMeta { + /** スタイル名。 */ + @SerializedName("name") + @Expose + @Nonnull + public final String name; + + /** スタイルID。 */ + @SerializedName("id") + @Expose + public final int id; + + /** スタイルに対応するモデルの種類。 */ + @SerializedName("type") + @Expose + @Nonnull + public final StyleType type; + + /** + * 話者の順番。 + * + *

{@link SpeakerMeta#styles}の列は、この値に対して昇順に並んでいるべきである。 + */ + @SerializedName("order") + @Expose + @Nullable + public final Integer order; + + private StyleMeta() { + this.name = ""; + this.id = 0; + this.type = StyleType.TALK; + this.order = null; + } +} diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/StyleType.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/StyleType.java new file mode 100644 index 000000000..1e7c52d47 --- /dev/null +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/StyleType.java @@ -0,0 +1,12 @@ +package jp.hiroshiba.voicevoxcore; + +import com.google.gson.annotations.Expose; +import com.google.gson.annotations.SerializedName; + +/** スタイル(style)に対応するモデルの種類。 */ +public enum StyleType { + /** 音声合成クエリの作成と音声合成が可能。 */ + @SerializedName("talk") + @Expose + TALK, +} diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/UserDictWord.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/UserDictWord.java new file mode 100644 index 000000000..8b1e93457 --- /dev/null +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/UserDictWord.java @@ -0,0 +1,143 @@ +package jp.hiroshiba.voicevoxcore; + +import com.google.gson.annotations.Expose; +import com.google.gson.annotations.SerializedName; +import jakarta.annotation.Nonnull; +import jakarta.validation.constraints.Max; +import jakarta.validation.constraints.Min; + +/** ユーザー辞書の単語。 */ +public class UserDictWord { + /** 単語の表層形。 */ + @SerializedName("surface") + @Expose + @Nonnull + public String surface; + + /** 単語の発音。 発音として有効なカタカナである必要がある。 */ + @SerializedName("pronunciation") + @Expose + @Nonnull + public String pronunciation; + + /** + * 単語の種類。 + * + * @see Type + */ + @SerializedName("word_type") + @Expose + @Nonnull + public Type wordType; + + /** アクセント型。 音が下がる場所を指す。 */ + @SerializedName("accent_type") + @Expose + public int 
accentType; + + /** 単語の優先度。 0から10までの整数。 数字が大きいほど優先度が高くなる。 1から9までの値を指定することを推奨。 */ + @SerializedName("priority") + @Expose + @Min(0) + @Max(10) + public int priority; + + /** + * {@link UserDictWord}を作成する。 + * + * @param surface 言葉の表層形。 + * @param pronunciation 言葉の発音。 + * @throws IllegalArgumentException pronunciationが不正な場合。 + */ + public UserDictWord(String surface, String pronunciation) { + if (surface == null) { + throw new NullPointerException("surface"); + } + if (pronunciation == null) { + throw new NullPointerException("pronunciation"); + } + + this.surface = rsToZenkaku(surface); + rsValidatePronunciation(pronunciation); + this.pronunciation = pronunciation; + this.wordType = Type.COMMON_NOUN; + this.accentType = 0; + this.priority = 5; + } + + /** + * 単語の種類を設定する。 + * + * @param wordType 単語の種類。 + * @return このインスタンス。 + */ + public UserDictWord wordType(Type wordType) { + if (wordType == null) { + throw new NullPointerException("wordType"); + } + this.wordType = wordType; + return this; + } + + /** + * アクセント型を設定する。 + * + * @param accentType アクセント型。 + * @return このインスタンス。 + */ + public UserDictWord accentType(int accentType) { + if (accentType < 0) { + throw new IllegalArgumentException("accentType"); + } + this.accentType = accentType; + return this; + } + + /** + * 優先度を設定する。 + * + * @param priority 優先度。 + * @return このインスタンス。 + * @throws IllegalArgumentException priorityが0未満または10より大きい場合。 + */ + public UserDictWord priority(int priority) { + if (priority < 0 || priority > 10) { + throw new IllegalArgumentException("priority"); + } + this.priority = priority; + return this; + } + + @Nonnull + private static native String rsToZenkaku(String surface); + + private static native void rsValidatePronunciation(String pronunciation); + + /** 単語の種類。 */ + public static enum Type { + /** 固有名詞。 */ + @SerializedName("PROPER_NOUN") + @Expose + PROPER_NOUN, + + /** 一般名詞。 */ + @SerializedName("COMMON_NOUN") + @Expose + COMMON_NOUN, + + /** 動詞。 */ + @SerializedName("VERB") + 
@Expose + VERB, + + /** 形容詞。 */ + @SerializedName("ADJECTIVE") + @Expose + ADJECTIVE, + + /** 語尾。 */ + @SerializedName("SUFFIX") + @Expose + SUFFIX, + } +} diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/VoiceModelFile.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/VoiceModelFile.java deleted file mode 100644 index b2cceca3f..000000000 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/VoiceModelFile.java +++ /dev/null @@ -1,152 +0,0 @@ -package jp.hiroshiba.voicevoxcore; - -import com.google.gson.Gson; -import com.google.gson.annotations.Expose; -import com.google.gson.annotations.SerializedName; -import jakarta.annotation.Nonnull; -import jakarta.annotation.Nullable; -import java.io.Closeable; -import java.util.UUID; - -/** 音声モデルファイル。 */ -public class VoiceModelFile extends Dll implements Closeable { - private long handle; - - /** ID。 */ - @Nonnull public final UUID id; - - /** メタ情報。 */ - @Nonnull public final SpeakerMeta[] metas; - - public VoiceModelFile(String modelPath) { - rsOpen(modelPath); - id = rsGetId(); - String metasJson = rsGetMetasJson(); - Gson gson = new Gson(); - SpeakerMeta[] rawMetas = gson.fromJson(metasJson, SpeakerMeta[].class); - if (rawMetas == null) { - throw new RuntimeException("Failed to parse metasJson"); - } - metas = rawMetas; - } - - /** - * VVMファイルを閉じる。 - * - *

このメソッドが呼ばれた段階で{@link Synthesizer#loadVoiceModel}からのアクセスが継続中の場合、アクセスが終わるまで待つ。 - */ - @Override - public void close() { - rsClose(); - } - - @Override - protected void finalize() throws Throwable { - rsDrop(); - super.finalize(); - } - - private native void rsOpen(String modelPath); - - @Nonnull - private native UUID rsGetId(); - - @Nonnull - private native String rsGetMetasJson(); - - private native void rsClose(); - - private native void rsDrop(); - - /** 話者(speaker)のメタ情報。 */ - public static class SpeakerMeta { - /** 話者名。 */ - @SerializedName("name") - @Expose - @Nonnull - public final String name; - - /** 話者に属するスタイル。 */ - @SerializedName("styles") - @Expose - @Nonnull - public final StyleMeta[] styles; - - /** 話者のUUID。 */ - @SerializedName("speaker_uuid") - @Expose - @Nonnull - public final String speakerUuid; - - /** 話者のバージョン。 */ - @SerializedName("version") - @Expose - @Nonnull - public final String version; - - /** - * 話者の順番。 - * - *

{@code SpeakerMeta}の列は、この値に対して昇順に並んでいるべきである。 - */ - @SerializedName("order") - @Expose - @Nullable - public final Integer order; - - private SpeakerMeta() { - // GSONからコンストラクトするため、このメソッドは呼ばれることは無い。 - // このメソッドは@Nonnullを満たすために必要。 - this.name = ""; - this.styles = new StyleMeta[0]; - this.speakerUuid = ""; - this.version = ""; - this.order = null; - } - } - - /** スタイル(style)のメタ情報。 */ - public static class StyleMeta { - /** スタイル名。 */ - @SerializedName("name") - @Expose - @Nonnull - public final String name; - - /** スタイルID。 */ - @SerializedName("id") - @Expose - public final int id; - - /** スタイルに対応するモデルの種類。 */ - @SerializedName("type") - @Expose - @Nonnull - public final StyleType type; - - /** - * 話者の順番。 - * - *

{@link SpeakerMeta#styles}の列は、この値に対して昇順に並んでいるべきである。 - */ - @SerializedName("order") - @Expose - @Nullable - public final Integer order; - - private StyleMeta() { - this.name = ""; - this.id = 0; - this.type = StyleType.TALK; - this.order = null; - } - } - - /** スタイル(style)に対応するモデルの種類。 */ - public static enum StyleType { - /** 音声合成クエリの作成と音声合成が可能。 */ - @SerializedName("talk") - @Expose - TALK, - } -} diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Onnxruntime.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/Onnxruntime.java similarity index 96% rename from crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Onnxruntime.java rename to crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/Onnxruntime.java index d957f4a0f..90c02a670 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Onnxruntime.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/Onnxruntime.java @@ -1,4 +1,4 @@ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.blocking; import static jp.hiroshiba.voicevoxcore.GlobalInfo.SupportedDevices; @@ -6,6 +6,7 @@ import jakarta.annotation.Nonnull; import jakarta.annotation.Nullable; import java.util.Optional; +import jp.hiroshiba.voicevoxcore.internal.Dll; /** * ONNX Runtime。 @@ -18,7 +19,11 @@ * assert ort1 == ort2; * */ -public class Onnxruntime extends Dll { +public class Onnxruntime { + static { + Dll.loadLibrary(); + } + /** ONNX Runtimeのライブラリ名。 */ public static final String LIB_NAME = "voicevox_onnxruntime"; diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/OpenJtalk.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/OpenJtalk.java similarity index 87% rename from crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/OpenJtalk.java 
rename to crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/OpenJtalk.java index 11cb3c587..24f96eb42 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/OpenJtalk.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/OpenJtalk.java @@ -1,7 +1,13 @@ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.blocking; + +import jp.hiroshiba.voicevoxcore.internal.Dll; /** テキスト解析機としてのOpen JTalk。 */ -public class OpenJtalk extends Dll { +public class OpenJtalk { + static { + Dll.loadLibrary(); + } + private long handle; /** diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/Synthesizer.java similarity index 97% rename from crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java rename to crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/Synthesizer.java index c59f8ca1e..4b59529ee 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Synthesizer.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/Synthesizer.java @@ -1,4 +1,4 @@ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.blocking; import com.google.gson.Gson; import jakarta.annotation.Nonnull; @@ -7,15 +7,24 @@ import java.util.List; import java.util.Optional; import java.util.UUID; +import jp.hiroshiba.voicevoxcore.AccelerationMode; +import jp.hiroshiba.voicevoxcore.AccentPhrase; +import jp.hiroshiba.voicevoxcore.AudioQuery; +import jp.hiroshiba.voicevoxcore.SpeakerMeta; import jp.hiroshiba.voicevoxcore.exceptions.InvalidModelDataException; import jp.hiroshiba.voicevoxcore.exceptions.RunModelException; +import jp.hiroshiba.voicevoxcore.internal.Dll; /** * 音声シンセサイザ。 * * @see Synthesizer#builder */ 
-public class Synthesizer extends Dll { +public class Synthesizer { + static { + Dll.loadLibrary(); + } + private long handle; private Synthesizer(Onnxruntime onnxruntime, OpenJtalk openJtalk, Builder builder) { @@ -54,11 +63,10 @@ public boolean isGpuMode() { * @return メタ情報。 */ @Nonnull - public VoiceModelFile.SpeakerMeta[] metas() { + public SpeakerMeta[] metas() { Gson gson = new Gson(); String metasJson = rsGetMetasJson(); - VoiceModelFile.SpeakerMeta[] rawMetas = - gson.fromJson(metasJson, VoiceModelFile.SpeakerMeta[].class); + SpeakerMeta[] rawMetas = gson.fromJson(metasJson, SpeakerMeta[].class); if (rawMetas == null) { throw new NullPointerException("metas"); } @@ -389,16 +397,6 @@ public Synthesizer build() { } } - /** ハードウェアアクセラレーションモード。 */ - public static enum AccelerationMode { - /** 実行環境に合わせて自動的に選択する。 */ - AUTO, - /** CPUに設定する。 */ - CPU, - /** GPUに設定する。 */ - GPU, - } - /** {@link Synthesizer#synthesis} のオプション。 */ public class SynthesisConfigurator { private Synthesizer synthesizer; diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/UserDict.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/UserDict.java similarity index 51% rename from crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/UserDict.java rename to crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/UserDict.java index 7135365ff..e9819959a 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/UserDict.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/UserDict.java @@ -1,20 +1,22 @@ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.blocking; import com.google.gson.Gson; -import com.google.gson.annotations.Expose; -import com.google.gson.annotations.SerializedName; import com.google.gson.internal.LinkedTreeMap; import jakarta.annotation.Nonnull; -import 
jakarta.validation.constraints.Max; -import jakarta.validation.constraints.Min; import java.io.File; import java.nio.file.Path; import java.util.HashMap; +import jp.hiroshiba.voicevoxcore.UserDictWord; import jp.hiroshiba.voicevoxcore.exceptions.LoadUserDictException; import jp.hiroshiba.voicevoxcore.exceptions.SaveUserDictException; +import jp.hiroshiba.voicevoxcore.internal.Dll; /** ユーザー辞書。 */ -public class UserDict extends Dll { +public class UserDict { + static { + Dll.loadLibrary(); + } + private long handle; /** ユーザー辞書を作成する。 */ @@ -34,7 +36,7 @@ protected void finalize() throws Throwable { * @return 追加した単語のUUID。 */ @Nonnull - public String addWord(Word word) { + public String addWord(UserDictWord word) { Gson gson = new Gson(); String wordJson = gson.toJson(word); @@ -47,7 +49,7 @@ public String addWord(Word word) { * @param uuid 更新する単語のUUID。 * @param word 新しい単語のデータ。 */ - public void updateWord(String uuid, Word word) { + public void updateWord(String uuid, UserDictWord word) { Gson gson = new Gson(); String wordJson = gson.toJson(word); @@ -138,7 +140,7 @@ public void save(String path) throws SaveUserDictException { * @return ユーザー辞書の単語。 */ @Nonnull - public HashMap toHashMap() { + public HashMap toHashMap() { String json = rsGetWords(); Gson gson = new Gson(); @SuppressWarnings("unchecked") @@ -146,10 +148,10 @@ public HashMap toHashMap() { if (rawWords == null) { throw new NullPointerException("words"); } - HashMap words = new HashMap<>(); + HashMap words = new HashMap<>(); rawWords.forEach( (uuid, rawWord) -> { - Word word = gson.fromJson(gson.toJson(rawWord), Word.class); + UserDictWord word = gson.fromJson(gson.toJson(rawWord), UserDictWord.class); if (word == null) { throw new NullPointerException("word"); } @@ -178,140 +180,4 @@ public HashMap toHashMap() { private native String rsGetWords(); private native void rsDrop(); - - @Nonnull - private static native String rsToZenkaku(String surface); - - private static native void 
rsValidatePronunciation(String pronunciation); - - /** ユーザー辞書の単語。 */ - public static class Word { - /** 単語の表層形。 */ - @SerializedName("surface") - @Expose - @Nonnull - public String surface; - - /** 単語の発音。 発音として有効なカタカナである必要がある。 */ - @SerializedName("pronunciation") - @Expose - @Nonnull - public String pronunciation; - - /** - * 単語の種類。 - * - * @see Type - */ - @SerializedName("word_type") - @Expose - @Nonnull - public Type wordType; - - /** アクセント型。 音が下がる場所を指す。 */ - @SerializedName("accent_type") - @Expose - public int accentType; - - /** 単語の優先度。 0から10までの整数。 数字が大きいほど優先度が高くなる。 1から9までの値を指定することを推奨。 */ - @SerializedName("priority") - @Expose - @Min(0) - @Max(10) - public int priority; - - /** - * UserDict.Wordを作成する。 - * - * @param surface 言葉の表層形。 - * @param pronunciation 言葉の発音。 - * @throws IllegalArgumentException pronunciationが不正な場合。 - */ - public Word(String surface, String pronunciation) { - if (surface == null) { - throw new NullPointerException("surface"); - } - if (pronunciation == null) { - throw new NullPointerException("pronunciation"); - } - - this.surface = rsToZenkaku(surface); - rsValidatePronunciation(pronunciation); - this.pronunciation = pronunciation; - this.wordType = Type.COMMON_NOUN; - this.accentType = 0; - this.priority = 5; - } - - /** - * 単語の種類を設定する。 - * - * @param wordType 単語の種類。 - * @return このインスタンス。 - */ - public Word wordType(Type wordType) { - if (wordType == null) { - throw new NullPointerException("wordType"); - } - this.wordType = wordType; - return this; - } - - /** - * アクセント型を設定する。 - * - * @param accentType アクセント型。 - * @return このインスタンス。 - */ - public Word accentType(int accentType) { - if (accentType < 0) { - throw new IllegalArgumentException("accentType"); - } - this.accentType = accentType; - return this; - } - - /** - * 優先度を設定する。 - * - * @param priority 優先度。 - * @return このインスタンス。 - * @throws IllegalArgumentException priorityが0未満または10より大きい場合。 - */ - public Word priority(int priority) { - if (priority < 0 || priority > 10) { - throw new 
IllegalArgumentException("priority"); - } - this.priority = priority; - return this; - } - - /** 単語の種類。 */ - public static enum Type { - /** 固有名詞。 */ - @SerializedName("PROPER_NOUN") - @Expose - PROPER_NOUN, - - /** 一般名詞。 */ - @SerializedName("COMMON_NOUN") - @Expose - COMMON_NOUN, - - /** 動詞。 */ - @SerializedName("VERB") - @Expose - VERB, - - /** 形容詞。 */ - @SerializedName("ADJECTIVE") - @Expose - ADJECTIVE, - - /** 語尾。 */ - @SerializedName("SUFFIX") - @Expose - SUFFIX, - } - } } diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Utils.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/Utils.java similarity index 65% rename from crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Utils.java rename to crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/Utils.java index 19b154cbc..803bb3ed4 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Utils.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/Utils.java @@ -1,6 +1,9 @@ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.blocking; + +// TODO: `IllegalArgumentException`はRustコード内で`throw`する class Utils { + // FIXME: dead code static boolean isU8(int value) { return value >= 0 && value <= 255; } diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/VoiceModelFile.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/VoiceModelFile.java new file mode 100644 index 000000000..4c1c265b2 --- /dev/null +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/blocking/VoiceModelFile.java @@ -0,0 +1,63 @@ +package jp.hiroshiba.voicevoxcore.blocking; + +import com.google.gson.Gson; +import jakarta.annotation.Nonnull; +import java.io.Closeable; +import java.util.UUID; +import 
jp.hiroshiba.voicevoxcore.SpeakerMeta; +import jp.hiroshiba.voicevoxcore.internal.Dll; + +/** 音声モデルファイル。 */ +public class VoiceModelFile implements Closeable { + static { + Dll.loadLibrary(); + } + + private long handle; + + /** ID。 */ + @Nonnull public final UUID id; + + /** メタ情報。 */ + @Nonnull public final SpeakerMeta[] metas; + + public VoiceModelFile(String modelPath) { + rsOpen(modelPath); + id = rsGetId(); + String metasJson = rsGetMetasJson(); + Gson gson = new Gson(); + SpeakerMeta[] rawMetas = gson.fromJson(metasJson, SpeakerMeta[].class); + if (rawMetas == null) { + throw new RuntimeException("Failed to parse metasJson"); + } + metas = rawMetas; + } + + /** + * VVMファイルを閉じる。 + * + *

このメソッドが呼ばれた段階で{@link Synthesizer#loadVoiceModel}からのアクセスが継続中の場合、アクセスが終わるまで待つ。 + */ + @Override + public void close() { + rsClose(); + } + + @Override + protected void finalize() throws Throwable { + rsDrop(); + super.finalize(); + } + + private native void rsOpen(String modelPath); + + @Nonnull + private native UUID rsGetId(); + + @Nonnull + private native String rsGetMetasJson(); + + private native void rsClose(); + + private native void rsDrop(); +} diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Dll.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/internal/Dll.java similarity index 86% rename from crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Dll.java rename to crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/internal/Dll.java index 94ee33a8d..b37e1fcac 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/Dll.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/internal/Dll.java @@ -1,13 +1,21 @@ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.internal; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -/** ライブラリを読み込むためだけのクラス。 */ -abstract class Dll { - static { +/** ライブラリを読み込むためだけ。 */ +public class Dll { + private static boolean loaded = false; + + private Dll() { + throw new UnsupportedOperationException(); + } + + public static synchronized void loadLibrary() { + if (loaded) return; + String runtimeName = System.getProperty("java.runtime.name"); if (runtimeName.equals("Android Runtime")) { // Android ではjniLibsから読み込む。 @@ -58,11 +66,14 @@ abstract class Dll { System.load(dllPath.toAbsolutePath().toString()); } } catch (Exception e) { + // FIXME: `tempDir`の削除 throw new RuntimeException("Failed to load Voicevox Core DLL for " + target, e); } } new LoggerInitializer().initLogger(); + + loaded = 
true; } static class LoggerInitializer { diff --git a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/TestUtils.java b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/TestUtils.java index 58c574a35..7355c8ba6 100644 --- a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/TestUtils.java +++ b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/TestUtils.java @@ -1,9 +1,12 @@ package jp.hiroshiba.voicevoxcore; import java.io.File; +import jp.hiroshiba.voicevoxcore.blocking.Onnxruntime; +import jp.hiroshiba.voicevoxcore.blocking.OpenJtalk; +import jp.hiroshiba.voicevoxcore.blocking.VoiceModelFile; -class TestUtils { - VoiceModelFile openModel() { +public class TestUtils { + protected VoiceModelFile openModel() { // cwdはvoicevox_core/crates/voicevox_core_java_api/lib String cwd = System.getProperty("user.dir"); File path = new File(cwd + "/../../test_util/data/model/sample.vvm"); @@ -15,7 +18,7 @@ VoiceModelFile openModel() { } } - Onnxruntime loadOnnxruntime() { + protected Onnxruntime loadOnnxruntime() { final String FILENAME = "../../test_util/data/lib/" + Onnxruntime.LIB_VERSIONED_FILENAME.replace("voicevox_onnxruntime", "onnxruntime"); @@ -27,7 +30,7 @@ Onnxruntime loadOnnxruntime() { } } - OpenJtalk loadOpenJtalk() { + protected OpenJtalk loadOpenJtalk() { String cwd = System.getProperty("user.dir"); File path = new File(cwd + "/../../test_util/data/open_jtalk_dic_utf_8-1.11"); diff --git a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/MetaTest.java b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/MetaTest.java similarity index 92% rename from crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/MetaTest.java rename to crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/MetaTest.java index ece3a87ff..24230f3a8 100644 --- 
a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/MetaTest.java +++ b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/MetaTest.java @@ -1,7 +1,7 @@ /* * This Java source file was generated by the Gradle 'init' task. */ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.blocking; import static org.junit.jupiter.api.Assertions.assertNotNull; diff --git a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/SynthesizerTest.java similarity index 92% rename from crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java rename to crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/SynthesizerTest.java index 4c7d16f56..b137ca382 100644 --- a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/SynthesizerTest.java +++ b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/SynthesizerTest.java @@ -2,12 +2,17 @@ * 音声合成のテスト。 * ttsaudioQuery -> synthesisの順に実行する。 */ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.blocking; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.List; +import jp.hiroshiba.voicevoxcore.AccelerationMode; +import jp.hiroshiba.voicevoxcore.AccentPhrase; +import jp.hiroshiba.voicevoxcore.AudioQuery; +import jp.hiroshiba.voicevoxcore.Mora; +import jp.hiroshiba.voicevoxcore.TestUtils; import jp.hiroshiba.voicevoxcore.exceptions.InvalidModelDataException; import jp.hiroshiba.voicevoxcore.exceptions.RunModelException; import org.junit.jupiter.api.Test; @@ -23,9 +28,7 @@ void checkIsGpuMode() { Onnxruntime onnxruntime = loadOnnxruntime(); OpenJtalk openJtalk = loadOpenJtalk(); Synthesizer synthesizer = - 
Synthesizer.builder(onnxruntime, openJtalk) - .accelerationMode(Synthesizer.AccelerationMode.CPU) - .build(); + Synthesizer.builder(onnxruntime, openJtalk).accelerationMode(AccelerationMode.CPU).build(); assertFalse(synthesizer.isGpuMode()); } diff --git a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/UserDictTest.java b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/UserDictTest.java similarity index 80% rename from crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/UserDictTest.java rename to crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/UserDictTest.java index ed9a94e8e..272bdfd86 100644 --- a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/UserDictTest.java +++ b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/UserDictTest.java @@ -1,9 +1,12 @@ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.blocking; import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.file.Files; import java.nio.file.Path; +import jp.hiroshiba.voicevoxcore.AudioQuery; +import jp.hiroshiba.voicevoxcore.TestUtils; +import jp.hiroshiba.voicevoxcore.UserDictWord; import jp.hiroshiba.voicevoxcore.exceptions.InvalidModelDataException; import jp.hiroshiba.voicevoxcore.exceptions.LoadUserDictException; import jp.hiroshiba.voicevoxcore.exceptions.RunModelException; @@ -27,7 +30,7 @@ void checkLoad() throws RunModelException, InvalidModelDataException, LoadUserDi "this_word_should_not_exist_in_default_dictionary", synthesizer.metas()[0].styles[0].id); - userDict.addWord(new UserDict.Word("this_word_should_not_exist_in_default_dictionary", "テスト")); + userDict.addWord(new UserDictWord("this_word_should_not_exist_in_default_dictionary", "テスト")); openJtalk.useUserDict(userDict); AudioQuery query2 = synthesizer.createAudioQuery( @@ -41,11 +44,11 @@ void checkLoad() throws 
RunModelException, InvalidModelDataException, LoadUserDi void checkManipulation() throws Exception { UserDict userDict = new UserDict(); // 単語追加 - String uuid = userDict.addWord(new UserDict.Word("hoge", "ホゲ")); + String uuid = userDict.addWord(new UserDictWord("hoge", "ホゲ")); assertTrue(userDict.toHashMap().get(uuid) != null); // 単語更新 - userDict.updateWord(uuid, new UserDict.Word("hoge", "ホゲホゲ")); + userDict.updateWord(uuid, new UserDictWord("hoge", "ホゲホゲ")); assertTrue(userDict.toHashMap().get(uuid).pronunciation.equals("ホゲホゲ")); // 単語削除 @@ -53,9 +56,9 @@ void checkManipulation() throws Exception { assertTrue(userDict.toHashMap().get(uuid) == null); // 辞書のインポート - userDict.addWord(new UserDict.Word("hoge", "ホゲ")); + userDict.addWord(new UserDictWord("hoge", "ホゲ")); UserDict userDict2 = new UserDict(); - userDict2.addWord(new UserDict.Word("fuga", "フガ")); + userDict2.addWord(new UserDictWord("fuga", "フガ")); userDict.importDict(userDict2); assertTrue(userDict.toHashMap().size() == 2); diff --git a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/VoiceModelTest.java b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/VoiceModelTest.java similarity index 92% rename from crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/VoiceModelTest.java rename to crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/VoiceModelTest.java index 2bdba9c28..ba3c76ffc 100644 --- a/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/VoiceModelTest.java +++ b/crates/voicevox_core_java_api/lib/src/test/java/jp/hiroshiba/voicevoxcore/blocking/VoiceModelTest.java @@ -1,4 +1,4 @@ -package jp.hiroshiba.voicevoxcore; +package jp.hiroshiba.voicevoxcore.blocking; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -11,6 +11,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.UUID; +import jp.hiroshiba.voicevoxcore.TestUtils; import 
org.junit.jupiter.api.Test; class VoiceModelTest extends TestUtils { diff --git a/crates/voicevox_core_java_api/src/logger.rs b/crates/voicevox_core_java_api/src/logger.rs index eaa1889b5..e32680ca4 100644 --- a/crates/voicevox_core_java_api/src/logger.rs +++ b/crates/voicevox_core_java_api/src/logger.rs @@ -2,7 +2,7 @@ use jni::{objects::JObject, JNIEnv}; // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -extern "system" fn Java_jp_hiroshiba_voicevoxcore_Dll_00024LoggerInitializer_initLogger( +extern "system" fn Java_jp_hiroshiba_voicevoxcore_internal_Dll_00024LoggerInitializer_initLogger( _: JNIEnv<'_>, _: JObject<'_>, ) { diff --git a/crates/voicevox_core_java_api/src/onnxruntime.rs b/crates/voicevox_core_java_api/src/onnxruntime.rs index becf773b8..78044c750 100644 --- a/crates/voicevox_core_java_api/src/onnxruntime.rs +++ b/crates/voicevox_core_java_api/src/onnxruntime.rs @@ -12,10 +12,10 @@ use crate::common::throw_if_err; // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[duplicate_item( f CONST; - [ Java_jp_hiroshiba_voicevoxcore_Onnxruntime_rsLibName ] [ LIB_NAME ]; - [ Java_jp_hiroshiba_voicevoxcore_Onnxruntime_rsLibVersion ] [ LIB_VERSION ]; - [ Java_jp_hiroshiba_voicevoxcore_Onnxruntime_rsLibVersionedFilename ] [ LIB_VERSIONED_FILENAME ]; - [ Java_jp_hiroshiba_voicevoxcore_Onnxruntime_rsLibUnversionedFilename ] [ LIB_UNVERSIONED_FILENAME ]; + [ Java_jp_hiroshiba_voicevoxcore_blocking_Onnxruntime_rsLibName ] [ LIB_NAME ]; + [ Java_jp_hiroshiba_voicevoxcore_blocking_Onnxruntime_rsLibVersion ] [ LIB_VERSION ]; + [ Java_jp_hiroshiba_voicevoxcore_blocking_Onnxruntime_rsLibVersionedFilename ] [ LIB_VERSIONED_FILENAME ]; + [ Java_jp_hiroshiba_voicevoxcore_blocking_Onnxruntime_rsLibUnversionedFilename ] [ LIB_UNVERSIONED_FILENAME ]; )] #[unsafe(no_mangle)] extern "system" fn f(env: JNIEnv<'_>) -> jobject { @@ -27,7 +27,7 @@ extern "system" fn f(env: JNIEnv<'_>) -> jobject { // SAFETY: 
voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Onnxruntime_rsNew<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Onnxruntime_rsNew<'local>( env: JNIEnv<'local>, this: JObject<'local>, filename: JString<'local>, @@ -44,7 +44,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Onnxruntime_rsNew<'loca // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Onnxruntime_rsSupportedDevices<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Onnxruntime_rsSupportedDevices< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, ) -> jobject { diff --git a/crates/voicevox_core_java_api/src/open_jtalk.rs b/crates/voicevox_core_java_api/src/open_jtalk.rs index 06201bc8a..f520aef2a 100644 --- a/crates/voicevox_core_java_api/src/open_jtalk.rs +++ b/crates/voicevox_core_java_api/src/open_jtalk.rs @@ -8,7 +8,7 @@ use jni::{ // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_OpenJtalk_rsNew<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_OpenJtalk_rsNew<'local>( env: JNIEnv<'local>, this: JObject<'local>, open_jtalk_dict_dir: JString<'local>, @@ -26,7 +26,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_OpenJtalk_rsNew<'local> // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_OpenJtalk_rsUseUserDict<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_OpenJtalk_rsUseUserDict< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, user_dict: JObject<'local>, @@ -48,7 +50,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_OpenJtalk_rsUseUserDict // SAFETY: 
voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_OpenJtalk_rsDrop<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_OpenJtalk_rsDrop<'local>( env: JNIEnv<'local>, this: JObject<'local>, ) { diff --git a/crates/voicevox_core_java_api/src/synthesizer.rs b/crates/voicevox_core_java_api/src/synthesizer.rs index d1116c05f..3487d3c50 100644 --- a/crates/voicevox_core_java_api/src/synthesizer.rs +++ b/crates/voicevox_core_java_api/src/synthesizer.rs @@ -12,7 +12,7 @@ use std::sync::Arc; // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsNew<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsNew<'local>( env: JNIEnv<'local>, this: JObject<'local>, onnxruntime: JObject<'local>, @@ -26,14 +26,14 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsNew<'loca .get_field( &builder, "accelerationMode", - object_type!("Synthesizer$AccelerationMode"), + object_type!("AccelerationMode"), )? .l()?; if !acceleration_mode.is_null() { - let auto = enum_object!(env, "Synthesizer$AccelerationMode", "AUTO")?; - let cpu = enum_object!(env, "Synthesizer$AccelerationMode", "CPU")?; - let gpu = enum_object!(env, "Synthesizer$AccelerationMode", "GPU")?; + let auto = enum_object!(env, "AccelerationMode", "AUTO")?; + let cpu = enum_object!(env, "AccelerationMode", "CPU")?; + let gpu = enum_object!(env, "AccelerationMode", "GPU")?; options.acceleration_mode = if env.is_same_object(&acceleration_mode, auto)? { voicevox_core::AccelerationMode::Auto } else if env.is_same_object(&acceleration_mode, cpu)? 
{ @@ -67,7 +67,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsNew<'loca // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsGpuMode<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsIsGpuMode< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, ) -> jboolean { @@ -85,7 +87,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsGpuMode // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsGetMetasJson<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsGetMetasJson< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, ) -> jobject { @@ -106,7 +110,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsGetMetasJ // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsLoadVoiceModel<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsLoadVoiceModel< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, model: JObject<'local>, @@ -128,7 +134,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsLoadVoice // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsUnloadVoiceModel<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsUnloadVoiceModel< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, model_id: JObject<'local>, @@ -150,7 +158,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsUnloadVoi // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない 
#[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsLoadedVoiceModel< +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsIsLoadedVoiceModel< 'local, >( env: JNIEnv<'local>, @@ -175,7 +183,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsIsLoadedV // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQueryFromKana< +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsAudioQueryFromKana< 'local, >( env: JNIEnv<'local>, @@ -206,7 +214,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQuer // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQuery<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsAudioQuery< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, text: JString<'local>, @@ -234,7 +244,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAudioQuer // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhrasesFromKana< +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsAccentPhrasesFromKana< 'local, >( env: JNIEnv<'local>, @@ -265,7 +275,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhr // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhrases<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsAccentPhrases< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, text: JString<'local>, @@ -294,7 
+306,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsAccentPhr // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMoraData<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsReplaceMoraData< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, accent_phrases_json: JString<'local>, @@ -324,7 +338,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMo // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplacePhonemeLength< +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsReplacePhonemeLength< 'local, >( env: JNIEnv<'local>, @@ -356,7 +370,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplacePh // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMoraPitch<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsReplaceMoraPitch< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, accent_phrases_json: JString<'local>, @@ -386,7 +402,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsReplaceMo // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsSynthesis<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsSynthesis< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, query_json: JString<'local>, @@ -425,7 +443,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsSynthesis // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない 
#[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTtsFromKana<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsTtsFromKana< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, kana: JString<'local>, @@ -458,7 +478,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTtsFromKa // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTts<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsTts<'local>( env: JNIEnv<'local>, this: JObject<'local>, query_json: JString<'local>, @@ -491,7 +511,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsTts<'loca // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_Synthesizer_rsDrop<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_Synthesizer_rsDrop<'local>( env: JNIEnv<'local>, this: JObject<'local>, ) { diff --git a/crates/voicevox_core_java_api/src/user_dict.rs b/crates/voicevox_core_java_api/src/user_dict.rs index deac5d676..9b02e7b94 100644 --- a/crates/voicevox_core_java_api/src/user_dict.rs +++ b/crates/voicevox_core_java_api/src/user_dict.rs @@ -10,7 +10,7 @@ use jni::{ // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsNew<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_UserDict_rsNew<'local>( env: JNIEnv<'local>, this: JObject<'local>, ) { @@ -25,7 +25,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsNew<'local>( // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsAddWord<'local>( 
+unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_UserDict_rsAddWord<'local>( env: JNIEnv<'local>, this: JObject<'local>, word_json: JString<'local>, @@ -50,7 +50,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsAddWord<'loc // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsUpdateWord<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_UserDict_rsUpdateWord<'local>( env: JNIEnv<'local>, this: JObject<'local>, uuid: JString<'local>, @@ -77,7 +77,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsUpdateWord<' // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsRemoveWord<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_UserDict_rsRemoveWord<'local>( env: JNIEnv<'local>, this: JObject<'local>, uuid: JString<'local>, @@ -98,7 +98,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsRemoveWord<' // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsImportDict<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_UserDict_rsImportDict<'local>( env: JNIEnv<'local>, this: JObject<'local>, other_dict: JObject<'local>, @@ -119,7 +119,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsImportDict<' // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsLoad<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_UserDict_rsLoad<'local>( env: JNIEnv<'local>, this: JObject<'local>, path: JString<'local>, @@ -140,7 +140,7 @@ unsafe extern "system" fn 
Java_jp_hiroshiba_voicevoxcore_UserDict_rsLoad<'local> // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsSave<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_UserDict_rsSave<'local>( env: JNIEnv<'local>, this: JObject<'local>, path: JString<'local>, @@ -161,7 +161,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsSave<'local> // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsGetWords<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_UserDict_rsGetWords<'local>( env: JNIEnv<'local>, this: JObject<'local>, ) -> jobject { @@ -179,7 +179,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsGetWords<'lo // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsDrop<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_UserDict_rsDrop<'local>( env: JNIEnv<'local>, this: JObject<'local>, ) { @@ -191,7 +191,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsDrop<'local> // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsToZenkaku<'local>( +extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDictWord_rsToZenkaku<'local>( env: JNIEnv<'local>, _cls: JClass<'local>, text: JString<'local>, @@ -209,7 +209,7 @@ extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsToZenkaku<'local>( // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -extern "system" fn Java_jp_hiroshiba_voicevoxcore_UserDict_rsValidatePronunciation<'local>( +extern "system" fn 
Java_jp_hiroshiba_voicevoxcore_UserDictWord_rsValidatePronunciation<'local>( env: JNIEnv<'local>, _cls: JClass<'local>, text: JString<'local>, diff --git a/crates/voicevox_core_java_api/src/voice_model.rs b/crates/voicevox_core_java_api/src/voice_model.rs index 154449d42..d638a6ccc 100644 --- a/crates/voicevox_core_java_api/src/voice_model.rs +++ b/crates/voicevox_core_java_api/src/voice_model.rs @@ -15,7 +15,7 @@ impl HasJavaClassIdent for voicevox_core::blocking::VoiceModelFile { // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_VoiceModelFile_rsOpen<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_VoiceModelFile_rsOpen<'local>( env: JNIEnv<'local>, this: JObject<'local>, model_path: JString<'local>, @@ -34,7 +34,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_VoiceModelFile_rsOpen<' // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_VoiceModelFile_rsGetId<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_VoiceModelFile_rsGetId<'local>( env: JNIEnv<'local>, this: JObject<'local>, ) -> jobject { @@ -52,7 +52,9 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_VoiceModelFile_rsGetId< // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_VoiceModelFile_rsGetMetasJson<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_VoiceModelFile_rsGetMetasJson< + 'local, +>( env: JNIEnv<'local>, this: JObject<'local>, ) -> jobject { @@ -70,7 +72,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_VoiceModelFile_rsGetMet // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_VoiceModelFile_rsClose<'local>( +unsafe 
extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_VoiceModelFile_rsClose<'local>( env: JNIEnv<'local>, this: JObject<'local>, ) { @@ -83,7 +85,7 @@ unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_VoiceModelFile_rsClose< // SAFETY: voicevox_core_java_apiを構成するライブラリの中に、これと同名のシンボルは存在しない #[unsafe(no_mangle)] -unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_VoiceModelFile_rsDrop<'local>( +unsafe extern "system" fn Java_jp_hiroshiba_voicevoxcore_blocking_VoiceModelFile_rsDrop<'local>( env: JNIEnv<'local>, this: JObject<'local>, ) { diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_models.py b/crates/voicevox_core_python_api/python/voicevox_core/_models.py index 941ed84fc..9af47148a 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_models.py +++ b/crates/voicevox_core_python_api/python/voicevox_core/_models.py @@ -208,6 +208,12 @@ class AudioQuery: output_stereo: bool """音声データをステレオ出力するか否か。""" + pause_length: None = None + """句読点などの無音時間。 ``None`` のときは無視される。デフォルト値は ``None`` 。""" + + pause_length_scale: float = 1.0 + """読点などの無音時間(倍率)。デフォルト値は ``1.0`` 。""" + kana: Optional[str] = None """ [読み取り専用] AquesTalk風記法。 diff --git a/crates/voicevox_core_python_api/src/convert.rs b/crates/voicevox_core_python_api/src/convert.rs index d4a867606..711da5fe4 100644 --- a/crates/voicevox_core_python_api/src/convert.rs +++ b/crates/voicevox_core_python_api/src/convert.rs @@ -111,7 +111,7 @@ pub(crate) fn async_modify_accent_phrases<'py, Fun, Fut>( ) -> PyResult<&'py PyAny> where Fun: FnOnce(Vec, StyleId) -> Fut + Send + 'static, - Fut: Future>> + Send + 'static, + Fut: Future>> + Send + 'static, { let rust_accent_phrases = accent_phrases .iter() @@ -121,10 +121,9 @@ where py, pyo3_asyncio::tokio::get_current_locals(py)?, async move { - let replaced_accent_phrases = method(rust_accent_phrases, speaker_id).await; + let replaced_accent_phrases = method(rust_accent_phrases, speaker_id).await?; Python::with_gil(|py| { let 
replaced_accent_phrases = replaced_accent_phrases - .into_py_result(py)? .iter() .map(move |accent_phrase| { to_pydantic_dataclass( diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs index a2d1c2475..d0233c5e7 100644 --- a/crates/voicevox_core_python_api/src/lib.rs +++ b/crates/voicevox_core_python_api/src/lib.rs @@ -282,6 +282,7 @@ mod blocking { use camino::Utf8PathBuf; use pyo3::{ + exceptions::{PyIndexError, PyValueError}, pyclass, pymethods, types::{IntoPyDict as _, PyBytes, PyDict, PyList}, Py, PyAny, PyObject, PyRef, PyResult, Python, @@ -709,6 +710,20 @@ mod blocking { end: usize, py: Python<'py>, ) -> PyResult<&'py PyBytes> { + if start > audio.frame_length() || end > audio.frame_length() { + return Err(PyIndexError::new_err(format!( + "({}, {}) is out of range for audio feature of length {}", + start, + end, + audio.frame_length(), + ))); + } + if start > end { + return Err(PyValueError::new_err(format!( + "({}, {}) is invalid range because start > end", + start, end, + ))); + } let wav = &self .synthesizer .read()? 
@@ -1055,11 +1070,9 @@ mod asyncio { #[pyclass] pub(crate) struct Synthesizer { - // FIXME: `Arc`ではなく、`Arc>`を - // `clone`する synthesizer: Arc< Closable< - Arc>, + voicevox_core::nonblocking::Synthesizer, Self, Tokio, >, @@ -1090,7 +1103,7 @@ mod asyncio { cpu_num_threads, }, ); - let synthesizer = Python::with_gil(|py| synthesizer.into_py_result(py))?.into(); + let synthesizer = Python::with_gil(|py| synthesizer.into_py_result(py))?; let synthesizer = Closable::new(synthesizer).into(); Ok(Self { synthesizer }) } @@ -1139,9 +1152,12 @@ mod asyncio { py: Python<'py>, ) -> PyResult<&'py PyAny> { let model: VoiceModelFile = model.extract()?; - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); pyo3_asyncio::tokio::future_into_py(py, async move { - let result = synthesizer.load_voice_model(&*model.model.read()?).await; + let result = synthesizer + .read()? + .load_voice_model(&*model.model.read()?) + .await; Python::with_gil(|py| result.into_py_result(py)) }) } @@ -1173,13 +1189,14 @@ mod asyncio { style_id: u32, py: Python<'py>, ) -> PyResult<&'py PyAny> { - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); let kana = kana.to_owned(); pyo3_asyncio::tokio::future_into_py_with_locals( py, pyo3_asyncio::tokio::get_current_locals(py)?, async move { let audio_query = synthesizer + .read()? .audio_query_from_kana(&kana, StyleId::new(style_id)) .await; @@ -1201,13 +1218,16 @@ mod asyncio { style_id: u32, py: Python<'py>, ) -> PyResult<&'py PyAny> { - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); let text = text.to_owned(); pyo3_asyncio::tokio::future_into_py_with_locals( py, pyo3_asyncio::tokio::get_current_locals(py)?, async move { - let audio_query = synthesizer.audio_query(&text, StyleId::new(style_id)).await; + let audio_query = synthesizer + .read()? 
+ .audio_query(&text, StyleId::new(style_id)) + .await; Python::with_gil(|py| { let audio_query = audio_query.into_py_result(py)?; @@ -1225,13 +1245,14 @@ mod asyncio { style_id: u32, py: Python<'py>, ) -> PyResult<&'py PyAny> { - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); let kana = kana.to_owned(); pyo3_asyncio::tokio::future_into_py_with_locals( py, pyo3_asyncio::tokio::get_current_locals(py)?, async move { let accent_phrases = synthesizer + .read()? .create_accent_phrases_from_kana(&kana, StyleId::new(style_id)) .await; Python::with_gil(|py| { @@ -1254,13 +1275,14 @@ mod asyncio { style_id: u32, py: Python<'py>, ) -> PyResult<&'py PyAny> { - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); let text = text.to_owned(); pyo3_asyncio::tokio::future_into_py_with_locals( py, pyo3_asyncio::tokio::get_current_locals(py)?, async move { let accent_phrases = synthesizer + .read()? .create_accent_phrases(&text, StyleId::new(style_id)) .await; Python::with_gil(|py| { @@ -1283,12 +1305,15 @@ mod asyncio { style_id: u32, py: Python<'py>, ) -> PyResult<&'py PyAny> { - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); crate::convert::async_modify_accent_phrases( accent_phrases, StyleId::new(style_id), py, - |a, s| async move { synthesizer.replace_mora_data(&a, s).await }, + |a, s| async move { + let result = synthesizer.read()?.replace_mora_data(&a, s).await; + Python::with_gil(|py| result.into_py_result(py)) + }, ) } @@ -1298,12 +1323,15 @@ mod asyncio { style_id: u32, py: Python<'py>, ) -> PyResult<&'py PyAny> { - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); crate::convert::async_modify_accent_phrases( accent_phrases, StyleId::new(style_id), py, - |a, s| async move { synthesizer.replace_phoneme_length(&a, s).await }, + |a, s| async move { + let result = 
synthesizer.read()?.replace_phoneme_length(&a, s).await; + Python::with_gil(|py| result.into_py_result(py)) + }, ) } @@ -1313,12 +1341,15 @@ mod asyncio { style_id: u32, py: Python<'py>, ) -> PyResult<&'py PyAny> { - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); crate::convert::async_modify_accent_phrases( accent_phrases, StyleId::new(style_id), py, - |a, s| async move { synthesizer.replace_mora_pitch(&a, s).await }, + |a, s| async move { + let result = synthesizer.read()?.replace_mora_pitch(&a, s).await; + Python::with_gil(|py| result.into_py_result(py)) + }, ) } @@ -1330,12 +1361,13 @@ mod asyncio { enable_interrogative_upspeak: bool, py: Python<'py>, ) -> PyResult<&'py PyAny> { - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); pyo3_asyncio::tokio::future_into_py_with_locals( py, pyo3_asyncio::tokio::get_current_locals(py)?, async move { let wav = synthesizer + .read()? .synthesis( &audio_query, StyleId::new(style_id), @@ -1368,13 +1400,16 @@ mod asyncio { let options = TtsOptions { enable_interrogative_upspeak, }; - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); let kana = kana.to_owned(); pyo3_asyncio::tokio::future_into_py_with_locals( py, pyo3_asyncio::tokio::get_current_locals(py)?, async move { - let wav = synthesizer.tts_from_kana(&kana, style_id, &options).await; + let wav = synthesizer + .read()? 
+ .tts_from_kana(&kana, style_id, &options) + .await; Python::with_gil(|py| { let wav = wav.into_py_result(py)?; @@ -1400,13 +1435,13 @@ mod asyncio { let options = TtsOptions { enable_interrogative_upspeak, }; - let synthesizer = self.synthesizer.read()?.clone(); + let synthesizer = self.synthesizer.clone(); let text = text.to_owned(); pyo3_asyncio::tokio::future_into_py_with_locals( py, pyo3_asyncio::tokio::get_current_locals(py)?, async move { - let wav = synthesizer.tts(&text, style_id, &options).await; + let wav = synthesizer.read()?.tts(&text, style_id, &options).await; Python::with_gil(|py| { let wav = wav.into_py_result(py)?; diff --git a/deny.toml b/deny.toml index fa620a797..5736a528c 100644 --- a/deny.toml +++ b/deny.toml @@ -135,10 +135,10 @@ bypass = [ # system-configuration-sys links System Configuration framework on macOS. { name = "system-configuration-sys", version = "0.5", build-script = "cf4c21c898e9671345d4684c75014189623574f9ec96414999a9db2d73b1e40f" }, - # https://github.com/VOICEVOX/ort/blob/8627833456a69e7841ae2a29fd184752df8de8d9/ort-sys/build.rs + # https://github.com/VOICEVOX/ort/blob/17f741301db0bb08da0eafe8a338e5efd8a4b5df/ort-sys/build.rs # # ONNX Runtime is licensed under `MIT` (https://github.com/microsoft/onnxruntime/blob/v1.11.1/LICENSE) - { name = "voicevox-ort-sys", version = "=2.0.0-rc.4", build-script = "69dc8169473b04b8fbd9f0430a9b0c6057bc477fd4e971fe9d981173b073985c" }, + { name = "voicevox-ort-sys", version = "=2.0.0-rc.4", build-script = "5358c54ff958abeebfbe6cad4b0cd925db393f174ad3b443e18309782a9a3f57" }, # https://docs.rs/crate/zstd-sys/2.0.9+zstd.1.5.5/source/build.rs # diff --git a/docs/guide/dev/api-design.md b/docs/guide/dev/api-design.md new file mode 100644 index 000000000..89df3e0dd --- /dev/null +++ b/docs/guide/dev/api-design.md @@ -0,0 +1,15 @@ +# APIデザイン ガイドライン + +## Rust 以外の言語の API + +VOICEVOX CORE の主要機能は Rust で実装されることを前提としており、他の言語のラッパーでのみの機能追加はしない方針としています。これは機能の一貫性を保つための方針です。 + 
+ただし機能追加ではない範囲で、各言語の習慣に適合するような変更は積極的に行っていきます。例えば: + +* [`AudioQuery`](https://voicevox.github.io/voicevox_core/apis/rust_api/voicevox_core/struct.AudioQuery.html)といったJSONで表現可能なデータ型は、Pythonなら[Pydantic](https://docs.pydantic.dev)、JavaScriptなら[Zod](https://zod.dev/)といったライブラリを使って表現すべきです。 + * Rust APIとやりとりする際はJSONを介して変換します。 +* [`StyleId`](https://voicevox.github.io/voicevox_core/apis/rust_api/voicevox_core/struct.StyleId.html)といった[newtype](https://rust-unofficial.github.io/patterns/patterns/behavioural/newtype.html)は、そのままnewtypeとして表現するべきです。 + * 例えばPythonなら[`typing.NewType`](https://docs.python.org/ja/3/library/typing.html#newtype)で表現します。 +* オプショナルな引数は、キーワード引数がある言語であればキーワード引数で、ビルダースタイルが一般的な言語であればビルダースタイルで表現すべきです。 + + diff --git a/example/python/run.py b/example/python/run.py index 17f7e688f..caa2f36db 100644 --- a/example/python/run.py +++ b/example/python/run.py @@ -53,12 +53,14 @@ def main() -> None: if streaming: logger.info("%s", "In streaming mode") chunk_sec = 1.0 - intermediate = synthesizer.precompute_render(audio_query, style_id) - chunk_frames = int(intermediate.frame_rate * chunk_sec) + audio_feature = synthesizer.precompute_render(audio_query, style_id) + chunk_frames = int(audio_feature.frame_rate * chunk_sec) pcm = b"" - for i in range(0, intermediate.frame_length, chunk_frames): - logger.info("%s", f"{i/intermediate.frame_length:.2%}") - pcm += synthesizer.render(intermediate, i, i + chunk_frames) + for i in range(0, audio_feature.frame_length, chunk_frames): + logger.info("%s", f"{i/audio_feature.frame_length:.2%}") + pcm += synthesizer.render( + audio_feature, i, min(i + chunk_frames, audio_feature.frame_length) + ) logger.info("%s", f"100%") wav = wav_from_s16le( pcm, audio_query.output_sampling_rate, audio_query.output_stereo