-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 6a3bec8
Showing
71 changed files
with
7,204 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
[submodule "pybind11"] | ||
path = pybind11 | ||
url = https://github.com/pybind/pybind11.git | ||
[submodule "eigen"] | ||
path = eigen | ||
url = https://github.com/libigl/eigen.git | ||
[submodule "argparse"] | ||
path = argparse | ||
url = https://github.com/p-ranav/argparse.git |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# FetchContent_MakeAvailable(), which this script uses to obtain onnxruntime,
# first shipped in CMake 3.14. Declaring that as the minimum makes older CMake
# stop here with a clear version error instead of failing later on an unknown
# command.
cmake_minimum_required(VERSION 3.14)
project(domino)

# CMP0135 only exists in CMake >= 3.24; setting an unknown policy is a hard
# error, so guard on its existence. NEW stamps files extracted from a URL
# download with the extraction time, which is the non-deprecated behaviour and
# also silences the developer warning emitted when the policy is left unset.
if(POLICY CMP0135)
  cmake_policy(SET CMP0135 NEW)
endif()

# Compile every target as C++17; configuration fails if the compiler cannot
# provide that standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||
|
||
# Toolchain-specific global compiler settings.
if(MSVC)
  # Export all symbols from DLLs without needing __declspec annotations.
  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
  set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
  # /MP: compile translation units in parallel.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
  # /MT and /MTd: link against the static MSVC runtime (release / debug).
  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
else()
  # CMAKE_CXX_FLAGS is passed to both the compile and the link step, so use the
  # driver flag -pthread here. A bare -lpthread is a linker input: it is unused
  # on compile commands and appears before the object files on the link line.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -pthread")
endif()
|
||
# Add dependency libraries.
include(FetchContent)

# Download the prebuilt onnxruntime v1.16.3 release archive for the host
# platform (x64 on Windows and Linux, universal2 on macOS).
set(domino_ort_release_url
    "https://github.com/microsoft/onnxruntime/releases/download/v1.16.3")
if(WIN32)
  set(domino_ort_archive "onnxruntime-win-x64-1.16.3.zip")
elseif(APPLE)
  set(domino_ort_archive "onnxruntime-osx-universal2-1.16.3.tgz")
else()
  set(domino_ort_archive "onnxruntime-linux-x64-1.16.3.tgz")
endif()
FetchContent_Declare(
  onnxruntime
  URL "${domino_ort_release_url}/${domino_ort_archive}"
)
FetchContent_MakeAvailable(onnxruntime)

add_subdirectory(pybind11)

# Use onnxruntime_SOURCE_DIR, which FetchContent_MakeAvailable() sets to the
# directory the archive was actually unpacked into. Spelling the path out as
# ${FETCHCONTENT_BASE_DIR}/onnxruntime-src breaks as soon as the location is
# overridden (e.g. via FETCHCONTENT_SOURCE_DIR_ONNXRUNTIME).
include_directories(
  "${CMAKE_CURRENT_SOURCE_DIR}/eigen"
  "${CMAKE_CURRENT_SOURCE_DIR}/argparse/include"
  "${onnxruntime_SOURCE_DIR}/include"
)
link_directories("${onnxruntime_SOURCE_DIR}/lib")
|
||
# Python extension module `pydomino`, created with pybind11's helper from the
# three listed sources and linked privately against onnxruntime.
pybind11_add_module(pydomino src/lib.cpp src/domino.cpp src/viterbi.cpp)
target_link_libraries(pydomino PRIVATE onnxruntime)
|
||
# Command-line executable `domino`, built from the four listed sources and
# linked privately against onnxruntime.
add_executable(domino src/main.cpp src/domino.cpp src/viterbi.cpp src/load_wav.cpp)
target_link_libraries(domino PRIVATE onnxruntime)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2024 DWANGO Co.,Ltd. | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# pydomino | ||
|
||
`pydomino` は日本語音声に対して音素ラベルをアラインメントするためのツールです。GPUは不要です。 | ||
ライブラリとして Python から使うこともコマンドラインツールとしてコンソールから使うこともできます。 | ||
ドキュメントは [こちら](https://dwangomediavillage.github.io/pydomino/) からご覧いただけます。 | ||
|
||
## Installation | ||
|
||
### Requisites | ||
|
||
- CMake | ||
- Python >= 3.10 (miniconda etc.) | ||
- Visual Studio >= 2019 (for Windows) | ||
|
||
### Build & Install | ||
|
||
#### Linux / Mac | ||
|
||
```sh | ||
git clone --recursive {this-repository-url} | ||
cd pydomino | ||
pip install ./ | ||
``` | ||
|
||
また、下記のように直接 pip インストールもできます(コマンドラインツールはインストールされません): | ||
|
||
```sh | ||
pip install git+{this-repository-url} | ||
``` | ||
|
||
|
||
#### Windows | ||
|
||
`Anaconda Prompt (miniconda3)` 環境において MSVC の `vcvars64.bat` を利用してインストールします: | ||
|
||
* `"C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Auxiliary\Build\vcvars64.bat"` or | ||
* `"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"`. | ||
|
||
```sh | ||
# on `Anaconda Prompt (miniconda3)` | ||
"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" | ||
git clone --recursive {this-repository-url} | ||
cd pydomino | ||
pip install ./ | ||
``` | ||
|
||
## Run samples | ||
|
||
### Python Library | ||
|
||
```py | ||
alignmer: pydomino.Aligner = pydomino.Aligner(path-to-model-file.onnx) | ||
|
||
y: np.ndarray = librosa.load(path-to-wav-file, sr=16_000, mono=True, dtype=np.float32)[0] | ||
p: list[str] = path-to-phoneme-file.read_text().split(" ") | ||
z: list[tuple[float, float, str]] = alignmer.align(y, " ".join(p), N=3) # [(start_time_sec, end_time_sec, phoneme_str)] | ||
``` | ||
|
||
* `path-to-model-file.onnx` は事前学習済みの onnx モデルファイルです。 | ||
* `onnx_model/model.onnx`にあります。 | ||
* `path-to-wav-file` はサンプリング周波数 16kHz のモノラル wav ファイルです。 | ||
* `path-to-phoneme-file` は音素を空白区切りしたテキストが格納されたファイルのパスです。 | ||
* NOTE: 開始音素と終了音素は `pau` である必要があります。 | ||
|
||
`phonemes` に使える音素一覧は下記の通りです: | ||
|
||
| | | | | | | | | | | | ||
| ----- | ---- | --- | ---- | --- | ---- | --- | ---- | ---- | ---- | | ||
| `pau` | `ry` | `r` | `my` | `m` | `ny` | `n` | `j` | `z` | `by` | | ||
| `b` | `dy` | `d` | `gy` | `g` | `ky` | `k` | `ch` | `ts` | `sh` | | ||
| `s` | `hy` | `h` | `v` | `f` | `py` | `p` | `t` | `y` | `w` | | ||
| `N` | `a` | `i` | `u` | `e` | `o` | `I` | `U` | `cl` | | | ||
|
||
### Console Application | ||
|
||
上記のインストール手順における `pip install` により CLI ツールも自動でビルドされます。 | ||
|
||
ビルドされたツールは下記のようにして使えます: | ||
|
||
```sh | ||
./build/{temporary-directory}/pydomino/domino \ | ||
--input_path={path-to-wav-file} \ | ||
--input_phoneme={path-to-phoneme-file} \ | ||
--output_path={path-to-output-lab-file} \ | ||
--model={path-to-model-file.onnx} \ | ||
-N=3 | ||
``` | ||
|
||
### label file format (.lab) | ||
|
||
アラインメント結果のラベルファイル (.lab) は、tsv ファイル構造になっています。 | ||
|
||
各行に音素の開始時刻と終了時刻 (いずれも単位は秒) と、そのときの音素が TAB 区切りで並んでいます: | ||
|
||
```txt | ||
0.00 1.87 pau | ||
1.87 1.90 t | ||
1.90 3.10 pau | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Sphinx build info version 1 | ||
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. | ||
config: bc9e43d6f0910153b67cb3d34b934501 | ||
tags: 645f666f9bcd5a90fca523b33c5a78b7 |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
pydomino.Aligner クラス | ||
======================= | ||
|
||
|
||
.. automodule:: pydomino | ||
:members: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
pydominoの使用例 | ||
================ | ||
|
||
pythonライブラリの場合 | ||
********************** | ||
|
||
|
||
.. code-block:: python | ||
import numpy as np | ||
import librosa | ||
import pydomino | ||
alignmer: pydomino.Aligner = pydomino.Aligner("onnx_model/model.onnx") | ||
wav_source: np.ndarray = librosa.load("example/dowaNgo.wav", sr=16_000, mono=True, dtype=np.float32)[0] | ||
phonemes: list[str] = ["pau"] + ["d", "o", "w", "a", "N", "g", "o"] + ["pau"] | ||
alignment_result: list[tuple[float, float, str]] = alignmer.align(wav_source, " ".join(phonemes), 3) | ||
print(f"{alignment_result}") | ||
とすると、以下のアラインメント結果が得られます | ||
|
||
.. code-block:: python | ||
[ | ||
(0.0, 0.07999999821186066, 'pau'), | ||
(0.07999999821186066, 0.12999999523162842, 'd'), | ||
(0.12999999523162842, 0.20000000298023224, 'o'), | ||
(0.20000000298023224, 0.25999999046325684, 'w'), | ||
(0.25999999046325684, 0.3400000035762787, 'a'), | ||
(0.3400000035762787, 0.44999998807907104, 'N'), | ||
(0.44999998807907104, 0.47999998927116394, 'g'), | ||
(0.47999998927116394, 0.6800000071525574, 'o'), | ||
(0.6800000071525574, 0.7300000190734863, 'pau') | ||
] | ||
ここで、第3引数の最小割り当てフレーム数を5に変えてみると、以下のアラインメント結果が得られます | ||
|
||
.. code-block:: python | ||
alignment_result: list[tuple[float, float, str]] = alignmer.align(wav_source, " ".join(phonemes), 5) | ||
print(f"{alignment_result}") | ||
.. code-block:: python | ||
[ | ||
(0.0, 0.07999999821186066, 'pau'), | ||
(0.07999999821186066, 0.12999999523162842, 'd'), | ||
(0.12999999523162842, 0.20000000298023224, 'o'), | ||
(0.20000000298023224, 0.25999999046325684, 'w'), | ||
(0.25999999046325684, 0.3400000035762787, 'a'), | ||
(0.3400000035762787, 0.4399999976158142, 'N'), | ||
(0.4399999976158142, 0.49000000953674316, 'g'), | ||
(0.49000000953674316, 0.6800000071525574, 'o'), | ||
(0.6800000071525574, 0.7300000190734863, 'pau') | ||
] | ||
音素 "g" に割り当てられた秒数が 0.03秒から0.05秒に伸びます。これが、最小割り当てフレーム数の保証によるものです。 | ||
|
||
|
||
|
||
|
||
コマンドラインツールの場合 | ||
************************** | ||
|
||
git clone と pip install ./ を使った場合、./build ディレクトリ以下にコマンドラインツールがビルドされます。 | ||
|
||
これによってアラインメントの実行も可能です | ||
|
||
たとえば | ||
|
||
.. code-block:: bash | ||
$ build/{temporary-directory}/pydomino/domino --input_path example/dowaNgo.wav --input_phoneme "pau d o w a N g o pau" --output_path result.lab -N 5 | ||
とすると、以下のアラインメント結果が labファイル(result.lab)に出力されます | ||
|
||
.. code-block:: bash | ||
$ cat result.lab | ||
.. code-block:: guess | ||
0.00 0.08 pau | ||
0.08 0.13 d | ||
0.13 0.20 o | ||
0.20 0.26 w | ||
0.26 0.34 a | ||
0.34 0.44 N | ||
0.44 0.49 g | ||
0.49 0.68 o | ||
0.68 0.73 pau |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
.. pydomino documentation master file, created by | ||
sphinx-quickstart on Thu May 23 12:17:51 2024. | ||
You can adapt this file completely to your liking, but it should at least | ||
contain the root `toctree` directive. | ||
日本語音素アラインメントツール pydomino | ||
======================================= | ||
|
||
pydominoとは | ||
------------ | ||
|
||
pydomino は日本語音声に対して音素ラベルをアラインメントするためのツールです。 | ||
|
||
使い方 | ||
------ | ||
|
||
pydomino はPythonライブラリとコマンドラインツールの2通りで利用できます。 | ||
|
||
インストール方法 | ||
---------------- | ||
|
||
Linux / Mac | ||
*********** | ||
|
||
.. code-block:: bash | ||
git clone --recursive {this-repository-url} | ||
cd pydomino | ||
pip install ./ | ||
コマンドラインツールがいらない場合はこちらでも直接ライブラリだけインストールできます | ||
|
||
.. code-block:: bash | ||
pip install git+{this-repository-url} | ||
Windows | ||
******* | ||
|
||
Anaconda Prompt (miniconda3) 環境において MSVC の vcvars64.bat を利用してインストールします | ||
|
||
例えば、以下のコマンドでインストールできます | ||
|
||
.. code-block:: bash | ||
# on `Anaconda Prompt (miniconda3)` | ||
"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" | ||
git clone --recursive {this-repository-url} | ||
cd pydomino | ||
pip install ./ | ||
例示したパスに vcvars64.bat がないなら、例えば以下のような場所にあるかもしれません。 | ||
|
||
.. code-block:: bash | ||
"C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Auxiliary\Build\vcvars64.bat" or | ||
"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat". | ||
.. toctree:: | ||
:maxdepth: 2 | ||
:caption: Contents: | ||
|
||
aligner | ||
examples | ||
|
||
|
||
|
||
Indices and tables | ||
================== | ||
|
||
* :ref:`genindex` | ||
* :ref:`modindex` | ||
* :ref:`search` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
for_docstring | ||
============= | ||
|
||
.. toctree:: | ||
:maxdepth: 4 | ||
|
||
pydomino |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
pydomino module | ||
=============== | ||
|
||
.. automodule:: pydomino | ||
:members: | ||
:undoc-members: | ||
:show-inheritance: |
Oops, something went wrong.