Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

語料改口語調 #21

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
13 changes: 7 additions & 6 deletions fatchord-WaveRNN/dobi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ meta:

env=siatting:
variables:
- PANPUN=0.2.1
- PANPUN=1.0
- BANGTSI=https://www.dropbox.com/s/rye2sd0wo718bj5/SuiSiann-0.2.1.tar?dl=0
- PINLUT=16k

Expand Down Expand Up @@ -182,17 +182,18 @@ job=wavernn-job:
job=huatsiann-job:
use: hunlian-khuanking
env:
- KU={env.KU:tak10-ke7 tsə2-hue1 lai7 tsʰit8-tʰə5 !}
- KU={env.KU:to規-guân規 bûn規-huà本 beh再 siū規 lâng規 khíng規-tīng本.}
# - KU={env.KU:ta̍k規-ke規 tsò規-hué規 lâi規 tshit規-thô本!}
- PANPUN={env.PANPUN}
- CUDA_VISIBLE_DEVICES=1
- CUDA_VISIBLE_DEVICES={env.CUDA_VISIBLE_DEVICES:0}
command: |
bash -eux -c '
cp /tshamsoo/hparams.py .
cp /tshamsoo/text_init.py utils/text/__init__.py
cp /tshamsoo/text_symbols.py utils/text/symbols.py
sed -i "s/required=True, //g" gen_tacotron.py
time python gen_tacotron.py --input_text "$KU" griffinlim
time python gen_tacotron.py --input_text "$KU" wavernn
time python gen_tacotron.py --input_text "$KU" wavernn --unbatched
time python gen_tacotron.py --input_text "$KU" --save_attention griffinlim
time python gen_tacotron.py --input_text "$KU" --save_attention wavernn
time python gen_tacotron.py --input_text "$KU" --save_attention wavernn --unbatched
'
mounts: [tshamsoo, checkpoints, outputs, ]
12 changes: 1 addition & 11 deletions fatchord-WaveRNN/tshamsoo/preprocess-tacotron.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,6 @@
from utils.paths import Paths


from 臺灣言語工具.解析整理.拆文分析器 import 拆文分析器
from 臺灣言語工具.音標系統.閩南語.臺灣閩南語羅馬字拼音 import 臺灣閩南語羅馬字拼音
from 臺灣言語工具.語音合成.閩南語音韻規則 import 閩南語音韻規則
from 臺灣言語工具.語音合成 import 台灣話口語講法


# Helper functions for argument types
def valid_n_workers(num):
n = int(num)
Expand Down Expand Up @@ -73,11 +67,7 @@ def suisiann(path: Union[str, Path], wav_files):
mia = basename(tsua['音檔'])
if mia in u_tihleh:
imtong = splitext(mia)[0]
hj = tsua['漢字']
lmj = tsua['羅馬字']
text_dict[imtong] = 台灣話口語講法(
拆文分析器.建立句物件(hj, lmj)
)
text_dict[imtong] = tsua['口語調']

return text_dict

Expand Down
24 changes: 15 additions & 9 deletions fatchord-WaveRNN/tshamsoo/text_symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,24 @@

'''
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
from utils.text import cmudict
The default is a set of ASCII characters that works well for English or text
that has been run through Unidecode.
See TRAINING_DATA.md for details.
'''

_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_ipa = 'ŋəɛɨʔʰ̩ⁿ'
_punctuation = r' -\.,;:?!"\'\(\)“”‘’~─'
_letters = sorted(set(
'abcdefghijklmnopqrstuvwxyzáàâǎāa̍a̋éèêěēe̍e̋íìîǐīı̍i̍i̋'
'óòôǒōo̍őó͘ò͘ô͘ǒ͘ō͘o̍͘ő͘úùûǔūu̍űḿm̀m̂m̌m̄m̍m̋ńǹn̂ňn̄n̍n̋ⁿ'
))
_tsong = _letters + list(''.join(_letters).upper())

_sooji = '0123456789'
_piantiau = '規本隨再固三仔海'

# Export all symbols:
symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + list(_ipa) + list(_sooji)

symbols = (
[_pad] + list(_punctuation)
+ list(_tsong) + list(_sooji) + list(_piantiau)
)