#!/usr/bin/env bash
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
set -eou pipefail

stage=-1
stop_stage=100

# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
# - $dl_dir/TALCS_corpus
#   You can find three directories: train_set, dev_set, and test_set.
# You can get it from https://ai.100tal.com/dataset
# - dev_set
# - test_set
# - train_set
#
# - $dl_dir/musan
# This directory contains the following directories downloaded from
# http://www.openslr.org/17/
#
# - music
# - noise
# - speech
dl_dir=$PWD/download
. shared/parse_options.sh || exit 1
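
# Example invocation (illustrative): run only stages 3 through 5.
# parse_options.sh maps --stage and --stop-stage onto the variables above:
#
#   ./prepare.sh --stage 3 --stop-stage 5
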
# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
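
# A call such as `log "hello"` prints a line of the form (illustrative):
#   2024-01-01 12:00:00 (prepare.sh:123:main) hello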
log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"

  # Before you run this script, you must get the TAL_CSASR dataset
  # from https://ai.100tal.com/dataset
  if [ ! -d $dl_dir/tal_csasr/TALCS_corpus ]; then
    mv $dl_dir/TALCS_corpus $dl_dir/tal_csasr
  fi

  # If you have pre-downloaded it to /path/to/TALCS_corpus,
  # you can create a symlink
  #
  #   ln -sfv /path/to/TALCS_corpus $dl_dir/tal_csasr
  #
  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #   ln -sfv /path/to/musan $dl_dir/musan
  #
  if [ ! -d $dl_dir/musan ]; then
    lhotse download musan $dl_dir
  fi
fi
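
# After this stage, $dl_dir/tal_csasr should contain the TALCS_corpus data
# (train_set, dev_set, test_set) and $dl_dir/musan should contain the
# music, noise, and speech subdirectories described in the header comment.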

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare tal_csasr manifest"
  # We assume that you have downloaded the TALCS_corpus
  # to $dl_dir/tal_csasr
  if [ ! -f data/manifests/tal_csasr/.manifests.done ]; then
    mkdir -p data/manifests/tal_csasr
    lhotse prepare tal-csasr $dl_dir/tal_csasr data/manifests/tal_csasr
    touch data/manifests/tal_csasr/.manifests.done
  fi
fi
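
# To spot-check a generated manifest (an optional, illustrative command;
# the filename matches what stage 5 reads below):
#   gunzip -c data/manifests/tal_csasr/tal_csasr_supervisions_train_set.jsonl.gz | head -n 2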

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  if [ ! -f data/manifests/.musan_manifests.done ]; then
    log "It may take 6 minutes"
    mkdir -p data/manifests
    lhotse prepare musan $dl_dir/musan data/manifests
    touch data/manifests/.musan_manifests.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compute fbank for musan"
  if [ ! -f data/fbank/.musan.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_musan.py
    touch data/fbank/.musan.done
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for tal_csasr"
  if [ ! -f data/fbank/.tal_csasr.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_tal_csasr.py
    touch data/fbank/.tal_csasr.done
  fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Prepare char-based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir

  # Download a BPE model trained on LibriSpeech.
  # Here we use the model with 5000 units.
  # You can also use other BPE models if available.
  if [ ! -f $lang_char_dir/bpe.model ]; then
    wget -O $lang_char_dir/bpe.model \
      https://huggingface.co/luomingshuang/bpe_models_trained_with_Librispeech/resolve/main/lang_bpe_5000/bpe.model
  fi
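
  # Optional sanity check that the downloaded BPE model loads (an
  # illustrative sketch, assuming the sentencepiece Python package is installed):
  #   python3 -c "import sentencepiece as spm; sp = spm.SentencePieceProcessor(); sp.load('data/lang_char/bpe.model'); print(sp.vocab_size())"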

  # Prepare text.
  # Note: on Linux, you can install jq with the following commands:
  #   1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
  #   2. chmod +x ./jq
  #   3. cp jq /usr/bin
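  # (Alternatively, `jq -r ".text"` prints raw strings without the
  # surrounding quotes and could replace the `jq ".text" | sed 's/"//g'`
  # steps below; the original pipeline is kept unchanged here.)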
  if [ ! -f $lang_char_dir/text_full ]; then
    gunzip -c data/manifests/tal_csasr/tal_csasr_supervisions_train_set.jsonl.gz \
      | jq ".text" | sed 's/"//g' \
      | ./local/text2token.py -t "char" > $lang_char_dir/text_train

    gunzip -c data/manifests/tal_csasr/tal_csasr_supervisions_dev_set.jsonl.gz \
      | jq ".text" | sed 's/"//g' \
      | ./local/text2token.py -t "char" > $lang_char_dir/text_dev

    gunzip -c data/manifests/tal_csasr/tal_csasr_supervisions_test_set.jsonl.gz \
      | jq ".text" | sed 's/"//g' \
      | ./local/text2token.py -t "char" > $lang_char_dir/text_test

    for r in text_train text_dev text_test; do
      cat $lang_char_dir/$r >> $lang_char_dir/text_full
    done
  fi

  # Normalize the text.
  if [ ! -f $lang_char_dir/text ]; then
    python ./local/text_normalize.py \
      --input $lang_char_dir/text_full \
      --output $lang_char_dir/text
  fi

  # Prepare word segments.
  if [ ! -f $lang_char_dir/text_words_segmentation ]; then
    python ./local/text2segments.py \
      --input $lang_char_dir/text \
      --output $lang_char_dir/text_words_segmentation

    # Split on spaces, drop empty lines, and deduplicate
    # (sort -u already removes duplicates, so no extra uniq is needed).
    sed "s/ /\n/g" $lang_char_dir/text_words_segmentation \
      | sort -u | sed "/^$/d" > $lang_char_dir/words_no_ids.txt
  fi
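
  # words_no_ids.txt ends up with one unique token per line, e.g.
  # (illustrative only; actual contents depend on the corpus):
  #   hello
  #   你好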

  # Prepare words.txt
  if [ ! -f $lang_char_dir/words.txt ]; then
    ./local/prepare_words.py \
      --input $lang_char_dir/words_no_ids.txt \
      --output $lang_char_dir/words.txt
  fi
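
  # words.txt maps each word to an integer id, one "word id" pair per line,
  # e.g. (illustrative; the exact entries are produced by prepare_words.py):
  #   <eps> 0
  #   你好 1
  #   hello 2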

  # Tokenize the text with the BPE model.
  python ./local/tokenize_with_bpe_model.py \
    --input $lang_char_dir/text \
    --output $lang_char_dir/text_with_bpe \
    --bpe-model $lang_char_dir/bpe.model

  if [ ! -f $lang_char_dir/L_disambig.pt ]; then
    python local/prepare_char.py
  fi
fi