Commit 63a64e7: Smart split
vimalmanohar committed Oct 20, 2018 (1 parent: 2668098)
Showing 90 changed files with 8,119 additions and 498 deletions.

egs/babel/s5d/local/chain/run_ivector_common.sh (48 additions, 45 deletions)
@@ -16,9 +16,10 @@ gmm=tri5_cleaned # This specifies a GMM-dir from the features
                  # of the type you're training the system on;
                  # it should contain alignments for 'train_set'.
 langdir=data/langp/tri5_ali
-
+generate_alignments=true # Set to false to skip alignment generation
 num_threads_ubm=12
 nnet3_affix=_cleaned
+extractor=  # If supplied, uses this extractor instead of training a new one

 . ./cmd.sh
 . ./path.sh

@@ -57,7 +58,7 @@ if [ $stage -le 1 ]; then
   utils/fix_data_dir.sh data/${train_set}_sp
 fi

-if [ $stage -le 2 ]; then
+if $generate_alignments && [ $stage -le 2 ]; then
   echo "$0: aligning with the perturbed low-resolution data"
   steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
     data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1
@@ -93,53 +94,55 @@ if [ $stage -le 3 ]; then
     steps/compute_cmvn_stats.sh \
       data/${datadir}_hires_nopitch exp/make_hires/${datadir}_nopitch $mfccdir || exit 1;
     utils/fix_data_dir.sh data/${datadir}_hires_nopitch
   done
 fi

-if [ $stage -le 4 ]; then
-  echo "$0: computing a subset of data to train the diagonal UBM."
-
-  mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
-  temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
-
-  # train a diagonal UBM using a subset of about a quarter of the data
-  # we don't use the _comb data for this as there is no need for compatibility with
-  # the alignments, and using the non-combined data is more efficient for I/O
-  # (no messing about with piped commands).
-  num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
-  if [ $num_utts_total -le 14000 ] ; then
-    num_utts=14000
-  else
-    num_utts=$num_utts_total
-  fi
-  utils/data/subset_data_dir.sh data/${train_set}_sp_hires_nopitch \
-    $num_utts ${temp_data_root}/${train_set}_sp_hires_nopitch_subset
-
-  echo "$0: computing a PCA transform from the hires data."
-  steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
-    --splice-opts "--left-context=3 --right-context=3" \
-    --max-utts 10000 --subsample 2 \
-    ${temp_data_root}/${train_set}_sp_hires_nopitch_subset \
-    exp/nnet3${nnet3_affix}/pca_transform
-
-  echo "$0: training the diagonal UBM."
-  # Use 512 Gaussians in the UBM.
-  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
-    --num-frames 700000 \
-    --num-threads $num_threads_ubm \
-    ${temp_data_root}/${train_set}_sp_hires_nopitch_subset 512 \
-    exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
-fi
-
-if [ $stage -le 5 ]; then
-  # Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors
-  # can be sensitive to the amount of data. The script defaults to an iVector dimension of
-  # 100.
-  echo "$0: training the iVector extractor"
-  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
-    data/${train_set}_sp_hires_nopitch exp/nnet3${nnet3_affix}/diag_ubm \
-    exp/nnet3${nnet3_affix}/extractor || exit 1;
-fi
+if [ -z "$extractor" ]; then
+  if [ $stage -le 4 ]; then
+    echo "$0: computing a subset of data to train the diagonal UBM."
+
+    mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
+    temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
+
+    # train a diagonal UBM using a subset of about a quarter of the data
+    # we don't use the _comb data for this as there is no need for compatibility with
+    # the alignments, and using the non-combined data is more efficient for I/O
+    # (no messing about with piped commands).
+    num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
+    if [ $num_utts_total -le 14000 ] ; then
+      num_utts=14000
+    else
+      num_utts=$num_utts_total
+    fi
+    utils/data/subset_data_dir.sh data/${train_set}_sp_hires_nopitch \
+      $num_utts ${temp_data_root}/${train_set}_sp_hires_nopitch_subset
+
+    echo "$0: computing a PCA transform from the hires data."
+    steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
+      --splice-opts "--left-context=3 --right-context=3" \
+      --max-utts 10000 --subsample 2 \
+      ${temp_data_root}/${train_set}_sp_hires_nopitch_subset \
+      exp/nnet3${nnet3_affix}/pca_transform
+
+    echo "$0: training the diagonal UBM."
+    # Use 512 Gaussians in the UBM.
+    steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
+      --num-frames 700000 \
+      --num-threads $num_threads_ubm \
+      ${temp_data_root}/${train_set}_sp_hires_nopitch_subset 512 \
+      exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
+  fi
+
+  if [ $stage -le 5 ]; then
+    # Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors
+    # can be sensitive to the amount of data. The script defaults to an iVector dimension of
+    # 100.
+    echo "$0: training the iVector extractor"
+    steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
+      data/${train_set}_sp_hires_nopitch exp/nnet3${nnet3_affix}/diag_ubm \
+      exp/nnet3${nnet3_affix}/extractor || exit 1;
+  fi
+  extractor=exp/nnet3${nnet3_affix}/extractor
+fi
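The shape of this change is the usual shell idiom for optional resources: build only when the caller left the variable empty, then point the variable at the result so every later stage reads $extractor without caring where it came from. A minimal standalone sketch of the idiom, with illustrative names that are not part of the recipe:

#!/usr/bin/env bash
# Sketch of the build-if-absent pattern used above; names are hypothetical.
resource=   # the caller may pre-set this, e.g. resource=exp/shared/thing
if [ -z "$resource" ]; then
  # the expensive construction runs only when nothing was supplied
  mkdir -p exp/local_thing
  touch exp/local_thing/final.mdl   # stand-in for the real training steps
  resource=exp/local_thing
fi
echo "later stages consume: $resource"   # origin-agnostic from here on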
@@ -166,7 +169,7 @@ if [ $stage -le 6 ]; then

   steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
     ${temp_data_root}/${train_set}_sp_hires_nopitch_max2 \
-    exp/nnet3${nnet3_affix}/extractor $ivectordir
+    $extractor $ivectordir

 fi
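Taken together, the changes let a caller hand this script a ready-made extractor and skip both alignment generation and extractor training. A sketch of such an invocation, assuming the script's standard Kaldi option parsing (utils/parse_options.sh) maps --generate-alignments and --extractor onto the variables above; the extractor path is hypothetical:

# Hypothetical reuse of an extractor trained elsewhere; the path below is
# an example, not part of this commit.
local/chain/run_ivector_common.sh \
  --stage 0 --nj 30 \
  --train-set train_cleaned \
  --gmm tri5_cleaned \
  --nnet3-affix _cleaned \
  --generate-alignments false \
  --extractor exp/multilingual/nnet3_cleaned/extractor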

egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh (13 additions, 5 deletions)
@@ -20,22 +20,25 @@ set -e -o pipefail

 # First the options that are passed through to run_ivector_common.sh
 # (some of which are also used in this script directly).
-stage=17
+stage=0
 nj=30
 dropout_schedule='0,0@0.20,0.5@0.50,0'
 train_set=train_cleaned
 gmm=tri5_cleaned # the gmm for the target data
 langdir=data/langp/tri5_ali
 num_threads_ubm=12
 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
+num_epochs=4
+extractor=  # Use a pre-trained i-vector extractor

 # The rest are configs specific to this script. Most of the parameters
 # are just hardcoded at this level, in the commands below.
 train_stage=-10
 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
 tdnn_affix="_bab7" # affix for TDNN directory, e.g. "a" or "b", in case we change the configuration.
-common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs.
+common_egs_dir= # you can set this to use previously dumped egs.
 chunk_width=150,120,90,75
+chunk_left_context=40

 # End configuration section.
 echo "$0 $@" # Print the command line for logging
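For reference, dropout_schedule uses Kaldi's piecewise-linear schedule notation: a comma-separated list of value@fraction points, with bare leading and trailing values pinned to fractions 0 and 1. So '0,0@0.20,0.5@0.50,0' keeps dropout at 0 for the first 20% of training, ramps it to 0.5 by the halfway point, and decays it back to 0 at the end. A rough illustration of the interpolation, not Kaldi's own implementation:

#!/usr/bin/env bash
# Illustrative only: evaluate the schedule '0,0@0.20,0.5@0.50,0' at a
# training fraction t in [0,1] by linear interpolation between points.
t=0.35
awk -v t="$t" 'BEGIN {
  n = split("0@0.00 0@0.20 0.5@0.50 0@1.00", pts, " ")
  for (i = 1; i < n; i++) {
    split(pts[i], a, "@"); split(pts[i+1], b, "@")
    if (t >= a[2] && t <= b[2]) {
      printf "dropout at t=%.2f is %.3f\n", t,
             a[1] + (b[1] - a[1]) * (t - a[2]) / (b[2] - a[2])
      exit
    }
  }
}'
# prints: dropout at t=0.35 is 0.250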
Expand All @@ -58,7 +61,8 @@ local/chain/run_ivector_common.sh --stage $stage \
--train-set $train_set \
--gmm $gmm \
--num-threads-ubm $num_threads_ubm \
--nnet3-affix "$nnet3_affix"
--nnet3-affix "$nnet3_affix" \
--extractor "$extractor"


gmm_dir=exp/$gmm
@@ -194,11 +198,15 @@ if [ $stage -le 18 ]; then
     --chain.apply-deriv-weights false \
     --chain.lm-opts="--num-extra-lm-states=2000" \
     --egs.dir "$common_egs_dir" \
+    --egs.chunk-left-context $chunk_left_context \
+    --egs.chunk-right-context 0 \
+    --egs.chunk-left-context-initial 0 \
+    --egs.chunk-right-context-final 0 \
+    --egs.opts "--frames-overlap-per-eg 0" \
     --egs.chunk-width $chunk_width \
-    --trainer.num-chunk-per-minibatch 128 \
+    --trainer.num-chunk-per-minibatch 128,64 \
     --trainer.frames-per-iter 1500000 \
-    --trainer.num-epochs 4 \
+    --trainer.num-epochs $num_epochs \
     --trainer.optimization.num-jobs-initial 2 \
     --trainer.optimization.num-jobs-final 12 \
     --trainer.dropout-schedule $dropout_schedule \
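With extractor, num_epochs, and the egs chunk options now surfaced as script-level variables, the tuning script can be driven from the command line instead of being edited in place. A sketch of a run that reuses an existing extractor; the path is again hypothetical:

# Hypothetical invocation; the extractor path is an example only.
local/chain/tuning/run_tdnn_lstm_bab7.sh \
  --stage 0 \
  --num-epochs 4 \
  --extractor exp/multilingual/nnet3_cleaned/extractor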

[diff for the remaining 88 changed files not shown]