Commit 63a64e7: Smart split
vimalmanohar committed Oct 20, 2018 (1 parent: 2668098)
Showing 90 changed files with 8,119 additions and 498 deletions.

egs/babel/s5d/local/chain/run_ivector_common.sh (48 additions, 45 deletions)
@@ -16,9 +16,10 @@ gmm=tri5_cleaned # This specifies a GMM-dir from the features
                  # of the type you're training the system on;
                  # it should contain alignments for 'train_set'.
 langdir=data/langp/tri5_ali
-
+generate_alignments=true # Set to false to skip alignment generation
 num_threads_ubm=12
 nnet3_affix=_cleaned
+extractor=  # If supplied, uses this extractor instead of training a new one

 . ./cmd.sh
 . ./path.sh

@@ -57,7 +58,7 @@ if [ $stage -le 1 ]; then
   utils/fix_data_dir.sh data/${train_set}_sp
 fi

-if [ $stage -le 2 ]; then
+if $generate_alignments && [ $stage -le 2 ]; then
   echo "$0: aligning with the perturbed low-resolution data"
   steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
     data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1
@@ -93,53 +94,55 @@ if [ $stage -le 3 ]; then
     steps/compute_cmvn_stats.sh \
       data/${datadir}_hires_nopitch exp/make_hires/${datadir}_nopitch $mfccdir || exit 1;
     utils/fix_data_dir.sh data/${datadir}_hires_nopitch
   done
 fi

-if [ $stage -le 4 ]; then
-  echo "$0: computing a subset of data to train the diagonal UBM."
-
-  mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
-  temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
-
-  # train a diagonal UBM using a subset of about a quarter of the data
-  # we don't use the _comb data for this as there is no need for compatibility with
-  # the alignments, and using the non-combined data is more efficient for I/O
-  # (no messing about with piped commands).
-  num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
-  if [ $num_utts_total -le 14000 ] ; then
-    num_utts=14000
-  else
-    num_utts=$num_utts_total
-  fi
-  utils/data/subset_data_dir.sh data/${train_set}_sp_hires_nopitch \
-    $num_utts ${temp_data_root}/${train_set}_sp_hires_nopitch_subset
-
-  echo "$0: computing a PCA transform from the hires data."
-  steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
-    --splice-opts "--left-context=3 --right-context=3" \
-    --max-utts 10000 --subsample 2 \
-    ${temp_data_root}/${train_set}_sp_hires_nopitch_subset \
-    exp/nnet3${nnet3_affix}/pca_transform
-
-  echo "$0: training the diagonal UBM."
-  # Use 512 Gaussians in the UBM.
-  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
-    --num-frames 700000 \
-    --num-threads $num_threads_ubm \
-    ${temp_data_root}/${train_set}_sp_hires_nopitch_subset 512 \
-    exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
-fi
-
-if [ $stage -le 5 ]; then
-  # Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors
-  # can be sensitive to the amount of data. The script defaults to an iVector dimension of
-  # 100.
-  echo "$0: training the iVector extractor"
-  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
-    data/${train_set}_sp_hires_nopitch exp/nnet3${nnet3_affix}/diag_ubm \
-    exp/nnet3${nnet3_affix}/extractor || exit 1;
-fi
+if [ -z "$extractor" ]; then
+  if [ $stage -le 4 ]; then
+    echo "$0: computing a subset of data to train the diagonal UBM."
+
+    mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
+    temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
+
+    # train a diagonal UBM using a subset of about a quarter of the data
+    # we don't use the _comb data for this as there is no need for compatibility with
+    # the alignments, and using the non-combined data is more efficient for I/O
+    # (no messing about with piped commands).
+    num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
+    if [ $num_utts_total -le 14000 ] ; then
+      num_utts=14000
+    else
+      num_utts=$num_utts_total
+    fi
+    utils/data/subset_data_dir.sh data/${train_set}_sp_hires_nopitch \
+      $num_utts ${temp_data_root}/${train_set}_sp_hires_nopitch_subset
+
+    echo "$0: computing a PCA transform from the hires data."
+    steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
+      --splice-opts "--left-context=3 --right-context=3" \
+      --max-utts 10000 --subsample 2 \
+      ${temp_data_root}/${train_set}_sp_hires_nopitch_subset \
+      exp/nnet3${nnet3_affix}/pca_transform
+
+    echo "$0: training the diagonal UBM."
+    # Use 512 Gaussians in the UBM.
+    steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
+      --num-frames 700000 \
+      --num-threads $num_threads_ubm \
+      ${temp_data_root}/${train_set}_sp_hires_nopitch_subset 512 \
+      exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
+  fi
+
+  if [ $stage -le 5 ]; then
+    # Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors
+    # can be sensitive to the amount of data. The script defaults to an iVector dimension of
+    # 100.
+    echo "$0: training the iVector extractor"
+    steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
+      data/${train_set}_sp_hires_nopitch exp/nnet3${nnet3_affix}/diag_ubm \
+      exp/nnet3${nnet3_affix}/extractor || exit 1;
+  fi
+  extractor=exp/nnet3${nnet3_affix}/extractor
+fi
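The shape of this change is the usual shell idiom for optional resources: build only when the caller left the variable empty, then point the variable at the result so every later stage reads $extractor without caring where it came from. A minimal standalone sketch of the idiom, with illustrative names that are not part of the recipe:

#!/usr/bin/env bash
# Sketch of the build-if-absent pattern used above; names are hypothetical.
resource=   # the caller may pre-set this, e.g. resource=exp/shared/thing
if [ -z "$resource" ]; then
  # the expensive construction runs only when nothing was supplied
  mkdir -p exp/local_thing
  touch exp/local_thing/final.mdl   # stand-in for the real training steps
  resource=exp/local_thing
fi
echo "later stages consume: $resource"   # origin-agnostic from here on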
@@ -166,7 +169,7 @@ if [ $stage -le 6 ]; then

   steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
     ${temp_data_root}/${train_set}_sp_hires_nopitch_max2 \
-    exp/nnet3${nnet3_affix}/extractor $ivectordir
+    $extractor $ivectordir

 fi
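Taken together, the changes let a caller hand this script a ready-made extractor and skip both alignment generation and extractor training. A sketch of such an invocation, assuming the script's standard Kaldi option parsing (utils/parse_options.sh) maps --generate-alignments and --extractor onto the variables above; the extractor path is hypothetical:

# Hypothetical reuse of an extractor trained elsewhere; the path below is
# an example, not part of this commit.
local/chain/run_ivector_common.sh \
  --stage 0 --nj 30 \
  --train-set train_cleaned \
  --gmm tri5_cleaned \
  --nnet3-affix _cleaned \
  --generate-alignments false \
  --extractor exp/multilingual/nnet3_cleaned/extractor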

egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh (13 additions, 5 deletions)
@@ -20,22 +20,25 @@ set -e -o pipefail

 # First the options that are passed through to run_ivector_common.sh
 # (some of which are also used in this script directly).
-stage=17
+stage=0
 nj=30
 dropout_schedule='0,0@0.20,0.5@0.50,0'
 train_set=train_cleaned
 gmm=tri5_cleaned # the gmm for the target data
 langdir=data/langp/tri5_ali
 num_threads_ubm=12
 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
+num_epochs=4
+extractor=  # Use a pre-trained i-vector extractor

 # The rest are configs specific to this script. Most of the parameters
 # are just hardcoded at this level, in the commands below.
 train_stage=-10
 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
 tdnn_affix="_bab7" # affix for TDNN directory, e.g. "a" or "b", in case we change the configuration.
-common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs.
+common_egs_dir= # you can set this to use previously dumped egs.
 chunk_width=150,120,90,75
+chunk_left_context=40

 # End configuration section.
 echo "$0 $@" # Print the command line for logging
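For reference, dropout_schedule uses Kaldi's piecewise-linear schedule notation: a comma-separated list of value@fraction points, with bare leading and trailing values pinned to fractions 0 and 1. So '0,0@0.20,0.5@0.50,0' keeps dropout at 0 for the first 20% of training, ramps it to 0.5 by the halfway point, and decays it back to 0 at the end. A rough illustration of the interpolation, not Kaldi's own implementation:

#!/usr/bin/env bash
# Illustrative only: evaluate the schedule '0,0@0.20,0.5@0.50,0' at a
# training fraction t in [0,1] by linear interpolation between points.
t=0.35
awk -v t="$t" 'BEGIN {
  n = split("0@0.00 0@0.20 0.5@0.50 0@1.00", pts, " ")
  for (i = 1; i < n; i++) {
    split(pts[i], a, "@"); split(pts[i+1], b, "@")
    if (t >= a[2] && t <= b[2]) {
      printf "dropout at t=%.2f is %.3f\n", t,
             a[1] + (b[1] - a[1]) * (t - a[2]) / (b[2] - a[2])
      exit
    }
  }
}'
# prints: dropout at t=0.35 is 0.250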
Expand All @@ -58,7 +61,8 @@ local/chain/run_ivector_common.sh --stage $stage \
--train-set $train_set \
--gmm $gmm \
--num-threads-ubm $num_threads_ubm \
--nnet3-affix "$nnet3_affix"
--nnet3-affix "$nnet3_affix" \
--extractor "$extractor"


gmm_dir=exp/$gmm
@@ -194,11 +198,15 @@ if [ $stage -le 18 ]; then
     --chain.apply-deriv-weights false \
     --chain.lm-opts="--num-extra-lm-states=2000" \
     --egs.dir "$common_egs_dir" \
+    --egs.chunk-left-context $chunk_left_context \
+    --egs.chunk-right-context 0 \
+    --egs.chunk-left-context-initial 0 \
+    --egs.chunk-right-context-final 0 \
+    --egs.opts "--frames-overlap-per-eg 0" \
     --egs.chunk-width $chunk_width \
-    --trainer.num-chunk-per-minibatch 128 \
+    --trainer.num-chunk-per-minibatch 128,64 \
     --trainer.frames-per-iter 1500000 \
-    --trainer.num-epochs 4 \
+    --trainer.num-epochs $num_epochs \
     --trainer.optimization.num-jobs-initial 2 \
     --trainer.optimization.num-jobs-final 12 \
     --trainer.dropout-schedule $dropout_schedule \
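With extractor, num_epochs, and the egs chunk options now surfaced as script-level variables, the tuning script can be driven from the command line instead of being edited in place. A sketch of a run that reuses an existing extractor; the path is again hypothetical:

# Hypothetical invocation; the extractor path is an example only.
local/chain/tuning/run_tdnn_lstm_bab7.sh \
  --stage 0 \
  --num-epochs 4 \
  --extractor exp/multilingual/nnet3_cleaned/extractor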

[diff for the remaining 88 changed files not shown]