Commit 5a704db: initial commit
daanzu committed Aug 13, 2021 (0 parents)
Showing 14 changed files with 2,039 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*
51 changes: 51 additions & 0 deletions Dockerfile.training
@@ -0,0 +1,51 @@
# cat Dockerfile.training | docker build -f - --build-arg=base=debian:9.8 -t daanzu/kaldi_ag_training .
# cat Dockerfile.training | docker build -f - --build-arg=base=nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 --build-arg=cuda=yes -t daanzu/kaldi_ag_training_gpu .

ARG base
ARG cuda=

FROM $base
LABEL maintainer="[email protected]"
ARG cuda

RUN apt-get update && \
apt-get install -y --no-install-recommends \
g++ \
make \
automake \
autoconf \
bzip2 \
unzip \
wget \
sox \
libtool \
git \
subversion \
python2.7 \
python3 \
zlib1g-dev \
ca-certificates \
gfortran \
patch \
ffmpeg \
vim && \
rm -rf /var/lib/apt/lists/*

RUN ln -s /usr/bin/python2.7 /usr/bin/python

RUN git clone --depth 1 https://github.com/daanzu/kaldi-fork-active-grammar /opt/kaldi && \
cd /opt/kaldi/tools && \
./extras/install_mkl.sh && \
make -j $(nproc) && \
cd /opt/kaldi/src && \
./configure --shared ${cuda:+--use-cuda} && \
make depend -j $(nproc) && \
make -j $(nproc) && \
find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
find /opt/intel -type f -name "*.a" -exec rm {} \; && \
find /opt/intel -type f -regex '.*\(_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \
rm -rf /opt/kaldi/.git

# The MKL *_mc libraries are kept (not removed above) so that pre-AVX CPUs still work.

WORKDIR /opt/kaldi/
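
A quick smoke test after building (a sketch only; use whatever tag you passed to `-t`) confirms the Kaldi tree made it into the image:

```bash
# Sketch: list a few of the compiled Kaldi binaries inside the CPU image.
docker run --rm daanzu/kaldi_ag_training ls /opt/kaldi/src/bin | head
```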
661 changes: 661 additions & 0 deletions LICENSE

(Full license text not shown.)

54 changes: 54 additions & 0 deletions README.md
@@ -0,0 +1,54 @@
# Kaldi AG Training Setup

[![Donate](https://img.shields.io/badge/donate-GitHub-pink.svg)](https://github.com/sponsors/daanzu)
[![Donate](https://img.shields.io/badge/donate-Patreon-orange.svg)](https://www.patreon.com/daanzu)
[![Donate](https://img.shields.io/badge/donate-PayPal-green.svg)](https://paypal.me/daanzu)

Docker image and scripts for training finetuned or completely personal speech models. Particularly for use with [kaldi-active-grammar](https://github.com/daanzu/kaldi-active-grammar).

## Usage

All commands are run in the Docker container as follows. Training on the CPU should work, just much more slowly; to do so, remove the `--runtime=nvidia` flag and use the CPU image `daanzu/kaldi_ag_training:2021-08-04` instead of the GPU image.

```bash
docker run -it --rm -v $(pwd):/mnt/input -w /mnt/input --user "$(id -u):$(id -g)" \
--runtime=nvidia daanzu/kaldi_ag_training_gpu:2021-08-04 \
[command and args...]
```

Example commands:

```bash
# Prepare training dataset files
python3 convert_tsv_to_scp.py -l kaldi_model_daanzu_20200905_1ep-mediumlm-base/dict/lexicon.txt yourdata.tsv [optional output directory]

# Pick only one of the following:
# Run finetune training, with default settings
docker run [...] bash run.finetune.sh kaldi_model_daanzu_20200905_1ep-mediumlm-base dataset
# Run completely personal training, with default settings
docker run [...] bash run.personal.sh kaldi_model_daanzu_20200905_1ep-mediumlm-base dataset

# When training completes, export trained model
python3 export_trained_model.py {finetune,personal} [optional output directory]
```

### Notes

* To run either training, you must have a base model to use as a template. (For finetuning this is also the starting point of the model; for personal it is only a source of basic info.) You can use [this base model](https://github.com/daanzu/kaldi_ag_training/releases/download/v0.1.0/kaldi_model_daanzu_20200905_1ep-mediumlm-base.zip) from this project's release page. Download the zip file and extract it to the root directory of this repo, so the directory `kaldi_model_daanzu_20200905_1ep-mediumlm-base` is here.

* Kaldi requires the training data metadata to be in the SCP format, which is an annoying multi-file format. To convert the standard KaldiAG TSV format to SCP, you can run `python3 convert_tsv_to_scp.py yourdata.tsv dataset` to output SCP format into a new directory `dataset` (see the sketch after these notes). You can run these commands within the Docker container, or directly using your own Python environment.
    * Even better, run `python3 convert_tsv_to_scp.py -l kaldi_model_daanzu_20200905_1ep-mediumlm-base/dict/lexicon.txt yourdata.tsv dataset` to filter out utterances containing out-of-vocabulary (OOV) words, which are not currently well supported by these training scripts.

* The audio data should be 16-bit signed-integer PCM, 1-channel, 16kHz WAV files (a sample `sox` conversion is sketched after these notes). Note that the audio must be accessible within the Docker container, so it can't be behind a symlink that points outside this repo directory, which is what is shared with the Docker container.

* There are some directory names you should avoid using in this repo directory, because the scripts will create & use them during training. Avoid: `conf`, `data`, `exp`, `extractor`, `mfcc`, `steps`, `tree_sp`, `utils`.

* Training may use a lot of storage. You may want to locate this directory somewhere with ample room available.

* The training commands (`run.*.sh`) accept many optional parameters; they are not yet fully documented. Notable ones:

    * `--stage n` : Skip ahead to the given stage, e.g. to resume an interrupted run.
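
As a concrete sketch of the TSV-to-SCP conversion mentioned above (the TSV line is hypothetical; the converter reads the wav path from column 1 and the transcript from column 5):

```bash
# One line of yourdata.tsv (tab-separated; columns 2-4 are ignored by the converter):
#   audio/utt0001.wav  ...  ...  ...  hello world
python3 convert_tsv_to_scp.py yourdata.tsv dataset
# dataset/utt2spk : "utt0001 utt0001"            (each utterance is its own speaker)
# dataset/wav.scp : "utt0001 audio/utt0001.wav"
# dataset/text    : "utt0001 hello world"
```

And if your recordings are not already in the required format, a `sox` invocation along these lines (a sketch; adjust filenames) resamples to 16-bit signed PCM, mono, 16kHz:

```bash
sox input.wav -r 16000 -b 16 -c 1 -e signed-integer output.wav
```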

## License

This project is licensed under the GNU Affero General Public License v3 (AGPL-3.0-or-later). See the [LICENSE file](LICENSE) for details. If this license is problematic for you, please contact me.
4 changes: 4 additions & 0 deletions cmd.sh
@@ -0,0 +1,4 @@
export train_cmd="utils/run.pl"
export decode_cmd="utils/run.pl"
export cuda_cmd="utils/run.pl"
# export cuda_cmd="utils/run.pl -l gpu=1"
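
These all default to Kaldi's `run.pl`, which runs every job on the local machine. As a sketch only (not used by these scripts as shipped), Kaldi's `queue.pl`, which lives alongside `run.pl` in `utils/`, could be substituted to submit jobs to a Sun Grid Engine cluster:

```bash
export train_cmd="utils/queue.pl --mem 4G"
export cuda_cmd="utils/queue.pl --gpu 1"
```
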
45 changes: 45 additions & 0 deletions convert_tsv_to_scp.py
@@ -0,0 +1,45 @@
#!/usr/bin/env python3

import argparse, os

parser = argparse.ArgumentParser(description='Convert a TSV file to Kaldi SCP files.')
parser.add_argument('filename', help='The TSV file to convert.')
parser.add_argument('output_dir', nargs='?', default='dataset', help='The directory to save the output files.')
parser.add_argument('-l', '--lexicon_file', help='The name of the lexicon file, for filtering out out-of-vocabulary utterances.')
args = parser.parse_args()

if not os.path.exists(args.filename):
    raise Exception('File does not exist: %s' % args.filename)
os.makedirs(args.output_dir, exist_ok=True)

# Load the lexicon's word list (first column; the remaining columns are the pronunciation).
lexicon = set()
if args.lexicon_file:
    with open(args.lexicon_file, 'r') as f:
        for line in f:
            word = line.strip().split()[0]
            lexicon.add(word)

utt2spk_dict, wav_dict, text_dict = {}, {}, {}
with open(args.filename, 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        text = fields[4]
        wav_path = fields[0]
        utt_id = os.path.splitext(os.path.basename(wav_path))[0]
        # Drop the whole utterance if it contains any out-of-vocabulary word.
        if lexicon and any(word not in lexicon for word in text.split()):
            continue
        utt2spk_dict[utt_id] = utt_id
        wav_dict[utt_id] = wav_path
        text_dict[utt_id] = text

# Kaldi expects these files sorted in C-locale order.
with open(os.path.join(args.output_dir, 'utt2spk'), 'w') as f:
    for (key, val) in sorted(utt2spk_dict.items()):
        f.write('%s %s\n' % (key, val))
with open(os.path.join(args.output_dir, 'wav.scp'), 'w') as f:
    for (key, val) in sorted(wav_dict.items()):
        f.write('%s %s\n' % (key, val))
with open(os.path.join(args.output_dir, 'text'), 'w') as f:
    for (key, val) in sorted(text_dict.items()):
        f.write('%s %s\n' % (key, val))

print(f"Wrote training dataset to {args.output_dir}")
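
Note the filtering behavior: with `-l`, an utterance is dropped entirely if any of its words is missing from the lexicon's first column. For example:

```bash
# Convert, dropping utterances containing out-of-vocabulary words:
python3 convert_tsv_to_scp.py -l kaldi_model_daanzu_20200905_1ep-mediumlm-base/dict/lexicon.txt \
    yourdata.tsv dataset
```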
34 changes: 34 additions & 0 deletions export_trained_model.py
@@ -0,0 +1,34 @@
#!/usr/bin/env python3

import argparse, os, shutil

parser = argparse.ArgumentParser(description='Export trained model.')
parser.add_argument('type', choices=('personal', 'finetune'), help='Type of trained model.')
parser.add_argument('output_dir', nargs='?', default='exported_model', help='Directory to save the output model.')
parser.add_argument('-b', '--base_model_dir', default='kaldi_model_daanzu_20200905_1ep-mediumlm-base', help='Directory of model to copy base files from.')
args = parser.parse_args()

if not os.path.exists(args.base_model_dir):
    raise Exception('Base model directory does not exist.')
if os.path.exists(args.output_dir):
    raise Exception('Output directory already exists.')
# copytree creates args.output_dir itself (and fails if it already exists).
shutil.copytree(args.base_model_dir, args.output_dir, ignore=shutil.ignore_patterns('dict', 'tree_stuff'))
os.makedirs(os.path.join(args.output_dir, 'training'), exist_ok=True)

if args.type == 'personal':
    # Personal training leaves its final model in exp/chain/tdnn1h_sp_online.
    for name in 'final.mdl tree'.split():
        shutil.copy2(os.path.join('exp/chain/tdnn1h_sp_online', name), args.output_dir)
    for name in 'final.dubm final.ie final.mat global_cmvn.stats'.split():
        shutil.copy2(os.path.join('exp/chain/tdnn1h_sp_online', 'ivector_extractor', name), os.path.join(args.output_dir, 'ivector_extractor'))
    shutil.copy2('exp/chain/tdnn1h_sp/accuracy.report', os.path.join(args.output_dir, 'training'))
    shutil.copy2('params.txt', os.path.join(args.output_dir, 'training'))

elif args.type == 'finetune':
    # Finetuning leaves its final model in exp/nnet3_chain/finetune.
    shutil.copy2('exp/nnet3_chain/finetune/final.mdl', args.output_dir)
    shutil.copy2('exp/nnet3_chain/finetune/accuracy.report', os.path.join(args.output_dir, 'training'))

print(f"Wrote exported {args.type} model to {args.output_dir}")
print("NOTE: You still must run the following in your kaldi-active-grammar python environment:")
print("python -m kaldi_active_grammar compile_agf_dictation_graph -v -m [model_dir] G.fst")
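
A minimal end-to-end export, assuming a completed finetune run in the working directory (the second command is the one the script itself prints, run from your kaldi-active-grammar environment):

```bash
python3 export_trained_model.py finetune my_model
python -m kaldi_active_grammar compile_agf_dictation_graph -v -m my_model G.fst
```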
4 changes: 4 additions & 0 deletions path.sh
@@ -0,0 +1,4 @@
export KALDI_ROOT=/opt/kaldi
export LD_LIBRARY_PATH="$KALDI_ROOT/tools/openfst-1.3.4/lib:$KALDI_ROOT/src/lib:$LD_LIBRARY_PATH"
export PATH=$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/../kaldi_lm/:$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/chainbin:$KALDI_ROOT/src/rnnlmbin:$PWD:$PATH
export LC_ALL=C
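
A quick sanity check (inside the container) that sourcing this resolves the Kaldi binaries, offered only as a sketch:

```bash
. ./path.sh && command -v fstcompile copy-transition-model
```
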
84 changes: 84 additions & 0 deletions run.finetune.sh
@@ -0,0 +1,84 @@
# docker run -it --rm -v $(pwd):/mnt/input -v $(pwd)/work:/mnt/work -w /mnt/work --user "$(id -u):$(id -g)" daanzu/kaldi_ag_training:2020-11-28 bash run.finetune.sh models/kaldi_model_daanzu_20200905_1ep-mediumlm data/standard2train --num-epochs 5 --train-stage -10 --stage 1
# docker run -it --rm -v $(pwd):/mnt/input -v $(pwd)/work:/mnt/work -w /mnt/work --user "$(id -u):$(id -g)" --runtime=nvidia daanzu/kaldi_ag_training_gpu:2020-11-28 bash run.finetune.sh models/kaldi_model_daanzu_20200905_1ep-mediumlm data/standard2train --num-epochs 5 --train-stage -10 --stage 1

set -euxo pipefail

nice_cmd="nice ionice -c idle"

[[ $# -ge 2 ]] || exit 1

model=/mnt/input/$1; shift
dataset=/mnt/input/$1; shift

[[ -d $model ]] || exit 1
[[ -d $dataset ]] || exit 1

echo "base_model=${model#/mnt/input/}" >> params.txt
echo "train_dataset=${dataset#/mnt/input/}" >> params.txt

cat <<\EOF > cmd.sh
export train_cmd="utils/run.pl"
export decode_cmd="utils/run.pl"
export cuda_cmd="utils/run.pl"
# export cuda_cmd="utils/run.pl -l gpu=1"
EOF
cat <<\EOF > path.sh
export KALDI_ROOT=/opt/kaldi
export LD_LIBRARY_PATH="$KALDI_ROOT/tools/openfst/lib:$KALDI_ROOT/tools/openfst/lib/fst:$KALDI_ROOT/src/lib:$LD_LIBRARY_PATH"
export PATH=$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/../kaldi_lm/:$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/chainbin:$KALDI_ROOT/src/rnnlmbin:$PWD:$PATH
export LC_ALL=C
EOF
ln -sf /opt/kaldi/egs/wsj/s5/steps
ln -sf /opt/kaldi/egs/wsj/s5/utils

mkdir -p conf data/{lang/phones,finetune} exp extractor
cp $model/conf/{mfcc,mfcc_hires}.conf conf/
cp $model/conf/online_cmvn.conf conf/ # Only needed if/for finetune_ivector_extractor
cp $model/conf/online_cmvn.conf extractor/
# cp $model/ivector_extractor/final.{ie,dubm,mat} extractor/ # Careful not to overwrite finetuned ivector_extractor!
cp $model/ivector_extractor/global_cmvn.stats extractor/
cp $model/conf/online_cmvn_iextractor extractor/ 2>/dev/null || true
cp $model/conf/splice.conf extractor/splice_opts
echo "18" > data/lang/oov.int
cp $model/{words,phones}.txt data/lang/
cp $model/disambig.int data/lang/phones/
cp $model/wdisambig_{words,phones}.int data/lang/phones/ # Only needed if/for mkgraph.sh
echo "3" > $model/frame_subsampling_factor

echo "1:2:3:4:5:6:7:8:9:10:11:12:13:14:15" > data/lang/phones/context_indep.csl
echo "1:2:3:4:5:6:7:8:9:10:11:12:13:14:15" > data/lang/phones/silence.csl

. path.sh

# ln -sfT $model/tree_sp tree_sp
rm tree_sp 2> /dev/null || true
mkdir -p tree_sp
cp $model/phones.txt tree_sp/
mkdir -p exp/nnet3_chain/finetune/
cp -r $model/dict data/ # Only needed if/for finetune_tree
# cp $model/tree_stuff/topo data/lang/ # Only needed if/for finetune_tree
# cp $model/tree_stuff/sets.int data/lang/phones/ # Only needed if/for finetune_tree

# Skip train.py::create_phone_lm()
touch tree_sp/ali.1.gz tree_sp/tree tree_sp/final.mdl # Fake empty, to pacify the training script later

# Skip train.py::create_denominator_fst()
copy-transition-model $model/final.mdl exp/nnet3_chain/finetune/0.trans_mdl 2> /dev/null
cp $model/tree $model/tree_stuff/{den,normalization}.fst exp/nnet3_chain/finetune/

perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
< $model/lexiconp.txt > data/lang/lexiconp_pdp.txt || exit 1;
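# (The perl above rewrites each pronunciation with word-position phone suffixes
# (_S/_B/_I/_E); the pipeline below compiles that lexicon into data/lang/L.fst.)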
utils/lang/make_lexicon_fst.py --sil-prob=0.5 --sil-phone=SIL data/lang/lexiconp_pdp.txt | \
fstcompile --isymbols=$model/phones.txt --osymbols=$model/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > data/lang/L.fst || exit 1

cp -r $dataset/{text,wav.scp,utt2spk} data/finetune
# ln -sfT /mnt/input/audio_data audio_data

# utils/fix_data_dir.sh data/finetune
$nice_cmd bash run_finetune_tdnn_1a_daanzu.sh --src-dir $model --extractor-dir extractor --tree-dir tree_sp --nj $(nproc) "$@"

# > cp -r work.test.per/data/lang/phones/* work.test.fin/data/lang/phones/
# > cp -r work.test.per/data/lang_chain/topo work.test.fin/data/lang/
68 changes: 68 additions & 0 deletions run.personal.sh
@@ -0,0 +1,68 @@
# docker run -it --rm -v $(pwd):/mnt/input -v $(pwd)/work:/mnt/work -w /mnt/work --user "$(id -u):$(id -g)" daanzu/kaldi_ag_training:2020-11-28 bash run.personal.sh models/kaldi_model_daanzu_20200905_1ep-mediumlm data/standard2train --num-epochs 5 --stage -10
# docker run -it --rm -v $(pwd):/mnt/input -v $(pwd)/work:/mnt/work -w /mnt/work --user "$(id -u):$(id -g)" --runtime=nvidia daanzu/kaldi_ag_training_gpu:2020-11-28 bash run.personal.sh models/kaldi_model_daanzu_20200905_1ep-mediumlm data/standard2train --num-epochs 5 --stage -10

set -euxo pipefail

nice_cmd="nice ionice -c idle"
stage=-10
gmm_stage=0 # always stage+10

# Scan the arguments for a --stage option; if present, use it to derive gmm_stage.
POSITIONAL=()
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--stage)
stage="$2"
gmm_stage=$((stage+10))
POSITIONAL+=("$1" "$2") # save it in an array for later
shift # past argument
shift # past value
;;
*) # unknown option
POSITIONAL+=("$1") # save it in an array for later
shift # past argument
;;
esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters
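# Example: passing "--stage -4" yields gmm_stage=6 for run_personal_gmm.sh below,
# while run_personal_chain_tdnn_1h.sh still receives the original --stage via "$@".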

[[ $# -ge 2 ]] || exit 1

model=/mnt/input/$1; shift
dataset=/mnt/input/$1; shift

[[ -d $model ]] || exit 1
[[ -d $dataset ]] || exit 1

echo "base_model=${model#/mnt/input/}" >> params.txt
echo "train_dataset=${dataset#/mnt/input/}" >> params.txt

cat <<\EOF > cmd.sh
export train_cmd="utils/run.pl"
export decode_cmd="utils/run.pl"
export cuda_cmd="utils/run.pl"
# export cuda_cmd="utils/run.pl -l gpu=1"
EOF
cat <<\EOF > path.sh
export KALDI_ROOT=/opt/kaldi
export LD_LIBRARY_PATH="$KALDI_ROOT/tools/openfst/lib:$KALDI_ROOT/tools/openfst/lib/fst:$KALDI_ROOT/src/lib:$LD_LIBRARY_PATH"
export PATH=$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/../kaldi_lm/:$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/chainbin:$KALDI_ROOT/src/rnnlmbin:$PWD:$PATH
export LC_ALL=C
EOF
ln -sf /opt/kaldi/egs/wsj/s5/steps
ln -sf /opt/kaldi/egs/wsj/s5/utils

mkdir -p data/train data/dict conf exp
cp $model/conf/{mfcc,mfcc_hires,online_cmvn}.conf conf
cp $model/dict/{extra_questions.txt,lexiconp.txt,lexicon.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict

[[ $stage -gt -10 ]] || rm -rf data/train/*
cp $dataset/{text,wav.scp,utt2spk} data/train
# ln -sfT /mnt/input/audio_data audio_data
# ln -sfT /mnt/input/audio_data/daanzu wav

# utils/fix_data_dir.sh data/train
$nice_cmd bash run_personal_gmm.sh --nj $(nproc) --stage $gmm_stage
$nice_cmd bash run_personal_chain_tdnn_1h.sh --nj $(nproc) "$@"