fix: multiple issues with regression test suite

- max steps is now 1000 for all cases, for fast regression
- max epochs is also set to 10, because for very small cases 1000 steps is pretty slow, possibly due to what happens between epochs? (lj-160 did 111 epochs...)
- lj-150 is now lj-160, fixing an error with a validation set with fewer than 16 elements.
- document how to override cluster parameters
- exit early if training fails

Still to do: handle failing to fetch bundle.js when there is no proxy set on the job nodes. But that has to be fixed in ReadAlong/Studio, not here.
EveryVoiceTTS · Jan 27, 2025 · 8f9a7e3 · 8f9a7e3
1 parent 5651ada
commit 8f9a7e3
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 11 deletions.
diff --git a/everyvoice/tests/regression/.gitignore b/everyvoice/tests/regression/.gitignore
@@ -1,2 +1,3 @@
 EV-regress.*
+EV-r-main.*
 regress-*
diff --git a/everyvoice/tests/regression/README.md b/everyvoice/tests/regression/README.md
@@ -31,3 +31,13 @@ done
 ```
 
 Or just use `../../regression-test.sh` directly in the loop if you're not on a cluster.
+
+## One script to run them all
+
+All the above can be accomplished by running `go.sh`.
+
+## Cluster parameters
+
+The scripts hardcode NRC's default Slurm cluster parameters. Add `--partition=... --account=...`
+to the `sbatch` commands to override, or edit `go.sh` and `regression-test.sh` to use
+your partition and account settings to request nodes with GPUs available.
diff --git a/everyvoice/tests/regression/go.sh b/everyvoice/tests/regression/go.sh
@@ -18,10 +18,16 @@ set -o errexit
 TOP_LEVEL_DIR=$(mktemp --directory regress-$(date +'%Y%m%d')-XXX)
 cd "$TOP_LEVEL_DIR"
 
+if sbatch -h >& /dev/null; then
+    SUBMIT_COMMAND=sbatch
+else
+    SUBMIT_COMMAND=bash
+fi
+
 ../prep-datasets.sh
 for DIR in regress-*; do
     pushd "$DIR"
-    sbatch ../../regression-test.sh
+    $SUBMIT_COMMAND ../../regression-test.sh
     popd
 done
 

diff --git a/everyvoice/tests/regression/prep-datasets.sh b/everyvoice/tests/regression/prep-datasets.sh
@@ -20,7 +20,7 @@ EVERYVOICE_REGRESS_ROOT=$(python -c 'import everyvoice; print(everyvoice.__path_
 SGILE_DATASET_ROOT=${SGILE_DATASET_ROOT:-$HOME/sgile/data}
 
 LJ_SPEECH_DATASET=$SGILE_DATASET_ROOT/LJSpeech-1.1
-LJSLICES="150 600 1600 full"
+LJSLICES="160 600 1600 full"
 for slice in $LJSLICES; do
     dir=regress-lj-$slice
     mkdir "$dir"

diff --git a/everyvoice/tests/regression/regression-test.sh b/everyvoice/tests/regression/regression-test.sh
@@ -37,33 +37,34 @@ r() {
 echo "Start at $(date)"
 date > START
 
-trap 'echo "Failed or killed at $(date)"; date > FAILED' 0
+trap 'echo "Failed or killed at $(date)"; date | tee FAILED > DONE' 0
 
 # Regression config
 [[ -e "$ACTIVATE_SCRIPT" ]] && source "$ACTIVATE_SCRIPT"
 export TQDM_MININTERVAL=5
 MAX_STEPS=1000
-# For a production config, use MAX_STEPS=100000 and increase the SBATCH --time above
+MAX_EPOCHS=10
+# For a production config, use MAX_STEPS=100000, MAX_EPOCHS=1000, and increase the SBATCH --time above
 
 # Run the new-project wizard
 r "coverage run -p -m everyvoice new-project --resume-from wizard-resume"
 
 # Enter the directory created by the wizard
-cd regress || { echo "Cannot cd into regress directory, aborting."; date > DONE ; exit 1; }
-trap 'echo "Failed or killed at $(date)"; date > ../FAILED' 0
+cd regress || { echo "ERROR: Cannot cd into regress directory, aborting."; exit 1; }
+trap 'echo "Failed or killed at $(date)"; date | tee ../FAILED > ../DONE' 0
 
 # Preprocess
 r "coverage run -p -m everyvoice preprocess config/everyvoice-text-to-spec.yaml"
 
 # Train the fs2 model
-r "coverage run -p -m everyvoice train text-to-spec config/everyvoice-text-to-spec.yaml --config-args training.max_steps=$MAX_STEPS"
+r "coverage run -p -m everyvoice train text-to-spec config/everyvoice-text-to-spec.yaml --config-args training.max_steps=$MAX_STEPS --config-args training.max_epochs=$MAX_EPOCHS"
 FS2=logs_and_checkpoints/FeaturePredictionExperiment/base/checkpoints/last.ckpt
-ls $FS2
+ls $FS2 || { echo ERROR: Training the text-to-spec model failed, aborting.; exit 1; }
 
 # Train the vocoder
-r "coverage run -p -m everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml --config-args training.max_steps=$MAX_STEPS"
+r "coverage run -p -m everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml --config-args training.max_steps=$MAX_STEPS --config-args training.max_epochs=$MAX_EPOCHS"
 VOCODER=logs_and_checkpoints/VocoderExperiment/base/checkpoints/last.ckpt
-ls $VOCODER
+ls $VOCODER || { echo ERROR: Training the Vocoder failed, aborting.; exit 1; }
 
 # Synthesize some text
 r "coverage run -p -m everyvoice synthesize from-text \
@@ -82,7 +83,7 @@ r "coverage run -p -m everyvoice synthesize from-spec \
 
 # Exercise DeepForceAligner
 # Meh, this appears to be broken...
-#r "coverage run -p -m dfaligner train config/everyvoice-aligner.yaml --config-args training.max_steps=$MAX_STEPS"
+#r "coverage run -p -m dfaligner train config/everyvoice-aligner.yaml --config-args training.max_steps=$MAX_STEPS --config-args training.max_epochs=$MAX_EPOCHS"
 #r "coverage run -p -m dfaligner extract-alignments"