From 8f9a7e30da52d4140004de15f98bc5d79ad63053 Mon Sep 17 00:00:00 2001 From: Eric Joanis Date: Mon, 27 Jan 2025 17:27:04 -0500 Subject: [PATCH] fix: multiple issues with regression test suite - max step is now 1000 for all cases, for fast regression - max epochs also 10, because for very small cases 1000 steps is pretty slow, possibly due to what happens between epochs? (lj-160 did 111 epochs...) - lj-150 is now lj-160, fixing an error with a validation set with fewer than 16 elements. - document how to override cluster parameters - exit early if training fails Still to do: handle failing to fetch bundle.js when there is no proxy set on the job nodes. But that has to be fixed in ReadAlong/Studio, not here. --- everyvoice/tests/regression/.gitignore | 1 + everyvoice/tests/regression/README.md | 10 ++++++++++ everyvoice/tests/regression/go.sh | 8 +++++++- everyvoice/tests/regression/prep-datasets.sh | 2 +- .../tests/regression/regression-test.sh | 19 ++++++++++--------- 5 files changed, 29 insertions(+), 11 deletions(-) diff --git a/everyvoice/tests/regression/.gitignore b/everyvoice/tests/regression/.gitignore index 321505bb..0e71ce9b 100644 --- a/everyvoice/tests/regression/.gitignore +++ b/everyvoice/tests/regression/.gitignore @@ -1,2 +1,3 @@ EV-regress.* +EV-r-main.* regress-* diff --git a/everyvoice/tests/regression/README.md b/everyvoice/tests/regression/README.md index d279bbb3..5effd6a3 100644 --- a/everyvoice/tests/regression/README.md +++ b/everyvoice/tests/regression/README.md @@ -31,3 +31,13 @@ done ``` Or just use `../../regression-test.sh` directly in the loop if you're not on a cluster. + +## One script to run them all + +All the above can be accomplished by running `go.sh`. + +## Cluster parameters + +The scripts hardcode NRC's default Slurm cluster parameters. Add `--partition=... 
--account=...` +to the `sbatch` commands to override, or edit `go.sh` and `regression-test.sh` to use +your partition and account settings to request nodes with GPUs available. diff --git a/everyvoice/tests/regression/go.sh b/everyvoice/tests/regression/go.sh index f926f2dc..f488e239 100755 --- a/everyvoice/tests/regression/go.sh +++ b/everyvoice/tests/regression/go.sh @@ -18,10 +18,16 @@ set -o errexit TOP_LEVEL_DIR=$(mktemp --directory regress-$(date +'%Y%m%d')-XXX) cd "$TOP_LEVEL_DIR" +if sbatch -h >& /dev/null; then + SUBMIT_COMMAND=sbatch +else + SUBMIT_COMMAND=bash +fi + ../prep-datasets.sh for DIR in regress-*; do pushd "$DIR" - sbatch ../../regression-test.sh + $SUBMIT_COMMAND ../../regression-test.sh popd done diff --git a/everyvoice/tests/regression/prep-datasets.sh b/everyvoice/tests/regression/prep-datasets.sh index b73ad886..4b6ff9ea 100755 --- a/everyvoice/tests/regression/prep-datasets.sh +++ b/everyvoice/tests/regression/prep-datasets.sh @@ -20,7 +20,7 @@ EVERYVOICE_REGRESS_ROOT=$(python -c 'import everyvoice; print(everyvoice.__path_ SGILE_DATASET_ROOT=${SGILE_DATASET_ROOT:-$HOME/sgile/data} LJ_SPEECH_DATASET=$SGILE_DATASET_ROOT/LJSpeech-1.1 -LJSLICES="150 600 1600 full" +LJSLICES="160 600 1600 full" for slice in $LJSLICES; do dir=regress-lj-$slice mkdir "$dir" diff --git a/everyvoice/tests/regression/regression-test.sh b/everyvoice/tests/regression/regression-test.sh index 4ae51c8a..835ab6bc 100755 --- a/everyvoice/tests/regression/regression-test.sh +++ b/everyvoice/tests/regression/regression-test.sh @@ -37,33 +37,34 @@ r() { echo "Start at $(date)" date > START -trap 'echo "Failed or killed at $(date)"; date > FAILED' 0 +trap 'echo "Failed or killed at $(date)"; date | tee FAILED > DONE' 0 # Regression config [[ -e "$ACTIVATE_SCRIPT" ]] && source "$ACTIVATE_SCRIPT" export TQDM_MININTERVAL=5 MAX_STEPS=1000 -# For a production config, use MAX_STEPS=100000 and increase the SBATCH --time above +MAX_EPOCHS=10 +# For a production config, use 
MAX_STEPS=100000, MAX_EPOCHS=1000, and increase the SBATCH --time above # Run the new-project wizard r "coverage run -p -m everyvoice new-project --resume-from wizard-resume" # Enter the directory created by the wizard -cd regress || { echo "Cannot cd into regress directory, aborting."; date > DONE ; exit 1; } -trap 'echo "Failed or killed at $(date)"; date > ../FAILED' 0 +cd regress || { echo "ERROR: Cannot cd into regress directory, aborting."; exit 1; } +trap 'echo "Failed or killed at $(date)"; date | tee ../FAILED > ../DONE' 0 # Preprocess r "coverage run -p -m everyvoice preprocess config/everyvoice-text-to-spec.yaml" # Train the fs2 model -r "coverage run -p -m everyvoice train text-to-spec config/everyvoice-text-to-spec.yaml --config-args training.max_steps=$MAX_STEPS" +r "coverage run -p -m everyvoice train text-to-spec config/everyvoice-text-to-spec.yaml --config-args training.max_steps=$MAX_STEPS --config-args training.max_epochs=$MAX_EPOCHS" FS2=logs_and_checkpoints/FeaturePredictionExperiment/base/checkpoints/last.ckpt -ls $FS2 +ls $FS2 || { echo ERROR: Training the text-to-spec model failed, aborting.; exit 1; } # Train the vocoder -r "coverage run -p -m everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml --config-args training.max_steps=$MAX_STEPS" +r "coverage run -p -m everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml --config-args training.max_steps=$MAX_STEPS --config-args training.max_epochs=$MAX_EPOCHS" VOCODER=logs_and_checkpoints/VocoderExperiment/base/checkpoints/last.ckpt -ls $VOCODER +ls $VOCODER || { echo ERROR: Training the Vocoder failed, aborting.; exit 1; } # Synthesize some text r "coverage run -p -m everyvoice synthesize from-text \ @@ -82,7 +83,7 @@ r "coverage run -p -m everyvoice synthesize from-spec \ # Exercise DeepForceAligner # Meh, this appears to be broken... 
-#r "coverage run -p -m dfaligner train config/everyvoice-aligner.yaml --config-args training.max_steps=$MAX_STEPS" +#r "coverage run -p -m dfaligner train config/everyvoice-aligner.yaml --config-args training.max_steps=$MAX_STEPS --config-args training.max_epochs=$MAX_EPOCHS" #r "coverage run -p -m dfaligner extract-alignments"