Optimizations for training acoustic models
mmcauliffe committed Sep 28, 2024
1 parent 88ea033 commit eae38e0
Showing 8 changed files with 127 additions and 57 deletions.
48 changes: 0 additions & 48 deletions docs/source/changelog/changelog_3.0.rst
@@ -5,54 +5,6 @@
3.0 Changelog
*************

3.2.0
-----

- Added support for transcription via whisperx and speechbrain models
- Updated text normalization to normalize to decomposed forms
- Compatibility with Kalpy 0.6.7

3.1.4
-----

- Optimized :code:`mfa g2p` to better use multiple processes
- Added :code:`--export_scores` to :code:`mfa g2p` for adding a column representing the final weights of the generated pronunciations
- Added :code:`--output_directory` to :code:`mfa validate` to save generated validation files rather than the temporary directory
- Fixed a bug in cutoff modeling that prevented cutoffs from being properly parsed

3.1.3
-----

- Fixed an issue where silence probability being zero was not correctly removing silence
- Compatibility with kalpy v0.6.5
- Added API functionality for verifying transcripts with interjection words in alignment
- Fixed an error in fine tuning that generated nonsensical boundaries

3.1.2
-----

- Fixed a bug where hidden files and folders would be parsed as corpus data
- Fixed a bug where validation would not respect :code:`--no_final_clean`
- Fixed a rare crash in training when a job would not have utterances assigned to it
- Fixed a bug where MFA would mistakenly report that dictionary and acoustic model phones did not match for older model versions

3.1.1
-----

- Fixed an issue with TextGrids missing intervals

3.1.0
-----

- Fixed a bug where cutoffs were not properly modelled
- Added an additional filter in subset creation to exclude utterances with cutoffs from smaller subsets
- Added the ability to specify HMM topologies for phones
- Fixed issues caused by validators not cleaning up temporary files and databases
- Added support for default and nonnative dictionaries generated from other dictionaries
- Restricted initial training rounds to exclude default and nonnative dictionaries
- Changed clustering of phones to not mix silence and non-silence phones
- Optimized TextGrid export
- Added better memory management for collecting alignments

3.0.8
-----
48 changes: 48 additions & 0 deletions docs/source/changelog/changelog_3.1.rst
@@ -0,0 +1,48 @@

.. _changelog_3.1:

*************
3.1 Changelog
*************

3.1.4
-----

- Optimized :code:`mfa g2p` to better use multiple processes
- Added :code:`--export_scores` to :code:`mfa g2p` for adding a column representing the final weights of the generated pronunciations
- Added :code:`--output_directory` to :code:`mfa validate` to save generated validation files rather than the temporary directory
- Fixed a bug in cutoff modeling that prevented cutoffs from being properly parsed

3.1.3
-----

- Fixed an issue where silence probability being zero was not correctly removing silence
- Compatibility with kalpy v0.6.5
- Added API functionality for verifying transcripts with interjection words in alignment
- Fixed an error in fine tuning that generated nonsensical boundaries

3.1.2
-----

- Fixed a bug where hidden files and folders would be parsed as corpus data
- Fixed a bug where validation would not respect :code:`--no_final_clean`
- Fixed a rare crash in training when a job would not have utterances assigned to it
- Fixed a bug where MFA would mistakenly report that dictionary and acoustic model phones did not match for older model versions

3.1.1
-----

- Fixed an issue with TextGrids missing intervals

3.1.0
-----

- Fixed a bug where cutoffs were not properly modelled
- Added an additional filter in subset creation to exclude utterances with cutoffs from smaller subsets
- Added the ability to specify HMM topologies for phones
- Fixed issues caused by validators not cleaning up temporary files and databases
- Added support for default and nonnative dictionaries generated from other dictionaries
- Restricted initial training rounds to exclude default and nonnative dictionaries
- Changed clustering of phones to not mix silence and non-silence phones
- Optimized TextGrid export
- Added better memory management for collecting alignments
16 changes: 16 additions & 0 deletions docs/source/changelog/changelog_3.2.rst
@@ -0,0 +1,16 @@

.. _changelog_3.2:

*************
3.2 Changelog
*************

3.2.0
-----

- Added :code:`--subset_word_count` parameter to :ref:`train_acoustic_model` to add a minimum word count for an utterance to be included in training subsets
- Added :code:`--minimum_utterance_length` parameter to :ref:`train_acoustic_model` to add a minimum word count for an utterance to be included in training at all
- Improved memory usage in compiling training graphs for initial subsets
- Added support for transcription via whisperx and speechbrain models
- Updated text normalization to normalize to decomposed forms
- Compatibility with Kalpy 0.6.7
2 changes: 2 additions & 0 deletions docs/source/changelog/index.md
@@ -53,6 +53,8 @@
:hidden:
:maxdepth: 1
changelog_3.2.rst
changelog_3.1.rst
news_3.0.rst
changelog_3.0.rst
changelog_2.2.rst
12 changes: 11 additions & 1 deletion montreal_forced_aligner/acoustic_modeling/trainer.py
@@ -153,6 +153,8 @@ def __init__(
training_configuration: List[Tuple[str, Dict[str, Any]]] = None,
phone_set_type: str = None,
model_version: str = None,
subset_word_count: int = 3,
minimum_utterance_length: int = 2,
**kwargs,
):
self.param_dict = {
@@ -164,6 +166,7 @@
}
self.final_identifier = None
self.current_subset: int = 0
self.subset_word_count = subset_word_count
self.current_aligner: Optional[AcousticModelTrainingMixin] = None
self.current_trainer: Optional[AcousticModelTrainingMixin] = None
self.current_acoustic_model: Optional[AcousticModel] = None
@@ -184,6 +187,7 @@ def __init__(
self.final_alignment = True
self.model_version = model_version
self.boost_silence = 1.5
self.minimum_utterance_length = minimum_utterance_length

@classmethod
def default_training_configurations(cls) -> List[Tuple[str, Dict[str, Any]]]:
@@ -335,6 +339,12 @@ def filter_training_utterances(self):
update_mapping.append({"id": u_id, "ignored": True})
continue
words = text.split()
if (
self.minimum_utterance_length > 1
and len(words) < self.minimum_utterance_length
):
update_mapping.append({"id": u_id, "ignored": True})
continue
if any(x in word_mapping for x in words):
continue
update_mapping.append({"id": u_id, "ignored": True})
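The new length check above can be sketched in isolation as follows (a minimal illustration, not the actual MFA code path; the utterance tuples and the standalone function name are assumptions):

```python
def filter_utterances(utterances, minimum_utterance_length=2):
    """Flag utterances with fewer words than the minimum as ignored.

    utterances: iterable of (utterance_id, normalized_text) pairs.
    Returns the update mapping that would be bulk-applied to the database.
    """
    update_mapping = []
    for u_id, text in utterances:
        words = text.split()
        # A minimum of 1 disables the filter, matching the guard in the commit.
        if minimum_utterance_length > 1 and len(words) < minimum_utterance_length:
            update_mapping.append({"id": u_id, "ignored": True})
    return update_mapping
```

With the default of 2, single-word utterances are excluded from training entirely, for example `filter_utterances([(1, "hello"), (2, "hello world")])` flags only utterance 1.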
@@ -629,7 +639,7 @@ def train(self) -> None:
new_phone_lm_path = os.path.join(previous.working_directory, "phone_lm.fst")
if not os.path.exists(new_phone_lm_path) and os.path.exists(phone_lm_path):
shutil.copyfile(phone_lm_path, new_phone_lm_path)
logger.info(f"Completed training in {time.time()-begin} seconds!")
logger.info(f"Completed training in {time.time() - begin} seconds!")

def transition_acc_arguments(self) -> List[TransitionAccArguments]:
"""
7 changes: 4 additions & 3 deletions montreal_forced_aligner/alignment/multiprocessing.py
@@ -459,9 +459,9 @@ def _run(self):
self.tree_path,
lexicon,
use_g2p=self.use_g2p,
batch_size=1000
batch_size=500
if workflow.workflow_type is not WorkflowType.transcript_verification
else 500,
else 250,
)
graph_logger.debug(f"Set up took {time.time() - begin} seconds")
query = (
@@ -484,7 +484,7 @@
)
graph_logger.debug(f"Total compilation time: {time.time() - begin} seconds")
del compiler
del self.lexicon_compilers
del lexicon


class AccStatsFunction(KaldiFunction):
@@ -1560,6 +1560,7 @@ def _run(self) -> None:
pass
alignment_archive.close()
extraction_logger.debug("Finished ali first pass")
del lexicon_compiler
extraction_logger.debug("Finished extraction")


39 changes: 38 additions & 1 deletion montreal_forced_aligner/corpus/acoustic_corpus.py
@@ -39,7 +39,7 @@
AcousticDirectoryParser,
CorpusProcessWorker,
)
from montreal_forced_aligner.data import DatabaseImportData, PhoneType, WorkflowType
from montreal_forced_aligner.data import DatabaseImportData, PhoneType, WordType, WorkflowType
from montreal_forced_aligner.db import (
Corpus,
CorpusWorkflow,
@@ -50,6 +50,7 @@
Speaker,
TextFile,
Utterance,
Word,
bulk_update,
)
from montreal_forced_aligner.dictionary.mixins import DictionaryMixin
@@ -1129,6 +1130,42 @@ def load_corpus(self) -> None:

logger.debug(f"Setting up corpus took {time.time() - all_begin:.3f} seconds")

def subset_lexicon(self) -> None:
included_words = set()
with self.session() as session:
corpus = session.query(Corpus).first()
if corpus.current_subset > 0:
subset_utterances = (
session.query(Utterance.normalized_text)
.filter(Utterance.in_subset == True) # noqa
.filter(Utterance.ignored == False) # noqa
)
for (u_text,) in subset_utterances:
included_words.update(u_text.split())
session.execute(
sqlalchemy.update(Word)
.where(Word.word_type == WordType.speech)
.values(included=False)
)
session.flush()
session.execute(
sqlalchemy.update(Word)
.where(Word.word_type == WordType.speech)
.where(Word.count > self.oov_count_threshold)
.where(Word.word.in_(included_words))
.values(included=True)
)
else:
session.execute(
sqlalchemy.update(Word)
.where(Word.word_type == WordType.speech)
.where(Word.count > self.oov_count_threshold)
.values(included=True)
)

session.commit()
self.write_lexicon_information()
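Stripped of the SQLAlchemy layer, the inclusion logic of `subset_lexicon` amounts to the following (a simplified sketch under stated assumptions: `Word` rows are modeled as plain dicts and only speech-type words are passed in):

```python
def subset_lexicon(words, subset_texts, oov_count_threshold=0, current_subset=0):
    """Mark which speech words are included in the working lexicon.

    words: speech-word records with 'word', 'count', and 'included' keys.
    subset_texts: normalized texts of the utterances in the current subset.
    """
    included_words = set()
    if current_subset > 0:
        for text in subset_texts:
            included_words.update(text.split())
    for w in words:
        if current_subset > 0:
            # Reset, then re-include only words that appear in the subset
            # and clear the OOV count threshold.
            w["included"] = (
                w["count"] > oov_count_threshold and w["word"] in included_words
            )
        else:
            # Full corpus: include every sufficiently frequent word.
            w["included"] = w["count"] > oov_count_threshold
    return words
```

Restricting the lexicon to words actually seen in the subset keeps the compiled training graphs for the early, small subsets much lighter, which is the memory optimization this commit targets.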


class AcousticCorpus(AcousticCorpusMixin, DictionaryMixin, MfaWorker):
"""
12 changes: 8 additions & 4 deletions montreal_forced_aligner/corpus/base.py
@@ -791,7 +791,7 @@ def normalize_text(self) -> None:
import traceback

exc_type, exc_value, exc_traceback = sys.exc_info()
print(
logger.debug(
"\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))
)
raise
@@ -1200,7 +1200,8 @@ def create_subset(self, subset: int) -> None:
cutoff_pattern = "<(cutoff|hes)"

def add_filters(query):
multiword_pattern = r"\s\S+\s"
subset_word_count = getattr(self, "subset_word_count", 3)
multiword_pattern = rf"(\s\S+){{{subset_word_count},}}"
filtered = (
query.filter(
Utterance.normalized_text.op("~")(multiword_pattern)
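The widened pattern requires at least `subset_word_count` additional space-delimited words after the first token (so `subset_word_count + 1` words total), replacing the old fixed pattern `\s\S+\s` that only demanded roughly three words. A quick standalone check (the sample strings are illustrative):

```python
import re

# Default minimum from the commit's --subset_word_count parameter.
subset_word_count = 3
multiword_pattern = rf"(\s\S+){{{subset_word_count},}}"

# Each repetition of (\s\S+) consumes one word after the first, so a match
# needs at least subset_word_count + 1 words in the utterance.
assert re.search(multiword_pattern, "one two three four") is not None
assert re.search(multiword_pattern, "one two three") is None
```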
@@ -1488,7 +1489,7 @@ def add_filters(query):
log_dir = subset_directory.joinpath("log")
os.makedirs(log_dir, exist_ok=True)

logger.debug(f"Setting subset flags took {time.time()-begin} seconds")
logger.debug(f"Setting subset flags took {time.time() - begin} seconds")
with self.session() as session:
jobs = (
session.query(Job)
@@ -1507,7 +1508,6 @@
)
for j in self._jobs
]

for _ in run_kaldi_function(ExportKaldiFilesFunction, arguments, total_count=subset):
pass

@@ -1559,10 +1559,14 @@ def subset_directory(self, subset: typing.Optional[int]) -> Path:
c.current_subset = subset
session.commit()
if subset is None or subset >= self.num_utterances or subset <= 0:
if hasattr(self, "subset_lexicon"):
self.subset_lexicon()
return self.split_directory
directory = self.corpus_output_directory.joinpath(f"subset_{subset}")
if not os.path.exists(directory):
self.create_subset(subset)
if hasattr(self, "subset_lexicon"):
self.subset_lexicon()
return directory

def get_latest_workflow_run(self, workflow: WorkflowType, session: Session) -> CorpusWorkflow:
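The `hasattr` dispatch added to `subset_directory` can be sketched outside the database layer (a simplified illustration; the class names, return values, and call counter are assumptions, and the real method also checks `subset >= self.num_utterances`):

```python
class CorpusBase:
    def subset_directory(self, subset):
        if subset is None or subset <= 0:
            # Full corpus requested: mixins that manage a lexicon rebuild
            # it with every sufficiently frequent word included.
            if hasattr(self, "subset_lexicon"):
                self.subset_lexicon()
            return "split"
        # Subset requested: rebuild the lexicon restricted to subset words.
        if hasattr(self, "subset_lexicon"):
            self.subset_lexicon()
        return f"subset_{subset}"


class LexiconCorpus(CorpusBase):
    """Corpus mixin that tracks how often its lexicon is rebuilt."""

    def __init__(self):
        self.rebuilds = 0

    def subset_lexicon(self):
        self.rebuilds += 1
```

Using `hasattr` keeps the base corpus class usable without a dictionary: corpora that mix in lexicon handling get their lexicon refreshed whenever the active subset changes, and bare corpora skip the call entirely.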
