Commit

more generic transform name
Mickus Timothee committed Sep 21, 2023
1 parent 9f3aab8 commit e801ea4
Showing 6 changed files with 8 additions and 8 deletions.
4 changes: 2 additions & 2 deletions docs/source/config_config.md
@@ -34,7 +34,7 @@ The meta-parameters under the `config_config` key:
Path templates for source and target corpora, respectively.
The path templates can contain the following variables that will be substituted by `config_config`:

-- Directional corpus mode
+- Directional corpus mode
- `{src_lang}`: The source language of the task
- `{tgt_lang}`: The target language of the task
- `{lang_pair}`: `{src_lang}-{tgt_lang}` for convenience
@@ -99,7 +99,7 @@ Generate translation configs for zero-shot directions.
#### `transforms` and `ae_transforms`

A list of transforms, for translation tasks and autoencoder tasks, respectively.
-Use this to apply subword segmentation, e.g. using `sentencepiece`, and `bart` noise for autoencoder.
+Use this to apply subword segmentation, e.g. using `sentencepiece`, and `ae_noise` noise for autoencoder.
Both of these may change the sequence length, necessitating a `filtertoolong` transform.

#### `enc_sharing_groups` and `dec_sharing_groups`
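The `transforms`/`ae_transforms` split described above can be illustrated with a minimal, hypothetical `config_config` fragment (the key and transform names follow the documentation and the renamed transform in this commit; the overall structure mirrors `examples/config_config.yaml`, everything else is illustrative):

```yaml
config_config:
  # applied to regular translation tasks
  transforms:
    - sentencepiece    # subword segmentation
    - filtertoolong    # drop examples that became too long after segmentation
  # applied to autoencoder (denoising) tasks
  ae_transforms:
    - sentencepiece
    - filtertoolong
    - ae_noise         # denoising noise, only meaningful for autoencoder tasks
```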
6 changes: 3 additions & 3 deletions examples/config_config.yaml
@@ -16,7 +16,7 @@ config_config:
ae_transforms:
- sentencepiece
- filtertoolong
-- bart
+- ae_noise
enc_sharing_groups:
- GROUP
- FULL
@@ -27,7 +27,7 @@ config_config:
translation_config_dir: config/translation.opus
n_gpus_per_node: 4
n_nodes: 2

# Note that this specifies the groups manually instead of clustering
groups:
"en": "en"
@@ -43,7 +43,7 @@ config_config:

save_data: generated/opus.spm32k
# vocabs serve two purposes: defines the vocab files, and gives the potential languages to consider
-src_vocab:
+src_vocab:
"af": "/scratch/project_2005099/data/opus/prepare_opus_data_tc_out/opusTC.afr.32k.spm.vocab"
"da": "/scratch/project_2005099/data/opus/prepare_opus_data_tc_out/opusTC.dan.32k.spm.vocab"
"en": "/scratch/project_2005099/data/opus/prepare_opus_data_tc_out/opusTC.eng.32k.spm.vocab"
2 changes: 1 addition & 1 deletion onmt/tests/test_subword_marker.py
@@ -1,6 +1,6 @@
import unittest

-from onmt.transforms.bart import word_start_finder
+from onmt.transforms.denoising import word_start_finder
from onmt.utils.alignment import subword_map_by_joiner, subword_map_by_spacer
from onmt.constants import SubwordMarker

2 changes: 1 addition & 1 deletion onmt/tests/test_transform.py
@@ -11,7 +11,7 @@
make_transforms,
TransformPipe,
)
-from onmt.transforms.bart import BARTNoising
+from onmt.transforms.denoising import BARTNoising


class TestTransform(unittest.TestCase):
File renamed without changes.
2 changes: 1 addition & 1 deletion onmt/utils/parse.py
@@ -168,7 +168,7 @@ def _get_all_transform(cls, opt):
if hasattr(opt, 'lambda_align') and opt.lambda_align > 0.0:
if not all_transforms.isdisjoint({'sentencepiece', 'bpe', 'onmt_tokenize'}):
raise ValueError('lambda_align is not compatible with on-the-fly tokenization.')
-if not all_transforms.isdisjoint({'tokendrop', 'prefix', 'bart'}):
+if not all_transforms.isdisjoint({'tokendrop', 'prefix', 'ae_noise'}):
raise ValueError('lambda_align is not compatible yet with potential token deletion/addition.')
opt._all_transform = all_transforms
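The compatibility check in `_get_all_transform` above can be sketched as a standalone function. This is a simplified, hypothetical rendition of the logic in the diff (the transform-name sets mirror the hunk, including the renamed `ae_noise`; the surrounding `opt` handling is reduced to plain arguments for illustration):

```python
def check_lambda_align_compat(all_transforms, lambda_align):
    """Simplified sketch of the compatibility checks in onmt/utils/parse.py."""
    if lambda_align > 0.0:
        # On-the-fly tokenization changes token boundaries, which would
        # invalidate the supervised alignments used by lambda_align.
        if not all_transforms.isdisjoint({'sentencepiece', 'bpe', 'onmt_tokenize'}):
            raise ValueError('lambda_align is not compatible with on-the-fly tokenization.')
        # Transforms that may add or delete tokens also shift alignment indices.
        if not all_transforms.isdisjoint({'tokendrop', 'prefix', 'ae_noise'}):
            raise ValueError('lambda_align is not compatible yet with potential token deletion/addition.')

# No conflicting transforms: passes silently.
check_lambda_align_compat({'filtertoolong'}, lambda_align=0.05)

# 'ae_noise' conflicts with lambda_align: raises ValueError.
try:
    check_lambda_align_compat({'ae_noise', 'filtertoolong'}, lambda_align=0.05)
except ValueError as e:
    print(e)
```

`set.isdisjoint` returns `True` only when the two sets share no element, so each `if not … isdisjoint(…)` fires exactly when at least one incompatible transform is enabled.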

