From e801ea4c7d0fef5a7ca089a2f7c622b3399d9ffd Mon Sep 17 00:00:00 2001
From: Mickus Timothee
Date: Thu, 21 Sep 2023 11:38:30 +0300
Subject: [PATCH] more generic transform name

---
 docs/source/config_config.md               | 4 ++--
 examples/config_config.yaml                | 6 +++---
 onmt/tests/test_subword_marker.py          | 2 +-
 onmt/tests/test_transform.py               | 2 +-
 onmt/transforms/{bart.py => denoising.py}  | 0
 onmt/utils/parse.py                        | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)
 rename onmt/transforms/{bart.py => denoising.py} (100%)

diff --git a/docs/source/config_config.md b/docs/source/config_config.md
index 650439de..356df0dd 100644
--- a/docs/source/config_config.md
+++ b/docs/source/config_config.md
@@ -34,7 +34,7 @@ The meta-parameters under the `config_config` key:
 Path templates for source and target corpora, respectively.
 The path templates can contain the following variables that will be substituted by `config_config`:
 
-- Directional corpus mode 
+- Directional corpus mode
 - `{src_lang}`: The source language of the task
 - `{tgt_lang}`: The target language of the task
 - `{lang_pair}`: `{src_lang}-{tgt_lang}` for convenience
@@ -99,7 +99,7 @@ Generate translation configs for zero-shot directions.
 #### `transforms` and `ae_transforms`
 
 A list of transforms, for translation tasks and autoencoder tasks, respectively.
-Use this to apply subword segmentation, e.g. using `sentencepiece`, and `bart` noise for autoencoder.
+Use this to apply subword segmentation, e.g. using `sentencepiece`, and `ae_noise` noise for autoencoder.
 Both of these may change the sequence length, necessitating a `filtertoolong` transform.
 
 #### `enc_sharing_groups` and `dec_sharing_groups`
diff --git a/examples/config_config.yaml b/examples/config_config.yaml
index 47fe8ca0..c66dc3e4 100644
--- a/examples/config_config.yaml
+++ b/examples/config_config.yaml
@@ -16,7 +16,7 @@ config_config:
   ae_transforms:
     - sentencepiece
     - filtertoolong
-    - bart
+    - ae_noise
   enc_sharing_groups:
     - GROUP
     - FULL
@@ -27,7 +27,7 @@ config_config:
   translation_config_dir: config/translation.opus
   n_gpus_per_node: 4
   n_nodes: 2
- 
+
   # Note that this specifies the groups manually instead of clustering
   groups:
     "en": "en"
@@ -43,7 +43,7 @@
 save_data: generated/opus.spm32k
 
 # vocabs serve two purposes: defines the vocab files, and gives the potential languages to consider
-src_vocab: 
+src_vocab:
   "af": "/scratch/project_2005099/data/opus/prepare_opus_data_tc_out/opusTC.afr.32k.spm.vocab"
   "da": "/scratch/project_2005099/data/opus/prepare_opus_data_tc_out/opusTC.dan.32k.spm.vocab"
   "en": "/scratch/project_2005099/data/opus/prepare_opus_data_tc_out/opusTC.eng.32k.spm.vocab"
diff --git a/onmt/tests/test_subword_marker.py b/onmt/tests/test_subword_marker.py
index 8987cbc0..afa17fcf 100644
--- a/onmt/tests/test_subword_marker.py
+++ b/onmt/tests/test_subword_marker.py
@@ -1,6 +1,6 @@
 import unittest
 
-from onmt.transforms.bart import word_start_finder
+from onmt.transforms.denoising import word_start_finder
 from onmt.utils.alignment import subword_map_by_joiner, subword_map_by_spacer
 from onmt.constants import SubwordMarker
 
diff --git a/onmt/tests/test_transform.py b/onmt/tests/test_transform.py
index 0cffb0c8..7d6b58f0 100644
--- a/onmt/tests/test_transform.py
+++ b/onmt/tests/test_transform.py
@@ -11,7 +11,7 @@
     make_transforms,
     TransformPipe,
 )
-from onmt.transforms.bart import BARTNoising
+from onmt.transforms.denoising import BARTNoising
 
 
 class TestTransform(unittest.TestCase):
diff --git a/onmt/transforms/bart.py b/onmt/transforms/denoising.py
similarity index 100%
rename from onmt/transforms/bart.py
rename to onmt/transforms/denoising.py
diff --git a/onmt/utils/parse.py b/onmt/utils/parse.py
index a0b4ae1d..b29729a1 100644
--- a/onmt/utils/parse.py
+++ b/onmt/utils/parse.py
@@ -168,7 +168,7 @@ def _get_all_transform(cls, opt):
         if hasattr(opt, 'lambda_align') and opt.lambda_align > 0.0:
             if not all_transforms.isdisjoint({'sentencepiece', 'bpe', 'onmt_tokenize'}):
                 raise ValueError('lambda_align is not compatible with on-the-fly tokenization.')
-            if not all_transforms.isdisjoint({'tokendrop', 'prefix', 'bart'}):
+            if not all_transforms.isdisjoint({'tokendrop', 'prefix', 'ae_noise'}):
                 raise ValueError('lambda_align is not compatible yet with potential token deletion/addition.')
 
         opt._all_transform = all_transforms
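
After this patch, configs select the denoising transform under its new name. A minimal
sketch of the relevant excerpt, based on the examples/config_config.yaml hunk above
(surrounding keys and paths are illustrative, not part of the patch):

    config_config:
      ae_transforms:
        - sentencepiece   # subword segmentation
        - filtertoolong   # guard against length changes from the transforms above
        - ae_noise        # denoising for autoencoder tasks; formerly named 'bart'

Existing configs that still list `bart` would presumably need the same one-word rename,
since onmt/utils/parse.py now checks for `ae_noise` in its transform compatibility set.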