Merge branch 'feat/model_blowout' into feat/embeddingless
Joseph Attieh authored and committed on Oct 17, 2024
2 parents (1ed939f + 3a16fa0), commit 839b06c
Showing 6 changed files with 129 additions and 4 deletions.
mammoth/distributed/components.py (19 additions, 1 deletion)
@@ -121,6 +121,15 @@ def state_dict(self, model: NMTModel, prefix='', keep_vars=False) -> Dict[str, A
            sub_module.state_dict(destination=destination, prefix=prefix + name + '.', keep_vars=keep_vars)
        return destination

    def load_state_dict(self, model: NMTModel, state_dict: Dict[str, Any]):
        module = self.get_module(model)
        mismatch = module.load_state_dict(state_dict, strict=False)
        missing_keys = [
            name for name in mismatch.missing_keys
            # keys owned by other distributed components are expected to be absent here
            if not (name.startswith('attn_layers.') or name.startswith('token_emb.'))
        ]
        return mismatch._replace(missing_keys=missing_keys)


@dataclass # type: ignore
class DistributedAttentionLayersBlock(DistributedComponent, ABC):
@@ -147,13 +156,22 @@ def named_parameters(self, model: NMTModel):
    def state_dict(self, model: NMTModel, prefix='', keep_vars=False) -> Dict[str, Any]:
        module = self.get_module(model)
        destination: Dict[str, Any] = OrderedDict()
-       for name, sub_module in module._modules.items():
+       for name, sub_module in module.get_sub_modules().items():
            if name == 'adapters':
                # Adapters are stored separately
                continue
            sub_module.state_dict(destination=destination, prefix=prefix + name + '.', keep_vars=keep_vars)
        return destination

    def load_state_dict(self, model: NMTModel, state_dict: Dict[str, Any]):
        module = self.get_module(model)
        mismatch = module.load_state_dict(state_dict, strict=False)
        missing_keys = [
            name for name in mismatch.missing_keys
            if not name.startswith('layers.')
        ]
        return mismatch._replace(missing_keys=missing_keys)


@dataclass
class DistributedEncoderAttentionLayersBlock(DistributedAttentionLayersBlock):
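
The two load_state_dict overrides above lean on torch.nn.Module.load_state_dict(strict=False), which returns an IncompatibleKeys namedtuple; keys owned by other distributed components are then dropped from the missing-key report. A minimal self-contained sketch of that idea (ToyWrapper and the partial checkpoint are illustrative stand-ins, not part of this commit):

from torch import nn

class ToyWrapper(nn.Module):
    """Stand-in for a TransformerWrapper whose attn_layers / token_emb weights
    are checkpointed by other distributed components."""
    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(10, 4)
        self.attn_layers = nn.Linear(4, 4)
        self.to_logits = nn.Linear(4, 10)

wrapper = ToyWrapper()
# The checkpoint for this component only holds the wrapper-specific weights.
partial_state = {k: v for k, v in wrapper.state_dict().items() if k.startswith('to_logits.')}

mismatch = wrapper.load_state_dict(partial_state, strict=False)
# attn_layers.* and token_emb.* are expected to be absent, so they are filtered out.
missing_keys = [
    name for name in mismatch.missing_keys
    if not (name.startswith('attn_layers.') or name.startswith('token_emb.'))
]
print(mismatch._replace(missing_keys=missing_keys))  # missing_keys is now empty
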
mammoth/modules/adapters.py (7 additions)
@@ -225,3 +225,10 @@ def _inject_adapters(self):
    def forward(self, *args, **kwargs):
        self._inject_adapters()
        return super().forward(*args, **kwargs)

    def get_sub_modules(self):
        omit_submodules = {'layers'}
        return {
            name: sub_module for name, sub_module in self._modules.items()
            if name not in omit_submodules
        }
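
A short sketch of how the new get_sub_modules() hook combines with the component state_dict above: submodules named in omit_submodules ('layers') and the separately stored 'adapters' never end up in this component's checkpoint. The toy module below is illustrative, not mammoth's AdaptedAttentionLayers:

from collections import OrderedDict
from torch import nn

class ToyAttentionLayers(nn.Module):
    def __init__(self):
        super().__init__()
        self.final_norm = nn.LayerNorm(4)
        self.layers = nn.ModuleList([nn.Linear(4, 4)])          # omitted via get_sub_modules()
        self.adapters = nn.ModuleDict({'a': nn.Linear(4, 4)})   # stored by a separate component

    def get_sub_modules(self):
        omit_submodules = {'layers'}
        return {
            name: sub_module for name, sub_module in self._modules.items()
            if name not in omit_submodules
        }

module = ToyAttentionLayers()
destination = OrderedDict()
for name, sub_module in module.get_sub_modules().items():
    if name == 'adapters':
        continue  # adapters are stored separately
    sub_module.state_dict(destination=destination, prefix=name + '.', keep_vars=False)
print(list(destination))  # ['final_norm.weight', 'final_norm.bias']
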
mammoth/tests/test_task_queue_manager.py (86 additions, 1 deletion)
@@ -5,9 +5,10 @@
from mammoth.distributed import TaskQueueManager, WorldContext
from mammoth.distributed.components import (
    Side,
-   DistributedEncoderAttentionLayersBlock,
    DistributedDecoderAttentionLayersBlock,
    DistributedEmbedding,
+   DistributedEncoderAttentionLayersBlock,
+   DistributedTransformerWrapper,
    # DistributedAdapter,
    # DistributedAttentionBridge,
    # DistributedComponentAction,
@@ -169,6 +170,34 @@ def __call__(self, sorted_global_ranks):
layer_stack_index=0,
xcoder_id="yy",
),
DistributedTransformerWrapper(
global_ranks={0},
task_ids={'train_0_a-b'},
group=None,
task_id='train_0_a-b',
side=Side.decoder,
),
DistributedTransformerWrapper(
global_ranks={1},
task_ids={'train_1_c-d'},
group=None,
task_id='train_1_c-d',
side=Side.decoder,
),
DistributedTransformerWrapper(
global_ranks={1},
task_ids={'train_2_a-d'},
group=None,
task_id='train_2_a-d',
side=Side.decoder,
),
DistributedTransformerWrapper(
global_ranks={2},
task_ids={'train_3_e-b'},
group=None,
task_id='train_3_e-b',
side=Side.decoder,
),
DistributedEncoderAttentionLayersBlock(
global_ranks={0, 1},
task_ids={"train_2_a-d", "train_0_a-b"},
@@ -190,6 +219,34 @@ def __call__(self, sorted_global_ranks):
layer_stack_index=0,
xcoder_id="xxx",
),
DistributedTransformerWrapper(
global_ranks={0},
task_ids={'train_0_a-b'},
group=None,
task_id='train_0_a-b',
side=Side.encoder,
),
DistributedTransformerWrapper(
global_ranks={1},
task_ids={'train_1_c-d'},
group=None,
task_id='train_1_c-d',
side=Side.encoder,
),
DistributedTransformerWrapper(
global_ranks={1},
task_ids={'train_2_a-d'},
group=None,
task_id='train_2_a-d',
side=Side.encoder,
),
DistributedTransformerWrapper(
global_ranks={2},
task_ids={'train_3_e-b'},
group=None,
task_id='train_3_e-b',
side=Side.encoder,
),
DistributedEmbedding(
global_ranks={0, 1},
task_ids={"train_0_a-b", "train_2_a-d"},
@@ -251,6 +308,20 @@ def __call__(self, sorted_global_ranks):
layer_stack_index=0,
xcoder_id="yy",
),
DistributedTransformerWrapper(
global_ranks={1},
task_ids={'train_1_c-d'},
group=None,
task_id='train_1_c-d',
side=Side.decoder,
),
DistributedTransformerWrapper(
global_ranks={1},
task_ids={'train_2_a-d'},
group=None,
task_id='train_2_a-d',
side=Side.decoder,
),
DistributedEncoderAttentionLayersBlock(
global_ranks={0, 1},
task_ids={"train_2_a-d", "train_0_a-b"},
@@ -265,6 +336,20 @@ def __call__(self, sorted_global_ranks):
layer_stack_index=0,
xcoder_id="xx",
),
DistributedTransformerWrapper(
global_ranks={1},
task_ids={'train_1_c-d'},
group=None,
task_id='train_1_c-d',
side=Side.encoder,
),
DistributedTransformerWrapper(
global_ranks={1},
task_ids={'train_2_a-d'},
group=None,
task_id='train_2_a-d',
side=Side.encoder,
),
DistributedEmbedding(
global_ranks={0, 1},
task_ids={"train_0_a-b", "train_2_a-d"},
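
The bulk of the new test expectations is mechanical: every task now contributes one DistributedTransformerWrapper per side, living only on the rank that runs the task. A hedged sketch of that shape, using plain dicts instead of the real dataclass and the task-to-rank mapping read off the entries above:

task_to_rank = {'train_0_a-b': 0, 'train_1_c-d': 1, 'train_2_a-d': 1, 'train_3_e-b': 2}

expected_wrappers = [
    dict(global_ranks={rank}, task_ids={task}, group=None, task_id=task, side=side)
    for side in ('decoder', 'encoder')
    for task, rank in task_to_rank.items()
]
assert len(expected_wrappers) == 8  # 4 tasks x 2 sides, matching the entries above
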
mammoth/trainer.py (6 additions)
@@ -393,6 +393,10 @@ def validate(self, valid_iter, moving_average=None, task=None):
src_mask = batch.src.mask
decoder_input = batch.tgt.tensor[:-1]
target = batch.tgt.tensor[1:]
# if self.norm_method == "tokens":
# normalization = batch.tgt.mask.sum().item()
# else:
# normalization = batch.batch_size

with torch.cuda.amp.autocast(enabled=self.optim.amp):
# F-prop through the model.
@@ -410,6 +414,7 @@
rearrange(logits, 't b i -> (t b) i'),
rearrange(target, 't b 1 -> (t b)'),
)
# loss /= normalization

# Update statistics.
padding_idx = self.loss_functions[metadata.tgt_lang].ignore_index
@@ -490,6 +495,7 @@ def _gradient_accumulation(
if loss is not None:
if torch.isnan(loss):
raise Exception('Loss blowout')
# loss /= normalization
self.optim.backward(loss)

if self.report_training_accuracy:
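
The commented-out lines in validate() and _gradient_accumulation() sketch a loss normalization that is not enabled in this commit: divide the summed loss by the number of real target tokens (norm_method == "tokens") or by the batch size. A standalone sketch of that idea, assuming the target mask is a boolean tensor over non-padding tokens (the helper name and exact mask semantics are assumptions, not mammoth API):

import torch

def normalize_loss(loss: torch.Tensor, tgt_mask: torch.Tensor, batch_size: int,
                   norm_method: str = "tokens") -> torch.Tensor:
    """Divide a summed loss by the token count or by the batch size."""
    if norm_method == "tokens":
        normalization = int(tgt_mask.sum().item())
    else:
        normalization = batch_size
    return loss / max(normalization, 1)  # guard against an empty batch

loss = torch.tensor(12.0)
tgt_mask = torch.tensor([[1, 1, 0], [1, 1, 1]], dtype=torch.bool)  # 5 non-padding tokens
print(normalize_loss(loss, tgt_mask, batch_size=2))  # tensor(2.4000)
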
mammoth/utils/optimizers.py (10 additions, 1 deletion)
@@ -158,6 +158,7 @@ def __init__(
        self.grad_scaler = grad_scaler
        self._training_step = 1
        self._decay_step = 1
        self._n_clips = 0

    @property
    def param_groups(self):
@@ -194,6 +195,7 @@ def load_state_dict(self, state_dict):
    def zero_grad(self):
        """Zero the gradients of optimized parameters."""
        self._optimizer.zero_grad()
        self._n_clips = 0

    def step(self):
        """Update the model parameters based on current gradients. """
@@ -202,7 +204,11 @@ def step(self):
        for group in self._optimizer.param_groups:
            group['lr'] = learning_rate
            if self._max_grad_norm > 0:
-               clip_grad_norm_(group['params'], self._max_grad_norm)
+               orig_norm = clip_grad_norm_(group['params'], self._max_grad_norm)
+               if orig_norm.item() > self._max_grad_norm:
+                   # FIXME: debug. Count and log instead
+                   print(f'Clipping {orig_norm} -> {self._max_grad_norm}')
+                   self._n_clips += 1

        if self.grad_scaler is not None:
            self.grad_scaler.step(self._optimizer)
@@ -396,6 +402,9 @@ def amp(self):
"""True if use torch amp mix precision training."""
return self.grad_scaler is not None

def n_clips(self):
return sum(optimizer._n_clips for optimizer in self.suboptimizers.values())


# Code below is an implementation of https://arxiv.org/pdf/1804.04235.pdf
# inspired but modified from https://github.com/DeadAt0m/adafactor-pytorch
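
clip_grad_norm_ returns the total gradient norm measured before clipping, which is what makes the clip counter above possible; the FIXME suggests counting and logging rather than printing. A small self-contained sketch of that pattern with a plain torch optimizer (not mammoth's MultipleOptimizer):

import torch
from torch.nn.utils import clip_grad_norm_

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
max_grad_norm = 1.0
n_clips = 0

for step in range(3):
    optimizer.zero_grad()
    loss = model(torch.randn(8, 4)).pow(2).sum()
    loss.backward()
    orig_norm = clip_grad_norm_(model.parameters(), max_grad_norm)
    if orig_norm.item() > max_grad_norm:
        n_clips += 1  # count instead of printing, as the FIXME suggests
    optimizer.step()

print(f'clipped {n_clips} of 3 steps')
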
tools/generate_synth_data.py (1 addition, 1 deletion)
@@ -355,7 +355,7 @@ def generate_from_config(
@click.command(context_settings={'show_default': True})
@click.option('--config_path', type=Path, required=True)
@click.option('--vocab_size', type=int, default=300)
-@click.option('--num_examples_train', type=int, default=10000)
+@click.option('--num_examples_train', type=int, default=1000000)
@click.option('--num_examples_test', type=int, default=100)
@click.option('--start_seed', type=int, default=1)
@click.option(
