Commit

Merge branch 'main' into ifuUpdate127
Cemberk authored Jan 31, 2025
2 parents 7e56643 + be867d5 commit 8105539
Showing 12 changed files with 21 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/self-push-amd-mi210-caller.yml
@@ -25,4 +25,4 @@ jobs:
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi210
-secrets: inherit
+secrets: inherit
2 changes: 1 addition & 1 deletion .github/workflows/self-push-amd-mi250-caller.yml
@@ -25,4 +25,4 @@ jobs:
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi250
-secrets: inherit
+secrets: inherit
2 changes: 1 addition & 1 deletion src/transformers/configuration_utils.py
@@ -187,7 +187,7 @@ class PretrainedConfig(PushToHubMixin):
Whether the model should use legacy TensorFlow losses. Legacy losses have variable output shapes and may
not be XLA-compatible. This option is here for backward compatibility and will be removed in Transformers
v5.
-loss_type (`str`, *optional*):
+loss_type (`str`, *optional*):
The type of loss that the model should use. It should be in `LOSS_MAPPING`'s keys, otherwise the loss will
be automatically inferred from the model architecture.
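
For context, loss_type is just an attribute on the config object. A minimal usage sketch (assuming "ForCausalLM" is one of LOSS_MAPPING's keys in the installed version; the exact key set lives in transformers' loss utilities):

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("gpt2")
    # Must name a key in LOSS_MAPPING; left unset, the loss is inferred
    # from the model architecture as the docstring above describes.
    config.loss_type = "ForCausalLM"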
3 changes: 3 additions & 0 deletions src/transformers/trainer.py
@@ -2516,6 +2516,9 @@ def _inner_training_loop(
if (self.state.global_step == args.stable_train_warmup_steps):
    start_train_stable_time = time.time()

+if (self.state.global_step == args.stable_train_warmup_steps):
+    start_train_stable_time = time.time()
+
if self.args.include_num_input_tokens_seen:
main_input_name = getattr(self.model, "main_input_name", "input_ids")
if main_input_name not in inputs:
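
The three added lines stamp the wall-clock time once global_step reaches args.stable_train_warmup_steps, so steady-state throughput can be measured without the warmup iterations. A self-contained sketch of the pattern (train_one_step and the step counts are placeholders, not Trainer internals):

    import time

    warmup_steps = 5    # stands in for args.stable_train_warmup_steps
    total_steps = 100
    start_stable = None

    def train_one_step():
        pass  # stand-in for the real forward/backward/optimizer step

    for step in range(total_steps):
        if step == warmup_steps:
            start_stable = time.time()  # start timing only after warmup
        train_one_step()

    stable_time = time.time() - start_stable
    print(f"steady-state time for {total_steps - warmup_steps} steps: {stable_time:.3f}s")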
4 changes: 2 additions & 2 deletions tests/models/dbrx/test_modeling_dbrx.py
@@ -327,12 +327,12 @@ class DbrxModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
test_headmasking = False
test_pruning = False

-@skipIfRocm(arch=['gfx1201','gfx90a'])
+@skipIfRocm(arch=['gfx1201','gfx90a','gfx942'])
def test_generate_with_static_cache(self):
super().test_generate_with_static_cache()
pass

-@skipIfRocm(arch=['gfx1201','gfx90a'])
+@skipIfRocm(arch=['gfx1201','gfx90a','gfx942'])
def test_generate_from_inputs_embeds_with_static_cache(self):
super().test_generate_from_inputs_embeds_with_static_cache()
pass
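
Both tests now also skip on gfx942 (MI300-series). skipIfRocm is a helper from the ROCm fork's test utilities, not stock transformers; a hypothetical sketch of how such an arch-conditional skip decorator could look (the arch-detection helper is assumed, not the fork's actual implementation):

    import functools
    import unittest

    def _current_rocm_arch():
        # Placeholder: a real version might read
        # torch.cuda.get_device_properties(0).gcnArchName on ROCm builds.
        return "gfx942"

    def skipIfRocm(func=None, arch=None):
        # Supports both the bare @skipIfRocm form and @skipIfRocm(arch=[...]).
        archs = [arch] if isinstance(arch, str) else (arch or [])

        def decorator(fn):
            @functools.wraps(fn)
            def wrapper(*args, **kwargs):
                if not archs or _current_rocm_arch() in archs:
                    raise unittest.SkipTest(f"skipped on ROCm arch {archs or '(any)'}")
                return fn(*args, **kwargs)
            return wrapper

        return decorator(func) if callable(func) else decorator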
2 changes: 1 addition & 1 deletion tests/models/falcon_mamba/test_modeling_falcon_mamba.py
@@ -294,7 +294,7 @@ def test_config(self):
self.config_tester.run_common_tests()

@require_torch_multi_gpu
-@skipIfRocm(arch=['gfx1201','gfx90a'])
+@skipIfRocm(arch=['gfx1201','gfx90a','gfx942'])
def test_multi_gpu_data_parallel_forward(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

2 changes: 1 addition & 1 deletion tests/models/gpt_neox/test_modeling_gpt_neox.py
@@ -287,7 +287,7 @@ class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
def test_flex_attention_with_grads(self):
super().test_flex_attention_with_grads()

-@skipIfRocm(arch=['gfx1201','gfx90a'])
+@skipIfRocm(arch=['gfx1201','gfx90a','gfx942'])
def test_generate_with_static_cache(self):
super().test_generate_with_static_cache()
pass
4 changes: 2 additions & 2 deletions tests/models/idefics/test_modeling_idefics.py
@@ -597,12 +597,12 @@ class IdeficsForVisionText2TextTest(IdeficsModelTest, GenerationTesterMixin, uni
all_generative_model_classes = (IdeficsForVisionText2Text,) if is_torch_available() else ()


-@skipIfRocm(arch=['gfx1201','gfx90a'])
+@skipIfRocm(arch=['gfx1201','gfx90a','gfx942'])
def test_generate_from_inputs_embeds_with_static_cache(self):
super().test_generate_from_inputs_embeds_with_static_cache()
pass

-@skipIfRocm(arch=['gfx1201','gfx90a'])
+@skipIfRocm(arch=['gfx1201','gfx90a','gfx942'])
def test_generate_with_static_cache(self):
super().test_generate_with_static_cache()
pass
4 changes: 2 additions & 2 deletions tests/models/mixtral/test_modeling_mixtral.py
@@ -320,12 +320,12 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
fx_compatible = False # Broken by attention refactor cc @Cyrilvallez

def test_generate_from_inputs_embeds_with_static_cache(self):
-if rocmUtils.is_rocm_skippable(arch='gfx90a'):
+if rocmUtils.is_rocm_skippable(arch=['gfx90a','gfx942']):
torch._dynamo.config.capture_dynamic_output_shape_ops = True
super().test_generate_from_inputs_embeds_with_static_cache()

def test_generate_with_static_cache(self):
-if rocmUtils.is_rocm_skippable(arch='gfx90a'):
+if rocmUtils.is_rocm_skippable(arch=['gfx90a','gfx942']):
torch._dynamo.config.capture_dynamic_output_shape_ops = True
super().test_generate_with_static_cache()
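
Rather than skipping outright, the Mixtral tests (and the OlmoE tests below) enable a torch._dynamo option on the listed ROCm archs and still run: capture_dynamic_output_shape_ops lets torch.compile trace ops whose output shapes are only known at runtime, which MoE token routing produces. The pattern in isolation (on_listed_arch stands in for rocmUtils.is_rocm_skippable, a fork-side helper):

    import torch

    def run_with_rocm_workaround(test_fn, on_listed_arch: bool):
        if on_listed_arch:
            # Let torch.compile graph-capture ops with data-dependent
            # output shapes instead of raising during tracing.
            torch._dynamo.config.capture_dynamic_output_shape_ops = True
        test_fn()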

4 changes: 2 additions & 2 deletions tests/models/olmoe/test_modeling_olmoe.py
@@ -306,13 +306,13 @@ class OlmoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
model_split_percents = [0.5, 0.7, 0.8]

def test_generate_with_static_cache(self):
-if rocmUtils.is_rocm_skippable(arch=['gfx1201','gfx90a']):
+if rocmUtils.is_rocm_skippable(arch=['gfx1201','gfx90a','gfx942']):
torch._dynamo.config.capture_dynamic_output_shape_ops = True
super().test_generate_with_static_cache()
pass

def test_generate_from_inputs_embeds_with_static_cache(self):
-if rocmUtils.is_rocm_skippable(arch=['gfx1201','gfx90a']):
+if rocmUtils.is_rocm_skippable(arch=['gfx1201','gfx90a','gfx942']):
torch._dynamo.config.capture_dynamic_output_shape_ops = True
super().test_generate_from_inputs_embeds_with_static_cache()
pass
6 changes: 3 additions & 3 deletions tests/models/roberta/test_modeling_roberta.py
@@ -397,17 +397,17 @@ class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
fx_compatible = True
model_split_percents = [0.5, 0.8, 0.9]

-@skipIfRocm(arch=['gfx1201','gfx90a'])
+@skipIfRocm(arch=['gfx1201','gfx90a','gfx942'])
def test_cpu_offload(self):
super().test_cpu_offload()
pass

-@skipIfRocm(arch=['gfx1201','gfx90a'])
+@skipIfRocm(arch=['gfx1201','gfx90a','gfx942'])
def test_disk_offload_bin(self):
super().test_disk_offload_bin()
pass

-@skipIfRocm(arch=['gfx1201','gfx90a'])
+@skipIfRocm(arch=['gfx1201','gfx90a','gfx942'])
def test_disk_offload_safetensors(self):
super().test_disk_offload_safetensors()
pass
2 changes: 2 additions & 0 deletions tests/test_modeling_common.py
@@ -3052,6 +3052,8 @@ def test_inputs_embeds_matches_input_ids(self):
)[0]
torch.testing.assert_close(out_embeds, out_ids)


@skipIfRocm
@require_non_xpu
@skipIfRocm
@require_torch_multi_gpu
