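Fix deprecation warning (replace `tokenizer.as_target_tokenizer()` with the `text_target` argument) in the example and unit tests, and fix a pytest issue by making `_data_gen` a static method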

Helsinki-NLP · Jun 26, 2024 · 071cd1a
1 parent b586eb9
commit 071cd1a
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 12 deletions.
diff --git a/examples/marian_mt.py b/examples/marian_mt.py
@@ -50,11 +50,7 @@ def tokenize_function(example):
         inputs = [pair['de'] for pair in example['translation']]
         targets = [pair['nl'] for pair in example['translation']]
         model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
-
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_target_length, truncation=True)
-
+        labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 

diff --git a/tests/test_swag_bert.py b/tests/test_swag_bert.py
@@ -99,7 +99,8 @@ def test_pretrained_bert_tiny_classifier_test(self):
         logging.debug(out)
         self.assertEqual(out.logits.shape, (1, num_labels))
 
-    def _data_gen(self):
+    @staticmethod
+    def _data_gen():
         yield {"text": "Hello world", "label": 0}
         yield {"text": "Just some swaggering", "label": 1}
         yield {"text": "Have a good day", "label": 0}

diff --git a/tests/test_swag_marian.py b/tests/test_swag_marian.py
@@ -106,7 +106,8 @@ def test_pretrained_marian_tiny_test(self):
         self.assertGreater(len(output), 0)
         self.assertEqual(base_output, output)
 
-    def _data_gen(self):
+    @staticmethod
+    def _data_gen():
         yield {"source": "India and Japan prime ministers meet in Tokyo",
                "target": "Die Premierminister Indiens und Japans trafen sich in Tokio."}
         yield {"source": "High on the agenda are plans for greater nuclear co-operation.",
@@ -136,11 +137,7 @@ def tokenize_function(example):
             inputs = example['source']
             targets = example['target']
             model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
-
-            # Setup the tokenizer for targets
-            with tokenizer.as_target_tokenizer():
-                labels = tokenizer(targets, max_length=max_target_length, truncation=True)
-
+            labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
             model_inputs["labels"] = labels["input_ids"]
             return model_inputs