Commit c65c519
Merge remote-tracking branch 'upstream/main'
dchourasia committed Dec 4, 2024
2 parents: 8ff5b0d + 7df3416
Showing 57 changed files with 1,434 additions and 832 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -638,7 +638,7 @@ callbacks=cb_,
 dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_

 # Argument names that match this expression will be ignored.
-ignored-argument-names=_.*|^ignored_|^unused_
+ignored-argument-names=_.*|^ignored_|^unused_|kwargs

 # Tells whether we should check for unused import in __init__ files.
 init-import=no
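For context: pylint's ignored-argument-names is a regex of parameter names that the unused-argument check (W0613) skips, so adding kwargs lets functions accept a kwargs parameter purely for interface compatibility without triggering warnings. A minimal illustration (hypothetical handler, not from this repo):

def apply_handler(element, tokenizer, **kwargs):
    # kwargs is intentionally unused; it only keeps the handler signature
    # uniform. With |kwargs in ignored-argument-names above, pylint no
    # longer flags it as an unused argument.
    return tokenizer(element["text"])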
6 changes: 3 additions & 3 deletions tests/acceleration/test_acceleration_framework.py
@@ -54,13 +54,13 @@
 from tuning.utils.import_utils import is_fms_accelerate_available

 # for some reason the CI will raise an import error if we try to import
-# these from tests.data
+# these from tests.artifacts.testdata
 TWITTER_COMPLAINTS_JSON_FORMAT = os.path.join(
-    os.path.dirname(__file__), "../data/twitter_complaints_json.json"
+    os.path.dirname(__file__), "../artifacts/testdata/twitter_complaints_json.json"
 )
 TWITTER_COMPLAINTS_TOKENIZED = os.path.join(
     os.path.dirname(__file__),
-    "../data/twitter_complaints_tokenized_with_maykeye_tinyllama_v0.json",
+    "../artifacts/testdata/twitter_complaints_tokenized_with_maykeye_tinyllama_v0.json",
 )

# pylint: disable=import-error
30 changes: 30 additions & 0 deletions tests/artifacts/predefined_data_configs/__init__.py
@@ -0,0 +1,30 @@
# Copyright The FMS HF Tuning Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helpful datasets for configuring individual unit tests.
"""
# Standard
import os

### Constants used for data
PREDEFINED_DATA_CONFIGS = os.path.join(os.path.dirname(__file__))
APPLY_CUSTOM_TEMPLATE_YAML = os.path.join(
    PREDEFINED_DATA_CONFIGS, "apply_custom_template.yaml"
)
PRETOKENIZE_JSON_DATA_YAML = os.path.join(
    PREDEFINED_DATA_CONFIGS, "pretokenized_json_data.yaml"
)
TOKENIZE_AND_APPLY_INPUT_MASKING_YAML = os.path.join(
    PREDEFINED_DATA_CONFIGS, "tokenize_and_apply_input_masking.yaml"
)
14 changes: 14 additions & 0 deletions tests/artifacts/predefined_data_configs/apply_custom_template.yaml
@@ -0,0 +1,14 @@
dataprocessor:
  type: default
datasets:
  - name: apply_custom_data_template
    data_paths:
      - "FILE_PATH"
    data_handlers:
      - name: apply_custom_data_formatting_template
        arguments:
          remove_columns: all
          batched: false
          fn_kwargs:
            dataset_text_field: "dataset_text_field"
            dataset_template: "dataset_template"
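The "FILE_PATH" entries are placeholders that tests are expected to swap for real dataset paths before handing the config to the trainer. A minimal sketch of that substitution, assuming a plain PyYAML round-trip (the temp-file helper here is illustrative, not the repo's actual test utility):

# Standard
import tempfile

# Third Party
import yaml

# First Party
from tests.artifacts.predefined_data_configs import APPLY_CUSTOM_TEMPLATE_YAML


def materialize_data_config(data_path: str) -> str:
    # Load the predefined config, point it at a concrete dataset file,
    # and write the result out for the training run to consume.
    with open(APPLY_CUSTOM_TEMPLATE_YAML, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    config["datasets"][0]["data_paths"] = [data_path]
    out = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
    yaml.safe_dump(config, out)
    out.close()
    return out.name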
6 changes: 6 additions & 0 deletions tests/artifacts/predefined_data_configs/pretokenized_json_data.yaml
@@ -0,0 +1,6 @@
dataprocessor:
  type: default
datasets:
  - name: pretokenized_dataset
    data_paths:
      - "FILE_PATH"
14 changes: 14 additions & 0 deletions tests/artifacts/predefined_data_configs/tokenize_and_apply_input_masking.yaml
@@ -0,0 +1,14 @@
dataprocessor:
  type: default
datasets:
  - name: text_dataset_input_output_masking
    data_paths:
      - "FILE_PATH"
    data_handlers:
      - name: tokenize_and_apply_input_masking
        arguments:
          remove_columns: all
          batched: false
          fn_kwargs:
            input_field: "INPUT"
            output_field: "OUTPUT"
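Judging by the handler name and fn_kwargs, tokenize_and_apply_input_masking presumably concatenates the INPUT and OUTPUT fields, tokenizes the result, and masks the input tokens out of the labels so the loss is computed only on the output. A rough sketch under those assumptions (not the repo's actual implementation; the real handler lives in tuning.data.data_handlers, and combine_sequence is the helper tested further below):

def tokenize_and_apply_input_masking(
    element, tokenizer, input_field, output_field, **kwargs
):
    # Combine prompt and response into a single training sequence.
    input_text = element[input_field]
    combined = combine_sequence(input_text, element[output_field], tokenizer.eos_token)
    tokenized = tokenizer(combined)
    # Replace the prompt portion of the labels with -100 so it is
    # ignored by the cross-entropy loss.
    prompt_len = len(tokenizer(input_text)["input_ids"])
    labels = [-100] * prompt_len + tokenized["input_ids"][prompt_len:]
    return {**tokenized, "labels": labels}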
7 files renamed without changes.
2 changes: 1 addition & 1 deletion tests/build/test_launch_script.py
@@ -26,7 +26,7 @@
 # First Party
 from build.accelerate_launch import main
 from build.utils import serialize_args, get_highest_checkpoint
-from tests.data import TWITTER_COMPLAINTS_DATA_JSONL
+from tests.artifacts.testdata import TWITTER_COMPLAINTS_DATA_JSONL
 from tuning.utils.error_logging import (
     USER_ERROR_EXIT_CODE,
     INTERNAL_ERROR_EXIT_CODE,
110 changes: 110 additions & 0 deletions tests/data/test_data_handlers.py
@@ -0,0 +1,110 @@
# Copyright The FMS HF Tuning Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# SPDX-License-Identifier: Apache-2.0
# https://spdx.dev/learn/handling-license-info/

# Third Party
from transformers import AutoTokenizer
import datasets
import pytest

# First Party
from tests.artifacts.testdata import MODEL_NAME, TWITTER_COMPLAINTS_DATA_JSONL

# Local
from tuning.data.data_handlers import (
    apply_custom_data_formatting_template,
    combine_sequence,
)


def test_apply_custom_formatting_template():
    json_dataset = datasets.load_dataset(
        "json", data_files=TWITTER_COMPLAINTS_DATA_JSONL
    )
    template = "### Input: {{Tweet text}} \n\n ### Response: {{text_label}}"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    formatted_dataset_field = "formatted_data_field"
    formatted_dataset = json_dataset.map(
        apply_custom_data_formatting_template,
        fn_kwargs={
            "tokenizer": tokenizer,
            "dataset_text_field": formatted_dataset_field,
            "template": template,
        },
    )
    # First response from the data file that is read.
    expected_response = (
        "### Input: @HMRCcustomers No this is my first job"
        + " \n\n ### Response: no complaint"
        + tokenizer.eos_token
    )

    # A new dataset_text_field is created in the Dataset.
    assert formatted_dataset_field in formatted_dataset["train"][0]
    assert formatted_dataset["train"][0][formatted_dataset_field] == expected_response


def test_apply_custom_formatting_template_gives_error_with_wrong_keys():
    """Tests that the formatting function raises a KeyError when the
    template references keys missing from the dataset."""
    json_dataset = datasets.load_dataset(
        "json", data_files=TWITTER_COMPLAINTS_DATA_JSONL
    )
    template = "### Input: {{not found}} \n\n ### Response: {{text_label}}"
    formatted_dataset_field = "formatted_data_field"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    with pytest.raises(KeyError):
        json_dataset.map(
            apply_custom_data_formatting_template,
            fn_kwargs={
                "tokenizer": tokenizer,
                "dataset_text_field": formatted_dataset_field,
                "template": template,
            },
        )


@pytest.mark.parametrize(
    "input_element,output_element,expected_res",
    [
        ("foo ", "bar", "foo bar"),
        ("foo\n", "bar", "foo\nbar"),
        ("foo\t", "bar", "foo\tbar"),
        ("foo", "bar", "foo bar"),
    ],
)
def test_combine_sequence(input_element, output_element, expected_res):
    """Ensure that input / output elements are combined with correct whitespace handling."""
    comb_seq = combine_sequence(input_element, output_element)
    assert isinstance(comb_seq, str)
    assert comb_seq == expected_res


@pytest.mark.parametrize(
    "input_element,output_element,expected_res",
    [
        ("foo ", "bar", "foo bar"),
        ("foo\n", "bar", "foo\nbar"),
        ("foo\t", "bar", "foo\tbar"),
        ("foo", "bar", "foo bar"),
    ],
)
def test_combine_sequence_adds_eos(input_element, output_element, expected_res):
    """Ensure that the EOS token is appended when input / output elements are combined."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    comb_seq = combine_sequence(input_element, output_element, tokenizer.eos_token)
    expected_res += tokenizer.eos_token
    assert isinstance(comb_seq, str)
    assert comb_seq == expected_res
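
For reference, the whitespace rule these cases pin down is small enough to sketch; a minimal combine_sequence consistent with the expectations above (the actual implementation in tuning.data.data_handlers may differ in detail):

def combine_sequence(input_element: str, output_element: str, eos_token: str = "") -> str:
    # Add a separating space only when the input does not already end
    # in whitespace, then append the (possibly empty) EOS token.
    if not input_element or input_element[-1].isspace():
        return input_element + output_element + eos_token
    return input_element + " " + output_element + eos_token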
