From 113fb428f4934456138053bbed55d2527e432c0b Mon Sep 17 00:00:00 2001
From: Kay Robbins <1189050+VisLab@users.noreply.github.com>
Date: Thu, 16 Jan 2025 11:23:53 -0600
Subject: [PATCH] Fixed the duplicate tag tests by adding a DuplicateChecker
 class with hashes

---
 hed/models/hed_tag.py                      | 10 ++--
 hed/models/model_constants.py              | 41 +++++++--------
 hed/validator/onset_validator.py           |  6 +--
 hed/validator/spreadsheet_validator.py     | 56 ++++++++++++++++++++-
 hed/validator/util/dup_util.py             | 58 ++++++++++++++++++++++
 hed/validator/util/group_util.py           | 33 ++----------
 spec_tests/test_errors.py                  | 10 ++--
 tests/validator/test_onset_validator.py    |  6 +--
 tests/validator/test_sidecar_validator.py  |  2 +-
 tests/validator/test_tag_validator.py      | 34 ++++++++-----
 tests/validator/test_tag_validator_base.py |  2 +-
 11 files changed, 178 insertions(+), 80 deletions(-)
 create mode 100644 hed/validator/util/dup_util.py

diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py
index 69ae5e58..405288c0 100644
--- a/hed/models/hed_tag.py
+++ b/hed/models/hed_tag.py
@@ -618,12 +618,14 @@ def replace_placeholder(self, placeholder_value):
             else:
                 self._tag = self.tag.replace("#", placeholder_value)
 
-    def __hash__(self):
+    def get_normalized_str(self):
         if self._schema_entry:
-            return hash(
-                self._namespace + self._schema_entry.short_tag_name.casefold() + self._extension_value.casefold())
+            return self._namespace + self._schema_entry.short_tag_name.casefold() + self._extension_value.casefold()
         else:
-            return hash(self.casefold())
+            return self.casefold()
+
+    def __hash__(self):
+        return hash(self.get_normalized_str())
 
     def __eq__(self, other):
         if self is other:
diff --git a/hed/models/model_constants.py b/hed/models/model_constants.py
index bbc249c0..a1407b6f 100644
--- a/hed/models/model_constants.py
+++ b/hed/models/model_constants.py
@@ -1,20 +1,21 @@
-""" Defined constants for definitions, def labels, and expanded labels. """
-
-
-class DefTagNames:
-    """ Source names for definitions, def labels, and expanded labels. """
-
-    DEF_KEY = 'Def'
-    DEF_EXPAND_KEY = 'Def-expand'
-    DEFINITION_KEY = "Definition"
-
-    ONSET_KEY = "Onset"
-    OFFSET_KEY = "Offset"
-    INSET_KEY = "Inset"
-    DURATION_KEY = "Duration"
-    DELAY_KEY = "Delay"
-
-    TEMPORAL_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY}
-    DURATION_KEYS = {DURATION_KEY, DELAY_KEY}
-
-    ALL_TIME_KEYS = TEMPORAL_KEYS.union(DURATION_KEYS)
+""" Defined constants for definitions, def labels, and expanded labels. """
+
+
+class DefTagNames:
+    """ Source names for definitions, def labels, and expanded labels. """
+
+    DEF_KEY = 'Def'
+    DEF_EXPAND_KEY = 'Def-expand'
+    DEFINITION_KEY = "Definition"
+
+    ONSET_KEY = "Onset"
+    OFFSET_KEY = "Offset"
+    INSET_KEY = "Inset"
+    DURATION_KEY = "Duration"
+    DELAY_KEY = "Delay"
+
+    TEMPORAL_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY}
+    DURATION_KEYS = {DURATION_KEY, DELAY_KEY}
+
+    ALL_TIME_KEYS = TEMPORAL_KEYS.union(DURATION_KEYS)
+    TIMELINE_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY, DELAY_KEY}
diff --git a/hed/validator/onset_validator.py b/hed/validator/onset_validator.py
index 1d7a04dd..33eaf2a5 100644
--- a/hed/validator/onset_validator.py
+++ b/hed/validator/onset_validator.py
@@ -64,15 +64,15 @@ def _handle_onset_or_offset(self, def_tag, onset_offset_tag):
 
     @staticmethod
     def check_for_banned_tags(hed_string):
-        """ Returns an issue for every tag found from the banned list
+        """ Returns an issue for every tag found from the banned list (for files without onset column).
 
         Parameters:
-            hed_string(HedString): the string to check
+            hed_string(HedString): The string to check.
 
         Returns:
             list: The validation issues associated with the characters. Each issue is dictionary.
         """
-        banned_tag_list = DefTagNames.ALL_TIME_KEYS
+        banned_tag_list = DefTagNames.TIMELINE_KEYS
         issues = []
         for tag in hed_string.get_all_tags():
             if tag.short_base_tag in banned_tag_list:
diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py
index 9fd47443..0c75cd67 100644
--- a/hed/validator/spreadsheet_validator.py
+++ b/hed/validator/spreadsheet_validator.py
@@ -1,6 +1,7 @@
 """ Validates spreadsheet tabular data. """
 import copy
 import pandas as pd
+import math
 from hed.models.base_input import BaseInput
 from hed.errors.error_types import ColumnErrors, ErrorContext, ValidationErrors
 from hed.errors.error_reporter import ErrorHandler
@@ -16,6 +17,8 @@
 
 
 class SpreadsheetValidator:
+    ONSET_TOLERANCE = 10-7
+
     def __init__(self, hed_schema):
         """
         Constructor for the SpreadsheetValidator class.
@@ -79,6 +82,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None):
         issues += self._run_checks(df, error_handler=error_handler, row_adj=row_adj, onset_mask=onset_mask)
         if self._onset_validator:
             issues += self._run_onset_checks(onsets, error_handler=error_handler, row_adj=row_adj)
+            issues += self._recheck_duplicates(onsets, error_handler=error_handler, row_adj=row_adj)
         error_handler.pop_error_context()
 
         issues = sort_issues(issues)
@@ -118,6 +122,7 @@ def _run_checks(self, hed_df, error_handler, row_adj, onset_mask=None):
                 error_handler.pop_error_context()  # Row
                 continue
 
+            # Continue on if not a timeline file
             row_string = HedString.from_hed_strings(row_strings)
 
             if row_string:
@@ -149,8 +154,55 @@ def _run_onset_checks(self, onset_filtered, error_handler, row_adj):
             error_handler.pop_error_context()  # Row
         return issues
 
-    def _run_onset_nan_checks(self, onsets, error_handler, row_adj):
-        return
+    def _recheck_duplicates(self, onset_filtered, error_handler, row_adj):
+        issues = []
+        for i in range(len(onset_filtered) - 1):
+            current_row = onset_filtered.iloc[i]
+            next_row = onset_filtered.iloc[i + 1]
+
+            # Skip if the HED column is empty or there was already an error
+            if not current_row["HED"] or \
+                (current_row["original_index"] in self.invalid_original_rows) or \
+                (not self._is_within_tolerance(next_row["onset"], current_row["onset"])):
+                continue
+
+            # At least two rows have been merged with their onsets recognized as the same.
+            error_handler.push_error_context(ErrorContext.ROW, current_row.original_index + row_adj)
+            row_string = HedString(current_row.HED, self._schema, self._hed_validator._def_validator)
+            error_handler.push_error_context(ErrorContext.HED_STRING, row_string)
+            new_column_issues = self._hed_validator.run_full_string_checks(row_string)
+            error_handler.add_context_and_filter(new_column_issues)
+            error_handler.pop_error_context()  # HedString
+            issues += new_column_issues
+            error_handler.pop_error_context()  # Row
+
+        return issues
+
+    def _is_within_tolerance(self, onset1, onset2):
+        """
+        Checks if two onset strings are within the specified tolerance.
+
+        Parameters:
+            onset1 (str): The first onset value as a string.
+            onset2 (str): The second onset value as a string.
+
+        Returns:
+            bool: True if the values are within tolerance and valid, False otherwise.
+        """
+        try:
+            # Convert to floats
+            onset1 = float(onset1)
+            onset2 = float(onset2)
+
+            # Check if both values are finite
+            if not (math.isfinite(onset1) and math.isfinite(onset2)):
+                return False
+
+            # Check if the difference is within tolerance
+            return abs(onset1 - onset2) <= self.ONSET_TOLERANCE
+        except ValueError:
+            # Return False if either value is not convertible to a float
+            return False
 
     def _validate_column_structure(self, base_input, error_handler, row_adj):
         """
diff --git a/hed/validator/util/dup_util.py b/hed/validator/util/dup_util.py
new file mode 100644
index 00000000..fd0209b4
--- /dev/null
+++ b/hed/validator/util/dup_util.py
@@ -0,0 +1,58 @@
+from hed.errors.error_reporter import ErrorHandler
+from hed.models.hed_tag import HedTag
+from hed.errors.error_types import ValidationErrors
+
+
+class DuplicateChecker:
+
+    def __init__(self, hed_schema):
+        """ Constructor for GroupValidator
+
+        Parameters:
+            hed_schema (HedSchema): A HedSchema object.
+        """
+        if hed_schema is None:
+            raise ValueError("HedSchema required for validation")
+        self._hed_schema = hed_schema
+        self.issues = []
+
+    def check_for_duplicates(self, original_group):
+        self.issues = []
+        self._get_recursive_hash(original_group)
+        return self.issues
+
+    def get_hash(self, original_group):
+        self.issues = []
+        duplication_hash = self._get_recursive_hash(original_group)
+        return duplication_hash
+
+    def _get_recursive_hash(self, group):
+        if len(self.issues) > 0:
+            return None
+        group_hashes = set()
+        for child in group.children:
+            if isinstance(child, HedTag):
+                this_hash = hash(child)
+            else:
+                this_hash = self._get_recursive_hash(child)
+            if len(self.issues) > 0 or this_hash is None:
+                return None
+            if this_hash in group_hashes:
+                self.issues += self._get_duplication_error(child)
+                return None
+            group_hashes.add(this_hash)
+        return hash(frozenset(group_hashes))
+
+    @staticmethod
+    def _get_duplication_error(child):
+        if isinstance(child, HedTag):
+            return ErrorHandler.format_error(ValidationErrors.HED_TAG_REPEATED, child)
+        else:
+            found_group = child
+            base_steps_up = 0
+            while isinstance(found_group, list):
+                found_group = found_group[0]
+                base_steps_up += 1
+            for _ in range(base_steps_up):
+                found_group = found_group._parent
+            return ErrorHandler.format_error(ValidationErrors.HED_TAG_REPEATED_GROUP, found_group)
diff --git a/hed/validator/util/group_util.py b/hed/validator/util/group_util.py
index d22f65da..b63e0f3a 100644
--- a/hed/validator/util/group_util.py
+++ b/hed/validator/util/group_util.py
@@ -1,11 +1,12 @@
 """ Validation of the HED tags as strings. """
-
+from collections import deque
 from hed.errors.error_reporter import ErrorHandler
 from hed.models.model_constants import DefTagNames
 from hed.schema.hed_schema_constants import HedKey
 from hed.models.hed_tag import HedTag
 from hed.errors.error_types import ValidationErrors, TemporalErrors
 from hed.validator.reserved_checker import ReservedChecker
+from hed.validator.util.dup_util import DuplicateChecker
 
 
 class GroupValidator:
@@ -23,6 +24,7 @@ def __init__(self, hed_schema):
             raise ValueError("HedSchema required for validation")
         self._hed_schema = hed_schema
         self._reserved_checker = ReservedChecker.get_instance()
+        self._duplicate_checker = DuplicateChecker(hed_schema)
 
     def run_tag_level_validators(self, hed_string_obj):
         """ Report invalid groups at each level.
@@ -39,7 +41,7 @@ def run_tag_level_validators(self, hed_string_obj):
 
         checks = [
             self._check_group_relationships,
-            self._check_for_duplicate_groups,
+            self._duplicate_checker.check_for_duplicates,
             # self.validate_duration_tags,
         ]
 
@@ -283,30 +285,3 @@ def _validate_tags_in_hed_string(self, tags):
         validation_issues += self.check_for_required_tags(tags)
         validation_issues += self.check_multiple_unique_tags_exist(tags)
         return validation_issues
-
-    def _check_for_duplicate_groups_recursive(self, sorted_group, validation_issues):
-        prev_child = None
-        for child in sorted_group:
-            if child == prev_child:
-                if isinstance(child, HedTag):
-                    error_code = ValidationErrors.HED_TAG_REPEATED
-                    validation_issues += ErrorHandler.format_error(error_code, child)
-                else:
-                    error_code = ValidationErrors.HED_TAG_REPEATED_GROUP
-                    found_group = child
-                    base_steps_up = 0
-                    while isinstance(found_group, list):
-                        found_group = found_group[0]
-                        base_steps_up += 1
-                    for _ in range(base_steps_up):
-                        found_group = found_group._parent
-                    validation_issues += ErrorHandler.format_error(error_code, found_group)
-            if not isinstance(child, HedTag):
-                self._check_for_duplicate_groups_recursive(child, validation_issues)
-            prev_child = child
-
-    def _check_for_duplicate_groups(self, original_group):
-        sorted_group = original_group._sorted()
-        validation_issues = []
-        self._check_for_duplicate_groups_recursive(sorted_group, validation_issues)
-        return validation_issues
diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py
index 7e2e310f..496ced84 100644
--- a/spec_tests/test_errors.py
+++ b/spec_tests/test_errors.py
@@ -217,11 +217,11 @@ def test_errors(self):
         print("\n".join(self.fail_count))
         self.assertEqual(len(self.fail_count), 0)
 
-    # def test_debug(self):
-    #     test_file = os.path.realpath('./temp5.json')
-    #     test_name = None
-    #     test_type = None
-    #     self.run_single_test(test_file, test_name, test_type)
+    def test_debug(self):
+        test_file = os.path.realpath('./temp6.json')
+        test_name = None
+        test_type = None
+        self.run_single_test(test_file, test_name, test_type)
 
 
 if __name__ == '__main__':
diff --git a/tests/validator/test_onset_validator.py b/tests/validator/test_onset_validator.py
index 2d22776b..d817dd70 100644
--- a/tests/validator/test_onset_validator.py
+++ b/tests/validator/test_onset_validator.py
@@ -308,7 +308,7 @@ def test_onset_two_in_one_line(self):
         self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False)
 
     def test_check_for_banned_tags(self):
-        hed_string = HedString("Event, (Duration/Short, Label/Example)", self.hed_schema)
+        hed_string = HedString("Event, (Delay/5, (Label/Example))", self.hed_schema)
         issues = OnsetValidator.check_for_banned_tags(hed_string)
         self.assertEqual(len(issues), 1)
 
@@ -316,9 +316,9 @@ def test_check_for_banned_tags(self):
         issues = OnsetValidator.check_for_banned_tags(hed_string)
         self.assertEqual(len(issues), 2)
 
-        hed_string = HedString("(Onset, Duration/Long), Label/Example", self.hed_schema)
+        hed_string = HedString("(Onset, Duration/5.0), Label/Example", self.hed_schema)
         issues = OnsetValidator.check_for_banned_tags(hed_string)
-        self.assertEqual(len(issues), 2)
+        self.assertEqual(len(issues), 1)
 
 
 if __name__ == '__main__':
diff --git a/tests/validator/test_sidecar_validator.py b/tests/validator/test_sidecar_validator.py
index 5a76cdef..db06f1e2 100644
--- a/tests/validator/test_sidecar_validator.py
+++ b/tests/validator/test_sidecar_validator.py
@@ -32,7 +32,7 @@ def test_multicategory_refs(self):
         issues = sidecar.validate(self.hed_schema)
 
         # 3 issues are expected for repeated tags from stacking lines
-        self.assertEqual(len(issues), 3)
+        self.assertEqual(len(issues), 2)
         refs = sidecar.get_column_refs()
         self.assertEqual(len(refs), 2)
 
diff --git a/tests/validator/test_tag_validator.py b/tests/validator/test_tag_validator.py
index 62203cff..40fea653 100644
--- a/tests/validator/test_tag_validator.py
+++ b/tests/validator/test_tag_validator.py
@@ -436,21 +436,31 @@ def test_no_duplicates(self):
         }
         from hed import HedString
         expected_issues = {
-            'topLevelDuplicate': self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=1),
-            'groupDuplicate': self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=3),
+            'topLevelDuplicate': [
+                {'code': 'TAG_EXPRESSION_REPEATED', 'message': 'Repeated tag - "Event/Sensory-event"', 'severity': 1}
+            ],
+            'groupDuplicate': [
+                {'code': 'TAG_EXPRESSION_REPEATED', 'message': 'Repeated tag - "Event/Sensory-event"', 'severity': 1}
+            ],
             'legalDuplicate': [],
             'noDuplicate': [],
-            'duplicateGroup': self.format_error(ValidationErrors.HED_TAG_REPEATED_GROUP,
-                                                group=HedString("(Sensory-event, Man-made-object/VehicleTrain)",
-                                                                self.hed_schema)),
-            'duplicateSubGroup': self.format_error(
-                ValidationErrors.HED_TAG_REPEATED_GROUP,
-                group=HedString("(Event,(Sensory-event,Man-made-object/VehicleTrain))", self.hed_schema)),
-            'duplicateSubGroupF': self.format_error(
-                ValidationErrors.HED_TAG_REPEATED_GROUP,
-                group=HedString("((Sensory-event,Man-made-object/VehicleTrain),Event)", self.hed_schema)),
+            'duplicateGroup': [
+                {'code': 'TAG_EXPRESSION_REPEATED',
+                 'message': 'Repeated group - "(Man-made-object/VehicleTrain,Sensory-event)"',
+                 'severity': 1}
+            ],
+            'duplicateSubGroup': [
+                {'code': 'TAG_EXPRESSION_REPEATED',
+                 'message': 'Repeated group - "(Event,(Man-made-object/VehicleTrain,Sensory-event))"',
+                 'severity': 1}
+            ],
+            'duplicateSubGroupF': [
+                {'code': 'TAG_EXPRESSION_REPEATED',
+                 'message': 'Repeated group - "((Man-made-object/VehicleTrain,Sensory-event),Event)"',
+                 'severity': 1}
+            ],
         }
-        self.validator_semantic(test_strings, expected_results, expected_issues, False)
+        self.validator_semantic_new(test_strings, expected_results, expected_issues, False)
 
     def test_no_duplicates_semantic(self):
         test_strings = {
diff --git a/tests/validator/test_tag_validator_base.py b/tests/validator/test_tag_validator_base.py
index 11c6e102..a3b99018 100644
--- a/tests/validator/test_tag_validator_base.py
+++ b/tests/validator/test_tag_validator_base.py
@@ -99,13 +99,13 @@ def validator_base_new(self, test_strings, expected_results, expected_issues, te
                            hed_schema, check_for_warnings=False):
         # This does direct comparison of the issue before formatting or context.
         for test_key in test_strings:
+            # print(f"\n{test_key}: {test_strings[test_key]}")
             hed_string_obj = HedString(test_strings[test_key], self.hed_schema)
             test_issues = []
             if self.compute_forms:
                 test_issues += hed_string_obj._calculate_to_canonical_forms(hed_schema)
             if not test_issues:
                 test_issues += test_function(hed_string_obj)
-            # print(f"result: {str(test_issues)}")
             filtered_issues = self.filter_issues(test_issues)
             # print(f"filtered: {str(filtered_issues)}")
             these_issues = expected_issues[test_key]