From 113fb428f4934456138053bbed55d2527e432c0b Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Thu, 16 Jan 2025 11:23:53 -0600 Subject: [PATCH] Fixed the duplicate tag tests by adding a DuplicateChecker class with hashes --- hed/models/hed_tag.py | 10 ++-- hed/models/model_constants.py | 41 +++++++-------- hed/validator/onset_validator.py | 6 +-- hed/validator/spreadsheet_validator.py | 56 ++++++++++++++++++++- hed/validator/util/dup_util.py | 58 ++++++++++++++++++++++ hed/validator/util/group_util.py | 33 ++---------- spec_tests/test_errors.py | 10 ++-- tests/validator/test_onset_validator.py | 6 +-- tests/validator/test_sidecar_validator.py | 2 +- tests/validator/test_tag_validator.py | 34 ++++++++----- tests/validator/test_tag_validator_base.py | 2 +- 11 files changed, 178 insertions(+), 80 deletions(-) create mode 100644 hed/validator/util/dup_util.py diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py index 69ae5e58..405288c0 100644 --- a/hed/models/hed_tag.py +++ b/hed/models/hed_tag.py @@ -618,12 +618,14 @@ def replace_placeholder(self, placeholder_value): else: self._tag = self.tag.replace("#", placeholder_value) - def __hash__(self): + def get_normalized_str(self): if self._schema_entry: - return hash( - self._namespace + self._schema_entry.short_tag_name.casefold() + self._extension_value.casefold()) + return self._namespace + self._schema_entry.short_tag_name.casefold() + self._extension_value.casefold() else: - return hash(self.casefold()) + return self.casefold() + + def __hash__(self): + return hash(self.get_normalized_str()) def __eq__(self, other): if self is other: diff --git a/hed/models/model_constants.py b/hed/models/model_constants.py index bbc249c0..a1407b6f 100644 --- a/hed/models/model_constants.py +++ b/hed/models/model_constants.py @@ -1,20 +1,21 @@ -""" Defined constants for definitions, def labels, and expanded labels. """ - - -class DefTagNames: - """ Source names for definitions, def labels, and expanded labels. """ - - DEF_KEY = 'Def' - DEF_EXPAND_KEY = 'Def-expand' - DEFINITION_KEY = "Definition" - - ONSET_KEY = "Onset" - OFFSET_KEY = "Offset" - INSET_KEY = "Inset" - DURATION_KEY = "Duration" - DELAY_KEY = "Delay" - - TEMPORAL_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY} - DURATION_KEYS = {DURATION_KEY, DELAY_KEY} - - ALL_TIME_KEYS = TEMPORAL_KEYS.union(DURATION_KEYS) +""" Defined constants for definitions, def labels, and expanded labels. """ + + +class DefTagNames: + """ Source names for definitions, def labels, and expanded labels. """ + + DEF_KEY = 'Def' + DEF_EXPAND_KEY = 'Def-expand' + DEFINITION_KEY = "Definition" + + ONSET_KEY = "Onset" + OFFSET_KEY = "Offset" + INSET_KEY = "Inset" + DURATION_KEY = "Duration" + DELAY_KEY = "Delay" + + TEMPORAL_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY} + DURATION_KEYS = {DURATION_KEY, DELAY_KEY} + + ALL_TIME_KEYS = TEMPORAL_KEYS.union(DURATION_KEYS) + TIMELINE_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY, DELAY_KEY} diff --git a/hed/validator/onset_validator.py b/hed/validator/onset_validator.py index 1d7a04dd..33eaf2a5 100644 --- a/hed/validator/onset_validator.py +++ b/hed/validator/onset_validator.py @@ -64,15 +64,15 @@ def _handle_onset_or_offset(self, def_tag, onset_offset_tag): @staticmethod def check_for_banned_tags(hed_string): - """ Returns an issue for every tag found from the banned list + """ Returns an issue for every tag found from the banned list (for files without onset column). Parameters: - hed_string(HedString): the string to check + hed_string(HedString): The string to check. Returns: list: The validation issues associated with the characters. Each issue is dictionary. """ - banned_tag_list = DefTagNames.ALL_TIME_KEYS + banned_tag_list = DefTagNames.TIMELINE_KEYS issues = [] for tag in hed_string.get_all_tags(): if tag.short_base_tag in banned_tag_list: diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index 9fd47443..0c75cd67 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -1,6 +1,7 @@ """ Validates spreadsheet tabular data. """ import copy import pandas as pd +import math from hed.models.base_input import BaseInput from hed.errors.error_types import ColumnErrors, ErrorContext, ValidationErrors from hed.errors.error_reporter import ErrorHandler @@ -16,6 +17,8 @@ class SpreadsheetValidator: + ONSET_TOLERANCE = 10-7 + def __init__(self, hed_schema): """ Constructor for the SpreadsheetValidator class. @@ -79,6 +82,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): issues += self._run_checks(df, error_handler=error_handler, row_adj=row_adj, onset_mask=onset_mask) if self._onset_validator: issues += self._run_onset_checks(onsets, error_handler=error_handler, row_adj=row_adj) + issues += self._recheck_duplicates(onsets, error_handler=error_handler, row_adj=row_adj) error_handler.pop_error_context() issues = sort_issues(issues) @@ -118,6 +122,7 @@ def _run_checks(self, hed_df, error_handler, row_adj, onset_mask=None): error_handler.pop_error_context() # Row continue + # Continue on if not a timeline file row_string = HedString.from_hed_strings(row_strings) if row_string: @@ -149,8 +154,55 @@ def _run_onset_checks(self, onset_filtered, error_handler, row_adj): error_handler.pop_error_context() # Row return issues - def _run_onset_nan_checks(self, onsets, error_handler, row_adj): - return + def _recheck_duplicates(self, onset_filtered, error_handler, row_adj): + issues = [] + for i in range(len(onset_filtered) - 1): + current_row = onset_filtered.iloc[i] + next_row = onset_filtered.iloc[i + 1] + + # Skip if the HED column is empty or there was already an error + if not current_row["HED"] or \ + (current_row["original_index"] in self.invalid_original_rows) or \ + (not self._is_within_tolerance(next_row["onset"], current_row["onset"])): + continue + + # At least two rows have been merged with their onsets recognized as the same. + error_handler.push_error_context(ErrorContext.ROW, current_row.original_index + row_adj) + row_string = HedString(current_row.HED, self._schema, self._hed_validator._def_validator) + error_handler.push_error_context(ErrorContext.HED_STRING, row_string) + new_column_issues = self._hed_validator.run_full_string_checks(row_string) + error_handler.add_context_and_filter(new_column_issues) + error_handler.pop_error_context() # HedString + issues += new_column_issues + error_handler.pop_error_context() # Row + + return issues + + def _is_within_tolerance(self, onset1, onset2): + """ + Checks if two onset strings are within the specified tolerance. + + Parameters: + onset1 (str): The first onset value as a string. + onset2 (str): The second onset value as a string. + + Returns: + bool: True if the values are within tolerance and valid, False otherwise. + """ + try: + # Convert to floats + onset1 = float(onset1) + onset2 = float(onset2) + + # Check if both values are finite + if not (math.isfinite(onset1) and math.isfinite(onset2)): + return False + + # Check if the difference is within tolerance + return abs(onset1 - onset2) <= self.ONSET_TOLERANCE + except ValueError: + # Return False if either value is not convertible to a float + return False def _validate_column_structure(self, base_input, error_handler, row_adj): """ diff --git a/hed/validator/util/dup_util.py b/hed/validator/util/dup_util.py new file mode 100644 index 00000000..fd0209b4 --- /dev/null +++ b/hed/validator/util/dup_util.py @@ -0,0 +1,58 @@ +from hed.errors.error_reporter import ErrorHandler +from hed.models.hed_tag import HedTag +from hed.errors.error_types import ValidationErrors + + +class DuplicateChecker: + + def __init__(self, hed_schema): + """ Constructor for GroupValidator + + Parameters: + hed_schema (HedSchema): A HedSchema object. + """ + if hed_schema is None: + raise ValueError("HedSchema required for validation") + self._hed_schema = hed_schema + self.issues = [] + + def check_for_duplicates(self, original_group): + self.issues = [] + self._get_recursive_hash(original_group) + return self.issues + + def get_hash(self, original_group): + self.issues = [] + duplication_hash = self._get_recursive_hash(original_group) + return duplication_hash + + def _get_recursive_hash(self, group): + if len(self.issues) > 0: + return None + group_hashes = set() + for child in group.children: + if isinstance(child, HedTag): + this_hash = hash(child) + else: + this_hash = self._get_recursive_hash(child) + if len(self.issues) > 0 or this_hash is None: + return None + if this_hash in group_hashes: + self.issues += self._get_duplication_error(child) + return None + group_hashes.add(this_hash) + return hash(frozenset(group_hashes)) + + @staticmethod + def _get_duplication_error(child): + if isinstance(child, HedTag): + return ErrorHandler.format_error(ValidationErrors.HED_TAG_REPEATED, child) + else: + found_group = child + base_steps_up = 0 + while isinstance(found_group, list): + found_group = found_group[0] + base_steps_up += 1 + for _ in range(base_steps_up): + found_group = found_group._parent + return ErrorHandler.format_error(ValidationErrors.HED_TAG_REPEATED_GROUP, found_group) diff --git a/hed/validator/util/group_util.py b/hed/validator/util/group_util.py index d22f65da..b63e0f3a 100644 --- a/hed/validator/util/group_util.py +++ b/hed/validator/util/group_util.py @@ -1,11 +1,12 @@ """ Validation of the HED tags as strings. """ - +from collections import deque from hed.errors.error_reporter import ErrorHandler from hed.models.model_constants import DefTagNames from hed.schema.hed_schema_constants import HedKey from hed.models.hed_tag import HedTag from hed.errors.error_types import ValidationErrors, TemporalErrors from hed.validator.reserved_checker import ReservedChecker +from hed.validator.util.dup_util import DuplicateChecker class GroupValidator: @@ -23,6 +24,7 @@ def __init__(self, hed_schema): raise ValueError("HedSchema required for validation") self._hed_schema = hed_schema self._reserved_checker = ReservedChecker.get_instance() + self._duplicate_checker = DuplicateChecker(hed_schema) def run_tag_level_validators(self, hed_string_obj): """ Report invalid groups at each level. @@ -39,7 +41,7 @@ def run_tag_level_validators(self, hed_string_obj): checks = [ self._check_group_relationships, - self._check_for_duplicate_groups, + self._duplicate_checker.check_for_duplicates, # self.validate_duration_tags, ] @@ -283,30 +285,3 @@ def _validate_tags_in_hed_string(self, tags): validation_issues += self.check_for_required_tags(tags) validation_issues += self.check_multiple_unique_tags_exist(tags) return validation_issues - - def _check_for_duplicate_groups_recursive(self, sorted_group, validation_issues): - prev_child = None - for child in sorted_group: - if child == prev_child: - if isinstance(child, HedTag): - error_code = ValidationErrors.HED_TAG_REPEATED - validation_issues += ErrorHandler.format_error(error_code, child) - else: - error_code = ValidationErrors.HED_TAG_REPEATED_GROUP - found_group = child - base_steps_up = 0 - while isinstance(found_group, list): - found_group = found_group[0] - base_steps_up += 1 - for _ in range(base_steps_up): - found_group = found_group._parent - validation_issues += ErrorHandler.format_error(error_code, found_group) - if not isinstance(child, HedTag): - self._check_for_duplicate_groups_recursive(child, validation_issues) - prev_child = child - - def _check_for_duplicate_groups(self, original_group): - sorted_group = original_group._sorted() - validation_issues = [] - self._check_for_duplicate_groups_recursive(sorted_group, validation_issues) - return validation_issues diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index 7e2e310f..496ced84 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -217,11 +217,11 @@ def test_errors(self): print("\n".join(self.fail_count)) self.assertEqual(len(self.fail_count), 0) - # def test_debug(self): - # test_file = os.path.realpath('./temp5.json') - # test_name = None - # test_type = None - # self.run_single_test(test_file, test_name, test_type) + def test_debug(self): + test_file = os.path.realpath('./temp6.json') + test_name = None + test_type = None + self.run_single_test(test_file, test_name, test_type) if __name__ == '__main__': diff --git a/tests/validator/test_onset_validator.py b/tests/validator/test_onset_validator.py index 2d22776b..d817dd70 100644 --- a/tests/validator/test_onset_validator.py +++ b/tests/validator/test_onset_validator.py @@ -308,7 +308,7 @@ def test_onset_two_in_one_line(self): self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False) def test_check_for_banned_tags(self): - hed_string = HedString("Event, (Duration/Short, Label/Example)", self.hed_schema) + hed_string = HedString("Event, (Delay/5, (Label/Example))", self.hed_schema) issues = OnsetValidator.check_for_banned_tags(hed_string) self.assertEqual(len(issues), 1) @@ -316,9 +316,9 @@ def test_check_for_banned_tags(self): issues = OnsetValidator.check_for_banned_tags(hed_string) self.assertEqual(len(issues), 2) - hed_string = HedString("(Onset, Duration/Long), Label/Example", self.hed_schema) + hed_string = HedString("(Onset, Duration/5.0), Label/Example", self.hed_schema) issues = OnsetValidator.check_for_banned_tags(hed_string) - self.assertEqual(len(issues), 2) + self.assertEqual(len(issues), 1) if __name__ == '__main__': diff --git a/tests/validator/test_sidecar_validator.py b/tests/validator/test_sidecar_validator.py index 5a76cdef..db06f1e2 100644 --- a/tests/validator/test_sidecar_validator.py +++ b/tests/validator/test_sidecar_validator.py @@ -32,7 +32,7 @@ def test_multicategory_refs(self): issues = sidecar.validate(self.hed_schema) # 3 issues are expected for repeated tags from stacking lines - self.assertEqual(len(issues), 3) + self.assertEqual(len(issues), 2) refs = sidecar.get_column_refs() self.assertEqual(len(refs), 2) diff --git a/tests/validator/test_tag_validator.py b/tests/validator/test_tag_validator.py index 62203cff..40fea653 100644 --- a/tests/validator/test_tag_validator.py +++ b/tests/validator/test_tag_validator.py @@ -436,21 +436,31 @@ def test_no_duplicates(self): } from hed import HedString expected_issues = { - 'topLevelDuplicate': self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=1), - 'groupDuplicate': self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=3), + 'topLevelDuplicate': [ + {'code': 'TAG_EXPRESSION_REPEATED', 'message': 'Repeated tag - "Event/Sensory-event"', 'severity': 1} + ], + 'groupDuplicate': [ + {'code': 'TAG_EXPRESSION_REPEATED', 'message': 'Repeated tag - "Event/Sensory-event"', 'severity': 1} + ], 'legalDuplicate': [], 'noDuplicate': [], - 'duplicateGroup': self.format_error(ValidationErrors.HED_TAG_REPEATED_GROUP, - group=HedString("(Sensory-event, Man-made-object/VehicleTrain)", - self.hed_schema)), - 'duplicateSubGroup': self.format_error( - ValidationErrors.HED_TAG_REPEATED_GROUP, - group=HedString("(Event,(Sensory-event,Man-made-object/VehicleTrain))", self.hed_schema)), - 'duplicateSubGroupF': self.format_error( - ValidationErrors.HED_TAG_REPEATED_GROUP, - group=HedString("((Sensory-event,Man-made-object/VehicleTrain),Event)", self.hed_schema)), + 'duplicateGroup': [ + {'code': 'TAG_EXPRESSION_REPEATED', + 'message': 'Repeated group - "(Man-made-object/VehicleTrain,Sensory-event)"', + 'severity': 1} + ], + 'duplicateSubGroup': [ + {'code': 'TAG_EXPRESSION_REPEATED', + 'message': 'Repeated group - "(Event,(Man-made-object/VehicleTrain,Sensory-event))"', + 'severity': 1} + ], + 'duplicateSubGroupF': [ + {'code': 'TAG_EXPRESSION_REPEATED', + 'message': 'Repeated group - "((Man-made-object/VehicleTrain,Sensory-event),Event)"', + 'severity': 1} + ], } - self.validator_semantic(test_strings, expected_results, expected_issues, False) + self.validator_semantic_new(test_strings, expected_results, expected_issues, False) def test_no_duplicates_semantic(self): test_strings = { diff --git a/tests/validator/test_tag_validator_base.py b/tests/validator/test_tag_validator_base.py index 11c6e102..a3b99018 100644 --- a/tests/validator/test_tag_validator_base.py +++ b/tests/validator/test_tag_validator_base.py @@ -99,13 +99,13 @@ def validator_base_new(self, test_strings, expected_results, expected_issues, te hed_schema, check_for_warnings=False): # This does direct comparison of the issue before formatting or context. for test_key in test_strings: + # print(f"\n{test_key}: {test_strings[test_key]}") hed_string_obj = HedString(test_strings[test_key], self.hed_schema) test_issues = [] if self.compute_forms: test_issues += hed_string_obj._calculate_to_canonical_forms(hed_schema) if not test_issues: test_issues += test_function(hed_string_obj) - # print(f"result: {str(test_issues)}") filtered_issues = self.filter_issues(test_issues) # print(f"filtered: {str(filtered_issues)}") these_issues = expected_issues[test_key]