Skip to content

Commit

Permalink
Fixed the duplicate tag tests by adding a DuplicateChecker class with…
Browse files Browse the repository at this point in the history
… hashes
  • Loading branch information
VisLab committed Jan 16, 2025
1 parent eac790b commit 113fb42
Show file tree
Hide file tree
Showing 11 changed files with 178 additions and 80 deletions.
10 changes: 6 additions & 4 deletions hed/models/hed_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,12 +618,14 @@ def replace_placeholder(self, placeholder_value):
else:
self._tag = self.tag.replace("#", placeholder_value)

def get_normalized_str(self):
    """ Return the canonical, casefolded form of this tag used for hashing and equality.

    Returns:
        str: namespace + casefolded short tag name + casefolded extension when the
            tag is linked to a schema entry; otherwise the casefolded raw tag text.
    """
    if self._schema_entry:
        # Schema-linked tag: build the normalized form from its resolved parts.
        return self._namespace + self._schema_entry.short_tag_name.casefold() + self._extension_value.casefold()
    else:
        # No schema entry: fall back to casefolding the tag itself.
        # NOTE(review): presumably HedTag provides casefold() over its text -- confirm.
        return self.casefold()

def __hash__(self):
    # Hash the normalized string so tags that differ only in case hash identically,
    # matching the case-insensitive __eq__ below.
    return hash(self.get_normalized_str())

def __eq__(self, other):
if self is other:
Expand Down
41 changes: 21 additions & 20 deletions hed/models/model_constants.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
""" Defined constants for definitions, def labels, and expanded labels. """


class DefTagNames:
""" Source names for definitions, def labels, and expanded labels. """

DEF_KEY = 'Def'
DEF_EXPAND_KEY = 'Def-expand'
DEFINITION_KEY = "Definition"

ONSET_KEY = "Onset"
OFFSET_KEY = "Offset"
INSET_KEY = "Inset"
DURATION_KEY = "Duration"
DELAY_KEY = "Delay"

TEMPORAL_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY}
DURATION_KEYS = {DURATION_KEY, DELAY_KEY}

ALL_TIME_KEYS = TEMPORAL_KEYS.union(DURATION_KEYS)
""" Defined constants for definitions, def labels, and expanded labels. """


class DefTagNames:
    """ Source names for definitions, def labels, and expanded labels. """

    DEF_KEY = 'Def'
    DEF_EXPAND_KEY = 'Def-expand'
    DEFINITION_KEY = "Definition"

    ONSET_KEY = "Onset"
    OFFSET_KEY = "Offset"
    INSET_KEY = "Inset"
    DURATION_KEY = "Duration"
    DELAY_KEY = "Delay"

    # Tags that anchor events to points on the timeline.
    TEMPORAL_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY}
    # Tags that describe extent or postponement of an event.
    DURATION_KEYS = {DURATION_KEY, DELAY_KEY}

    # Every time-related tag.
    ALL_TIME_KEYS = TEMPORAL_KEYS | DURATION_KEYS
    # Tags that require/relate to an onset column on the timeline.
    TIMELINE_KEYS = TEMPORAL_KEYS | {DELAY_KEY}
6 changes: 3 additions & 3 deletions hed/validator/onset_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,15 @@ def _handle_onset_or_offset(self, def_tag, onset_offset_tag):

@staticmethod
def check_for_banned_tags(hed_string):
""" Returns an issue for every tag found from the banned list
""" Returns an issue for every tag found from the banned list (for files without onset column).
Parameters:
hed_string(HedString): the string to check
hed_string(HedString): The string to check.
Returns:
list: The validation issues associated with the characters. Each issue is dictionary.
"""
banned_tag_list = DefTagNames.ALL_TIME_KEYS
banned_tag_list = DefTagNames.TIMELINE_KEYS
issues = []
for tag in hed_string.get_all_tags():
if tag.short_base_tag in banned_tag_list:
Expand Down
56 changes: 54 additions & 2 deletions hed/validator/spreadsheet_validator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" Validates spreadsheet tabular data. """
import copy
import pandas as pd
import math
from hed.models.base_input import BaseInput
from hed.errors.error_types import ColumnErrors, ErrorContext, ValidationErrors
from hed.errors.error_reporter import ErrorHandler
Expand All @@ -16,6 +17,8 @@


class SpreadsheetValidator:
    # Maximum difference (in seconds) for two onset values to be treated as
    # the same event time by _is_within_tolerance.
    # BUG FIX: the original `10-7` is integer subtraction and evaluates to 3,
    # making onsets up to 3 seconds apart count as identical; the intended
    # tolerance is 10**-7 (i.e. 1e-7).
    ONSET_TOLERANCE = 10 ** -7

def __init__(self, hed_schema):
"""
Constructor for the SpreadsheetValidator class.
Expand Down Expand Up @@ -79,6 +82,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None):
issues += self._run_checks(df, error_handler=error_handler, row_adj=row_adj, onset_mask=onset_mask)
if self._onset_validator:
issues += self._run_onset_checks(onsets, error_handler=error_handler, row_adj=row_adj)
issues += self._recheck_duplicates(onsets, error_handler=error_handler, row_adj=row_adj)
error_handler.pop_error_context()

issues = sort_issues(issues)
Expand Down Expand Up @@ -118,6 +122,7 @@ def _run_checks(self, hed_df, error_handler, row_adj, onset_mask=None):
error_handler.pop_error_context() # Row
continue

# Continue on if not a timeline file
row_string = HedString.from_hed_strings(row_strings)

if row_string:
Expand Down Expand Up @@ -149,8 +154,55 @@ def _run_onset_checks(self, onset_filtered, error_handler, row_adj):
error_handler.pop_error_context() # Row
return issues

def _run_onset_nan_checks(self, onsets, error_handler, row_adj):
return
def _recheck_duplicates(self, onset_filtered, error_handler, row_adj):
    """ Re-run full-string validation on rows whose onsets are effectively identical.

    Rows with (near-)equal onsets are merged for assembly, which can create
    duplicates that per-row checks missed; this rechecks the earlier row of
    each such pair.

    Parameters:
        onset_filtered (DataFrame): Rows with at least "HED", "onset", and
            "original_index" columns.  # assumes onset-sorted order so only adjacent rows need comparing -- confirm with caller
        error_handler (ErrorHandler): Accumulates error context for reporting.
        row_adj (int): Adjustment added to original_index to produce the reported row number.

    Returns:
        list: Validation issues found on the merged rows. Each issue is a dictionary.
    """
    issues = []
    for i in range(len(onset_filtered) - 1):
        current_row = onset_filtered.iloc[i]
        next_row = onset_filtered.iloc[i + 1]

        # Skip if the HED column is empty, the row already failed earlier checks,
        # or the next row's onset is not close enough to count as the same time.
        if not current_row["HED"] or \
           (current_row["original_index"] in self.invalid_original_rows) or \
           (not self._is_within_tolerance(next_row["onset"], current_row["onset"])):
            continue

        # At least two rows have been merged with their onsets recognized as the same.
        error_handler.push_error_context(ErrorContext.ROW, current_row.original_index + row_adj)
        row_string = HedString(current_row.HED, self._schema, self._hed_validator._def_validator)
        error_handler.push_error_context(ErrorContext.HED_STRING, row_string)
        new_column_issues = self._hed_validator.run_full_string_checks(row_string)
        error_handler.add_context_and_filter(new_column_issues)
        error_handler.pop_error_context()  # HedString
        issues += new_column_issues
        error_handler.pop_error_context()  # Row

    return issues

def _is_within_tolerance(self, onset1, onset2):
"""
Checks if two onset strings are within the specified tolerance.
Parameters:
onset1 (str): The first onset value as a string.
onset2 (str): The second onset value as a string.
Returns:
bool: True if the values are within tolerance and valid, False otherwise.
"""
try:
# Convert to floats
onset1 = float(onset1)
onset2 = float(onset2)

# Check if both values are finite
if not (math.isfinite(onset1) and math.isfinite(onset2)):
return False

# Check if the difference is within tolerance
return abs(onset1 - onset2) <= self.ONSET_TOLERANCE
except ValueError:
# Return False if either value is not convertible to a float
return False

def _validate_column_structure(self, base_input, error_handler, row_adj):
"""
Expand Down
58 changes: 58 additions & 0 deletions hed/validator/util/dup_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from hed.errors.error_reporter import ErrorHandler
from hed.models.hed_tag import HedTag
from hed.errors.error_types import ValidationErrors


class DuplicateChecker:
    """ Detects duplicated tags or duplicated sub-groups within a HED group hierarchy.

    Each child is reduced to a hash: HedTag children use their own __hash__
    (normalized/casefolded form), and group children hash recursively as an
    order-insensitive frozenset of child hashes, so groups that differ only in
    child order still collide as duplicates.
    """

    def __init__(self, hed_schema):
        """ Constructor for DuplicateChecker.

        Parameters:
            hed_schema (HedSchema): A HedSchema object.

        Raises:
            ValueError: If hed_schema is None.
        """
        if hed_schema is None:
            raise ValueError("HedSchema required for validation")
        self._hed_schema = hed_schema
        # Issues from the most recent check; reset on each public call, so a
        # single instance is not safe for concurrent use.
        self.issues = []

    def check_for_duplicates(self, original_group):
        """ Return the list of duplication issues found in original_group (empty if none).

        Stops at the first duplicate found.
        """
        self.issues = []
        self._get_recursive_hash(original_group)
        return self.issues

    def get_hash(self, original_group):
        """ Return an order-insensitive hash of original_group, or None if it contains duplicates. """
        self.issues = []
        duplication_hash = self._get_recursive_hash(original_group)
        return duplication_hash

    def _get_recursive_hash(self, group):
        """ Hash group's children recursively, recording an issue and returning None on the first duplicate. """
        if len(self.issues) > 0:
            return None
        group_hashes = set()
        for child in group.children:
            if isinstance(child, HedTag):
                this_hash = hash(child)
            else:
                this_hash = self._get_recursive_hash(child)
            # A None hash means a duplicate was found deeper in the tree.
            if len(self.issues) > 0 or this_hash is None:
                return None
            if this_hash in group_hashes:
                self.issues += self._get_duplication_error(child)
                return None
            group_hashes.add(this_hash)
        # frozenset makes the parent hash independent of child order.
        return hash(frozenset(group_hashes))

    @staticmethod
    def _get_duplication_error(child):
        """ Format the repeated-tag or repeated-group error for the duplicated child. """
        if isinstance(child, HedTag):
            return ErrorHandler.format_error(ValidationErrors.HED_TAG_REPEATED, child)
        # child comes straight from group.children, so it is a group object,
        # never a list -- the list-unwrapping/_parent walk carried over from the
        # old sorted-group implementation was dead code and has been removed.
        return ErrorHandler.format_error(ValidationErrors.HED_TAG_REPEATED_GROUP, child)
33 changes: 4 additions & 29 deletions hed/validator/util/group_util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
""" Validation of the HED tags as strings. """

from collections import deque
from hed.errors.error_reporter import ErrorHandler
from hed.models.model_constants import DefTagNames
from hed.schema.hed_schema_constants import HedKey
from hed.models.hed_tag import HedTag
from hed.errors.error_types import ValidationErrors, TemporalErrors
from hed.validator.reserved_checker import ReservedChecker
from hed.validator.util.dup_util import DuplicateChecker


class GroupValidator:
Expand All @@ -23,6 +24,7 @@ def __init__(self, hed_schema):
raise ValueError("HedSchema required for validation")
self._hed_schema = hed_schema
self._reserved_checker = ReservedChecker.get_instance()
self._duplicate_checker = DuplicateChecker(hed_schema)

def run_tag_level_validators(self, hed_string_obj):
""" Report invalid groups at each level.
Expand All @@ -39,7 +41,7 @@ def run_tag_level_validators(self, hed_string_obj):

checks = [
self._check_group_relationships,
self._check_for_duplicate_groups,
self._duplicate_checker.check_for_duplicates,
# self.validate_duration_tags,
]

Expand Down Expand Up @@ -283,30 +285,3 @@ def _validate_tags_in_hed_string(self, tags):
validation_issues += self.check_for_required_tags(tags)
validation_issues += self.check_multiple_unique_tags_exist(tags)
return validation_issues

def _check_for_duplicate_groups_recursive(self, sorted_group, validation_issues):
    """ Walk a sorted group, appending repeated-tag/repeated-group issues to validation_issues in place.

    Parameters:
        sorted_group: Output of HedGroup._sorted(); sorting places equal
            children adjacent, so only neighbor pairs need comparing.
        validation_issues (list): Mutated in place; issues are appended.
    """
    prev_child = None
    for child in sorted_group:
        if child == prev_child:
            if isinstance(child, HedTag):
                error_code = ValidationErrors.HED_TAG_REPEATED
                validation_issues += ErrorHandler.format_error(error_code, child)
            else:
                error_code = ValidationErrors.HED_TAG_REPEATED_GROUP
                # NOTE(review): presumably _sorted() can wrap nested groups in
                # lists -- unwrap to the first element, counting depth, then
                # climb the same number of _parent links to recover the actual
                # group object for the error report. Confirm against _sorted().
                found_group = child
                base_steps_up = 0
                while isinstance(found_group, list):
                    found_group = found_group[0]
                    base_steps_up += 1
                for _ in range(base_steps_up):
                    found_group = found_group._parent
                validation_issues += ErrorHandler.format_error(error_code, found_group)
        # Recurse into sub-groups regardless of whether this child was a duplicate.
        if not isinstance(child, HedTag):
            self._check_for_duplicate_groups_recursive(child, validation_issues)
        prev_child = child

def _check_for_duplicate_groups(self, original_group):
    """ Return a list of repeated-tag/repeated-group issues found in original_group. """
    found_issues = []
    # The recursive helper expects the sorted view and fills the list in place.
    self._check_for_duplicate_groups_recursive(original_group._sorted(), found_issues)
    return found_issues
10 changes: 5 additions & 5 deletions spec_tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,11 +217,11 @@ def test_errors(self):
print("\n".join(self.fail_count))
self.assertEqual(len(self.fail_count), 0)

# def test_debug(self):
# test_file = os.path.realpath('./temp5.json')
# test_name = None
# test_type = None
# self.run_single_test(test_file, test_name, test_type)
def test_debug(self):
    """ Debug harness for running a single spec test against a local JSON file.

    temp6.json is a developer-local scratch file; skip (rather than fail)
    when it is absent so this accidentally-enabled debug test does not
    break CI or other machines.
    """
    test_file = os.path.realpath('./temp6.json')
    if not os.path.exists(test_file):
        self.skipTest(f"Debug test file not found: {test_file}")
    test_name = None
    test_type = None
    self.run_single_test(test_file, test_name, test_type)


if __name__ == '__main__':
Expand Down
6 changes: 3 additions & 3 deletions tests/validator/test_onset_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,17 +308,17 @@ def test_onset_two_in_one_line(self):
self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False)

def test_check_for_banned_tags(self):
    """ TIMELINE_KEYS bans Onset/Offset/Inset/Delay (not Duration) in files without an onset column. """
    # Delay is banned even when nested inside a group.
    hed_string = HedString("Event, (Delay/5, (Label/Example))", self.hed_schema)
    issues = OnsetValidator.check_for_banned_tags(hed_string)
    self.assertEqual(len(issues), 1)

    # Both Onset and Offset are flagged, at top level and in groups.
    hed_string = HedString("Onset, (Offset, Event)", self.hed_schema)
    issues = OnsetValidator.check_for_banned_tags(hed_string)
    self.assertEqual(len(issues), 2)

    # Duration is no longer banned, so only Onset is flagged here.
    hed_string = HedString("(Onset, Duration/5.0), Label/Example", self.hed_schema)
    issues = OnsetValidator.check_for_banned_tags(hed_string)
    self.assertEqual(len(issues), 1)


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion tests/validator/test_sidecar_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_multicategory_refs(self):
issues = sidecar.validate(self.hed_schema)

# 3 issues are expected for repeated tags from stacking lines
self.assertEqual(len(issues), 3)
self.assertEqual(len(issues), 2)
refs = sidecar.get_column_refs()
self.assertEqual(len(refs), 2)

Expand Down
34 changes: 22 additions & 12 deletions tests/validator/test_tag_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,21 +436,31 @@ def test_no_duplicates(self):
}
from hed import HedString
expected_issues = {
'topLevelDuplicate': self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=1),
'groupDuplicate': self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=3),
'topLevelDuplicate': [
{'code': 'TAG_EXPRESSION_REPEATED', 'message': 'Repeated tag - "Event/Sensory-event"', 'severity': 1}
],
'groupDuplicate': [
{'code': 'TAG_EXPRESSION_REPEATED', 'message': 'Repeated tag - "Event/Sensory-event"', 'severity': 1}
],
'legalDuplicate': [],
'noDuplicate': [],
'duplicateGroup': self.format_error(ValidationErrors.HED_TAG_REPEATED_GROUP,
group=HedString("(Sensory-event, Man-made-object/VehicleTrain)",
self.hed_schema)),
'duplicateSubGroup': self.format_error(
ValidationErrors.HED_TAG_REPEATED_GROUP,
group=HedString("(Event,(Sensory-event,Man-made-object/VehicleTrain))", self.hed_schema)),
'duplicateSubGroupF': self.format_error(
ValidationErrors.HED_TAG_REPEATED_GROUP,
group=HedString("((Sensory-event,Man-made-object/VehicleTrain),Event)", self.hed_schema)),
'duplicateGroup': [
{'code': 'TAG_EXPRESSION_REPEATED',
'message': 'Repeated group - "(Man-made-object/VehicleTrain,Sensory-event)"',
'severity': 1}
],
'duplicateSubGroup': [
{'code': 'TAG_EXPRESSION_REPEATED',
'message': 'Repeated group - "(Event,(Man-made-object/VehicleTrain,Sensory-event))"',
'severity': 1}
],
'duplicateSubGroupF': [
{'code': 'TAG_EXPRESSION_REPEATED',
'message': 'Repeated group - "((Man-made-object/VehicleTrain,Sensory-event),Event)"',
'severity': 1}
],
}
self.validator_semantic(test_strings, expected_results, expected_issues, False)
self.validator_semantic_new(test_strings, expected_results, expected_issues, False)

def test_no_duplicates_semantic(self):
test_strings = {
Expand Down
2 changes: 1 addition & 1 deletion tests/validator/test_tag_validator_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@ def validator_base_new(self, test_strings, expected_results, expected_issues, te
hed_schema, check_for_warnings=False):
# This does direct comparison of the issue before formatting or context.
for test_key in test_strings:
# print(f"\n{test_key}: {test_strings[test_key]}")
hed_string_obj = HedString(test_strings[test_key], self.hed_schema)
test_issues = []
if self.compute_forms:
test_issues += hed_string_obj._calculate_to_canonical_forms(hed_schema)
if not test_issues:
test_issues += test_function(hed_string_obj)
# print(f"result: {str(test_issues)}")
filtered_issues = self.filter_issues(test_issues)
# print(f"filtered: {str(filtered_issues)}")
these_issues = expected_issues[test_key]
Expand Down

0 comments on commit 113fb42

Please sign in to comment.