diff --git a/geos-ats/src/geos/ats/helpers/restart_check.py b/geos-ats/src/geos/ats/helpers/restart_check.py index 7aa282e..f491107 100644 --- a/geos-ats/src/geos/ats/helpers/restart_check.py +++ b/geos-ats/src/geos/ats/helpers/restart_check.py @@ -7,6 +7,7 @@ import argparse import logging import time +import string from pathlib import Path try: from geos.ats.helpers.permute_array import permuteArray # type: ignore[import] @@ -375,35 +376,100 @@ def compareIntArrays( self, path, arr, base_arr ): ARR [in]: The hdf5 Dataset to compare. BASE_ARR [in]: The hdf5 Dataset to compare against. """ - # If the shapes are different they can't be compared. + message = "" if arr.shape != base_arr.shape: - msg = "Datasets have different shapes and therefore can't be compared: %s, %s.\n" % ( arr.shape, - base_arr.shape ) - self.errorMsg( path, msg, True ) - return + message = "Datasets have different shapes and therefore can't be compared statistically: %s, %s.\n" % ( + arr.shape, base_arr.shape ) + else: + # Calculate the absolute difference. + difference = np.subtract( arr, base_arr ) + np.abs( difference, out=difference ) - # Create a copy of the arrays. + offenders = difference != 0.0 + n_offenders = np.sum( offenders ) - # Calculate the absolute difference. - difference = np.subtract( arr, base_arr ) - np.abs( difference, out=difference ) + if n_offenders != 0: + max_index = np.unravel_index( np.argmax( difference ), difference.shape ) + max_difference = difference[ max_index ] + offenders_mean = np.mean( difference[ offenders ] ) + offenders_std = np.std( difference[ offenders ] ) - offenders = difference != 0.0 - n_offenders = np.sum( offenders ) + message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % ( + arr.dtype, base_arr.dtype, offenders.size, n_offenders ) + message += "Statistics of the differences greater than 0:\n" + message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference, + offenders_mean, offenders_std ) - if n_offenders != 0: - max_index = np.unravel_index( np.argmax( difference ), difference.shape ) - max_difference = difference[ max_index ] - offenders_mean = np.mean( difference[ offenders ] ) - offenders_std = np.std( difference[ offenders ] ) + # actually, int8 arrays are almost always char arrays, so we sould add a character comparison. + if arr.dtype == np.int8 and base_arr.dtype == np.int8: + message += self.compareCharArrays( arr, base_arr ) - message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % ( - arr.dtype, base_arr.dtype, offenders.size, n_offenders ) - message += "Statistics of the differences greater than 0:\n" - message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference, - offenders_mean, offenders_std ) + if message != "": self.errorMsg( path, message, True ) + def compareCharArrays( self, comp_arr, base_arr ): + """ + Compare the valid characters of two arrays and return a formatted string showing differences. + + COMP_ARR [in]: The hdf5 Dataset to compare. + BASE_ARR [in]: The hdf5 Dataset to compare against. + + Returns a formatted string highlighting the differing characters. + """ + comp_ndarr = np.array( comp_arr ).flatten() + base_ndarr = np.array( base_arr ).flatten() + + # Replace invalid characters by group-separator characters ('\x1D') + valid_chars = set( string.printable ) + invalid_char = '\x1D' + comp_str = "".join( + [ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in comp_ndarr ] ) + base_str = "".join( + [ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in base_ndarr ] ) + + # replace whitespaces sequences by only one space (preventing indentation / spacing changes detection) + whitespace_pattern = r"[ \t\n\r\v\f]+" + comp_str = re.sub( whitespace_pattern, " ", comp_str ) + base_str = re.sub( whitespace_pattern, " ", base_str ) + # replace invalid characters sequences by a double space (for clear display) + invalid_char_pattern = r"\x1D+" + comp_str_display = re.sub( invalid_char_pattern, " ", comp_str ) + base_str_display = re.sub( invalid_char_pattern, " ", base_str ) + + message = "" + + def limited_display( n, string ): + return string[ :n ] + f"... ({len(string)-n} omitted chars)" if len( string ) > n else string + + if len( comp_str ) != len( base_str ): + max_display = 250 + message = f"Character arrays have different sizes: {len( comp_str )}, {len( base_str )}.\n" + message += f" {limited_display( max_display, comp_str_display )}\n" + message += f" {limited_display( max_display, base_str_display )}\n" + else: + # We need to trim arrays to the length of the shortest one for the comparisons + min_length = min( len( comp_str_display ), len( base_str_display ) ) + comp_str_trim = comp_str_display[ :min_length ] + base_str_trim = base_str_display[ :min_length ] + + differing_indices = np.where( np.array( list( comp_str_trim ) ) != np.array( list( base_str_trim ) ) )[ 0 ] + if differing_indices.size != 0: + # check for reordering + arr_set = sorted( set( comp_str.split( invalid_char ) ) ) + base_arr_set = sorted( set( base_str.split( invalid_char ) ) ) + reordering_detected = arr_set == base_arr_set + + max_display = 110 if reordering_detected else 250 + message = "Differing valid characters" + message += " (substrings reordering detected):\n" if reordering_detected else ":\n" + + message += f" {limited_display( max_display, comp_str_display )}\n" + message += f" {limited_display( max_display, base_str_display )}\n" + message += " " + "".join( + [ "^" if i in differing_indices else " " for i in range( min( max_display, min_length ) ) ] ) + "\n" + + return message + def compareStringArrays( self, path, arr, base_arr ): """ Compare two string datasets. Exact equality is used as the acceptance criteria.