Skip to content

Commit

Permalink
Add char arrays output (#43)
Browse files Browse the repository at this point in the history
* ✨ add char comparison when encountering int8 arrays

* adding detection of wider strings

* better management of strings separators & whitespaces

* removed potencially disturbing character

* 🎨 code style

* 🎨 code style + msg fix
  • Loading branch information
MelReyCG authored Nov 5, 2024
1 parent 8cc89f5 commit a88a093
Showing 1 changed file with 87 additions and 21 deletions.
108 changes: 87 additions & 21 deletions geos-ats/src/geos/ats/helpers/restart_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import argparse
import logging
import time
import string
from pathlib import Path
try:
from geos.ats.helpers.permute_array import permuteArray # type: ignore[import]
Expand Down Expand Up @@ -375,35 +376,100 @@ def compareIntArrays( self, path, arr, base_arr ):
ARR [in]: The hdf5 Dataset to compare.
BASE_ARR [in]: The hdf5 Dataset to compare against.
"""
# If the shapes are different they can't be compared.
message = ""
if arr.shape != base_arr.shape:
msg = "Datasets have different shapes and therefore can't be compared: %s, %s.\n" % ( arr.shape,
base_arr.shape )
self.errorMsg( path, msg, True )
return
message = "Datasets have different shapes and therefore can't be compared statistically: %s, %s.\n" % (
arr.shape, base_arr.shape )
else:
# Calculate the absolute difference.
difference = np.subtract( arr, base_arr )
np.abs( difference, out=difference )

# Create a copy of the arrays.
offenders = difference != 0.0
n_offenders = np.sum( offenders )

# Calculate the absolute difference.
difference = np.subtract( arr, base_arr )
np.abs( difference, out=difference )
if n_offenders != 0:
max_index = np.unravel_index( np.argmax( difference ), difference.shape )
max_difference = difference[ max_index ]
offenders_mean = np.mean( difference[ offenders ] )
offenders_std = np.std( difference[ offenders ] )

offenders = difference != 0.0
n_offenders = np.sum( offenders )
message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % (
arr.dtype, base_arr.dtype, offenders.size, n_offenders )
message += "Statistics of the differences greater than 0:\n"
message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference,
offenders_mean, offenders_std )

if n_offenders != 0:
max_index = np.unravel_index( np.argmax( difference ), difference.shape )
max_difference = difference[ max_index ]
offenders_mean = np.mean( difference[ offenders ] )
offenders_std = np.std( difference[ offenders ] )
# actually, int8 arrays are almost always char arrays, so we sould add a character comparison.
if arr.dtype == np.int8 and base_arr.dtype == np.int8:
message += self.compareCharArrays( arr, base_arr )

message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % (
arr.dtype, base_arr.dtype, offenders.size, n_offenders )
message += "Statistics of the differences greater than 0:\n"
message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference,
offenders_mean, offenders_std )
if message != "":
self.errorMsg( path, message, True )

def compareCharArrays( self, comp_arr, base_arr ):
"""
Compare the valid characters of two arrays and return a formatted string showing differences.
COMP_ARR [in]: The hdf5 Dataset to compare.
BASE_ARR [in]: The hdf5 Dataset to compare against.
Returns a formatted string highlighting the differing characters.
"""
comp_ndarr = np.array( comp_arr ).flatten()
base_ndarr = np.array( base_arr ).flatten()

# Replace invalid characters by group-separator characters ('\x1D')
valid_chars = set( string.printable )
invalid_char = '\x1D'
comp_str = "".join(
[ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in comp_ndarr ] )
base_str = "".join(
[ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in base_ndarr ] )

# replace whitespaces sequences by only one space (preventing indentation / spacing changes detection)
whitespace_pattern = r"[ \t\n\r\v\f]+"
comp_str = re.sub( whitespace_pattern, " ", comp_str )
base_str = re.sub( whitespace_pattern, " ", base_str )
# replace invalid characters sequences by a double space (for clear display)
invalid_char_pattern = r"\x1D+"
comp_str_display = re.sub( invalid_char_pattern, " ", comp_str )
base_str_display = re.sub( invalid_char_pattern, " ", base_str )

message = ""

def limited_display( n, string ):
return string[ :n ] + f"... ({len(string)-n} omitted chars)" if len( string ) > n else string

if len( comp_str ) != len( base_str ):
max_display = 250
message = f"Character arrays have different sizes: {len( comp_str )}, {len( base_str )}.\n"
message += f" {limited_display( max_display, comp_str_display )}\n"
message += f" {limited_display( max_display, base_str_display )}\n"
else:
# We need to trim arrays to the length of the shortest one for the comparisons
min_length = min( len( comp_str_display ), len( base_str_display ) )
comp_str_trim = comp_str_display[ :min_length ]
base_str_trim = base_str_display[ :min_length ]

differing_indices = np.where( np.array( list( comp_str_trim ) ) != np.array( list( base_str_trim ) ) )[ 0 ]
if differing_indices.size != 0:
# check for reordering
arr_set = sorted( set( comp_str.split( invalid_char ) ) )
base_arr_set = sorted( set( base_str.split( invalid_char ) ) )
reordering_detected = arr_set == base_arr_set

max_display = 110 if reordering_detected else 250
message = "Differing valid characters"
message += " (substrings reordering detected):\n" if reordering_detected else ":\n"

message += f" {limited_display( max_display, comp_str_display )}\n"
message += f" {limited_display( max_display, base_str_display )}\n"
message += " " + "".join(
[ "^" if i in differing_indices else " " for i in range( min( max_display, min_length ) ) ] ) + "\n"

return message

def compareStringArrays( self, path, arr, base_arr ):
"""
Compare two string datasets. Exact equality is used as the acceptance criteria.
Expand Down

0 comments on commit a88a093

Please sign in to comment.