Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add char arrays output #43

Merged
merged 6 commits into from
Nov 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 87 additions & 21 deletions geos-ats/src/geos/ats/helpers/restart_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import argparse
import logging
import time
import string
from pathlib import Path
try:
from geos.ats.helpers.permute_array import permuteArray # type: ignore[import]
Expand Down Expand Up @@ -375,35 +376,100 @@ def compareIntArrays( self, path, arr, base_arr ):
ARR [in]: The hdf5 Dataset to compare.
BASE_ARR [in]: The hdf5 Dataset to compare against.
"""
# If the shapes are different they can't be compared.
message = ""
if arr.shape != base_arr.shape:
msg = "Datasets have different shapes and therefore can't be compared: %s, %s.\n" % ( arr.shape,
base_arr.shape )
self.errorMsg( path, msg, True )
return
message = "Datasets have different shapes and therefore can't be compared statistically: %s, %s.\n" % (
arr.shape, base_arr.shape )
else:
# Calculate the absolute difference.
difference = np.subtract( arr, base_arr )
np.abs( difference, out=difference )

# Create a copy of the arrays.
offenders = difference != 0.0
n_offenders = np.sum( offenders )

# Calculate the absolute difference.
difference = np.subtract( arr, base_arr )
np.abs( difference, out=difference )
if n_offenders != 0:
max_index = np.unravel_index( np.argmax( difference ), difference.shape )
max_difference = difference[ max_index ]
offenders_mean = np.mean( difference[ offenders ] )
offenders_std = np.std( difference[ offenders ] )

offenders = difference != 0.0
n_offenders = np.sum( offenders )
message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % (
arr.dtype, base_arr.dtype, offenders.size, n_offenders )
message += "Statistics of the differences greater than 0:\n"
message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference,
offenders_mean, offenders_std )

if n_offenders != 0:
max_index = np.unravel_index( np.argmax( difference ), difference.shape )
max_difference = difference[ max_index ]
offenders_mean = np.mean( difference[ offenders ] )
offenders_std = np.std( difference[ offenders ] )
# actually, int8 arrays are almost always char arrays, so we sould add a character comparison.
if arr.dtype == np.int8 and base_arr.dtype == np.int8:
message += self.compareCharArrays( arr, base_arr )

message = "Arrays of types %s and %s have %s values of which %d have differing values.\n" % (
arr.dtype, base_arr.dtype, offenders.size, n_offenders )
message += "Statistics of the differences greater than 0:\n"
message += "\tmax_index = %s, max = %s, mean = %s, std = %s\n" % ( max_index, max_difference,
offenders_mean, offenders_std )
if message != "":
self.errorMsg( path, message, True )

def compareCharArrays( self, comp_arr, base_arr ):
"""
Compare the valid characters of two arrays and return a formatted string showing differences.

COMP_ARR [in]: The hdf5 Dataset to compare.
BASE_ARR [in]: The hdf5 Dataset to compare against.

Returns a formatted string highlighting the differing characters.
"""
comp_ndarr = np.array( comp_arr ).flatten()
base_ndarr = np.array( base_arr ).flatten()

# Replace invalid characters by group-separator characters ('\x1D')
valid_chars = set( string.printable )
invalid_char = '\x1D'
comp_str = "".join(
[ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in comp_ndarr ] )
base_str = "".join(
[ chr( x ) if ( x >= 0 and chr( x ) in valid_chars ) else invalid_char for x in base_ndarr ] )

# replace whitespaces sequences by only one space (preventing indentation / spacing changes detection)
whitespace_pattern = r"[ \t\n\r\v\f]+"
comp_str = re.sub( whitespace_pattern, " ", comp_str )
base_str = re.sub( whitespace_pattern, " ", base_str )
# replace invalid characters sequences by a double space (for clear display)
invalid_char_pattern = r"\x1D+"
comp_str_display = re.sub( invalid_char_pattern, " ", comp_str )
base_str_display = re.sub( invalid_char_pattern, " ", base_str )

message = ""

def limited_display( n, string ):
return string[ :n ] + f"... ({len(string)-n} omitted chars)" if len( string ) > n else string

if len( comp_str ) != len( base_str ):
max_display = 250
message = f"Character arrays have different sizes: {len( comp_str )}, {len( base_str )}.\n"
message += f" {limited_display( max_display, comp_str_display )}\n"
message += f" {limited_display( max_display, base_str_display )}\n"
else:
# We need to trim arrays to the length of the shortest one for the comparisons
min_length = min( len( comp_str_display ), len( base_str_display ) )
comp_str_trim = comp_str_display[ :min_length ]
base_str_trim = base_str_display[ :min_length ]

differing_indices = np.where( np.array( list( comp_str_trim ) ) != np.array( list( base_str_trim ) ) )[ 0 ]
if differing_indices.size != 0:
# check for reordering
arr_set = sorted( set( comp_str.split( invalid_char ) ) )
base_arr_set = sorted( set( base_str.split( invalid_char ) ) )
reordering_detected = arr_set == base_arr_set

max_display = 110 if reordering_detected else 250
message = "Differing valid characters"
message += " (substrings reordering detected):\n" if reordering_detected else ":\n"

message += f" {limited_display( max_display, comp_str_display )}\n"
message += f" {limited_display( max_display, base_str_display )}\n"
message += " " + "".join(
[ "^" if i in differing_indices else " " for i in range( min( max_display, min_length ) ) ] ) + "\n"

return message

def compareStringArrays( self, path, arr, base_arr ):
"""
Compare two string datasets. Exact equality is used as the acceptance criteria.
Expand Down
Loading