From 37caf4b6746abb488fd46c0279a3dae025b04942 Mon Sep 17 00:00:00 2001 From: Christian Hagau Date: Tue, 1 Oct 2024 10:17:46 +0200 Subject: [PATCH] tools: clean up the comparison functionality in the `inspect_feather` utility --- inspect_feather.py | 59 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/inspect_feather.py b/inspect_feather.py index eb8dfca..155ac53 100755 --- a/inspect_feather.py +++ b/inspect_feather.py @@ -49,47 +49,72 @@ def display_full(df, max_rows:Optional[int] = None, max_columns:Optional[int] = display(df) ####################### -# TODO: quick & dirty +# TODO: Quick & dirty. This could be parallelized. def load_all_inputs(input_filenames): input_list = [] + loaded_filenames = [] for filename in input_filenames: try: df = read_from_file(filename) input_list.append(df) + loaded_filenames.append(filename) except FileNotFoundError as e: print(f'!!>> File "{filename}" not found:\n {e}') continue - return input_list + return input_list, loaded_filenames + +def is_df_pair_equal(df0, df1): + is_equal = all((df0 == df1).all()) -def verify_equality(df0, df1): - is_equal = all(df0, df1) return is_equal -# TODO: quick & dirty, comparison only works for two inputs +def is_df_pair_equal_detailed(df0, df1): + diff = df0.compare(df1) + is_equal = diff.empty + + return is_equal, diff + def process_multiple_inputs(input_list:Iterable[str] , context_options , compare:bool = False , detailed_compare:bool = False ): - dfs = load_all_inputs(input_list) + dfs, loaded_filenames = load_all_inputs(input_list) if compare or detailed_compare: + is_equal_results:list[bool] = [] + num_dfs:int = len(dfs) + diffs:dict[int, pd.DataFrame] = {} + + # Compare the loaded DataFrames one-to-one sequentially. 
if detailed_compare: - res = dfs[0].compare(dfs[1]) - print(f'{res=}') - is_equal = all((dfs[0] == dfs[1]).all()) - equal_result = '' if is_equal else ' *NOT*' + for i in range(0, num_dfs-1): + print(f'comparing {loaded_filenames[i]} with {loaded_filenames[i+1]}') + is_equal, diff = is_df_pair_equal_detailed(dfs[i], dfs[i+1]) + # Keep the difference for the detailed log report. + if not diff.empty: + diffs[i] = diff + is_equal_results.append(is_equal) else: - is_equal = all((dfs[0] == dfs[1]).all()) - equal_result = '' if is_equal else ' *NOT*' + for i in range(0, num_dfs-1): + print(f'comparing {loaded_filenames[i]} with {loaded_filenames[i+1]}') + is_equal = is_df_pair_equal(dfs[i], dfs[i+1]) + is_equal_results.append(is_equal) - print(f'{is_equal}: values in the given DataFrames are{equal_result} equal') + is_equal = all(is_equal_results) + equal_result_str = "" if is_equal else " *NOT*" + print(f'{is_equal}: values in the given DataFrames are{equal_result_str} equal') + + if detailed_compare: + # Print the differences for unequal DataFrames. + for k in diffs: + print(f'{k}: {loaded_filenames[k]}: {loaded_filenames[k+1]} :\n{diffs[k]}') return is_equal else: with pd.option_context(*context_options): - print('>>>> Starting a IPython shell...\n>>>> The data has been loaded into `dfs`') + print('>>>> Starting a IPython shell...\n>>>> The DataFrames has been loaded into `dfs`') embed(color_info=True, colors='Linux') ####################### @@ -149,7 +174,11 @@ def main(): context_options = process_context_options(args) if len(args.input) > 1 : - process_multiple_inputs(args.input, context_options, compare=args.compare, detailed_compare=args.detailed_compare) + result_code = process_multiple_inputs(args.input, context_options, compare=args.compare, detailed_compare=args.detailed_compare) + if result_code is not None: + exit((0 if result_code else 1)) + else: + exit(0) else: process_single_input(args.input[0], context_options)