-
-
Notifications
You must be signed in to change notification settings - Fork 1
feat: add a high level dataframe diff utility class #29
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -218,6 +218,179 @@ def are_equal( | |
return equal, compared | ||
|
||
|
||
class DataFrameHighLevelDiff: | ||
"""Class for comparing two dataframes. | ||
|
||
It assumes that all nans are identical, and compares floats by means of certain absolute and relative tolerances. | ||
Construct this class by passing two dataframes of possibly different shape. Then check the are_structurally_equal | ||
property to see if the column and row sets of the two dataframes match and/or check the are_equal flag to also | ||
check for equality of values. | ||
|
||
For cases where there is a difference, various member fields on this class give indications of what is different | ||
(e.g. columns missing in dataframe 1 or 2, index values missing in dataframe 1 or 2, etc.). | ||
|
||
Parameters | ||
---------- | ||
df1 : pd.DataFrame | ||
First dataframe. | ||
df2 : pd.DataFrame | ||
Second dataframe. | ||
absolute_tolerance : float | ||
Absolute tolerance to assume in the comparison of each cell in the dataframes. A value a of an element in df1 is | ||
considered equal to the corresponding element b at the same position in df2, if: | ||
abs(a - b) <= absolute_tolerance | ||
relative_tolerance : float | ||
Relative tolerance to assume in the comparison of each cell in the dataframes. A value a of an element in df1 is | ||
considered equal to the corresponding element b at the same position in df2, if: | ||
abs(a - b) / abs(b) <= relative_tolerance | ||
|
||
""" | ||
|
||
df1: pd.DataFrame | ||
df2: pd.DataFrame | ||
columns_missing_in_df1: List[str] | ||
columns_missing_in_df2: List[str] | ||
index_columns_missing_in_df1: List[str] | ||
index_columns_missing_in_df2: List[str] | ||
index_values_missing_in_df1: pd.Index | ||
index_values_missing_in_df2: pd.Index | ||
duplicate_index_values_in_df1: pd.Series | ||
duplicate_index_values_in_df2: pd.Series | ||
value_differences: Optional[pd.DataFrame] | ||
|
||
def __init__( | ||
self, | ||
df1: pd.DataFrame, | ||
df2: pd.DataFrame, | ||
absolute_tolerance: float, | ||
relative_tolerance: float, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not giving these two a default value? In numpy |
||
): | ||
self.df1 = df1 | ||
self.df2 = df2 | ||
self.absolute_tolerance = absolute_tolerance | ||
self.relative_tolerance = relative_tolerance | ||
self.diff() | ||
|
||
@property | ||
def columns_with_differences(self) -> Any: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could just be called There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changes this a bit now and removed this and the simliar rows property for now (instead you access the value_differences dataframe which is now subset to the structural overlap of the two datasets |
||
"""Return the columns that are different in the two dataframes. | ||
|
||
This will be an array of index values. If the index is a MultiIndex, the index values will be tuples. | ||
""" | ||
if self.value_differences is None: | ||
return pd.array([]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
return self.value_differences.columns.values | ||
|
||
@property | ||
def rows_with_differences(self) -> Any: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could just be called |
||
"""Return the row indices that are different in the two dataframes. | ||
|
||
This will be an array of index values. If the index is a MultiIndex, the index values will be tuples. | ||
""" | ||
if self.value_differences is None: | ||
return pd.array([]) | ||
return self.value_differences.index.values | ||
|
||
def diff(self) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should probably be a hidden method. Otherwise, if |
||
"""Diff the two dataframes. | ||
|
||
This can be a somewhat slow operation | ||
""" | ||
self.columns_missing_in_df1 = sorted( | ||
set(self.df2.columns) - set(self.df1.columns) | ||
) | ||
self.columns_missing_in_df2 = sorted( | ||
set(self.df1.columns) - set(self.df2.columns) | ||
) | ||
self.index_columns_missing_in_df1 = sorted( | ||
set(self.df2.index.names) - set(self.df1.index.names) | ||
) | ||
self.index_columns_missing_in_df2 = sorted( | ||
set(self.df1.index.names) - set(self.df2.index.names) | ||
) | ||
self.index_values_missing_in_df1 = self.df2.index.difference(self.df1.index) | ||
self.index_values_missing_in_df2 = self.df1.index.difference(self.df2.index) | ||
self.duplicate_index_values_in_df1 = self.df1[ | ||
self.df1.index.duplicated() | ||
].index.values | ||
self.duplicate_index_values_in_df2 = self.df2[ | ||
self.df2.index.duplicated() | ||
].index.values | ||
if self.are_structurally_equal: | ||
# We don't use the compare function here from above because it builds a new | ||
# dataframe and we want to leave indices intact so we can know which rows and columns | ||
# were different once we drop the ones with no differences | ||
diffs = self.df1.eq(self.df2) | ||
|
||
# Eq above does not take tolerance into account so compare again with tolerance | ||
# for columns that are numeric. this could probably be sped up with a check on any on | ||
# the column first but would have to be benchmarked | ||
for col in diffs.columns: | ||
if (self.df1[col].dtype in (object, "category")) or ( | ||
self.df2[col].dtype in (object, "category") | ||
): | ||
# Apply a direct comparison for strings or categories | ||
pass | ||
else: | ||
# For numeric data, consider them equal within certain absolute and relative tolerances. | ||
compared_values = np.isclose( | ||
self.df1[col].values, | ||
self.df2[col].values, | ||
atol=self.absolute_tolerance, | ||
rtol=self.relative_tolerance, | ||
) | ||
# Treat nans as equal. | ||
compared_values[ | ||
pd.isnull(self.df1[col].values) | ||
& pd.isnull(self.df2[col].values) | ||
] = True | ||
diffs[col] = compared_values | ||
|
||
# We now have a dataframe with the same shape and indices as df1 and df2, filled with | ||
# True where the values are the same. We want to use true for different values, so invert | ||
# element-wise now | ||
diffs = ~diffs | ||
|
||
if diffs.empty: | ||
self.value_differences = None | ||
else: | ||
# Get a copy of diffs with all rows dropped where all values in a row are False | ||
# (i.e. where df1 and df2 have identical values for all columns) | ||
rows_with_diffs = diffs[diffs.any(axis=1)] | ||
if rows_with_diffs.empty or not rows_with_diffs.any().any(): | ||
self.value_differences = None | ||
else: | ||
# Now figure out all columns where there is at least one difference | ||
columns_with_diffs = diffs.any(axis=0) | ||
if not columns_with_diffs.any(): | ||
self.value_differences = None | ||
else: | ||
# Here we drop the columns that did not have differences. We are left with a dataframe | ||
# with the original indices and only the rows and columns let with differences. | ||
self.value_differences = rows_with_diffs.loc[ | ||
:, columns_with_diffs | ||
] | ||
|
||
@property | ||
def are_structurally_equal(self) -> bool: | ||
"""Check if the two dataframes are structurally equal (i.e. same columns, same index values, ...).""" | ||
return not ( | ||
any(self.columns_missing_in_df1) | ||
or any(self.columns_missing_in_df2) | ||
or any(self.index_columns_missing_in_df1) | ||
or any(self.index_columns_missing_in_df2) | ||
or any(self.index_values_missing_in_df1) | ||
or any(self.index_values_missing_in_df2) | ||
or any(self.duplicate_index_values_in_df1) | ||
or any(self.duplicate_index_values_in_df2) | ||
) | ||
|
||
@property | ||
def are_equal(self) -> bool: | ||
"""Check if the two dataframes are equal, both structurally and cell-wise.""" | ||
return self.are_structurally_equal and self.value_differences is None | ||
|
||
|
||
def groupby_agg( | ||
df: pd.DataFrame, | ||
groupby_columns: Union[List[str], str], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given that we are in the
dataframes
module, I don't think it's necessary to mentionDataFrame
in the name. Maybe justCompare
(which I think is more accurate thanDiff
).