Skip to content

Commit

Permalink
enhance: sync HighLevelDiff temp copy from data-utils-py, fix formatt…
Browse files Browse the repository at this point in the history
…ing and lint issues
  • Loading branch information
danyx23 committed Jul 11, 2022
1 parent 51d47a9 commit 1dbf7bb
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 40 deletions.
19 changes: 10 additions & 9 deletions etl/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

from pathlib import Path
import click
from rich_click import RichCommand, RichGroup
from rich_click.rich_command import RichCommand
from rich_click.rich_group import RichGroup
from rich import print
from typing import cast
import pandas as pd
Expand All @@ -13,7 +14,7 @@


@click.group(cls=RichGroup)
def cli():
def cli() -> None:
"""Compare two dataframes, both structurally and the values.
This tool loads two dataframes, either from the local ETL and the remote catalog
Expand All @@ -37,11 +38,9 @@ def diff_print_and_exit(
show_values: bool,
show_shared: bool,
truncate_lists_at: int,
):
) -> None:
"""Runs the comparison and prints the differences, then exits with the appropriate exit code."""
diff = tempcompare.DataFrameHighLevelDiff(
df1, df2, absolute_tolerance, relative_tolerance
)
diff = tempcompare.HighLevelDiff(df1, df2, absolute_tolerance, relative_tolerance)
if diff.are_equal:
print("[green]Dataframes are equal (within the given thresholds)[/green]")
exit(0)
Expand Down Expand Up @@ -119,7 +118,9 @@ def etl_catalog(
are structurally equal but are otherwise different, 3 if the dataframes have different structure and/or different values.
"""
try:
remote_df = catalog.find_one(table=table, namespace=namespace, dataset=dataset, channels=[cast(catalog.CHANNEL, channel)]) # type: ignore
remote_df = catalog.find_one(
table=table, namespace=namespace, dataset=dataset, channels=channel
)
local_catalog = catalog.LocalCatalog("data")
local_df = local_catalog.find_one(
table=table,
Expand Down Expand Up @@ -165,11 +166,11 @@ def load_dataframe(path_str: str) -> pd.DataFrame:
raise Exception("File does not exist: " + path_str)

if path.suffix.lower() == ".feather":
return pd.read_feather(path_str)
return cast(pd.DataFrame, pd.read_feather(path_str))
elif path.suffix.lower() == ".csv":
return pd.read_csv(path_str)
elif path.suffix.lower() == ".parquet":
return pd.read_parquet(path_str)
return cast(pd.DataFrame, pd.read_parquet(path_str))
else:
raise Exception("Unknown file format: " + path_str)

Expand Down
95 changes: 64 additions & 31 deletions etl/tempcompare.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Callable, Generator, Iterable
from typing import Callable, Generator, Iterable, cast
import pandas as pd
from typing import List, Optional, Any
import numpy as np
Expand Down Expand Up @@ -80,17 +80,20 @@ def yield_formatted_if_not_empty(
yield fallback_message


class DataFrameHighLevelDiff:
class HighLevelDiff:
"""Class for comparing two dataframes.
It assumes that all nans are identical, and compares floats by means of certain absolute and relative tolerances.
Construct this class by passing two dataframes of possibly different shape. Then check the are_structurally_equal
property to see if the column and row sets of the two dataframes match and/or check the are_equal flag to also
check for equality of values.
check for equality of values. The other fields give detailed information on what is different between the two
dataframes.
For cases where there is a difference, various member fields on this class give indications of what is different
(e.g. columns missing in dataframe 1 or 2, index values missing in dataframe 1 or 2, etc.).
The get_description_lines method fetches a list of strings that compactly describe the differences for humans.
Parameters
----------
df1 : pd.DataFrame
Expand Down Expand Up @@ -127,30 +130,31 @@ def __init__(
self,
df1: pd.DataFrame,
df2: pd.DataFrame,
absolute_tolerance: float,
relative_tolerance: float,
absolute_tolerance: float = 1e-08,
relative_tolerance: float = 1e-05,
):
self.df1 = df1
self.df2 = df2
self.absolute_tolerance = absolute_tolerance
self.relative_tolerance = relative_tolerance
self.diff()
self._diff()

@property
def value_differences_count(self) -> int:
"""Get number of cells in the structural overlap of the two dataframes that differ by more than tolerance."""
if self.value_differences is None:
return 0
else:
return self.value_differences.sum().sum()
return int(self.value_differences.sum().sum())

@property
def columns_with_differences(self) -> Any:
"""Return the columns that are different in the two dataframes.
"""Get the columns that are different in the two dataframes.
This will be an array of index values. If the index is a MultiIndex, the index values will be tuples.
"""
if self.value_differences is None:
return pd.array([])
return np.array([])
return self.value_differences.columns.values

@property
Expand All @@ -160,10 +164,10 @@ def rows_with_differences(self) -> Any:
This will be an array of index values. If the index is a MultiIndex, the index values will be tuples.
"""
if self.value_differences is None:
return pd.array([])
return np.array([])
return self.value_differences.index.values

def diff(self) -> None:
def _diff(self) -> None:
"""Diff the two dataframes.
This can be a somewhat slow operation
Expand Down Expand Up @@ -249,7 +253,7 @@ def diff(self) -> None:
self.value_differences = None
else:
# Here we drop the columns that did not have differences. We are left with a dataframe
# with the original indices and only the rows and columns let with differences.
# with the original indices and only the rows and columns with differences.
self.value_differences = rows_with_diffs.loc[
:, columns_with_diffs
]
Expand All @@ -271,7 +275,7 @@ def are_structurally_equal(self) -> bool:
@property
def are_equal(self) -> bool:
"""Check if the two dataframes are equal, both structurally and cell-wise."""
return self.are_structurally_equal and self.value_differences is None
return self.are_structurally_equal and self.are_overlapping_values_equal

@property
def are_overlapping_values_equal(self) -> bool:
Expand All @@ -280,23 +284,31 @@ def are_overlapping_values_equal(self) -> bool:

@property
def df1_value_differences(self) -> Optional[pd.DataFrame]:
"""Returns a sliced version of df1 that contains only the columns and rows that differ from df2 (but with the
original values from df1)."""
"""Get a sliced version of df1 that contains only the columns and rows that differ from df2.
Note that this only includes the part of the dataframe that has structural overlap with
the other dataframe (i.e. extra columns or rows are not included).
"""
if self.value_differences is None:
return None
return self.df1.loc[
self.value_differences.index, self.value_differences.columns
]
return cast(
pd.DataFrame,
self.df1.loc[self.value_differences.index, self.value_differences.columns],
)

@property
def df2_value_differences(self) -> Optional[pd.DataFrame]:
"""Returns a sliced version of df2 that contains only the columns and rows that differ from df1 (but with the
original values from df2)."""
"""Get a sliced version of df2 that contains only the columns and rows that differ from df1.
Note that this only includes the part of the dataframe that has structural overlap with
the other dataframe (i.e. extra columns or rows are not included).
"""
if self.value_differences is None:
return None
return self.df2.loc[
self.value_differences.index, self.value_differences.columns
]
return cast(
pd.DataFrame,
self.df2.loc[self.value_differences.index, self.value_differences.columns],
)

def get_description_lines_for_diff(
self,
Expand All @@ -307,19 +319,33 @@ def get_description_lines_for_diff(
show_shared: bool = False,
truncate_lists_longer_than: int = 20,
) -> Generator[str, None, None]:
"""Generate a human readable description of the differences between the two dataframes.
It is returned as a generator of strings, roughly one line per string yielded
(dataframe printing is done by pandas as one string and is returned as a single yielded item)
"""
red, red_end = ("[red]", "[/red]") if use_color_tags else ("", "")
green, green_end = ("[green]", "[/green]") if use_color_tags else ("", "")
blue, blue_end = ("[blue]", "[/blue]") if use_color_tags else ("", "")

if self.are_equal:
yield (f"{green}{df1_label} == {df2_label}{green_end}")
yield (f"{green}{df1_label} is equal to {df2_label}{green_end}")
else:
yield (f"{red}{df1_label} {df2_label}{red_end}")
yield (f"{red}{df1_label} is not equal to {df2_label}{red_end}")

if self.are_structurally_equal:
yield (f"The structure is {green}identical{green_end}")
else:
yield (f"The structure is {red}different{red_end}")

# The structure below works like this: we have a property that is a list
# (e.g. self.columns_missing_in_df1) that can be empty or have elements.
# If the list is empty we don't want to yield any lines. If the list has elements
# we want to yield a line. Additionally, we also want to truncate lines with many
# elements if they are too long. We use yield_formatted_if_not_empty on most of the
# member properties to output the differences if there are any.

# Structural differences
if show_shared:
yield from yield_formatted_if_not_empty(
self.columns_shared,
Expand All @@ -331,10 +357,6 @@ def get_description_lines_for_diff(
),
f"{red}No shared columns{red_end}",
)
elif len(self.columns_shared) == 0:
# Warn about no shared columns even if show_shared is not set
yield f"{red}No shared columns{red_end}"

yield from yield_formatted_if_not_empty(
self.columns_missing_in_df1,
lambda item: yield_list_lines(
Expand Down Expand Up @@ -438,9 +460,14 @@ def get_description_lines_for_diff(
),
)

# Show "coordinates" where there are value differences
# This is done in compact form, e.g. if you have 10 new years for 200 countries
# that would be 2000 values but instead we unpack the hierarchical index tuples
# and show a (shortened) list of the 200 countries and of the 10 new years.
if self.value_differences is not None:
yield (
f"Values in the shared columns/rows are {red}different{red_end}. ({self.value_differences_count} different cells)"
f"Values in the shared columns/rows are {red}different{red_end}. "
+ f"({self.value_differences_count} different cells)"
)
yield from yield_formatted_if_not_empty(
self.columns_with_differences,
Expand All @@ -463,8 +490,14 @@ def get_description_lines_for_diff(
),
)

# This prints the two dataframes one after the other sliced to
# only the area where they have differences
if preview_different_dataframe_values:
if self.columns_shared and self.index_values_shared:
if (
self.value_differences
and self.columns_shared
and self.index_values_shared
):
yield f"Values with differences in {df1_label}:"
yield (
str(
Expand Down

0 comments on commit 1dbf7bb

Please sign in to comment.