enhance: sync HighLevelDiff temp copy from data-utils-py, fix formatt…

…ing and lint issues
owid · Jul 11, 2022 · 1dbf7bb · 1dbf7bb
1 parent 51d47a9
commit 1dbf7bb
Show file tree

Hide file tree

Showing 2 changed files with 74 additions and 40 deletions.
diff --git a/etl/compare.py b/etl/compare.py
@@ -4,7 +4,8 @@
 
 from pathlib import Path
 import click
-from rich_click import RichCommand, RichGroup
+from rich_click.rich_command import RichCommand
+from rich_click.rich_group import RichGroup
 from rich import print
 from typing import cast
 import pandas as pd
@@ -13,7 +14,7 @@
 
 
 @click.group(cls=RichGroup)
-def cli():
+def cli() -> None:
     """Compare two dataframes, both structurally and the values.
 
     This tool loads two dataframes, either from the local ETL and the remote catalog
@@ -37,11 +38,9 @@ def diff_print_and_exit(
     show_values: bool,
     show_shared: bool,
     truncate_lists_at: int,
-):
+) -> None:
     """Runs the comparison and prints the differences, then exits with the appropriate exit code."""
-    diff = tempcompare.DataFrameHighLevelDiff(
-        df1, df2, absolute_tolerance, relative_tolerance
-    )
+    diff = tempcompare.HighLevelDiff(df1, df2, absolute_tolerance, relative_tolerance)
     if diff.are_equal:
         print("[green]Dataframes are equal (within the given thresholds)[/green]")
         exit(0)
@@ -119,7 +118,9 @@ def etl_catalog(
     are structurally equal but are otherwise different, 3 if the dataframes have different structure and/or different values.
     """
     try:
-        remote_df = catalog.find_one(table=table, namespace=namespace, dataset=dataset, channels=[cast(catalog.CHANNEL, channel)])  # type: ignore
+        remote_df = catalog.find_one(
+            table=table, namespace=namespace, dataset=dataset, channels=channel
+        )
         local_catalog = catalog.LocalCatalog("data")
         local_df = local_catalog.find_one(
             table=table,
@@ -165,11 +166,11 @@ def load_dataframe(path_str: str) -> pd.DataFrame:
         raise Exception("File does not exist: " + path_str)
 
     if path.suffix.lower() == ".feather":
-        return pd.read_feather(path_str)
+        return cast(pd.DataFrame, pd.read_feather(path_str))
     elif path.suffix.lower() == ".csv":
         return pd.read_csv(path_str)
     elif path.suffix.lower() == ".parquet":
-        return pd.read_parquet(path_str)
+        return cast(pd.DataFrame, pd.read_parquet(path_str))
     else:
         raise Exception("Unknown file format: " + path_str)
 

diff --git a/etl/tempcompare.py b/etl/tempcompare.py
@@ -1,4 +1,4 @@
-from typing import Callable, Generator, Iterable
+from typing import Callable, Generator, Iterable, cast
 import pandas as pd
 from typing import List, Optional, Any
 import numpy as np
@@ -80,17 +80,20 @@ def yield_formatted_if_not_empty(
         yield fallback_message
 
 
-class DataFrameHighLevelDiff:
+class HighLevelDiff:
     """Class for comparing two dataframes.
 
     It assumes that all nans are identical, and compares floats by means of certain absolute and relative tolerances.
     Construct this class by passing two dataframes of possibly different shape. Then check the are_structurally_equal
     property to see if the column and row sets of the two dataframes match and/or check the are_equal flag to also
-    check for equality of values.
+    check for equality of values. The other fields give detailed information on what is different between the two
+    dataframes.
 
     For cases where there is a difference, various member fields on this class give indications of what is different
     (e.g. columns missing in dataframe 1 or 2, index values missing in dataframe 1 or 2, etc.).
 
+    The get_description_lines method fetches a list of strings that compactly describe the differences for humans.
+
     Parameters
     ----------
     df1 : pd.DataFrame
@@ -127,30 +130,31 @@ def __init__(
         self,
         df1: pd.DataFrame,
         df2: pd.DataFrame,
-        absolute_tolerance: float,
-        relative_tolerance: float,
+        absolute_tolerance: float = 1e-08,
+        relative_tolerance: float = 1e-05,
     ):
         self.df1 = df1
         self.df2 = df2
         self.absolute_tolerance = absolute_tolerance
         self.relative_tolerance = relative_tolerance
-        self.diff()
+        self._diff()
 
     @property
     def value_differences_count(self) -> int:
+        """Get number of cells in the structural overlap of the two dataframes that differ by more than tolerance."""
         if self.value_differences is None:
             return 0
         else:
-            return self.value_differences.sum().sum()
+            return int(self.value_differences.sum().sum())
 
     @property
     def columns_with_differences(self) -> Any:
-        """Return the columns that are different in the two dataframes.
+        """Get the columns that are different in the two dataframes.
 
         This will be an array of index values. If the index is a MultiIndex, the index values will be tuples.
         """
         if self.value_differences is None:
-            return pd.array([])
+            return np.array([])
         return self.value_differences.columns.values
 
     @property
@@ -160,10 +164,10 @@ def rows_with_differences(self) -> Any:
         This will be an array of index values. If the index is a MultiIndex, the index values will be tuples.
         """
         if self.value_differences is None:
-            return pd.array([])
+            return np.array([])
         return self.value_differences.index.values
 
-    def diff(self) -> None:
+    def _diff(self) -> None:
         """Diff the two dataframes.
 
         This can be a somewhat slow operation
@@ -249,7 +253,7 @@ def diff(self) -> None:
                         self.value_differences = None
                     else:
                         # Here we drop the columns that did not have differences. We are left with a dataframe
-                        # with the original indices and only the rows and columns let with differences.
+                        # with the original indices and only the rows and columns with differences.
                         self.value_differences = rows_with_diffs.loc[
                             :, columns_with_diffs
                         ]
@@ -271,7 +275,7 @@ def are_structurally_equal(self) -> bool:
     @property
     def are_equal(self) -> bool:
         """Check if the two dataframes are equal, both structurally and cell-wise."""
-        return self.are_structurally_equal and self.value_differences is None
+        return self.are_structurally_equal and self.are_overlapping_values_equal
 
     @property
     def are_overlapping_values_equal(self) -> bool:
@@ -280,23 +284,31 @@ def are_overlapping_values_equal(self) -> bool:
 
     @property
     def df1_value_differences(self) -> Optional[pd.DataFrame]:
-        """Returns a sliced version of df1 that contains only the columns and rows that differ from df2 (but with the
-        original values from df1)."""
+        """Get a sliced version of df1 that contains only the columns and rows that differ from df2.
+
+        Note that this only includes the part of the dataframe that has structural overlap with
+        the other dataframe (i.e. extra columns or rows are not included).
+        """
         if self.value_differences is None:
             return None
-        return self.df1.loc[
-            self.value_differences.index, self.value_differences.columns
-        ]
+        return cast(
+            pd.DataFrame,
+            self.df1.loc[self.value_differences.index, self.value_differences.columns],
+        )
 
     @property
     def df2_value_differences(self) -> Optional[pd.DataFrame]:
-        """Returns a sliced version of df2 that contains only the columns and rows that differ from df1 (but with the
-        original values from df2)."""
+        """Get a sliced version of df2 that contains only the columns and rows that differ from df2.
+
+        Note that this only includes the part of the dataframe that has structural overlap with
+        the other dataframe (i.e. extra columns or rows are not included).
+        """
         if self.value_differences is None:
             return None
-        return self.df2.loc[
-            self.value_differences.index, self.value_differences.columns
-        ]
+        return cast(
+            pd.DataFrame,
+            self.df2.loc[self.value_differences.index, self.value_differences.columns],
+        )
 
     def get_description_lines_for_diff(
         self,
@@ -307,19 +319,33 @@ def get_description_lines_for_diff(
         show_shared: bool = False,
         truncate_lists_longer_than: int = 20,
     ) -> Generator[str, None, None]:
+        """Generate a human readable description of the differences between the two dataframes.
+
+        It is returned as a generator of strings, roughly one line per string yielded
+        (dataframe printing is done by pandas as one string and is returned as a single yielded item)
+        """
         red, red_end = ("[red]", "[/red]") if use_color_tags else ("", "")
         green, green_end = ("[green]", "[/green]") if use_color_tags else ("", "")
         blue, blue_end = ("[blue]", "[/blue]") if use_color_tags else ("", "")
 
         if self.are_equal:
-            yield (f"{green}{df1_label} == {df2_label}{green_end}")
+            yield (f"{green}{df1_label} is equal to {df2_label}{green_end}")
         else:
-            yield (f"{red}{df1_label} ≠ {df2_label}{red_end}")
+            yield (f"{red}{df1_label} is not equal to {df2_label}{red_end}")
 
             if self.are_structurally_equal:
                 yield (f"The structure is {green}identical{green_end}")
             else:
                 yield (f"The structure is {red}different{red_end}")
+
+                # The structure below works like this: we have a property that is a list
+                # (e.g. self.columns_missing_in_df1) that can be empty or have elements.
+                # If the list is empty we don't want to yield any lines. If the list has elements
+                # we want to yield a line. Additionally, we also want to truncate lines with many
+                # elements if they are too long. We use yield_formatted_if_not_empty on most of the
+                # member properties to output the differences if there are any.
+
+                # Structural differences
                 if show_shared:
                     yield from yield_formatted_if_not_empty(
                         self.columns_shared,
@@ -331,10 +357,6 @@ def get_description_lines_for_diff(
                         ),
                         f"{red}No shared columns{red_end}",
                     )
-                elif len(self.columns_shared) == 0:
-                    # Warn about no shared columns even if show_shared is not set
-                    yield f"{red}No shared columns{red_end}"
-
                 yield from yield_formatted_if_not_empty(
                     self.columns_missing_in_df1,
                     lambda item: yield_list_lines(
@@ -438,9 +460,14 @@ def get_description_lines_for_diff(
                     ),
                 )
 
+            # Show "coordinates" where there are value differences
+            # This is done in compact form, e.g. if you have 10 new years for 200 countries
+            # that would be 2000 values but instead we unpack the hierarchical index tuples
+            # and show that a (shortened) list for the 200 countries and the 10 new years.
             if self.value_differences is not None:
                 yield (
-                    f"Values in the shared columns/rows are {red}different{red_end}. ({self.value_differences_count} different cells)"
+                    f"Values in the shared columns/rows are {red}different{red_end}. "
+                    + f"({self.value_differences_count} different cells)"
                 )
                 yield from yield_formatted_if_not_empty(
                     self.columns_with_differences,
@@ -463,8 +490,14 @@ def get_description_lines_for_diff(
                     ),
                 )
 
+        # This prints the two dataframes one after the other sliced to
+        # only the area where they have differences
         if preview_different_dataframe_values:
-            if self.columns_shared and self.index_values_shared:
+            if (
+                self.value_differences
+                and self.columns_shared
+                and self.index_values_shared
+            ):
                 yield f"Values with differences in {df1_label}:"
                 yield (
                     str(