From a48abb8ea7e5fd0d9a9122375d5bd451a3b1d311 Mon Sep 17 00:00:00 2001 From: Erin McAuley Date: Fri, 21 Feb 2025 16:52:48 -0500 Subject: [PATCH] tests: add a user-friendly diff for failing tests (#95) Closes #94. This PR uses `pandas` to compare generated dataframes between `pixy` ground-truth data and generated data within the regression testing suite. If a test fails, the diff is displayed to the user. For each test failure, the `AssertionError`, test name, and diff are output. Here is an updated example of the output: ``` ============================== 1 failed in 5.35s =============================== FAILED [100%] tests/main/test_main.py:406 (test_pixy_main_valid_inputs) pixy_out_dir = PosixPath('/private/var/folders/m8/w5g78dgx65504pv0zv2fdxcm0000gn/T/pytest-of-erin/pytest-985/test_pixy_main_valid_inputs0/output') expected_outputs = PosixPath('/Users/erin/git/oss/pixy/tests/main/expected_outputs') ag1000_pop_path = PosixPath('/Users/erin/git/oss/pixy/tests/main/data/ag1000_populations_file.txt') ag1000_vcf_path = PosixPath('/Users/erin/git/oss/pixy/tests/main/data/ag1000_pixy_test.vcf.gz') capsys = <_pytest.capture.CaptureFixture object at 0x7fe220c99c10> def test_pixy_main_valid_inputs( pixy_out_dir: Path, expected_outputs: Path, ag1000_pop_path: Path, ag1000_vcf_path: Path, capsys: pytest.CaptureFixture, ) -> None: """ Given specific input data, assert that outputs do not change. Uses `filecmp` library to compare 2 files without opening them and reading line-by-line. `filecmp.cmp` returns True if 2 files are equal. 
""" run_pixy_helper( pixy_out_dir=pixy_out_dir, stats=["pi", "fst", "dxy"], window_size=10000, vcf_path=ag1000_vcf_path, populations_path=ag1000_pop_path, ) captured = capsys.readouterr() assert "Data set contains 2 population(s), 2 chromosome(s), and 36 sample(s)" in captured.out expected_out_files: List[Path] = [ Path("pixy_dxy.txt"), Path("pixy_fst.txt"), Path("pixy_pi.txt"), ] for file in expected_out_files: generated_data_path: Path = pixy_out_dir / file exp_data_path: Path = expected_outputs / "baseline" / file assert generated_data_path.exists() > assert_files_are_consistent(generated_data_path,exp_data_path) test_main.py:440: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ gen_file_path = PosixPath('/private/var/folders/m8/w5g78dgx65504pv0zv2fdxcm0000gn/T/pytest-of-erin/pytest-985/test_pixy_main_valid_inputs0/output/pixy_dxy.txt') exp_file_path = PosixPath('/Users/erin/git/oss/pixy/tests/main/expected_outputs/baseline/pixy_dxy.txt') def assert_files_are_consistent(gen_file_path: Path, exp_file_path: Path) -> None: """ Helper to display diff if two files are found to be different. The basename of the generated file is shown (e.g., `pixy_dxy`). The full path of the expected file is given to highlight which test case is failing. 
Args: exp_file_path: the path of the expected file (i.e., ground-truth) gen_file_path: the path of the generated file Raises: AssertionError, if the two files are not the same """ if not files_are_consistent(gen_file_path, exp_file_path): diff_string: str = show_diff(gen_file_path, exp_file_path) > raise AssertionError(f"Files differ: {gen_file_path.stem}, {exp_file_path}\n{diff_string}") E AssertionError: Files differ: pixy_dxy, /Users/erin/git/oss/pixy/tests/main/expected_outputs/baseline/pixy_dxy.txt E avg_dxy count_comparisons count_missing E 1 Expected 0.002453 9820052.0 2703196.0 E Generated 0.002454 9818500.0 2704748.0 E 2 Expected 0.001746 10398484.0 2161052.0 E Generated 0.001746 10397188.0 2162348.0 E 3 Expected 0.001072 11058900.0 1285500.0 E Generated 0.001072 11057780.0 1286620.0 E 7 Expected 0.002453 9820052.0 2703196.0 E Generated 0.002454 9818500.0 2704748.0 E 8 Expected 0.001746 10398484.0 2161052.0 E Generated 0.001746 10397188.0 2162348.0 E 9 Expected 0.001072 11058900.0 1285500.0 E Generated 0.001072 11057780.0 1286620.0 ../conftest.py:224: AssertionError ``` --- tests/conftest.py | 49 +++++++++++++++++++++++++++++++++++++++-- tests/main/test_main.py | 15 ++++++------- 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 70e0037..8eb6c56 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ from typing import Optional from unittest.mock import patch +import pandas as pd import pytest from pixy.__main__ import main @@ -214,16 +215,61 @@ def run_pixy_helper( # noqa: C901 if fst_type is not None: test_args.extend((["--fst_type", f"{fst_type}"])) - print(f"test_args: {test_args}") with patch.object(sys, "argv", test_args): main() +def assert_files_are_consistent(gen_file_path: Path, exp_file_path: Path) -> None: + """ + Helper to display diff if two files are found to be different. + + The basename of the generated file is shown (e.g., `pixy_dxy`). 
The full path of the expected + file is given to highlight which test case is failing. + + Args: + exp_file_path: the path of the expected file (i.e., ground-truth) + gen_file_path: the path of the generated file + + Raises: + AssertionError, if the two files are not the same + """ + if not files_are_consistent(gen_file_path, exp_file_path): + diff_string: str = show_diff(exp_file_path, gen_file_path) + raise AssertionError(f"Files differ: {gen_file_path.stem}, {exp_file_path}\n{diff_string}") + + +def show_diff(expected_file: Path, generated_file: Path) -> str: + """ + Show the diff between an expected and generated file. + + Useful to examine why a regression test fails. + + Args: + expected_file: the path of the expected file (i.e., ground-truth) + generated_file: the path of the generated file + + """ + exp = pd.read_csv(expected_file, sep="\t") + gen = pd.read_csv(generated_file, sep="\t") + diff = exp.compare(gen, result_names=("Expected", "Generated"), align_axis=0) + pretty_display: str = diff.to_string() + return pretty_display + + def files_are_consistent(gen_file_path: Path, exp_file_path: Path) -> bool: """ Helper function to compare non-deterministic files generated by `pixy`. + Checks that the headers are the same and the length of the files matches. + Sorts the data to be deterministic before comparing to ground-truth data. + Used in regression testing to compare specific rows in generated files for reproducibility.
+ + Raises: + FileNotFoundError: if one or both files do not exist + ValueError: if one of the provided paths is not a file + Returns: + True if lines in file match each other; False if there is a discrepancy """ if not gen_file_path.exists() or not exp_file_path.exists(): raise FileNotFoundError("One or both files do not exist.") @@ -245,5 +291,4 @@ def files_are_consistent(gen_file_path: Path, exp_file_path: Path) -> bool: for line1, line2 in zip(generated_data, expected_data): if line1 != line2: return False - return True diff --git a/tests/main/test_main.py b/tests/main/test_main.py index fe6b5a6..4964d44 100644 --- a/tests/main/test_main.py +++ b/tests/main/test_main.py @@ -1,4 +1,3 @@ -import filecmp import logging import os import shutil @@ -9,7 +8,7 @@ import pytest -from tests.conftest import files_are_consistent +from tests.conftest import assert_files_are_consistent from tests.conftest import run_pixy_helper ################################################################################ @@ -433,7 +432,7 @@ def test_pixy_csi_index( exp_data_path: Path = expected_outputs / "baseline" / file assert generated_data_path.exists() - assert filecmp.cmp(generated_data_path, exp_data_path) + assert_files_are_consistent(generated_data_path, exp_data_path) ####################################### @@ -528,7 +527,7 @@ def test_pixy_main_valid_inputs( exp_data_path: Path = expected_outputs / "baseline" / file assert generated_data_path.exists() - assert filecmp.cmp(generated_data_path, exp_data_path) + assert_files_are_consistent(generated_data_path, exp_data_path) ################################################################################ @@ -583,7 +582,7 @@ def test_pixy_limited_sites( exp_data_path: Path = expected_outputs / output_prefix / file assert generated_data_path.exists() - assert files_are_consistent(generated_data_path, exp_data_path) + assert_files_are_consistent(generated_data_path, exp_data_path) 
################################################################################ @@ -623,7 +622,7 @@ def test_pixy_limited_bed_file( exp_data_path: Path = expected_outputs / "limited_bed" / file assert generated_data_path.exists() - assert files_are_consistent(generated_data_path, exp_data_path) + assert_files_are_consistent(generated_data_path, exp_data_path) ################################################################################ @@ -674,7 +673,7 @@ def test_pixy_limited_sites_bed( generated_data_path: Path = pixy_out_dir / file exp_data_path: Path = expected_outputs / "limited_sites_and_bed" / file assert generated_data_path.exists() - assert files_are_consistent(generated_data_path, exp_data_path) + assert_files_are_consistent(generated_data_path, exp_data_path) ############################################################################### @@ -709,4 +708,4 @@ def test_pixy_hudson_fst( generated_data_path: Path = pixy_out_dir / file exp_data_path: Path = expected_outputs / "hudson_fst" / file assert generated_data_path.exists() - assert files_are_consistent(generated_data_path, exp_data_path) + assert_files_are_consistent(generated_data_path, exp_data_path)