From b9489ae5b71b7095f0cdbb3ef8a572d41e668fd7 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 7 Nov 2023 08:08:33 +0100 Subject: [PATCH] Improve error message on refhash mismatch * Reword the refhash error message to be clearer. * Also write refhash error message to the log * Also improve the error message on malformed FASTA header --- vamb/parsebam.py | 14 ++++++--- vamb/parsecontigs.py | 10 ++----- vamb/vambtools.py | 67 +++++++++++++++++++++++++++++++++----------- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/vamb/parsebam.py b/vamb/parsebam.py index 1d0b2774..e762f9c5 100644 --- a/vamb/parsebam.py +++ b/vamb/parsebam.py @@ -75,7 +75,11 @@ def load( ) if refhash is not None: vambtools.RefHasher.verify_refhash( - abundance.refhash, refhash, "Loaded", None, None + abundance.refhash, + refhash, + "the loaded Abundance object", + "the given refhash", + None, ) return abundance @@ -214,8 +218,10 @@ def run_pycoverm( # Filter length, using comp_metadata's mask, which has been set by minlength if len(mask) != len(headers): raise ValueError( - f"CompositionMetaData was created with {len(mask)} sequences, " - f"but number of refs in BAM files are {len(headers)}." + f"CompositionMetaData used to create Abundance object was created with {len(mask)} sequences, " + f"but number of reference sequences in BAM files are {len(headers)}. " + "Make sure the BAM files were created by mapping to the same FASTA file " + "which you used to create the Composition object." ) headers = [h for (h, m) in zip(headers, mask) if m] @@ -229,7 +235,7 @@ def run_pycoverm( if target_refhash is not None: vambtools.RefHasher.verify_refhash( - refhash, target_refhash, "Composition", "BAM", identifier_pairs + refhash, target_refhash, "FASTA file", "BAM", identifier_pairs ) return (coverage, refhash) diff --git a/vamb/parsecontigs.py b/vamb/parsecontigs.py index 5d256368..5dd0ae47 100644 --- a/vamb/parsecontigs.py +++ b/vamb/parsecontigs.py @@ -225,10 +225,7 @@ def from_file( "You may want to bin more samples as a time, lower the beta parameter, " "or use a different binner altogether." ) - warnings.warn(message) - if logfile is not None: - print("\n", file=logfile) - print(message, file=logfile) + _vambtools.log_and_warn(message, logfile=logfile) # Warn the user if any contigs have been observed, which is smaller # than the threshold. @@ -239,9 +236,6 @@ def from_file( "Better results are obtained if the sequence file is filtered to the minimum " "sequence length before mapping." ) - warnings.warn(message) - if logfile is not None: - print("\n", file=logfile) - print(message, file=logfile) + _vambtools.log_and_warn(message, logfile=logfile) return cls(metadata, tnfs_arr) diff --git a/vamb/vambtools.py b/vamb/vambtools.py index 9cefec49..822409b4 100644 --- a/vamb/vambtools.py +++ b/vamb/vambtools.py @@ -20,6 +20,28 @@ def showwarning_override(message, category, filename, lineno, file=None, line=No print(str(message) + "\n", file=file) +def log_and_raise( + message: str, + errortype: type[Exception] = ValueError, + logfile: Optional[IO[str]] = None, +): + if logfile is not None: + print("\n", file=logfile) + print(message, file=logfile) + raise errortype(message) + + +def log_and_warn( + message: str, + warntype: type[Warning] = UserWarning, + logfile: Optional[IO[str]] = None, +): + if logfile is not None: + print("\n", file=logfile) + print(message, file=logfile) + warnings.warn(message, warntype) + + # It may seem horrifying to override a stdlib method, but this is the way recommended by the # warnings documentation. # We do it because it's the only way I know to prevent displaying file numbers and source @@ -231,7 +253,9 @@ def _verify_header(self, header: bytes) -> tuple[str, str]: raise ValueError( f'Invalid header in FASTA: "{header.decode()}". ' '\nMust conform to identifier regex pattern of SAM specification: "' - '>([0-9A-Za-z!$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*)([^\\S\\r\\n][^\\r\\n]*)?$"' + '>([0-9A-Za-z!$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*)([^\\S\\r\\n][^\\r\\n]*)?$".\n' + "If the header does not fit this pattern, the header cannot be represented in BAM files, " + "which means Vamb cannot compare sequences in BAM and FASTA files." ) identifier, description = m.groups() description = "" if description is None else description.decode() @@ -373,37 +397,46 @@ def verify_refhash( if refhash == target_refhash: return None - obs_name = "Observed" if observed_name is None else observed_name - tgt_name = "Target" if target_name is None else target_name + obs_name = "observed" if observed_name is None else observed_name + tgt_name = "target" if target_name is None else target_name + + message = ( + f"Mismatch between sequence identifiers (names) in {obs_name} and {tgt_name}.\n" + f"Observed {obs_name} identifier hash: {refhash.hex()}.\n" + f"Expected {tgt_name} identifier hash: {target_refhash.hex()}\n" + f"Make sure all identifiers in {obs_name} and {tgt_name} are identical " + "and in the same order. " + "Note that the identifier is the header before any whitespace." + ) + if identifiers is not None: (observed_ids, target_ids) = identifiers for i, (observed_id, target_id) in enumerate( zip_longest(observed_ids, target_ids) ): if observed_id is None: - raise ValueError( - f"{obs_name} identifiers has only {i} identifier(s), which is fewer than {tgt_name}" + message += ( + f"\nIdentifier mismatch: {obs_name} has only " + f"{i} identifier(s), which is fewer than {tgt_name}" ) + log_and_raise(message) elif target_id is None: - raise ValueError( - f"{tgt_name} identifiers has only {i} identifier(s), which is ffewer than {obs_name}" + message += ( + f"\nIdentifier mismatch: {tgt_name} has only " + f"{i} identifier(s), which is fewer than {obs_name}" ) + log_and_raise(message) elif observed_id != target_id: - raise ValueError( - f"Identifier number {i+1} does not match between {obs_name} and {tgt_name}:" + message += ( + f"\nIdentifier mismatch: Identifier number {i+1} does not match " + f"between {obs_name} and {tgt_name}:" f'{obs_name}: "{observed_id}"' f'{tgt_name}: "{target_id}"' ) + log_and_raise(message) assert False else: - raise ValueError( - f"Mismatch between reference hash of {obs_name} and {tgt_name}." - f"Observed {obs_name} hash: {refhash.hex()}." - f"Expected {tgt_name} hash: {target_refhash.hex()}" - "Make sure all identifiers are identical " - "and in the same order. " - "Note that the identifier is the header before any whitespace." - ) + log_and_raise(message) def write_clusters(