Skip to content

Commit

Permalink
Improve error message on refhash mismatch
Browse files Browse the repository at this point in the history
* Reword the refhash error message to be clearer.
* Also write refhash error message to the log
* Also improve the error message on malformed FASTA header
  • Loading branch information
jakobnissen committed Nov 7, 2023
1 parent 16d676a commit b9489ae
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 29 deletions.
14 changes: 10 additions & 4 deletions vamb/parsebam.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ def load(
)
if refhash is not None:
vambtools.RefHasher.verify_refhash(
abundance.refhash, refhash, "Loaded", None, None
abundance.refhash,
refhash,
"the loaded Abundance object",
"the given refhash",
None,
)

return abundance
Expand Down Expand Up @@ -214,8 +218,10 @@ def run_pycoverm(
# Filter length, using comp_metadata's mask, which has been set by minlength
if len(mask) != len(headers):
raise ValueError(
f"CompositionMetaData was created with {len(mask)} sequences, "
f"but number of refs in BAM files are {len(headers)}."
f"CompositionMetaData used to create Abundance object was created with {len(mask)} sequences, "
f"but number of reference sequences in BAM files are {len(headers)}. "
"Make sure the BAM files were created by mapping to the same FASTA file "
"which you used to create the Composition object."
)

headers = [h for (h, m) in zip(headers, mask) if m]
Expand All @@ -229,7 +235,7 @@ def run_pycoverm(

if target_refhash is not None:
vambtools.RefHasher.verify_refhash(
refhash, target_refhash, "Composition", "BAM", identifier_pairs
refhash, target_refhash, "FASTA file", "BAM", identifier_pairs
)

return (coverage, refhash)
10 changes: 2 additions & 8 deletions vamb/parsecontigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,10 +225,7 @@ def from_file(
"You may want to bin more samples as a time, lower the beta parameter, "
"or use a different binner altogether."
)
warnings.warn(message)
if logfile is not None:
print("\n", file=logfile)
print(message, file=logfile)
_vambtools.log_and_warn(message, logfile=logfile)

# Warn the user if any contigs have been observed, which is smaller
# than the threshold.
Expand All @@ -239,9 +236,6 @@ def from_file(
"Better results are obtained if the sequence file is filtered to the minimum "
"sequence length before mapping."
)
warnings.warn(message)
if logfile is not None:
print("\n", file=logfile)
print(message, file=logfile)
_vambtools.log_and_warn(message, logfile=logfile)

return cls(metadata, tnfs_arr)
67 changes: 50 additions & 17 deletions vamb/vambtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,28 @@ def showwarning_override(message, category, filename, lineno, file=None, line=No
print(str(message) + "\n", file=file)


def log_and_raise(
message: str,
errortype: type[Exception] = ValueError,
logfile: Optional[IO[str]] = None,
):
if logfile is not None:
print("\n", file=logfile)
print(message, file=logfile)
raise errortype(message)


def log_and_warn(
message: str,
warntype: type[Warning] = UserWarning,
logfile: Optional[IO[str]] = None,
):
if logfile is not None:
print("\n", file=logfile)
print(message, file=logfile)
warnings.warn(message, warntype)


# It may seem horrifying to override a stdlib method, but this is the way recommended by the
# warnings documentation.
# We do it because it's the only way I know to prevent displaying file numbers and source
Expand Down Expand Up @@ -231,7 +253,9 @@ def _verify_header(self, header: bytes) -> tuple[str, str]:
raise ValueError(
f'Invalid header in FASTA: "{header.decode()}". '
'\nMust conform to identifier regex pattern of SAM specification: "'
'>([0-9A-Za-z!$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*)([^\\S\\r\\n][^\\r\\n]*)?$"'
'>([0-9A-Za-z!$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*)([^\\S\\r\\n][^\\r\\n]*)?$".\n'
"If the header does not fit this pattern, the header cannot be represented in BAM files, "
"which means Vamb cannot compare sequences in BAM and FASTA files."
)
identifier, description = m.groups()
description = "" if description is None else description.decode()
Expand Down Expand Up @@ -373,37 +397,46 @@ def verify_refhash(
if refhash == target_refhash:
return None

obs_name = "Observed" if observed_name is None else observed_name
tgt_name = "Target" if target_name is None else target_name
obs_name = "observed" if observed_name is None else observed_name
tgt_name = "target" if target_name is None else target_name

message = (
f"Mismatch between sequence identifiers (names) in {obs_name} and {tgt_name}.\n"
f"Observed {obs_name} identifier hash: {refhash.hex()}.\n"
f"Expected {tgt_name} identifier hash: {target_refhash.hex()}\n"
f"Make sure all identifiers in {obs_name} and {tgt_name} are identical "
"and in the same order. "
"Note that the identifier is the header before any whitespace."
)

if identifiers is not None:
(observed_ids, target_ids) = identifiers
for i, (observed_id, target_id) in enumerate(
zip_longest(observed_ids, target_ids)
):
if observed_id is None:
raise ValueError(
f"{obs_name} identifiers has only {i} identifier(s), which is fewer than {tgt_name}"
message += (
f"\nIdentifier mismatch: {obs_name} has only "
f"{i} identifier(s), which is fewer than {tgt_name}"
)
log_and_raise(message)
elif target_id is None:
raise ValueError(
f"{tgt_name} identifiers has only {i} identifier(s), which is ffewer than {obs_name}"
message += (
f"\nIdentifier mismatch: {tgt_name} has only "
f"{i} identifier(s), which is fewer than {obs_name}"
)
log_and_raise(message)
elif observed_id != target_id:
raise ValueError(
f"Identifier number {i+1} does not match between {obs_name} and {tgt_name}:"
message += (
f"\nIdentifier mismatch: Identifier number {i+1} does not match "
f"between {obs_name} and {tgt_name}:"
f'{obs_name}: "{observed_id}"'
f'{tgt_name}: "{target_id}"'
)
log_and_raise(message)
assert False
else:
raise ValueError(
f"Mismatch between reference hash of {obs_name} and {tgt_name}."
f"Observed {obs_name} hash: {refhash.hex()}."
f"Expected {tgt_name} hash: {target_refhash.hex()}"
"Make sure all identifiers are identical "
"and in the same order. "
"Note that the identifier is the header before any whitespace."
)
log_and_raise(message)


def write_clusters(
Expand Down

0 comments on commit b9489ae

Please sign in to comment.