Skip to content

Commit

Permalink
Skip spectra with invalid custom fields
Browse files Browse the repository at this point in the history
Fixes #59.
  • Loading branch information
bittremieux committed Jul 24, 2024
1 parent 486221c commit e6fff0e
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 15 deletions.
12 changes: 6 additions & 6 deletions depthcharge/data/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,15 +206,15 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch:
"intensity_array": parsed.intensity,
}

# Parse custom fields:
entry.update(self.parse_custom_fields(spectrum))
self._update_batch(entry)

except (IndexError, KeyError, ValueError) as exc:
last_exc = exc
n_skipped += 1
continue

# Parse custom fields:
entry.update(self.parse_custom_fields(spectrum))
self._update_batch(entry)

# Update the batch:
if len(self._batch["scan_id"]) == batch_size:
yield self._yield_batch()
Expand All @@ -225,8 +225,8 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch:

if n_skipped:
warnings.warn(
f"Skipped {n_skipped} spectra with invalid information."
f"Last error was: \n {str(last_exc)}"
f"Skipped {n_skipped} spectra with invalid information. "
f"Last error was:\n{str(last_exc)}"
)

def _update_batch(self, entry: dict) -> None:
Expand Down
41 changes: 32 additions & 9 deletions tests/unit_tests/test_data/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,15 +195,38 @@ def test_custom_fields(mgf_small):
expected = pl.Series("seq", ["LESLIEK", "EDITHR"])
assert_series_equal(parsed["seq"], expected)

with pytest.raises(KeyError):
pl.from_arrow(
MgfParser(
mgf_small,
custom_fields=CustomField(
"seq", lambda x: x["params"]["bar"], pa.string()
),
).iter_batches(None)
)
# Test that spectra with invalid custom fields are skipped.
# We don't like the amino acid "D".
def seq_no_d(spec):
    """Return the ``seq`` parameter of a spectrum, rejecting any with "D".

    Used as a custom-field accessor in the test to simulate a field that is
    valid for some spectra and invalid for others.

    Parameters
    ----------
    spec : dict
        A spectrum mapping that exposes ``spec["params"]["seq"]``.

    Returns
    -------
    str
        The sequence, unchanged, when it does not contain "D".

    Raises
    ------
    ValueError
        If the sequence contains the amino acid "D".
    """
    seq = spec["params"]["seq"]
    if "D" in seq:
        raise ValueError(f"Invalid sequence: {seq}")
    return seq

parsed = pl.from_arrow(
MgfParser(
mgf_small,
custom_fields=CustomField(
"seq", seq_no_d, pa.string()
),
).iter_batches(None)
)

assert len(parsed) == 1
assert_series_equal(parsed["seq"], pl.Series("seq", ["LESLIEK"]))

# Invalid custom fields will cause all spectra to get skipped.
parser = MgfParser(
mgf_small,
custom_fields=CustomField(
"seq", lambda x: x["params"]["bar"], pa.string()
),
)

with pytest.warns(
UserWarning, match=r"Skipped 2 spectra with invalid information.*"
):
spectra = list(parser.iter_batches(None))
assert len(spectra) == 0


def test_invalid_file(tmp_path):
Expand Down

0 comments on commit e6fff0e

Please sign in to comment.