Skip to content

Commit

Permalink
Scan id to string
Browse files Browse the repository at this point in the history
  • Loading branch information
wfondrie committed Apr 30, 2024
1 parent f341681 commit 1f07cec
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 7 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [v0.4.5]
### Changed
- The `scan_id` column for parsed spectra is now a string instead of an integer. This is less space efficient, but we ran into issues with Sciex indexing when trying to use only an integer.

## [v0.4.4]

### Changed
Expand Down
4 changes: 2 additions & 2 deletions depthcharge/data/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def __init__(
self.schema = pa.schema(
[
pa.field("peak_file", pa.string()),
pa.field("scan_id", pa.int64()),
pa.field("scan_id", pa.string()),
pa.field("ms_level", pa.uint8()),
pa.field("precursor_mz", pa.float64()),
pa.field("precursor_charge", pa.int16()),
Expand Down Expand Up @@ -198,7 +198,7 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch:

entry = {
"peak_file": self.peak_file.name,
"scan_id": _parse_scan_id(parsed.scan_id),
"scan_id": str(parsed.scan_id),
"ms_level": parsed.ms_level,
"precursor_mz": parsed.precursor_mz,
"precursor_charge": parsed.precursor_charge,
Expand Down
7 changes: 5 additions & 2 deletions tests/unit_tests/test_data/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@
from depthcharge.data.fields import CustomField
from depthcharge.data.preprocessing import scale_to_unit_norm

METADATA_DF1 = pl.DataFrame({"scan_id": [501, 507, 10], "blah": [True] * 3})
SCANS = [
f"controllerType=0 controllerNumber=1 scan={x}" for x in [501, 507, 10]
]
METADATA_DF1 = pl.DataFrame({"scan_id": SCANS, "blah": [True] * 3})
METADATA_DF2 = pl.DataFrame(
{
"scan_id": [501, 507, 10],
"scan_id": SCANS,
"blah": [True] * 3,
"peak_file": ["TMT10-Trail-8.mzML"] * 3,
},
Expand Down
4 changes: 2 additions & 2 deletions tests/unit_tests/test_data/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_indexing(tokenizer, mgf_small, tmp_path):
spec = dataset[0]
assert len(spec) == 7
assert spec["peak_file"] == ["small.mgf"]
assert spec["scan_id"].item() == 0
assert spec["scan_id"] == ["0"]
assert spec["ms_level"].item() == 2
assert (spec["precursor_mz"].item() - 416.2448) < 0.001

Expand Down Expand Up @@ -116,7 +116,7 @@ def test_load(tokenizer, tmp_path, mgf_small):
spec = dataset[0]
assert len(spec) == 8
assert spec["peak_file"] == ["small.mgf"]
assert spec["scan_id"] == 0
assert spec["scan_id"] == ["0"]
assert spec["ms_level"] == 2
assert (spec["precursor_mz"] - 416.2448) < 0.001

Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/test_data/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_mgf_and_base(mgf_small):
expected = pl.DataFrame(
{
"peak_file": [mgf_small.name] * 2,
"scan_id": [0, 1],
"scan_id": ["0", "1"],
"ms_level": [2, 2],
"precursor_mz": [416.24474357, 257.464565],
"precursor_charge": [2, 3],
Expand Down

0 comments on commit 1f07cec

Please sign in to comment.