From 1f07cec0e190b42420f38b03f3a9728ac81d6055 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Tue, 30 Apr 2024 10:49:18 -0700 Subject: [PATCH] Scan id to string --- CHANGELOG.md | 4 ++++ depthcharge/data/parsers.py | 4 ++-- tests/unit_tests/test_data/test_arrow.py | 7 +++++-- tests/unit_tests/test_data/test_datasets.py | 4 ++-- tests/unit_tests/test_data/test_parsers.py | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f91b65a..8be1135 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [v0.4.5] +### Changed +- The `scan_id` column for parsed spectra is now a string instead of an integer. This is less space efficient, but we ran into issues with Sciex indexing when trying to use only an integer. + ## [v0.4.4] ### Changed diff --git a/depthcharge/data/parsers.py b/depthcharge/data/parsers.py index 2d9c3a0..4d54ccc 100644 --- a/depthcharge/data/parsers.py +++ b/depthcharge/data/parsers.py @@ -90,7 +90,7 @@ def __init__( self.schema = pa.schema( [ pa.field("peak_file", pa.string()), - pa.field("scan_id", pa.int64()), + pa.field("scan_id", pa.string()), pa.field("ms_level", pa.uint8()), pa.field("precursor_mz", pa.float64()), pa.field("precursor_charge", pa.int16()), @@ -198,7 +198,7 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch: entry = { "peak_file": self.peak_file.name, - "scan_id": _parse_scan_id(parsed.scan_id), + "scan_id": str(parsed.scan_id), "ms_level": parsed.ms_level, "precursor_mz": parsed.precursor_mz, "precursor_charge": parsed.precursor_charge, diff --git a/tests/unit_tests/test_data/test_arrow.py b/tests/unit_tests/test_data/test_arrow.py index 58efadd..2050189 100644 --- a/tests/unit_tests/test_data/test_arrow.py +++ b/tests/unit_tests/test_data/test_arrow.py @@ -14,10 +14,13 @@ from depthcharge.data.fields import CustomField from depthcharge.data.preprocessing 
import scale_to_unit_norm -METADATA_DF1 = pl.DataFrame({"scan_id": [501, 507, 10], "blah": [True] * 3}) +SCANS = [ + f"controllerType=0 controllerNumber=1 scan={x}" for x in [501, 507, 10] +] +METADATA_DF1 = pl.DataFrame({"scan_id": SCANS, "blah": [True] * 3}) METADATA_DF2 = pl.DataFrame( { - "scan_id": [501, 507, 10], + "scan_id": SCANS, "blah": [True] * 3, "peak_file": ["TMT10-Trail-8.mzML"] * 3, }, diff --git a/tests/unit_tests/test_data/test_datasets.py b/tests/unit_tests/test_data/test_datasets.py index 2302d87..49b07e7 100644 --- a/tests/unit_tests/test_data/test_datasets.py +++ b/tests/unit_tests/test_data/test_datasets.py @@ -48,7 +48,7 @@ def test_indexing(tokenizer, mgf_small, tmp_path): spec = dataset[0] assert len(spec) == 7 assert spec["peak_file"] == ["small.mgf"] - assert spec["scan_id"].item() == 0 + assert spec["scan_id"] == ["0"] assert spec["ms_level"].item() == 2 assert (spec["precursor_mz"].item() - 416.2448) < 0.001 @@ -116,7 +116,7 @@ def test_load(tokenizer, tmp_path, mgf_small): spec = dataset[0] assert len(spec) == 8 assert spec["peak_file"] == ["small.mgf"] - assert spec["scan_id"] == 0 + assert spec["scan_id"] == ["0"] assert spec["ms_level"] == 2 assert (spec["precursor_mz"] - 416.2448) < 0.001 diff --git a/tests/unit_tests/test_data/test_parsers.py b/tests/unit_tests/test_data/test_parsers.py index 49e6650..b730a77 100644 --- a/tests/unit_tests/test_data/test_parsers.py +++ b/tests/unit_tests/test_data/test_parsers.py @@ -72,7 +72,7 @@ def test_mgf_and_base(mgf_small): expected = pl.DataFrame( { "peak_file": [mgf_small.name] * 2, - "scan_id": [0, 1], + "scan_id": ["0", "1"], "ms_level": [2, 2], "precursor_mz": [416.24474357, 257.464565], "precursor_charge": [2, 3],