Scan id to string

wfondrie · Apr 30, 2024 · 1f07cec · 1f07cec
1 parent f341681
commit 1f07cec
Show file tree

Hide file tree

Showing 5 changed files with 14 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [v0.4.5]
+### Changed
+- The `scan_id` column for parsed spectra is now a string instead of an integer. This is less space efficient, but we ran into issues with Sciex indexing when trying to use only an integer.
+
 ## [v0.4.4]
 
 ### Changed

diff --git a/depthcharge/data/parsers.py b/depthcharge/data/parsers.py
@@ -90,7 +90,7 @@ def __init__(
         self.schema = pa.schema(
             [
                 pa.field("peak_file", pa.string()),
-                pa.field("scan_id", pa.int64()),
+                pa.field("scan_id", pa.string()),
                 pa.field("ms_level", pa.uint8()),
                 pa.field("precursor_mz", pa.float64()),
                 pa.field("precursor_charge", pa.int16()),
@@ -198,7 +198,7 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch:
 
                     entry = {
                         "peak_file": self.peak_file.name,
-                        "scan_id": _parse_scan_id(parsed.scan_id),
+                        "scan_id": str(parsed.scan_id),
                         "ms_level": parsed.ms_level,
                         "precursor_mz": parsed.precursor_mz,
                         "precursor_charge": parsed.precursor_charge,

diff --git a/tests/unit_tests/test_data/test_arrow.py b/tests/unit_tests/test_data/test_arrow.py
@@ -14,10 +14,13 @@
 from depthcharge.data.fields import CustomField
 from depthcharge.data.preprocessing import scale_to_unit_norm
 
-METADATA_DF1 = pl.DataFrame({"scan_id": [501, 507, 10], "blah": [True] * 3})
+SCANS = [
+    f"controllerType=0 controllerNumber=1 scan={x}" for x in [501, 507, 10]
+]
+METADATA_DF1 = pl.DataFrame({"scan_id": SCANS, "blah": [True] * 3})
 METADATA_DF2 = pl.DataFrame(
     {
-        "scan_id": [501, 507, 10],
+        "scan_id": SCANS,
         "blah": [True] * 3,
         "peak_file": ["TMT10-Trail-8.mzML"] * 3,
     },

diff --git a/tests/unit_tests/test_data/test_datasets.py b/tests/unit_tests/test_data/test_datasets.py
@@ -48,7 +48,7 @@ def test_indexing(tokenizer, mgf_small, tmp_path):
     spec = dataset[0]
     assert len(spec) == 7
     assert spec["peak_file"] == ["small.mgf"]
-    assert spec["scan_id"].item() == 0
+    assert spec["scan_id"] == ["0"]
     assert spec["ms_level"].item() == 2
     assert (spec["precursor_mz"].item() - 416.2448) < 0.001
 
@@ -116,7 +116,7 @@ def test_load(tokenizer, tmp_path, mgf_small):
     spec = dataset[0]
     assert len(spec) == 8
     assert spec["peak_file"] == ["small.mgf"]
-    assert spec["scan_id"] == 0
+    assert spec["scan_id"] == ["0"]
     assert spec["ms_level"] == 2
     assert (spec["precursor_mz"] - 416.2448) < 0.001
 

diff --git a/tests/unit_tests/test_data/test_parsers.py b/tests/unit_tests/test_data/test_parsers.py
@@ -72,7 +72,7 @@ def test_mgf_and_base(mgf_small):
     expected = pl.DataFrame(
         {
             "peak_file": [mgf_small.name] * 2,
-            "scan_id": [0, 1],
+            "scan_id": ["0", "1"],
             "ms_level": [2, 2],
             "precursor_mz": [416.24474357, 257.464565],
             "precursor_charge": [2, 3],