Quality of life fixes (#48)

* Small qol fixes * Bump changelog
wfondrie · Apr 25, 2024 · 35bf3e7 · 35bf3e7
1 parent d46adf1
commit 35bf3e7
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [v0.4.2]
+
+### Changed
+- The length of `SpectrumDataset` and `AnnotatedSpectrumDataset` is now the number of batches, not the number of spectra. This lets tools like PyTorch Lightning create their progress bars properly.
+- Parsing a dataset no longer requires reading essentially the whole first file; the schema is now inferred from the first 128 spectra.
+
 ## [v0.4.1]
 
 ### Added

diff --git a/depthcharge/data/spectrum_datasets.py b/depthcharge/data/spectrum_datasets.py
@@ -2,7 +2,9 @@
 
 from __future__ import annotations
 
+import copy
 import logging
+import math
 import uuid
 from collections.abc import Generator, Iterable
 from os import PathLike
@@ -88,6 +90,10 @@ def __init__(
     ) -> None:
         """Initialize a SpectrumDataset."""
         self._parse_kwargs = {} if parse_kwargs is None else parse_kwargs
+        self._init_kwargs = copy.copy(self._parse_kwargs)
+        self._init_kwargs["batch_size"] = 128
+        self._init_kwargs["progress"] = False
+
         self._tmpdir = None
         if path is None:
             # Create a random temporary file:
@@ -101,7 +107,7 @@ def __init__(
         # Now parse spectra.
         if spectra is not None:
             spectra = utils.listify(spectra)
-            batch = next(_get_records(spectra, **self._parse_kwargs))
+            batch = next(_get_records(spectra, **self._init_kwargs))
             lance.write_dataset(
                 _get_records(spectra, **self._parse_kwargs),
                 str(self._path),
@@ -137,7 +143,7 @@ def add_spectra(
 
         """
         spectra = utils.listify(spectra)
-        batch = next(_get_records(spectra, **self._parse_kwargs))
+        batch = next(_get_records(spectra, **self._init_kwargs))
         self._dataset = lance.write_dataset(
             _get_records(spectra, **self._parse_kwargs),
             self._path,
@@ -167,8 +173,8 @@ def __getitem__(self, idx: int) -> dict[str, Any]:
         return self._to_tensor(self._dataset.take(utils.listify(idx)))
 
     def __len__(self) -> int:
-        """The number of spectra in the lance dataset."""
-        return self._dataset.count_rows()
+        """The number of batches in the lance dataset."""
+        return math.ceil(self._dataset.count_rows() / self.batch_size)
 
     def __del__(self) -> None:
         """Cleanup the temporary directory."""