lucas-diedrich · lucas-diedrich · Feb 3, 2025 · Jan 29, 2025 · Jan 29, 2025 · Jan 29, 2025
diff --git a/.gitignore b/.gitignore
@@ -19,3 +19,5 @@ __pycache__/
 /docs/generated/
 /docs/_build/
 /docs/tutorials/mock-data/
+
+.coverage
diff --git a/README.md b/README.md
@@ -51,6 +51,8 @@ Refer to the [Releases page](https://github.com/lucas-diedrich/dvp-io/releases)
 
 > Marconato, L. et al. SpatialData: an open and universal data framework for spatial omics. Nat Methods 1–5 (2024) doi:10.1038/s41592-024-02212-x.
 
+> Zeng, W.-F. et al. AlphaPeptDeep: a modular deep learning framework to predict peptide properties for proteomics. Nat Commun 13, 7238 (2022).
+
 [mambaforge]: https://github.com/conda-forge/miniforge#mambaforge
 [scverse discourse]: https://discourse.scverse.org/
 [issue tracker]: https://github.com/lucas-diedrich/dvp-io/issues

diff --git a/docs/faq.md b/docs/faq.md
@@ -6,13 +6,13 @@ Please raise an [issue](https://github.com/lucas-diedrich/dvp-io/issues) to requ
 
 ### Images (Tested)
 
-| Type  |                                        | Function         | Supported channels | Wrapped library                    |
-| ----- | -------------------------------------- | ---------------- | ------------------ | ---------------------------------- |
-| .czi  | Fluorescence Microscopy Single-Channel | `read_czi`       | Grayscale          | pylibczirw                         |
-| .czi  | Fluorescence Microscopy Multi-channel  | `read_czi`       | Grayscale          | pylibczirw                         |
-| .czi  | Whole Slide Image                      | `read_czi`       | RGB(A)             | pylibczirw                         |
-| .mrxs | Whole Slide Images                     | `read_openslide` | RGB(A)             | openslide                          |
-| .tiff | -                                      | `read_custom`    | grayscale          | dask.array.image/skimage.io.imread |
+| Type  |                                        | Function         | Supported channels       | Wrapped library                    |
+| ----- | -------------------------------------- | ---------------- | ------------------------ | ---------------------------------- |
+| .czi  | Fluorescence Microscopy Single-Channel | `read_czi`       | Grayscale                | pylibczirw                         |
+| .czi  | Fluorescence Microscopy Multi-channel  | `read_czi`       | Grayscale                | pylibczirw                         |
+| .czi  | Whole Slide Image                      | `read_czi`       | RGB(A)                   | pylibczirw                         |
+| .mrxs | Whole Slide Images                     | `read_openslide` | RGB(A)                   | openslide                          |
+| .tiff | -                                      | `read_custom`    | (multichannel) grayscale | dask.array.image/skimage.io.imread |
 
 ### Images (supported, in principle)
 
@@ -32,6 +32,21 @@ Please raise an [issue](https://github.com/lucas-diedrich/dvp-io/issues) to requ
 | ---- | --- | ---------------------------- | --------------- |
 | .xml | LMD | `dvpio.read.shapes.read_lmd` | py-lmd          |
 
+### Omics
+
+| Type               |                                            | Function                                               | Wrapped library |
+| ------------------ | ------------------------------------------ | ------------------------------------------------------ | --------------- |
+| `pandas.DataFrame` | Any type, preprocessed into correct format | `dvpio.read.omics.parse_df`                            | -               |
+| .tsv               | alphaDIA                                   | `dvpio.read.omics.read_precursor_table` (alphadia)     | alphabase       |
+| .tsv               | DIANN                                      | `dvpio.read.omics.read_precursor_table` (diann)        | alphabase       |
+| .tsv               | DIANN                                      | `dvpio.read.omics.read_precursor_table` (diann)        | alphabase       |
+| .tsv               | alphapept                                  | `dvpio.read.omics.read_precursor_table` (alphapept)    | alphabase       |
+| .tsv               | MSFragger                                  | `dvpio.read.omics.read_precursor_table` (msfragger)    | alphabase       |
+| .tsv               | DIANN                                      | `dvpio.read.omics.read_precursor_table` (msfragger)    | alphabase       |
+| .tsv               | spectronaut                                | `dvpio.read.omics.read_precursor_table` (spectronaut)  | alphabase       |
+| .parquet           | alphaDIA                                   | `dvpio.read.omics.read_precursor_table` (alphadia)     | alphabase       |
+| .parquet           | DIANN                                      | `dvpio.read.omics.read_precursor_table` (diann)        | alphabase       |
+
 ## How to...
 
 ### ... open spatialdata in Napari?

diff --git a/docs/references.bib b/docs/references.bib
@@ -40,3 +40,46 @@ @article{Spatialdata2024
   keywords   = {Computational platforms and environments,Data integration,Molecular imaging,Software},
   journal    = {Nature Methods}
 }
+
+@article{Rosenberger2023,
+  title        = {Spatial Single-Cell Mass Spectrometry Defines Zonation of the Hepatocyte Proteome},
+  author       = {Rosenberger, Florian A. and Thielert, Marvin and Strauss, Maximilian T. and Schweizer, Lisa and Ammar, Constantin and Mädler, Sophia C. and Metousis, Andreas and Skowronek, Patricia and Wahle, Maria and Madden, Katherine and Gote-Schniering, Janine and Semenova, Anna and Schiller, Herbert B. and Rodriguez, Edwin and Nordmann, Thierry M. and Mund, Andreas and Mann, Matthias},
+  year         = {2023},
+  journal      = {Nature Methods},
+  shortjournal = {Nat Methods},
+  volume       = {20},
+  number       = {10},
+  pages        = {1530--1536},
+  publisher    = {Nature Publishing Group},
+  issn         = {1548-7105},
+  doi          = {10.1038/s41592-023-02007-6},
+  url          = {https://www.nature.com/articles/s41592-023-02007-6},
+  urldate      = {2024-02-07},
+  abstract     = {Single-cell proteomics by mass spectrometry is emerging as a powerful and unbiased method for the characterization of biological heterogeneity. So far, it has been limited to cultured cells, whereas an expansion of the method to complex tissues would greatly enhance biological insights. Here we describe single-cell Deep Visual Proteomics (scDVP), a technology that integrates high-content imaging, laser microdissection and multiplexed mass spectrometry. scDVP resolves the context-dependent, spatial proteome of murine hepatocytes at a current depth of 1,700 proteins from a cell slice. Half of the proteome was differentially regulated in a spatial manner, with protein levels changing dramatically in proximity to the central vein. We applied machine learning to proteome classes and images, which subsequently inferred the spatial proteome from imaging data alone. scDVP is applicable to healthy and diseased tissues and complements other spatial proteomics and spatial omics technologies.},
+  issue        = {10},
+  langid       = {english},
+  keywords     = {Cellular imaging,Mass spectrometry,Metabolism,Proteomics},
+  file         = {/Users/lucas-diedrich/Zotero/storage/GH84VMR7/Rosenberger et al. - 2023 - Spatial single-cell mass spectrometry defines zona.pdf}
+}
+
+@article{Zeng2022,
+  title        = {{{AlphaPeptDeep}}: A Modular Deep Learning Framework to Predict Peptide Properties for Proteomics},
+  shorttitle   = {{{AlphaPeptDeep}}},
+  author       = {Zeng, Wen-Feng and Zhou, Xie-Xuan and Willems, Sander and Ammar, Constantin and Wahle, Maria and Bludau, Isabell and Voytik, Eugenia and Strauss, Maximillian T. and Mann, Matthias},
+  date         = {2022-11-24},
+  journaltitle = {Nature Communications},
+  shortjournal = {Nat Commun},
+  volume       = {13},
+  number       = {1},
+  pages        = {7238},
+  publisher    = {Nature Publishing Group},
+  issn         = {2041-1723},
+  doi          = {10.1038/s41467-022-34904-3},
+  url          = {https://www.nature.com/articles/s41467-022-34904-3},
+  urldate      = {2024-02-07},
+  abstract     = {Machine learning and in particular deep learning (DL) are increasingly important in mass spectrometry (MS)-based proteomics. Recent DL models can predict the retention time, ion mobility and fragment intensities of a peptide just from the amino acid sequence with good accuracy. However, DL is a very rapidly developing field with new neural network architectures frequently appearing, which are challenging to incorporate for proteomics researchers. Here we introduce AlphaPeptDeep, a modular Python framework built on the PyTorch DL library that learns and predicts the properties of peptides (https://github.com/MannLabs/alphapeptdeep). It features a model shop that enables non-specialists to create models in just a few lines of code. AlphaPeptDeep represents post-translational modifications in a generic manner, even if only the chemical composition is known. Extensive use of transfer learning obviates the need for large data sets to refine models for particular experimental conditions. The AlphaPeptDeep models for predicting retention time, collisional cross sections and fragment intensities are at least on par with existing tools. Additional sequence-based properties can also be predicted by AlphaPeptDeep, as demonstrated with a HLA peptide prediction model to improve HLA peptide identification for data-independent acquisition (https://github.com/MannLabs/PeptDeep-HLA).},
+  issue        = {1},
+  langid       = {english},
+  keywords     = {Bioinformatics,Computational platforms and environments,Peptides,Proteomics},
+  file         = {/Users/lucas-diedrich/Zotero/storage/WQGI4UQT/Zeng et al. - 2022 - AlphaPeptDeep a modular deep learning framework t.pdf}
+}
diff --git a/docs/tutorials.md b/docs/tutorials.md
@@ -17,6 +17,8 @@ tutorials/001_introduction-to-spatialdata
 :maxdepth: 1
 
 tutorials/002_read-dvpio.ipynb
+tutorials/004_scdvp.ipynb
+
 ```
 
 ## Data export

diff --git a/docs/tutorials/004_scdvp.ipynb b/docs/tutorials/004_scdvp.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,11 +23,12 @@ classifiers = [
   "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
+  "alphabase @ git+https://github.com/MannLabs/alphabase.git@additional_diann_fdr",
   "anndata",
   "napari-spatialdata",
   "openslide-bin",
   "openslide-python",
-  "py-lmd @ git+https://github.com/MannLabs/py-lmd.git",
+  "py-lmd",
   "pylibczirw",
   "spatialdata",
   "spatialdata-plot",

diff --git a/src/dvpio/_utils.py b/src/dvpio/_utils.py
@@ -0,0 +1,25 @@
+import functools
+import warnings
+
+
+def experimental_docs(func):
+    """Decorator to mark a function as experimental in the docstring."""
+    func.__doc__ = (
+        func.__doc__ or ""
+    ) + "\n\n    **Warning:** This function is experimental and may change in future versions."
+    return func
+
+
+def experimental_log(func):
+    """Decorator to mark a function as experimental with a warning log."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        warnings.warn(
+            f"Function {func.__name__} is experimental and may change in future versions.",
+            category=UserWarning,
+            stacklevel=2,
+        )
+        return func(*args, **kwargs)
+
+    return wrapper
diff --git a/src/dvpio/read/omics/__init__.py b/src/dvpio/read/omics/__init__.py
@@ -0,0 +1,3 @@
+from .report_reader import available_reader, parse_df, read_precursor_table
+
+__all__ = ["available_reader", "parse_df", "read_precursor_table"]
diff --git a/src/dvpio/read/omics/report_reader.py b/src/dvpio/read/omics/report_reader.py
@@ -0,0 +1,193 @@
+from collections.abc import Mapping
+from typing import Any
+
+import anndata as ad
+import pandas as pd
+from alphabase.anndata.anndata_factory import AnnDataFactory
+from alphabase.psm_reader.psm_reader import psm_reader_provider
+from spatialdata.models import TableModel
+
+from dvpio._utils import experimental_docs, experimental_log
+
+
+def available_reader() -> list[str]:
+    """Get a list of all available readers, as provided by alphabase"""
+    return sorted(psm_reader_provider.reader_dict.keys())
+
+
+def _parse_pandas_index(index: pd.Index | pd.MultiIndex, set_index: str | None = None) -> pd.DataFrame:
+    """Parse pandas index to pandas dataframe with object index
+
+    Parameters
+    ----------
+    index
+        :class:`pandas.Index`, will be parsed to :class:`pandas.DataFrame`
+    set_index
+        Defaults to None. Whether to set a column in the dataframe as the new index. If None,
+        returns dataframe with range of type string as index
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with index values as columns, optionally with the column specified in `set_index`
+        as index.
+    """
+    df = index.to_frame(index=False)
+    df.index = df.index.astype(str)
+    df.columns = df.columns.astype(str)
+
+    if set_index is not None:
+        df.set_index(set_index, inplace=True)
+
+    return df
+
+
+def parse_df(
+    df: pd.DataFrame, obs_index: str | None = None, var_index: str | None = None, **table_kwargs
+) -> ad.AnnData:
+    """Convert a pandas dataframe to :class:`anndata.AnnData`
+
+    Parameters
+    ----------
+    df
+        Pandas dataframe of shape N (samples) x F (features). Expects observations (e.g. cells, samples) in rows
+        and features (protein groups) in columns
+    obs_index
+        Name of dataframe column that should be set to index in `.obs` attribute
+        (anndata.AnnData.var_names)
+    var_index
+        Name of dataframe column that should be set to index in `.obs` attribute
+        (anndata.AnnData.var_names)
+    **table_kwargs
+        Keyword arguments passed to :meth:`spatialdata.models.TableModel.parse`
+
+    Returns
+    -------
+    :class:`anndata.AnnData`
+        AnnData object with N observations and F features.
+
+            - .obs Contains content of df.index
+            - .var contains content of df.columns
+
+    Example
+    -------
+    .. code-block:: python
+
+        import numpy as np
+        import pandas as pd
+        from dvpio.read.omics import parse_df
+
+        df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=["G1", "G2", "G3"], index=["A", "B", "C"])
+        df = df.rename_axis(columns="gene", index="sample")
+
+        adata = parse_df(df)
+
+        assert adata.shape == (3, 3)
+        assert "sample" in adata.obs.columns
+        assert "gene" in adata.var.columns
+
+        adata = parse_df(df, obs_index="sample")
+        assert "sample" not in adata.obs.columns
+        assert adata.obs.index.name == "sample"
+    """
+    X = df.to_numpy()
+
+    obs = _parse_pandas_index(df.index, set_index=obs_index)
+    var = _parse_pandas_index(df.columns, set_index=var_index)
+
+    adata = ad.AnnData(X=X, obs=obs, var=var)
+    return TableModel.parse(adata, **table_kwargs)
+
+
+@experimental_log
+@experimental_docs
+def read_precursor_table(
+    path: str,
+    reader_type: str,
+    *,
+    intensity_column: str | None = None,
+    protein_id_column: str | None = None,
+    raw_name_column: str | None = None,
+    reader_kwargs: Mapping[str, Any] | None = None,
+    **kwargs: Mapping[str, Any],
+) -> ad.AnnData:
+    """Parse proteomics precursor reports to the :class:`anndata.AnnData` format
+
+    Supported formats include
+
+        - AlphaDIA `alphadia_parquet` (.parquet) `alphadia_tsv` (.tsv)
+        - DIANN `diann` (.tsv)
+        - MaxQuant
+        - MSFragger `msfragger`
+        - Sage `sage_parquet` (.parquet), `sage_tsv` (.tsv)
+        - Spectronaut
+
+    see :func:`dvpio.read.omics.available_reader` for a complete list
+
+    Parameters
+    ----------
+    path
+        Path to proteomics report
+    reader_type
+        Name of engine output, pass the method name of the corresponding reader. You can
+        list all available readers with the :func:`dvpio.read.omics.available_reader` helper function
+    intensity_column
+        Column name of precursor intensity in report
+    protein_id_column
+        Column name of feature (i.e. protein group) in report
+    raw_name_column
+        Column names of individual samples in report.
+    reader_kwargs
+        Optional keyword arguments passed to :class:`alphabase.psm_reader.psm_reader.PSMReaderBase`
+    kwargs
+        Passed to :meth:`spatialdata.models.TableModel.parse`
+
+    Returns
+    -------
+    :class:`ad.AnnData`
+        AnnData object that can be further processed with scVerse packages.
+
+        - adata.X
+            Stores values of the `intensity_column` argument the report as sparse matrix of shape observations x features
+        - adata.obs
+            Stores observations
+        - adata.var
+            Stores features
+
+    Example
+    -------
+
+    .. code-block:: python
+
+        from dvpio.io.read.omics import read_report, available_reader
+
+        print(available_reader())
+        > ['alphadia', 'alphadia_parquet', 'alphapept', 'diann', 'maxquant', ...]
+
+        path = ...
+        adata = read_precursor_table(
+            path,
+            reader_type="diann",
+            intensity_column="Precursor.Normalised",
+            raw_name_column="File.Name",
+            protein_id_column="Protein.Names"
+        )
+
+    """
+    if reader_type not in available_reader():
+        raise ValueError(f"Argument reader_type must be one of {''.join(available_reader())}, not {reader_type}")
+
+    reader_kwargs = {} if reader_kwargs is None else reader_kwargs
+
+    factory = AnnDataFactory.from_files(
+        path,
+        reader_type=reader_type,
+        intensity_column=intensity_column,
+        protein_id_column=protein_id_column,
+        raw_name_column=raw_name_column,
+        **reader_kwargs,
+    )
+
+    adata = factory.create_anndata()
+
+    return TableModel.parse(adata, **kwargs)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .report_reader import available_reader, parse_df, read_precursor_table

		__all__ = ["available_reader", "parse_df", "read_precursor_table"]