Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional diann fdr #291

Open
wants to merge 16 commits into
base: refactor_fdr
Choose a base branch
from
23 changes: 20 additions & 3 deletions alphabase/anndata/anndata_factory.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""Factory class to convert PSM DataFrames to AnnData format."""

import warnings
from typing import List, Optional, Union
from collections import defaultdict
from typing import Any, Dict, List, Optional, Union

import anndata as ad
import numpy as np
Expand Down Expand Up @@ -57,7 +58,7 @@ def create_anndata(self) -> ad.AnnData:
index=PsmDfCols.RAW_NAME,
columns=PsmDfCols.PROTEINS,
values=PsmDfCols.INTENSITY,
aggfunc="first",
aggfunc="first", # DataFrameGroupBy.first -> will skip NA
Copy link
Contributor Author

@mschwoer mschwoer Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vbrennsteiner I looked up the docs, seem that this logic (https://github.com/MannLabs/bm_tools/blob/main/bm_tools/diann.py#L184) is implied in "first"?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't find the respective documentation, but I changed this in bm_tools and the test with na-containing reports still works, thanks for the hint! :)

fill_value=np.nan,
dropna=False,
)
Expand Down Expand Up @@ -104,7 +105,11 @@ def from_files(
"""
from alphabase.psm_reader.psm_reader import psm_reader_provider

reader: PSMReaderBase = psm_reader_provider.get_reader(reader_type, **kwargs)
reader_config = cls._get_reader_configuration(reader_type)

reader: PSMReaderBase = psm_reader_provider.get_reader(
reader_type, **reader_config, **kwargs
)

custom_column_mapping = {
k: v
Expand All @@ -121,3 +126,15 @@ def from_files(

psm_df = reader.load(file_paths)
return cls(psm_df)

@classmethod
def _get_reader_configuration(cls, reader_type: str) -> Dict[str, Dict[str, Any]]:
"""Get reader-specific configuration for mapping PSMs to anndata."""
reader_kwargs = defaultdict(dict)

reader_kwargs["diann"] = {
"filter_first_search_fdr": True,
"filter_second_search_fdr": True,
}

return reader_kwargs[reader_type]
8 changes: 7 additions & 1 deletion alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ diann: # 1.8.1
rt_unit: minute
fixed_C57: False
column_mapping:
'raw_name': 'Run' # File.Name?
'raw_name': 'Run'
'sequence': 'Stripped.Sequence'
'charge': 'Precursor.Charge'
'rt': 'RT'
Expand All @@ -200,6 +200,12 @@ diann: # 1.8.1
'score': 'CScore'
'fdr': 'Q.Value'
'intensity': "PG.MaxLFQ"
# extra columns for performing FDR cutoff, not propagated to the output
'_fdr2': 'Global.Q.Value' # first search
'_fdr3': 'Global.PG.Q.Value' # first search
'_fdr4': 'Lib.Q.Value' # second search
'_fdr5': 'Lib.PG.Q.Value' # second search

mod_seq_columns:
- "Modified.Sequence"
modification_mapping_type: 'maxquant'
Expand Down
69 changes: 69 additions & 0 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Readers for Spectronaut's output library and reports, Swath data and DIANN data."""

from typing import List, Optional

import numpy as np
import pandas as pd

Expand Down Expand Up @@ -42,13 +44,80 @@ class DiannReader(ModifiedSequenceReader):
_add_unimod_to_mod_mapping = True
_min_max_rt_norm = False

def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
mod_seq_columns: Optional[List[str]] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
rt_unit: Optional[str] = None,
# DIANN reader-specific
filter_first_search_fdr: bool = False,
filter_second_search_fdr: bool = False,
**kwargs,
):
"""Reader for DIANN data.

See documentation of `PSMReaderBase` for more information.

Parameters
----------
filter_first_search_fdr : bool, optional
If True, the FDR filtering will be done also to the first search columns (_fdr2 and _fdr3)

filter_second_search_fdr : bool, optional
If True, the FDR filtering will be done also to the second columns (_fdr4 and _fdr5)

See documentation of `PSMReaderBase` for the rest of parameters.

"""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
mod_seq_columns=mod_seq_columns,
fdr=fdr,
keep_decoy=keep_decoy,
rt_unit=rt_unit,
**kwargs,
)

self._filter_first_search_fdr = filter_first_search_fdr
self._filter_second_search_fdr = filter_second_search_fdr

def _post_process(self, origin_df: pd.DataFrame) -> None:
self._psm_df.rename(
columns={PsmDfCols.SPEC_IDX: PsmDfCols.DIANN_SPEC_INDEX}, inplace=True
)

super()._post_process(origin_df)

def _filter_fdr(self) -> None:
"""Filter PSMs based on additional FDR columns and drop the temporary columns."""
super()._filter_fdr()

extra_fdr_columns = []

if self._filter_first_search_fdr:
extra_fdr_columns += [PsmDfCols.FDR2, PsmDfCols.FDR3]

if self._filter_second_search_fdr:
extra_fdr_columns += [PsmDfCols.FDR4, PsmDfCols.FDR5]

mask = np.ones(len(self._psm_df), dtype=bool)
for col in extra_fdr_columns:
if col in self._psm_df.columns:
mask &= self._psm_df[col] <= self._fdr_threshold

if not all(mask):
self._psm_df = self._psm_df[mask]

self._psm_df = self._psm_df.drop(
columns=[PsmDfCols.FDR2, PsmDfCols.FDR3, PsmDfCols.FDR4, PsmDfCols.FDR5],
errors="ignore",
)


class SpectronautReportReader(ModifiedSequenceReader):
"""Reader for Spectronaut's report TSV/CSV."""
Expand Down
17 changes: 12 additions & 5 deletions alphabase/psm_reader/keys.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,27 @@ class PsmDfCols(metaclass=ConstantsClass):
MOBILITY = "mobility"
PEPTIDE_FDR = "peptide_fdr"
PROTEIN_FDR = "protein_fdr"
INTENSITY = "intensity"

RAW_NAME = "raw_name"
CHARGE = "charge"
PROTEINS = "proteins"
INTENSITY = "intensity"

SCAN_NUM = "scan_num"
PRECURSOR_MZ = "precursor_mz"
DIANN_SPEC_INDEX = "diann_spec_idx"

# part of the output, but not directly referenced
_UNIPROT_IDS = "uniprot_ids"
_GENES = "genes"
_QUERY_ID = "query_id"
# part of the output, but not directly referenced in code
UNIPROT_IDS = "uniprot_ids"
GENES = "genes"
QUERY_ID = "query_id"

# part of psm_reader.yaml, but not part of output
# extra FDR columns for DIANN
FDR2 = "_fdr2" # first search
FDR3 = "_fdr3" # first search
FDR4 = "_fdr4" # second search
FDR5 = "_fdr5" # second search


class LibPsmDfCols(metaclass=ConstantsClass):
Expand Down
63 changes: 46 additions & 17 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class ModifiedSequenceReader(PSMReaderBase, ABC):

_add_unimod_to_mod_mapping = True

def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
def __init__( # noqa: PLR0913 # too many arguments in function definition, missing argument descriptions
self,
*,
column_mapping: Optional[dict] = None,
Expand All @@ -138,23 +138,13 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
fdr: float = 0.01,
keep_decoy: bool = False,
rt_unit: Optional[str] = None,
# MaxQuant reader-specific
fixed_C57: Optional[bool] = None, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
**kwargs,
):
"""Reader for MaxQuant-like data (in terms of modification loading and decoy translation).

See documentation of `PSMReaderBase` for more information.

Parameters
----------
fixed_C57 : bool, optional
If true, the search engine will not show `Carbamidomethyl`
in the modified sequences.
by default read from psm_reader_yaml key `fixed_C57`.

See documentation of `PSMReaderBase` for the rest of parameters.

See documentation of `PSMReaderBase` for the parameters.
"""
super().__init__(
column_mapping=column_mapping,
Expand All @@ -166,11 +156,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
**kwargs,
)

self.fixed_C57 = (
fixed_C57
if fixed_C57 is not None
else psm_reader_yaml[self._reader_type]["fixed_C57"]
)
self.fixed_C57 = False

def _translate_decoy(self) -> None:
if PsmDfCols.DECOY in self._psm_df.columns:
Expand Down Expand Up @@ -207,6 +193,49 @@ class MaxQuantReader(ModifiedSequenceReader):

_reader_type = "maxquant"

def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
mod_seq_columns: Optional[List[str]] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
rt_unit: Optional[str] = None,
# MaxQuant reader-specific
fixed_C57: Optional[bool] = None, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
**kwargs,
):
"""Reader for MaxQuant data.

See documentation of `PSMReaderBase` for more information.

Parameters
----------
fixed_C57 : bool, optional
If true, the search engine will not show `Carbamidomethyl`
in the modified sequences.
by default read from psm_reader_yaml key `fixed_C57`.

See documentation of `PSMReaderBase` for the rest of parameters.

"""
super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
mod_seq_columns=mod_seq_columns,
fdr=fdr,
keep_decoy=keep_decoy,
rt_unit=rt_unit,
**kwargs,
)

self.fixed_C57 = (
fixed_C57
if fixed_C57 is not None
else psm_reader_yaml[self._reader_type]["fixed_C57"]
)

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""MaxQuant-specific preprocessing of output data."""
df = df[~pd.isna(df["Retention time"])]
Expand Down
Empty file added requirements.txt
Empty file.
Binary file modified tests/integration/reference_data/reference_ad_diann_181.parquet
Binary file not shown.
Binary file modified tests/integration/reference_data/reference_ad_diann_190.parquet
Binary file not shown.
63 changes: 61 additions & 2 deletions tests/unit/anndata/test_anndata_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,47 @@ def test_create_anndata_with_empty_dataframe():


@patch("alphabase.psm_reader.psm_reader.psm_reader_provider.get_reader")
def test_from_files(mock_reader):
@patch("alphabase.anndata.anndata_factory.AnnDataFactory._get_reader_configuration")
def test_from_files(mock_get_reader_configuration, mock_reader):
mock_reader.return_value.load.return_value = _get_test_psm_df()

factory = AnnDataFactory.from_files(["file1", "file2"], reader_type="maxquant")
mock_get_reader_configuration.return_value = {"extra_key": "extra_value"}
factory = AnnDataFactory.from_files(
["file1", "file2"], reader_type="some_reader_type"
)

# when
adata = factory.create_anndata()

assert adata.shape == (2, 2)
assert adata.obs_names.tolist() == ["raw1", "raw2"]
assert adata.var_names.tolist() == ["protein1", "protein2"]
assert np.array_equal(
adata.X, np.array([[100, 200], [300, np.nan]]), equal_nan=True
)

mock_reader.assert_called_once_with("some_reader_type", extra_key="extra_value")


@patch("alphabase.psm_reader.psm_reader.psm_reader_provider.get_reader")
def test_from_files_nan(mock_reader):
df = pd.concat(
[
pd.DataFrame(
{
PsmDfCols.RAW_NAME: ["raw2"],
PsmDfCols.PROTEINS: ["protein2"],
PsmDfCols.INTENSITY: [np.nan],
}
),
_get_test_psm_df(),
]
)
mock_reader.return_value.load.return_value = df

factory = AnnDataFactory.from_files(
["file1", "file2"], reader_type="some_reader_type"
)

# when
adata = factory.create_anndata()
Expand All @@ -119,3 +156,25 @@ def test_from_files(mock_reader):
assert np.array_equal(
adata.X, np.array([[100, 200], [300, np.nan]]), equal_nan=True
)

mock_reader.assert_called_once_with("some_reader_type")


def test_get_reader_configuration_with_valid_reader_type():
"""Test that the correct configuration is returned for a valid reader type."""
# when
config = AnnDataFactory._get_reader_configuration(
"diann"
) # diann is taken as an example here

assert config == {
"filter_first_search_fdr": True,
"filter_second_search_fdr": True,
}


def test_get_reader_configuration_with_unknown_reader_type():
"""Test that a reader type without special config is handled correctly."""
# when
config = AnnDataFactory._get_reader_configuration("invalid_reader_type")
assert config == {}
Loading
Loading