Skip to content

Commit 03e4633

Browse files
authored
Merge pull request OpenMS#7481 from axelwalter/new-dataframes
[Feature] DataFrame export for MSSpectrum and MSChromatogram
2 parents 50b6fbe + a0862f4 commit 03e4633

File tree

2 files changed

+368
-3
lines changed

2 files changed

+368
-3
lines changed

src/pyOpenMS/pyopenms/_dataframes.py

+280-3
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,20 @@
55
from . import ConsensusFeature as _ConsensusFeature
66
from . import FeatureMap as _FeatureMap
77
from . import Feature as _Feature
8+
from . import MRMFeature as _MRMFeature
89
from . import MSExperiment as _MSExperiment
10+
from . import PeakMap as _PeakMap
911
from . import PeptideIdentification as _PeptideIdentification
1012
from . import ControlledVocabulary as _ControlledVocabulary
1113
from . import File as _File
1214
from . import IonSource as _IonSource
15+
from . import MSSpectrum as _MSSpectrum
16+
from . import MSChromatogram as _MSChromatogram
17+
from . import MRMTransitionGroupCP as _MRMTransitionGroupCP
1318

1419
import pandas as _pd
1520
import numpy as _np
21+
from enum import Enum as _Enum
1622

1723
class _ConsensusMapDF(_ConsensusMap):
1824
def __init__(self, *args, **kwargs):
@@ -130,7 +136,11 @@ def get_df(self):
130136
"""
131137
return _pd.concat([self.get_metadata_df(), self.get_intensity_df()], axis=1)
132138

139+
# fix class module and name to show up correctly in readthedocs page generated with sphinx autodoc
140+
# needs to link back to rst page of original class, which is pyopenms.ConsensusMap, NOT pyopenms._dataframes._ConsensusMapDF (wh)
133141
ConsensusMap = _ConsensusMapDF
142+
ConsensusMap.__module__ = _ConsensusMap.__module__
143+
ConsensusMap.__name__ = 'ConsensusMap'
134144

135145
# TODO tell the advanced user that they could change this, in case they have different needs.
136146
# TODO check if type could be inferred in the first pass
@@ -147,7 +157,12 @@ def get_df(self):
147157
b'num_of_masstraces': 'i',
148158
b'masstrace_intensity': 'f', # TODO this is actually a DoubleList. Think about what to do here. For _np.fromiter we would need to set the length of the array.
149159
b'Group': 'U50',
150-
b'is_ungrouped_monoisotopic': 'i' # TODO this sounds very boolean to me
160+
b'is_ungrouped_monoisotopic': 'i', # TODO this sounds very boolean to me
161+
b'left_width': 'f',
162+
b'right_width': 'f',
163+
b'total_xic': 'f',
164+
b'PeptideRef': 'U100',
165+
b'peak_apices_sum': 'f'
151166
}
152167
"""Global dict to define which autoconversion to numpy types is tried for certain metavalues.
153168
@@ -294,6 +309,8 @@ def get_assigned_peptide_identifications(self):
294309
return result
295310

296311
FeatureMap = _FeatureMapDF
312+
FeatureMap.__module__ = _FeatureMap.__module__
313+
FeatureMap.__name__ = 'FeatureMap'
297314

298315

299316
class _MSExperimentDF(_MSExperiment):
@@ -319,7 +336,7 @@ def get_df(self, long : bool = False):
319336
cols = ["RT", "mzarray", "intarray"]
320337

321338
return _pd.DataFrame(data=((spec.getRT(), *spec.get_peaks()) for spec in self), columns=cols)
322-
339+
323340
def get_ion_df(self):
324341
"""Generates a pandas DataFrame with all peaks and the ionic mobility in the MSExperiment
325342
@@ -476,8 +493,13 @@ def _get_ion_spec_arrays(mslevel):
476493

477494
return ms1_df, ms2_df
478495

479-
MSExperiment = _MSExperimentDF
480496
PeakMap = _MSExperimentDF
497+
PeakMap.__module__ = _PeakMap.__module__
498+
PeakMap.__name__ = 'PeakMap'
499+
500+
MSExperiment = _MSExperimentDF
501+
MSExperiment.__module__ = _MSExperiment.__module__
502+
MSExperiment.__name__ = 'MSExperiment'
481503

482504

483505
# TODO think about the best way for such a top-level function. IMHO in python, encapsulation in a stateless class is unnecessary.
@@ -608,5 +630,260 @@ def update_scores_from_df(peps: List[_PeptideIdentification], df : _pd.DataFrame
608630

609631
return rets
610632

633+
def _add_meta_values(df: _pd.DataFrame, object: any) -> _pd.DataFrame:
634+
"""
635+
Adds metavalues from given object to given DataFrame.
636+
637+
Args:
638+
df (pd.DataFrame): DataFrame to which metavalues will be added.
639+
object (any): Object from which metavalues will be extracted.
640+
641+
Returns:
642+
pd.DataFrame: DataFrame with added meta values.
643+
"""
644+
mvs = []
645+
object.getKeys(mvs)
646+
for k in mvs:
647+
v = object.getMetaValue(k)
648+
dtype = 'U100'
649+
try:
650+
v = int(v)
651+
dtype = int
652+
except ValueError:
653+
try:
654+
v = float(v)
655+
dtype = 'double'
656+
except ValueError:
657+
dtype = f'U{len(v)}'
658+
659+
df[k.decode()] = _np.full(df.shape[0], v, dtype=_np.dtype(dtype))
660+
661+
return df
611662

663+
class _MSSpectrumDF(_MSSpectrum):
    # MSSpectrum subclass that adds a pandas DataFrame export (get_df).

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_df(self, export_ion_mobility: bool = True, export_meta_values: bool = True, export_peptide_identifications: bool = True) -> _pd.DataFrame:
        """
        Returns a DataFrame representation of the MSSpectrum.

        One row per peak. Spectrum-level properties are repeated on every row.

        mz: The mass-to-charge ratio (m/z) values of the peaks in the mass spectrum.
        intensity: The intensity (abundance) of the peaks in the mass spectrum.
        ion_mobility: The ion mobility values (NaN if the spectrum has no ion mobility data).
        ion_mobility_unit: The ion mobility unit.
        ms_level: The MS level of the mass spectrum (1 for MS1, 2 for MS2, etc.).
        precursor_mz: The mass-to-charge of the first precursor ion (0.0 if there is none).
        precursor_charge: The charge of the first precursor ion (0 if there is none).
        native_id: The native identifier of the spectrum.
        sequence: The sequence of the first hit of the first annotated peptide identification
            (empty string if there is none).

        Args:
            export_ion_mobility (bool): Whether to export ion mobility data.
            export_meta_values (bool): Whether to export meta values.
            export_peptide_identifications (bool): Whether to export peptide identifications.

        Returns:
            pd.DataFrame: DataFrame representation of the MSSpectrum.
        """
        mz_values, intensity_values = self.get_peaks()
        frame = _pd.DataFrame({'mz': mz_values, 'intensity': intensity_values})
        n_peaks = frame.shape[0]

        def constant(value, dtype):
            # one spectrum-level value repeated for every peak row
            return _np.full(n_peaks, value, dtype=_np.dtype(dtype))

        if export_ion_mobility:
            if self.containsIMData():
                # the first float data array is taken as the ion mobility array
                frame['ion_mobility'] = _np.array(list(self.getFloatDataArrays()[0]))
            else:
                frame['ion_mobility'] = _np.nan
            frame['ion_mobility_unit'] = constant(self.getDriftTimeUnitAsString(), 'U20')

        frame['ms_level'] = constant(self.getMSLevel(), 'uint16')

        precursors = self.getPrecursors()
        if precursors:
            precursor_mz = precursors[0].getMZ()
            precursor_charge = precursors[0].getCharge()
        else:
            precursor_mz = 0.0
            precursor_charge = 0
        frame['precursor_mz'] = constant(precursor_mz, 'double')
        frame['precursor_charge'] = constant(precursor_charge, 'uint16')

        frame['native_id'] = constant(self.getNativeID(), 'U100')

        if export_peptide_identifications:
            sequence = ''
            peptide_ids = self.getPeptideIdentifications()
            if peptide_ids:
                peptide_hits = peptide_ids[0].getHits()
                if peptide_hits:
                    sequence = peptide_hits[0].getSequence().toString()
            frame['sequence'] = constant(sequence, f'U{len(sequence)}')

        if export_meta_values:
            frame = _add_meta_values(frame, self)

        return frame


# expose the DataFrame-enabled class under the name and module of the wrapped class
MSSpectrum = _MSSpectrumDF
MSSpectrum.__module__ = _MSSpectrum.__module__
MSSpectrum.__name__ = 'MSSpectrum'
724+
725+
class _ChromatogramType(_Enum):
    """Readable names for integer chromatogram type codes.

    The chromatogram DataFrame export looks up the integer returned by
    ``getChromatogramType()`` in this enum and stores the member name.
    """
    # NOTE(review): the numbers presumably mirror the order of the OpenMS C++
    # chromatogram type enumeration — confirm before changing any value.
    MASS_CHROMATOGRAM = 0
    TOTAL_ION_CURRENT_CHROMATOGRAM = 1
    SELECTED_ION_CURRENT_CHROMATOGRAM = 2
    BASEPEAK_CHROMATOGRAM = 3
    SELECTED_ION_MONITORING_CHROMATOGRAM = 4
    SELECTED_REACTION_MONITORING_CHROMATOGRAM = 5
    ELECTROMAGNETIC_RADIATION_CHROMATOGRAM = 6
    ABSORPTION_CHROMATOGRAM = 7
    EMISSION_CHROMATOGRAM = 8
735+
736+
class _MSChromatogramDF(_MSChromatogram):
    # MSChromatogram subclass that adds a pandas DataFrame export (get_df).

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_df(self, export_meta_values: bool = True) -> _pd.DataFrame:
        """
        Returns a DataFrame representation of the MSChromatogram.

        One row per data point. Chromatogram-level properties are repeated on every row.

        time: The retention time (in seconds) of the chromatographic peaks.
        intensity: The intensity (abundance) of the signal at each time point (floating point).
        chromatogram_type: The type of chromatogram (name of the type code).
        precursor_mz: The mass-to-charge of the precursor ion.
        precursor_charge: The charge of the precursor ion.
        product_mz: The mass-to-charge of the product ion.
        comment: A comment assigned to the chromatogram.
        native_id: The chromatogram native identifier.

        Args:
            export_meta_values (bool): Whether to export meta values.

        Returns:
            pd.DataFrame: DataFrame representation of the MSChromatogram.
        """
        rts, intys = self.get_peaks()

        # Intensities are kept as floating point values: converting them to an
        # unsigned integer type would drop fractional parts and break negative values.
        df = _pd.DataFrame({
            'time': _np.asarray(rts, dtype=_np.dtype('double')),
            'intensity': _np.asarray(intys, dtype=_np.dtype('double')),
        })

        cnt = df.shape[0]

        df['chromatogram_type'] = _np.full(cnt, _ChromatogramType(self.getChromatogramType()).name, dtype=_np.dtype('U100'))

        precursor = self.getPrecursor()
        df['precursor_mz'] = _np.full(cnt, precursor.getMZ(), dtype=_np.dtype('double'))
        df['precursor_charge'] = _np.full(cnt, precursor.getCharge(), dtype=_np.dtype('uint16'))

        df['product_mz'] = _np.full(cnt, self.getProduct().getMZ(), dtype=_np.dtype('double'))

        df['comment'] = _np.full(cnt, self.getComment(), dtype=_np.dtype('U100'))

        df['native_id'] = _np.full(cnt, self.getNativeID(), dtype=_np.dtype('U100'))

        if export_meta_values:
            df = _add_meta_values(df, self)

        return df


# expose the DataFrame-enabled class under the name and module of the wrapped class
MSChromatogram = _MSChromatogramDF
MSChromatogram.__module__ = _MSChromatogram.__module__
MSChromatogram.__name__ = 'MSChromatogram'
790+
791+
class _MRMTransitionGroupCPDF(_MRMTransitionGroupCP):
    # MRMTransitionGroupCP subclass that adds pandas DataFrame exports.

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_chromatogram_df(self, export_meta_values: bool = True) -> _pd.DataFrame:
        """
        Returns a DataFrame representation of the Chromatograms stored in MRMTransitionGroupCP.

        The per-chromatogram DataFrames (see the chromatogram ``get_df`` export) are
        stacked below each other; the row index restarts at 0 for every chromatogram.

        time: The retention time of each data point.
        intensity: The intensity at each time point.
        chromatogram_type: The type of chromatogram.
        precursor_mz: The mass-to-charge ratio of the precursor ion.
        precursor_charge: The charge of the precursor ion.
        product_mz: The mass-to-charge ratio of the product ion.
        comment: A comment assigned to the chromatogram.
        native_id: The native identifier of the chromatogram.

        Args:
            export_meta_values (bool): Whether to export meta values.

        Returns:
            pd.DataFrame: DataFrame representation of the chromatograms stored in MRMTransitionGroupCP.
            Empty DataFrame if the group holds no chromatograms.
        """
        frames = [_MSChromatogramDF(c).get_df(export_meta_values=export_meta_values) for c in self.getChromatograms()]
        if not frames:
            # pandas.concat raises on an empty list; an empty group yields an empty frame
            return _pd.DataFrame()
        return _pd.concat(frames)

    def get_feature_df(self, meta_values: Union[None, List[str], str] = None) -> _pd.DataFrame:
        """
        Returns a DataFrame representation of the Features stored in MRMTransitionGroupCP.

        One row per feature, indexed by feature_id.

        feature_id (index): The unique id of the feature.
        RT: The retention time of the feature.
        intensity: The intensity of the feature.
        quality: The overall quality of the feature.
        <meta value name>: One column per requested meta value; NaN where a feature
            does not carry that meta value.

        Args:
            meta_values (None, list or str): Meta values to export. ``None`` (default) exports
                none, ``'all'`` exports every meta value key found on any feature, a list
                exports the given keys (str or bytes), a single other string exports that key.

        Returns:
            pd.DataFrame: DataFrame representation of the Features stored in MRMTransitionGroupCP.
        """
        features = self.getFeatures()

        if isinstance(meta_values, str) and meta_values == 'all':
            # collect the keys from the features of this group
            requested = []
            for f in features:
                keys = []
                f.getKeys(keys)
                requested.extend(keys)
        elif not meta_values:  # if None, set to empty list
            requested = []
        elif isinstance(meta_values, (str, bytes)):
            requested = [meta_values]
        else:
            requested = list(meta_values)

        # normalize keys to bytes and drop duplicates while keeping first-seen order,
        # so that the column order is the same on every run
        meta_keys = list(dict.fromkeys(k if isinstance(k, bytes) else str(k).encode() for k in requested))

        mddtypes = [('feature_id', _np.dtype('uint64')), ('RT', 'f'), ('intensity', 'f'), ('quality', 'f')]
        for key in meta_keys:
            # known meta values get their registered numpy type, everything else is text
            mddtypes.append((key.decode(), common_meta_value_types.get(key, 'U50')))

        def rows():
            # one tuple per feature: fixed columns followed by the requested meta values
            for f in features:
                vals = [f.getMetaValue(k) if f.metaValueExists(k) else _np.nan for k in meta_keys]
                yield (f.getUniqueId(), f.getRT(), f.getIntensity(), f.getOverallQuality(), *vals)

        mdarr = _np.fromiter(rows(), dtype=mddtypes, count=len(features))

        return _pd.DataFrame(mdarr).set_index('feature_id')


# fix class module and name to show up correctly in readthedocs page generated with sphinx autodoc
# needs to link back to rst page of original class, which is pyopenms.MRMTransitionGroupCP, NOT pyopenms._dataframes._MRMTransitionGroupCPDF (wh)
MRMTransitionGroupCP = _MRMTransitionGroupCPDF
MRMTransitionGroupCP.__module__ = _MRMTransitionGroupCP.__module__
MRMTransitionGroupCP.__name__ = 'MRMTransitionGroupCP'

0 commit comments

Comments
 (0)