Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Native ProForma resolution #167

Draft
wants to merge 2 commits into
base: peak_table
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 20 additions & 58 deletions metabolomics_spectrum_resolver/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,32 +102,37 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]:
except ValueError:
raise e
try:
# Split off potential ProForma annotations because not all backends
# can handle those.
usi_base = usi[:match.start(5)] + usi[match.end(5):]
proforma = match.group(5)[1:] if match.group(5) is not None else None
# Retrieve the spectrum from its resource.
collection = match.group(1).lower()
annotation = match.group(5)
# Send all proteomics USIs (by definition all annotated USIs) to
# MassIVE.
# mzdraft USIs are assumed to also use ProForma notation. If this
# changes, be sure to change this logic.
# Send all proteomics USIs to MassIVE.
if (
annotation is not None
or collection.startswith("msv")
collection.startswith("msv")
or collection.startswith("pxd")
or collection.startswith("pxl")
or collection.startswith("rpxd")
or collection == "massivekb"
or collection == "massive"
):
spectrum, source_link = _parse_msv_pxd(usi)
spectrum, source_link = _parse_msv_pxd(usi_base)
elif collection == "gnps":
spectrum, source_link = _parse_gnps(usi)
spectrum, source_link = _parse_gnps(usi_base)
elif collection == "massbank":
spectrum, source_link = _parse_massbank(usi)
spectrum, source_link = _parse_massbank(usi_base)
elif collection == "ms2lda":
spectrum, source_link = _parse_ms2lda(usi)
spectrum, source_link = _parse_ms2lda(usi_base)
elif collection == "motifdb":
spectrum, source_link = _parse_motifdb(usi)
spectrum, source_link = _parse_motifdb(usi_base)
else:
raise UsiError(f"Unknown USI collection: {match.group(1)}", 400)
# Assign ProForma annotation.
if proforma is not None:
# TODO: spectrum_utils native ProForma resolution.
# spectrum.annotate(proforma)
pass
splash_key = splash_builder.splash(
splash.Spectrum(
list(zip(spectrum.mz, spectrum.intensity)),
Expand Down Expand Up @@ -368,52 +373,9 @@ def _parse_msv_pxd(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
f"https://massive.ucsd.edu/ProteoSAFe/"
f"QueryMSV?id={dataset_identifier}"
)

# Parse the peptide if available.
try:
# Get the peptide information from resolution,
# this dereferences proforma.
peptide_clean = lookup_json["usi_components"]["peptide"]
peptide = lookup_json["usi_components"]["variant"]
charge = int(lookup_json["usi_components"]["charge"])

# Parse out gapped sequence (e.g. X+129.04259), faking it
# with Glycine as the base residue and adding more mods to
# it.
gapmod_pattern = re.compile("X[+][0-9.]*")
transformed_peptide = peptide
for match in gapmod_pattern.finditer(peptide):
gap_mass = float(match.group().replace("X", ""))
# Fake the gap with glycine.
transformed_peptide = transformed_peptide.replace(
match.group(), f"G{gap_mass - 57.021463735:+}"
)
peptide_clean = peptide_clean.replace("X", "G")
peptide = transformed_peptide

# Parse out modifications.
mod_pattern = re.compile("[-+][0-9.]*")
modifications, previous_mod_len = {}, 0
for match in mod_pattern.finditer(peptide):
found_pos = match.start()
found_len = len(match.group())
i = max(0, found_pos - previous_mod_len - 1)
modifications[i] = float(match.group())
previous_mod_len += found_len

spectrum = sus.MsmsSpectrum(
usi,
precursor_mz,
charge,
mz,
intensity,
peptide=peptide_clean,
modifications=modifications,
)
except (TypeError, KeyError):
spectrum = sus.MsmsSpectrum(
usi, precursor_mz, charge, mz, intensity
)
spectrum = sus.MsmsSpectrum(
usi, precursor_mz, charge, mz, intensity
)

return spectrum, source_link
except requests.exceptions.HTTPError:
Expand Down