Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Notched search #120

Draft
wants to merge 15 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions crates/sage-cli/tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ fn integration() -> anyhow::Result<()> {
let sp = SpectrumProcessor::new(100, 0.0, 1500.0, true);
let processed = sp.process(spectra[0].clone());
assert!(processed.peaks.len() <= 300);
assert!(processed.precursors.len() == 2);
assert!((processed.precursors[1].mz - 643.034396630915).abs() < 1e-4);
assert!((processed.precursors[0].mz - 648.034396630915).abs() < 1e-4);

let scorer = Scorer {
db: &database,
Expand Down
72 changes: 49 additions & 23 deletions crates/sage/src/scoring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ pub struct Feature {
pub peptide_len: usize,
/// Spectrum id
pub spec_id: String,
/// Precursor id
pub prec_id: usize,
/// File identifier
pub file_id: usize,
/// PSM rank
Expand Down Expand Up @@ -373,15 +375,31 @@ impl<'db> Scorer<'db> {
}
}

fn notched_initial_hits(&self, query: &ProcessedSpectrum) -> (InitialHits, Vec<usize>) {
let precursor_hits: Vec<InitialHits> = query.precursors.iter().map(|precursor| self.initial_hits(query, precursor)).collect();
jspaezp marked this conversation as resolved.
Show resolved Hide resolved
// Match lengths is pre cumulative sum of the number of hits for each precursor
let mut cum_match_lengths = Vec::with_capacity(precursor_hits.len());

let mut hits = InitialHits::default();

let mut cumsum = 0;
for precursor_hits in precursor_hits {
cumsum += precursor_hits.preliminary.len();
hits += precursor_hits;
cum_match_lengths.push(cumsum.clone());
}
(hits, cum_match_lengths)
}

/// Score a single [`ProcessedSpectrum`] against the database
pub fn score_standard(&self, query: &ProcessedSpectrum) -> Vec<Feature> {
let precursor = query.precursors.get(0).unwrap_or_else(|| {
panic!("missing MS1 precursor for {}", query.id);
});
if query.precursors.is_empty() {
panic!("missing precursor for {}", query.id);
}
let (hits, match_lens) = self.notched_initial_hits(query);

let hits = self.initial_hits(query, precursor);
let mut features = Vec::with_capacity(self.report_psms);
self.build_features(query, precursor, &hits, self.report_psms, &mut features);
self.build_features(query, &query.precursors, &hits, self.report_psms, &mut features, &match_lens);
features
}

Expand All @@ -390,45 +408,49 @@ impl<'db> Scorer<'db> {
fn build_features(
&self,
query: &ProcessedSpectrum,
precursor: &Precursor,
precursor: &[Precursor],
hits: &InitialHits,
report_psms: usize,
features: &mut Vec<Feature>,
index_rle: &[usize],
) {
let mut score_vector = hits
.preliminary
.iter()
.filter(|score| score.peptide != PeptideIx::default())
.map(|pre| self.score_candidate(query, pre))
.filter(|s| (s.0.matched_b + s.0.matched_y) >= self.min_matched_peaks)
.enumerate()
.filter(|(_i, score)| score.peptide != PeptideIx::default())
.map(|(i, pre)| (i, self.score_candidate(query, pre)))
.filter(|(_i, s)| (s.0.matched_b + s.0.matched_y) >= self.min_matched_peaks)
.collect::<Vec<_>>();

// Hyperscore is our primary score function for PSMs
score_vector.sort_by(|a, b| b.0.hyperscore.total_cmp(&a.0.hyperscore));
score_vector.sort_by(|(_i, a), (_ii, b)| b.0.hyperscore.total_cmp(&a.0.hyperscore));

// Expected value for poisson distribution
// (average # of matches peaks/peptide candidate)
let lambda = hits.matched_peaks as f64 / hits.scored_candidates as f64;

// Sage operates on masses without protons; [M] instead of [MH+]
let mz = precursor.mz - PROTON;

for idx in 0..report_psms.min(score_vector.len()) {
let score = score_vector[idx].0;
let fragments: Option<Fragments> = score_vector[idx].1.take();
let score = score_vector[idx].1.0;
let score_index = score_vector[idx].0;
let mut precursor_index: usize = 0;
while score_index >= index_rle[precursor_index as usize] {
precursor_index += 1;
}

let fragments: Option<Fragments> = score_vector[idx].1.1.take();
let psm_id = increment_psm_counter();

let peptide = &self.db[score.peptide];
let precursor_mass = mz * score.precursor_charge as f32;

let next = score_vector
.get(idx + 1)
.map(|score| score.0.hyperscore)
.map(|score| score.1.0.hyperscore)
.unwrap_or_default();

let best = score_vector
.get(0)
.map(|score| score.0.hyperscore)
.map(|score| score.1.0.hyperscore)
.expect("we know that index 0 is valid");

// Poisson distribution probability mass function
Expand All @@ -440,6 +462,9 @@ impl<'db> Scorer<'db> {
poisson = 1E-325;
}

// Sage operates on masses without protons; [M] instead of [MH+]
let mz = precursor[precursor_index].mz - PROTON;
jspaezp marked this conversation as resolved.
Show resolved Hide resolved
let precursor_mass = mz * score.precursor_charge as f32;
let isotope_error = score.isotope_error as f32 * NEUTRON;
let delta_mass = (precursor_mass - peptide.monoisotopic - isotope_error).abs() * 2E6
/ (precursor_mass - isotope_error + peptide.monoisotopic);
Expand All @@ -451,6 +476,7 @@ impl<'db> Scorer<'db> {
psm_id,
peptide_idx: score.peptide,
spec_id: query.id.clone(),
prec_id: precursor_index,
file_id: query.file_id,
rank: idx as u32 + 1,
label: peptide.label(),
Expand Down Expand Up @@ -539,18 +565,18 @@ impl<'db> Scorer<'db> {
/// Return multiple PSMs for each spectra - first is the best match, second PSM is the best match
/// after all theoretical peaks assigned to the best match are removed, etc
pub fn score_chimera_fast(&self, query: &ProcessedSpectrum) -> Vec<Feature> {
let precursor = query.precursors.get(0).unwrap_or_else(|| {
panic!("missing MS1 precursor for {}", query.id);
});
if query.precursors.is_empty() {
panic!("missing precursor for {}", query.id);
}

let mut query = query.clone();
let hits = self.initial_hits(&query, precursor);
let (hits, match_lengths) = self.notched_initial_hits(&query);

let mut candidates: Vec<Feature> = Vec::with_capacity(self.report_psms);

let mut prev = 0;
while candidates.len() < self.report_psms {
self.build_features(&query, precursor, &hits, 1, &mut candidates);
self.build_features(&query, &query.precursors, &hits, 1, &mut candidates, &match_lengths);
if candidates.len() > prev {
if let Some(feat) = candidates.get_mut(prev) {
self.remove_matched_peaks(&mut query, feat);
Expand Down
21 changes: 20 additions & 1 deletion tests/LQSRPAAPPAPGPGQLTLR.mzML
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,26 @@
</scanWindowList>
</scan>
</scanList>
<precursorList count="1">
<precursorList count="2">
<precursor spectrumRef="controllerType=0 controllerNumber=1 scan=30068">
<isolationWindow>
<cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="648.368408203125" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
<cvParam cvRef="MS" accession="MS:1000828" name="isolation window lower offset" value="1.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
<cvParam cvRef="MS" accession="MS:1000829" name="isolation window upper offset" value="1.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
<userParam name="ms level" value="1"/>
</isolationWindow>
<selectedIonList count="1">
<selectedIon>
<cvParam cvRef="MS" accession="MS:1000744" name="selected ion m/z" value="648.034396630915" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
<cvParam cvRef="MS" accession="MS:1000041" name="charge state" value="3"/>
<cvParam cvRef="MS" accession="MS:1000042" name="peak intensity" value="3.0614616075e08" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of detector counts"/>
</selectedIon>
</selectedIonList>
<activation>
<cvParam cvRef="MS" accession="MS:1000422" name="beam-type collision-induced dissociation" value=""/>
<cvParam cvRef="MS" accession="MS:1000045" name="collision energy" value="25.0" unitCvRef="UO" unitAccession="UO:0000266" unitName="electronvolt"/>
</activation>
</precursor>
<precursor spectrumRef="controllerType=0 controllerNumber=1 scan=30068">
<isolationWindow>
<cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="643.368408203125" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
Expand Down
Loading