Skip to content

Commit

Permalink
(refactor) moved format reader dispatch
Browse files Browse the repository at this point in the history
  • Loading branch information
jspaezp authored and lazear committed Dec 14, 2024
1 parent bd5b12c commit 97263b1
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 23 deletions.
29 changes: 6 additions & 23 deletions crates/sage-cli/src/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,34 +161,17 @@ impl Runner {
min_deisotope_mz.unwrap_or(0.0),
);

let bruker_extensions = [".d", ".tdf", ".tdf_bin", "ms2", "raw"];
let spectra = chunk
.par_iter()
.enumerate()
.flat_map(|(idx, path)| {
let file_id = chunk_idx * batch_size + idx;

let path_lower = path.to_lowercase();
let res = if path_lower.ends_with(".mgf.gz") || path_lower.ends_with(".mgf") {
sage_cloudpath::util::read_mgf(path, file_id)
} else if bruker_extensions.iter().any(|ext| {
if path_lower.ends_with(std::path::MAIN_SEPARATOR) {
path_lower
.strip_suffix(std::path::MAIN_SEPARATOR)
.unwrap()
.ends_with(ext)
} else {
path_lower.ends_with(ext)
}
}) {
sage_cloudpath::util::read_tdf(
path,
file_id,
self.parameters.bruker_spectrum_processor,
)
} else {
sage_cloudpath::util::read_mzml(path, file_id, sn)
};
let res = sage_cloudpath::util::read_spectra(
path,
file_id,
sn,
self.parameters.bruker_spectrum_processor,
);

match res {
Ok(s) => {
Expand Down
65 changes: 65 additions & 0 deletions crates/sage-cloudpath/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,55 @@ use sage_core::spectrum::RawSpectrum;
use serde::Serialize;
use tokio::io::AsyncReadExt;

/// Spectrum file formats that `identify_format` can tell apart from a path.
#[derive(Eq, PartialEq, Debug)]
enum FileFormat {
    /// Path ends in `.mzml` or `.mzml.gz` (compared case-insensitively).
    MzML,
    /// Path ends in `.mgf` or `.mgf.gz` (compared case-insensitively).
    MGF,
    /// Path ends in one of the Bruker suffixes checked by `is_bruker`.
    TDF,
    /// None of the known suffixes matched.
    Unidentified,
}

/// Path suffixes that mark an input as Bruker data.
///
/// NOTE(review): `"ms2"` and `"raw"` carry no leading dot, so any path that merely ends in
/// those letters matches — confirm whether that is intentional.
const BRUKER_EXTENSIONS: [&str; 5] = [".d", ".tdf", ".tdf_bin", "ms2", "raw"];

/// Returns `true` when `path` ends with one of the [`BRUKER_EXTENSIONS`].
///
/// At most one trailing path separator is ignored, so a directory written as `foo.d/` is
/// treated like `foo.d`. Any separator accepted by the current platform counts (on Windows
/// both `\` and `/`), not only `std::path::MAIN_SEPARATOR`.
///
/// The comparison is case-sensitive; callers are expected to pass an already lower-cased
/// path.
fn is_bruker(path: &str) -> bool {
    // Strip the separator once up front instead of once per candidate extension.
    let trimmed = path
        .strip_suffix(std::path::is_separator)
        .unwrap_or(path);

    BRUKER_EXTENSIONS.iter().any(|ext| trimmed.ends_with(*ext))
}

/// Classifies a path into a [`FileFormat`] based on its (case-insensitive) suffix.
///
/// Checks run in a fixed order: MGF first, then the Bruker suffixes, then mzML. Anything
/// that matches none of them is reported as [`FileFormat::Unidentified`].
fn identify_format(s: &str) -> FileFormat {
    let lowered = s.to_lowercase();
    let has_suffix =
        |suffixes: &[&str]| suffixes.iter().any(|&suffix| lowered.ends_with(suffix));

    if has_suffix(&[".mgf.gz", ".mgf"]) {
        return FileFormat::MGF;
    }
    if is_bruker(&lowered) {
        return FileFormat::TDF;
    }
    if has_suffix(&[".mzml.gz", ".mzml"]) {
        return FileFormat::MzML;
    }
    FileFormat::Unidentified
}

/// Reads the spectra stored at `path`, choosing the reader from the path's suffix.
///
/// The path is classified with `identify_format` and then forwarded to `read_mgf`,
/// `read_tdf` or `read_mzml`. `file_id` is passed through to whichever reader is chosen;
/// `sn` is only forwarded to the mzML reader and `bruker_processor` only to the TDF reader.
///
/// Paths whose suffix is not recognized are handed to the mzML reader as well, so this
/// function never panics because of an unexpected file name.
///
/// # Errors
///
/// Returns whatever error the selected reader produces.
pub fn read_spectra<S: AsRef<str>>(
    path: S,
    file_id: usize,
    sn: Option<u8>,
    bruker_processor: BrukerSpectrumProcessor,
) -> Result<Vec<RawSpectrum>, Error> {
    match identify_format(path.as_ref()) {
        FileFormat::MGF => read_mgf(path, file_id),
        FileFormat::TDF => read_tdf(path, file_id, bruker_processor),
        // Unknown suffixes default to mzML instead of panicking: a bad input path is a
        // recoverable condition, and any failure is then reported by the mzML reader
        // through the returned `Result` rather than aborting the caller here.
        FileFormat::MzML | FileFormat::Unidentified => read_mzml(path, file_id, sn),
    }
}

pub fn read_mzml<S: AsRef<str>>(
s: S,
file_id: usize,
Expand Down Expand Up @@ -91,3 +140,19 @@ where
Ok(())
})
}

#[cfg(test)]
mod test {
    use super::*;

    /// Table-driven check of the suffix → format mapping, including compressed mzML,
    /// upper-case input, a trailing directory separator and unrecognized paths.
    #[test]
    fn test_identify_format() {
        // Use the platform's own separator so the trailing-separator case is portable.
        let bruker_dir = format!("./tomato/foo.d{}", std::path::MAIN_SEPARATOR);

        let cases = [
            ("foo.mzml", FileFormat::MzML),
            ("foo.mzML", FileFormat::MzML),
            ("foo.mzML.gz", FileFormat::MzML),
            ("foo.mgf", FileFormat::MGF),
            ("foo.mgf.gz", FileFormat::MGF),
            ("FOO.MGF", FileFormat::MGF),
            ("foo.tdf", FileFormat::TDF),
            ("./tomato/foo.d", FileFormat::TDF),
            (bruker_dir.as_str(), FileFormat::TDF),
            ("foo.txt", FileFormat::Unidentified),
            ("", FileFormat::Unidentified),
        ];

        for (path, expected) in cases.iter() {
            assert_eq!(identify_format(*path), *expected, "path: {:?}", path);
        }
    }
}

0 comments on commit 97263b1

Please sign in to comment.