diff --git a/src/bam/header.rs b/src/bam/header.rs index b8a6a25e7..a911ff775 100644 --- a/src/bam/header.rs +++ b/src/bam/header.rs @@ -4,6 +4,7 @@ // except according to those terms. use crate::bam::HeaderView; +use crate::errors::{Error, Result}; use lazy_static::lazy_static; use linear_map::LinearMap; use regex::Regex; @@ -67,42 +68,50 @@ impl Header { /// This returns a header as a HashMap. /// Comment lines starting with "@CO" will NOT be included in the HashMap. /// Comment lines can be obtained by the `comments` function. - pub fn to_hashmap(&self) -> HashMap>> { + pub fn to_hashmap(&self) -> Result>>> { let mut header_map = HashMap::default(); lazy_static! { static ref REC_TYPE_RE: Regex = Regex::new(r"@([A-Z][A-Z])").unwrap(); static ref TAG_RE: Regex = Regex::new(r"([A-Za-z][A-Za-z0-9]):([ -~]*)").unwrap(); } - - let header_string = String::from_utf8(self.to_bytes()).unwrap(); - - for line in header_string.split('\n').filter(|x| !x.is_empty()) { - let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect(); - // assert!(rec_type_re.is_match(parts[0])); - let record_type = REC_TYPE_RE - .captures(parts[0]) - .unwrap() - .get(1) - .unwrap() - .as_str() - .to_owned(); - if record_type.eq("CO") { - continue; - } - let mut field = LinearMap::default(); - for part in parts.iter().skip(1) { - let cap = TAG_RE.captures(part).unwrap(); - let tag = cap.get(1).unwrap().as_str().to_owned(); - let value = cap.get(2).unwrap().as_str().to_owned(); - field.insert(tag, value); + if let Ok(header_string) = String::from_utf8(self.to_bytes()) { + for line in header_string.split('\n').filter(|x| !x.is_empty()) { + let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect(); + if parts.is_empty() { + continue; + } + let record_type = REC_TYPE_RE + .captures(parts[0]) + .and_then(|captures| captures.get(1)) + .map(|m| m.as_str().to_owned()); + + if let Some(record_type) = record_type { + if record_type == "CO" { + continue; + } + let mut field = LinearMap::default(); + for part in parts.iter().skip(1) { + if let Some(cap) = TAG_RE.captures(part) { + let tag = cap.get(1).unwrap().as_str().to_owned(); + let value = cap.get(2).unwrap().as_str().to_owned(); + field.insert(tag, value); + } else { + return Err(Error::HeaderParse); + } + } + header_map + .entry(record_type) + .or_insert_with(Vec::new) + .push(field); + } else { + return Err(Error::HeaderParse); + } } - header_map - .entry(record_type) - .or_insert_with(Vec::new) - .push(field); + Ok(header_map) + } else { + Err(Error::HeaderParse) } - header_map } /// Returns an iterator of comment lines. @@ -160,6 +169,7 @@ impl<'a> HeaderRecord<'a> { #[cfg(test)] mod tests { use super::HeaderRecord; + use crate::bam::Header; #[test] fn test_push_tag() { @@ -174,4 +184,26 @@ mod tests { assert_eq!(record.to_bytes(), b"@HD\tX1:0\tX2:0\tX3:x\tX4:x\tX5:x"); } + + #[test] + fn test_header_hash_map() { + let mut records = Vec::new(); + let mut record = HeaderRecord::new(b"HD"); + record.push_tag(b"X1", 0); + records.push(record); + let mut record = HeaderRecord::new(b"PG"); + record.push_tag(b"ID", "mytool"); + records.push(record); + let mut record = HeaderRecord::new(b"PG"); + record.push_tag(b"ID", "other_tool"); + records.push(record); + let header = Header { + records: records.into_iter().map(|rec| rec.to_bytes()).collect(), + }; + let hm = header.to_hashmap().unwrap(); + assert!(hm.contains_key("HD")); + assert!(hm.contains_key("PG")); + assert_eq!(hm.get("HD").unwrap().len(), 1); + assert_eq!(hm.get("PG").unwrap().len(), 2); + } } diff --git a/src/bam/mod.rs b/src/bam/mod.rs index b79bd89a2..92198a498 100644 --- a/src/bam/mod.rs +++ b/src/bam/mod.rs @@ -2996,8 +2996,9 @@ CCCCCCCCCCCCCCCCCCC"[..], #[test] fn test_bam_header_sync() { let reader = Reader::from_path("test/test_issue_156_no_text.bam").unwrap(); - let header_hashmap = Header::from_template(reader.header()).to_hashmap(); - let header_refseqs = header_hashmap.get("SQ").unwrap(); + let header_hashmap = Header::from_template(reader.header()).to_hashmap().unwrap(); + let header_refseqs = header_hashmap.get("SQ".into()).unwrap(); + assert_eq!(header_refseqs[0].get("SN").unwrap(), "ref_1",); assert_eq!(header_refseqs[0].get("LN").unwrap(), "10000000",); } diff --git a/src/errors.rs b/src/errors.rs index a8c32e585..dbfc35c45 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -77,6 +77,8 @@ pub enum Error { BamPileup, #[error("file is not sorted by position")] BamUnsorted, + #[error("error parsing header")] + HeaderParse, // Errors for BAM auxiliary fields #[error("failed to add aux field (out of memory?)")] diff --git a/src/lib.rs b/src/lib.rs index 36b9995bf..b4633fd71 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,8 +29,8 @@ //! let bam = bam::Reader::from_path(&"test/test.bam").unwrap(); //! let header = bam::Header::from_template(bam.header()); //! -//! // print header records to the terminal, akin to samtool -//! for (key, records) in header.to_hashmap() { +//! // print header records to the terminal, akin to samtools +//! for (key, records) in header.to_hashmap().expect("should parse header") { //! for record in records { //! println!("@{}\tSN:{}\tLN:{}", key, record["SN"], record["LN"]); //! }