From bc5c9f8ac936e82ac5200f93cbf4a366eb90f6b0 Mon Sep 17 00:00:00 2001 From: Art Rand Date: Wed, 14 Jun 2023 10:30:47 -0700 Subject: [PATCH 1/2] fix: makes Header.to_hashmap() not call unwrap. Regex still calls unwrap, add test to make sure it's safe. Change API to return a Result. --- src/bam/header.rs | 88 ++++++++++++++++++++++++++++++++--------------- src/bam/mod.rs | 2 +- src/errors.rs | 2 ++ src/lib.rs | 4 +-- 4 files changed, 65 insertions(+), 31 deletions(-) diff --git a/src/bam/header.rs b/src/bam/header.rs index b8a6a25e7..e3e43ae05 100644 --- a/src/bam/header.rs +++ b/src/bam/header.rs @@ -4,6 +4,7 @@ // except according to those terms. use crate::bam::HeaderView; +use crate::errors::{Error, Result}; use lazy_static::lazy_static; use linear_map::LinearMap; use regex::Regex; @@ -67,42 +68,50 @@ impl Header { /// This returns a header as a HashMap. /// Comment lines starting with "@CO" will NOT be included in the HashMap. /// Comment lines can be obtained by the `comments` function. - pub fn to_hashmap(&self) -> HashMap>> { + pub fn to_hashmap(&self) -> Result>>> { let mut header_map = HashMap::default(); lazy_static! { static ref REC_TYPE_RE: Regex = Regex::new(r"@([A-Z][A-Z])").unwrap(); static ref TAG_RE: Regex = Regex::new(r"([A-Za-z][A-Za-z0-9]):([ -~]*)").unwrap(); } - - let header_string = String::from_utf8(self.to_bytes()).unwrap(); - - for line in header_string.split('\n').filter(|x| !x.is_empty()) { - let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect(); - // assert!(rec_type_re.is_match(parts[0])); - let record_type = REC_TYPE_RE - .captures(parts[0]) - .unwrap() - .get(1) - .unwrap() - .as_str() - .to_owned(); - if record_type.eq("CO") { - continue; - } - let mut field = LinearMap::default(); - for part in parts.iter().skip(1) { - let cap = TAG_RE.captures(part).unwrap(); - let tag = cap.get(1).unwrap().as_str().to_owned(); - let value = cap.get(2).unwrap().as_str().to_owned(); - field.insert(tag, value); + if let Ok(header_string) = String::from_utf8(self.to_bytes()) { + for line in header_string.split('\n').filter(|x| !x.is_empty()) { + let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect(); + if parts.is_empty() { + continue; + } + let record_type = REC_TYPE_RE + .captures(parts[0]) + .and_then(|captures| captures.get(1)) + .map(|m| m.as_str().to_owned()); + + if let Some(record_type) = record_type { + if record_type == "CO" { + continue + } + let mut field = LinearMap::default(); + for part in parts.iter().skip(1) { + if let Some(cap) = TAG_RE.captures(part) { + let tag = cap.get(1).unwrap().as_str().to_owned(); + let value = cap.get(2).unwrap().as_str().to_owned(); + field.insert(tag, value); + } else { + return Err(Error::HeaderParse); + } + } + header_map + .entry(record_type) + .or_insert_with(Vec::new) + .push(field); + } else { + return Err(Error::HeaderParse); + } } - header_map - .entry(record_type) - .or_insert_with(Vec::new) - .push(field); + Ok(header_map) + } else { + Err(Error::HeaderParse) } - header_map } /// Returns an iterator of comment lines. @@ -160,6 +169,7 @@ impl<'a> HeaderRecord<'a> { #[cfg(test)] mod tests { use super::HeaderRecord; + use crate::bam::Header; #[test] fn test_push_tag() { @@ -174,4 +184,26 @@ mod tests { assert_eq!(record.to_bytes(), b"@HD\tX1:0\tX2:0\tX3:x\tX4:x\tX5:x"); } + + #[test] + fn test_header_hash_map() { + let mut records = Vec::new(); + let mut record = HeaderRecord::new(b"HD"); + record.push_tag(b"X1", 0); + records.push(record); + let mut record = HeaderRecord::new(b"PG"); + record.push_tag(b"ID", "mytool"); + records.push(record); + let mut record = HeaderRecord::new(b"PG"); + record.push_tag(b"ID", "other_tool"); + records.push(record); + let header = Header { + records: records.into_iter().map(|rec| rec.to_bytes()).collect(), + }; + let hm = header.to_hashmap().unwrap(); + assert!(hm.contains_key("HD")); + assert!(hm.contains_key("PG")); + assert_eq!(hm.get("HD").unwrap().len(), 1); + assert_eq!(hm.get("PG").unwrap().len(), 2); + } } diff --git a/src/bam/mod.rs b/src/bam/mod.rs index dae37c511..2d7651c4f 100644 --- a/src/bam/mod.rs +++ b/src/bam/mod.rs @@ -2994,7 +2994,7 @@ CCCCCCCCCCCCCCCCCCC"[..], #[test] fn test_bam_header_sync() { let reader = Reader::from_path("test/test_issue_156_no_text.bam").unwrap(); - let header_hashmap = Header::from_template(reader.header()).to_hashmap(); + let header_hashmap = Header::from_template(reader.header()).to_hashmap().unwrap(); let header_refseqs = header_hashmap.get("SQ".into()).unwrap(); assert_eq!(header_refseqs[0].get("SN").unwrap(), "ref_1",); assert_eq!(header_refseqs[0].get("LN").unwrap(), "10000000",); diff --git a/src/errors.rs b/src/errors.rs index 18e3d6b5f..4e459c1ca 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -73,6 +73,8 @@ pub enum Error { BamPileup, #[error("file is not sorted by position")] BamUnsorted, + #[error("error parsing header")] + HeaderParse, // Errors for BAM auxiliary fields #[error("failed to add aux field (out of memory?)")] diff --git a/src/lib.rs b/src/lib.rs index 36b9995bf..b4633fd71 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,8 +29,8 @@ //! let bam = bam::Reader::from_path(&"test/test.bam").unwrap(); //! let header = bam::Header::from_template(bam.header()); //! -//! // print header records to the terminal, akin to samtool -//! for (key, records) in header.to_hashmap() { +//! // print header records to the terminal, akin to samtools +//! for (key, records) in header.to_hashmap().expect("should parse header") { //! for record in records { //! println!("@{}\tSN:{}\tLN:{}", key, record["SN"], record["LN"]); //! } From 9c4165708a3c294bbef190abfe510bbfea08e07f Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 12 Nov 2024 16:00:24 +0100 Subject: [PATCH 2/2] format --- src/bam/header.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bam/header.rs b/src/bam/header.rs index e3e43ae05..a911ff775 100644 --- a/src/bam/header.rs +++ b/src/bam/header.rs @@ -88,7 +88,7 @@ impl Header { if let Some(record_type) = record_type { if record_type == "CO" { - continue + continue; } let mut field = LinearMap::default(); for part in parts.iter().skip(1) {