feat!: bam header to_hashmap method now returns a result in order to deal with header parsing issues #396

Open

wants to merge 5 commits into base: master

Changes from 2 commits
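This change makes `Header::to_hashmap` return a `Result`, so a malformed header line surfaces as an error instead of a panic inside the library. Below is a minimal caller-side sketch of the new contract; the `print_read_groups` helper, the BAM path, and the read-group printing are illustrative assumptions, only the `to_hashmap()` signature and the `HeaderParse` variant come from this PR.

```rust
// Sketch of downstream usage after this change; print_read_groups and the
// input path are illustrative, not part of the diff.
use rust_htslib::bam::{self, Read};
use rust_htslib::errors::Error;

fn print_read_groups(path: &str) -> Result<(), Error> {
    let reader = bam::Reader::from_path(path)?;
    let header = bam::Header::from_template(reader.header());

    // to_hashmap() is now fallible: a malformed header line yields
    // Error::HeaderParse instead of a panic inside the library.
    match header.to_hashmap() {
        Ok(hm) => {
            if let Some(read_groups) = hm.get("RG") {
                for rg in read_groups {
                    println!("read group: {:?}", rg.get("ID"));
                }
            }
            Ok(())
        }
        Err(Error::HeaderParse) => {
            eprintln!("could not parse the header of {}", path);
            Err(Error::HeaderParse)
        }
        Err(e) => Err(e),
    }
}
```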
88 changes: 60 additions & 28 deletions src/bam/header.rs
@@ -4,6 +4,7 @@
 // except according to those terms.
 
 use crate::bam::HeaderView;
+use crate::errors::{Error, Result};
 use lazy_static::lazy_static;
 use linear_map::LinearMap;
 use regex::Regex;
@@ -67,42 +68,50 @@ impl Header {
     /// This returns a header as a HashMap.
     /// Comment lines starting with "@CO" will NOT be included in the HashMap.
     /// Comment lines can be obtained by the `comments` function.
-    pub fn to_hashmap(&self) -> HashMap<String, Vec<LinearMap<String, String>>> {
+    pub fn to_hashmap(&self) -> Result<HashMap<String, Vec<LinearMap<String, String>>>> {
         let mut header_map = HashMap::default();
 
         lazy_static! {
             static ref REC_TYPE_RE: Regex = Regex::new(r"@([A-Z][A-Z])").unwrap();
             static ref TAG_RE: Regex = Regex::new(r"([A-Za-z][A-Za-z0-9]):([ -~]*)").unwrap();
         }
 
-        let header_string = String::from_utf8(self.to_bytes()).unwrap();
-
-        for line in header_string.split('\n').filter(|x| !x.is_empty()) {
-            let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect();
-            // assert!(rec_type_re.is_match(parts[0]));
-            let record_type = REC_TYPE_RE
-                .captures(parts[0])
-                .unwrap()
-                .get(1)
-                .unwrap()
-                .as_str()
-                .to_owned();
-            if record_type.eq("CO") {
-                continue;
-            }
-            let mut field = LinearMap::default();
-            for part in parts.iter().skip(1) {
-                let cap = TAG_RE.captures(part).unwrap();
-                let tag = cap.get(1).unwrap().as_str().to_owned();
-                let value = cap.get(2).unwrap().as_str().to_owned();
-                field.insert(tag, value);
+        if let Ok(header_string) = String::from_utf8(self.to_bytes()) {
+            for line in header_string.split('\n').filter(|x| !x.is_empty()) {
+                let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect();
+                if parts.is_empty() {
+                    continue;
+                }
+                let record_type = REC_TYPE_RE
+                    .captures(parts[0])
+                    .and_then(|captures| captures.get(1))
+                    .map(|m| m.as_str().to_owned());
+
+                if let Some(record_type) = record_type {
+                    if record_type == "CO" {
+                        continue
+                    }
+                    let mut field = LinearMap::default();
+                    for part in parts.iter().skip(1) {
+                        if let Some(cap) = TAG_RE.captures(part) {
+                            let tag = cap.get(1).unwrap().as_str().to_owned();
+                            let value = cap.get(2).unwrap().as_str().to_owned();
+                            field.insert(tag, value);
+                        } else {
+                            return Err(Error::HeaderParse);
+                        }
+                    }
+                    header_map
+                        .entry(record_type)
+                        .or_insert_with(Vec::new)
+                        .push(field);
+                } else {
+                    return Err(Error::HeaderParse);
+                }
             }
-            header_map
-                .entry(record_type)
-                .or_insert_with(Vec::new)
-                .push(field);
+            Ok(header_map)
+        } else {
+            Err(Error::HeaderParse)
         }
-        header_map
     }
 
     /// Returns an iterator of comment lines.
@@ -160,6 +169,7 @@ impl<'a> HeaderRecord<'a> {
 #[cfg(test)]
 mod tests {
     use super::HeaderRecord;
+    use crate::bam::Header;
 
     #[test]
     fn test_push_tag() {
@@ -174,4 +184,26 @@ mod tests {
 
         assert_eq!(record.to_bytes(), b"@HD\tX1:0\tX2:0\tX3:x\tX4:x\tX5:x");
     }
+
+    #[test]
+    fn test_header_hash_map() {
+        let mut records = Vec::new();
+        let mut record = HeaderRecord::new(b"HD");
+        record.push_tag(b"X1", 0);
+        records.push(record);
+        let mut record = HeaderRecord::new(b"PG");
+        record.push_tag(b"ID", "mytool");
+        records.push(record);
+        let mut record = HeaderRecord::new(b"PG");
+        record.push_tag(b"ID", "other_tool");
+        records.push(record);
+        let header = Header {
+            records: records.into_iter().map(|rec| rec.to_bytes()).collect(),
+        };
+        let hm = header.to_hashmap().unwrap();
+        assert!(hm.contains_key("HD"));
+        assert!(hm.contains_key("PG"));
+        assert_eq!(hm.get("HD").unwrap().len(), 1);
+        assert_eq!(hm.get("PG").unwrap().len(), 2);
+    }
 }
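As the doc comment in the hunk above notes, `@CO` lines never make it into the hashmap and are only reachable through `comments`. A minimal sketch of that behaviour under the new `Result` signature; it assumes the existing `Header::new`, `push_record`, and `push_comment` helpers of rust-htslib:

```rust
use rust_htslib::bam::{Header, HeaderRecord};

fn main() {
    let mut header = Header::new();
    let mut hd = HeaderRecord::new(b"HD");
    hd.push_tag(b"VN", "1.6");
    header.push_record(&hd);
    // Comment records are assumed to be added via push_comment.
    header.push_comment(b"generated for illustration only");

    let hm = header.to_hashmap().expect("header should parse");
    assert!(hm.contains_key("HD"));
    // @CO lines never show up as a record type in the hashmap.
    assert!(!hm.contains_key("CO"));
}
```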
2 changes: 1 addition & 1 deletion src/bam/mod.rs
@@ -2996,7 +2996,7 @@ CCCCCCCCCCCCCCCCCCC"[..],
     #[test]
     fn test_bam_header_sync() {
         let reader = Reader::from_path("test/test_issue_156_no_text.bam").unwrap();
-        let header_hashmap = Header::from_template(reader.header()).to_hashmap();
+        let header_hashmap = Header::from_template(reader.header()).to_hashmap().unwrap();
         let header_refseqs = header_hashmap.get("SQ".into()).unwrap();
         assert_eq!(header_refseqs[0].get("SN").unwrap(), "ref_1",);
         assert_eq!(header_refseqs[0].get("LN").unwrap(), "10000000",);
2 changes: 2 additions & 0 deletions src/errors.rs
@@ -75,6 +75,8 @@ pub enum Error {
     BamPileup,
     #[error("file is not sorted by position")]
    BamUnsorted,
+    #[error("error parsing header")]
+    HeaderParse,
 
     // Errors for BAM auxiliary fields
     #[error("failed to add aux field (out of memory?)")]
4 changes: 2 additions & 2 deletions src/lib.rs
@@ -29,8 +29,8 @@
 //! let bam = bam::Reader::from_path(&"test/test.bam").unwrap();
 //! let header = bam::Header::from_template(bam.header());
 //!
-//! // print header records to the terminal, akin to samtool
-//! for (key, records) in header.to_hashmap() {
+//! // print header records to the terminal, akin to samtools
+//! for (key, records) in header.to_hashmap().expect("should parse header") {
 //!     for record in records {
 //!         println!("@{}\tSN:{}\tLN:{}", key, record["SN"], record["LN"]);
 //!     }