Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: bam header to_hashmap method now returns a result in order to deal with header parsing issues #396

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 60 additions & 28 deletions src/bam/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// except according to those terms.

use crate::bam::HeaderView;
use crate::errors::{Error, Result};
use lazy_static::lazy_static;
use linear_map::LinearMap;
use regex::Regex;
Expand Down Expand Up @@ -67,42 +68,50 @@ impl Header {
/// This returns a header as a HashMap.
/// Comment lines starting with "@CO" will NOT be included in the HashMap.
/// Comment lines can be obtained by the `comments` function.
pub fn to_hashmap(&self) -> HashMap<String, Vec<LinearMap<String, String>>> {
pub fn to_hashmap(&self) -> Result<HashMap<String, Vec<LinearMap<String, String>>>> {
let mut header_map = HashMap::default();

lazy_static! {
static ref REC_TYPE_RE: Regex = Regex::new(r"@([A-Z][A-Z])").unwrap();
static ref TAG_RE: Regex = Regex::new(r"([A-Za-z][A-Za-z0-9]):([ -~]*)").unwrap();
}

let header_string = String::from_utf8(self.to_bytes()).unwrap();

for line in header_string.split('\n').filter(|x| !x.is_empty()) {
let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect();
// assert!(rec_type_re.is_match(parts[0]));
let record_type = REC_TYPE_RE
.captures(parts[0])
.unwrap()
.get(1)
.unwrap()
.as_str()
.to_owned();
if record_type.eq("CO") {
continue;
}
let mut field = LinearMap::default();
for part in parts.iter().skip(1) {
let cap = TAG_RE.captures(part).unwrap();
let tag = cap.get(1).unwrap().as_str().to_owned();
let value = cap.get(2).unwrap().as_str().to_owned();
field.insert(tag, value);
if let Ok(header_string) = String::from_utf8(self.to_bytes()) {
for line in header_string.split('\n').filter(|x| !x.is_empty()) {
let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect();
if parts.is_empty() {
continue;
}
let record_type = REC_TYPE_RE
.captures(parts[0])
.and_then(|captures| captures.get(1))
.map(|m| m.as_str().to_owned());

if let Some(record_type) = record_type {
if record_type == "CO" {
continue;
}
let mut field = LinearMap::default();
for part in parts.iter().skip(1) {
if let Some(cap) = TAG_RE.captures(part) {
let tag = cap.get(1).unwrap().as_str().to_owned();
let value = cap.get(2).unwrap().as_str().to_owned();
field.insert(tag, value);
} else {
return Err(Error::HeaderParse);
}
}
header_map
.entry(record_type)
.or_insert_with(Vec::new)
.push(field);
} else {
return Err(Error::HeaderParse);
}
}
header_map
.entry(record_type)
.or_insert_with(Vec::new)
.push(field);
Ok(header_map)
} else {
Err(Error::HeaderParse)
}
header_map
}

/// Returns an iterator of comment lines.
Expand Down Expand Up @@ -160,6 +169,7 @@ impl<'a> HeaderRecord<'a> {
#[cfg(test)]
mod tests {
use super::HeaderRecord;
use crate::bam::Header;

#[test]
fn test_push_tag() {
Expand All @@ -174,4 +184,26 @@ mod tests {

assert_eq!(record.to_bytes(), b"@HD\tX1:0\tX2:0\tX3:x\tX4:x\tX5:x");
}

#[test]
fn test_header_hash_map() {
    // Build one @HD record and two @PG records so that the map must group
    // repeated record types under a single key.
    let mut hd = HeaderRecord::new(b"HD");
    hd.push_tag(b"X1", 0);

    let mut first_pg = HeaderRecord::new(b"PG");
    first_pg.push_tag(b"ID", "mytool");

    let mut second_pg = HeaderRecord::new(b"PG");
    second_pg.push_tag(b"ID", "other_tool");

    let header = Header {
        records: vec![hd, first_pg, second_pg]
            .into_iter()
            .map(|rec| rec.to_bytes())
            .collect(),
    };

    let parsed = header.to_hashmap().unwrap();

    // Both record types are present ...
    assert!(parsed.contains_key("HD"));
    assert!(parsed.contains_key("PG"));
    // ... and each key holds one entry per header line of that type.
    assert_eq!(parsed.get("HD").unwrap().len(), 1);
    assert_eq!(parsed.get("PG").unwrap().len(), 2);
}
}
5 changes: 3 additions & 2 deletions src/bam/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1392,7 +1392,7 @@
header_string.len(),
);

let rec = htslib::sam_hdr_parse((l_text + 1), text as *const c_char);

Check warning on line 1395 in src/bam/mod.rs

View workflow job for this annotation

GitHub Actions / Testing-Features (no-default-features)

unnecessary parentheses around function argument
(*rec).text = text as *mut c_char;
(*rec).l_text = l_text;
rec
Expand Down Expand Up @@ -2996,8 +2996,9 @@
#[test]
fn test_bam_header_sync() {
    let reader = Reader::from_path("test/test_issue_156_no_text.bam").unwrap();
    // `to_hashmap` returns a `Result`; a parse failure here is a test failure.
    let header_hashmap = Header::from_template(reader.header())
        .to_hashmap()
        .unwrap();
    // Look the key up with a plain `&str`. Writing `"SQ".into()` leaves the borrowed
    // key type of `HashMap::get` ambiguous and fails with "type annotations needed".
    let header_refseqs = header_hashmap.get("SQ").unwrap();
    assert_eq!(header_refseqs[0].get("SN").unwrap(), "ref_1",);
    assert_eq!(header_refseqs[0].get("LN").unwrap(), "10000000",);
}
Expand Down Expand Up @@ -3027,7 +3028,7 @@
let mut writer = Writer::from_path(&bampath, &header, Format::Bam).unwrap();

// Build an empty record
let mut record = Record::new();

Check warning on line 3031 in src/bam/mod.rs

View workflow job for this annotation

GitHub Actions / Testing-Features (no-default-features)

variable does not need to be mutable

// Write the record (this previously seg-faulted)
assert!(writer.write(&record).is_ok());
Expand Down
2 changes: 2 additions & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ pub enum Error {
BamPileup,
#[error("file is not sorted by position")]
BamUnsorted,
#[error("error parsing header")]
HeaderParse,

// Errors for BAM auxiliary fields
#[error("failed to add aux field (out of memory?)")]
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
//! let bam = bam::Reader::from_path(&"test/test.bam").unwrap();
//! let header = bam::Header::from_template(bam.header());
//!
//! // print header records to the terminal, akin to samtool
//! for (key, records) in header.to_hashmap() {
//! // print header records to the terminal, akin to samtools
//! for (key, records) in header.to_hashmap().expect("should parse header") {
//! for record in records {
//! println!("@{}\tSN:{}\tLN:{}", key, record["SN"], record["LN"]);
//! }
Expand Down
Loading