Skip to content

Commit

Permalink
feat: implement htslib basemod api (#385)
Browse files Browse the repository at this point in the history
* implement htslib basemod API

* add error types for basemod API

* fix formatting with cargo fmt

* fix linter warnings

* update `hts-sys` to new release `2.1.0`

* update to available version of `hts-sys`, now `2.1.1`

* ask rustdoc to ignore example code

* hide code in doctest instead of ignore

* try the fix that doctest failure suggests

* import basemod doctest, add test data from htslib

* add additional tests for basemod API

* update base mod tests with cargo fmt

---------

Co-authored-by: David Laehnemann <[email protected]>
  • Loading branch information
jts and dlaehnemann authored Jun 20, 2023
1 parent e89538d commit 8beee14
Show file tree
Hide file tree
Showing 5 changed files with 377 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ bio-types = ">=0.9"
byteorder = "1.3"
custom_derive = "0.1"
derive-new = "0.5"
hts-sys = {version = "2.0.3", default-features = false}
hts-sys = {version = "2.1.1", default-features = false}
ieee754 = "0.2"
lazy_static = "1.4"
libc = "0.2"
Expand Down
357 changes: 357 additions & 0 deletions src/bam/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,48 @@ impl Record {
}
}

/// Access the base modifications associated with this Record through the MM tag.
/// Example:
/// ```
/// use rust_htslib::bam::{Read, Reader, Record};
/// let mut bam = Reader::from_path(&"test/base_mods/MM-orient.sam").unwrap();
/// let mut mod_count = 0;
/// for r in bam.records() {
/// let record = r.unwrap();
/// if let Ok(mods) = record.basemods_iter() {
/// // print metadata for the modifications present in this record
/// for mod_code in mods.recorded() {
/// if let Ok(mod_metadata) = mods.query_type(*mod_code) {
/// println!("mod found with code {}/{} flags: [{} {} {}]",
/// mod_code, *mod_code as u8 as char,
/// mod_metadata.strand, mod_metadata.implicit, mod_metadata.canonical as u8 as char);
/// }
/// }
///
/// // iterate over the modifications in this record
/// // the modifications are returned as a tuple with the
/// // position within SEQ and an hts_base_mod struct
/// for res in mods {
/// if let Ok( (position, m) ) = res {
/// println!("{} {},{}", position, m.modified_base as u8 as char, m.qual);
/// mod_count += 1;
/// }
/// }
/// };
/// }
/// assert_eq!(mod_count, 14);
/// ```
pub fn basemods_iter(&self) -> Result<BaseModificationsIter> {
BaseModificationsIter::new(self)
}

/// An iterator that returns all of the modifications for each position as a vector.
/// This is useful for the case where multiple possible modifications can be annotated
/// at a single position (for example a C could be 5-mC or 5-hmC)
pub fn basemods_position_iter(&self) -> Result<BaseModificationsPositionIter> {
BaseModificationsPositionIter::new(self)
}

/// Infer read pair orientation from record. Returns `SequenceReadPairOrientation::None` if record
/// is not paired, mates are not mapping to the same contig, or mates start at the
/// same position.
Expand Down Expand Up @@ -2171,6 +2213,241 @@ impl fmt::Display for CigarStringView {
}
}

pub struct BaseModificationMetadata {
pub strand: i32,
pub implicit: i32,
pub canonical: i8,
}

/// struct containing the internal state required to access
/// the base modifications for a bam::Record
pub struct BaseModificationState<'a> {
record: &'a Record,
state: *mut htslib::hts_base_mod_state,
buffer: Vec<htslib::hts_base_mod>,
buffer_pos: i32,
}

impl BaseModificationState<'_> {
/// Initialize a new BaseModification struct from a bam::Record
/// This function allocates memory for the state structure
/// and initializes the iterator to the start of the modification
/// records.
fn new<'a>(r: &'a Record) -> Result<BaseModificationState<'a>> {
let mut bm = unsafe {
BaseModificationState {
record: r,
state: hts_sys::hts_base_mod_state_alloc(),
buffer: Vec::new(),
buffer_pos: -1,
}
};

if bm.state.is_null() {
panic!("Unable to allocate memory for hts_base_mod_state");
}

// parse the MM tag to initialize the state
unsafe {
let ret = hts_sys::bam_parse_basemod(bm.record.inner_ptr(), bm.state);
if ret != 0 {
return Err(Error::BamBaseModificationTagNotFound);
}
}

let types = bm.recorded();
bm.buffer.reserve(types.len());
return Ok(bm);
}

pub fn buffer_next_mods(&mut self) -> Result<usize> {
unsafe {
let ret = hts_sys::bam_next_basemod(
self.record.inner_ptr(),
self.state,
self.buffer.as_mut_ptr(),
self.buffer.capacity() as i32,
&mut self.buffer_pos,
);

if ret < 0 {
return Err(Error::BamBaseModificationIterationFailed);
}

// the htslib API won't write more than buffer.capacity() mods to the output array but it will
// return the actual number of modifications found. We return an error to the caller
// in the case where there was insufficient storage to return all mods.
if ret as usize > self.buffer.capacity() {
return Err(Error::BamBaseModificationTooManyMods);
}

// we read the modifications directly into the vector, which does
// not update the length so needs to be manually set
self.buffer.set_len(ret as usize);

return Ok(ret as usize);
}
}

/// Return an array containing the modification codes listed for this record.
/// Positive values are ascii character codes (eg m), negative values are chEBI codes.
pub fn recorded<'a>(&self) -> &'a [i32] {
unsafe {
let mut n: i32 = 0;
let data_ptr: *const i32 = hts_sys::bam_mods_recorded(self.state, &mut n);

// htslib should not return a null pointer, even when there are no base mods
if data_ptr.is_null() {
panic!("Unable to obtain pointer to base modifications");
}
assert!(n >= 0);
return slice::from_raw_parts(data_ptr, n as usize);
}
}

/// Return metadata for the specified character code indicating the strand
/// the base modification was called on, whether the tag uses implicit mode
/// and the ascii code for the canonical base.
/// If there are multiple modifications with the same code this will return the data
/// for the first mod. See https://github.com/samtools/htslib/issues/1635
pub fn query_type<'a>(&self, code: i32) -> Result<BaseModificationMetadata> {
unsafe {
let mut strand: i32 = 0;
let mut implicit: i32 = 0;
let mut canonical: i8 = 0;

let ret = hts_sys::bam_mods_query_type(
self.state,
code,
&mut strand,
&mut implicit,
&mut canonical,
);
if ret == -1 {
return Err(Error::BamBaseModificationTypeNotFound);
} else {
return Ok(BaseModificationMetadata {
strand,
implicit,
canonical,
});
}
}
}
}

impl Drop for BaseModificationState<'_> {
fn drop<'a>(&mut self) {
unsafe {
hts_sys::hts_base_mod_state_free(self.state);
}
}
}

/// Iterator over the base modifications that returns
/// a vector for all of the mods at each position
pub struct BaseModificationsPositionIter<'a> {
mod_state: BaseModificationState<'a>,
}

impl BaseModificationsPositionIter<'_> {
fn new<'a>(r: &'a Record) -> Result<BaseModificationsPositionIter<'a>> {
let state = BaseModificationState::new(r)?;
Ok(BaseModificationsPositionIter { mod_state: state })
}

pub fn recorded<'a>(&self) -> &'a [i32] {
return self.mod_state.recorded();
}

pub fn query_type<'a>(&self, code: i32) -> Result<BaseModificationMetadata> {
return self.mod_state.query_type(code);
}
}

impl<'a> Iterator for BaseModificationsPositionIter<'a> {
type Item = Result<(i32, Vec<hts_sys::hts_base_mod>)>;

fn next(&mut self) -> Option<Self::Item> {
let ret = self.mod_state.buffer_next_mods();

// Three possible things happened in buffer_next_mods:
// 1. the htslib API call was successful but there are no more mods
// 2. ths htslib API call was successful and we read some mods
// 3. the htslib API call failed, we propogate the error wrapped in an option
match ret {
Ok(num_mods) => {
if num_mods == 0 {
return None;
} else {
let data = (self.mod_state.buffer_pos, self.mod_state.buffer.clone());
return Some(Ok(data));
}
}
Err(e) => return Some(Err(e)),
}
}
}

/// Iterator over the base modifications that returns
/// the next modification found, one by one
pub struct BaseModificationsIter<'a> {
mod_state: BaseModificationState<'a>,
buffer_idx: usize,
}

impl BaseModificationsIter<'_> {
fn new<'a>(r: &'a Record) -> Result<BaseModificationsIter<'a>> {
let state = BaseModificationState::new(r)?;
Ok(BaseModificationsIter {
mod_state: state,
buffer_idx: 0,
})
}

pub fn recorded<'a>(&self) -> &'a [i32] {
return self.mod_state.recorded();
}

pub fn query_type<'a>(&self, code: i32) -> Result<BaseModificationMetadata> {
return self.mod_state.query_type(code);
}
}

impl<'a> Iterator for BaseModificationsIter<'a> {
type Item = Result<(i32, hts_sys::hts_base_mod)>;

fn next(&mut self) -> Option<Self::Item> {
if self.buffer_idx == self.mod_state.buffer.len() {
// need to use the internal state to read the next
// set of modifications into the buffer
let ret = self.mod_state.buffer_next_mods();

match ret {
Ok(num_mods) => {
if num_mods == 0 {
// done iterating
return None;
} else {
// we read some mods, reset the position in the buffer then fall through
self.buffer_idx = 0;
}
}
Err(e) => return Some(Err(e)),
}
}

// if we got here when there are mods buffered that we haven't emitted yet
assert!(self.buffer_idx < self.mod_state.buffer.len());
let data = (
self.mod_state.buffer_pos,
self.mod_state.buffer[self.buffer_idx],
);
self.buffer_idx += 1;
return Some(Ok(data));
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -2616,3 +2893,83 @@ mod alignment_cigar_tests {
}
}
}

#[cfg(test)]
mod basemod_tests {
use crate::bam::{Read, Reader};

#[test]
pub fn test_count_recorded() {
let mut bam = Reader::from_path(&"test/base_mods/MM-double.sam").unwrap();

for r in bam.records() {
let record = r.unwrap();
if let Ok(mods) = record.basemods_iter() {
let n = mods.recorded().len();
assert_eq!(n, 3);
};
}
}

#[test]
pub fn test_query_type() {
let mut bam = Reader::from_path(&"test/base_mods/MM-orient.sam").unwrap();

let mut n_fwd = 0;
let mut n_rev = 0;

for r in bam.records() {
let record = r.unwrap();
if let Ok(mods) = record.basemods_iter() {
for mod_code in mods.recorded() {
if let Ok(mod_metadata) = mods.query_type(*mod_code) {
if mod_metadata.strand == 0 {
n_fwd += 1;
}
if mod_metadata.strand == 1 {
n_rev += 1;
}
}
}
};
}
assert_eq!(n_fwd, 2);
assert_eq!(n_rev, 2);
}

#[test]
pub fn test_mod_iter() {
let mut bam = Reader::from_path(&"test/base_mods/MM-double.sam").unwrap();
let expected_positions = [1, 7, 12, 13, 13, 22, 30, 31];
let mut i = 0;

for r in bam.records() {
let record = r.unwrap();
for res in record.basemods_iter().unwrap() {
if let Ok((position, _m)) = res {
assert_eq!(position, expected_positions[i]);
i += 1;
}
}
}
}

#[test]
pub fn test_position_iter() {
let mut bam = Reader::from_path(&"test/base_mods/MM-double.sam").unwrap();
let expected_positions = [1, 7, 12, 13, 22, 30, 31];
let expected_counts = [1, 1, 1, 2, 1, 1, 1];
let mut i = 0;

for r in bam.records() {
let record = r.unwrap();
for res in record.basemods_position_iter().unwrap() {
if let Ok((position, elements)) = res {
assert_eq!(position, expected_positions[i]);
assert_eq!(elements.len(), expected_counts[i]);
i += 1;
}
}
}
}
}
Loading

0 comments on commit 8beee14

Please sign in to comment.