From 70fcb694517bdc51036587f244e25a6b3daf36ae Mon Sep 17 00:00:00 2001 From: Michael Macias Date: Tue, 14 Jan 2025 11:11:54 -0600 Subject: [PATCH] bam/bai/io/writer: Extract index writer --- noodles-bam/src/bai/async/io/writer.rs | 312 +----------------- noodles-bam/src/bai/async/io/writer/index.rs | 313 +++++++++++++++++++ noodles-bam/src/bai/io/writer.rs | 214 +------------ noodles-bam/src/bai/io/writer/index.rs | 211 +++++++++++++ 4 files changed, 534 insertions(+), 516 deletions(-) create mode 100644 noodles-bam/src/bai/async/io/writer/index.rs create mode 100644 noodles-bam/src/bai/io/writer/index.rs diff --git a/noodles-bam/src/bai/async/io/writer.rs b/noodles-bam/src/bai/async/io/writer.rs index 7afed7cfd..ecf3f6ab1 100644 --- a/noodles-bam/src/bai/async/io/writer.rs +++ b/noodles-bam/src/bai/async/io/writer.rs @@ -1,18 +1,9 @@ -use indexmap::IndexMap; -use noodles_bgzf as bgzf; -use noodles_csi::{ - binning_index::{ - index::{ - reference_sequence::{bin::Chunk, index::LinearIndex, Bin, Metadata}, - ReferenceSequence, - }, - ReferenceSequence as _, - }, - BinningIndex, -}; +mod index; + use tokio::io::{self, AsyncWrite, AsyncWriteExt}; -use crate::bai::{Index, MAGIC_NUMBER}; +use self::index::write_index; +use crate::bai::Index; /// An async BAM index (BAI) writer. pub struct Writer { @@ -119,299 +110,6 @@ where /// # } /// ``` pub async fn write_index(&mut self, index: &Index) -> io::Result<()> { - write_magic(&mut self.inner).await?; - - write_reference_sequences(&mut self.inner, index.reference_sequences()).await?; - - if let Some(unplaced_unmapped_record_count) = index.unplaced_unmapped_record_count() { - write_unplaced_unmapped_record_count(&mut self.inner, unplaced_unmapped_record_count) - .await?; - } - - Ok(()) - } -} - -async fn write_magic(writer: &mut W) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - writer.write_all(MAGIC_NUMBER).await -} - -async fn write_reference_sequences( - writer: &mut W, - reference_sequences: &[ReferenceSequence], -) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - let n_ref = u32::try_from(reference_sequences.len()) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32_le(n_ref).await?; - - for reference_sequence in reference_sequences { - write_reference_sequence(writer, reference_sequence).await?; - } - - Ok(()) -} - -async fn write_reference_sequence( - writer: &mut W, - reference_sequence: &ReferenceSequence, -) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - write_bins( - writer, - reference_sequence.bins(), - reference_sequence.metadata(), - ) - .await?; - - write_intervals(writer, reference_sequence.index()).await?; - - Ok(()) -} - -async fn write_bins( - writer: &mut W, - bins: &IndexMap, - metadata: Option<&Metadata>, -) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - let n_bin = u32::try_from(bins.len()) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) - .and_then(|n| { - if metadata.is_some() { - n.checked_add(1) - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "n_bin overflow")) - } else { - Ok(n) - } - })?; - - writer.write_u32_le(n_bin).await?; - - for (&id, bin) in bins { - write_bin(writer, id, bin).await?; - } - - if let Some(m) = metadata { - write_metadata(writer, m).await?; - } - - Ok(()) -} - -async fn write_bin(writer: &mut W, id: usize, bin: &Bin) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - let id = u32::try_from(id).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32_le(id).await?; - write_chunks(writer, bin.chunks()).await?; - Ok(()) -} - -async fn write_chunks(writer: &mut W, chunks: &[Chunk]) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - let n_chunk = - u32::try_from(chunks.len()).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32_le(n_chunk).await?; - - for chunk in chunks { - write_chunk(writer, chunk).await?; - } - - Ok(()) -} - -async fn write_chunk(writer: &mut W, chunk: &Chunk) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - let chunk_beg = u64::from(chunk.start()); - writer.write_u64_le(chunk_beg).await?; - - let chunk_end = u64::from(chunk.end()); - writer.write_u64_le(chunk_end).await?; - - Ok(()) -} - -async fn write_intervals(writer: &mut W, intervals: &[bgzf::VirtualPosition]) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - let n_intv = u32::try_from(intervals.len()) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32_le(n_intv).await?; - - for &interval in intervals { - let ioffset = u64::from(interval); - writer.write_u64_le(ioffset).await?; - } - - Ok(()) -} - -async fn write_metadata(writer: &mut W, metadata: &Metadata) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - use crate::bai::DEPTH; - - const METADATA_ID: usize = Bin::metadata_id(DEPTH); - const METADATA_CHUNK_COUNT: usize = 2; - - let id = - u32::try_from(METADATA_ID).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32_le(id).await?; - - let n_chunk = u32::try_from(METADATA_CHUNK_COUNT) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32_le(n_chunk).await?; - - let ref_beg = u64::from(metadata.start_position()); - writer.write_u64_le(ref_beg).await?; - - let ref_end = u64::from(metadata.end_position()); - writer.write_u64_le(ref_end).await?; - - let n_mapped = metadata.mapped_record_count(); - writer.write_u64_le(n_mapped).await?; - - let n_unmapped = metadata.unmapped_record_count(); - writer.write_u64_le(n_unmapped).await?; - - Ok(()) -} - -async fn write_unplaced_unmapped_record_count( - writer: &mut W, - unplaced_unmapped_record_count: u64, -) -> io::Result<()> -where - W: AsyncWrite + Unpin, -{ - writer.write_u64_le(unplaced_unmapped_record_count).await -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn test_write_magic() -> io::Result<()> { - let mut buf = Vec::new(); - write_magic(&mut buf).await?; - assert_eq!(buf, b"BAI\x01"); - Ok(()) - } - - #[tokio::test] - async fn test_write_bins() -> io::Result<()> { - let bins = [(8, Bin::new(Vec::new()))].into_iter().collect(); - - let mut buf = Vec::new(); - write_bins(&mut buf, &bins, None).await?; - - let expected = [ - 0x01, 0x00, 0x00, 0x00, // n_bins = 1 - 0x08, 0x00, 0x00, 0x00, // bins[0].bin = 8 - 0x00, 0x00, 0x00, 0x00, // bins[0].n_chunk = 0 - ]; - - assert_eq!(buf, expected); - - Ok(()) - } - - #[tokio::test] - async fn test_write_bins_with_metadata() -> io::Result<()> { - let bins = [(8, Bin::new(Vec::new()))].into_iter().collect(); - let metadata = Metadata::new( - bgzf::VirtualPosition::from(13), - bgzf::VirtualPosition::from(21), - 5, - 0, - ); - - let mut buf = Vec::new(); - write_bins(&mut buf, &bins, Some(&metadata)).await?; - - #[rustfmt::skip] - let expected = [ - 0x02, 0x00, 0x00, 0x00, // n_bins = 2 - - 0x08, 0x00, 0x00, 0x00, // bins[0].bin = 8 - 0x00, 0x00, 0x00, 0x00, // bins[0].n_chunk = 0 - - 0x4a, 0x92, 0x00, 0x00, // bins[1].bin = 37450 - 0x02, 0x00, 0x00, 0x00, // bins[1].n_chunk = 2 - 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // bins[1].chunks[0].chunk_beg = 13 - 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // bins[1].chunks[0].chunk_end = 21 - 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // bins[1].chunks[1].chunk_beg = 5 - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // bins[1].chunks[1].chunk_end = 0 - ]; - - assert_eq!(buf, expected); - - Ok(()) - } - - #[tokio::test] - async fn test_write_bin() -> io::Result<()> { - let bin = Bin::new(vec![Chunk::new( - bgzf::VirtualPosition::from(13), - bgzf::VirtualPosition::from(21), - )]); - - let mut buf = Vec::new(); - write_bin(&mut buf, 8, &bin).await?; - - let expected = [ - 0x08, 0x00, 0x00, 0x00, // bin = 8 - 0x01, 0x00, 0x00, 0x00, // n_chunk = 1 - 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // chunk[0].chunk_beg - 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // chunk[0].chunk_end - ]; - - assert_eq!(buf, expected); - - Ok(()) - } - - #[tokio::test] - async fn test_write_metadata() -> io::Result<()> { - let metadata = Metadata::new( - bgzf::VirtualPosition::from(610), - bgzf::VirtualPosition::from(1597), - 55, - 0, - ); - - let mut buf = Vec::new(); - write_metadata(&mut buf, &metadata).await?; - - let expected = [ - 0x4a, 0x92, 0x00, 0x00, // bin = 37450 - 0x02, 0x00, 0x00, 0x00, // n_chunks = 2 - 0x62, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // ref_beg = 610 - 0x3d, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // ref_end = 1597 - 0x37, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // n_mapped = 55 - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // n_unmapped = 0 - ]; - - assert_eq!(buf, expected); - - Ok(()) + write_index(&mut self.inner, index).await } } diff --git a/noodles-bam/src/bai/async/io/writer/index.rs b/noodles-bam/src/bai/async/io/writer/index.rs new file mode 100644 index 000000000..0ed7a5b14 --- /dev/null +++ b/noodles-bam/src/bai/async/io/writer/index.rs @@ -0,0 +1,313 @@ +use indexmap::IndexMap; +use noodles_bgzf as bgzf; +use noodles_csi::{ + binning_index::{ + index::{ + reference_sequence::{bin::Chunk, index::LinearIndex, Bin, Metadata}, + ReferenceSequence, + }, + ReferenceSequence as _, + }, + BinningIndex, +}; +use tokio::io::{self, AsyncWrite, AsyncWriteExt}; + +use crate::bai::{Index, MAGIC_NUMBER}; + +pub(super) async fn write_index(writer: &mut W, index: &Index) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + write_magic(writer).await?; + write_reference_sequences(writer, index.reference_sequences()).await?; + + if let Some(unplaced_unmapped_record_count) = index.unplaced_unmapped_record_count() { + write_unplaced_unmapped_record_count(writer, unplaced_unmapped_record_count).await?; + } + + Ok(()) +} + +async fn write_magic(writer: &mut W) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + writer.write_all(MAGIC_NUMBER).await +} + +async fn write_reference_sequences( + writer: &mut W, + reference_sequences: &[ReferenceSequence], +) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + let n_ref = u32::try_from(reference_sequences.len()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32_le(n_ref).await?; + + for reference_sequence in reference_sequences { + write_reference_sequence(writer, reference_sequence).await?; + } + + Ok(()) +} + +async fn write_reference_sequence( + writer: &mut W, + reference_sequence: &ReferenceSequence, +) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + write_bins( + writer, + reference_sequence.bins(), + reference_sequence.metadata(), + ) + .await?; + + write_intervals(writer, reference_sequence.index()).await?; + + Ok(()) +} + +async fn write_bins( + writer: &mut W, + bins: &IndexMap, + metadata: Option<&Metadata>, +) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + let n_bin = u32::try_from(bins.len()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) + .and_then(|n| { + if metadata.is_some() { + n.checked_add(1) + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "n_bin overflow")) + } else { + Ok(n) + } + })?; + + writer.write_u32_le(n_bin).await?; + + for (&id, bin) in bins { + write_bin(writer, id, bin).await?; + } + + if let Some(m) = metadata { + write_metadata(writer, m).await?; + } + + Ok(()) +} + +async fn write_bin(writer: &mut W, id: usize, bin: &Bin) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + let id = u32::try_from(id).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32_le(id).await?; + write_chunks(writer, bin.chunks()).await?; + Ok(()) +} + +async fn write_chunks(writer: &mut W, chunks: &[Chunk]) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + let n_chunk = + u32::try_from(chunks.len()).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32_le(n_chunk).await?; + + for chunk in chunks { + write_chunk(writer, chunk).await?; + } + + Ok(()) +} + +async fn write_chunk(writer: &mut W, chunk: &Chunk) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + let chunk_beg = u64::from(chunk.start()); + writer.write_u64_le(chunk_beg).await?; + + let chunk_end = u64::from(chunk.end()); + writer.write_u64_le(chunk_end).await?; + + Ok(()) +} + +async fn write_intervals(writer: &mut W, intervals: &[bgzf::VirtualPosition]) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + let n_intv = u32::try_from(intervals.len()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32_le(n_intv).await?; + + for &interval in intervals { + let ioffset = u64::from(interval); + writer.write_u64_le(ioffset).await?; + } + + Ok(()) +} + +async fn write_metadata(writer: &mut W, metadata: &Metadata) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + use crate::bai::DEPTH; + + const METADATA_ID: usize = Bin::metadata_id(DEPTH); + const METADATA_CHUNK_COUNT: usize = 2; + + let id = + u32::try_from(METADATA_ID).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32_le(id).await?; + + let n_chunk = u32::try_from(METADATA_CHUNK_COUNT) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32_le(n_chunk).await?; + + let ref_beg = u64::from(metadata.start_position()); + writer.write_u64_le(ref_beg).await?; + + let ref_end = u64::from(metadata.end_position()); + writer.write_u64_le(ref_end).await?; + + let n_mapped = metadata.mapped_record_count(); + writer.write_u64_le(n_mapped).await?; + + let n_unmapped = metadata.unmapped_record_count(); + writer.write_u64_le(n_unmapped).await?; + + Ok(()) +} + +async fn write_unplaced_unmapped_record_count( + writer: &mut W, + unplaced_unmapped_record_count: u64, +) -> io::Result<()> +where + W: AsyncWrite + Unpin, +{ + writer.write_u64_le(unplaced_unmapped_record_count).await +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_write_magic() -> io::Result<()> { + let mut buf = Vec::new(); + write_magic(&mut buf).await?; + assert_eq!(buf, b"BAI\x01"); + Ok(()) + } + + #[tokio::test] + async fn test_write_bins() -> io::Result<()> { + let bins = [(8, Bin::new(Vec::new()))].into_iter().collect(); + + let mut buf = Vec::new(); + write_bins(&mut buf, &bins, None).await?; + + let expected = [ + 0x01, 0x00, 0x00, 0x00, // n_bins = 1 + 0x08, 0x00, 0x00, 0x00, // bins[0].bin = 8 + 0x00, 0x00, 0x00, 0x00, // bins[0].n_chunk = 0 + ]; + + assert_eq!(buf, expected); + + Ok(()) + } + + #[tokio::test] + async fn test_write_bins_with_metadata() -> io::Result<()> { + let bins = [(8, Bin::new(Vec::new()))].into_iter().collect(); + let metadata = Metadata::new( + bgzf::VirtualPosition::from(13), + bgzf::VirtualPosition::from(21), + 5, + 0, + ); + + let mut buf = Vec::new(); + write_bins(&mut buf, &bins, Some(&metadata)).await?; + + #[rustfmt::skip] + let expected = [ + 0x02, 0x00, 0x00, 0x00, // n_bins = 2 + + 0x08, 0x00, 0x00, 0x00, // bins[0].bin = 8 + 0x00, 0x00, 0x00, 0x00, // bins[0].n_chunk = 0 + + 0x4a, 0x92, 0x00, 0x00, // bins[1].bin = 37450 + 0x02, 0x00, 0x00, 0x00, // bins[1].n_chunk = 2 + 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // bins[1].chunks[0].chunk_beg = 13 + 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // bins[1].chunks[0].chunk_end = 21 + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // bins[1].chunks[1].chunk_beg = 5 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // bins[1].chunks[1].chunk_end = 0 + ]; + + assert_eq!(buf, expected); + + Ok(()) + } + + #[tokio::test] + async fn test_write_bin() -> io::Result<()> { + let bin = Bin::new(vec![Chunk::new( + bgzf::VirtualPosition::from(13), + bgzf::VirtualPosition::from(21), + )]); + + let mut buf = Vec::new(); + write_bin(&mut buf, 8, &bin).await?; + + let expected = [ + 0x08, 0x00, 0x00, 0x00, // bin = 8 + 0x01, 0x00, 0x00, 0x00, // n_chunk = 1 + 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // chunk[0].chunk_beg + 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // chunk[0].chunk_end + ]; + + assert_eq!(buf, expected); + + Ok(()) + } + + #[tokio::test] + async fn test_write_metadata() -> io::Result<()> { + let metadata = Metadata::new( + bgzf::VirtualPosition::from(610), + bgzf::VirtualPosition::from(1597), + 55, + 0, + ); + + let mut buf = Vec::new(); + write_metadata(&mut buf, &metadata).await?; + + let expected = [ + 0x4a, 0x92, 0x00, 0x00, // bin = 37450 + 0x02, 0x00, 0x00, 0x00, // n_chunks = 2 + 0x62, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // ref_beg = 610 + 0x3d, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // ref_end = 1597 + 0x37, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // n_mapped = 55 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // n_unmapped = 0 + ]; + + assert_eq!(buf, expected); + + Ok(()) + } +} diff --git a/noodles-bam/src/bai/io/writer.rs b/noodles-bam/src/bai/io/writer.rs index 901dee813..594d4575e 100644 --- a/noodles-bam/src/bai/io/writer.rs +++ b/noodles-bam/src/bai/io/writer.rs @@ -1,18 +1,9 @@ -use std::io::{self, Write}; +mod index; -use byteorder::{LittleEndian, WriteBytesExt}; -use noodles_csi::{ - binning_index::{ - index::{ - reference_sequence::{bin::Chunk, index::LinearIndex, Bin, Metadata}, - ReferenceSequence, - }, - ReferenceSequence as _, - }, - BinningIndex, -}; +use std::io::{self, Write}; -use crate::bai::{Index, MAGIC_NUMBER}; +use self::index::write_index; +use crate::bai::Index; /// A BAM index (BAI) writer. /// @@ -108,201 +99,6 @@ where /// # Ok::<(), io::Error>(()) /// ``` pub fn write_index(&mut self, index: &Index) -> io::Result<()> { - self.inner.write_all(MAGIC_NUMBER)?; - - let n_ref = u32::try_from(index.reference_sequences().len()) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - self.inner.write_u32::(n_ref)?; - - for reference_sequence in index.reference_sequences() { - write_reference_sequence(&mut self.inner, reference_sequence)?; - } - - if let Some(n_no_coor) = index.unplaced_unmapped_record_count() { - self.inner.write_u64::(n_no_coor)?; - } - - Ok(()) - } -} - -fn write_reference_sequence( - writer: &mut W, - reference_sequence: &ReferenceSequence, -) -> io::Result<()> -where - W: Write, -{ - let mut n_bin = u32::try_from(reference_sequence.bins().len()) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - - if reference_sequence.metadata().is_some() { - n_bin = n_bin - .checked_add(1) - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "n_bin overflow"))?; - } - - writer.write_u32::(n_bin)?; - - for (&id, bin) in reference_sequence.bins() { - write_bin(writer, id, bin)?; - } - - if let Some(metadata) = reference_sequence.metadata() { - write_metadata(writer, metadata)?; - } - - let n_intv = u32::try_from(reference_sequence.index().len()) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32::(n_intv)?; - - for interval in reference_sequence.index() { - let ioffset = u64::from(*interval); - writer.write_u64::(ioffset)?; - } - - Ok(()) -} - -fn write_bin(writer: &mut W, id: usize, bin: &Bin) -> io::Result<()> -where - W: Write, -{ - let id = u32::try_from(id).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32::(id)?; - - let n_chunk = u32::try_from(bin.chunks().len()) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32::(n_chunk)?; - - for chunk in bin.chunks() { - write_chunk(writer, chunk)?; - } - - Ok(()) -} - -fn write_chunk(writer: &mut W, chunk: &Chunk) -> io::Result<()> -where - W: Write, -{ - let chunk_beg = u64::from(chunk.start()); - writer.write_u64::(chunk_beg)?; - - let chunk_end = u64::from(chunk.end()); - writer.write_u64::(chunk_end)?; - - Ok(()) -} - -fn write_metadata(writer: &mut W, metadata: &Metadata) -> io::Result<()> -where - W: Write, -{ - use crate::bai::DEPTH; - - const METADATA_ID: usize = Bin::metadata_id(DEPTH); - const METADATA_CHUNK_COUNT: usize = 2; - - let id = - u32::try_from(METADATA_ID).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32::(id)?; - - let n_chunk = u32::try_from(METADATA_CHUNK_COUNT) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; - writer.write_u32::(n_chunk)?; - - let ref_beg = u64::from(metadata.start_position()); - writer.write_u64::(ref_beg)?; - - let ref_end = u64::from(metadata.end_position()); - writer.write_u64::(ref_end)?; - - let n_mapped = metadata.mapped_record_count(); - writer.write_u64::(n_mapped)?; - - let n_unmapped = metadata.unmapped_record_count(); - writer.write_u64::(n_unmapped)?; - - Ok(()) -} - -#[cfg(test)] -mod tests { - use std::io::BufWriter; - - use noodles_bgzf as bgzf; - - use super::*; - - #[test] - fn test_write_index() -> io::Result<()> { - let chunks = vec![Chunk::new( - bgzf::VirtualPosition::from(509268599425), - bgzf::VirtualPosition::from(509268599570), - )]; - let bins = [(16385, Bin::new(chunks))].into_iter().collect(); - let intervals = vec![bgzf::VirtualPosition::from(337)]; - let reference_sequences = vec![ReferenceSequence::new(bins, intervals, None)]; - let index = Index::builder() - .set_reference_sequences(reference_sequences) - .build(); - - let mut actual_writer = Writer::new(Vec::new()); - actual_writer.write_index(&index)?; - - let mut expected_writer = BufWriter::new(Vec::new()); - // magic - expected_writer.write_all(MAGIC_NUMBER)?; - // n_ref - expected_writer.write_u32::(1)?; - // n_bin - expected_writer.write_u32::(1)?; - // bin - expected_writer.write_u32::(16385)?; - // n_chunk - expected_writer.write_u32::(1)?; - // chunk_beg - expected_writer.write_u64::(509268599425)?; - // chunk_end - expected_writer.write_u64::(509268599570)?; - // n_intv - expected_writer.write_u32::(1)?; - // ioffset - expected_writer.write_u64::(337)?; - expected_writer.flush()?; - - let actual = actual_writer.get_ref(); - let expected = expected_writer.get_ref(); - - assert_eq!(actual, expected); - - Ok(()) - } - - #[test] - fn test_write_metadata() -> io::Result<()> { - let metadata = Metadata::new( - bgzf::VirtualPosition::from(610), - bgzf::VirtualPosition::from(1597), - 55, - 0, - ); - - let mut buf = Vec::new(); - write_metadata(&mut buf, &metadata)?; - - let expected = [ - 0x4a, 0x92, 0x00, 0x00, // bin = 37450 - 0x02, 0x00, 0x00, 0x00, // chunks = 2 - 0x62, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // ref_beg = 610 - 0x3d, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // ref_end = 1597 - 0x37, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // n_mapped = 55 - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // n_unmapped = 0 - ]; - - assert_eq!(buf, expected); - - Ok(()) + write_index(&mut self.inner, index) } } diff --git a/noodles-bam/src/bai/io/writer/index.rs b/noodles-bam/src/bai/io/writer/index.rs new file mode 100644 index 000000000..4a5d1d09b --- /dev/null +++ b/noodles-bam/src/bai/io/writer/index.rs @@ -0,0 +1,211 @@ +use std::io::{self, Write}; + +use byteorder::{LittleEndian, WriteBytesExt}; +use noodles_csi::{ + binning_index::{ + index::{ + reference_sequence::{bin::Chunk, index::LinearIndex, Bin, Metadata}, + ReferenceSequence, + }, + ReferenceSequence as _, + }, + BinningIndex, +}; + +use crate::bai::{Index, MAGIC_NUMBER}; + +pub(super) fn write_index(writer: &mut W, index: &Index) -> io::Result<()> +where + W: Write, +{ + writer.write_all(MAGIC_NUMBER)?; + + let n_ref = u32::try_from(index.reference_sequences().len()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32::(n_ref)?; + + for reference_sequence in index.reference_sequences() { + write_reference_sequence(writer, reference_sequence)?; + } + + if let Some(n_no_coor) = index.unplaced_unmapped_record_count() { + writer.write_u64::(n_no_coor)?; + } + + Ok(()) +} + +fn write_reference_sequence( + writer: &mut W, + reference_sequence: &ReferenceSequence, +) -> io::Result<()> +where + W: Write, +{ + let mut n_bin = u32::try_from(reference_sequence.bins().len()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + + if reference_sequence.metadata().is_some() { + n_bin = n_bin + .checked_add(1) + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "n_bin overflow"))?; + } + + writer.write_u32::(n_bin)?; + + for (&id, bin) in reference_sequence.bins() { + write_bin(writer, id, bin)?; + } + + if let Some(metadata) = reference_sequence.metadata() { + write_metadata(writer, metadata)?; + } + + let n_intv = u32::try_from(reference_sequence.index().len()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32::(n_intv)?; + + for interval in reference_sequence.index() { + let ioffset = u64::from(*interval); + writer.write_u64::(ioffset)?; + } + + Ok(()) +} + +fn write_bin(writer: &mut W, id: usize, bin: &Bin) -> io::Result<()> +where + W: Write, +{ + let id = u32::try_from(id).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32::(id)?; + + let n_chunk = u32::try_from(bin.chunks().len()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32::(n_chunk)?; + + for chunk in bin.chunks() { + write_chunk(writer, chunk)?; + } + + Ok(()) +} + +fn write_chunk(writer: &mut W, chunk: &Chunk) -> io::Result<()> +where + W: Write, +{ + let chunk_beg = u64::from(chunk.start()); + writer.write_u64::(chunk_beg)?; + + let chunk_end = u64::from(chunk.end()); + writer.write_u64::(chunk_end)?; + + Ok(()) +} + +fn write_metadata(writer: &mut W, metadata: &Metadata) -> io::Result<()> +where + W: Write, +{ + use crate::bai::DEPTH; + + const METADATA_ID: usize = Bin::metadata_id(DEPTH); + const METADATA_CHUNK_COUNT: usize = 2; + + let id = + u32::try_from(METADATA_ID).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32::(id)?; + + let n_chunk = u32::try_from(METADATA_CHUNK_COUNT) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + writer.write_u32::(n_chunk)?; + + let ref_beg = u64::from(metadata.start_position()); + writer.write_u64::(ref_beg)?; + + let ref_end = u64::from(metadata.end_position()); + writer.write_u64::(ref_end)?; + + let n_mapped = metadata.mapped_record_count(); + writer.write_u64::(n_mapped)?; + + let n_unmapped = metadata.unmapped_record_count(); + writer.write_u64::(n_unmapped)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use noodles_bgzf as bgzf; + + use super::*; + + #[test] + fn test_write_index() -> io::Result<()> { + let chunks = vec![Chunk::new( + bgzf::VirtualPosition::from(509268599425), + bgzf::VirtualPosition::from(509268599570), + )]; + let bins = [(16385, Bin::new(chunks))].into_iter().collect(); + let intervals = vec![bgzf::VirtualPosition::from(337)]; + let reference_sequences = vec![ReferenceSequence::new(bins, intervals, None)]; + let index = Index::builder() + .set_reference_sequences(reference_sequences) + .build(); + + let mut buf = Vec::new(); + write_index(&mut buf, &index)?; + + let mut expected = Vec::new(); + // magic + expected.write_all(MAGIC_NUMBER)?; + // n_ref + expected.write_u32::(1)?; + // n_bin + expected.write_u32::(1)?; + // bin + expected.write_u32::(16385)?; + // n_chunk + expected.write_u32::(1)?; + // chunk_beg + expected.write_u64::(509268599425)?; + // chunk_end + expected.write_u64::(509268599570)?; + // n_intv + expected.write_u32::(1)?; + // ioffset + expected.write_u64::(337)?; + + assert_eq!(buf, expected); + + Ok(()) + } + + #[test] + fn test_write_metadata() -> io::Result<()> { + let metadata = Metadata::new( + bgzf::VirtualPosition::from(610), + bgzf::VirtualPosition::from(1597), + 55, + 0, + ); + + let mut buf = Vec::new(); + write_metadata(&mut buf, &metadata)?; + + let expected = [ + 0x4a, 0x92, 0x00, 0x00, // bin = 37450 + 0x02, 0x00, 0x00, 0x00, // chunks = 2 + 0x62, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // ref_beg = 610 + 0x3d, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // ref_end = 1597 + 0x37, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // n_mapped = 55 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // n_unmapped = 0 + ]; + + assert_eq!(buf, expected); + + Ok(()) + } +}