Skip to content

Commit

Permalink
vcf: Add indexer
Browse files Browse the repository at this point in the history
See #214.
  • Loading branch information
zaeleus committed Nov 10, 2023
1 parent 712c46e commit f5ba063
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 40 deletions.
6 changes: 6 additions & 0 deletions noodles-vcf/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,15 @@

### Added

* vcf: Add indexer (`vcf::index`) ([#214]).

This is a convenience function to index a bgzip-compressed VCF file.

* vcf/header/record/value/map: Add mutable getter for other fields
(`Map::<I>::other_fields_mut`).

[#214]: https://github.com/zaeleus/noodles/issues/214

## 0.45.0 - 2023-11-02

### Changed
Expand Down
40 changes: 1 addition & 39 deletions noodles-vcf/examples/vcf_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,57 +6,19 @@
use std::{
env,
fs::File,
io::{self, BufWriter},
};

use noodles_bgzf as bgzf;
use noodles_core::Position;
use noodles_csi::{self as csi, index::reference_sequence::bin::Chunk};
use noodles_tabix as tabix;
use noodles_vcf as vcf;

fn main() -> io::Result<()> {
let src = env::args().nth(1).expect("missing src");

let mut reader = File::open(src)
.map(bgzf::Reader::new)
.map(vcf::Reader::new)?;

let header = reader.read_header()?;

let mut record = vcf::Record::default();

let mut indexer = tabix::index::Indexer::default();
indexer.set_header(csi::index::header::Builder::vcf().build());

let mut start_position = reader.get_ref().virtual_position();

while reader.read_record(&header, &mut record)? != 0 {
let end_position = reader.get_ref().virtual_position();
let chunk = Chunk::new(start_position, end_position);

let reference_sequence_name = record.chromosome().to_string();
let start = Position::try_from(usize::from(record.position()))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
let end = record
.end()
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
.and_then(|position| {
Position::try_from(usize::from(position))
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
})?;

indexer.add_record(&reference_sequence_name, start, end, chunk)?;

start_position = end_position;
}

let index = indexer.build();
let index = vcf::index(src)?;

let stdout = io::stdout().lock();
let mut writer = tabix::Writer::new(BufWriter::new(stdout));

writer.write_index(&index)?;

Ok(())
Expand Down
51 changes: 51 additions & 0 deletions noodles-vcf/src/indexer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
use std::{fs::File, io, path::Path};

use noodles_bgzf as bgzf;
use noodles_core::Position;
use noodles_csi::{self as csi, index::reference_sequence::bin::Chunk};
use noodles_tabix as tabix;

use super::{Reader, Record};

/// Builds an index for a bgzip-compressed VCF file.
///
/// The file at `src` is opened and wrapped in a BGZF reader and a VCF reader. After the
/// header is read, every record is read in turn and registered with a tabix indexer
/// together with the chunk (the pair of BGZF virtual positions observed before and after
/// reading the record) it occupies.
///
/// # Errors
///
/// Returns an [`io::Error`] if the file cannot be opened, if reading the header or a record
/// fails, or if the indexer rejects a record. A record whose start or end cannot be
/// converted to a [`Position`] yields an error of kind [`io::ErrorKind::InvalidData`].
///
/// # Examples
///
/// ```no_run
/// use noodles_vcf as vcf;
/// let index = vcf::index("sample.vcf.gz")?;
/// # Ok::<_, std::io::Error>(())
/// ```
pub fn index<P>(src: P) -> io::Result<csi::Index>
where
    P: AsRef<Path>,
{
    // Wraps any boxable error as an `InvalidData` I/O error.
    fn invalid_data<E>(e: E) -> io::Error
    where
        E: Into<Box<dyn std::error::Error + Send + Sync>>,
    {
        io::Error::new(io::ErrorKind::InvalidData, e)
    }

    let file = File::open(src)?;
    let mut reader = Reader::new(bgzf::Reader::new(file));
    let header = reader.read_header()?;

    let mut indexer = tabix::index::Indexer::default();
    indexer.set_header(csi::index::header::Builder::vcf().build());

    let mut record = Record::default();

    // The first chunk starts wherever the reader is positioned once the header is consumed.
    let mut chunk_start = reader.get_ref().virtual_position();

    loop {
        // A read of zero bytes signals the end of the input.
        if reader.read_record(&header, &mut record)? == 0 {
            break;
        }

        let chunk_end = reader.get_ref().virtual_position();
        let chunk = Chunk::new(chunk_start, chunk_end);

        let name = record.chromosome().to_string();

        let start = Position::try_from(usize::from(record.position())).map_err(invalid_data)?;

        let raw_end = record.end().map_err(invalid_data)?;
        let end = Position::try_from(usize::from(raw_end)).map_err(invalid_data)?;

        indexer.add_record(&name, start, end, chunk)?;

        // The next record's chunk begins where this one ended.
        chunk_start = chunk_end;
    }

    Ok(indexer.build())
}
3 changes: 2 additions & 1 deletion noodles-vcf/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ mod r#async;

pub mod header;
pub mod indexed_reader;
mod indexer;
pub mod io;
pub mod lazy;
pub mod reader;
Expand All @@ -33,7 +34,7 @@ mod variant_writer;
pub mod writer;

pub use self::{
header::Header, indexed_reader::IndexedReader, reader::Reader, record::Record,
header::Header, indexed_reader::IndexedReader, indexer::index, reader::Reader, record::Record,
variant_reader::VariantReader, variant_writer::VariantWriter, writer::Writer,
};

Expand Down

0 comments on commit f5ba063

Please sign in to comment.