Skip to content

Commit

Permalink
polbin: Actually dump FlatGFA binary files (#152)
Browse files Browse the repository at this point in the history
  • Loading branch information
sampsyo authored Mar 11, 2024
2 parents 5b14d15 + 1793f8d commit 60d001c
Show file tree
Hide file tree
Showing 7 changed files with 197 additions and 43 deletions.
32 changes: 32 additions & 0 deletions polbin/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions polbin/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "0.1.0"
edition = "2021"

[dependencies]
argh = "0.1.12"
bstr = "1.9.1"
gfa = "0.10.1"
memmap = "0.7.0"
Expand Down
78 changes: 72 additions & 6 deletions polbin/src/file.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
use crate::flatgfa;
use zerocopy::{FromBytes, FromZeroes};
use std::mem::{size_of, size_of_val};
use zerocopy::{AsBytes, FromBytes, FromZeroes};

const MAGIC_NUMBER: usize = 0x1337_4915;
const MAGIC_NUMBER: u64 = 0xB101_1054;

#[derive(FromBytes, FromZeroes)]
#[derive(FromBytes, FromZeroes, AsBytes)]
#[repr(packed)]
struct TOC {
magic: usize,
magic: u64,
header_len: usize,
segs_count: usize,
paths_count: usize,
Expand All @@ -25,11 +27,13 @@ fn get_prefix(data: &[u8], len: usize) -> (&[u8], &[u8]) {
(&data[0..len], &data[len..])
}

/// Get a FlatGFA backed by the data in a byte buffer.
pub fn view(data: &[u8]) -> flatgfa::FlatGFA {
// Table of contents.
let toc = TOC::ref_from_prefix(data).unwrap();
let rest = &data[std::mem::size_of::<TOC>()..];
assert_eq!(toc.magic, MAGIC_NUMBER);
let rest = &data[size_of::<TOC>()..];
let magic = toc.magic;
assert_eq!(magic, MAGIC_NUMBER);

// Get slices for each chunk.
let (header, rest) = get_prefix(rest, toc.header_len);
Expand Down Expand Up @@ -58,3 +62,65 @@ pub fn view(data: &[u8]) -> flatgfa::FlatGFA {
line_order,
}
}

/// Serialize `data` at the front of `buf` and hand back the unwritten tail.
///
/// Yields `None` when `write_to_prefix` reports failure (per the zerocopy
/// docs this happens when `buf` is shorter than `data` — confirm for the
/// pinned version).
fn write_bump<'a, 'b, T: AsBytes + ?Sized>(buf: &'a mut [u8], data: &'b T) -> Option<&'a mut [u8]> {
    data.write_to_prefix(buf)?;
    // The write succeeded, so skip past the bytes it occupied.
    let written = size_of_val(data);
    Some(&mut buf[written..])
}

/// Copy `data` to the front of `buf` and return the unwritten remainder.
///
/// Returns `None`, leaving `buf` untouched, when `buf` is too short to hold
/// all of `data`. Previously this case panicked on the slice index, so the
/// `Option` in the signature could never actually be `None`.
fn write_bytes<'a, 'b>(buf: &'a mut [u8], data: &'b [u8]) -> Option<&'a mut [u8]> {
    let len = data.len();
    if buf.len() < len {
        return None;
    }
    // The length check above guarantees `split_at_mut` cannot panic.
    let (head, tail) = buf.split_at_mut(len);
    head.copy_from_slice(data);
    Some(tail)
}

/// Serialize a FlatGFA into a byte buffer.
///
/// The output begins with a `TOC` recording the length of every section,
/// followed by the sections themselves, packed back to back in the order
/// they are written below.
///
/// # Panics
///
/// Panics if `buf` cannot hold all of the data; allocate it with at least
/// `size(gfa)` bytes.
pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) {
    // File header: the magic number plus one length per section.
    let contents = TOC {
        magic: MAGIC_NUMBER,
        header_len: gfa.header.len(),
        segs_count: gfa.segs.len(),
        paths_count: gfa.paths.len(),
        links_count: gfa.links.len(),
        steps_count: gfa.steps.len(),
        seq_data_len: gfa.seq_data.len(),
        overlaps_count: gfa.overlaps.len(),
        alignment_count: gfa.alignment.len(),
        name_data_len: gfa.name_data.len(),
        optional_data_len: gfa.optional_data.len(),
        line_order_len: gfa.line_order.len(),
    };
    let cursor = write_bump(buf, &contents).unwrap();

    // Every write hands back the unwritten tail, which becomes the next cursor.
    let cursor = write_bytes(cursor, gfa.header).unwrap();
    let cursor = write_bump(cursor, gfa.segs).unwrap();
    let cursor = write_bump(cursor, gfa.paths).unwrap();
    let cursor = write_bump(cursor, gfa.links).unwrap();
    let cursor = write_bump(cursor, gfa.steps).unwrap();
    let cursor = write_bytes(cursor, gfa.seq_data).unwrap();
    let cursor = write_bump(cursor, gfa.overlaps).unwrap();
    let cursor = write_bump(cursor, gfa.alignment).unwrap();
    let cursor = write_bytes(cursor, gfa.name_data).unwrap();
    let cursor = write_bytes(cursor, gfa.optional_data).unwrap();

    // Last section: whatever remains of the buffer afterwards is not needed.
    write_bytes(cursor, gfa.line_order).unwrap();
}

/// Compute how many bytes a FlatGFA occupies once serialized.
///
/// A buffer of this length is meant to be large enough for `dump` to write
/// the whole structure into.
pub fn size(gfa: &flatgfa::FlatGFA) -> usize {
    // Sections counted at one byte per element.
    let byte_sections = gfa.header.len()
        + gfa.seq_data.len()
        + gfa.name_data.len()
        + gfa.optional_data.len()
        + gfa.line_order.len();

    // Sections made of fixed-size records: element count times record size.
    let record_sections = gfa.segs.len() * size_of::<flatgfa::Segment>()
        + gfa.paths.len() * size_of::<flatgfa::Path>()
        + gfa.links.len() * size_of::<flatgfa::Link>()
        + gfa.steps.len() * size_of::<flatgfa::Handle>()
        + gfa.overlaps.len() * size_of::<flatgfa::Span>()
        + gfa.alignment.len() * size_of::<flatgfa::AlignOp>();

    size_of::<TOC>() + byte_sections + record_sections
}
42 changes: 24 additions & 18 deletions polbin/src/flatgfa.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use bstr::{BStr, BString};
use num_enum::{IntoPrimitive, TryFromPrimitive};
use zerocopy::{FromBytes, FromZeroes};
use zerocopy::{AsBytes, FromBytes, FromZeroes};

/// An efficient flattened representation of a GFA file.
///
Expand Down Expand Up @@ -65,22 +65,23 @@ pub struct FlatGFA<'a> {
/// useful for creating new ones from scratch.
#[derive(Default)]
pub struct FlatGFAStore {
header: BString,
segs: Vec<Segment>,
paths: Vec<Path>,
links: Vec<Link>,
steps: Vec<Handle>,
seq_data: Vec<u8>,
overlaps: Vec<Span>,
alignment: Vec<AlignOp>,
name_data: BString,
optional_data: BString,
line_order: Vec<u8>,
pub header: BString,
pub segs: Vec<Segment>,
pub paths: Vec<Path>,
pub links: Vec<Link>,
pub steps: Vec<Handle>,
pub seq_data: Vec<u8>,
pub overlaps: Vec<Span>,
pub alignment: Vec<AlignOp>,
pub name_data: BString,
pub optional_data: BString,
pub line_order: Vec<u8>,
}

/// GFA graphs consist of "segment" nodes, which are fragments of base-pair sequences
/// that can be strung together into paths.
#[derive(Debug, FromZeroes, FromBytes)]
#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Segment {
/// The segment's name. We assume all names are just plain numbers.
pub name: usize,
Expand All @@ -93,7 +94,8 @@ pub struct Segment {
}

/// A path is a sequence of oriented references to segments.
#[derive(Debug, FromZeroes, FromBytes)]
#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Path {
/// The path's name. This can be an arbitrary string. It is a range in the
/// `name_data` pool.
Expand All @@ -108,7 +110,8 @@ pub struct Path {
}

/// An allowed edge between two oriented segments.
#[derive(Debug, FromBytes, FromZeroes)]
#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Link {
/// The source of the edge.
pub from: Handle,
Expand All @@ -134,7 +137,8 @@ pub enum Orientation {
/// A Handle refers to the forward (+) or backward (-) orientation for a given segment.
/// So, logically, it consists of a pair of a segment reference (usize) and an
/// orientation (1 bit). We pack the two values into a single word.
#[derive(Debug, FromBytes, FromZeroes)]
#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Handle(usize);

impl Handle {
Expand Down Expand Up @@ -171,7 +175,8 @@ pub enum AlignOpcode {
///
/// Logically, this is a pair of a number and an `AlignOpcode`. We pack the two
/// into a single u32.
#[derive(Debug, FromZeroes, FromBytes)]
#[derive(Debug, FromZeroes, FromBytes, AsBytes)]
#[repr(packed)]
pub struct AlignOp(u32);

impl AlignOp {
Expand Down Expand Up @@ -216,7 +221,8 @@ pub enum LineKind {
///
/// TODO: Consider smaller indices for this, and possibly base/offset instead
/// of start/end.
#[derive(Debug, FromZeroes, FromBytes)]
#[derive(Debug, FromZeroes, FromBytes, AsBytes, Clone, Copy)]
#[repr(packed)]
pub struct Span {
pub start: usize,
pub end: usize,
Expand Down
61 changes: 50 additions & 11 deletions polbin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,62 @@ mod file;
mod flatgfa;
mod parse;
mod print;
use memmap::Mmap;
use argh::FromArgs;
use memmap::{Mmap, MmapMut};

/// Open the file at `name` and memory-map it.
///
/// # Panics
///
/// Panics if the file cannot be opened or the mapping fails.
fn map_file(name: &str) -> Mmap {
    let handle = std::fs::File::open(name).unwrap();
    // NOTE(review): mapping is `unsafe` in memmap; presumably soundness relies
    // on nothing else modifying the file while it is mapped — not enforced here.
    let mapping = unsafe { Mmap::map(&handle) };
    mapping.unwrap()
}

/// Open the file at `name` for reading and writing (creating it if needed),
/// set its length to exactly `size` bytes, and memory-map it mutably.
///
/// # Panics
///
/// Panics if opening, resizing, or mapping the file fails.
fn map_file_mut(name: &str, size: u64) -> MmapMut {
    let mut options = std::fs::OpenOptions::new();
    options.read(true).write(true).create(true);
    let handle = options.open(name).unwrap();

    // Set the length before mapping so the mapping covers `size` bytes.
    handle.set_len(size).unwrap();

    // NOTE(review): mapping is `unsafe` in memmap; presumably soundness relies
    // on nothing else modifying the file while it is mapped — not enforced here.
    let mapping = unsafe { MmapMut::map_mut(&handle) };
    mapping.unwrap()
}

// Command-line options, parsed by argh through the `FromArgs` derive.
// NOTE(review): the `///` doc comments in this struct are presumably surfaced
// by argh as the `--help` text, so treat them as user-facing strings — confirm
// before rewording them.
#[derive(FromArgs)]
/// Convert between GFA text and FlatGFA binary formats.
struct PolBin {
// Optional input path (short flag `-i`). When absent, `main` parses GFA text
// from stdin instead.
/// read from a binary FlatGFA file
#[argh(option, short = 'i')]
input: Option<String>,

// Optional output path (short flag `-o`). When absent, `main` prints the
// graph via `print::print` instead.
/// write to a binary FlatGFA file
#[argh(option, short = 'o')]
output: Option<String>,
}

fn main() {
// Read either GFA text from stdin or a binary file from the first argument.
if let Some(name) = std::env::args().nth(1) {
let mmap = map_file(&name);
let gfa = file::view(&mmap);
print::print(&gfa);
} else {
let stdin = std::io::stdin();
let store = parse::Parser::parse(stdin.lock());
let gfa = store.view();
print::print(&gfa);
let args: PolBin = argh::from_env();

// Load the input from a file (binary) or stdin (text).
let mmap;
let store;
let gfa = match args.input {
Some(name) => {
mmap = map_file(&name);
file::view(&mmap)
}
None => {
let stdin = std::io::stdin();
store = parse::Parser::parse(stdin.lock());
store.view()
}
};

// Write the output to a file (binary) or stdout (text).
match args.output {
Some(name) => {
let mut mmap = map_file_mut(&name, file::size(&gfa) as u64);
file::dump(&gfa, &mut mmap);
mmap.flush().unwrap();
}
None => print::print(&gfa),
}
}
20 changes: 13 additions & 7 deletions polbin/src/print.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ impl<'a> fmt::Display for flatgfa::Alignment<'a> {
}

fn print_step(gfa: &flatgfa::FlatGFA, handle: &flatgfa::Handle) {
let seg = &gfa.segs[handle.segment()];
print!("{}{}", seg.name, handle.orient());
let seg = gfa.segs[handle.segment()];
let name = seg.name;
print!("{}{}", name, handle.orient());
}

fn print_path(gfa: &flatgfa::FlatGFA, path: &flatgfa::Path) {
Expand All @@ -57,18 +58,23 @@ fn print_path(gfa: &flatgfa::FlatGFA, path: &flatgfa::Path) {
}

fn print_link(gfa: &flatgfa::FlatGFA, link: &flatgfa::Link) {
let from = link.from;
let from_name = gfa.segs[from.segment()].name;
let to = link.to;
let to_name = gfa.segs[to.segment()].name;
println!(
"L\t{}\t{}\t{}\t{}\t{}",
gfa.segs[link.from.segment()].name,
link.from.orient(),
gfa.segs[link.to.segment()].name,
link.to.orient(),
from_name,
from.orient(),
to_name,
to.orient(),
gfa.get_alignment(&link.overlap)
);
}

fn print_seg(gfa: &flatgfa::FlatGFA, seg: &flatgfa::Segment) {
print!("S\t{}\t{}", seg.name, gfa.get_seq(seg));
let name = seg.name;
print!("S\t{}\t{}", name, gfa.get_seq(seg));
if !seg.optional.is_empty() {
print!("\t{}", gfa.get_optional_data(seg));
}
Expand Down
6 changes: 5 additions & 1 deletion tests/turnt.toml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,10 @@ binary = true
command = "pollen_data_gen simple {filename} | jq .depth"
output.json = "-"

[envs.polbin_roundtrip]
[envs.polbin_mem]
command = "../polbin/target/debug/polbin < {filename}"
output.gfa = "-"

[envs.polbin_file]
command = "../polbin/target/debug/polbin < {filename} -o {base}.flatgfa ; ../polbin/target/debug/polbin -i {base}.flatgfa"
output.gfa = "-"

0 comments on commit 60d001c

Please sign in to comment.