Skip to content

Commit

Permalink
Extract Functioning (#190)
Browse files Browse the repository at this point in the history
  • Loading branch information
susan-garry authored Sep 12, 2024
1 parent ccf1911 commit 7e9f620
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 22 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ __pycache__/
**/*.degree
**/*.depth
**/*.emit
**/*.extract
**/*.flatten
**/*.flip
**/*.matrix
Expand Down
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,8 @@ test-flatgfa: fetch
-turnt --save -v -e odgi_depth tests/*.gfa
turnt -v -e flatgfa_depth tests/*.gfa

-turnt --save -v -e odgi_extract tests/*.gfa
turnt -v -e flatgfa_extract tests/*.gfa

clean:
-rm tests/*.flatgfa tests/*.inplace.flatgfa tests/*.chop tests/*.depth tests/*.gfa tests/*.og
-rm tests/*.flatgfa tests/*.inplace.flatgfa tests/*.chop tests/*.depth tests/*.extract tests/*.gfa tests/*.og
94 changes: 82 additions & 12 deletions flatgfa/src/cmds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,14 @@ pub struct Extract {
/// number of edges "away" from the node to include
#[argh(option, short = 'c')]
link_distance: usize,

/// maximum number of basepairs allowed between subpaths s.t. the subpaths are merged together
#[argh(option, short = 'd', long = "max-distance-subpaths", default = "300000")]
max_distance_subpaths: usize, // TODO: possibly make this bigger

/// maximum number of iterations before we stop merging subpaths
#[argh(option, short = 'e', long = "max-merging-iterations", default = "6")]
num_iterations: usize // TODO: probably make this smaller
}

pub fn extract(
Expand All @@ -156,7 +164,8 @@ pub fn extract(
let origin_seg = gfa.find_seg(args.seg_name).ok_or("segment not found")?;

let mut subgraph = SubgraphBuilder::new(gfa);
subgraph.extract(origin_seg, args.link_distance);
subgraph.add_header();
subgraph.extract(origin_seg, args.link_distance, args.max_distance_subpaths, args.num_iterations);
Ok(subgraph.store)
}

Expand All @@ -181,6 +190,16 @@ impl<'a> SubgraphBuilder<'a> {
}
}

/// Include the old graph's header
fn add_header(&mut self) {
// pub fn add_header(&mut self, version: &[u8]) {
// assert!(self.header.as_ref().is_empty());
// self.header.add_slice(version);
// }
assert!(self.store.header.as_ref().is_empty());
self.store.header.add_slice(self.old.header.all());
}

/// Add a segment from the source graph to this subgraph.
fn include_seg(&mut self, seg_id: Id<Segment>) {
let seg = &self.old.segs[seg_id];
Expand Down Expand Up @@ -208,6 +227,40 @@ impl<'a> SubgraphBuilder<'a> {
.add_path(name.as_bytes(), steps, std::iter::empty());
}

/// Identify all the subpaths in a path from the original graph that cross through
/// segments in this subgraph and merge them if possible.
fn merge_subpaths(&mut self, path: &flatgfa::Path, max_distance_subpaths: usize) {
// these are subpaths which *aren't* already included in the new graph
let mut cur_subpath_start: Option<usize> = Some(0);
let mut subpath_length = 0;
let mut ignore_path = true;

for (idx, step) in self.old.steps[path.steps].iter().enumerate() {
let in_neighb = self.seg_map.contains_key(&step.segment());

if let (Some(start), true) = (&cur_subpath_start, in_neighb) {
// We just entered the subgraph. End the current subpath.
if !ignore_path && subpath_length <= max_distance_subpaths {
// TODO: type safety
let subpath_span = Span::new(path.steps.start + *start as u32, path.steps.start + idx as u32);
for step in &self.old.steps[subpath_span] {
if !self.seg_map.contains_key(&step.segment()) {
self.include_seg(step.segment());
}
}
}
cur_subpath_start = None;
ignore_path = false;
} else if let (None, false) = (&cur_subpath_start, in_neighb) {
// We've exited the current subgraph, start a new subpath
cur_subpath_start = Some(idx);
}

// Track the current bp position in the path.
subpath_length += self.old.get_handle_seg(*step).len();
}
}

/// Identify all the subpaths in a path from the original graph that cross through
/// segments in this subgraph and add them.
fn find_subpaths(&mut self, path: &flatgfa::Path) {
Expand Down Expand Up @@ -260,17 +313,33 @@ impl<'a> SubgraphBuilder<'a> {
///
/// Include any links between the segments in the neighborhood and subpaths crossing
/// through the neighborhood.
fn extract(&mut self, origin: Id<Segment>, dist: usize) {
fn extract(&mut self, origin: Id<Segment>, dist: usize, max_distance_subpaths: usize, num_iterations: usize) {
self.include_seg(origin);

// Find the set of all segments that are 1 link away.
assert_eq!(dist, 1, "only `-c 1` is implemented so far");
for link in self.old.links.all().iter() {
if let Some(other_seg) = link.incident_seg(origin) {
if !self.seg_map.contains_key(&other_seg) {
self.include_seg(other_seg);
// Find the set of all segments that are c links away.
let mut frontier: Vec<Id<Segment>> = Vec::new();
let mut next_frontier: Vec<Id<Segment>> = Vec::new();
frontier.push(origin);
for _ in 0..dist {
while let Some(seg_id) = frontier.pop() {
for link in self.old.links.all().iter() {
if let Some(other_seg) = link.incident_seg(seg_id) {
// Add other_seg to the frontier set if it is not already in the frontier set or the seg_map
if !self.seg_map.contains_key(&other_seg) {
self.include_seg(other_seg);
next_frontier.push(other_seg);
}
}
}
}
(frontier, next_frontier) = (next_frontier, frontier);
}

// Merge subpaths within max_distance_subpaths bp of each other, num_iterations times
for _ in 0..num_iterations {
for path in self.old.paths.all().iter() {
self.merge_subpaths(path, max_distance_subpaths);
}
}

// Include all links within the subgraph.
Expand Down Expand Up @@ -343,9 +412,10 @@ pub fn chop<'a>(
gfa: &'a flatgfa::FlatGFA<'a>,
args: Chop,
) -> Result<flatgfa::HeapGFAStore, &'static str> {
let mut flat = flatgfa::HeapGFAStore::default();

// when segment S is chopped into segments S1 through S2 (exclusive),
let mut flat = flatgfa::HeapGFAStore::default();

// when segment S is chopped into segments S1 through S2 (exclusive),
// seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). If S is not chopped: S=S1, S2.name = S1.name+1
let mut seg_map: Vec<Span<Segment>> = Vec::new();
// The smallest id (>0) which does not already belong to a segment in `flat`
Expand Down Expand Up @@ -378,14 +448,14 @@ pub fn chop<'a>(
let mut offset = seg.seq.start.index();
let segs_start = flat.segs.next_id();
// Could also generate end_id by setting it equal to the start_id and
// updating it for each segment that is added - only benefits us if we
// updating it for each segment that is added - only benefits us if we
// don't unroll the last iteration of this loop
while offset < seq_end.index() - args.c {
// Generate a new segment of length c
flat.segs.add(Segment {
name: max_node_id,
seq: Span::new(Id::new(offset), Id::new(offset + args.c)),
optional: Span::new_empty(),
optional: Span::new_empty()
});
offset += args.c;
max_node_id += 1;
Expand Down
4 changes: 2 additions & 2 deletions flatgfa/src/flatgfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,8 @@ impl<'a> FlatGFA<'a> {

/// Look up a CIGAR alignment.
pub fn get_alignment(&self, overlap: Span<AlignOp>) -> Alignment {
Alignment {
ops: &self.alignment[overlap],
Alignment {
ops: &self.alignment[overlap]
}
}

Expand Down
17 changes: 11 additions & 6 deletions mygfa/mygfa/gfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@ def rev(self) -> "Link":
return Link(self.to_.rev(), self.from_.rev(), self.overlap)

def __str__(self) -> str:
if self.to_.name < self.from_.name:
return str(self.rev())
if self.from_.name == self.to_.name:
if not self.from_.ori:
return str(self.rev())
return "\t".join(
[
"L",
Expand Down Expand Up @@ -323,10 +328,10 @@ def emit(self, outfile: TextIO, showlinks: bool = True) -> None:
"""Emit a GFA file."""
for header in self.headers:
print(header, file=outfile)
for segment in self.segments.values():
print(str(segment), file=outfile)
for path in self.paths.values():
print(str(path), file=outfile)
for segment in sorted(self.segments.items()):
print(str(segment[1]), file=outfile)
for path in sorted(self.paths.items()):
print(str(path[1]), file=outfile)
if showlinks:
for link in sorted(self.links):
print(str(link), file=outfile)
for link_str in sorted(map(lambda l: str(l), self.links)):
print(link_str, file=outfile)
11 changes: 10 additions & 1 deletion tests/turnt.toml
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,13 @@ output.chop = "-"

[envs.flatgfa_chop]
command = "../flatgfa/target/debug/fgfa -I {filename} chop -l -c 3 | slow_odgi norm"
output.chop = "-"
output.chop = "-"

[envs.odgi_extract]
binary = true
command = "odgi extract -i {filename} -n 3 -c 3 -o - | odgi view -g -i - | slow_odgi norm"
output.extract = "-"

[envs.flatgfa_extract]
command = "../flatgfa/target/debug/fgfa -I {filename} extract -n 3 -c 3 | slow_odgi norm"
output.extract = "-"

0 comments on commit 7e9f620

Please sign in to comment.