Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract Functioning #190

Merged
merged 26 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7881627
typo
susan-garry Jun 10, 2024
74726d1
initial chop implementation
susan-garry Jun 14, 2024
0a34f05
test chop, fix off-by-one error
susan-garry Jun 17, 2024
633ab10
test fgfa depth
susan-garry Jun 17, 2024
2407f55
typo
susan-garry Jun 18, 2024
2cfa2fc
add benchmarking for chop, allow shell commands in config.toml (so we…
susan-garry Jun 18, 2024
1cff638
chop now computes new links, but is buggy (as is odgi), testing frame…
susan-garry Jul 6, 2024
9ff0d6f
re-implement chop treating links as bidirectional, tests pass
susan-garry Jul 8, 2024
0609993
clippy
susan-garry Jul 8, 2024
551d286
turnt error messages are verbose
susan-garry Jul 8, 2024
f58b6bd
flatgfa now requires odgi and slow_odgi
susan-garry Jul 8, 2024
0f31491
make fetch-og generates odgi files, which test-flatgfa depends on
susan-garry Jul 8, 2024
7c9cd1b
Get changes to workflow file
susan-garry Jul 8, 2024
ec66a32
flatgfa tests display turnt diffs
susan-garry Jul 8, 2024
62de9fc
tests that rely on odgi use odgi files, avoids unnecessary conversions
susan-garry Jul 8, 2024
3f50bcb
turnt prints stuff
susan-garry Jul 9, 2024
7d2cefc
use latest version of odgi
susan-garry Jul 9, 2024
a9b0032
actually run tests, don't just print turnt commands
susan-garry Jul 9, 2024
71a7e53
simplifications, code comments, and better benchmarking
susan-garry Jul 10, 2024
0e19844
represent the ranges of newly chopped segments as spans; abstracting …
susan-garry Jul 11, 2024
c391fa7
oops
susan-garry Jul 11, 2024
ba6f192
extract for radius (-n) 1, actually normalize mygfa output graphs
susan-garry Jul 11, 2024
9d1803b
full normalize mygfa output
susan-garry Jul 11, 2024
a8202db
extract tests pass for values of c > 1
susan-garry Jul 12, 2024
a78c618
cleanup
susan-garry Jul 12, 2024
9001544
Merge branch 'main' into extract
susan-garry Sep 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ __pycache__/
**/*.degree
**/*.depth
**/*.emit
**/*.extract
**/*.flatten
**/*.flip
**/*.matrix
Expand Down
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,8 @@ test-flatgfa: fetch
-turnt --save -v -e odgi_depth tests/*.gfa
turnt -v -e flatgfa_depth tests/*.gfa

-turnt --save -v -e odgi_extract tests/*.gfa
turnt -v -e flatgfa_extract tests/*.gfa

clean:
-rm tests/*.flatgfa tests/*.inplace.flatgfa tests/*.chop tests/*.depth tests/*.gfa tests/*.og
-rm tests/*.flatgfa tests/*.inplace.flatgfa tests/*.chop tests/*.depth tests/*.extract tests/*.gfa tests/*.og
94 changes: 82 additions & 12 deletions flatgfa/src/cmds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,14 @@ pub struct Extract {
/// number of edges "away" from the node to include
#[argh(option, short = 'c')]
link_distance: usize,

/// maximum number of basepairs allowed between subpaths s.t. the subpaths are merged together
#[argh(option, short = 'd', long = "max-distance-subpaths", default = "300000")]
max_distance_subpaths: usize, // TODO: possibly make this bigger

/// maximum number of iterations before we stop merging subpaths
#[argh(option, short = 'e', long = "max-merging-iterations", default = "6")]
num_iterations: usize // TODO: probably make this smaller
}

pub fn extract(
Expand All @@ -156,7 +164,8 @@ pub fn extract(
let origin_seg = gfa.find_seg(args.seg_name).ok_or("segment not found")?;

let mut subgraph = SubgraphBuilder::new(gfa);
subgraph.extract(origin_seg, args.link_distance);
subgraph.add_header();
subgraph.extract(origin_seg, args.link_distance, args.max_distance_subpaths, args.num_iterations);
Ok(subgraph.store)
}

Expand All @@ -181,6 +190,16 @@ impl<'a> SubgraphBuilder<'a> {
}
}

/// Include the old graph's header
fn add_header(&mut self) {
// pub fn add_header(&mut self, version: &[u8]) {
// assert!(self.header.as_ref().is_empty());
// self.header.add_slice(version);
// }
assert!(self.store.header.as_ref().is_empty());
self.store.header.add_slice(self.old.header.all());
}

/// Add a segment from the source graph to this subgraph.
fn include_seg(&mut self, seg_id: Id<Segment>) {
let seg = &self.old.segs[seg_id];
Expand Down Expand Up @@ -208,6 +227,40 @@ impl<'a> SubgraphBuilder<'a> {
.add_path(name.as_bytes(), steps, std::iter::empty());
}

/// Identify all the subpaths in a path from the original graph that cross through
/// segments in this subgraph and merge them if possible.
fn merge_subpaths(&mut self, path: &flatgfa::Path, max_distance_subpaths: usize) {
// these are subpaths which *aren't* already included in the new graph
let mut cur_subpath_start: Option<usize> = Some(0);
let mut subpath_length = 0;
let mut ignore_path = true;

for (idx, step) in self.old.steps[path.steps].iter().enumerate() {
let in_neighb = self.seg_map.contains_key(&step.segment());

if let (Some(start), true) = (&cur_subpath_start, in_neighb) {
// We just entered the subgraph. End the current subpath.
if !ignore_path && subpath_length <= max_distance_subpaths {
// TODO: type safety
let subpath_span = Span::new(path.steps.start + *start as u32, path.steps.start + idx as u32);
for step in &self.old.steps[subpath_span] {
if !self.seg_map.contains_key(&step.segment()) {
self.include_seg(step.segment());
}
}
}
cur_subpath_start = None;
ignore_path = false;
} else if let (None, false) = (&cur_subpath_start, in_neighb) {
// We've exited the current subgraph, start a new subpath
cur_subpath_start = Some(idx);
}

// Track the current bp position in the path.
subpath_length += self.old.get_handle_seg(*step).len();
}
}

/// Identify all the subpaths in a path from the original graph that cross through
/// segments in this subgraph and add them.
fn find_subpaths(&mut self, path: &flatgfa::Path) {
Expand Down Expand Up @@ -260,17 +313,33 @@ impl<'a> SubgraphBuilder<'a> {
///
/// Include any links between the segments in the neighborhood and subpaths crossing
/// through the neighborhood.
fn extract(&mut self, origin: Id<Segment>, dist: usize) {
fn extract(&mut self, origin: Id<Segment>, dist: usize, max_distance_subpaths: usize, num_iterations: usize) {
self.include_seg(origin);

// Find the set of all segments that are 1 link away.
assert_eq!(dist, 1, "only `-c 1` is implemented so far");
for link in self.old.links.all().iter() {
if let Some(other_seg) = link.incident_seg(origin) {
if !self.seg_map.contains_key(&other_seg) {
self.include_seg(other_seg);
// Find the set of all segments that are c links away.
let mut frontier: Vec<Id<Segment>> = Vec::new();
let mut next_frontier: Vec<Id<Segment>> = Vec::new();
frontier.push(origin);
for _ in 0..dist {
while let Some(seg_id) = frontier.pop() {
for link in self.old.links.all().iter() {
if let Some(other_seg) = link.incident_seg(seg_id) {
// Add other_seg to the frontier set if it is not already in the frontier set or the seg_map
if !self.seg_map.contains_key(&other_seg) {
self.include_seg(other_seg);
next_frontier.push(other_seg);
}
}
}
}
(frontier, next_frontier) = (next_frontier, frontier);
}

// Merge subpaths within max_distance_subpaths bp of each other, num_iterations times
for _ in 0..num_iterations {
for path in self.old.paths.all().iter() {
self.merge_subpaths(path, max_distance_subpaths);
}
}

// Include all links within the subgraph.
Expand Down Expand Up @@ -343,9 +412,10 @@ pub fn chop<'a>(
gfa: &'a flatgfa::FlatGFA<'a>,
args: Chop,
) -> Result<flatgfa::HeapGFAStore, &'static str> {
let mut flat = flatgfa::HeapGFAStore::default();

// when segment S is chopped into segments S1 through S2 (exclusive),
let mut flat = flatgfa::HeapGFAStore::default();

// when segment S is chopped into segments S1 through S2 (exclusive),
// seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). If S is not chopped: S=S1, S2.name = S1.name+1
let mut seg_map: Vec<Span<Segment>> = Vec::new();
// The smallest id (>0) which does not already belong to a segment in `flat`
Expand Down Expand Up @@ -378,14 +448,14 @@ pub fn chop<'a>(
let mut offset = seg.seq.start.index();
let segs_start = flat.segs.next_id();
// Could also generate end_id by setting it equal to the start_id and
// updating it for each segment that is added - only benefits us if we
// updating it for each segment that is added - only benefits us if we
// don't unroll the last iteration of this loop
while offset < seq_end.index() - args.c {
// Generate a new segment of length c
flat.segs.add(Segment {
name: max_node_id,
seq: Span::new(Id::new(offset), Id::new(offset + args.c)),
optional: Span::new_empty(),
optional: Span::new_empty()
});
offset += args.c;
max_node_id += 1;
Expand Down
4 changes: 2 additions & 2 deletions flatgfa/src/flatgfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,8 @@ impl<'a> FlatGFA<'a> {

/// Look up a CIGAR alignment.
pub fn get_alignment(&self, overlap: Span<AlignOp>) -> Alignment {
Alignment {
ops: &self.alignment[overlap],
Alignment {
ops: &self.alignment[overlap]
}
}

Expand Down
17 changes: 11 additions & 6 deletions mygfa/mygfa/gfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@ def rev(self) -> "Link":
return Link(self.to_.rev(), self.from_.rev(), self.overlap)

def __str__(self) -> str:
if self.to_.name < self.from_.name:
return str(self.rev())
if self.from_.name == self.to_.name:
if not self.from_.ori:
return str(self.rev())
return "\t".join(
[
"L",
Expand Down Expand Up @@ -323,10 +328,10 @@ def emit(self, outfile: TextIO, showlinks: bool = True) -> None:
"""Emit a GFA file."""
for header in self.headers:
print(header, file=outfile)
for segment in self.segments.values():
print(str(segment), file=outfile)
for path in self.paths.values():
print(str(path), file=outfile)
for segment in sorted(self.segments.items()):
print(str(segment[1]), file=outfile)
for path in sorted(self.paths.items()):
print(str(path[1]), file=outfile)
if showlinks:
for link in sorted(self.links):
print(str(link), file=outfile)
for link_str in sorted(map(lambda l: str(l), self.links)):
print(link_str, file=outfile)
11 changes: 10 additions & 1 deletion tests/turnt.toml
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,13 @@ output.chop = "-"

[envs.flatgfa_chop]
command = "../flatgfa/target/debug/fgfa -I {filename} chop -l -c 3 | slow_odgi norm"
output.chop = "-"
output.chop = "-"

[envs.odgi_extract]
binary = true
command = "odgi extract -i {filename} -n 3 -c 3 -o - | odgi view -g -i - | slow_odgi norm"
output.extract = "-"

[envs.flatgfa_extract]
command = "../flatgfa/target/debug/fgfa -I {filename} extract -n 3 -c 3 | slow_odgi norm"
output.extract = "-"
Loading