From 20bdae0bc90738ba374f638ec5c83ef5da18d7a2 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 14 Nov 2024 11:26:19 +0100 Subject: [PATCH 1/8] Add unit test for nested multicomponent chain --- src/snarl_distance_index.hpp | 2 + src/unittest/zip_code_tree.cpp | 100 ++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/src/snarl_distance_index.hpp b/src/snarl_distance_index.hpp index 781b30f670..ebb19f64d7 100644 --- a/src/snarl_distance_index.hpp +++ b/src/snarl_distance_index.hpp @@ -14,6 +14,8 @@ using namespace sdsl; using namespace handlegraph; using namespace bdsg; +//TODO: If anyone ever remakes the distance index, it would be really helpful for the multicomponent chains to know the lengths of each component + //Minimum distance taking a pos instead of id/orientation/offset size_t minimum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, pos_t pos2, bool unoriented_distance = false, const HandleGraph* graph=nullptr); diff --git a/src/unittest/zip_code_tree.cpp b/src/unittest/zip_code_tree.cpp index 1ea69d64f7..15be8a1105 100644 --- a/src/unittest/zip_code_tree.cpp +++ b/src/unittest/zip_code_tree.cpp @@ -2953,9 +2953,107 @@ namespace unittest { zip_forest.validate_zip_forest(dist_index, &seeds, 100); } } + TEST_CASE( "zipcode tree multicomponent chain nested in irregular snarl", + "[zip_tree][bug]" ) { + VG graph; + + Node* n1 = graph.create_node("GCAAAAAAAAAAAAAAAAAAAAAAAAA"); + Node* n2 = graph.create_node("T"); + Node* n3 = graph.create_node("G"); + Node* n4 = graph.create_node("CTGA"); + Node* n5 = graph.create_node("GCA"); + Node* n6 = graph.create_node("T"); + Node* n7 = graph.create_node("T"); + Node* n8 = graph.create_node("TTTTTTTTT"); + Node* n9 = graph.create_node("TTTTTTTTT"); + Node* n10 = graph.create_node("GCAAAAAAAAAAAAA"); + Node* n11 = graph.create_node("TTT"); + Node* n12 = graph.create_node("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"); + Node* n13 = graph.create_node("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"); + + Edge* e1 = graph.create_edge(n1, n2); + Edge* e2 = graph.create_edge(n1, n12); + Edge* e3 = graph.create_edge(n2, n3); + Edge* e4 = graph.create_edge(n2, n10); + Edge* e5 = graph.create_edge(n3, n4); + Edge* e6 = graph.create_edge(n3, n5); + Edge* e7 = graph.create_edge(n5, n6); + Edge* e8 = graph.create_edge(n6, n7, true, false); + Edge* e9 = graph.create_edge(n7, n8); + Edge* e10 = graph.create_edge(n7, n9); + Edge* e11 = graph.create_edge(n8, n9); + Edge* e12 = graph.create_edge(n9, n11); + Edge* e13 = graph.create_edge(n10, n11); + Edge* e14 = graph.create_edge(n10, n10, false, true); + Edge* e15 = graph.create_edge(n11, n12); + Edge* e16 = graph.create_edge(n12, n13); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex dist_index; + fill_in_distance_index(&dist_index, &graph, &snarl_finder); + + //graph.to_dot(cerr); + + SECTION( "Cross unreachable chain" ) { + net_handle_t n = dist_index.get_node_net_handle(n6->id()); + while(!dist_index.is_root(n)) { + cerr << dist_index.net_handle_as_string(n) << endl; + n = dist_index.get_parent(n); + } + + vector> positions; + positions.emplace_back(make_pos_t(n3->id(), false, 0), 0); + positions.emplace_back(make_pos_t(n4->id(), false, 0), 0); + positions.emplace_back(make_pos_t(n5->id(), false, 1), 1); + positions.emplace_back(make_pos_t(n6->id(), false, 0), 2); + positions.emplace_back(make_pos_t(n7->id(), false, 0), 3); + positions.emplace_back(make_pos_t(n8->id(), false, 0), 4); + positions.emplace_back(make_pos_t(n9->id(), false, 0), 5); + + vector seeds; + vector minimizers; + + for (size_t i = 0 ; i < positions.size() ; ++i) { + auto pos = positions[i]; + ZipCode zipcode; + zipcode.fill_in_zipcode(dist_index, pos.first); + zipcode.fill_in_full_decoder(); + seeds.push_back({ pos.first, i, zipcode}); + + minimizers.emplace_back(); + minimizers.back().value.offset = pos.second; + minimizers.back().value.is_reverse = false; + } + VectorView minimizer_vector(minimizers); + + + ZipCodeForest zip_forest; + zip_forest.fill_in_forest(seeds, minimizer_vector, dist_index, 100, 100); + zip_forest.print_self(&seeds, &minimizer_vector); + zip_forest.validate_zip_forest(dist_index, &seeds, 100); + vector seed_order; + for (size_t i = 0 ; i < zip_forest.trees[0].get_tree_size() ; i++) { + if (zip_forest.trees[0].get_item_at_index(i).get_type() == ZipCodeTree::SEED) { + seed_order.emplace_back(zip_forest.trees[0].get_item_at_index(i).get_value()); + } + } + //The seeds should be in order of the chain, which is the order I put them in + if (seed_order.front() == 0) { + for (size_t i = 0 ; i < seed_order.size() ; i++) { + REQUIRE(seed_order[i] == i); + } + } else if (seed_order.front() == 6) { + for (size_t i = 0 ; i < seed_order.size() ; i++) { + REQUIRE(seed_order[i] == 6-i); + } + } else { + REQUIRE((seed_order.front() == 0 || seed_order.front() == 6)); + } + } + } //TODO: we can't deal with this properly yet - //TEST_CASE( "Looping chain zipcode tree", "[zip_tree][bug]" ) { + //TEST_CASE( "Looping chain zipcode tree", "[zip_tree]" ) { // //TODO: This might change but it's a chain 2rev->2rev // VG graph; From 59ca07ef8f6c50e746c5586612138021debb491c Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 14 Nov 2024 16:14:18 +0100 Subject: [PATCH 2/8] Make the zip code tree maker use the prefix sum values in the orientation of the chain instead of the tree itself --- src/unittest/zip_code_tree.cpp | 7 +- src/zip_code_tree.cpp | 119 +++++++++++++++++++++------------ 2 files changed, 76 insertions(+), 50 deletions(-) diff --git a/src/unittest/zip_code_tree.cpp b/src/unittest/zip_code_tree.cpp index 15be8a1105..37ba9dbe52 100644 --- a/src/unittest/zip_code_tree.cpp +++ b/src/unittest/zip_code_tree.cpp @@ -2954,7 +2954,7 @@ namespace unittest { } } TEST_CASE( "zipcode tree multicomponent chain nested in irregular snarl", - "[zip_tree][bug]" ) { + "[zip_tree]" ) { VG graph; Node* n1 = graph.create_node("GCAAAAAAAAAAAAAAAAAAAAAAAAA"); @@ -2995,11 +2995,6 @@ namespace unittest { //graph.to_dot(cerr); SECTION( "Cross unreachable chain" ) { - net_handle_t n = dist_index.get_node_net_handle(n6->id()); - while(!dist_index.is_root(n)) { - cerr << dist_index.net_handle_as_string(n) << endl; - n = dist_index.get_parent(n); - } vector> positions; positions.emplace_back(make_pos_t(n3->id(), false, 0), 0); diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 37f5656e16..1e7167f03f 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -55,7 +55,7 @@ void ZipCodeForest::open_chain(forest_growing_state_t& forest_state, #endif const Seed& current_seed = forest_state.seeds->at(seed_index); - size_t current_max_depth = current_seed.zipcode.max_depth(); + bool is_node = current_seed.zipcode.max_depth() == depth; if (depth == 0) { //If this is the start of a new top-level chain, make a new tree, which will be the new active tree @@ -95,8 +95,10 @@ void ZipCodeForest::open_chain(forest_growing_state_t& forest_state, std::numeric_limits::max(), false); //Remember the start of the chain and its prefix sum value as a child of the chain - forest_state.sibling_indices_at_depth[depth].push_back({ZipCodeTree::CHAIN_START, 0}); - forest_state.sibling_indices_at_depth[depth].back().chain_component = 0; + forest_state.sibling_indices_at_depth[depth].push_back({ZipCodeTree::CHAIN_START, chain_is_reversed ? current_seed.zipcode.get_length(depth, true) + : 0}); + forest_state.sibling_indices_at_depth[depth].back().chain_component = chain_is_reversed && !is_node ? current_seed.zipcode.get_last_chain_component(depth, true) + : 0; //And, if it is the child of a snarl, then remember the chain as a child of the snarl if (depth != 0) { @@ -107,8 +109,33 @@ void ZipCodeForest::open_chain(forest_growing_state_t& forest_state, //chain to the ends of the chains // //Remember the distance to the start of this child in the chain - forest_state.sibling_indices_at_depth[depth-1].back().distances.first - = forest_state.sort_values_by_seed[seed_index].get_distance_value(); + if (chain_is_reversed) { + //If the chain is reversed, then we need to find the distance to the end of the chain from the prefix sum of the seed and the length of the chain + //If the length of the chain is infinite, then this is not the last component of the chain and the distance is infinite + //Otherwise, find the length of the chain/last component and the length of the child, if it is a snarl + size_t chain_length = current_seed.zipcode.get_length(depth, true); + if (chain_length == std::numeric_limits::max()) { + forest_state.sibling_indices_at_depth[depth-1].back().distances.first + = std::numeric_limits::max(); + } else { + forest_state.sibling_indices_at_depth[depth-1].back().distances.first + = SnarlDistanceIndex::minus(chain_length, SnarlDistanceIndex::sum(forest_state.sort_values_by_seed[seed_index].get_distance_value(), + is_node || current_seed.zipcode.get_code_type(depth+1) == ZipCode::NODE ? 0 + : current_seed.zipcode.get_length(depth+1))); + } + } else { + //If the chain is traversed forward, then the value is the prefix sum of the first component + if (!is_node && current_seed.zipcode.get_chain_component(depth+1) != 0) { + //If this isn't the first component, then it is infinite + forest_state.sibling_indices_at_depth[depth-1].back().distances.first + = std::numeric_limits::max(); + } else { + //Otherwise, just the prefix sum + forest_state.sibling_indices_at_depth[depth-1].back().distances.first + = forest_state.sort_values_by_seed[seed_index].get_distance_value(); + + } + } //Remember the opening of this chain, and if its first child was far enough from the start to //start a new subtree @@ -179,8 +206,10 @@ void ZipCodeForest::close_chain(forest_growing_state_t& forest_state, //The value that got stored in forest_state.sibling_indices_at_depth was the prefix sum //traversing the chain according to its orientation in the tree, so either way //the distance is the length of the chain - the prefix sum - size_t distance_to_chain_end = SnarlDistanceIndex::minus(last_seed.zipcode.get_length(depth), - forest_state.sibling_indices_at_depth[depth].back().value); + size_t distance_to_chain_end = chain_is_reversed + ? forest_state.sibling_indices_at_depth[depth].back().value + : SnarlDistanceIndex::minus(last_seed.zipcode.get_length(depth), + forest_state.sibling_indices_at_depth[depth].back().value); bool add_distances = true; if (distance_to_chain_end > forest_state.distance_limit && forest_state.open_chains.back().second) { //If the distance to the end is greater than the distance limit, and there was something @@ -325,12 +354,7 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, } else { //Otherwise, get the distance to the start or end of the chain - current_offset = chain_is_reversed - ? SnarlDistanceIndex::minus(current_seed.zipcode.get_length(chain_depth) , - SnarlDistanceIndex::sum( - current_seed.zipcode.get_offset_in_chain(depth), - current_seed.zipcode.get_length(depth))) - : current_seed.zipcode.get_offset_in_chain(depth); + current_offset = current_seed.zipcode.get_offset_in_chain(depth); } @@ -345,13 +369,19 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, ///////////////////// Record the distance from the previous thing in the chain/node // Or add a new tree if the distance is too far if (chain_depth > 0 && forest_state.sibling_indices_at_depth[chain_depth][0].type == ZipCodeTree::CHAIN_START){ + //If this is the first thing in a non-root chain or node, remember the distance to the //start of the chain/node. //This distance will be added to distances in the parent snarl - forest_state.sibling_indices_at_depth[chain_depth-1][0].distances.first = current_offset; + forest_state.sibling_indices_at_depth[chain_depth-1][0].distances.first = chain_is_reversed + ? SnarlDistanceIndex::minus(current_seed.zipcode.get_length(chain_depth, true), + SnarlDistanceIndex::sum(current_offset, + (is_trivial_chain || current_type == ZipCode::NODE ? 0 : current_seed.zipcode.get_length(chain_depth+1)))) + : current_offset; - //Also update the last chain opened - forest_state.open_chains.back().second = current_offset > forest_state.distance_limit; + //Update the last chain opened + forest_state.open_chains.back().second = std::max(current_offset, previous_offset) - std::min(current_offset, previous_offset) + > forest_state.distance_limit; } else if (forest_state.sibling_indices_at_depth[chain_depth][0].type != ZipCodeTree::CHAIN_START) { @@ -362,7 +392,7 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, //If the parent is a multicomponent chain, then they might be in different components distance_between = std::numeric_limits::max(); } else { - distance_between = current_offset - previous_offset; + distance_between = std::max(current_offset, previous_offset) - std::min(current_offset, previous_offset); } if (chain_depth == 0 && distance_between > forest_state.distance_limit) { @@ -399,8 +429,10 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, //The first sibling in the chain is now the chain start, not the previous seed, so replace it forest_state.sibling_indices_at_depth[chain_depth].pop_back(); - forest_state.sibling_indices_at_depth[chain_depth].push_back({ZipCodeTree::CHAIN_START, 0}); - forest_state.sibling_indices_at_depth[chain_depth].back().chain_component = 0; + forest_state.sibling_indices_at_depth[chain_depth].push_back({ZipCodeTree::CHAIN_START, chain_is_reversed ? current_seed.zipcode.get_length(chain_depth, true) + : 0}); + forest_state.sibling_indices_at_depth[chain_depth].back().chain_component = !is_trivial_chain ? current_seed.zipcode.get_last_chain_component(chain_depth, true) + : 0; } else if (distance_between > forest_state.distance_limit) { //If this is too far from the previous thing, but inside a snarl @@ -447,7 +479,12 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, assert(forest_state.open_chains.back().second); #endif - forest_state.sibling_indices_at_depth[chain_depth-1].back().distances.first = current_offset; + + forest_state.sibling_indices_at_depth[chain_depth-1].back().distances.first = chain_is_reversed + ? SnarlDistanceIndex::minus(current_seed.zipcode.get_length(chain_depth, true), + SnarlDistanceIndex::sum(current_offset, + (is_trivial_chain || current_type == ZipCode::NODE ? 0 : current_seed.zipcode.get_length(chain_depth+1)))) + : current_offset; //Don't need to update open_chains, since the next slice will also start at the chain start and be able to make //a new thing @@ -706,8 +743,10 @@ void ZipCodeForest::close_snarl(forest_growing_state_t& forest_state, assert(trees[forest_state.active_tree_index].zip_code_tree.back().get_type() == ZipCodeTree::CHAIN_START); #endif forest_state.sibling_indices_at_depth[depth-1].pop_back(); - forest_state.sibling_indices_at_depth[depth-1].push_back({ ZipCodeTree::CHAIN_START, 0}); - forest_state.sibling_indices_at_depth[depth-1].back().chain_component = 0; + forest_state.sibling_indices_at_depth[depth-1].push_back({ ZipCodeTree::CHAIN_START, forest_state.open_intervals[forest_state.open_intervals.size()-2].is_reversed ? last_seed.zipcode.get_length(depth-1, true) + : 0}); + forest_state.sibling_indices_at_depth[depth-1].back().chain_component = last_seed.zipcode.get_last_chain_component(depth-1, true); + } } else { @@ -2048,11 +2087,6 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, size_t max_sort_value = 0; size_t min_sort_value = std::numeric_limits::max(); - - //If this interval is a chain or node and it is being traversed backwards, save the prefix sum values facing backwards - //If it is a root chain or node, it won't be reversed anyway - bool order_is_reversed = interval.is_reversed && (interval.code_type == ZipCode::CHAIN || interval.code_type == ZipCode::NODE); - //The min and max chain components size_t min_component = 0; size_t max_component = 0; @@ -2081,7 +2115,7 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, : offset(seed.pos)) << endl;; #endif sort_values_by_seed[zipcode_sort_order[i]].set_sort_value( - is_rev(seed.pos) != order_is_reversed ? seed.zipcode.get_length(interval.depth) - offset(seed.pos) + is_rev(seed.pos) ? seed.zipcode.get_length(interval.depth) - offset(seed.pos) : offset(seed.pos)); sort_values_by_seed[zipcode_sort_order[i]].set_code_type(ZipCode::NODE); @@ -2095,10 +2129,7 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, // and 2 will be added to the node with an offset in the node of 0 (node 3 if the chain is traversed forward) // See sort_value_t for more details - size_t prefix_sum = order_is_reversed ? SnarlDistanceIndex::minus(seed.zipcode.get_length(interval.depth), - SnarlDistanceIndex::sum( seed.zipcode.get_offset_in_chain(interval.depth+1), - seed.zipcode.get_length(interval.depth+1))) - : seed.zipcode.get_offset_in_chain(interval.depth+1); + size_t prefix_sum = seed.zipcode.get_offset_in_chain(interval.depth+1); ZipCode::code_type_t child_type = seed.zipcode.get_code_type(interval.depth+1); sort_values_by_seed[zipcode_sort_order[i]].set_code_type(child_type); @@ -2117,7 +2148,7 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, } else { //If this is a node, then the order depends on where the position falls in the node bool node_is_rev = seed.zipcode.get_is_reversed_in_parent(interval.depth+1) != is_rev(seed.pos); - node_is_rev = order_is_reversed ? !node_is_rev : node_is_rev; + node_is_rev = node_is_rev; size_t node_offset = node_is_rev ? seed.zipcode.get_length(interval.depth+1) - offset(seed.pos) : offset(seed.pos); @@ -2202,7 +2233,7 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, cerr << "Sort by chain component" << endl; #endif //Sort by component using radix sort. I doubt that there will be enough components to make it more efficient to use the default sort - radix_sort_zipcodes(zipcode_sort_order, sort_values_by_seed, interval, order_is_reversed ? false : interval.is_reversed, min_component, max_component, true); + radix_sort_zipcodes(zipcode_sort_order, sort_values_by_seed, interval, interval.is_reversed, min_component, max_component, true); //Now get the next intervals in sub_intervals size_t start = interval.interval_start; @@ -2229,7 +2260,7 @@ void ZipCodeForest::sort_one_interval(forest_growing_state_t& forest_state, for (auto& sub_interval : sub_intervals) { //Snarls are already sorted by a topological order of the orientation of the zip tree, so don't reverse them //And don't reverse the sort if that has already been taken into account in the value finding - bool reverse_order = (sub_interval.code_type == ZipCode::REGULAR_SNARL || sub_interval.code_type == ZipCode::IRREGULAR_SNARL || order_is_reversed) + bool reverse_order = (sub_interval.code_type == ZipCode::REGULAR_SNARL || sub_interval.code_type == ZipCode::IRREGULAR_SNARL) ? false : sub_interval.is_reversed; if (use_radix) { @@ -2384,9 +2415,9 @@ void ZipCodeForest::radix_sort_zipcodes(vector& zipcode_sort_order, cons #ifdef DEBUG_ZIP_CODE_SORTING assert(sort_value >= min_value); assert(sort_value <= max_value); - cerr << "Sort value for seed " << seeds->at(zipcode_sort_order[i]).pos << ": " - << sort_value << endl; - assert(counts.size() > sort_values_by_seed[zipcode_sort_order[i]].get_sort_value() - min_value + 1); + //cerr << "Sort value for seed " << seeds->at(zipcode_sort_order[i]).pos << ": " + // << sort_value << endl; + assert(counts.size() > sort_value - min_value + 1); #endif size_t next_rank = sort_value - min_value + 1; @@ -3219,12 +3250,12 @@ void ZipCodeForest::get_cyclic_snarl_intervals( forest_growing_state_t& forest_s #ifdef DEBUG_ZIP_CODE_SORTING assert(new_sort_order.size() == (snarl_interval.interval_end - snarl_interval.interval_start)); cerr << "New sort order " << endl; - for (auto& interval : new_intervals) { - for (size_t i = interval.interval_start ; i < interval.interval_end ; i++) { - cerr << seeds->at(zipcode_sort_order[i]).pos << ", "; - } - cerr << "|"; - } + //for (auto& interval : new_intervals) { + // for (size_t i = interval.interval_start ; i < interval.interval_end ; i++) { + // cerr << seeds->at(zipcode_sort_order[i]).pos << ", "; + // } + // cerr << "|"; + //} cerr << endl; #endif From 39f2d621734b928da0ab25a31aa8155ec9567d12 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 15 Nov 2024 11:03:26 +0100 Subject: [PATCH 3/8] Flip runs of a cyclic snarl to take into account the new order of the prefix sum --- src/zip_code_tree.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 1e7167f03f..acf05e96a8 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -3208,6 +3208,10 @@ void ZipCodeForest::get_cyclic_snarl_intervals( forest_growing_state_t& forest_s //Now decide which direction the run is traversed in bool run_is_traversed_backwards = run_correlation < 0.0; + //If the chain is reversed, then the prefix sum values are all flipped, so the correlation is flipped + if (run.is_reversed) { + run_is_traversed_backwards = !run_is_traversed_backwards; + } reverse_run = run_is_traversed_backwards != snarl_is_traversed_backwards; } From 8413d3b8ce75fca44cbb64b68268affed95ba8f5 Mon Sep 17 00:00:00 2001 From: Xian Date: Fri, 15 Nov 2024 13:02:49 +0100 Subject: [PATCH 4/8] Add unit tests for what I thought would fail but looks like the bug I thought I made won't happen --- src/unittest/zip_code_tree.cpp | 250 +++++++++++++++++++++++++++++++++ src/zip_code_tree.cpp | 8 ++ src/zip_code_tree.hpp | 5 + 3 files changed, 263 insertions(+) diff --git a/src/unittest/zip_code_tree.cpp b/src/unittest/zip_code_tree.cpp index 37ba9dbe52..02e2656e23 100644 --- a/src/unittest/zip_code_tree.cpp +++ b/src/unittest/zip_code_tree.cpp @@ -1224,6 +1224,256 @@ namespace unittest { zip_forest.validate_zip_forest(distance_index, &seeds, 4); } } + TEST_CASE( "zip tree bubble nested in inversion", "[zip_tree]" ) { + + VG graph; + + Node* n1 = graph.create_node("GCAAAAAAAAAA"); + Node* n2 = graph.create_node("GCAA"); + Node* n3 = graph.create_node("GCAG"); + Node* n4 = graph.create_node("GCA"); + Node* n5 = graph.create_node("GCAAAAAAAAAAAAAAA"); + + Edge* e1 = graph.create_edge(n1, n2); + Edge* e2 = graph.create_edge(n1, n4, false, true); + Edge* e3 = graph.create_edge(n2, n3); + Edge* e4 = graph.create_edge(n2, n4); + Edge* e5 = graph.create_edge(n2, n5, true, false); + Edge* e6 = graph.create_edge(n3, n4); + Edge* e7 = graph.create_edge(n4, n5); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + + + + //graph.to_dot(cerr); + + SECTION( "Traverse nested chain forwards but the orientation of the chain is backwards in the snarl tree" ) { + + vector> positions; + positions.emplace_back(make_pos_t(1, false, 0), 0); + positions.emplace_back(make_pos_t(1, false, 5), 5); + positions.emplace_back(make_pos_t(2, false, 0), 7); + positions.emplace_back(make_pos_t(3, false, 0), 9); + positions.emplace_back(make_pos_t(4, false, 0), 12); + positions.emplace_back(make_pos_t(5, false, 0), 15); + positions.emplace_back(make_pos_t(5, false, 4), 19); + //all are in the same cluster + vector seeds; + vector minimizers; + for (size_t i= 0 ; i < positions.size() ; i++ ) { + auto pos = positions[i]; + ZipCode zipcode; + zipcode.fill_in_zipcode(distance_index, pos.first); + zipcode.fill_in_full_decoder(); + seeds.push_back({ pos.first, i, zipcode}); + + minimizers.emplace_back(); + minimizers.back().value.offset = pos.second; + minimizers.back().value.is_reverse = false; + } + + VectorView minimizer_vector(minimizers); + + + ZipCodeForest zip_forest; + zip_forest.fill_in_forest(seeds, minimizer_vector, distance_index, std::numeric_limits::max()); + REQUIRE(zip_forest.trees.size() == 1); + zip_forest.print_self(&seeds, &minimizer_vector); + zip_forest.validate_zip_forest(distance_index, &seeds); + + + bool chain_is_reversed = distance_index.is_reversed_in_parent(distance_index.get_node_net_handle(n1->id())) != + distance_index.is_reversed_in_parent(distance_index.get_node_net_handle(n2->id())); + if (chain_is_reversed) { + + vector seed_order; + for (size_t i = 0 ; i < zip_forest.trees[0].get_tree_size() ; i++) { + if (zip_forest.trees[0].get_item_at_index(i).get_type() == ZipCodeTree::SEED) { + seed_order.emplace_back(zip_forest.trees[0].get_item_at_index(i).get_value()); + } + } + //The seeds should be in the same order as the original list of seeds, but the orientation depends on the orientation of the top-level chain so either way is fine + if (seed_order.front() == 0) { + for (size_t i = 0 ; i < seed_order.size() ; i++) { + REQUIRE(seed_order[i] == i); + } + } else if (seed_order.front() == 5) { + for (size_t i = 0 ; i < seed_order.size() ; i++) { + REQUIRE(seed_order[i] == 5-i); + } + } else { + REQUIRE((seed_order.front() == 0 || seed_order.front() == 5)); + } + } else { + //This unit test is for testing the nested chain going backwards so if it isn't it should be rewritten + //Chain 2->4 should be traversed backwards in the snarl tree + //It doesn't matter which direction chain 1->5 is going + cerr << "This test isn't testing the thing its supposed to test because the snarl finder put the chain in a different orientation. So it should probably be rewritten" << endl; + } + + } + SECTION( "Traverse nested chain backwards but the orientation of the chain is backwards in the snarl tree" ) { + + vector> positions; + positions.emplace_back(make_pos_t(1, false, 0), 0); + positions.emplace_back(make_pos_t(1, false, 5), 5); + positions.emplace_back(make_pos_t(4, false, 0), 7); + positions.emplace_back(make_pos_t(3, false, 0), 9); + positions.emplace_back(make_pos_t(2, false, 0), 12); + positions.emplace_back(make_pos_t(5, false, 0), 15); + positions.emplace_back(make_pos_t(5, false, 4), 19); + //all are in the same cluster + vector seeds; + vector minimizers; + for (size_t i= 0 ; i < positions.size() ; i++ ) { + auto pos = positions[i]; + ZipCode zipcode; + zipcode.fill_in_zipcode(distance_index, pos.first); + zipcode.fill_in_full_decoder(); + seeds.push_back({ pos.first, i, zipcode}); + + minimizers.emplace_back(); + minimizers.back().value.offset = pos.second; + minimizers.back().value.is_reverse = false; + } + + VectorView minimizer_vector(minimizers); + + + ZipCodeForest zip_forest; + zip_forest.fill_in_forest(seeds, minimizer_vector, distance_index, std::numeric_limits::max()); + REQUIRE(zip_forest.trees.size() == 1); + zip_forest.print_self(&seeds, &minimizer_vector); + zip_forest.validate_zip_forest(distance_index, &seeds); + + + bool chain_is_reversed = distance_index.is_reversed_in_parent(distance_index.get_node_net_handle(n1->id())) != + distance_index.is_reversed_in_parent(distance_index.get_node_net_handle(n2->id())); + if (chain_is_reversed) { + + vector seed_order; + for (size_t i = 0 ; i < zip_forest.trees[0].get_tree_size() ; i++) { + if (zip_forest.trees[0].get_item_at_index(i).get_type() == ZipCodeTree::SEED) { + seed_order.emplace_back(zip_forest.trees[0].get_item_at_index(i).get_value()); + } + } + //The seeds should be in the same order as the original list of seeds, but the orientation depends on the orientation of the top-level chain so either way is fine + if (seed_order.front() == 0) { + for (size_t i = 0 ; i < seed_order.size() ; i++) { + REQUIRE(seed_order[i] == i); + } + } else if (seed_order.front() == 5) { + for (size_t i = 0 ; i < seed_order.size() ; i++) { + REQUIRE(seed_order[i] == 5-i); + } + } else { + REQUIRE((seed_order.front() == 0 || seed_order.front() == 5)); + } + } else { + //This unit test is for testing the nested chain going backwards so if it isn't it should be rewritten + //Chain 2->4 should be traversed backwards in the snarl tree + //It doesn't matter which direction chain 1->5 is going + cerr << "This test isn't testing the thing its supposed to test because the snarl finder put the chain in a different orientation. So it should probably be rewritten" << endl; + } + + } + + } + TEST_CASE( "zip tree bubble nested in cyclic snarl", "[zip_tree]" ) { + + VG graph; + + Node* n1 = graph.create_node("GCAAAAAAAAAA"); + Node* n2 = graph.create_node("GCAA"); + Node* n3 = graph.create_node("GCAG"); + Node* n4 = graph.create_node("GCA"); + Node* n5 = graph.create_node("GCAAAAAAAAAAAAAAA"); + + Edge* e1 = graph.create_edge(n1, n2); + Edge* e2 = graph.create_edge(n1, n5); + Edge* e3 = graph.create_edge(n2, n3); + Edge* e4 = graph.create_edge(n2, n4); + Edge* e5 = graph.create_edge(n5, n5, true, false); + Edge* e6 = graph.create_edge(n3, n4); + Edge* e7 = graph.create_edge(n4, n5); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + + + + //graph.to_dot(cerr); + + SECTION( "Traverse nested chain forwards but the orientation of the chain is backwards in the snarl tree" ) { + + vector> positions; + positions.emplace_back(make_pos_t(1, false, 0), 0); + positions.emplace_back(make_pos_t(1, false, 5), 5); + positions.emplace_back(make_pos_t(2, false, 0), 7); + positions.emplace_back(make_pos_t(3, false, 0), 9); + positions.emplace_back(make_pos_t(4, false, 0), 12); + positions.emplace_back(make_pos_t(5, false, 0), 15); + positions.emplace_back(make_pos_t(5, false, 4), 19); + //all are in the same cluster + vector seeds; + vector minimizers; + for (size_t i= 0 ; i < positions.size() ; i++ ) { + auto pos = positions[i]; + ZipCode zipcode; + zipcode.fill_in_zipcode(distance_index, pos.first); + zipcode.fill_in_full_decoder(); + seeds.push_back({ pos.first, i, zipcode}); + + minimizers.emplace_back(); + minimizers.back().value.offset = pos.second; + minimizers.back().value.is_reverse = false; + } + + VectorView minimizer_vector(minimizers); + + + ZipCodeForest zip_forest; + zip_forest.fill_in_forest(seeds, minimizer_vector, distance_index, std::numeric_limits::max()); + REQUIRE(zip_forest.trees.size() == 1); + zip_forest.print_self(&seeds, &minimizer_vector); + zip_forest.validate_zip_forest(distance_index, &seeds); + + + bool chain_is_reversed = distance_index.is_reversed_in_parent(distance_index.get_node_net_handle(n1->id())) != + distance_index.is_reversed_in_parent(distance_index.get_node_net_handle(n2->id())); + if (chain_is_reversed) { + + vector seed_order; + for (size_t i = 0 ; i < zip_forest.trees[0].get_tree_size() ; i++) { + if (zip_forest.trees[0].get_item_at_index(i).get_type() == ZipCodeTree::SEED) { + seed_order.emplace_back(zip_forest.trees[0].get_item_at_index(i).get_value()); + } + } + //The seeds should be in the same order as the original list of seeds, but the orientation depends on the orientation of the top-level chain so either way is fine + if (seed_order.front() == 0) { + for (size_t i = 0 ; i < seed_order.size() ; i++) { + REQUIRE(seed_order[i] == i); + } + } else if (seed_order.front() == 5) { + for (size_t i = 0 ; i < seed_order.size() ; i++) { + REQUIRE(seed_order[i] == 5-i); + } + } else { + REQUIRE((seed_order.front() == 0 || seed_order.front() == 5)); + } + } else { + //This unit test is for testing the nested chain going backwards so if it isn't it should be rewritten + //Chain 2->4 should be traversed backwards in the snarl tree + //It doesn't matter which direction chain 1->5 is going + cerr << "This test isn't testing the thing its supposed to test because the snarl finder put the chain in a different orientation. So it should probably be rewritten" << endl; + } + + } + } TEST_CASE( "zip tree snarl with inversion", "[zip_tree]" ) { VG graph; diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index acf05e96a8..236f007ce5 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -3209,6 +3209,7 @@ void ZipCodeForest::get_cyclic_snarl_intervals( forest_growing_state_t& forest_s //Now decide which direction the run is traversed in bool run_is_traversed_backwards = run_correlation < 0.0; //If the chain is reversed, then the prefix sum values are all flipped, so the correlation is flipped + //I'm not sure if the chain will ever be reversed though, I can't seem to make a unit test that makes it reversed in a snarl if (run.is_reversed) { run_is_traversed_backwards = !run_is_traversed_backwards; } @@ -3218,6 +3219,10 @@ void ZipCodeForest::get_cyclic_snarl_intervals( forest_growing_state_t& forest_s } if (!reverse_run) { +#ifdef DEBUG_ZIP_CODE_TREE + cerr << "Go through the run forwards" << endl; +#endif + //If we can only go forwards through the run or //if the read is going through the snarl and partition in the same direction for (size_t sort_i : run_seeds) { @@ -3239,6 +3244,9 @@ void ZipCodeForest::get_cyclic_snarl_intervals( forest_growing_state_t& forest_s } } else { +#ifdef DEBUG_ZIP_CODE_TREE + cerr << "Go through the run backwards" << endl; +#endif //If the read is going through the run in the opposite direction as the snarl, then flip it for (int i = run_seeds.size()-1 ; i >= 0 ; --i) { new_sort_order.push_back(zipcode_sort_order[snarl_interval.interval_start+run_seeds[i]]); diff --git a/src/zip_code_tree.hpp b/src/zip_code_tree.hpp index 7c65ec01b1..d6b116bb0b 100644 --- a/src/zip_code_tree.hpp +++ b/src/zip_code_tree.hpp @@ -729,6 +729,11 @@ class ZipCodeForest { /// Sorts the given interval (which must contain seeds on the same snarl/chain/node at the given /// depth) Sorting is roughly linear along the top-level chains, in a topological-ish order in /// snarls. Uses radix_sort_zipcodes and default_sort_zipcodes + /// For chains, everything is sorted with the prefix sum value of the chain itself from the distance index, + /// not the order in the chain in the zip code tree. Everything will be sorted in the order of the zip + /// code tree, but the values will be set from the distance index. This means that later, the values + /// may be out of order or may need to be subtracted from the length of the chain to get the distances + /// to the ends of the chain void sort_one_interval(forest_growing_state_t& forest_state, const interval_state_t& interval) const; From 95dae9ee0fe5f7ac313cd456459e25eb4b241418 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Tue, 19 Nov 2024 06:41:31 -0800 Subject: [PATCH 5/8] Get the prefix sum to the right side of a snarl in a chain --- src/zip_code_tree.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 236f007ce5..715f6b1f80 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -356,6 +356,10 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, current_offset = current_seed.zipcode.get_offset_in_chain(depth); } + if (chain_is_reversed && !(current_type == ZipCode::NODE || current_type == ZipCode::ROOT_NODE || is_trivial_chain)) { + //If we are adding a snarl and the chain is being traversed backwards, then make sure the prefix sum is going to the right end of the snarl + current_offset = SnarlDistanceIndex::sum(current_offset, current_seed.zipcode.get_length(depth)); + } /////////////////////// Get the offset of the previous thing in the parent chain/node @@ -579,8 +583,9 @@ void ZipCodeForest::add_child_to_chain(forest_growing_state_t& forest_state, //For finding the distance to the next thing in the chain, the offset //stored should be the offset of the end bound of the snarl, so add the //length of the snarl - current_offset = SnarlDistanceIndex::sum(current_offset, - current_seed.zipcode.get_length(depth)); + current_offset = chain_is_reversed + ? SnarlDistanceIndex::minus(current_offset, current_seed.zipcode.get_length(depth)) + : SnarlDistanceIndex::sum(current_offset, current_seed.zipcode.get_length(depth)); } From 0c2c1333adb243015ac67e142c14d105d3de2ad2 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Tue, 19 Nov 2024 13:08:38 -0800 Subject: [PATCH 6/8] Fix getting snarl offsets in children of cyclic snarls to match the old version --- src/zip_code_tree.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 715f6b1f80..7fb76f9a9b 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -3005,8 +3005,20 @@ void ZipCodeForest::get_cyclic_snarl_intervals( forest_growing_state_t& forest_s bool is_reversed_read = minimizer.value.is_reverse; size_t read_offset = minimizer.value.offset; size_t chain_offset = sort_values_by_seed[zipcode_sort_order[sort_i]].get_distance_value(); + size_t chain_offset_plus_snarl = chain_offset; + + //If the child interval is + ZipCode::code_type_t chain_child_type = seed.zipcode.get_code_type(snarl_interval.depth+2); + if (chain_child_type == ZipCode::REGULAR_SNARL || chain_child_type == ZipCode::IRREGULAR_SNARL || + chain_child_type == ZipCode::CYCLIC_SNARL ) { + //If we're traversing the chain backwards, get the offset as the distance to the other end of the snarl + size_t child_length = seed.zipcode.get_length(snarl_interval.depth+2); + + chain_offset_plus_snarl = SnarlDistanceIndex::sum(chain_offset, child_length); + chain_offset = chain_offset_plus_snarl; + } #ifdef DEBUG_ZIP_CODE_TREE - cerr << "AT SEED: " << seed.pos << " with chain offset " << chain_offset << " and read offset " << read_offset << endl; + cerr << "AT SEED: " << seed.pos << " with chain offset " << chain_offset << " to " << chain_offset_plus_snarl << " and read offset " << read_offset << endl; #endif //Remember the values for finding the correlation later @@ -3020,7 +3032,8 @@ void ZipCodeForest::get_cyclic_snarl_intervals( forest_growing_state_t& forest_s //Make a new run for the seed, to be updated with anything combined with it run_t seed_run({sort_i - snarl_interval.interval_start, read_offset, read_offset, - chain_offset, chain_offset, + std::min(chain_offset, chain_offset_plus_snarl), + std::max(chain_offset, chain_offset_plus_snarl), interval_i, child_interval.depth, child_interval.code_type, From 2c63e4722228b07158e624a5de16d86c1976445d Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Tue, 19 Nov 2024 13:09:12 -0800 Subject: [PATCH 7/8] Get snarl offsets as a range in the child of a cyclic snarl --- src/zip_code_tree.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index 7fb76f9a9b..b8a27da074 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -3015,7 +3015,6 @@ void ZipCodeForest::get_cyclic_snarl_intervals( forest_growing_state_t& forest_s size_t child_length = seed.zipcode.get_length(snarl_interval.depth+2); chain_offset_plus_snarl = SnarlDistanceIndex::sum(chain_offset, child_length); - chain_offset = chain_offset_plus_snarl; } #ifdef DEBUG_ZIP_CODE_TREE cerr << "AT SEED: " << seed.pos << " with chain offset " << chain_offset << " to " << chain_offset_plus_snarl << " and read offset " << read_offset << endl; From a3d55b08b4ada9776bd9030a78b0aa6552a40c43 Mon Sep 17 00:00:00 2001 From: Xian Chang Date: Wed, 20 Nov 2024 02:45:04 -0800 Subject: [PATCH 8/8] Fix bug getting the grandchild of a snarl --- src/zip_code_tree.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/zip_code_tree.cpp b/src/zip_code_tree.cpp index b8a27da074..e095ce7bc6 100644 --- a/src/zip_code_tree.cpp +++ b/src/zip_code_tree.cpp @@ -3008,7 +3008,9 @@ void ZipCodeForest::get_cyclic_snarl_intervals( forest_growing_state_t& forest_s size_t chain_offset_plus_snarl = chain_offset; //If the child interval is - ZipCode::code_type_t chain_child_type = seed.zipcode.get_code_type(snarl_interval.depth+2); + ZipCode::code_type_t chain_child_type = seed.zipcode.max_depth() < snarl_interval.depth+2 + ? ZipCode::NODE + : seed.zipcode.get_code_type(snarl_interval.depth+2); if (chain_child_type == ZipCode::REGULAR_SNARL || chain_child_type == ZipCode::IRREGULAR_SNARL || chain_child_type == ZipCode::CYCLIC_SNARL ) { //If we're traversing the chain backwards, get the offset as the distance to the other end of the snarl