From 9740f8ffda35d77e37d7cc7ac860395149a1ca67 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 12 Sep 2024 14:40:44 -0500 Subject: [PATCH 01/29] make the direction hard cutoff in chaining more correct --- src/map/include/computeMap.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index fd5519af..946cfb49 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1641,11 +1641,11 @@ namespace skch int64_t query_dist = std::numeric_limits::max(); auto dist = std::numeric_limits::max(); auto awed = std::numeric_limits::max(); - if (it->strand == strnd::FWD && it->queryStartPos <= it2->queryStartPos) { + if (it->strand == strnd::FWD && it->queryEndPos <= it2->queryStartPos) { query_dist = it2->queryStartPos - it->queryEndPos; dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); - } else if (it->strand != strnd::FWD && it->queryEndPos >= it2->queryEndPos) { + } else if (it->strand != strnd::FWD && it->queryStartPos >= it2->queryEndPos) { query_dist = it->queryStartPos - it2->queryEndPos; dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); From 41e84af00aaef0482299742f8cbdb6d95f7d2f4c Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 12 Sep 2024 14:46:35 -0500 Subject: [PATCH 02/29] more stringent query based sort --- src/map/include/computeMap.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 946cfb49..aaf62bf8 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1684,7 +1684,7 @@ namespace skch // Sort the chain by query, then reference, start position std::sort(it, it_end, [](const MappingResult &a, const MappingResult &b) { - return std::tie(a.queryStartPos, a.refStartPos) < std::tie(b.queryStartPos, b.refStartPos); + return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); }); // if we have an infinite max mappinng length, we should just emit the chain here From d7ca6aaa565764cda0eb021ae45240e1e56090eb Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Fri, 13 Sep 2024 22:48:29 -0500 Subject: [PATCH 03/29] feat: Implement new chaining logic for mapping merging --- src/map/include/computeMap.hpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index aaf62bf8..7b13eca8 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1625,7 +1625,7 @@ namespace skch //Start the procedure to identify the chains for (auto it = readMappings.begin(); it != readMappings.end(); it++) { - std::vector> distances; + std::vector> chainPairs; for (auto it2 = std::next(it); it2 != readMappings.end(); it2++) { //If this mapping is for the same segment, ignore if (it2->refSeqId == it->refSeqId && it2->queryStartPos == it->queryStartPos) { @@ -1651,18 +1651,25 @@ namespace skch awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); } if (dist < max_dist) { - distances.push_back(std::make_tuple(awed, dist, it2->splitMappingId)); + chainPairs.push_back(std::make_pair(awed, it2->splitMappingId)); } } } - if (distances.size()) { - std::sort(distances.begin(), distances.end()); - disjoint_sets.unite(it->splitMappingId, std::get<2>(distances.front())); + if (!chainPairs.empty()) { + std::sort(chainPairs.begin(), chainPairs.end()); + it->chainPairs = std::move(chainPairs); } } - //Assign the merged mapping ids - for (auto it = readMappings.begin(); it != readMappings.end(); it++) { + // Second pass: Apply disjoint sets union operation + for (auto it = readMappings.begin(); it != readMappings.end(); it++) { + if (!it->chainPairs.empty()) { + disjoint_sets.unite(it->splitMappingId, it->chainPairs.front().second); + } + } + + // Assign the merged mapping ids + for (auto it = readMappings.begin(); it != readMappings.end(); it++) { it->splitMappingId = disjoint_sets.find(it->splitMappingId); } From 09242d801e93b993f3a70398d7448739072a3ee8 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Fri, 13 Sep 2024 22:50:20 -0500 Subject: [PATCH 04/29] feat: Apply chaining logic to second mapping and sort chain pairs in ascending order --- src/map/include/computeMap.hpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 7b13eca8..d95f517d 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1651,19 +1651,16 @@ namespace skch awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); } if (dist < max_dist) { - chainPairs.push_back(std::make_pair(awed, it2->splitMappingId)); + it2->chainPairs.push_back(std::make_pair(awed, it->splitMappingId)); } } } - if (!chainPairs.empty()) { - std::sort(chainPairs.begin(), chainPairs.end()); - it->chainPairs = std::move(chainPairs); - } } - // Second pass: Apply disjoint sets union operation + // Second pass: Sort chain pairs and apply disjoint sets union operation for (auto it = readMappings.begin(); it != readMappings.end(); it++) { if (!it->chainPairs.empty()) { + std::sort(it->chainPairs.begin(), it->chainPairs.end()); disjoint_sets.unite(it->splitMappingId, it->chainPairs.front().second); } } From 869da9944525bb3c5bd7b3104361bb618f6d405e Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Fri, 13 Sep 2024 22:53:27 -0500 Subject: [PATCH 05/29] feat: Modify mergeMappingsInRange function to improve merging logic --- src/map/include/computeMap.hpp | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index d95f517d..d5a0469e 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1604,12 +1604,12 @@ namespace skch if(readMappings.size() < 2) return; - //Sort the mappings by reference (then query) position + //Sort the mappings by query position, then reference sequence id, then reference position std::sort( readMappings.begin(), readMappings.end(), [](const MappingResult &a, const MappingResult &b) { - return std::tie(a.refSeqId, a.refStartPos, a.queryStartPos) - < std::tie(b.refSeqId, b.refStartPos, b.queryStartPos); + return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) + < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); }); //First assign a unique id to each split mapping in the sorted order @@ -1631,25 +1631,16 @@ namespace skch if (it2->refSeqId == it->refSeqId && it2->queryStartPos == it->queryStartPos) { continue; } - //If this mapping is too far from current mapping being evaluated, stop finding a merge - if (it2->refSeqId != it->refSeqId || it2->refStartPos > it->refEndPos + max_dist) { + //If this mapping is too far from current mapping being evaluated in the query, stop finding a merge + if (it2->queryStartPos > it->queryEndPos + max_dist) { break; } //If the next mapping is within range, check if it's in range and if (it2->strand == it->strand) { int64_t ref_dist = it2->refStartPos - it->refEndPos; - int64_t query_dist = std::numeric_limits::max(); - auto dist = std::numeric_limits::max(); - auto awed = std::numeric_limits::max(); - if (it->strand == strnd::FWD && it->queryEndPos <= it2->queryStartPos) { - query_dist = it2->queryStartPos - it->queryEndPos; - dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); - awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); - } else if (it->strand != strnd::FWD && it->queryStartPos >= it2->queryEndPos) { - query_dist = it->queryStartPos - it2->queryEndPos; - dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); - awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); - } + int64_t query_dist = it2->queryStartPos - it->queryEndPos; + auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); + auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); if (dist < max_dist) { it2->chainPairs.push_back(std::make_pair(awed, it->splitMappingId)); } From 68c9df1a8ba8f249232eb3ccc29fac1103cf481d Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Fri, 13 Sep 2024 23:45:13 -0500 Subject: [PATCH 06/29] simplify sorting and just keep best chain pair option around --- src/map/include/base_types.hpp | 2 ++ src/map/include/computeMap.hpp | 38 +++++++++++++++------------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp index 9505f3b9..f0b65d87 100644 --- a/src/map/include/base_types.hpp +++ b/src/map/include/base_types.hpp @@ -178,6 +178,8 @@ namespace skch uint8_t discard; // set to 1 for deletion bool overlapped; // set to true if this mapping is overlapped with another mapping bool selfMapFilter; // set to true if a long-to-short mapping in all-vs-all mode (we report short as the query) + double chainPairScore; // best score for potential chain pair + int64_t chainPairId; // best partner mapping for potential chain pair offset_t qlen() { //length of this mapping on query axis return queryEndPos - queryStartPos + 1; diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index d5a0469e..48e87607 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1616,6 +1616,8 @@ namespace skch for (auto it = readMappings.begin(); it != readMappings.end(); it++) { it->splitMappingId = std::distance(readMappings.begin(), it); it->discard = 0; + it->chainPairScore = std::numeric_limits::max(); + it->chainPairId = std::numeric_limits::max(); } // set up our union find data structure to track merges @@ -1625,7 +1627,10 @@ namespace skch //Start the procedure to identify the chains for (auto it = readMappings.begin(); it != readMappings.end(); it++) { - std::vector> chainPairs; + // we we merge only with the best-scored previous mapping in query space + if (it->chainPairScore != std::numeric_limits::max()) { + disjoint_sets.unite(it->splitMappingId, it->chainPairId); + } for (auto it2 = std::next(it); it2 != readMappings.end(); it2++) { //If this mapping is for the same segment, ignore if (it2->refSeqId == it->refSeqId && it2->queryStartPos == it->queryStartPos) { @@ -1638,24 +1643,23 @@ namespace skch //If the next mapping is within range, check if it's in range and if (it2->strand == it->strand) { int64_t ref_dist = it2->refStartPos - it->refEndPos; + if (ref_dist < 0) { + continue; + } int64_t query_dist = it2->queryStartPos - it->queryEndPos; + if (query_dist < 0) { + continue; + } auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); - if (dist < max_dist) { - it2->chainPairs.push_back(std::make_pair(awed, it->splitMappingId)); + if (dist < max_dist && it2->chainPairScore > awed) { + it2->chainPairId = it->splitMappingId; + it2->chainPairScore = awed; } } } } - // Second pass: Sort chain pairs and apply disjoint sets union operation - for (auto it = readMappings.begin(); it != readMappings.end(); it++) { - if (!it->chainPairs.empty()) { - std::sort(it->chainPairs.begin(), it->chainPairs.end()); - disjoint_sets.unite(it->splitMappingId, it->chainPairs.front().second); - } - } - // Assign the merged mapping ids for (auto it = readMappings.begin(); it != readMappings.end(); it++) { it->splitMappingId = disjoint_sets.find(it->splitMappingId); @@ -1666,22 +1670,14 @@ namespace skch readMappings.begin(), readMappings.end(), [](const MappingResult &a, const MappingResult &b) { - return a.splitMappingId < b.splitMappingId - || (a.splitMappingId == b.splitMappingId - && (a.queryStartPos < b.queryStartPos - || (a.queryStartPos == b.queryStartPos - && a.refStartPos < b.refStartPos))); + return std::tie(a.splitMappingId, a.queryStartPos, a.refSeqId, a.refStartPos) + < std::tie(b.splitMappingId, b.queryStartPos, b.refSeqId, b.refStartPos); }); for(auto it = readMappings.begin(); it != readMappings.end();) { //Bucket by each chain auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} ); - // Sort the chain by query, then reference, start position - std::sort(it, it_end, [](const MappingResult &a, const MappingResult &b) { - return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); - }); - // if we have an infinite max mappinng length, we should just emit the chain here if (param.max_mapping_length == std::numeric_limits::max()) { // Process any remaining fragment From 0e8bcc93fcb529d66d46331c7bdfe18e09b5475a Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Fri, 13 Sep 2024 23:53:48 -0500 Subject: [PATCH 07/29] get rid of axis weighted distance --- src/map/include/computeMap.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 48e87607..bb2ed304 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1651,10 +1651,10 @@ namespace skch continue; } auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); - auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); - if (dist < max_dist && it2->chainPairScore > awed) { + //auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); + if (dist < max_dist && it2->chainPairScore > dist) { it2->chainPairId = it->splitMappingId; - it2->chainPairScore = awed; + it2->chainPairScore = dist; } } } From fefb4ee0891406aa8ebff9ace81a595648f7d33d Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 14 Sep 2024 00:01:30 -0500 Subject: [PATCH 08/29] comment cleanup --- src/map/include/computeMap.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index bb2ed304..9a6d213a 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1640,7 +1640,7 @@ namespace skch if (it2->queryStartPos > it->queryEndPos + max_dist) { break; } - //If the next mapping is within range, check if it's in range and + //If the next mapping is within range, we can potentially merge if (it2->strand == it->strand) { int64_t ref_dist = it2->refStartPos - it->refEndPos; if (ref_dist < 0) { @@ -1651,7 +1651,6 @@ namespace skch continue; } auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); - //auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9); if (dist < max_dist && it2->chainPairScore > dist) { it2->chainPairId = it->splitMappingId; it2->chainPairScore = dist; From 377f0c5ea4fdfdd487050ad9226e0cd375c73fd6 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Sat, 14 Sep 2024 08:50:26 -0500 Subject: [PATCH 09/29] refactor: processChainWithSplits --- src/map/include/computeMap.hpp | 175 ++++++++++++++++----------------- 1 file changed, 86 insertions(+), 89 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 9a6d213a..5dcdbd2e 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1677,109 +1677,106 @@ namespace skch //Bucket by each chain auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} ); - // if we have an infinite max mappinng length, we should just emit the chain here if (param.max_mapping_length == std::numeric_limits::max()) { - // Process any remaining fragment + // Process the entire chain as one fragment processMappingFragment(it, it_end); - it = it_end; - continue; + } else { + // Process the chain with potential splits + processChainWithSplits(it, it_end); } + + // Move the iterator to the end of the processed chain + it = it_end; + } - // tweak start and end positions of consecutive mappings - // TODO: XXX double check that the consecutive mappings are not overlapping!!! - // extra: force global alignment by patching head and tail to mapping start and end coordinates - //adjustConsecutiveMappings(it, it_end, param.segLength); - - // First pass: Mark cuttable positions - const int consecutive_mappings_window = 4; // Configurable parameter - std::vector is_cuttable(std::distance(it, it_end), false); - - auto window_start = it; - int consecutive_count = 0; + // After processing all chains, remove discarded mappings + readMappings.erase( + std::remove_if(readMappings.begin(), readMappings.end(), + [](const MappingResult& e) { return e.discard == 1; }), + readMappings.end() + ); + } - for (auto current = it; current != it_end; ++current) { - is_cuttable[std::distance(it, current)] = true; - } + /** + * @brief Process a chain of mappings, potentially splitting it into smaller fragments + * @param begin Iterator to the start of the chain + * @param end Iterator to the end of the chain + */ + void processChainWithSplits(std::vector::iterator begin, std::vector::iterator end) { + if (begin == end) return; - // mark anything positions on either side of a discontinuity as non-cuttable - for (auto current = it; current != it_end; ++current) { - if (current == it || current == std::prev(it_end)) { - continue; - } - if (current->queryStartPos - std::prev(current)->queryEndPos > param.segLength / 5 - || current->refStartPos - std::prev(current)->refEndPos > param.segLength / 5) { - is_cuttable[std::distance(it, current) - 1] = false; - is_cuttable[std::distance(it, current)] = false; - } + std::vector is_cuttable(std::distance(begin, end), true); + + // Mark positions that are not cuttable (near discontinuities) + for (auto it = std::next(begin); it != end; ++it) { + auto prev = std::prev(it); + if (it->queryStartPos - prev->queryEndPos > param.segLength / 5 || + it->refStartPos - prev->refEndPos > param.segLength / 5) { + is_cuttable[std::distance(begin, prev)] = false; + is_cuttable[std::distance(begin, it)] = false; } + } - adjustConsecutiveMappings(it, it_end, param.segLength); - - auto fragment_start = it; - auto current = it; - offset_t accumulate_length = 0; - double accumulate_nuc_identity = 0.0; - - while (current != it_end) { - // chunks are made across the query - accumulate_length += current->queryEndPos - current->queryStartPos; - if (accumulate_length >= param.max_mapping_length - && is_cuttable[std::distance(it, current)]) { - if (current != fragment_start) { - adjustConsecutiveMappings(std::prev(current), current, param.chain_gap); - } - processMappingFragment(fragment_start, current); - fragment_start = current; - accumulate_length = 0; - } - ++current; + adjustConsecutiveMappings(begin, end, param.segLength); + + auto fragment_start = begin; + offset_t accumulate_length = 0; + + for (auto it = begin; it != end; ++it) { + accumulate_length += it->queryEndPos - it->queryStartPos; + + if (accumulate_length >= param.max_mapping_length && is_cuttable[std::distance(begin, it)]) { + // Process the fragment up to this point + processMappingFragment(fragment_start, std::next(it)); + + // Start a new fragment + fragment_start = std::next(it); + accumulate_length = 0; } + } - // Process any remaining fragment - processMappingFragment(fragment_start, current); - - // Compute chain statistics and coordinates - offset_t target_len_in_full_chain = 0; - offset_t query_len_in_full_chain = 0; - int n_in_full_chain = std::distance(it, it_end); - offset_t chain_start_query = std::numeric_limits::max(); - offset_t chain_end_query = std::numeric_limits::min(); - offset_t chain_start_ref = std::numeric_limits::max(); - offset_t chain_end_ref = std::numeric_limits::min(); - for (auto x = it; x != it_end; ++x) { - chain_start_query = std::min(chain_start_query, x->queryStartPos); - chain_end_query = std::max(chain_end_query, x->queryEndPos); - chain_start_ref = std::min(chain_start_ref, x->refStartPos); - chain_end_ref = std::max(chain_end_ref, x->refEndPos); - target_len_in_full_chain += x->refEndPos - x->refStartPos; - query_len_in_full_chain += x->queryEndPos - x->queryStartPos; - accumulate_nuc_identity += current->nucIdentity; - } + // Process any remaining fragment + if (fragment_start != end) { + processMappingFragment(fragment_start, end); + } - // Assign chain statistics to all mappings in the chain - auto chain_nuc_identity = accumulate_nuc_identity / n_in_full_chain; - auto block_length = std::max(chain_end_query - chain_start_query, chain_end_ref - chain_start_ref); - for (auto x = it; x != it_end; ++x) { - auto& m = *x; - m.n_merged = n_in_full_chain; - m.blockLength = block_length; - m.blockQueryStartPos = chain_start_query; - m.blockQueryEndPos = chain_end_query; - m.blockRefStartPos = chain_start_ref; - m.blockRefEndPos = chain_end_ref; - m.blockNucIdentity = chain_nuc_identity; - } + // Compute and assign chain statistics + computeChainStatistics(begin, end); + } - // Move the iterator to the end of the processed chain - it = it_end; + /** + * @brief Compute and assign chain statistics to all mappings in the chain + * @param begin Iterator to the start of the chain + * @param end Iterator to the end of the chain + */ + void computeChainStatistics(std::vector::iterator begin, std::vector::iterator end) { + offset_t chain_start_query = std::numeric_limits::max(); + offset_t chain_end_query = std::numeric_limits::min(); + offset_t chain_start_ref = std::numeric_limits::max(); + offset_t chain_end_ref = std::numeric_limits::min(); + double accumulate_nuc_identity = 0.0; + int n_in_full_chain = std::distance(begin, end); + + for (auto it = begin; it != end; ++it) { + chain_start_query = std::min(chain_start_query, it->queryStartPos); + chain_end_query = std::max(chain_end_query, it->queryEndPos); + chain_start_ref = std::min(chain_start_ref, it->refStartPos); + chain_end_ref = std::max(chain_end_ref, it->refEndPos); + accumulate_nuc_identity += it->nucIdentity; } - // After processing all chains, remove discarded mappings - readMappings.erase( - std::remove_if(readMappings.begin(), readMappings.end(), - [](const MappingResult& e) { return e.discard == 1; }), - readMappings.end() - ); + auto chain_nuc_identity = accumulate_nuc_identity / n_in_full_chain; + auto block_length = std::max(chain_end_query - chain_start_query, chain_end_ref - chain_start_ref); + + for (auto it = begin; it != end; ++it) { + it->n_merged = n_in_full_chain; + it->blockLength = block_length; + it->blockQueryStartPos = chain_start_query; + it->blockQueryEndPos = chain_end_query; + it->blockRefStartPos = chain_start_ref; + it->blockRefEndPos = chain_end_ref; + it->blockNucIdentity = chain_nuc_identity; + } } /** From 810ce0aafa4cda11254471cf65f437a6e4d43b4b Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 14 Sep 2024 10:01:19 -0500 Subject: [PATCH 10/29] cleanup decision to chain --- src/map/include/computeMap.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 5dcdbd2e..a4b4222a 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1617,7 +1617,7 @@ namespace skch it->splitMappingId = std::distance(readMappings.begin(), it); it->discard = 0; it->chainPairScore = std::numeric_limits::max(); - it->chainPairId = std::numeric_limits::max(); + it->chainPairId = std::numeric_limits::min(); } // set up our union find data structure to track merges @@ -1632,8 +1632,12 @@ namespace skch disjoint_sets.unite(it->splitMappingId, it->chainPairId); } for (auto it2 = std::next(it); it2 != readMappings.end(); it2++) { - //If this mapping is for the same segment, ignore - if (it2->refSeqId == it->refSeqId && it2->queryStartPos == it->queryStartPos) { + // If this mapping is for a different reference sequence, ignore + if (it2->refSeqId != it->refSeqId) { + continue; + } + //If this mapping is for exactly the same segment, ignore + if (it2->queryStartPos == it->queryStartPos) { continue; } //If this mapping is too far from current mapping being evaluated in the query, stop finding a merge From cb9472665cba4c6e46dff666979f96a63bfdd88f Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 14 Sep 2024 10:44:15 -0500 Subject: [PATCH 11/29] best-buddy chain merging --- src/map/include/computeMap.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index a4b4222a..ba4f3c7a 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1627,6 +1627,8 @@ namespace skch //Start the procedure to identify the chains for (auto it = readMappings.begin(); it != readMappings.end(); it++) { + double best_score = std::numeric_limits::max(); + auto best_it2 = readMappings.end(); // we we merge only with the best-scored previous mapping in query space if (it->chainPairScore != std::numeric_limits::max()) { disjoint_sets.unite(it->splitMappingId, it->chainPairId); @@ -1647,7 +1649,8 @@ namespace skch //If the next mapping is within range, we can potentially merge if (it2->strand == it->strand) { int64_t ref_dist = it2->refStartPos - it->refEndPos; - if (ref_dist < 0) { + // If we jitter backwards in reference position too far, don't merge + if (ref_dist < -param.segLength/5) { continue; } int64_t query_dist = it2->queryStartPos - it->queryEndPos; @@ -1655,12 +1658,16 @@ namespace skch continue; } auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); - if (dist < max_dist && it2->chainPairScore > dist) { - it2->chainPairId = it->splitMappingId; - it2->chainPairScore = dist; + if (dist < max_dist && best_score > dist && it2->chainPairScore > dist) { + best_it2 = it2; + best_score = dist; } } } + if (best_it2 != readMappings.end()) { + best_it2->chainPairScore = best_score; + best_it2->chainPairId = it->splitMappingId; + } } // Assign the merged mapping ids From d33a392d4808bf47601830abd7d00d4472bcca42 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Sun, 15 Sep 2024 14:48:30 -0500 Subject: [PATCH 12/29] feat: Implement filtering based on maximally merged chains --- src/map/include/computeMap.hpp | 39 +++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index ba4f3c7a..171cc54d 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1684,6 +1684,43 @@ namespace skch < std::tie(b.splitMappingId, b.queryStartPos, b.refSeqId, b.refStartPos); }); + // Create maximallyMergedMappings + MappingResultsVector_t maximallyMergedMappings; + for(auto it = readMappings.begin(); it != readMappings.end();) { + auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} ); + MappingResult mergedMapping = *it; + mergedMapping.queryStartPos = it->queryStartPos; + mergedMapping.queryEndPos = std::prev(it_end)->queryEndPos; + mergedMapping.refStartPos = it->refStartPos; + mergedMapping.refEndPos = std::prev(it_end)->refEndPos; + mergedMapping.blockLength = std::max(mergedMapping.refEndPos - mergedMapping.refStartPos, + mergedMapping.queryEndPos - mergedMapping.queryStartPos); + mergedMapping.n_merged = std::distance(it, it_end); + maximallyMergedMappings.push_back(mergedMapping); + it = it_end; + } + + // Apply filtering to maximallyMergedMappings + filterWeakMappings(maximallyMergedMappings, std::floor(param.block_length / param.segLength)); + if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { + MappingResultsVector_t filteredMappings; + filterByGroup(maximallyMergedMappings, filteredMappings, param.numMappingsForSegment - 1, false); + maximallyMergedMappings = std::move(filteredMappings); + } + + // Create a set of splitMappingIds that survived filtering + std::unordered_set keptChains; + for (const auto& mapping : maximallyMergedMappings) { + keptChains.insert(mapping.splitMappingId); + } + + // Final filtering of original mappings + readMappings.erase( + std::remove_if(readMappings.begin(), readMappings.end(), + [&keptChains](const MappingResult& e) { return keptChains.count(e.splitMappingId) == 0; }), + readMappings.end() + ); + for(auto it = readMappings.begin(); it != readMappings.end();) { //Bucket by each chain auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} ); @@ -1705,7 +1742,7 @@ namespace skch std::remove_if(readMappings.begin(), readMappings.end(), [](const MappingResult& e) { return e.discard == 1; }), readMappings.end() - ); + ); } /** From 6401eba0a51dd2ef61cbc658bd9883f416f423d6 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Sun, 15 Sep 2024 15:05:41 -0500 Subject: [PATCH 13/29] feat: Restructure mapping and filtering logic --- src/map/include/computeMap.hpp | 236 ++++++++++++++------------------- 1 file changed, 103 insertions(+), 133 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 171cc54d..13c38610 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -20,6 +20,8 @@ #include namespace fs = std::filesystem; #include +#include +#include //Own includes #include "map/include/base_types.hpp" @@ -625,139 +627,105 @@ namespace skch MappingResultsVector_t unfilteredMappings; int refGroup = this->getRefGroup(input->seqName); - if(! param.split || input->len <= param.segLength) + if (!param.split || input->len <= param.segLength) { - QueryMetaData Q; - Q.seq = &(input->seq)[0u]; - Q.len = input->len; - Q.fullLen = input->len; - Q.seqCounter = input->seqCounter; - Q.seqName = input->seqName; - Q.refGroup = refGroup; - - //Map this sequence - mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings); - - // save the output - unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); - - // indicate that we mapped full length - split_mapping = false; - - input->progress.increment(input->len); - } - else //Split read mapping - { - int noOverlapFragmentCount = input->len / param.segLength; - - //Map individual non-overlapping fragments in the read - for (int i = 0; i < noOverlapFragmentCount; i++) - { - //Prepare fragment sequence object QueryMetaData Q; - Q.seq = &(input->seq)[0u] + i * param.segLength; - Q.len = param.segLength; + Q.seq = &(input->seq)[0u]; + Q.len = input->len; Q.fullLen = input->len; Q.seqCounter = input->seqCounter; Q.seqName = input->seqName; Q.refGroup = refGroup; - intervalPoints.clear(); - l1Mappings.clear(); - l2Mappings.clear(); - - //Map this fragment + // Map this sequence mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings); - - //Adjust query coordinates and length in the reported mapping - std::for_each(l2Mappings.begin(), l2Mappings.end(), [&](MappingResult &e){ - e.queryLen = input->len; - e.queryStartPos = i * param.segLength; - e.queryEndPos = i * param.segLength + Q.len; - }); - - // save the output - unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); - input->progress.increment(param.segLength); - } - - //Map last overlapping fragment to cover the whole read - if (noOverlapFragmentCount >= 1 && input->len % param.segLength != 0) - { - //Prepare fragment sequence object - QueryMetaData Q; - Q.seq = &(input->seq)[0u] + input->len - param.segLength; - Q.len = param.segLength; - Q.seqCounter = input->seqCounter; - Q.seqName = input->seqName; - Q.refGroup = refGroup; - - intervalPoints.clear(); - l1Mappings.clear(); - l2Mappings.clear(); - - //Map this fragment - mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings); - - //Adjust query coordinates and length in the reported mapping - std::for_each(l2Mappings.begin(), l2Mappings.end(), [&](MappingResult &e){ - e.queryLen = input->len; - e.queryStartPos = input->len - param.segLength; - e.queryEndPos = input->len; - }); - unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); + + // Apply non-merged filtering + filterNonMergedMappings(unfilteredMappings, param); - input->progress.increment(input->len % param.segLength); - } + split_mapping = false; + input->progress.increment(input->len); } - - // how many mappings to keep - int n_mappings = (input->len < param.segLength ? - param.numMappingsForShortSequence - : param.numMappingsForSegment) - 1; - - if (split_mapping) + else // Split read mapping { - if (param.mergeMappings) - { - // hardcore merge using the chain gap - mergeMappingsInRange(unfilteredMappings, param.chain_gap); + int noOverlapFragmentCount = input->len / param.segLength; - // remove short chains that didn't exceed block length - filterWeakMappings(unfilteredMappings, std::floor(param.block_length / param.segLength)); + // Map individual non-overlapping fragments in the read + for (int i = 0; i < noOverlapFragmentCount; i++) + { + QueryMetaData Q; + Q.seq = &(input->seq)[0u] + i * param.segLength; + Q.len = param.segLength; + Q.fullLen = input->len; + Q.seqCounter = input->seqCounter; + Q.seqName = input->seqName; + Q.refGroup = refGroup; + + intervalPoints.clear(); + l1Mappings.clear(); + l2Mappings.clear(); + + mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings); + + std::for_each(l2Mappings.begin(), l2Mappings.end(), [&](MappingResult &e){ + e.queryLen = input->len; + e.queryStartPos = i * param.segLength; + e.queryEndPos = i * param.segLength + Q.len; + }); - // now that filtering has happened, set back the individual mapping coordinates and block length - setBlockCoordsToMappingCoords(unfilteredMappings); - } else { - // set block coordinates - setBlockCoordsToMappingCoords(unfilteredMappings); - } - } else { - // set block coordinates - setBlockCoordsToMappingCoords(unfilteredMappings); - } + unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); + input->progress.increment(param.segLength); + } - if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { - MappingResultsVector_t tmpMappings; - tmpMappings.reserve(output->readMappings.size()); - filterByGroup(unfilteredMappings, tmpMappings, n_mappings, false); - unfilteredMappings = std::move(tmpMappings); - } + // Map last overlapping fragment to cover the whole read + if (noOverlapFragmentCount >= 1 && input->len % param.segLength != 0) + { + QueryMetaData Q; + Q.seq = &(input->seq)[0u] + input->len - param.segLength; + Q.len = param.segLength; + Q.seqCounter = input->seqCounter; + Q.seqName = input->seqName; + Q.refGroup = refGroup; + + intervalPoints.clear(); + l1Mappings.clear(); + l2Mappings.clear(); + + mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings); + + std::for_each(l2Mappings.begin(), l2Mappings.end(), [&](MappingResult &e){ + e.queryLen = input->len; + e.queryStartPos = input->len - param.segLength; + e.queryEndPos = input->len; + }); - output->readMappings = std::move(unfilteredMappings); + unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); + input->progress.increment(input->len % param.segLength); + } - //Make sure mapping boundary don't exceed sequence lengths - this->mappingBoundarySanityCheck(input, output->readMappings); + if (param.mergeMappings) + { + mergeMappingsInRange(unfilteredMappings, param.chain_gap); + unfilteredMappings = filterMaximallyMerged(unfilteredMappings, param); + } + else + { + filterNonMergedMappings(unfilteredMappings, param); + } + } - // remove alignments where the ratio between query and target length is < our identity threshold + // Common post-processing for both merged and non-merged mappings + mappingBoundarySanityCheck(input, unfilteredMappings); + if (param.filterLengthMismatches) { - this->filterFalseHighIdentity(output->readMappings); + filterFalseHighIdentity(unfilteredMappings); } + + sparsifyMappings(unfilteredMappings); - // sparsify the mappings, if requested - this->sparsifyMappings(output->readMappings); + output->readMappings = std::move(unfilteredMappings); return output; } @@ -1591,7 +1559,30 @@ namespace skch double axis_factor = 1.0 - (2.0 * std::min(std::abs(dx), std::abs(dy))) / (std::abs(dx) + std::abs(dy)); return euclidean * (1.0 + w * axis_factor); } - + + /** + * @brief Filter maximally merged mappings + * @param[in] readMappings Merged mappings computed by Mashmap + * @param[in] param Algorithm parameters + * @return Filtered mappings + */ + MappingResultsVector_t filterMaximallyMerged(const MappingResultsVector_t& readMappings, const Parameters& param) + { + MappingResultsVector_t filteredMappings = readMappings; + + // Filter weak mappings + filterWeakMappings(filteredMappings, std::floor(param.block_length / param.segLength)); + + // Apply group filtering if necessary + if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { + MappingResultsVector_t groupFilteredMappings; + filterByGroup(filteredMappings, groupFilteredMappings, param.numMappingsForSegment - 1, false); + filteredMappings = std::move(groupFilteredMappings); + } + + return filteredMappings; + } + /** * @brief Merge fragment mappings by convolution of a 2D range over the alignment matrix * @param[in/out] readMappings Mappings computed by Mashmap (L2 stage) for a read @@ -1700,27 +1691,6 @@ namespace skch it = it_end; } - // Apply filtering to maximallyMergedMappings - filterWeakMappings(maximallyMergedMappings, std::floor(param.block_length / param.segLength)); - if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { - MappingResultsVector_t filteredMappings; - filterByGroup(maximallyMergedMappings, filteredMappings, param.numMappingsForSegment - 1, false); - maximallyMergedMappings = std::move(filteredMappings); - } - - // Create a set of splitMappingIds that survived filtering - std::unordered_set keptChains; - for (const auto& mapping : maximallyMergedMappings) { - keptChains.insert(mapping.splitMappingId); - } - - // Final filtering of original mappings - readMappings.erase( - std::remove_if(readMappings.begin(), readMappings.end(), - [&keptChains](const MappingResult& e) { return keptChains.count(e.splitMappingId) == 0; }), - readMappings.end() - ); - for(auto it = readMappings.begin(); it != readMappings.end();) { //Bucket by each chain auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} ); From 29ceaa2d1bad39fb7c1194765c9c3cd95e9da1aa Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Sun, 15 Sep 2024 15:06:20 -0500 Subject: [PATCH 14/29] feat: add filterNonMergedMappings function --- src/map/include/computeMap.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 13c38610..fbf285dd 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -763,6 +763,22 @@ namespace skch delete output; } + /** + * @brief Filter non-merged mappings + * @param[in/out] readMappings Mappings computed by Mashmap + * @param[in] param Algorithm parameters + */ + void filterNonMergedMappings(MappingResultsVector_t &readMappings, const Parameters& param) + { + setBlockCoordsToMappingCoords(readMappings); + + if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { + MappingResultsVector_t filteredMappings; + filterByGroup(readMappings, filteredMappings, param.numMappingsForSegment - 1, false); + readMappings = std::move(filteredMappings); + } + } + /** * @brief map the parsed query sequence (L1 and L2 mapping) * @param[in] Q metadata about query sequence From 90fe5b628e48fcc89a725b65c28942014642029a Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 15 Sep 2024 15:22:41 -0500 Subject: [PATCH 15/29] one codepath for mapping chain processing --- src/map/include/computeMap.hpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index fbf285dd..27c526c3 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1711,14 +1711,9 @@ namespace skch //Bucket by each chain auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} ); - if (param.max_mapping_length == std::numeric_limits::max()) { - // Process the entire chain as one fragment - processMappingFragment(it, it_end); - } else { - // Process the chain with potential splits - processChainWithSplits(it, it_end); - } - + // Process the chain into chunks defined by max_mapping_length + processChainWithSplits(it, it_end); + // Move the iterator to the end of the processed chain it = it_end; } From c7c171e9d196af92d9c161bf9e1cc2bd19cfb1d6 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 15 Sep 2024 15:48:50 -0500 Subject: [PATCH 16/29] remove unused block coordinate info --- src/map/include/base_types.hpp | 4 ---- src/map/include/computeMap.hpp | 21 ++------------------- src/map/include/filter.hpp | 16 ++++++++-------- 3 files changed, 10 insertions(+), 31 deletions(-) diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp index f0b65d87..605b7560 100644 --- a/src/map/include/base_types.hpp +++ b/src/map/include/base_types.hpp @@ -157,10 +157,6 @@ namespace skch seqno_t refSeqId; //internal sequence id of the reference contig seqno_t querySeqId; //internal sequence id of the query sequence offset_t blockLength; //the block length of the mapping - offset_t blockRefStartPos; - offset_t blockRefEndPos; - offset_t blockQueryStartPos; - offset_t blockQueryEndPos; float blockNucIdentity; float nucIdentity; //calculated identity diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 27c526c3..669422f2 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -462,17 +462,6 @@ namespace skch readMappings.end()); } - void setBlockCoordsToMappingCoords(MappingResultsVector_t &readMappings) { - for (auto& m : readMappings) { - m.blockRefStartPos = m.refStartPos; - m.blockRefEndPos = m.refEndPos; - m.blockQueryStartPos = m.queryStartPos; - m.blockQueryEndPos = m.queryEndPos; - m.blockLength = std::max(m.blockRefEndPos - m.blockRefStartPos, m.blockQueryEndPos - m.blockQueryStartPos); - m.blockNucIdentity = m.nucIdentity; - } - } - /** * @brief helper to main mapping function * @details filters mappings whose identity and query/ref length don't agree @@ -485,8 +474,8 @@ namespace skch std::remove_if(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ - int64_t q_l = (int64_t)e.blockQueryEndPos - (int64_t)e.blockQueryStartPos; - int64_t r_l = (int64_t)e.blockRefEndPos - (int64_t)e.blockRefStartPos; + int64_t q_l = (int64_t)e.queryEndPos - (int64_t)e.queryStartPos; + int64_t r_l = (int64_t)e.refEndPos - (int64_t)e.refStartPos; uint64_t delta = std::abs(r_l - q_l); double len_id_bound = (1.0 - (double)delta/(((double)q_l+r_l)/2)); return len_id_bound < std::min(0.7, std::pow(param.percentageIdentity,3)); @@ -770,8 +759,6 @@ namespace skch */ void filterNonMergedMappings(MappingResultsVector_t &readMappings, const Parameters& param) { - setBlockCoordsToMappingCoords(readMappings); - if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { MappingResultsVector_t filteredMappings; filterByGroup(readMappings, filteredMappings, param.numMappingsForSegment - 1, false); @@ -1800,10 +1787,6 @@ namespace skch for (auto it = begin; it != end; ++it) { it->n_merged = n_in_full_chain; it->blockLength = block_length; - it->blockQueryStartPos = chain_start_query; - it->blockQueryEndPos = chain_end_query; - it->blockRefStartPos = chain_start_ref; - it->blockRefEndPos = chain_end_ref; it->blockNucIdentity = chain_nuc_identity; } } diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp index fa18b132..eeaa7470 100644 --- a/src/map/include/filter.hpp +++ b/src/map/include/filter.hpp @@ -70,11 +70,11 @@ namespace skch // compute the overlap of the two mappings double get_overlap(const int x, const int y) const { - offset_t overlap_start = std::max(vec[x].blockQueryStartPos, vec[y].blockQueryStartPos); - offset_t overlap_end = std::min(vec[x].blockQueryEndPos, vec[y].blockQueryEndPos); + offset_t overlap_start = std::max(vec[x].queryStartPos, vec[y].queryStartPos); + offset_t overlap_end = std::min(vec[x].queryEndPos, vec[y].queryEndPos); offset_t overlap_length = std::max(0, static_cast(overlap_end - overlap_start)); - offset_t x_length = vec[x].blockQueryEndPos - vec[x].blockQueryStartPos; - offset_t y_length = vec[y].blockQueryEndPos - vec[y].blockQueryStartPos; + offset_t x_length = vec[x].queryEndPos - vec[x].queryStartPos; + offset_t y_length = vec[y].queryEndPos - vec[y].queryStartPos; return static_cast(overlap_length) / std::min(x_length, y_length); } @@ -343,11 +343,11 @@ namespace skch // compute the overlap of the two mappings double get_overlap(const int x, const int y) const { - offset_t overlap_start = std::max(vec[x].blockRefStartPos, vec[y].blockRefStartPos); - offset_t overlap_end = std::min(vec[x].blockRefEndPos, vec[y].blockRefEndPos); + offset_t overlap_start = std::max(vec[x].refStartPos, vec[y].refStartPos); + offset_t overlap_end = std::min(vec[x].refEndPos, vec[y].refEndPos); offset_t overlap_length = std::max(0, static_cast(overlap_end - overlap_start)); - offset_t x_length = vec[x].blockRefEndPos - vec[x].blockRefStartPos; - offset_t y_length = vec[y].blockRefEndPos - vec[y].blockRefStartPos; + offset_t x_length = vec[x].refEndPos - vec[x].refStartPos; + offset_t y_length = vec[y].refEndPos - vec[y].refStartPos; return static_cast(overlap_length) / std::min(x_length, y_length); } From 972222d8b265c959fcb5074f93ce6fb178b08714 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 15 Sep 2024 15:52:31 -0500 Subject: [PATCH 17/29] simplify maximally merged filter --- src/map/include/computeMap.hpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 669422f2..9ec402b7 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -696,7 +696,7 @@ namespace skch if (param.mergeMappings) { mergeMappingsInRange(unfilteredMappings, param.chain_gap); - unfilteredMappings = filterMaximallyMerged(unfilteredMappings, param); + filterMaximallyMerged(unfilteredMappings, param); } else { @@ -1569,21 +1569,18 @@ namespace skch * @param[in] param Algorithm parameters * @return Filtered mappings */ - MappingResultsVector_t filterMaximallyMerged(const MappingResultsVector_t& readMappings, const Parameters& param) + void filterMaximallyMerged(MappingResultsVector_t& readMappings, const Parameters& param) { - MappingResultsVector_t filteredMappings = readMappings; - // Filter weak mappings - filterWeakMappings(filteredMappings, std::floor(param.block_length / param.segLength)); + filterWeakMappings(readMappings, std::floor(param.block_length / param.segLength)); // Apply group filtering if necessary if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { MappingResultsVector_t groupFilteredMappings; - filterByGroup(filteredMappings, groupFilteredMappings, param.numMappingsForSegment - 1, false); - filteredMappings = std::move(groupFilteredMappings); + filterByGroup(readMappings, groupFilteredMappings, param.numMappingsForSegment - 1, false); + readMappings = std::move(groupFilteredMappings); } - return filteredMappings; } /** From 99b66818dfd50ea3cc6d47d6a9c85fded543ebd4 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Sun, 15 Sep 2024 16:03:27 -0500 Subject: [PATCH 18/29] fix: Modify FASTA file mapping to allow self-mapping --- src/interface/parse_args.hpp | 3 +-- src/map/include/computeMap.hpp | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 5d25b07f..52b896d2 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -237,8 +237,7 @@ void parse_args(int argc, // If there are no queries, go in all-vs-all mode with the sequences specified in `target_sequence_file` if (target_sequence_file && map_parameters.querySequences.empty()) { - map_parameters.skip_self = true; - std::cerr << "[mashmap] Skipping self mappings for single file all-vs-all mapping." << std::endl; + std::cerr << "[mashmap] Performing all-vs-all mapping including self mappings." << std::endl; map_parameters.querySequences.push_back(map_parameters.refSequences.back()); align_parameters.querySequences.push_back(align_parameters.refSequences.back()); } diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 9ec402b7..c2acc7fb 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -908,10 +908,12 @@ namespace skch { const IP_const_iterator ip_it = pq.front().it; const auto& ref = this->refSketch.metadata[ip_it->seqId]; - if ((!param.skip_self || Q.seqName != ref.name) - && (!param.skip_prefix || this->refIdGroup[ip_it->seqId] != Q.refGroup) - && (!param.lower_triangular || Q.seqCounter > ip_it->seqId) - ) { + bool skip_mapping = false; + if (param.skip_self && Q.seqName == ref.name) skip_mapping = true; + if (param.skip_prefix && this->refIdGroup[ip_it->seqId] == Q.refGroup) skip_mapping = true; + if (param.lower_triangular && Q.seqCounter <= ip_it->seqId) skip_mapping = true; + + if (!skip_mapping) { intervalPoints.push_back(*ip_it); } std::pop_heap(pq.begin(), pq.end(), heap_cmp); From 022249670c1af76cc5747d5b8529e33242695783 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 15 Sep 2024 16:54:55 -0500 Subject: [PATCH 19/29] feat: Implement improved mapping merging and filtering --- src/map/include/computeMap.hpp | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index c2acc7fb..1a3cf6d5 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -695,9 +695,26 @@ namespace skch if (param.mergeMappings) { - mergeMappingsInRange(unfilteredMappings, param.chain_gap); - filterMaximallyMerged(unfilteredMappings, param); - } + // the maximally merged mappings are top-level chains + // while the unfiltered mappings now contain splits at max_mapping_length + auto maximallyMergedMappings = + mergeMappingsInRange(unfilteredMappings, param.chain_gap); + // we filter on the top level chains + filterMaximallyMerged(maximallyMergedMappings, param); + // collect splitMappingIds in the maximally merged mappings + robin_hood::unordered_set kept_chains; + for (auto &mapping : maximallyMergedMappings) { + kept_chains.insert(mapping.splitMappingId); + } + // and use them to filter mappings to discard + unfilteredMappings.erase( + std::remove_if(unfilteredMappings.begin(), unfilteredMappings.end(), + [&kept_chains](const MappingResult &mapping) { + return !kept_chains.count(mapping.splitMappingId); + }), + unfilteredMappings.end() + ); + } else { filterNonMergedMappings(unfilteredMappings, param); @@ -1591,11 +1608,11 @@ namespace skch * @param[in] max_dist Distance to look in target and query */ template - void mergeMappingsInRange(VecIn &readMappings, + VecIn mergeMappingsInRange(VecIn &readMappings, int max_dist) { assert(param.split == true); - if(readMappings.size() < 2) return; + if(readMappings.size() < 2) return readMappings; //Sort the mappings by query position, then reference sequence id, then reference position std::sort( @@ -1710,6 +1727,8 @@ namespace skch [](const MappingResult& e) { return e.discard == 1; }), readMappings.end() ); + + return maximallyMergedMappings; } /** From c4408b7ed13abeb674e40cf30454577577abc2f5 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Sun, 15 Sep 2024 21:56:50 -0500 Subject: [PATCH 20/29] fix: Update distance calculation in range merging to account for strand direction --- src/map/include/computeMap.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 1a3cf6d5..3501d1df 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1658,7 +1658,12 @@ namespace skch } //If the next mapping is within range, we can potentially merge if (it2->strand == it->strand) { - int64_t ref_dist = it2->refStartPos - it->refEndPos; + int64_t ref_dist; + if (it->strand == strnd::FWD) { + ref_dist = it2->refStartPos - it->refEndPos; + } else { + ref_dist = it->refStartPos - it2->refEndPos; + } // If we jitter backwards in reference position too far, don't merge if (ref_dist < -param.segLength/5) { continue; From 5fb526edca866744d4d990b05a6b2ae622172fd5 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Sun, 15 Sep 2024 22:26:43 -0500 Subject: [PATCH 21/29] refactor: Improve distance calculation logic in computeMap.hpp --- src/map/include/computeMap.hpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 3501d1df..017b889d 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1658,24 +1658,25 @@ namespace skch } //If the next mapping is within range, we can potentially merge if (it2->strand == it->strand) { + // Always calculate query distance the same way, as query always moves forward + int64_t query_dist = it2->queryStartPos - it->queryEndPos; + + // Reference distance calculation depends on strand int64_t ref_dist; if (it->strand == strnd::FWD) { ref_dist = it2->refStartPos - it->refEndPos; } else { + // For reverse complement, we need to invert the order ref_dist = it->refStartPos - it2->refEndPos; } - // If we jitter backwards in reference position too far, don't merge - if (ref_dist < -param.segLength/5) { - continue; - } - int64_t query_dist = it2->queryStartPos - it->queryEndPos; - if (query_dist < 0) { - continue; - } - auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2)); - if (dist < max_dist && best_score > dist && it2->chainPairScore > dist) { - best_it2 = it2; - best_score = dist; + + // Check if the distance is within acceptable range + if (query_dist >= 0 && ref_dist >= -param.segLength/5 && ref_dist <= max_dist) { + double dist = std::sqrt(std::pow(query_dist, 2) + std::pow(ref_dist, 2)); + if (dist < max_dist && best_score > dist && it2->chainPairScore > dist) { + best_it2 = it2; + best_score = dist; + } } } } From c3e2b02780a803ab74185c359065aff0f4a5ff4e Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Sun, 15 Sep 2024 22:30:55 -0500 Subject: [PATCH 22/29] refactor: Ensure all fields in merged mappings are properly initialized --- src/map/include/computeMap.hpp | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 017b889d..fdb152be 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1704,7 +1704,7 @@ namespace skch MappingResultsVector_t maximallyMergedMappings; for(auto it = readMappings.begin(); it != readMappings.end();) { auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} ); - MappingResult mergedMapping = *it; + MappingResult mergedMapping = *it; // Copy all fields from the first mapping in the chain mergedMapping.queryStartPos = it->queryStartPos; mergedMapping.queryEndPos = std::prev(it_end)->queryEndPos; mergedMapping.refStartPos = it->refStartPos; @@ -1712,6 +1712,30 @@ namespace skch mergedMapping.blockLength = std::max(mergedMapping.refEndPos - mergedMapping.refStartPos, mergedMapping.queryEndPos - mergedMapping.queryStartPos); mergedMapping.n_merged = std::distance(it, it_end); + + // Recalculate average values for the merged mapping + double totalNucIdentity = 0.0; + double totalKmerComplexity = 0.0; + int totalConservedSketches = 0; + int totalSketchSize = 0; + for (auto subIt = it; subIt != it_end; ++subIt) { + totalNucIdentity += subIt->nucIdentity; + totalKmerComplexity += subIt->kmerComplexity; + totalConservedSketches += subIt->conservedSketches; + totalSketchSize += subIt->sketchSize; + } + mergedMapping.nucIdentity = totalNucIdentity / mergedMapping.n_merged; + mergedMapping.kmerComplexity = totalKmerComplexity / mergedMapping.n_merged; + mergedMapping.conservedSketches = totalConservedSketches; + mergedMapping.sketchSize = totalSketchSize; + + // Ensure other fields are properly set + mergedMapping.approxMatches = std::round(mergedMapping.nucIdentity * mergedMapping.blockLength / 100.0); + mergedMapping.discard = 0; + mergedMapping.overlapped = false; + mergedMapping.chainPairScore = std::numeric_limits::max(); + mergedMapping.chainPairId = std::numeric_limits::min(); + maximallyMergedMappings.push_back(mergedMapping); it = it_end; } From c89f1b069a857eac7c44cbeac0cc3b887fb83e37 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Mon, 16 Sep 2024 20:42:35 -0500 Subject: [PATCH 23/29] feat: Instrument the Lee plane sweep filter algorithm --- src/map/include/filter.hpp | 58 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp index eeaa7470..543b433c 100644 --- a/src/map/include/filter.hpp +++ b/src/map/include/filter.hpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include //Own includes #include "map/include/base_types.hpp" @@ -86,6 +88,9 @@ namespace skch template inline void markGood(Type &L, int secondaryToKeep, bool dropRand, double overlapThreshold) { + std::cerr << "DEBUG: Entering markGood" << std::endl; + std::cerr << "DEBUG: L size: " << L.size() << ", secondaryToKeep: " << secondaryToKeep << ", dropRand: " << dropRand << ", overlapThreshold: " << overlapThreshold << std::endl; + //first segment in the set order auto beg = L.begin(); @@ -95,12 +100,15 @@ namespace skch auto it = L.begin(); for( ; it != L.end(); it++) { + std::cerr << "DEBUG: Checking mapping " << *it << ", score: " << get_score(*it) << std::endl; if ((this->greater_score(*beg, *it) || vec[*it].discard == 0) && kept > secondaryToKeep) { + std::cerr << "DEBUG: Breaking loop, kept: " << kept << std::endl; break; } vec[*it].discard = 0; ++kept; + std::cerr << "DEBUG: Marked mapping " << *it << " as good, kept: " << kept << std::endl; } auto kit = it; @@ -109,9 +117,12 @@ namespace skch if (it == L.begin()) continue; int idx = *it; for (auto it2 = L.begin(); it2 != kit; it2++) { - if (get_overlap(idx, *it2) > overlapThreshold) { + double overlap = get_overlap(idx, *it2); + std::cerr << "DEBUG: Checking overlap between " << idx << " and " << *it2 << ": " << overlap << std::endl; + if (overlap > overlapThreshold) { vec[idx].overlapped = 1; // Mark as bad if it overlaps >50% with the best mapping vec[idx].discard = 1; + std::cerr << "DEBUG: Marked mapping " << idx << " as overlapped and discarded" << std::endl; break; } } @@ -122,6 +133,7 @@ namespace skch // we will hash the mapping struct and keep the one with the secondaryToKeep with the lowest hash value if (kept > secondaryToKeep && dropRand) { + std::cerr << "DEBUG: Entering dropRand section, kept: " << kept << std::endl; // we will use hashes of the mapping structs to break ties // first we'll make a vector of the mappings including the hashes std::vector> score_and_hash; // The tuple is (score, hash, pointer to the mapping) @@ -130,24 +142,30 @@ namespace skch if(vec[*it].discard == 0) { score_and_hash.emplace_back(get_score(*it), vec[*it].hash(), &vec[*it]); + std::cerr << "DEBUG: Added mapping " << *it << " to score_and_hash, score: " << get_score(*it) << ", hash: " << vec[*it].hash() << std::endl; } } // now we'll sort the vector by score and hash std::sort(score_and_hash.begin(), score_and_hash.end(), std::greater{}); + std::cerr << "DEBUG: Sorted score_and_hash, size: " << score_and_hash.size() << std::endl; // reset kept counter kept = 0; for (auto& x : score_and_hash) { std::get<2>(x)->discard = 1; + std::cerr << "DEBUG: Initially marked mapping as discarded, score: " << std::get<0>(x) << ", hash: " << std::get<1>(x) << std::endl; } // now we mark the best to keep for (auto& x : score_and_hash) { if (kept > secondaryToKeep) { + std::cerr << "DEBUG: Reached secondaryToKeep limit, breaking" << std::endl; break; } std::get<2>(x)->discard = 0; ++kept; + std::cerr << "DEBUG: Marked mapping as kept, score: " << std::get<0>(x) << ", hash: " << std::get<1>(x) << ", kept: " << kept << std::endl; } } + std::cerr << "DEBUG: Exiting markGood, final kept: " << kept << std::endl; } }; @@ -159,12 +177,22 @@ namespace skch template void liFilterAlgorithm(VecIn &readMappings, int secondaryToKeep, bool dropRand, double overlapThreshold) { + std::cerr << "DEBUG: Entering liFilterAlgorithm" << std::endl; + std::cerr << "DEBUG: Initial readMappings size: " << readMappings.size() << std::endl; + if(readMappings.size() <= 1) + { + std::cerr << "DEBUG: readMappings size <= 1, returning" << std::endl; return; + } //Initially mark all mappings as bad //Maintain the order of this vector till end of this function - std::for_each(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ e.discard = 1; e.overlapped = 0; }); + std::for_each(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ + e.discard = 1; + e.overlapped = 0; + std::cerr << "DEBUG: Marking mapping as bad: " << e.queryStartPos << "-" << e.queryEndPos << std::endl; + }); //Initialize object of Helper struct Helper obj (readMappings); @@ -182,13 +210,19 @@ namespace skch { eventSchedule.emplace_back (readMappings[i].queryStartPos, event::BEGIN, i); eventSchedule.emplace_back (readMappings[i].queryEndPos, event::END, i); + std::cerr << "DEBUG: Added events for mapping " << i << ": " + << readMappings[i].queryStartPos << " (BEGIN), " + << readMappings[i].queryEndPos << " (END)" << std::endl; } std::sort(eventSchedule.begin(), eventSchedule.end()); + std::cerr << "DEBUG: Sorted event schedule" << std::endl; //Execute the plane sweep algorithm for(auto it = eventSchedule.begin(); it!= eventSchedule.end();) { + std::cerr << "DEBUG: Processing event at position " << std::get<0>(*it) << std::endl; + //Find events that correspond to current position auto it2 = std::find_if(it, eventSchedule.end(), [&](const eventRecord_t &e) { @@ -199,21 +233,39 @@ namespace skch std::for_each(it, it2, [&](const eventRecord_t &e) { if (std::get<1>(e) == event::BEGIN) + { bst.insert (std::get<2>(e)); + std::cerr << "DEBUG: Inserted segment " << std::get<2>(e) << " into BST" << std::endl; + } else + { bst.erase (std::get<2>(e)); + std::cerr << "DEBUG: Erased segment " << std::get<2>(e) << " from BST" << std::endl; + } }); + std::cerr << "DEBUG: Calling markGood" << std::endl; //mark mappings as good obj.markGood(bst, secondaryToKeep, dropRand, overlapThreshold); it = it2; } + std::cerr << "DEBUG: Removing bad mappings" << std::endl; //Remove bad mappings + auto initialSize = readMappings.size(); readMappings.erase( - std::remove_if(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ return e.discard == 1 || e.overlapped == 1; }), + std::remove_if(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ + if (e.discard == 1 || e.overlapped == 1) { + std::cerr << "DEBUG: Removing mapping: " << e.queryStartPos << "-" << e.queryEndPos << std::endl; + return true; + } + return false; + }), readMappings.end()); + + std::cerr << "DEBUG: Removed " << (initialSize - readMappings.size()) << " mappings" << std::endl; + std::cerr << "DEBUG: Final readMappings size: " << readMappings.size() << std::endl; } /** From c67c73bebf02a98990188427aceccdfb65743967 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Mon, 16 Sep 2024 20:58:24 -0500 Subject: [PATCH 24/29] feat: Add detailed mapping information to debug statements --- src/map/include/filter.hpp | 48 +++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp index 543b433c..875a6b91 100644 --- a/src/map/include/filter.hpp +++ b/src/map/include/filter.hpp @@ -191,7 +191,13 @@ namespace skch std::for_each(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ e.discard = 1; e.overlapped = 0; - std::cerr << "DEBUG: Marking mapping as bad: " << e.queryStartPos << "-" << e.queryEndPos << std::endl; + std::cerr << "DEBUG: Marking mapping as bad: QueryID=" << e.querySeqId + << ", QueryStart=" << e.queryStartPos + << ", QueryEnd=" << e.queryEndPos + << ", RefID=" << e.refSeqId + << ", RefStart=" << e.refStartPos + << ", RefEnd=" << e.refEndPos + << ", Strand=" << (e.strand == strnd::FWD ? "+" : "-") << std::endl; }); //Initialize object of Helper struct @@ -211,8 +217,13 @@ namespace skch eventSchedule.emplace_back (readMappings[i].queryStartPos, event::BEGIN, i); eventSchedule.emplace_back (readMappings[i].queryEndPos, event::END, i); std::cerr << "DEBUG: Added events for mapping " << i << ": " - << readMappings[i].queryStartPos << " (BEGIN), " - << readMappings[i].queryEndPos << " (END)" << std::endl; + << "QueryID=" << readMappings[i].querySeqId + << ", QueryStart=" << readMappings[i].queryStartPos + << " (BEGIN), QueryEnd=" << readMappings[i].queryEndPos + << " (END), RefID=" << readMappings[i].refSeqId + << ", RefStart=" << readMappings[i].refStartPos + << ", RefEnd=" << readMappings[i].refEndPos + << ", Strand=" << (readMappings[i].strand == strnd::FWD ? "+" : "-") << std::endl; } std::sort(eventSchedule.begin(), eventSchedule.end()); @@ -232,15 +243,30 @@ namespace skch //update sweep line status by adding/removing segments std::for_each(it, it2, [&](const eventRecord_t &e) { + int idx = std::get<2>(e); if (std::get<1>(e) == event::BEGIN) { - bst.insert (std::get<2>(e)); - std::cerr << "DEBUG: Inserted segment " << std::get<2>(e) << " into BST" << std::endl; + bst.insert (idx); + std::cerr << "DEBUG: Inserted segment " << idx << " into BST: " + << "QueryID=" << readMappings[idx].querySeqId + << ", QueryStart=" << readMappings[idx].queryStartPos + << ", QueryEnd=" << readMappings[idx].queryEndPos + << ", RefID=" << readMappings[idx].refSeqId + << ", RefStart=" << readMappings[idx].refStartPos + << ", RefEnd=" << readMappings[idx].refEndPos + << ", Strand=" << (readMappings[idx].strand == strnd::FWD ? "+" : "-") << std::endl; } else { - bst.erase (std::get<2>(e)); - std::cerr << "DEBUG: Erased segment " << std::get<2>(e) << " from BST" << std::endl; + bst.erase (idx); + std::cerr << "DEBUG: Erased segment " << idx << " from BST: " + << "QueryID=" << readMappings[idx].querySeqId + << ", QueryStart=" << readMappings[idx].queryStartPos + << ", QueryEnd=" << readMappings[idx].queryEndPos + << ", RefID=" << readMappings[idx].refSeqId + << ", RefStart=" << readMappings[idx].refStartPos + << ", RefEnd=" << readMappings[idx].refEndPos + << ", Strand=" << (readMappings[idx].strand == strnd::FWD ? "+" : "-") << std::endl; } }); @@ -257,7 +283,13 @@ namespace skch readMappings.erase( std::remove_if(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ if (e.discard == 1 || e.overlapped == 1) { - std::cerr << "DEBUG: Removing mapping: " << e.queryStartPos << "-" << e.queryEndPos << std::endl; + std::cerr << "DEBUG: Removing mapping: QueryID=" << e.querySeqId + << ", QueryStart=" << e.queryStartPos + << ", QueryEnd=" << e.queryEndPos + << ", RefID=" << e.refSeqId + << ", RefStart=" << e.refStartPos + << ", RefEnd=" << e.refEndPos + << ", Strand=" << (e.strand == strnd::FWD ? "+" : "-") << std::endl; return true; } return false; From 7c143410928603adc7cc9e16df9e8d6b0643fb36 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Mon, 16 Sep 2024 21:05:38 -0500 Subject: [PATCH 25/29] fix: Recalculate blockNucIdentity and handle edge cases in score calculation --- src/map/include/computeMap.hpp | 3 +++ src/map/include/filter.hpp | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index fdb152be..88a5ed1c 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1729,6 +1729,9 @@ namespace skch mergedMapping.conservedSketches = totalConservedSketches; mergedMapping.sketchSize = totalSketchSize; + // Calculate blockNucIdentity + mergedMapping.blockNucIdentity = mergedMapping.nucIdentity; + // Ensure other fields are properly set mergedMapping.approxMatches = std::round(mergedMapping.nucIdentity * mergedMapping.blockLength / 100.0); mergedMapping.discard = 0; diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp index 875a6b91..b9c9d193 100644 --- a/src/map/include/filter.hpp +++ b/src/map/include/filter.hpp @@ -43,7 +43,12 @@ namespace skch Helper(MappingResultsVector_t &v) : vec(v) {} - double get_score(const int x) const {return vec[x].blockNucIdentity * log(vec[x].blockLength); } + double get_score(const int x) const { + if (vec[x].blockLength <= 0 || vec[x].blockNucIdentity <= 0) { + return std::numeric_limits::lowest(); + } + return vec[x].blockNucIdentity * std::log(static_cast(vec[x].blockLength)); + } //Greater than comparison by score and begin position //used to define order in BST From 60ba144ae676602a3948c0a0485223f5df53d799 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Mon, 16 Sep 2024 21:07:11 -0500 Subject: [PATCH 26/29] fix: Remove debug statements from liFilterAlgorithm --- src/map/include/filter.hpp | 62 +------------------------------------- 1 file changed, 1 insertion(+), 61 deletions(-) diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp index b9c9d193..71042b44 100644 --- a/src/map/include/filter.hpp +++ b/src/map/include/filter.hpp @@ -182,27 +182,14 @@ namespace skch template void liFilterAlgorithm(VecIn &readMappings, int secondaryToKeep, bool dropRand, double overlapThreshold) { - std::cerr << "DEBUG: Entering liFilterAlgorithm" << std::endl; - std::cerr << "DEBUG: Initial readMappings size: " << readMappings.size() << std::endl; - if(readMappings.size() <= 1) - { - std::cerr << "DEBUG: readMappings size <= 1, returning" << std::endl; return; - } //Initially mark all mappings as bad //Maintain the order of this vector till end of this function std::for_each(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ e.discard = 1; e.overlapped = 0; - std::cerr << "DEBUG: Marking mapping as bad: QueryID=" << e.querySeqId - << ", QueryStart=" << e.queryStartPos - << ", QueryEnd=" << e.queryEndPos - << ", RefID=" << e.refSeqId - << ", RefStart=" << e.refStartPos - << ", RefEnd=" << e.refEndPos - << ", Strand=" << (e.strand == strnd::FWD ? "+" : "-") << std::endl; }); //Initialize object of Helper struct @@ -221,24 +208,13 @@ namespace skch { eventSchedule.emplace_back (readMappings[i].queryStartPos, event::BEGIN, i); eventSchedule.emplace_back (readMappings[i].queryEndPos, event::END, i); - std::cerr << "DEBUG: Added events for mapping " << i << ": " - << "QueryID=" << readMappings[i].querySeqId - << ", QueryStart=" << readMappings[i].queryStartPos - << " (BEGIN), QueryEnd=" << readMappings[i].queryEndPos - << " (END), RefID=" << readMappings[i].refSeqId - << ", RefStart=" << readMappings[i].refStartPos - << ", RefEnd=" << readMappings[i].refEndPos - << ", Strand=" << (readMappings[i].strand == strnd::FWD ? "+" : "-") << std::endl; } std::sort(eventSchedule.begin(), eventSchedule.end()); - std::cerr << "DEBUG: Sorted event schedule" << std::endl; //Execute the plane sweep algorithm for(auto it = eventSchedule.begin(); it!= eventSchedule.end();) { - std::cerr << "DEBUG: Processing event at position " << std::get<0>(*it) << std::endl; - //Find events that correspond to current position auto it2 = std::find_if(it, eventSchedule.end(), [&](const eventRecord_t &e) { @@ -250,59 +226,23 @@ namespace skch { int idx = std::get<2>(e); if (std::get<1>(e) == event::BEGIN) - { bst.insert (idx); - std::cerr << "DEBUG: Inserted segment " << idx << " into BST: " - << "QueryID=" << readMappings[idx].querySeqId - << ", QueryStart=" << readMappings[idx].queryStartPos - << ", QueryEnd=" << readMappings[idx].queryEndPos - << ", RefID=" << readMappings[idx].refSeqId - << ", RefStart=" << readMappings[idx].refStartPos - << ", RefEnd=" << readMappings[idx].refEndPos - << ", Strand=" << (readMappings[idx].strand == strnd::FWD ? "+" : "-") << std::endl; - } else - { bst.erase (idx); - std::cerr << "DEBUG: Erased segment " << idx << " from BST: " - << "QueryID=" << readMappings[idx].querySeqId - << ", QueryStart=" << readMappings[idx].queryStartPos - << ", QueryEnd=" << readMappings[idx].queryEndPos - << ", RefID=" << readMappings[idx].refSeqId - << ", RefStart=" << readMappings[idx].refStartPos - << ", RefEnd=" << readMappings[idx].refEndPos - << ", Strand=" << (readMappings[idx].strand == strnd::FWD ? "+" : "-") << std::endl; - } }); - std::cerr << "DEBUG: Calling markGood" << std::endl; //mark mappings as good obj.markGood(bst, secondaryToKeep, dropRand, overlapThreshold); it = it2; } - std::cerr << "DEBUG: Removing bad mappings" << std::endl; //Remove bad mappings - auto initialSize = readMappings.size(); readMappings.erase( std::remove_if(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ - if (e.discard == 1 || e.overlapped == 1) { - std::cerr << "DEBUG: Removing mapping: QueryID=" << e.querySeqId - << ", QueryStart=" << e.queryStartPos - << ", QueryEnd=" << e.queryEndPos - << ", RefID=" << e.refSeqId - << ", RefStart=" << e.refStartPos - << ", RefEnd=" << e.refEndPos - << ", Strand=" << (e.strand == strnd::FWD ? "+" : "-") << std::endl; - return true; - } - return false; + return e.discard == 1 || e.overlapped == 1; }), readMappings.end()); - - std::cerr << "DEBUG: Removed " << (initialSize - readMappings.size()) << " mappings" << std::endl; - std::cerr << "DEBUG: Final readMappings size: " << readMappings.size() << std::endl; } /** From 9bdef533383a225b6ac8f9d5031d584da4067b08 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Mon, 16 Sep 2024 21:11:31 -0500 Subject: [PATCH 27/29] fix: Remove debug statements from markGood function --- src/map/include/filter.hpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp index 71042b44..a6fe6ad3 100644 --- a/src/map/include/filter.hpp +++ b/src/map/include/filter.hpp @@ -93,9 +93,6 @@ namespace skch template inline void markGood(Type &L, int secondaryToKeep, bool dropRand, double overlapThreshold) { - std::cerr << "DEBUG: Entering markGood" << std::endl; - std::cerr << "DEBUG: L size: " << L.size() << ", secondaryToKeep: " << secondaryToKeep << ", dropRand: " << dropRand << ", overlapThreshold: " << overlapThreshold << std::endl; - //first segment in the set order auto beg = L.begin(); @@ -105,15 +102,12 @@ namespace skch auto it = L.begin(); for( ; it != L.end(); it++) { - std::cerr << "DEBUG: Checking mapping " << *it << ", score: " << get_score(*it) << std::endl; if ((this->greater_score(*beg, *it) || vec[*it].discard == 0) && kept > secondaryToKeep) { - std::cerr << "DEBUG: Breaking loop, kept: " << kept << std::endl; break; } vec[*it].discard = 0; ++kept; - std::cerr << "DEBUG: Marked mapping " << *it << " as good, kept: " << kept << std::endl; } auto kit = it; @@ -123,11 +117,9 @@ namespace skch int idx = *it; for (auto it2 = L.begin(); it2 != kit; it2++) { double overlap = get_overlap(idx, *it2); - std::cerr << "DEBUG: Checking overlap between " << idx << " and " << *it2 << ": " << overlap << std::endl; if (overlap > overlapThreshold) { vec[idx].overlapped = 1; // Mark as bad if it overlaps >50% with the best mapping vec[idx].discard = 1; - std::cerr << "DEBUG: Marked mapping " << idx << " as overlapped and discarded" << std::endl; break; } } @@ -138,7 +130,6 @@ namespace skch // we will hash the mapping struct and keep the one with the secondaryToKeep with the lowest hash value if (kept > secondaryToKeep && dropRand) { - std::cerr << "DEBUG: Entering dropRand section, kept: " << kept << std::endl; // we will use hashes of the mapping structs to break ties // first we'll make a vector of the mappings including the hashes std::vector> score_and_hash; // The tuple is (score, hash, pointer to the mapping) @@ -147,30 +138,24 @@ namespace skch if(vec[*it].discard == 0) { score_and_hash.emplace_back(get_score(*it), vec[*it].hash(), &vec[*it]); - std::cerr << "DEBUG: Added mapping " << *it << " to score_and_hash, score: " << get_score(*it) << ", hash: " << vec[*it].hash() << std::endl; } } // now we'll sort the vector by score and hash std::sort(score_and_hash.begin(), score_and_hash.end(), std::greater{}); - std::cerr << "DEBUG: Sorted score_and_hash, size: " << score_and_hash.size() << std::endl; // reset kept counter kept = 0; for (auto& x : score_and_hash) { std::get<2>(x)->discard = 1; - std::cerr << "DEBUG: Initially marked mapping as discarded, score: " << std::get<0>(x) << ", hash: " << std::get<1>(x) << std::endl; } // now we mark the best to keep for (auto& x : score_and_hash) { if (kept > secondaryToKeep) { - std::cerr << "DEBUG: Reached secondaryToKeep limit, breaking" << std::endl; break; } std::get<2>(x)->discard = 0; ++kept; - std::cerr << "DEBUG: Marked mapping as kept, score: " << std::get<0>(x) << ", hash: " << std::get<1>(x) << ", kept: " << kept << std::endl; } } - std::cerr << "DEBUG: Exiting markGood, final kept: " << kept << std::endl; } }; From cd0542ea677c86e8c08cfc1e761f564b890986ec Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Wed, 18 Sep 2024 16:18:07 -0500 Subject: [PATCH 28/29] fix: Correct calculation of query_end and target_end in tail patching --- src/common/wflign/src/wflign_patch.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 65d467e7..6eae79e6 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -1397,17 +1397,17 @@ void write_merged_alignment( query_pos = query_length; target_pos = target_length; - // Add safety checks first - uint64_t new_query_end = query_offset + query_length; - uint64_t new_target_end = target_offset + target_length + actual_extension; + // Calculate new ends relative to the segment being aligned + uint64_t new_query_end = query_length; + uint64_t new_target_end = target_length + actual_extension; - if (new_query_end > query_total_length || new_target_end > target_total_length) { + if (query_offset + new_query_end > query_total_length || target_offset + new_target_end > target_total_length) { std::cerr << "[wfmash::patch] Warning: Alignment extends beyond sequence bounds. Truncating." << std::endl; } - // Adjust query_end and target_end, ensuring we don't exceed the total lengths - query_end = std::min(new_query_end, query_total_length); - target_end = std::min(new_target_end, target_total_length); + // Adjust query_end and target_end, ensuring we don't exceed the segment lengths + query_end = std::min(new_query_end, query_length); + target_end = std::min(new_target_end, target_length + actual_extension); } } From 240a0d36389e9c81795d51fdea989dbea1b5e954 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Wed, 18 Sep 2024 18:20:20 -0500 Subject: [PATCH 29/29] add pafcheck to validation --- .github/workflows/test_on_push.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index 7d34b429..4f813f3b 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -53,8 +53,12 @@ jobs: override: true - name: Install wgatools run: cargo install --git https://github.com/wjwei-handsome/wgatools.git + - name: Install wgatools + run: cargo install --git https://github.com/ekg/pafcheck.git - name: Run wfmash and generate PAF - run: build/bin/wfmash -t 8 -n 1 -k 19 -s 5000 -p 90 -c 30k -P 50k -T SGDref -Q S288C -Y '#' data/scerevisiae8.fa.gz data/scerevisiae8.fa.gz > test.paf + run: build/bin/wfmash -t 8 -T SGDref -Q S288C -Y '#' data/scerevisiae8.fa.gz > test.paf + - name: check PAF coordinates and extended CIGAR validity + run: pafcheck --query-fasta data/scerevisiae8.fa.gz --paf test.paf - name: Convert PAF to MAF using wgatools run: wgatools paf2maf --target data/scerevisiae8.fa.gz --query data/scerevisiae8.fa.gz test.paf > test.maf - name: Check if MAF file is not empty