From 9740f8ffda35d77e37d7cc7ac860395149a1ca67 Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Thu, 12 Sep 2024 14:40:44 -0500
Subject: [PATCH 01/29] make the direction hard cutoff in chaining more correct

---
 src/map/include/computeMap.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index fd5519af..946cfb49 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1641,11 +1641,11 @@ namespace skch
                       int64_t query_dist = std::numeric_limits<int64_t>::max();
                       auto dist = std::numeric_limits<double>::max();
                       auto awed = std::numeric_limits<double>::max();
-                      if (it->strand == strnd::FWD && it->queryStartPos <= it2->queryStartPos) {
+                      if (it->strand == strnd::FWD && it->queryEndPos <= it2->queryStartPos) {
                           query_dist = it2->queryStartPos - it->queryEndPos;
                           dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
                           awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
-                      } else if (it->strand != strnd::FWD && it->queryEndPos >= it2->queryEndPos) {
+                      } else if (it->strand != strnd::FWD && it->queryStartPos >= it2->queryEndPos) {
                           query_dist = it->queryStartPos - it2->queryEndPos;
                           dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
                           awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);

From 41e84af00aaef0482299742f8cbdb6d95f7d2f4c Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Thu, 12 Sep 2024 14:46:35 -0500
Subject: [PATCH 02/29] more stringent query based sort

---
 src/map/include/computeMap.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 946cfb49..aaf62bf8 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1684,7 +1684,7 @@ namespace skch
 
               // Sort the chain by query, then reference, start position
               std::sort(it, it_end, [](const MappingResult &a, const MappingResult &b) {
-                  return std::tie(a.queryStartPos, a.refStartPos) < std::tie(b.queryStartPos, b.refStartPos);
+                  return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos);
               });
 
               // if we have an infinite max mappinng length, we should just emit the chain here

From d7ca6aaa565764cda0eb021ae45240e1e56090eb Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Fri, 13 Sep 2024 22:48:29 -0500
Subject: [PATCH 03/29] feat: Implement new chaining logic for mapping merging

---
 src/map/include/computeMap.hpp | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index aaf62bf8..7b13eca8 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1625,7 +1625,7 @@ namespace skch
 
           //Start the procedure to identify the chains
           for (auto it = readMappings.begin(); it != readMappings.end(); it++) {
-              std::vector<std::tuple<double, double, int64_t>> distances;
+              std::vector<std::pair<double, int64_t>> chainPairs;
               for (auto it2 = std::next(it); it2 != readMappings.end(); it2++) {
                   //If this mapping is for the same segment, ignore
                   if (it2->refSeqId == it->refSeqId && it2->queryStartPos == it->queryStartPos) {
@@ -1651,18 +1651,25 @@ namespace skch
                           awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
                       }
                       if (dist < max_dist) {
-                          distances.push_back(std::make_tuple(awed, dist, it2->splitMappingId));
+                          chainPairs.push_back(std::make_pair(awed, it2->splitMappingId));
                       }
                   }
               }
-              if (distances.size()) {
-                  std::sort(distances.begin(), distances.end());
-                  disjoint_sets.unite(it->splitMappingId, std::get<2>(distances.front()));
+              if (!chainPairs.empty()) {
+                  std::sort(chainPairs.begin(), chainPairs.end());
+                  it->chainPairs = std::move(chainPairs);
               }
           }
 
-          //Assign the merged mapping ids
-          for (auto it =  readMappings.begin(); it != readMappings.end(); it++) {
+          // Second pass: Apply disjoint sets union operation
+          for (auto it = readMappings.begin(); it != readMappings.end(); it++) {
+              if (!it->chainPairs.empty()) {
+                  disjoint_sets.unite(it->splitMappingId, it->chainPairs.front().second);
+              }
+          }
+
+          // Assign the merged mapping ids
+          for (auto it = readMappings.begin(); it != readMappings.end(); it++) {
               it->splitMappingId = disjoint_sets.find(it->splitMappingId);
           }
 

From 09242d801e93b993f3a70398d7448739072a3ee8 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Fri, 13 Sep 2024 22:50:20 -0500
Subject: [PATCH 04/29] feat: Apply chaining logic to second mapping and sort
 chain pairs in ascending order

---
 src/map/include/computeMap.hpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 7b13eca8..d95f517d 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1651,19 +1651,16 @@ namespace skch
                           awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
                       }
                       if (dist < max_dist) {
-                          chainPairs.push_back(std::make_pair(awed, it2->splitMappingId));
+                          it2->chainPairs.push_back(std::make_pair(awed, it->splitMappingId));
                       }
                   }
               }
-              if (!chainPairs.empty()) {
-                  std::sort(chainPairs.begin(), chainPairs.end());
-                  it->chainPairs = std::move(chainPairs);
-              }
           }
 
-          // Second pass: Apply disjoint sets union operation
+          // Second pass: Sort chain pairs and apply disjoint sets union operation
           for (auto it = readMappings.begin(); it != readMappings.end(); it++) {
               if (!it->chainPairs.empty()) {
+                  std::sort(it->chainPairs.begin(), it->chainPairs.end());
                   disjoint_sets.unite(it->splitMappingId, it->chainPairs.front().second);
               }
           }

From 869da9944525bb3c5bd7b3104361bb618f6d405e Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Fri, 13 Sep 2024 22:53:27 -0500
Subject: [PATCH 05/29] feat: Modify mergeMappingsInRange function to improve
 merging logic

---
 src/map/include/computeMap.hpp | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index d95f517d..d5a0469e 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1604,12 +1604,12 @@ namespace skch
 
           if(readMappings.size() < 2) return;
 
-          //Sort the mappings by reference (then query) position
+          //Sort the mappings by query position, then reference sequence id, then reference position
           std::sort(
               readMappings.begin(), readMappings.end(),
               [](const MappingResult &a, const MappingResult &b) {
-                  return std::tie(a.refSeqId, a.refStartPos, a.queryStartPos)
-                      < std::tie(b.refSeqId, b.refStartPos, b.queryStartPos);
+                  return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos)
+                      < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos);
               });
 
           //First assign a unique id to each split mapping in the sorted order
@@ -1631,25 +1631,16 @@ namespace skch
                   if (it2->refSeqId == it->refSeqId && it2->queryStartPos == it->queryStartPos) {
                       continue;
                   }
-                  //If this mapping is too far from current mapping being evaluated, stop finding a merge
-                  if (it2->refSeqId != it->refSeqId || it2->refStartPos > it->refEndPos + max_dist) {
+                  //If this mapping is too far from current mapping being evaluated in the query, stop finding a merge
+                  if (it2->queryStartPos > it->queryEndPos + max_dist) {
                       break;
                   }
                   //If the next mapping is within range, check if it's in range and
                   if (it2->strand == it->strand) {
                       int64_t ref_dist = it2->refStartPos - it->refEndPos;
-                      int64_t query_dist = std::numeric_limits<int64_t>::max();
-                      auto dist = std::numeric_limits<double>::max();
-                      auto awed = std::numeric_limits<double>::max();
-                      if (it->strand == strnd::FWD && it->queryEndPos <= it2->queryStartPos) {
-                          query_dist = it2->queryStartPos - it->queryEndPos;
-                          dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
-                          awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
-                      } else if (it->strand != strnd::FWD && it->queryStartPos >= it2->queryEndPos) {
-                          query_dist = it->queryStartPos - it2->queryEndPos;
-                          dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
-                          awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
-                      }
+                      int64_t query_dist = it2->queryStartPos - it->queryEndPos;
+                      auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
+                      auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
                       if (dist < max_dist) {
                           it2->chainPairs.push_back(std::make_pair(awed, it->splitMappingId));
                       }

From 68c9df1a8ba8f249232eb3ccc29fac1103cf481d Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Fri, 13 Sep 2024 23:45:13 -0500
Subject: [PATCH 06/29] simplify sorting and just keep best chain pair option
 around

---
 src/map/include/base_types.hpp |  2 ++
 src/map/include/computeMap.hpp | 38 +++++++++++++++-------------------
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp
index 9505f3b9..f0b65d87 100644
--- a/src/map/include/base_types.hpp
+++ b/src/map/include/base_types.hpp
@@ -178,6 +178,8 @@ namespace skch
     uint8_t discard;                                    // set to 1 for deletion
     bool overlapped;                                    // set to true if this mapping is overlapped with another mapping
     bool selfMapFilter;                                 // set to true if a long-to-short mapping in all-vs-all mode (we report short as the query)
+    double chainPairScore;                              // best score for potential chain pair
+    int64_t chainPairId;                                // best partner mapping for potential chain pair
 
     offset_t qlen() {                                   //length of this mapping on query axis
       return queryEndPos - queryStartPos + 1;
diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index d5a0469e..48e87607 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1616,6 +1616,8 @@ namespace skch
           for (auto it = readMappings.begin(); it != readMappings.end(); it++) {
               it->splitMappingId = std::distance(readMappings.begin(), it);
               it->discard = 0;
+              it->chainPairScore = std::numeric_limits<double>::max();
+              it->chainPairId = std::numeric_limits<int64_t>::max();
           }
 
           // set up our union find data structure to track merges
@@ -1625,7 +1627,10 @@ namespace skch
 
           //Start the procedure to identify the chains
           for (auto it = readMappings.begin(); it != readMappings.end(); it++) {
-              std::vector<std::pair<double, int64_t>> chainPairs;
+              // we we merge only with the best-scored previous mapping in query space
+              if (it->chainPairScore != std::numeric_limits<double>::max()) {
+                  disjoint_sets.unite(it->splitMappingId, it->chainPairId);
+              }
               for (auto it2 = std::next(it); it2 != readMappings.end(); it2++) {
                   //If this mapping is for the same segment, ignore
                   if (it2->refSeqId == it->refSeqId && it2->queryStartPos == it->queryStartPos) {
@@ -1638,24 +1643,23 @@ namespace skch
                   //If the next mapping is within range, check if it's in range and
                   if (it2->strand == it->strand) {
                       int64_t ref_dist = it2->refStartPos - it->refEndPos;
+                      if (ref_dist < 0) {
+                          continue;
+                      }
                       int64_t query_dist = it2->queryStartPos - it->queryEndPos;
+                      if (query_dist < 0) {
+                          continue;
+                      }
                       auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
                       auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
-                      if (dist < max_dist) {
-                          it2->chainPairs.push_back(std::make_pair(awed, it->splitMappingId));
+                      if (dist < max_dist && it2->chainPairScore > awed) {
+                          it2->chainPairId = it->splitMappingId;
+                          it2->chainPairScore = awed;
                       }
                   }
               }
           }
 
-          // Second pass: Sort chain pairs and apply disjoint sets union operation
-          for (auto it = readMappings.begin(); it != readMappings.end(); it++) {
-              if (!it->chainPairs.empty()) {
-                  std::sort(it->chainPairs.begin(), it->chainPairs.end());
-                  disjoint_sets.unite(it->splitMappingId, it->chainPairs.front().second);
-              }
-          }
-
           // Assign the merged mapping ids
           for (auto it = readMappings.begin(); it != readMappings.end(); it++) {
               it->splitMappingId = disjoint_sets.find(it->splitMappingId);
@@ -1666,22 +1670,14 @@ namespace skch
               readMappings.begin(),
               readMappings.end(),
               [](const MappingResult &a, const MappingResult &b) {
-                  return a.splitMappingId < b.splitMappingId
-                      || (a.splitMappingId == b.splitMappingId
-                          && (a.queryStartPos < b.queryStartPos
-                              || (a.queryStartPos == b.queryStartPos
-                                  && a.refStartPos < b.refStartPos)));
+                  return std::tie(a.splitMappingId, a.queryStartPos, a.refSeqId, a.refStartPos)
+                      < std::tie(b.splitMappingId, b.queryStartPos, b.refSeqId, b.refStartPos);
               });
 
           for(auto it = readMappings.begin(); it != readMappings.end();) {
               //Bucket by each chain
               auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} );
 
-              // Sort the chain by query, then reference, start position
-              std::sort(it, it_end, [](const MappingResult &a, const MappingResult &b) {
-                  return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos);
-              });
-
               // if we have an infinite max mappinng length, we should just emit the chain here
               if (param.max_mapping_length == std::numeric_limits<int64_t>::max()) {
                   // Process any remaining fragment

From 0e8bcc93fcb529d66d46331c7bdfe18e09b5475a Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Fri, 13 Sep 2024 23:53:48 -0500
Subject: [PATCH 07/29] get rid of axis weighted distance

---
 src/map/include/computeMap.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 48e87607..bb2ed304 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1651,10 +1651,10 @@ namespace skch
                           continue;
                       }
                       auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
-                      auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
-                      if (dist < max_dist && it2->chainPairScore > awed) {
+                      //auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
+                      if (dist < max_dist && it2->chainPairScore > dist) {
                           it2->chainPairId = it->splitMappingId;
-                          it2->chainPairScore = awed;
+                          it2->chainPairScore = dist;
                       }
                   }
               }

From fefb4ee0891406aa8ebff9ace81a595648f7d33d Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Sat, 14 Sep 2024 00:01:30 -0500
Subject: [PATCH 08/29] comment cleanup

---
 src/map/include/computeMap.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index bb2ed304..9a6d213a 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1640,7 +1640,7 @@ namespace skch
                   if (it2->queryStartPos > it->queryEndPos + max_dist) {
                       break;
                   }
-                  //If the next mapping is within range, check if it's in range and
+                  //If the next mapping is within range, we can potentially merge
                   if (it2->strand == it->strand) {
                       int64_t ref_dist = it2->refStartPos - it->refEndPos;
                       if (ref_dist < 0) {
@@ -1651,7 +1651,6 @@ namespace skch
                           continue;
                       }
                       auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
-                      //auto awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
                       if (dist < max_dist && it2->chainPairScore > dist) {
                           it2->chainPairId = it->splitMappingId;
                           it2->chainPairScore = dist;

From 377f0c5ea4fdfdd487050ad9226e0cd375c73fd6 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Sat, 14 Sep 2024 08:50:26 -0500
Subject: [PATCH 09/29] refactor: processChainWithSplits

---
 src/map/include/computeMap.hpp | 175 ++++++++++++++++-----------------
 1 file changed, 86 insertions(+), 89 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 9a6d213a..5dcdbd2e 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1677,109 +1677,106 @@ namespace skch
               //Bucket by each chain
               auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} );
 
-              // if we have an infinite max mappinng length, we should just emit the chain here
               if (param.max_mapping_length == std::numeric_limits<int64_t>::max()) {
-                  // Process any remaining fragment
+                  // Process the entire chain as one fragment
                   processMappingFragment(it, it_end);
-                  it = it_end;
-                  continue;
+              } else {
+                  // Process the chain with potential splits
+                  processChainWithSplits(it, it_end);
               }
+              
+              // Move the iterator to the end of the processed chain
+              it = it_end;
+          }
 
-              // tweak start and end positions of consecutive mappings
-              // TODO: XXX double check that the consecutive mappings are not overlapping!!!
-              // extra: force global alignment by patching head and tail to mapping start and end coordinates
-              //adjustConsecutiveMappings(it, it_end, param.segLength);
-
-              // First pass: Mark cuttable positions
-              const int consecutive_mappings_window = 4; // Configurable parameter
-              std::vector<bool> is_cuttable(std::distance(it, it_end), false);
-    
-              auto window_start = it;
-              int consecutive_count = 0;
+          // After processing all chains, remove discarded mappings
+          readMappings.erase(
+              std::remove_if(readMappings.begin(), readMappings.end(), 
+                             [](const MappingResult& e) { return e.discard == 1; }),
+              readMappings.end()
+              );
+      }
 
-              for (auto current = it; current != it_end; ++current) {
-                  is_cuttable[std::distance(it, current)] = true;
-              }
+      /**
+       * @brief Process a chain of mappings, potentially splitting it into smaller fragments
+       * @param begin Iterator to the start of the chain
+       * @param end Iterator to the end of the chain
+       */
+      void processChainWithSplits(std::vector<MappingResult>::iterator begin, std::vector<MappingResult>::iterator end) {
+          if (begin == end) return;
 
-              // mark anything positions on either side of a discontinuity as non-cuttable
-              for (auto current = it; current != it_end; ++current) {
-                  if (current == it || current == std::prev(it_end)) {
-                      continue;
-                  }
-                  if (current->queryStartPos - std::prev(current)->queryEndPos > param.segLength / 5
-                      || current->refStartPos - std::prev(current)->refEndPos > param.segLength / 5) {
-                      is_cuttable[std::distance(it, current) - 1] = false;
-                      is_cuttable[std::distance(it, current)] = false;
-                  }
+          std::vector<bool> is_cuttable(std::distance(begin, end), true);
+          
+          // Mark positions that are not cuttable (near discontinuities)
+          for (auto it = std::next(begin); it != end; ++it) {
+              auto prev = std::prev(it);
+              if (it->queryStartPos - prev->queryEndPos > param.segLength / 5 ||
+                  it->refStartPos - prev->refEndPos > param.segLength / 5) {
+                  is_cuttable[std::distance(begin, prev)] = false;
+                  is_cuttable[std::distance(begin, it)] = false;
               }
+          }
 
-              adjustConsecutiveMappings(it, it_end, param.segLength);
-
-              auto fragment_start = it;
-              auto current = it;
-              offset_t accumulate_length = 0;
-              double accumulate_nuc_identity = 0.0;
-
-              while (current != it_end) {
-                  // chunks are made across the query
-                  accumulate_length += current->queryEndPos - current->queryStartPos;
-                  if (accumulate_length >= param.max_mapping_length
-                      && is_cuttable[std::distance(it, current)]) {
-                      if (current != fragment_start) {
-                          adjustConsecutiveMappings(std::prev(current), current, param.chain_gap);
-                      }
-                      processMappingFragment(fragment_start, current);
-                      fragment_start = current;
-                      accumulate_length = 0;
-                  }
-                  ++current;
+          adjustConsecutiveMappings(begin, end, param.segLength);
+
+          auto fragment_start = begin;
+          offset_t accumulate_length = 0;
+
+          for (auto it = begin; it != end; ++it) {
+              accumulate_length += it->queryEndPos - it->queryStartPos;
+              
+              if (accumulate_length >= param.max_mapping_length && is_cuttable[std::distance(begin, it)]) {
+                  // Process the fragment up to this point
+                  processMappingFragment(fragment_start, std::next(it));
+                  
+                  // Start a new fragment
+                  fragment_start = std::next(it);
+                  accumulate_length = 0;
               }
+          }
 
-              // Process any remaining fragment
-              processMappingFragment(fragment_start, current);
-
-              // Compute chain statistics and coordinates
-              offset_t target_len_in_full_chain = 0;
-              offset_t query_len_in_full_chain = 0;
-              int n_in_full_chain = std::distance(it, it_end);
-              offset_t chain_start_query = std::numeric_limits<offset_t>::max();
-              offset_t chain_end_query = std::numeric_limits<offset_t>::min();
-              offset_t chain_start_ref = std::numeric_limits<offset_t>::max();
-              offset_t chain_end_ref = std::numeric_limits<offset_t>::min();
-              for (auto x = it; x != it_end; ++x) {
-                  chain_start_query = std::min(chain_start_query, x->queryStartPos);
-                  chain_end_query = std::max(chain_end_query, x->queryEndPos);
-                  chain_start_ref = std::min(chain_start_ref, x->refStartPos);
-                  chain_end_ref = std::max(chain_end_ref, x->refEndPos);
-                  target_len_in_full_chain += x->refEndPos - x->refStartPos;
-                  query_len_in_full_chain += x->queryEndPos - x->queryStartPos;
-                  accumulate_nuc_identity += current->nucIdentity;
-              }
+          // Process any remaining fragment
+          if (fragment_start != end) {
+              processMappingFragment(fragment_start, end);
+          }
 
-              // Assign chain statistics to all mappings in the chain
-              auto chain_nuc_identity = accumulate_nuc_identity / n_in_full_chain;
-              auto block_length = std::max(chain_end_query - chain_start_query, chain_end_ref - chain_start_ref);
-              for (auto x = it; x != it_end; ++x) {
-                  auto& m = *x;
-                  m.n_merged = n_in_full_chain;
-                  m.blockLength = block_length;
-                  m.blockQueryStartPos = chain_start_query;
-                  m.blockQueryEndPos = chain_end_query;
-                  m.blockRefStartPos = chain_start_ref;
-                  m.blockRefEndPos = chain_end_ref;
-                  m.blockNucIdentity = chain_nuc_identity;
-              }
+          // Compute and assign chain statistics
+          computeChainStatistics(begin, end);
+      }
 
-              // Move the iterator to the end of the processed chain
-              it = it_end;
+      /**
+       * @brief Compute and assign chain statistics to all mappings in the chain
+       * @param begin Iterator to the start of the chain
+       * @param end Iterator to the end of the chain
+       */
+      void computeChainStatistics(std::vector<MappingResult>::iterator begin, std::vector<MappingResult>::iterator end) {
+          offset_t chain_start_query = std::numeric_limits<offset_t>::max();
+          offset_t chain_end_query = std::numeric_limits<offset_t>::min();
+          offset_t chain_start_ref = std::numeric_limits<offset_t>::max();
+          offset_t chain_end_ref = std::numeric_limits<offset_t>::min();
+          double accumulate_nuc_identity = 0.0;
+          int n_in_full_chain = std::distance(begin, end);
+
+          for (auto it = begin; it != end; ++it) {
+              chain_start_query = std::min(chain_start_query, it->queryStartPos);
+              chain_end_query = std::max(chain_end_query, it->queryEndPos);
+              chain_start_ref = std::min(chain_start_ref, it->refStartPos);
+              chain_end_ref = std::max(chain_end_ref, it->refEndPos);
+              accumulate_nuc_identity += it->nucIdentity;
           }
 
-          // After processing all chains, remove discarded mappings
-          readMappings.erase(
-              std::remove_if(readMappings.begin(), readMappings.end(), 
-                             [](const MappingResult& e) { return e.discard == 1; }),
-              readMappings.end()
-              );
+          auto chain_nuc_identity = accumulate_nuc_identity / n_in_full_chain;
+          auto block_length = std::max(chain_end_query - chain_start_query, chain_end_ref - chain_start_ref);
+
+          for (auto it = begin; it != end; ++it) {
+              it->n_merged = n_in_full_chain;
+              it->blockLength = block_length;
+              it->blockQueryStartPos = chain_start_query;
+              it->blockQueryEndPos = chain_end_query;
+              it->blockRefStartPos = chain_start_ref;
+              it->blockRefEndPos = chain_end_ref;
+              it->blockNucIdentity = chain_nuc_identity;
+          }
       }
 
      /**

From 810ce0aafa4cda11254471cf65f437a6e4d43b4b Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Sat, 14 Sep 2024 10:01:19 -0500
Subject: [PATCH 10/29] cleanup decision to chain

---
 src/map/include/computeMap.hpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 5dcdbd2e..a4b4222a 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1617,7 +1617,7 @@ namespace skch
               it->splitMappingId = std::distance(readMappings.begin(), it);
               it->discard = 0;
               it->chainPairScore = std::numeric_limits<double>::max();
-              it->chainPairId = std::numeric_limits<int64_t>::max();
+              it->chainPairId = std::numeric_limits<int64_t>::min();
           }
 
           // set up our union find data structure to track merges
@@ -1632,8 +1632,12 @@ namespace skch
                   disjoint_sets.unite(it->splitMappingId, it->chainPairId);
               }
               for (auto it2 = std::next(it); it2 != readMappings.end(); it2++) {
-                  //If this mapping is for the same segment, ignore
-                  if (it2->refSeqId == it->refSeqId && it2->queryStartPos == it->queryStartPos) {
+                  // If this mapping is for a different reference sequence, ignore
+                  if (it2->refSeqId != it->refSeqId) {
+                      continue;
+                  }
+                  //If this mapping is for exactly the same segment, ignore
+                  if (it2->queryStartPos == it->queryStartPos) {
                       continue;
                   }
                   //If this mapping is too far from current mapping being evaluated in the query, stop finding a merge

From cb9472665cba4c6e46dff666979f96a63bfdd88f Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Sat, 14 Sep 2024 10:44:15 -0500
Subject: [PATCH 11/29] best-buddy chain merging

---
 src/map/include/computeMap.hpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index a4b4222a..ba4f3c7a 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1627,6 +1627,8 @@ namespace skch
 
           //Start the procedure to identify the chains
           for (auto it = readMappings.begin(); it != readMappings.end(); it++) {
+              double best_score = std::numeric_limits<double>::max();
+              auto best_it2 = readMappings.end();
               // we we merge only with the best-scored previous mapping in query space
               if (it->chainPairScore != std::numeric_limits<double>::max()) {
                   disjoint_sets.unite(it->splitMappingId, it->chainPairId);
@@ -1647,7 +1649,8 @@ namespace skch
                   //If the next mapping is within range, we can potentially merge
                   if (it2->strand == it->strand) {
                       int64_t ref_dist = it2->refStartPos - it->refEndPos;
-                      if (ref_dist < 0) {
+                      // If we jitter backwards in reference position too far, don't merge
+                      if (ref_dist < -param.segLength/5) {
                           continue;
                       }
                       int64_t query_dist = it2->queryStartPos - it->queryEndPos;
@@ -1655,12 +1658,16 @@ namespace skch
                           continue;
                       }
                       auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
-                      if (dist < max_dist && it2->chainPairScore > dist) {
-                          it2->chainPairId = it->splitMappingId;
-                          it2->chainPairScore = dist;
+                      if (dist < max_dist && best_score > dist && it2->chainPairScore > dist) {
+                          best_it2 = it2;
+                          best_score = dist;
                       }
                   }
               }
+              if (best_it2 != readMappings.end()) {
+                  best_it2->chainPairScore = best_score;
+                  best_it2->chainPairId = it->splitMappingId;
+              }
           }
 
           // Assign the merged mapping ids

From d33a392d4808bf47601830abd7d00d4472bcca42 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 14:48:30 -0500
Subject: [PATCH 12/29] feat: Implement filtering based on maximally merged
 chains

---
 src/map/include/computeMap.hpp | 39 +++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index ba4f3c7a..171cc54d 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1684,6 +1684,43 @@ namespace skch
                       < std::tie(b.splitMappingId, b.queryStartPos, b.refSeqId, b.refStartPos);
               });
 
+          // Create maximallyMergedMappings
+          MappingResultsVector_t maximallyMergedMappings;
+          for(auto it = readMappings.begin(); it != readMappings.end();) {
+              auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} );
+              MappingResult mergedMapping = *it;
+              mergedMapping.queryStartPos = it->queryStartPos;
+              mergedMapping.queryEndPos = std::prev(it_end)->queryEndPos;
+              mergedMapping.refStartPos = it->refStartPos;
+              mergedMapping.refEndPos = std::prev(it_end)->refEndPos;
+              mergedMapping.blockLength = std::max(mergedMapping.refEndPos - mergedMapping.refStartPos,
+                                                   mergedMapping.queryEndPos - mergedMapping.queryStartPos);
+              mergedMapping.n_merged = std::distance(it, it_end);
+              maximallyMergedMappings.push_back(mergedMapping);
+              it = it_end;
+          }
+
+          // Apply filtering to maximallyMergedMappings
+          filterWeakMappings(maximallyMergedMappings, std::floor(param.block_length / param.segLength));
+          if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
+              MappingResultsVector_t filteredMappings;
+              filterByGroup(maximallyMergedMappings, filteredMappings, param.numMappingsForSegment - 1, false);
+              maximallyMergedMappings = std::move(filteredMappings);
+          }
+
+          // Create a set of splitMappingIds that survived filtering
+          std::unordered_set<offset_t> keptChains;
+          for (const auto& mapping : maximallyMergedMappings) {
+              keptChains.insert(mapping.splitMappingId);
+          }
+
+          // Final filtering of original mappings
+          readMappings.erase(
+              std::remove_if(readMappings.begin(), readMappings.end(),
+                             [&keptChains](const MappingResult& e) { return keptChains.count(e.splitMappingId) == 0; }),
+              readMappings.end()
+          );
+
           for(auto it = readMappings.begin(); it != readMappings.end();) {
               //Bucket by each chain
               auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} );
@@ -1705,7 +1742,7 @@ namespace skch
               std::remove_if(readMappings.begin(), readMappings.end(), 
                              [](const MappingResult& e) { return e.discard == 1; }),
               readMappings.end()
-              );
+          );
       }
 
       /**

From 6401eba0a51dd2ef61cbc658bd9883f416f423d6 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 15:05:41 -0500
Subject: [PATCH 13/29] feat: Restructure mapping and filtering logic

---
 src/map/include/computeMap.hpp | 236 ++++++++++++++-------------------
 1 file changed, 103 insertions(+), 133 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 171cc54d..13c38610 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -20,6 +20,8 @@
 #include <filesystem>
 namespace fs = std::filesystem;
 #include <queue>
+#include <algorithm>
+#include <unordered_set>
 
 //Own includes
 #include "map/include/base_types.hpp"
@@ -625,139 +627,105 @@ namespace skch
         MappingResultsVector_t unfilteredMappings;
         int refGroup = this->getRefGroup(input->seqName);
 
-        if(! param.split || input->len <= param.segLength)
+        if (!param.split || input->len <= param.segLength)
         {
-          QueryMetaData <MinVec_Type> Q;
-          Q.seq = &(input->seq)[0u];
-          Q.len = input->len;
-          Q.fullLen = input->len;
-          Q.seqCounter = input->seqCounter;
-          Q.seqName = input->seqName;
-          Q.refGroup = refGroup;
-
-          //Map this sequence
-          mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings);
-
-          // save the output
-          unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end());
-
-          // indicate that we mapped full length
-          split_mapping = false;
-
-          input->progress.increment(input->len);
-        }
-        else  //Split read mapping
-        {
-          int noOverlapFragmentCount = input->len / param.segLength;
-
-          //Map individual non-overlapping fragments in the read
-          for (int i = 0; i < noOverlapFragmentCount; i++)
-          {
-            //Prepare fragment sequence object
             QueryMetaData <MinVec_Type> Q;
-            Q.seq = &(input->seq)[0u] + i * param.segLength;
-            Q.len = param.segLength;
+            Q.seq = &(input->seq)[0u];
+            Q.len = input->len;
             Q.fullLen = input->len;
             Q.seqCounter = input->seqCounter;
             Q.seqName = input->seqName;
             Q.refGroup = refGroup;
 
-            intervalPoints.clear();
-            l1Mappings.clear();
-            l2Mappings.clear();
-
-            //Map this fragment
+            // Map this sequence
             mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings);
-
-            //Adjust query coordinates and length in the reported mapping
-            std::for_each(l2Mappings.begin(), l2Mappings.end(), [&](MappingResult &e){
-                e.queryLen = input->len;
-                e.queryStartPos = i * param.segLength;
-                e.queryEndPos = i * param.segLength + Q.len;
-                });
-
-            // save the output
-            unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end());
-            input->progress.increment(param.segLength);
-          }
-
-          //Map last overlapping fragment to cover the whole read
-          if (noOverlapFragmentCount >= 1 && input->len % param.segLength != 0)
-          {
-            //Prepare fragment sequence object
-            QueryMetaData <MinVec_Type> Q;
-            Q.seq = &(input->seq)[0u] + input->len - param.segLength;
-            Q.len = param.segLength;
-            Q.seqCounter = input->seqCounter;
-            Q.seqName = input->seqName;
-            Q.refGroup = refGroup;
-
-            intervalPoints.clear();
-            l1Mappings.clear();
-            l2Mappings.clear();
-
-            //Map this fragment
-            mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings);
-
-            //Adjust query coordinates and length in the reported mapping
-            std::for_each(l2Mappings.begin(), l2Mappings.end(), [&](MappingResult &e){
-                e.queryLen = input->len;
-                e.queryStartPos = input->len - param.segLength;
-                e.queryEndPos = input->len;
-                });
-
             unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end());
+        
+            // Apply non-merged filtering
+            filterNonMergedMappings(unfilteredMappings, param);
 
-            input->progress.increment(input->len % param.segLength);
-          }
+            split_mapping = false;
+            input->progress.increment(input->len);
         }
-
-        // how many mappings to keep
-        int n_mappings = (input->len < param.segLength ?
-                          param.numMappingsForShortSequence
-                          : param.numMappingsForSegment) - 1;
-
-        if (split_mapping) 
+        else // Split read mapping
         {
-          if (param.mergeMappings) 
-          {
-            // hardcore merge using the chain gap
-            mergeMappingsInRange(unfilteredMappings, param.chain_gap);
+            int noOverlapFragmentCount = input->len / param.segLength;
 
-            // remove short chains that didn't exceed block length
-            filterWeakMappings(unfilteredMappings, std::floor(param.block_length / param.segLength));
+            // Map individual non-overlapping fragments in the read
+            for (int i = 0; i < noOverlapFragmentCount; i++)
+            {
+                QueryMetaData <MinVec_Type> Q;
+                Q.seq = &(input->seq)[0u] + i * param.segLength;
+                Q.len = param.segLength;
+                Q.fullLen = input->len;
+                Q.seqCounter = input->seqCounter;
+                Q.seqName = input->seqName;
+                Q.refGroup = refGroup;
+
+                intervalPoints.clear();
+                l1Mappings.clear();
+                l2Mappings.clear();
+
+                mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings);
+
+                std::for_each(l2Mappings.begin(), l2Mappings.end(), [&](MappingResult &e){
+                    e.queryLen = input->len;
+                    e.queryStartPos = i * param.segLength;
+                    e.queryEndPos = i * param.segLength + Q.len;
+                });
 
-            // now that filtering has happened, set back the individual mapping coordinates and block length
-            setBlockCoordsToMappingCoords(unfilteredMappings);
-          } else {
-              // set block coordinates
-              setBlockCoordsToMappingCoords(unfilteredMappings);
-          }
-        } else {
-          // set block coordinates
-          setBlockCoordsToMappingCoords(unfilteredMappings);
-        }
+                unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end());
+                input->progress.increment(param.segLength);
+            }
 
-        if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
-          MappingResultsVector_t tmpMappings;
-          tmpMappings.reserve(output->readMappings.size());
-          filterByGroup(unfilteredMappings, tmpMappings, n_mappings, false);
-          unfilteredMappings = std::move(tmpMappings);
-        }
+            // Map last overlapping fragment to cover the whole read
+            if (noOverlapFragmentCount >= 1 && input->len % param.segLength != 0)
+            {
+                QueryMetaData <MinVec_Type> Q;
+                Q.seq = &(input->seq)[0u] + input->len - param.segLength;
+                Q.len = param.segLength;
+                Q.seqCounter = input->seqCounter;
+                Q.seqName = input->seqName;
+                Q.refGroup = refGroup;
+
+                intervalPoints.clear();
+                l1Mappings.clear();
+                l2Mappings.clear();
+
+                mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings);
+
+                std::for_each(l2Mappings.begin(), l2Mappings.end(), [&](MappingResult &e){
+                    e.queryLen = input->len;
+                    e.queryStartPos = input->len - param.segLength;
+                    e.queryEndPos = input->len;
+                });
 
-        output->readMappings = std::move(unfilteredMappings);
+                unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end());
+                input->progress.increment(input->len % param.segLength);
+            }
 
-        //Make sure mapping boundary don't exceed sequence lengths
-        this->mappingBoundarySanityCheck(input, output->readMappings);
+            if (param.mergeMappings) 
+            {
+                mergeMappingsInRange(unfilteredMappings, param.chain_gap);
+                unfilteredMappings = filterMaximallyMerged(unfilteredMappings, param);
+            } 
+            else 
+            {
+                filterNonMergedMappings(unfilteredMappings, param);
+            }
+        }
 
-        // remove alignments where the ratio between query and target length is < our identity threshold
+        // Common post-processing for both merged and non-merged mappings
+        mappingBoundarySanityCheck(input, unfilteredMappings);
+    
         if (param.filterLengthMismatches)
         {
-          this->filterFalseHighIdentity(output->readMappings);
+            filterFalseHighIdentity(unfilteredMappings);
         }
+    
+        sparsifyMappings(unfilteredMappings);
 
-        // sparsify the mappings, if requested
-        this->sparsifyMappings(output->readMappings);
+        output->readMappings = std::move(unfilteredMappings);
 
         return output;
       }
@@ -1591,7 +1559,30 @@ namespace skch
           double axis_factor = 1.0 - (2.0 * std::min(std::abs(dx), std::abs(dy))) / (std::abs(dx) + std::abs(dy));
           return euclidean * (1.0 + w * axis_factor);
       }
-      
+
+      /**
+       * @brief                       Filter maximally merged mappings
+       * @param[in]   readMappings    Merged mappings computed by Mashmap
+       * @param[in]   param           Algorithm parameters
+       * @return                      Filtered mappings
+       */
+      MappingResultsVector_t filterMaximallyMerged(const MappingResultsVector_t& readMappings, const Parameters& param)
+      {
+          MappingResultsVector_t filteredMappings = readMappings;
+
+          // Filter weak mappings
+          filterWeakMappings(filteredMappings, std::floor(param.block_length / param.segLength));
+
+          // Apply group filtering if necessary
+          if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
+              MappingResultsVector_t groupFilteredMappings;
+              filterByGroup(filteredMappings, groupFilteredMappings, param.numMappingsForSegment - 1, false);
+              filteredMappings = std::move(groupFilteredMappings);
+          }
+
+          return filteredMappings;
+      }
+
       /**
        * @brief                       Merge fragment mappings by convolution of a 2D range over the alignment matrix
        * @param[in/out] readMappings  Mappings computed by Mashmap (L2 stage) for a read
@@ -1700,27 +1691,6 @@ namespace skch
               it = it_end;
           }
 
-          // Apply filtering to maximallyMergedMappings
-          filterWeakMappings(maximallyMergedMappings, std::floor(param.block_length / param.segLength));
-          if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
-              MappingResultsVector_t filteredMappings;
-              filterByGroup(maximallyMergedMappings, filteredMappings, param.numMappingsForSegment - 1, false);
-              maximallyMergedMappings = std::move(filteredMappings);
-          }
-
-          // Create a set of splitMappingIds that survived filtering
-          std::unordered_set<offset_t> keptChains;
-          for (const auto& mapping : maximallyMergedMappings) {
-              keptChains.insert(mapping.splitMappingId);
-          }
-
-          // Final filtering of original mappings
-          readMappings.erase(
-              std::remove_if(readMappings.begin(), readMappings.end(),
-                             [&keptChains](const MappingResult& e) { return keptChains.count(e.splitMappingId) == 0; }),
-              readMappings.end()
-          );
-
           for(auto it = readMappings.begin(); it != readMappings.end();) {
               //Bucket by each chain
               auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} );

From 29ceaa2d1bad39fb7c1194765c9c3cd95e9da1aa Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 15:06:20 -0500
Subject: [PATCH 14/29] feat: add filterNonMergedMappings function

---
 src/map/include/computeMap.hpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 13c38610..fbf285dd 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -763,6 +763,22 @@ namespace skch
           delete output;
         }
 
+      /**
+       * @brief                       Filter non-merged mappings
+       * @param[in/out] readMappings  Mappings computed by Mashmap
+       * @param[in]     param         Algorithm parameters
+       */
+      void filterNonMergedMappings(MappingResultsVector_t &readMappings, const Parameters& param)
+      {
+          setBlockCoordsToMappingCoords(readMappings);
+    
+          if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
+              MappingResultsVector_t filteredMappings;
+              filterByGroup(readMappings, filteredMappings, param.numMappingsForSegment - 1, false);
+              readMappings = std::move(filteredMappings);
+          }
+      }
+
       /**
        * @brief                   map the parsed query sequence (L1 and L2 mapping)
        * @param[in]   Q           metadata about query sequence

From 90fe5b628e48fcc89a725b65c28942014642029a Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 15:22:41 -0500
Subject: [PATCH 15/29] one codepath for mapping chain processing

---
 src/map/include/computeMap.hpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index fbf285dd..27c526c3 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1711,14 +1711,9 @@ namespace skch
               //Bucket by each chain
               auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} );
 
-              if (param.max_mapping_length == std::numeric_limits<int64_t>::max()) {
-                  // Process the entire chain as one fragment
-                  processMappingFragment(it, it_end);
-              } else {
-                  // Process the chain with potential splits
-                  processChainWithSplits(it, it_end);
-              }
-              
+              // Process the chain into chunks defined by max_mapping_length
+              processChainWithSplits(it, it_end);
+
               // Move the iterator to the end of the processed chain
               it = it_end;
           }

From c7c171e9d196af92d9c161bf9e1cc2bd19cfb1d6 Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 15:48:50 -0500
Subject: [PATCH 16/29] remove unused block coordinate info

---
 src/map/include/base_types.hpp |  4 ----
 src/map/include/computeMap.hpp | 21 ++-------------------
 src/map/include/filter.hpp     | 16 ++++++++--------
 3 files changed, 10 insertions(+), 31 deletions(-)

diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp
index f0b65d87..605b7560 100644
--- a/src/map/include/base_types.hpp
+++ b/src/map/include/base_types.hpp
@@ -157,10 +157,6 @@ namespace skch
     seqno_t refSeqId;                                   //internal sequence id of the reference contig
     seqno_t querySeqId;                                 //internal sequence id of the query sequence
     offset_t blockLength;                                    //the block length of the mapping
-    offset_t blockRefStartPos;
-    offset_t blockRefEndPos;
-    offset_t blockQueryStartPos;
-    offset_t blockQueryEndPos;
     float blockNucIdentity;
           
     float nucIdentity;                                  //calculated identity
diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 27c526c3..669422f2 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -462,17 +462,6 @@ namespace skch
               readMappings.end());
       }
 
-      void setBlockCoordsToMappingCoords(MappingResultsVector_t &readMappings) {
-          for (auto& m : readMappings) {
-              m.blockRefStartPos = m.refStartPos;
-              m.blockRefEndPos = m.refEndPos;
-              m.blockQueryStartPos = m.queryStartPos;
-              m.blockQueryEndPos = m.queryEndPos;
-              m.blockLength = std::max(m.blockRefEndPos - m.blockRefStartPos, m.blockQueryEndPos - m.blockQueryStartPos);
-              m.blockNucIdentity = m.nucIdentity;
-          }
-      }
-
       /**
        * @brief               helper to main mapping function
        * @details             filters mappings whose identity and query/ref length don't agree
@@ -485,8 +474,8 @@ namespace skch
               std::remove_if(readMappings.begin(),
                              readMappings.end(),
                              [&](MappingResult &e){
-                                 int64_t q_l = (int64_t)e.blockQueryEndPos - (int64_t)e.blockQueryStartPos;
-                                 int64_t r_l = (int64_t)e.blockRefEndPos - (int64_t)e.blockRefStartPos;
+                                 int64_t q_l = (int64_t)e.queryEndPos - (int64_t)e.queryStartPos;
+                                 int64_t r_l = (int64_t)e.refEndPos - (int64_t)e.refStartPos;
                                  uint64_t delta = std::abs(r_l - q_l);
                                  double len_id_bound = (1.0 - (double)delta/(((double)q_l+r_l)/2));
                                  return len_id_bound < std::min(0.7, std::pow(param.percentageIdentity,3));
@@ -770,8 +759,6 @@ namespace skch
        */
       void filterNonMergedMappings(MappingResultsVector_t &readMappings, const Parameters& param)
       {
-          setBlockCoordsToMappingCoords(readMappings);
-    
           if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
               MappingResultsVector_t filteredMappings;
               filterByGroup(readMappings, filteredMappings, param.numMappingsForSegment - 1, false);
@@ -1800,10 +1787,6 @@ namespace skch
           for (auto it = begin; it != end; ++it) {
               it->n_merged = n_in_full_chain;
               it->blockLength = block_length;
-              it->blockQueryStartPos = chain_start_query;
-              it->blockQueryEndPos = chain_end_query;
-              it->blockRefStartPos = chain_start_ref;
-              it->blockRefEndPos = chain_end_ref;
               it->blockNucIdentity = chain_nuc_identity;
           }
       }
diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp
index fa18b132..eeaa7470 100644
--- a/src/map/include/filter.hpp
+++ b/src/map/include/filter.hpp
@@ -70,11 +70,11 @@ namespace skch
 
         // compute the overlap of the two mappings
         double get_overlap(const int x, const int y) const {
-            offset_t overlap_start = std::max(vec[x].blockQueryStartPos, vec[y].blockQueryStartPos);
-            offset_t overlap_end = std::min(vec[x].blockQueryEndPos, vec[y].blockQueryEndPos);
+            offset_t overlap_start = std::max(vec[x].queryStartPos, vec[y].queryStartPos);
+            offset_t overlap_end = std::min(vec[x].queryEndPos, vec[y].queryEndPos);
             offset_t overlap_length = std::max(0, static_cast<int>(overlap_end - overlap_start));
-            offset_t x_length = vec[x].blockQueryEndPos - vec[x].blockQueryStartPos;
-            offset_t y_length = vec[y].blockQueryEndPos - vec[y].blockQueryStartPos;
+            offset_t x_length = vec[x].queryEndPos - vec[x].queryStartPos;
+            offset_t y_length = vec[y].queryEndPos - vec[y].queryStartPos;
             return static_cast<double>(overlap_length) / std::min(x_length, y_length);
         }
 
@@ -343,11 +343,11 @@ namespace skch
 
         // compute the overlap of the two mappings
         double get_overlap(const int x, const int y) const {
-            offset_t overlap_start = std::max(vec[x].blockRefStartPos, vec[y].blockRefStartPos);
-            offset_t overlap_end = std::min(vec[x].blockRefEndPos, vec[y].blockRefEndPos);
+            offset_t overlap_start = std::max(vec[x].refStartPos, vec[y].refStartPos);
+            offset_t overlap_end = std::min(vec[x].refEndPos, vec[y].refEndPos);
             offset_t overlap_length = std::max(0, static_cast<int>(overlap_end - overlap_start));
-            offset_t x_length = vec[x].blockRefEndPos - vec[x].blockRefStartPos;
-            offset_t y_length = vec[y].blockRefEndPos - vec[y].blockRefStartPos;
+            offset_t x_length = vec[x].refEndPos - vec[x].refStartPos;
+            offset_t y_length = vec[y].refEndPos - vec[y].refStartPos;
             return static_cast<double>(overlap_length) / std::min(x_length, y_length);
         }
 

From 972222d8b265c959fcb5074f93ce6fb178b08714 Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 15:52:31 -0500
Subject: [PATCH 17/29] simplify maximally merged filter

---
 src/map/include/computeMap.hpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 669422f2..9ec402b7 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -696,7 +696,7 @@ namespace skch
             if (param.mergeMappings) 
             {
                 mergeMappingsInRange(unfilteredMappings, param.chain_gap);
-                unfilteredMappings = filterMaximallyMerged(unfilteredMappings, param);
+                filterMaximallyMerged(unfilteredMappings, param);
             } 
             else 
             {
@@ -1569,21 +1569,18 @@ namespace skch
        * @param[in]   param           Algorithm parameters
        * @return                      Filtered mappings
        */
-      MappingResultsVector_t filterMaximallyMerged(const MappingResultsVector_t& readMappings, const Parameters& param)
+      void filterMaximallyMerged(MappingResultsVector_t& readMappings, const Parameters& param)
       {
-          MappingResultsVector_t filteredMappings = readMappings;
-
           // Filter weak mappings
-          filterWeakMappings(filteredMappings, std::floor(param.block_length / param.segLength));
+          filterWeakMappings(readMappings, std::floor(param.block_length / param.segLength));
 
           // Apply group filtering if necessary
           if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
               MappingResultsVector_t groupFilteredMappings;
-              filterByGroup(filteredMappings, groupFilteredMappings, param.numMappingsForSegment - 1, false);
-              filteredMappings = std::move(groupFilteredMappings);
+              filterByGroup(readMappings, groupFilteredMappings, param.numMappingsForSegment - 1, false);
+              readMappings = std::move(groupFilteredMappings);
           }
 
-          return filteredMappings;
       }
 
       /**

From 99b66818dfd50ea3cc6d47d6a9c85fded543ebd4 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 16:03:27 -0500
Subject: [PATCH 18/29] fix: Modify FASTA file mapping to allow self-mapping

---
 src/interface/parse_args.hpp   |  3 +--
 src/map/include/computeMap.hpp | 10 ++++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp
index 5d25b07f..52b896d2 100644
--- a/src/interface/parse_args.hpp
+++ b/src/interface/parse_args.hpp
@@ -237,8 +237,7 @@ void parse_args(int argc,
 
     // If there are no queries, go in all-vs-all mode with the sequences specified in `target_sequence_file`
     if (target_sequence_file && map_parameters.querySequences.empty()) {
-        map_parameters.skip_self = true;
-        std::cerr << "[mashmap] Skipping self mappings for single file all-vs-all mapping." << std::endl;
+        std::cerr << "[mashmap] Performing all-vs-all mapping including self mappings." << std::endl;
         map_parameters.querySequences.push_back(map_parameters.refSequences.back());
         align_parameters.querySequences.push_back(align_parameters.refSequences.back());
     }
diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 9ec402b7..c2acc7fb 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -908,10 +908,12 @@ namespace skch
           {
             const IP_const_iterator ip_it = pq.front().it;
             const auto& ref = this->refSketch.metadata[ip_it->seqId];
-            if ((!param.skip_self || Q.seqName != ref.name)
-                && (!param.skip_prefix || this->refIdGroup[ip_it->seqId] != Q.refGroup)
-                && (!param.lower_triangular || Q.seqCounter > ip_it->seqId)
-            ) {
+            bool skip_mapping = false;
+            if (param.skip_self && Q.seqName == ref.name) skip_mapping = true;
+            if (param.skip_prefix && this->refIdGroup[ip_it->seqId] == Q.refGroup) skip_mapping = true;
+            if (param.lower_triangular && Q.seqCounter <= ip_it->seqId) skip_mapping = true;
+    
+            if (!skip_mapping) {
               intervalPoints.push_back(*ip_it);
             }
             std::pop_heap(pq.begin(), pq.end(), heap_cmp);

From 022249670c1af76cc5747d5b8529e33242695783 Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 16:54:55 -0500
Subject: [PATCH 19/29] feat: Implement improved mapping merging and filtering

---
 src/map/include/computeMap.hpp | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index c2acc7fb..1a3cf6d5 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -695,9 +695,26 @@ namespace skch
 
             if (param.mergeMappings) 
             {
-                mergeMappingsInRange(unfilteredMappings, param.chain_gap);
-                filterMaximallyMerged(unfilteredMappings, param);
-            } 
+                // the maximally merged mappings are top-level chains
+                // while the unfiltered mappings now contain splits at max_mapping_length
+                auto maximallyMergedMappings =
+                    mergeMappingsInRange(unfilteredMappings, param.chain_gap);
+                // we filter on the top level chains
+                filterMaximallyMerged(maximallyMergedMappings, param);
+                // collect splitMappingIds in the maximally merged mappings
+                robin_hood::unordered_set<offset_t> kept_chains;
+                for (auto &mapping : maximallyMergedMappings) {
+                    kept_chains.insert(mapping.splitMappingId);
+                }
+                // and use them to filter mappings to discard
+                unfilteredMappings.erase(
+                    std::remove_if(unfilteredMappings.begin(), unfilteredMappings.end(),
+                                   [&kept_chains](const MappingResult &mapping) {
+                                       return !kept_chains.count(mapping.splitMappingId);
+                                   }),
+                    unfilteredMappings.end()
+                    );
+            }
             else 
             {
                 filterNonMergedMappings(unfilteredMappings, param);
@@ -1591,11 +1608,11 @@ namespace skch
        * @param[in]     max_dist      Distance to look in target and query
        */
       template <typename VecIn>
-      void mergeMappingsInRange(VecIn &readMappings,
+      VecIn mergeMappingsInRange(VecIn &readMappings,
                                 int max_dist) {
           assert(param.split == true);
 
-          if(readMappings.size() < 2) return;
+          if(readMappings.size() < 2) return readMappings;
 
           //Sort the mappings by query position, then reference sequence id, then reference position
           std::sort(
@@ -1710,6 +1727,8 @@ namespace skch
                              [](const MappingResult& e) { return e.discard == 1; }),
               readMappings.end()
           );
+
+          return maximallyMergedMappings;
       }
 
       /**

From c4408b7ed13abeb674e40cf30454577577abc2f5 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 21:56:50 -0500
Subject: [PATCH 20/29] fix: Update distance calculation in range merging to
 account for strand direction

---
 src/map/include/computeMap.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 1a3cf6d5..3501d1df 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1658,7 +1658,12 @@ namespace skch
                   }
                   //If the next mapping is within range, we can potentially merge
                   if (it2->strand == it->strand) {
-                      int64_t ref_dist = it2->refStartPos - it->refEndPos;
+                      int64_t ref_dist;
+                      if (it->strand == strnd::FWD) {
+                          ref_dist = it2->refStartPos - it->refEndPos;
+                      } else {
+                          ref_dist = it->refStartPos - it2->refEndPos;
+                      }
                       // If we jitter backwards in reference position too far, don't merge
                       if (ref_dist < -param.segLength/5) {
                           continue;

From 5fb526edca866744d4d990b05a6b2ae622172fd5 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 22:26:43 -0500
Subject: [PATCH 21/29] refactor: Improve distance calculation logic in
 computeMap.hpp

---
 src/map/include/computeMap.hpp | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 3501d1df..017b889d 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1658,24 +1658,25 @@ namespace skch
                   }
                   //If the next mapping is within range, we can potentially merge
                   if (it2->strand == it->strand) {
+                      // Always calculate query distance the same way, as query always moves forward
+                      int64_t query_dist = it2->queryStartPos - it->queryEndPos;
+
+                      // Reference distance calculation depends on strand
                       int64_t ref_dist;
                       if (it->strand == strnd::FWD) {
                           ref_dist = it2->refStartPos - it->refEndPos;
                       } else {
+                          // For reverse complement, we need to invert the order
                           ref_dist = it->refStartPos - it2->refEndPos;
                       }
-                      // If we jitter backwards in reference position too far, don't merge
-                      if (ref_dist < -param.segLength/5) {
-                          continue;
-                      }
-                      int64_t query_dist = it2->queryStartPos - it->queryEndPos;
-                      if (query_dist < 0) {
-                          continue;
-                      }
-                      auto dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
-                      if (dist < max_dist && best_score > dist && it2->chainPairScore > dist) {
-                          best_it2 = it2;
-                          best_score = dist;
+
+                      // Check if the distance is within acceptable range
+                      if (query_dist >= 0 && ref_dist >= -param.segLength/5 && ref_dist <= max_dist) {
+                          double dist = std::sqrt(std::pow(query_dist, 2) + std::pow(ref_dist, 2));
+                          if (dist < max_dist && best_score > dist && it2->chainPairScore > dist) {
+                              best_it2 = it2;
+                              best_score = dist;
+                          }
                       }
                   }
               }

From c3e2b02780a803ab74185c359065aff0f4a5ff4e Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Sun, 15 Sep 2024 22:30:55 -0500
Subject: [PATCH 22/29] refactor: Ensure all fields in merged mappings are
 properly initialized

---
 src/map/include/computeMap.hpp | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 017b889d..fdb152be 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1704,7 +1704,7 @@ namespace skch
           MappingResultsVector_t maximallyMergedMappings;
           for(auto it = readMappings.begin(); it != readMappings.end();) {
               auto it_end = std::find_if(it, readMappings.end(), [&](const MappingResult &e){return e.splitMappingId != it->splitMappingId;} );
-              MappingResult mergedMapping = *it;
+              MappingResult mergedMapping = *it;  // Copy all fields from the first mapping in the chain
               mergedMapping.queryStartPos = it->queryStartPos;
               mergedMapping.queryEndPos = std::prev(it_end)->queryEndPos;
               mergedMapping.refStartPos = it->refStartPos;
@@ -1712,6 +1712,30 @@ namespace skch
               mergedMapping.blockLength = std::max(mergedMapping.refEndPos - mergedMapping.refStartPos,
                                                    mergedMapping.queryEndPos - mergedMapping.queryStartPos);
               mergedMapping.n_merged = std::distance(it, it_end);
+              
+              // Recalculate average values for the merged mapping
+              double totalNucIdentity = 0.0;
+              double totalKmerComplexity = 0.0;
+              int totalConservedSketches = 0;
+              int totalSketchSize = 0;
+              for (auto subIt = it; subIt != it_end; ++subIt) {
+                  totalNucIdentity += subIt->nucIdentity;
+                  totalKmerComplexity += subIt->kmerComplexity;
+                  totalConservedSketches += subIt->conservedSketches;
+                  totalSketchSize += subIt->sketchSize;
+              }
+              mergedMapping.nucIdentity = totalNucIdentity / mergedMapping.n_merged;
+              mergedMapping.kmerComplexity = totalKmerComplexity / mergedMapping.n_merged;
+              mergedMapping.conservedSketches = totalConservedSketches;
+              mergedMapping.sketchSize = totalSketchSize;
+              
+              // Ensure other fields are properly set
+              mergedMapping.approxMatches = std::round(mergedMapping.nucIdentity * mergedMapping.blockLength / 100.0);
+              mergedMapping.discard = 0;
+              mergedMapping.overlapped = false;
+              mergedMapping.chainPairScore = std::numeric_limits<double>::max();
+              mergedMapping.chainPairId = std::numeric_limits<int64_t>::min();
+
               maximallyMergedMappings.push_back(mergedMapping);
               it = it_end;
           }

From c89f1b069a857eac7c44cbeac0cc3b887fb83e37 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Mon, 16 Sep 2024 20:42:35 -0500
Subject: [PATCH 23/29] feat: Instrument the Lee plane sweep filter algorithm

---
 src/map/include/filter.hpp | 58 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp
index eeaa7470..543b433c 100644
--- a/src/map/include/filter.hpp
+++ b/src/map/include/filter.hpp
@@ -12,6 +12,8 @@
 #include <set>
 #include <fstream>
 #include <zlib.h>
+#include <iostream>
+#include <sstream>
 
 //Own includes
 #include "map/include/base_types.hpp"
@@ -86,6 +88,9 @@ namespace skch
         template <typename Type>
         inline void markGood(Type &L, int secondaryToKeep, bool dropRand, double overlapThreshold)
           {
+            std::cerr << "DEBUG: Entering markGood" << std::endl;
+            std::cerr << "DEBUG: L size: " << L.size() << ", secondaryToKeep: " << secondaryToKeep << ", dropRand: " << dropRand << ", overlapThreshold: " << overlapThreshold << std::endl;
+
             //first segment in the set order
             auto beg = L.begin();
 
@@ -95,12 +100,15 @@ namespace skch
             auto it = L.begin();
             for( ; it != L.end(); it++)
             {
+                std::cerr << "DEBUG: Checking mapping " << *it << ", score: " << get_score(*it) << std::endl;
                 if ((this->greater_score(*beg, *it) || vec[*it].discard == 0) && kept > secondaryToKeep) {
+                    std::cerr << "DEBUG: Breaking loop, kept: " << kept << std::endl;
                     break;
                 }
 
                 vec[*it].discard = 0;
                 ++kept;
+                std::cerr << "DEBUG: Marked mapping " << *it << " as good, kept: " << kept << std::endl;
             }
             auto kit = it;
 
@@ -109,9 +117,12 @@ namespace skch
                 if (it == L.begin()) continue;
                 int idx = *it;
                 for (auto it2 = L.begin(); it2 != kit; it2++) {
-                    if (get_overlap(idx, *it2) > overlapThreshold) {
+                    double overlap = get_overlap(idx, *it2);
+                    std::cerr << "DEBUG: Checking overlap between " << idx << " and " << *it2 << ": " << overlap << std::endl;
+                    if (overlap > overlapThreshold) {
                         vec[idx].overlapped = 1;  // Mark as bad if it overlaps >50% with the best mapping
                         vec[idx].discard = 1;
+                        std::cerr << "DEBUG: Marked mapping " << idx << " as overlapped and discarded" << std::endl;
                         break;
                     }
                 }
@@ -122,6 +133,7 @@ namespace skch
             // we will hash the mapping struct and keep the one with the secondaryToKeep with the lowest hash value
             if (kept > secondaryToKeep && dropRand) 
             {
+              std::cerr << "DEBUG: Entering dropRand section, kept: " << kept << std::endl;
               // we will use hashes of the mapping structs to break ties
               // first we'll make a vector of the mappings including the hashes
               std::vector<std::tuple<double, size_t, MappingResult*>> score_and_hash; // The tuple is (score, hash, pointer to the mapping)
@@ -130,24 +142,30 @@ namespace skch
                   if(vec[*it].discard == 0)
                   {
                       score_and_hash.emplace_back(get_score(*it), vec[*it].hash(), &vec[*it]);
+                      std::cerr << "DEBUG: Added mapping " << *it << " to score_and_hash, score: " << get_score(*it) << ", hash: " << vec[*it].hash() << std::endl;
                   }
               }
               // now we'll sort the vector by score and hash
               std::sort(score_and_hash.begin(), score_and_hash.end(), std::greater{});
+              std::cerr << "DEBUG: Sorted score_and_hash, size: " << score_and_hash.size() << std::endl;
               // reset kept counter
               kept = 0;
               for (auto& x : score_and_hash) {
                   std::get<2>(x)->discard = 1;
+                  std::cerr << "DEBUG: Initially marked mapping as discarded, score: " << std::get<0>(x) << ", hash: " << std::get<1>(x) << std::endl;
               }
               // now we mark the best to keep
               for (auto& x : score_and_hash) {
                   if (kept > secondaryToKeep) {
+                      std::cerr << "DEBUG: Reached secondaryToKeep limit, breaking" << std::endl;
                       break;
                   }
                   std::get<2>(x)->discard = 0;
                   ++kept;
+                  std::cerr << "DEBUG: Marked mapping as kept, score: " << std::get<0>(x) << ", hash: " << std::get<1>(x) << ", kept: " << kept << std::endl;
               }
             }
+            std::cerr << "DEBUG: Exiting markGood, final kept: " << kept << std::endl;
           }
       };
 
@@ -159,12 +177,22 @@ namespace skch
       template <typename VecIn>
       void liFilterAlgorithm(VecIn &readMappings, int secondaryToKeep, bool dropRand, double overlapThreshold)
         {
+          std::cerr << "DEBUG: Entering liFilterAlgorithm" << std::endl;
+          std::cerr << "DEBUG: Initial readMappings size: " << readMappings.size() << std::endl;
+
           if(readMappings.size() <= 1)
+          {
+            std::cerr << "DEBUG: readMappings size <= 1, returning" << std::endl;
             return;
+          }
 
           //Initially mark all mappings as bad
           //Maintain the order of this vector till end of this function
-          std::for_each(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ e.discard = 1; e.overlapped = 0; });
+          std::for_each(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ 
+            e.discard = 1; 
+            e.overlapped = 0; 
+            std::cerr << "DEBUG: Marking mapping as bad: " << e.queryStartPos << "-" << e.queryEndPos << std::endl;
+          });
 
           //Initialize object of Helper struct
           Helper obj (readMappings);
@@ -182,13 +210,19 @@ namespace skch
           {
             eventSchedule.emplace_back (readMappings[i].queryStartPos, event::BEGIN, i);
             eventSchedule.emplace_back (readMappings[i].queryEndPos, event::END, i);
+            std::cerr << "DEBUG: Added events for mapping " << i << ": " 
+                      << readMappings[i].queryStartPos << " (BEGIN), " 
+                      << readMappings[i].queryEndPos << " (END)" << std::endl;
           }
 
           std::sort(eventSchedule.begin(), eventSchedule.end());
+          std::cerr << "DEBUG: Sorted event schedule" << std::endl;
 
           //Execute the plane sweep algorithm
           for(auto it = eventSchedule.begin(); it!= eventSchedule.end();)
           {
+            std::cerr << "DEBUG: Processing event at position " << std::get<0>(*it) << std::endl;
+
             //Find events that correspond to current position
             auto it2 = std::find_if(it, eventSchedule.end(), [&](const eventRecord_t &e)
                                     {
@@ -199,21 +233,39 @@ namespace skch
             std::for_each(it, it2, [&](const eventRecord_t &e)
                                     {
                                       if (std::get<1>(e) == event::BEGIN)
+                                      {
                                         bst.insert (std::get<2>(e));
+                                        std::cerr << "DEBUG: Inserted segment " << std::get<2>(e) << " into BST" << std::endl;
+                                      }
                                       else
+                                      {
                                         bst.erase (std::get<2>(e));
+                                        std::cerr << "DEBUG: Erased segment " << std::get<2>(e) << " from BST" << std::endl;
+                                      }
                                     });
 
+            std::cerr << "DEBUG: Calling markGood" << std::endl;
             //mark mappings as good
             obj.markGood(bst, secondaryToKeep, dropRand, overlapThreshold);
 
             it = it2;
           }
 
+          std::cerr << "DEBUG: Removing bad mappings" << std::endl;
           //Remove bad mappings
+          auto initialSize = readMappings.size();
           readMappings.erase(
-              std::remove_if(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ return e.discard == 1 || e.overlapped == 1; }),
+              std::remove_if(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ 
+                if (e.discard == 1 || e.overlapped == 1) {
+                  std::cerr << "DEBUG: Removing mapping: " << e.queryStartPos << "-" << e.queryEndPos << std::endl;
+                  return true;
+                }
+                return false;
+              }),
               readMappings.end());
+
+          std::cerr << "DEBUG: Removed " << (initialSize - readMappings.size()) << " mappings" << std::endl;
+          std::cerr << "DEBUG: Final readMappings size: " << readMappings.size() << std::endl;
         }
 
       /**

From c67c73bebf02a98990188427aceccdfb65743967 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Mon, 16 Sep 2024 20:58:24 -0500
Subject: [PATCH 24/29] feat: Add detailed mapping information to debug
 statements

---
 src/map/include/filter.hpp | 48 +++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp
index 543b433c..875a6b91 100644
--- a/src/map/include/filter.hpp
+++ b/src/map/include/filter.hpp
@@ -191,7 +191,13 @@ namespace skch
           std::for_each(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ 
             e.discard = 1; 
             e.overlapped = 0; 
-            std::cerr << "DEBUG: Marking mapping as bad: " << e.queryStartPos << "-" << e.queryEndPos << std::endl;
+            std::cerr << "DEBUG: Marking mapping as bad: QueryID=" << e.querySeqId 
+                      << ", QueryStart=" << e.queryStartPos 
+                      << ", QueryEnd=" << e.queryEndPos 
+                      << ", RefID=" << e.refSeqId 
+                      << ", RefStart=" << e.refStartPos 
+                      << ", RefEnd=" << e.refEndPos 
+                      << ", Strand=" << (e.strand == strnd::FWD ? "+" : "-") << std::endl;
           });
 
           //Initialize object of Helper struct
@@ -211,8 +217,13 @@ namespace skch
             eventSchedule.emplace_back (readMappings[i].queryStartPos, event::BEGIN, i);
             eventSchedule.emplace_back (readMappings[i].queryEndPos, event::END, i);
             std::cerr << "DEBUG: Added events for mapping " << i << ": " 
-                      << readMappings[i].queryStartPos << " (BEGIN), " 
-                      << readMappings[i].queryEndPos << " (END)" << std::endl;
+                      << "QueryID=" << readMappings[i].querySeqId 
+                      << ", QueryStart=" << readMappings[i].queryStartPos 
+                      << " (BEGIN), QueryEnd=" << readMappings[i].queryEndPos 
+                      << " (END), RefID=" << readMappings[i].refSeqId 
+                      << ", RefStart=" << readMappings[i].refStartPos 
+                      << ", RefEnd=" << readMappings[i].refEndPos 
+                      << ", Strand=" << (readMappings[i].strand == strnd::FWD ? "+" : "-") << std::endl;
           }
 
           std::sort(eventSchedule.begin(), eventSchedule.end());
@@ -232,15 +243,30 @@ namespace skch
             //update sweep line status by adding/removing segments
             std::for_each(it, it2, [&](const eventRecord_t &e)
                                     {
+                                      int idx = std::get<2>(e);
                                       if (std::get<1>(e) == event::BEGIN)
                                       {
-                                        bst.insert (std::get<2>(e));
-                                        std::cerr << "DEBUG: Inserted segment " << std::get<2>(e) << " into BST" << std::endl;
+                                        bst.insert (idx);
+                                        std::cerr << "DEBUG: Inserted segment " << idx << " into BST: "
+                                                  << "QueryID=" << readMappings[idx].querySeqId 
+                                                  << ", QueryStart=" << readMappings[idx].queryStartPos 
+                                                  << ", QueryEnd=" << readMappings[idx].queryEndPos 
+                                                  << ", RefID=" << readMappings[idx].refSeqId 
+                                                  << ", RefStart=" << readMappings[idx].refStartPos 
+                                                  << ", RefEnd=" << readMappings[idx].refEndPos 
+                                                  << ", Strand=" << (readMappings[idx].strand == strnd::FWD ? "+" : "-") << std::endl;
                                       }
                                       else
                                       {
-                                        bst.erase (std::get<2>(e));
-                                        std::cerr << "DEBUG: Erased segment " << std::get<2>(e) << " from BST" << std::endl;
+                                        bst.erase (idx);
+                                        std::cerr << "DEBUG: Erased segment " << idx << " from BST: "
+                                                  << "QueryID=" << readMappings[idx].querySeqId 
+                                                  << ", QueryStart=" << readMappings[idx].queryStartPos 
+                                                  << ", QueryEnd=" << readMappings[idx].queryEndPos 
+                                                  << ", RefID=" << readMappings[idx].refSeqId 
+                                                  << ", RefStart=" << readMappings[idx].refStartPos 
+                                                  << ", RefEnd=" << readMappings[idx].refEndPos 
+                                                  << ", Strand=" << (readMappings[idx].strand == strnd::FWD ? "+" : "-") << std::endl;
                                       }
                                     });
 
@@ -257,7 +283,13 @@ namespace skch
           readMappings.erase(
               std::remove_if(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ 
                 if (e.discard == 1 || e.overlapped == 1) {
-                  std::cerr << "DEBUG: Removing mapping: " << e.queryStartPos << "-" << e.queryEndPos << std::endl;
+                  std::cerr << "DEBUG: Removing mapping: QueryID=" << e.querySeqId 
+                            << ", QueryStart=" << e.queryStartPos 
+                            << ", QueryEnd=" << e.queryEndPos 
+                            << ", RefID=" << e.refSeqId 
+                            << ", RefStart=" << e.refStartPos 
+                            << ", RefEnd=" << e.refEndPos 
+                            << ", Strand=" << (e.strand == strnd::FWD ? "+" : "-") << std::endl;
                   return true;
                 }
                 return false;

From 7c143410928603adc7cc9e16df9e8d6b0643fb36 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Mon, 16 Sep 2024 21:05:38 -0500
Subject: [PATCH 25/29] fix: Recalculate blockNucIdentity and handle edge cases
 in score calculation

---
 src/map/include/computeMap.hpp | 3 +++
 src/map/include/filter.hpp     | 7 ++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index fdb152be..88a5ed1c 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1729,6 +1729,9 @@ namespace skch
               mergedMapping.conservedSketches = totalConservedSketches;
               mergedMapping.sketchSize = totalSketchSize;
               
+              // Calculate blockNucIdentity
+              mergedMapping.blockNucIdentity = mergedMapping.nucIdentity;
+              
               // Ensure other fields are properly set
               mergedMapping.approxMatches = std::round(mergedMapping.nucIdentity * mergedMapping.blockLength / 100.0);
               mergedMapping.discard = 0;
diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp
index 875a6b91..b9c9d193 100644
--- a/src/map/include/filter.hpp
+++ b/src/map/include/filter.hpp
@@ -43,7 +43,12 @@ namespace skch
 
         Helper(MappingResultsVector_t &v) : vec(v) {}
 
-        double get_score(const int x) const {return vec[x].blockNucIdentity * log(vec[x].blockLength); }
+        double get_score(const int x) const {
+            if (vec[x].blockLength <= 0 || vec[x].blockNucIdentity <= 0) {
+                return std::numeric_limits<double>::lowest();
+            }
+            return vec[x].blockNucIdentity * std::log(static_cast<double>(vec[x].blockLength));
+        }
 
         //Greater than comparison by score and begin position
         //used to define order in BST

From 60ba144ae676602a3948c0a0485223f5df53d799 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Mon, 16 Sep 2024 21:07:11 -0500
Subject: [PATCH 26/29] fix: Remove debug statements from liFilterAlgorithm

---
 src/map/include/filter.hpp | 62 +-------------------------------------
 1 file changed, 1 insertion(+), 61 deletions(-)

diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp
index b9c9d193..71042b44 100644
--- a/src/map/include/filter.hpp
+++ b/src/map/include/filter.hpp
@@ -182,27 +182,14 @@ namespace skch
       template <typename VecIn>
       void liFilterAlgorithm(VecIn &readMappings, int secondaryToKeep, bool dropRand, double overlapThreshold)
         {
-          std::cerr << "DEBUG: Entering liFilterAlgorithm" << std::endl;
-          std::cerr << "DEBUG: Initial readMappings size: " << readMappings.size() << std::endl;
-
           if(readMappings.size() <= 1)
-          {
-            std::cerr << "DEBUG: readMappings size <= 1, returning" << std::endl;
             return;
-          }
 
           //Initially mark all mappings as bad
           //Maintain the order of this vector till end of this function
           std::for_each(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ 
             e.discard = 1; 
             e.overlapped = 0; 
-            std::cerr << "DEBUG: Marking mapping as bad: QueryID=" << e.querySeqId 
-                      << ", QueryStart=" << e.queryStartPos 
-                      << ", QueryEnd=" << e.queryEndPos 
-                      << ", RefID=" << e.refSeqId 
-                      << ", RefStart=" << e.refStartPos 
-                      << ", RefEnd=" << e.refEndPos 
-                      << ", Strand=" << (e.strand == strnd::FWD ? "+" : "-") << std::endl;
           });
 
           //Initialize object of Helper struct
@@ -221,24 +208,13 @@ namespace skch
           {
             eventSchedule.emplace_back (readMappings[i].queryStartPos, event::BEGIN, i);
             eventSchedule.emplace_back (readMappings[i].queryEndPos, event::END, i);
-            std::cerr << "DEBUG: Added events for mapping " << i << ": " 
-                      << "QueryID=" << readMappings[i].querySeqId 
-                      << ", QueryStart=" << readMappings[i].queryStartPos 
-                      << " (BEGIN), QueryEnd=" << readMappings[i].queryEndPos 
-                      << " (END), RefID=" << readMappings[i].refSeqId 
-                      << ", RefStart=" << readMappings[i].refStartPos 
-                      << ", RefEnd=" << readMappings[i].refEndPos 
-                      << ", Strand=" << (readMappings[i].strand == strnd::FWD ? "+" : "-") << std::endl;
           }
 
           std::sort(eventSchedule.begin(), eventSchedule.end());
-          std::cerr << "DEBUG: Sorted event schedule" << std::endl;
 
           //Execute the plane sweep algorithm
           for(auto it = eventSchedule.begin(); it!= eventSchedule.end();)
           {
-            std::cerr << "DEBUG: Processing event at position " << std::get<0>(*it) << std::endl;
-
             //Find events that correspond to current position
             auto it2 = std::find_if(it, eventSchedule.end(), [&](const eventRecord_t &e)
                                     {
@@ -250,59 +226,23 @@ namespace skch
                                     {
                                       int idx = std::get<2>(e);
                                       if (std::get<1>(e) == event::BEGIN)
-                                      {
                                         bst.insert (idx);
-                                        std::cerr << "DEBUG: Inserted segment " << idx << " into BST: "
-                                                  << "QueryID=" << readMappings[idx].querySeqId 
-                                                  << ", QueryStart=" << readMappings[idx].queryStartPos 
-                                                  << ", QueryEnd=" << readMappings[idx].queryEndPos 
-                                                  << ", RefID=" << readMappings[idx].refSeqId 
-                                                  << ", RefStart=" << readMappings[idx].refStartPos 
-                                                  << ", RefEnd=" << readMappings[idx].refEndPos 
-                                                  << ", Strand=" << (readMappings[idx].strand == strnd::FWD ? "+" : "-") << std::endl;
-                                      }
                                       else
-                                      {
                                         bst.erase (idx);
-                                        std::cerr << "DEBUG: Erased segment " << idx << " from BST: "
-                                                  << "QueryID=" << readMappings[idx].querySeqId 
-                                                  << ", QueryStart=" << readMappings[idx].queryStartPos 
-                                                  << ", QueryEnd=" << readMappings[idx].queryEndPos 
-                                                  << ", RefID=" << readMappings[idx].refSeqId 
-                                                  << ", RefStart=" << readMappings[idx].refStartPos 
-                                                  << ", RefEnd=" << readMappings[idx].refEndPos 
-                                                  << ", Strand=" << (readMappings[idx].strand == strnd::FWD ? "+" : "-") << std::endl;
-                                      }
                                     });
 
-            std::cerr << "DEBUG: Calling markGood" << std::endl;
             //mark mappings as good
             obj.markGood(bst, secondaryToKeep, dropRand, overlapThreshold);
 
             it = it2;
           }
 
-          std::cerr << "DEBUG: Removing bad mappings" << std::endl;
           //Remove bad mappings
-          auto initialSize = readMappings.size();
           readMappings.erase(
               std::remove_if(readMappings.begin(), readMappings.end(), [&](MappingResult &e){ 
-                if (e.discard == 1 || e.overlapped == 1) {
-                  std::cerr << "DEBUG: Removing mapping: QueryID=" << e.querySeqId 
-                            << ", QueryStart=" << e.queryStartPos 
-                            << ", QueryEnd=" << e.queryEndPos 
-                            << ", RefID=" << e.refSeqId 
-                            << ", RefStart=" << e.refStartPos 
-                            << ", RefEnd=" << e.refEndPos 
-                            << ", Strand=" << (e.strand == strnd::FWD ? "+" : "-") << std::endl;
-                  return true;
-                }
-                return false;
+                return e.discard == 1 || e.overlapped == 1;
               }),
               readMappings.end());
-
-          std::cerr << "DEBUG: Removed " << (initialSize - readMappings.size()) << " mappings" << std::endl;
-          std::cerr << "DEBUG: Final readMappings size: " << readMappings.size() << std::endl;
         }
 
       /**

From 9bdef533383a225b6ac8f9d5031d584da4067b08 Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Mon, 16 Sep 2024 21:11:31 -0500
Subject: [PATCH 27/29] fix: Remove debug statements from markGood function

---
 src/map/include/filter.hpp | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/map/include/filter.hpp b/src/map/include/filter.hpp
index 71042b44..a6fe6ad3 100644
--- a/src/map/include/filter.hpp
+++ b/src/map/include/filter.hpp
@@ -93,9 +93,6 @@ namespace skch
         template <typename Type>
         inline void markGood(Type &L, int secondaryToKeep, bool dropRand, double overlapThreshold)
           {
-            std::cerr << "DEBUG: Entering markGood" << std::endl;
-            std::cerr << "DEBUG: L size: " << L.size() << ", secondaryToKeep: " << secondaryToKeep << ", dropRand: " << dropRand << ", overlapThreshold: " << overlapThreshold << std::endl;
-
             //first segment in the set order
             auto beg = L.begin();
 
@@ -105,15 +102,12 @@ namespace skch
             auto it = L.begin();
             for( ; it != L.end(); it++)
             {
-                std::cerr << "DEBUG: Checking mapping " << *it << ", score: " << get_score(*it) << std::endl;
                 if ((this->greater_score(*beg, *it) || vec[*it].discard == 0) && kept > secondaryToKeep) {
-                    std::cerr << "DEBUG: Breaking loop, kept: " << kept << std::endl;
                     break;
                 }
 
                 vec[*it].discard = 0;
                 ++kept;
-                std::cerr << "DEBUG: Marked mapping " << *it << " as good, kept: " << kept << std::endl;
             }
             auto kit = it;
 
@@ -123,11 +117,9 @@ namespace skch
                 int idx = *it;
                 for (auto it2 = L.begin(); it2 != kit; it2++) {
                     double overlap = get_overlap(idx, *it2);
-                    std::cerr << "DEBUG: Checking overlap between " << idx << " and " << *it2 << ": " << overlap << std::endl;
                     if (overlap > overlapThreshold) {
                         vec[idx].overlapped = 1;  // Mark as bad if it overlaps >50% with the best mapping
                         vec[idx].discard = 1;
-                        std::cerr << "DEBUG: Marked mapping " << idx << " as overlapped and discarded" << std::endl;
                         break;
                     }
                 }
@@ -138,7 +130,6 @@ namespace skch
             // we will hash the mapping struct and keep the one with the secondaryToKeep with the lowest hash value
             if (kept > secondaryToKeep && dropRand) 
             {
-              std::cerr << "DEBUG: Entering dropRand section, kept: " << kept << std::endl;
               // we will use hashes of the mapping structs to break ties
               // first we'll make a vector of the mappings including the hashes
               std::vector<std::tuple<double, size_t, MappingResult*>> score_and_hash; // The tuple is (score, hash, pointer to the mapping)
@@ -147,30 +138,24 @@ namespace skch
                   if(vec[*it].discard == 0)
                   {
                       score_and_hash.emplace_back(get_score(*it), vec[*it].hash(), &vec[*it]);
-                      std::cerr << "DEBUG: Added mapping " << *it << " to score_and_hash, score: " << get_score(*it) << ", hash: " << vec[*it].hash() << std::endl;
                   }
               }
               // now we'll sort the vector by score and hash
               std::sort(score_and_hash.begin(), score_and_hash.end(), std::greater{});
-              std::cerr << "DEBUG: Sorted score_and_hash, size: " << score_and_hash.size() << std::endl;
               // reset kept counter
               kept = 0;
               for (auto& x : score_and_hash) {
                   std::get<2>(x)->discard = 1;
-                  std::cerr << "DEBUG: Initially marked mapping as discarded, score: " << std::get<0>(x) << ", hash: " << std::get<1>(x) << std::endl;
               }
               // now we mark the best to keep
               for (auto& x : score_and_hash) {
                   if (kept > secondaryToKeep) {
-                      std::cerr << "DEBUG: Reached secondaryToKeep limit, breaking" << std::endl;
                       break;
                   }
                   std::get<2>(x)->discard = 0;
                   ++kept;
-                  std::cerr << "DEBUG: Marked mapping as kept, score: " << std::get<0>(x) << ", hash: " << std::get<1>(x) << ", kept: " << kept << std::endl;
               }
             }
-            std::cerr << "DEBUG: Exiting markGood, final kept: " << kept << std::endl;
           }
       };
 

From cd0542ea677c86e8c08cfc1e761f564b890986ec Mon Sep 17 00:00:00 2001
From: "Erik Garrison (aider)" <erik.garrison@gmail.com>
Date: Wed, 18 Sep 2024 16:18:07 -0500
Subject: [PATCH 28/29] fix: Correct calculation of query_end and target_end in
 tail patching

---
 src/common/wflign/src/wflign_patch.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp
index 65d467e7..6eae79e6 100644
--- a/src/common/wflign/src/wflign_patch.cpp
+++ b/src/common/wflign/src/wflign_patch.cpp
@@ -1397,17 +1397,17 @@ void write_merged_alignment(
                     query_pos = query_length;
                     target_pos = target_length;
 
-                    // Add safety checks first
-                    uint64_t new_query_end = query_offset + query_length;
-                    uint64_t new_target_end = target_offset + target_length + actual_extension;
+                    // Calculate new ends relative to the segment being aligned
+                    uint64_t new_query_end = query_length;
+                    uint64_t new_target_end = target_length + actual_extension;
 
-                    if (new_query_end > query_total_length || new_target_end > target_total_length) {
+                    if (query_offset + new_query_end > query_total_length || target_offset + new_target_end > target_total_length) {
                         std::cerr << "[wfmash::patch] Warning: Alignment extends beyond sequence bounds. Truncating." << std::endl;
                     }
 
-                    // Adjust query_end and target_end, ensuring we don't exceed the total lengths
-                    query_end = std::min(new_query_end, query_total_length);
-                    target_end = std::min(new_target_end, target_total_length);
+                    // Adjust query_end and target_end, ensuring we don't exceed the segment lengths
+                    query_end = std::min(new_query_end, query_length);
+                    target_end = std::min(new_target_end, target_length + actual_extension);
                 }
             }
 

From 240a0d36389e9c81795d51fdea989dbea1b5e954 Mon Sep 17 00:00:00 2001
From: Erik Garrison <erik.garrison@gmail.com>
Date: Wed, 18 Sep 2024 18:20:20 -0500
Subject: [PATCH 29/29] add pafcheck to validation

---
 .github/workflows/test_on_push.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml
index 7d34b429..4f813f3b 100644
--- a/.github/workflows/test_on_push.yml
+++ b/.github/workflows/test_on_push.yml
@@ -53,8 +53,12 @@ jobs:
           override: true
       - name: Install wgatools
         run: cargo install --git https://github.com/wjwei-handsome/wgatools.git
+      - name: Install wgatools
+        run: cargo install --git https://github.com/ekg/pafcheck.git
       - name: Run wfmash and generate PAF
-        run: build/bin/wfmash -t 8 -n 1 -k 19 -s 5000 -p 90 -c 30k -P 50k -T SGDref -Q S288C -Y '#' data/scerevisiae8.fa.gz data/scerevisiae8.fa.gz > test.paf
+        run: build/bin/wfmash -t 8 -T SGDref -Q S288C -Y '#' data/scerevisiae8.fa.gz > test.paf
+      - name: check PAF coordinates and extended CIGAR validity
+        run: pafcheck --query-fasta data/scerevisiae8.fa.gz --paf test.paf
       - name: Convert PAF to MAF using wgatools
         run: wgatools paf2maf --target data/scerevisiae8.fa.gz --query data/scerevisiae8.fa.gz test.paf > test.maf
       - name: Check if MAF file is not empty