From 187baabf96dfe329a840621e4ce0a15bf87a2f89 Mon Sep 17 00:00:00 2001
From: John Stone <john.stone@nanoporetech.com>
Date: Mon, 15 Jul 2024 15:31:15 +0200
Subject: [PATCH 1/7] Fixed a segfault when Correct couldn't create an index
 for the input FASTQ file. Now checking early if input files exist.

---
 dorado/cli/correct.cpp                  | 11 +++++++++++
 dorado/read_pipeline/CorrectionNode.cpp |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/dorado/cli/correct.cpp b/dorado/cli/correct.cpp
index af5c3c6bc..00f5a6933 100644
--- a/dorado/cli/correct.cpp
+++ b/dorado/cli/correct.cpp
@@ -106,10 +106,21 @@ int correct(int argc, char* argv[]) {
         std::exit(EXIT_FAILURE);
     }
 
+    if (!std::filesystem::exists(reads.front())) {
+        spdlog::error("Input reads file {} does not exist!", reads.front());
+        std::exit(EXIT_FAILURE);
+    }
+
     std::filesystem::path model_dir;
     bool remove_tmp_dir = false;
     if (parser.is_used("--model-path")) {
         model_dir = std::filesystem::path(parser.get<std::string>("model-path"));
+
+        if (!std::filesystem::exists(model_dir)) {
+            spdlog::error("Input model path {} does not exist!", model_dir.string());
+            std::exit(EXIT_FAILURE);
+        }
+
     } else {
         // Download model
         auto tmp_dir = utils::get_downloads_path(std::nullopt);
diff --git a/dorado/read_pipeline/CorrectionNode.cpp b/dorado/read_pipeline/CorrectionNode.cpp
index fd5d79669..231ba1572 100644
--- a/dorado/read_pipeline/CorrectionNode.cpp
+++ b/dorado/read_pipeline/CorrectionNode.cpp
@@ -472,10 +472,10 @@ CorrectionNode::CorrectionNode(const std::string& fastq,
     // Create index for fastq file.
     char* idx_name = fai_path(fastq.c_str());
     spdlog::debug("Looking for idx {}", idx_name);
-    if (!std::filesystem::exists(idx_name)) {
+    if (idx_name && !std::filesystem::exists(idx_name)) {
         if (fai_build(fastq.c_str()) != 0) {
             spdlog::error("Failed to build index for file {}", fastq);
-            throw std::runtime_error("");
+            throw std::runtime_error{"Failed to build index for file " + fastq + "."};
         }
         spdlog::debug("Created fastq index.");
     }

From 9b5ba4525b2b23b487911518c78a822aa9e6b17a Mon Sep 17 00:00:00 2001
From: John Stone <john.stone@nanoporetech.com>
Date: Tue, 16 Jul 2024 12:15:05 +0200
Subject: [PATCH 2/7] Bugfix for the CIGAR out of bounds issue in
 correct/features.cpp.

---
 dorado/correct/features.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dorado/correct/features.cpp b/dorado/correct/features.cpp
index 47afb7a65..dddbe9d82 100644
--- a/dorado/correct/features.cpp
+++ b/dorado/correct/features.cpp
@@ -248,8 +248,10 @@ std::tuple<at::Tensor, at::Tensor> get_features_for_window(
             qseq = utils::reverse_complement(qseq);
             std::reverse(qqual.begin(), qqual.end());
         }
-        int cigar_len = overlap.cigar_end_idx - overlap.cigar_start_idx + 1;
-        int cigar_end = std::min((int)cigar.size(), cigar_len);
+
+        const int cigar_len_total = static_cast<int>(std::size(cigar));
+        const int cigar_len = overlap.cigar_end_idx - overlap.cigar_start_idx + 1;
+        const int cigar_end = std::min(cigar_len_total - overlap.cigar_start_idx, cigar_len);
 
         uint8_t gap = fwd ? '*' : '#';
 

From bc15a264fdda254ea0d691800dcf891ed7888a00 Mon Sep 17 00:00:00 2001
From: John Stone <john.stone@nanoporetech.com>
Date: Thu, 25 Jul 2024 10:32:43 +0100
Subject: [PATCH 3/7] Dorado Correct: Log the batch size to info.

---
 dorado/read_pipeline/CorrectionNode.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dorado/read_pipeline/CorrectionNode.cpp b/dorado/read_pipeline/CorrectionNode.cpp
index 231ba1572..3890dc6bd 100644
--- a/dorado/read_pipeline/CorrectionNode.cpp
+++ b/dorado/read_pipeline/CorrectionNode.cpp
@@ -461,7 +461,8 @@ CorrectionNode::CorrectionNode(const std::string& fastq,
                     throw std::runtime_error("Insufficient memory to run inference on " + dev);
                 }
             }
-            spdlog::debug("Using batch size {} on device {}", device_batch_size, dev);
+            spdlog::info("> Using batch size {} on device {} in inference thread {}.",
+                         device_batch_size, dev, i);
             m_infer_threads.push_back(
                     std::thread(&CorrectionNode::infer_fn, this, dev, (int)d, device_batch_size));
         }

From 2a61e349193ed9ffe863f1efd0ea93dbf2929238 Mon Sep 17 00:00:00 2001
From: John Stone <john.stone@nanoporetech.com>
Date: Thu, 25 Jul 2024 12:29:51 +0100
Subject: [PATCH 4/7] Updated the .gitignore.

---
 .gitignore | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 84e92149d..aa19d6cc1 100755
--- a/.gitignore
+++ b/.gitignore
@@ -6,11 +6,13 @@
 *.qdrep
 dist
 build
-build-Release
-build-Debug
+build-*
 archive
 cmake-build*
 cmake.lock
 CMakeSettings.json
 _CPack_Packages
 tests/test_output
+.temp_dorado_model-*
+temp
+venv

From 01c39db614c173208e592dbb15f87be213719ea3 Mon Sep 17 00:00:00 2001
From: John Stone <john.stone@nanoporetech.com>
Date: Tue, 30 Jul 2024 10:27:09 +0200
Subject: [PATCH 5/7] Dorado Correct now uses a smaller auto-calculated batch
 size for GPU inference to avoid Torch issues related to not-enough-memory.

---
 dorado/read_pipeline/CorrectionNode.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dorado/read_pipeline/CorrectionNode.cpp b/dorado/read_pipeline/CorrectionNode.cpp
index 3890dc6bd..74031ccb4 100644
--- a/dorado/read_pipeline/CorrectionNode.cpp
+++ b/dorado/read_pipeline/CorrectionNode.cpp
@@ -453,10 +453,11 @@ CorrectionNode::CorrectionNode(const std::string& fastq,
 #endif
     for (size_t d = 0; d < devices.size(); d++) {
         const auto& dev = devices[d];
+        const float batch_factor = (utils::starts_with(device, "cuda")) ? 0.4f : 0.8f;
         for (int i = 0; i < infer_threads; i++) {
             int device_batch_size = batch_size;
             if (batch_size == 0) {
-                device_batch_size = calculate_batch_size(dev, 0.8f);
+                device_batch_size = calculate_batch_size(dev, batch_factor);
                 if (device_batch_size == 0) {
                     throw std::runtime_error("Insufficient memory to run inference on " + dev);
                 }

From 6178d43d0d96cf2189c3bbf5814adceaafc1bf12 Mon Sep 17 00:00:00 2001
From: Richard Harris <richard.harris@nanoporetech.com>
Date: Tue, 30 Jul 2024 11:07:58 +0000
Subject: [PATCH 6/7] DOR-740 DOR-774 Dorado Correct Performance

---
 dorado/correct/features.cpp                   | 142 ++++++++----------
 dorado/correct/features.h                     |   6 +
 dorado/correct/types.h                        |   1 +
 dorado/correct/windows.cpp                    |  29 ++--
 dorado/read_pipeline/CorrectionNode.cpp       |  62 ++++----
 .../ErrorCorrectionMapperNode.cpp             |  84 +++++++----
 dorado/read_pipeline/messages.h               |  13 +-
 dorado/utils/types.h                          |   2 +-
 8 files changed, 169 insertions(+), 170 deletions(-)

diff --git a/dorado/correct/features.cpp b/dorado/correct/features.cpp
index dddbe9d82..ae183f9b1 100644
--- a/dorado/correct/features.cpp
+++ b/dorado/correct/features.cpp
@@ -1,13 +1,17 @@
 #include "features.h"
 
 #include "conversions.h"
+#include "correct/types.h"
 #include "read_pipeline/messages.h"
 #include "utils/sequence_utils.h"
+#include "utils/types.h"
 
 #include <ATen/Tensor.h>
 #include <spdlog/spdlog.h>
 #include <torch/types.h>
 
+#include <cstdint>
+
 #ifdef NDEBUG
 #define LOG_TRACE(...)
 #else
@@ -19,61 +23,27 @@ const int TOP_K = 30;
 namespace dorado::correction {
 
 bool overlap_has_long_indel(const OverlapWindow& overlap, const CorrectionAlignments& alignments) {
-    bool long_indel = false;
     const auto& cigar = alignments.cigars[overlap.overlap_idx];
     size_t max_cigar_idx = std::min(size_t(overlap.cigar_end_idx + 1), cigar.size());
     for (size_t i = overlap.cigar_start_idx; i < max_cigar_idx; i++) {
-        if (cigar[i].op == CigarOpType::INS || cigar[i].op == CigarOpType::DEL) {
-            long_indel |= cigar[i].len >= 30;
+        if (cigar[i].len >= 30 &&
+            (cigar[i].op == CigarOpType::INS || cigar[i].op == CigarOpType::DEL)) {
+            LOG_TRACE("filter ? tstart {} qstart {} qend {} long indel: {}", overlap.tstart,
+                      overlap.qstart, overlap.qend, cigar[i].len);
+            return true;
         }
     }
-    LOG_TRACE("filter ? tstart {} qstart {} qend {} res {}", overlap.tstart, overlap.qstart,
-              overlap.qend, long_indel);
-    return long_indel;
+    return false;
 }
 
 // Measure the accuracy of an alignment segment within a window
 // determined from the cigar string.
-void calculate_accuracy(OverlapWindow& overlap,
-                        const CorrectionAlignments& alignments,
-                        size_t win_idx,
-                        int win_len,
-                        int window_size) {
-    int tstart = overlap.tstart;
-    int tend = (int)win_idx * window_size + win_len;
-
-    // get query region
-    const auto overlap_idx = overlap.overlap_idx;
-    int oqstart = alignments.overlaps[overlap_idx].qstart;
-    int oqend = alignments.overlaps[overlap_idx].qend;
-    int qstart, qend;
-    if (alignments.overlaps[overlap_idx].fwd) {
-        qstart = oqstart + overlap.qstart;
-        qend = oqstart + overlap.qend;
-    } else {
-        qstart = oqend - overlap.qend;
-        qend = oqend - overlap.qstart;
-    }
-
-    int qlen = qend - qstart;
-
-    // Fetch subsequences
-    std::string tseq = alignments.read_seq.substr(tstart, tend - tstart);
-    std::string qseq;
-    if (alignments.overlaps[overlap_idx].fwd) {
-        qseq = alignments.seqs[overlap_idx].substr(qstart, qlen);
-    } else {
-        qseq = utils::reverse_complement(alignments.seqs[overlap_idx].substr(qstart, qlen));
-    }
-
-    LOG_TRACE("tstart {} tend {} qstart {} qend {} cig st {} cig end {}", tstart, tend, qstart,
-              qend, overlap.cigar_start_idx, overlap.cigar_end_idx);
-
+void calculate_accuracy(OverlapWindow& overlap, const CorrectionAlignments& alignments) {
     const auto& cigar = alignments.cigars[overlap.overlap_idx];
+    bool has_warned_bad_cigar_op = false;
 
-    // Calculate accuracy
-    int tpos = 0, qpos = 0;
-    int m = 0, s = 0, i = 0, d = 0;
+    // counts of match, mismatch, insert, deletion
+    int n_match = 0, n_miss = 0, n_ins = 0, n_del = 0;
 
     for (int idx = overlap.cigar_start_idx; idx <= overlap.cigar_end_idx; idx++) {
         int len = -1;
@@ -91,40 +61,30 @@ void calculate_accuracy(OverlapWindow& overlap,
             break;
         }
 
-        LOG_TRACE("len {} tpos {} qpos {}", len, tpos, qpos);
-
         switch (cigar[idx].op) {
-        case CigarOpType::MATCH:
-            for (int j = 0; j < len; j++) {
-                auto tbase = tseq[tpos + j];
-                auto qbase = qseq[qpos + j];
-                LOG_TRACE("{} tbase {}, {} qbase {}", tpos + j, tbase, qpos + j, qbase);
-
-                if (tbase == qbase) {
-                    m += 1;
-                } else {
-                    s += 1;
-                }
-            }
-
-            tpos += len;
-            qpos += len;
+        case CigarOpType::EQ_MATCH:
+            n_match += len;
+            break;
+        case CigarOpType::X_MISMATCH:
+            n_miss += len;
             break;
         case CigarOpType::INS:
-            i += len;
-            qpos += len;
+            n_ins += len;
             break;
         case CigarOpType::DEL:
-            d += len;
-            tpos += len;
+            n_del += len;
             break;
         default:
+            if (!has_warned_bad_cigar_op) {
+                has_warned_bad_cigar_op = true;
+                LOG_TRACE("unexpected CigarOpType: {}", uint8_t(cigar[idx].op));
+            }
             break;
         }
     }
 
-    overlap.accuracy = (static_cast<float>(m) / (m + s + i + d));
-    LOG_TRACE("m {} s {} i {} d {}", m, s, i, d);
+    overlap.accuracy = (static_cast<float>(n_match) / (n_match + n_miss + n_ins + n_del));
+    LOG_TRACE("m {} s {} i {} d {}", n_match, n_miss, n_ins, n_del);
     LOG_TRACE("accuracy qstart {} qend {} {}", overlap.qstart, overlap.qend, overlap.accuracy);
 }
 
@@ -148,7 +108,8 @@ std::vector<int> get_max_ins_for_window(const std::vector<OverlapWindow>& overla
             int len = cigar[i].len;
 
             int l = -1;
-            if (op == CigarOpType::MATCH || op == CigarOpType::DEL) {
+            if (op == CigarOpType::EQ_MATCH || op == CigarOpType::X_MISMATCH ||
+                op == CigarOpType::DEL) {
                 l = len;
             } else if (op == CigarOpType::INS) {
                 max_ins[tpos - 1] = std::max(len, max_ins[tpos - 1]);
@@ -283,8 +244,8 @@ std::tuple<at::Tensor, at::Tensor> get_features_for_window(
             LOG_TRACE("cigar_idx {} l {}", cigar_idx, l);
 
             switch (op) {
-            case CigarOpType::MATCH:
-            case CigarOpType::MISMATCH:
+            case CigarOpType::EQ_MATCH:
+            case CigarOpType::X_MISMATCH:
                 for (uint32_t i = 0; i < l; i++) {
                     auto base = base_encoding[uint8_t(qseq[query_iter]) + (fwd ? 0 : 32)];
                     auto qual = qqual[query_iter];
@@ -414,18 +375,10 @@ at::Tensor get_indices(const at::Tensor& bases, const std::vector<std::pair<int,
             .clone();
 }
 
-// Main interface function for generating features for each window
-// given the overlaps for a target read.
-std::vector<WindowFeatures> extract_features(std::vector<std::vector<OverlapWindow>>& windows,
-                                             const CorrectionAlignments& alignments,
-                                             int window_size) {
-    const std::string& tseq = alignments.read_seq;
-    int tlen = (int)tseq.length();
-
-    std::vector<WindowFeatures> wfs;
+std::unordered_set<int> filter_features(std::vector<std::vector<OverlapWindow>>& windows,
+                                        const CorrectionAlignments& alignments) {
+    std::unordered_set<int> overlap_idxs;
     for (int w = 0; w < (int)windows.size(); w++) {
-        int win_len = (w == (int)windows.size() - 1) ? tlen - window_size * w : window_size;
-        LOG_TRACE("win idx {}: win len {}", w, win_len);
         auto& overlap_windows = windows[w];
 
         // Filter overlaps with very large indels
@@ -442,7 +395,7 @@ std::vector<WindowFeatures> extract_features(std::vector<std::vector<OverlapWind
         // Sort overlaps by score
         if (overlap_windows.size() > 1) {
             for (auto& ovlp : overlap_windows) {
-                calculate_accuracy(ovlp, alignments, w, win_len, window_size);
+                calculate_accuracy(ovlp, alignments);
             }
             // Sort the filtered overlaps by accuracy score
             std::sort(overlap_windows.begin(), overlap_windows.end(),
@@ -456,14 +409,39 @@ std::vector<WindowFeatures> extract_features(std::vector<std::vector<OverlapWind
                           return a.accuracy > b.accuracy;
                       });
         }
+        // Take the TOP_K best overlaps
         overlap_windows.resize(std::min(TOP_K, (int)overlap_windows.size()));
 
+#ifndef NDEBUG
         if (overlap_windows.size() == 1) {
             LOG_TRACE("window {} 1st {}-{}", w, overlap_windows[0].qstart, overlap_windows[0].qend);
         } else if (overlap_windows.size() > 1) {
             LOG_TRACE("window {} 1st {}-{} 2nd {}-{}", w, overlap_windows[0].qstart,
                       overlap_windows[0].qend, overlap_windows[1].qstart, overlap_windows[1].qend);
         }
+#endif
+
+        for (const auto& ov : overlap_windows) {
+            assert(ov.overlap_idx >= 0);
+            overlap_idxs.insert(ov.overlap_idx);
+        }
+    }
+    return overlap_idxs;
+}
+
+// Main interface function for generating features for the top_k overlaps for each window
+// given the overlaps for a target read.
+std::vector<WindowFeatures> extract_features(std::vector<std::vector<OverlapWindow>>& windows,
+                                             const CorrectionAlignments& alignments,
+                                             int window_size) {
+    const std::string& tseq = alignments.read_seq;
+    int tlen = (int)tseq.length();
+
+    std::vector<WindowFeatures> wfs;
+    for (int w = 0; w < (int)windows.size(); w++) {
+        int win_len = (w == (int)windows.size() - 1) ? tlen - window_size * w : window_size;
+        LOG_TRACE("win idx {}: win len {}", w, win_len);
+        auto& overlap_windows = windows[w];
 
         WindowFeatures wf;
         wf.window_idx = w;
diff --git a/dorado/correct/features.h b/dorado/correct/features.h
index 24dbd9c9b..25b3a61c6 100644
--- a/dorado/correct/features.h
+++ b/dorado/correct/features.h
@@ -2,12 +2,18 @@
 
 #include "types.h"
 
+#include <unordered_set>
+
 namespace dorado {
 struct CorrectionAlignments;
 }
 
 namespace dorado::correction {
 
+// Filter window features to TOP_K best. Returns collection of useful overlap indices
+std::unordered_set<int> filter_features(std::vector<std::vector<OverlapWindow>>& windows,
+                                        const CorrectionAlignments& alignments);
+
 std::vector<WindowFeatures> extract_features(std::vector<std::vector<OverlapWindow>>& windows,
                                              const CorrectionAlignments& alignments,
                                              int window_size);
diff --git a/dorado/correct/types.h b/dorado/correct/types.h
index 4bdf8ab20..c89e8f62c 100644
--- a/dorado/correct/types.h
+++ b/dorado/correct/types.h
@@ -8,6 +8,7 @@
 namespace dorado::correction {
 
 struct OverlapWindow {
+    // CorrectionAlignments overlap vector index
     int overlap_idx = -1;
     int tstart = -1;
     int qstart = -1;
diff --git a/dorado/correct/windows.cpp b/dorado/correct/windows.cpp
index 1a3266ebd..0103f4db7 100644
--- a/dorado/correct/windows.cpp
+++ b/dorado/correct/windows.cpp
@@ -6,6 +6,8 @@
 
 #include <spdlog/spdlog.h>
 
+#include <stdexcept>
+
 #ifdef NDEBUG
 #define LOG_TRACE(...)
 #else
@@ -30,10 +32,6 @@ bool extract_windows(std::vector<std::vector<OverlapWindow>>& windows,
     for (int aln_idx = 0; aln_idx < num_alignments; aln_idx++) {
         const auto& overlap = alignments.overlaps[aln_idx];
         const auto& cigar = alignments.cigars[aln_idx];
-        //if (alignments.qnames[a] != "e3066d3e-2bdf-4803-89b9-0f077ac7ff7f") {
-        //    continue;
-        //}
-        LOG_TRACE("window for {}", alignments.qnames[aln_idx]);
 
         // Following the is_target == False logic form the rust code.
         if ((overlap.tend - overlap.tstart < window_size) ||
@@ -62,10 +60,10 @@ bool extract_windows(std::vector<std::vector<OverlapWindow>>& windows,
             spdlog::error(
                     "{} zeroth thres {} nth thres {} first win {} last win {} windows size {} "
                     "overlap "
-                    "tlen {} overlsp tstart {} overlap tend {} qname {} qlen {} qstart {} qend {}",
+                    "tlen {} overlsp tstart {} overlap tend {} qlen {} qstart {} qend {}",
                     alignments.read_name, zeroth_window_thresh, nth_window_thresh, first_window,
                     last_window, windows.size(), overlap.tlen, overlap.tstart, overlap.tend,
-                    alignments.qnames[aln_idx], overlap.qlen, overlap.qstart, overlap.qend);
+                    overlap.qlen, overlap.qstart, overlap.qend);
             return false;
         }
 
@@ -102,8 +100,8 @@ bool extract_windows(std::vector<std::vector<OverlapWindow>>& windows,
             int tnew = tpos;
             int qnew = qpos;
             switch (op.op) {
-            case CigarOpType::MATCH:
-            case CigarOpType::MISMATCH:
+            case CigarOpType::EQ_MATCH:
+            case CigarOpType::X_MISMATCH:
                 tnew = tpos + op.len;
                 qnew = qpos + op.len;
                 LOG_TRACE("{} {}", op.len, "M");
@@ -117,7 +115,7 @@ bool extract_windows(std::vector<std::vector<OverlapWindow>>& windows,
                 LOG_TRACE("{} {}", op.len, "I");
                 continue;
             default:
-                continue;
+                throw std::runtime_error("unexpected CigarOpType");
             }
 
             LOG_TRACE("tpos {} qpos {} tnew {} qnew {}", tpos, qpos, tnew, qnew);
@@ -137,9 +135,10 @@ bool extract_windows(std::vector<std::vector<OverlapWindow>>& windows,
             for (int i = 1; i < diff_w; i++) {
                 int offset = (current_w + i) * window_size - tpos;
 
-                int q_start_new = (op.op == CigarOpType::MATCH || op.op == CigarOpType::MISMATCH)
-                                          ? qpos + offset
-                                          : qpos;
+                int q_start_new =
+                        (op.op == CigarOpType::EQ_MATCH || op.op == CigarOpType::X_MISMATCH)
+                                ? qpos + offset
+                                : qpos;
 
                 if (cigar_start_idx >= 0) {
                     windows[(current_w + i) - 1].push_back(
@@ -154,7 +153,7 @@ bool extract_windows(std::vector<std::vector<OverlapWindow>>& windows,
 
                     t_window_start = tpos + offset;
 
-                    if (op.op == CigarOpType::MATCH || op.op == CigarOpType::MISMATCH) {
+                    if (op.op == CigarOpType::EQ_MATCH || op.op == CigarOpType::X_MISMATCH) {
                         q_window_start = qpos + offset;
                     } else {
                         q_window_start = qpos;
@@ -165,7 +164,7 @@ bool extract_windows(std::vector<std::vector<OverlapWindow>>& windows,
                 } else {
                     t_window_start = tpos + offset;
 
-                    if (op.op == CigarOpType::MATCH || op.op == CigarOpType::MISMATCH) {
+                    if (op.op == CigarOpType::EQ_MATCH || op.op == CigarOpType::X_MISMATCH) {
                         q_window_start = qpos + offset;
                     } else {
                         q_window_start = qpos;
@@ -179,7 +178,7 @@ bool extract_windows(std::vector<std::vector<OverlapWindow>>& windows,
             LOG_TRACE("new_w {} window size {} tpos {}", new_w, window_size, tpos);
             int offset = new_w * window_size - tpos;
 
-            int qend = (op.op == CigarOpType::MATCH || op.op == CigarOpType::MISMATCH)
+            int qend = (op.op == CigarOpType::EQ_MATCH || op.op == CigarOpType::X_MISMATCH)
                                ? qpos + offset
                                : qpos;
 
diff --git a/dorado/read_pipeline/CorrectionNode.cpp b/dorado/read_pipeline/CorrectionNode.cpp
index 74031ccb4..e7b1d36f4 100644
--- a/dorado/read_pipeline/CorrectionNode.cpp
+++ b/dorado/read_pipeline/CorrectionNode.cpp
@@ -11,6 +11,9 @@
 #include "utils/string_utils.h"
 #include "utils/thread_utils.h"
 #include "utils/types.h"
+
+#include <stdexcept>
+#include <unordered_set>
 #if DORADO_CUDA_BUILD
 #include "utils/cuda_utils.h"
 #endif
@@ -46,44 +49,23 @@ dorado::BamPtr create_bam_record(const std::string& read_id, const std::string&
     return dorado::BamPtr(rec);
 }
 
-std::vector<dorado::CigarOp> parse_cigar(const uint32_t* cigar, uint32_t n_cigar) {
-    std::vector<dorado::CigarOp> cigar_ops;
-    cigar_ops.resize(n_cigar);
-    for (uint32_t i = 0; i < n_cigar; i++) {
-        uint32_t op = cigar[i] & 0xf;
-        uint32_t len = cigar[i] >> 4;
-        if (op == MM_CIGAR_MATCH) {
-            cigar_ops[i] = {dorado::CigarOpType::MATCH, len};
-        } else if (op == MM_CIGAR_INS) {
-            cigar_ops[i] = {dorado::CigarOpType::INS, len};
-        } else if (op == MM_CIGAR_DEL) {
-            cigar_ops[i] = {dorado::CigarOpType::DEL, len};
-        } else {
-            throw std::runtime_error("Unknown cigar op: " + std::to_string(op));
-        }
-    }
-    return cigar_ops;
-}
-
 bool populate_alignments(dorado::CorrectionAlignments& alignments,
-                         dorado::hts_io::FastxRandomReader* reader) {
+                         dorado::hts_io::FastxRandomReader* reader,
+                         const std::unordered_set<int>& useful_overlap_idxs) {
     const auto& tname = alignments.read_name;
+
     alignments.read_seq = reader->fetch_seq(tname);
     alignments.read_qual = reader->fetch_qual(tname);
     int tlen = (int)alignments.read_seq.length();
+
+    // Might be worthwhile generating dense vectors with some index mapping to save memory
+    // as using filtering of useful overlaps makes these vectors sparse.
     auto num_qnames = alignments.qnames.size();
     alignments.seqs.resize(num_qnames);
     alignments.quals.resize(num_qnames);
     alignments.cigars.resize(num_qnames);
 
-    // In some cases the target read length reported by mm2 has differed from the
-    // read length when loaded from the fastq. So we check that here and skip
-    // any alignments where information is inconsisteny.
-    // TODO: This was mainly observed before a bug fix for proper loading
-    // of split mm2 indices was added. However the check is being kept around
-    // for now, and can be removed later.
-    std::vector<size_t> pos_to_remove;
-    for (size_t i = 0; i < num_qnames; i++) {
+    for (const size_t i : useful_overlap_idxs) {
         const std::string& qname = alignments.qnames[i];
         alignments.seqs[i] = reader->fetch_seq(qname);
         if ((int)alignments.seqs[i].length() != alignments.overlaps[i].qlen) {
@@ -97,9 +79,6 @@ bool populate_alignments(dorado::CorrectionAlignments& alignments,
                           alignments.overlaps[i].tlen, tlen, tname);
             return false;
         }
-        alignments.cigars[i] = parse_cigar(alignments.mm2_cigars[i].data(),
-                                           (uint32_t)alignments.mm2_cigars[i].size());
-        alignments.mm2_cigars[i] = {};
     }
 
     return alignments.check_consistent_overlaps();
@@ -354,19 +333,32 @@ void CorrectionNode::input_thread_fn() {
 
             auto alignments = std::get<CorrectionAlignments>(std::move(message));
             auto tname = alignments.read_name;
-            if (!populate_alignments(alignments, fastx_reader.get())) {
+
+            if (alignments.overlaps.empty()) {
                 continue;
             }
 
-            size_t n_windows = (alignments.read_seq.length() + m_window_size - 1) / m_window_size;
-            LOG_TRACE("num windows {} for read {}", n_windows, alignments.read_name);
             // Get the windows
+            size_t n_windows = (alignments.overlaps[0].tlen + m_window_size - 1) / m_window_size;
+            LOG_TRACE("num windows {} for read {}", n_windows, alignments.read_name);
             std::vector<std::vector<OverlapWindow>> windows;
             windows.resize(n_windows);
             if (!extract_windows(windows, alignments, m_window_size)) {
                 continue;
             }
-            // Get the features
+
+            // Filter the window features and get the set of unique overlaps
+            const std::unordered_set<int> overlap_idxs = filter_features(windows, alignments);
+            if (overlap_idxs.empty()) {
+                continue;
+            }
+
+            // Populate the alignment data with only the records that are useful after TOP_K filter
+            if (!populate_alignments(alignments, fastx_reader.get(), overlap_idxs)) {
+                continue;
+            }
+
+            // Get the filtered features
             auto wfs = extract_features(windows, alignments, m_window_size);
 
             std::vector<std::string> corrected_seqs;
diff --git a/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp b/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp
index 97d594aa9..3017ba579 100644
--- a/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp
+++ b/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp
@@ -6,9 +6,11 @@
 #include "alignment/Minimap2Index.h"
 #include "alignment/Minimap2IndexSupportTypes.h"
 #include "alignment/Minimap2Options.h"
+#include "alignment/minimap2_args.h"
+#include "alignment/minimap2_wrappers.h"
 #include "utils/PostCondition.h"
 #include "utils/bam_utils.h"
-#include "utils/thread_utils.h"
+#include "utils/thread_naming.h"
 
 #include <htslib/faidx.h>
 #include <htslib/sam.h>
@@ -20,12 +22,32 @@
 #include <string>
 #include <vector>
 
-const size_t MAX_OVERLAPS_PER_READ = 500;
-
 namespace {
 
-std::vector<uint32_t> copy_mm2_cigar(const uint32_t* cigar, uint32_t n_cigar) {
-    std::vector<uint32_t> cigar_ops(cigar, cigar + n_cigar);
+std::vector<dorado::CigarOp> parse_cigar(const uint32_t* cigar, uint32_t n_cigar) {
+    std::vector<dorado::CigarOp> cigar_ops;
+    cigar_ops.resize(n_cigar);
+    for (uint32_t i = 0; i < n_cigar; i++) {
+        const uint32_t op = cigar[i] & 0xf;
+        const uint32_t len = cigar[i] >> 4;
+
+        // minimap2 --eqx must be set
+        if (op == MM_CIGAR_EQ_MATCH) {
+            cigar_ops[i] = {dorado::CigarOpType::EQ_MATCH, len};
+        } else if (op == MM_CIGAR_X_MISMATCH) {
+            cigar_ops[i] = {dorado::CigarOpType::X_MISMATCH, len};
+        } else if (op == MM_CIGAR_INS) {
+            cigar_ops[i] = {dorado::CigarOpType::INS, len};
+        } else if (op == MM_CIGAR_DEL) {
+            cigar_ops[i] = {dorado::CigarOpType::DEL, len};
+        } else if (op == MM_CIGAR_MATCH) {
+            throw std::runtime_error(
+                    "cigar op MATCH is not supported must set minimap2 --eqx flag" +
+                    std::to_string(op));
+        } else {
+            throw std::runtime_error("Unknown cigar op: " + std::to_string(op));
+        }
+    }
     return cigar_ops;
 }
 
@@ -104,22 +126,13 @@ void ErrorCorrectionMapperNode::extract_alignments(const mm_reg1_t* reg,
             continue;
         }
 
-        uint32_t n_cigar = aln->p->n_cigar;
-        auto cigar = copy_mm2_cigar(aln->p->cigar, n_cigar);
-
+        auto cigar = parse_cigar(aln->p->cigar, aln->p->n_cigar);
         {
             std::lock_guard<std::mutex> aln_lock(mtx);
 
             auto& alignments = m_correction_records[tname];
-
-            // Cap total overlaps per read.
-            if (alignments.qnames.size() >= MAX_OVERLAPS_PER_READ) {
-                continue;
-            }
-
             alignments.qnames.push_back(qname);
-
-            alignments.mm2_cigars.push_back(std::move(cigar));
+            alignments.cigars.push_back(std::move(cigar));
             alignments.overlaps.push_back(std::move(ovlp));
         }
     }
@@ -245,20 +258,31 @@ ErrorCorrectionMapperNode::ErrorCorrectionMapperNode(const std::string& index_fi
           m_index_file(index_file),
           m_num_threads(threads),
           m_reads_queue(5000) {
-    alignment::Minimap2Options options = alignment::dflt_options;
-    options.kmer_size = 25;
-    options.window_size = 17;
-    options.index_batch_size = index_size;
-    options.mm2_preset = "ava-ont";
-    options.bandwidth = 150;
-    options.bandwidth_long = 2000;
-    options.min_chain_score = 4000;
-    options.zdrop = options.zdrop_inv = 200;
-    options.occ_dist = 200;
-    options.cs = "short";
-    options.dual = "yes";
-    options.cap_kalloc = std::nullopt;
-    options.max_sw_mat = std::nullopt;
+    auto options = alignment::create_preset_options("ava-ont");
+    auto& index_options = options.index_options->get();
+    index_options.k = 25;
+    index_options.w = 17;
+    index_options.batch_size = index_size;
+    auto& mapping_options = options.mapping_options->get();
+    mapping_options.bw = 150;
+    mapping_options.bw_long = 2000;
+    mapping_options.min_chain_score = 4000;
+    mapping_options.zdrop = 200;
+    mapping_options.zdrop_inv = 200;
+    mapping_options.occ_dist = 200;
+    mapping_options.flag |= MM_F_EQX;
+
+    // --cs short
+    alignment::mm2::apply_cs_option(options, "short");
+
+    // --dual yes
+    alignment::mm2::apply_dual_option(options, "yes");
+
+    // reset to larger minimap2 defaults
+    mm_mapopt_t minimap_default_mapopt;
+    mm_mapopt_init(&minimap_default_mapopt);
+    mapping_options.cap_kalloc = minimap_default_mapopt.cap_kalloc;
+    mapping_options.max_sw_mat = minimap_default_mapopt.max_sw_mat;
 
     m_index = std::make_shared<alignment::Minimap2Index>();
     if (!m_index->initialise(options)) {
diff --git a/dorado/read_pipeline/messages.h b/dorado/read_pipeline/messages.h
index 7a0bd54f4..91bbd5a96 100644
--- a/dorado/read_pipeline/messages.h
+++ b/dorado/read_pipeline/messages.h
@@ -222,15 +222,17 @@ struct Overlap {
 
 // Overlaps for error correction
 struct CorrectionAlignments {
+    // Populated in ErrorCorrectionMapperNode::extract_alignments
     std::string read_name = "";
+    std::vector<std::string> qnames = {};
+    std::vector<std::vector<CigarOp>> cigars = {};
+    std::vector<Overlap> overlaps = {};
+
+    // Populated in CorrectionNode::populate_alignments if the alignment is useful
     std::string read_seq = "";
     std::vector<uint8_t> read_qual = {};
-    std::vector<Overlap> overlaps = {};
-    std::vector<std::vector<CigarOp>> cigars = {};
-    std::vector<std::vector<uint32_t>> mm2_cigars = {};
     std::vector<std::string> seqs = {};
     std::vector<std::vector<uint8_t>> quals = {};
-    std::vector<std::string> qnames = {};
 
     // This is mostly to workaround an issue where sometimes
     // the tend of an overlap is much bigger than the
@@ -257,9 +259,6 @@ struct CorrectionAlignments {
         for (auto& v : cigars) {
             si += v.size() * sizeof(CigarOp);
         }
-        for (auto& v : mm2_cigars) {
-            si += v.size() * sizeof(uint32_t);
-        }
         for (auto& s : seqs) {
             si += s.length();
         }
diff --git a/dorado/utils/types.h b/dorado/utils/types.h
index 1d3b9ce3d..63532bff4 100644
--- a/dorado/utils/types.h
+++ b/dorado/utils/types.h
@@ -237,7 +237,7 @@ struct ModBaseInfo {
 };
 
 // Enum for handling CIGAR ops
-enum class CigarOpType : uint8_t { INS = 0, DEL, MATCH, MISMATCH };
+enum class CigarOpType : uint8_t { INS = 0, DEL, EQ_MATCH, X_MISMATCH };
 
 struct CigarOp {
     CigarOpType op;

From 550385c9f89c2be5442643511beed7c0ad33a84b Mon Sep 17 00:00:00 2001
From: John Stone <john.stone@nanoporetech.com>
Date: Wed, 31 Jul 2024 10:46:50 +0200
Subject: [PATCH 7/7] Resolved cherry-picking conflicts.

---
 dorado/alignment/Minimap2Index.cpp            |  4 ++
 dorado/alignment/Minimap2Index.h              |  1 +
 .../ErrorCorrectionMapperNode.cpp             | 50 +++++++++----------
 3 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/dorado/alignment/Minimap2Index.cpp b/dorado/alignment/Minimap2Index.cpp
index 27c41a836..092795dbd 100644
--- a/dorado/alignment/Minimap2Index.cpp
+++ b/dorado/alignment/Minimap2Index.cpp
@@ -253,6 +253,10 @@ const mm_mapopt_t& Minimap2Index::mapping_options() const {
     return *m_mapping_options;
 }
 
+mm_mapopt_t& Minimap2Index::mapping_options() {
+    return const_cast<mm_mapopt_t&>(const_cast<const Minimap2Index&>(*this).mapping_options());
+}
+
 const Minimap2Options& Minimap2Index::get_options() const { return m_options; }
 
 }  // namespace dorado::alignment
diff --git a/dorado/alignment/Minimap2Index.h b/dorado/alignment/Minimap2Index.h
index c3728517f..7e06b7261 100644
--- a/dorado/alignment/Minimap2Index.h
+++ b/dorado/alignment/Minimap2Index.h
@@ -46,6 +46,7 @@ class Minimap2Index {
     const mm_idx_t* index() const { return m_index.get(); }
     const mm_idxopt_t& index_options() const;
     const mm_mapopt_t& mapping_options() const;
+    mm_mapopt_t& mapping_options();
 
     HeaderSequenceRecords get_sequence_records_for_header() const;
 
diff --git a/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp b/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp
index 3017ba579..568380bd9 100644
--- a/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp
+++ b/dorado/read_pipeline/ErrorCorrectionMapperNode.cpp
@@ -6,11 +6,9 @@
 #include "alignment/Minimap2Index.h"
 #include "alignment/Minimap2IndexSupportTypes.h"
 #include "alignment/Minimap2Options.h"
-#include "alignment/minimap2_args.h"
-#include "alignment/minimap2_wrappers.h"
 #include "utils/PostCondition.h"
 #include "utils/bam_utils.h"
-#include "utils/thread_naming.h"
+#include "utils/thread_utils.h"
 
 #include <htslib/faidx.h>
 #include <htslib/sam.h>
@@ -258,38 +256,36 @@ ErrorCorrectionMapperNode::ErrorCorrectionMapperNode(const std::string& index_fi
           m_index_file(index_file),
           m_num_threads(threads),
           m_reads_queue(5000) {
-    auto options = alignment::create_preset_options("ava-ont");
-    auto& index_options = options.index_options->get();
-    index_options.k = 25;
-    index_options.w = 17;
-    index_options.batch_size = index_size;
-    auto& mapping_options = options.mapping_options->get();
-    mapping_options.bw = 150;
-    mapping_options.bw_long = 2000;
-    mapping_options.min_chain_score = 4000;
-    mapping_options.zdrop = 200;
-    mapping_options.zdrop_inv = 200;
-    mapping_options.occ_dist = 200;
-    mapping_options.flag |= MM_F_EQX;
-
-    // --cs short
-    alignment::mm2::apply_cs_option(options, "short");
+    alignment::Minimap2Options options = alignment::dflt_options;
+    options.kmer_size = 25;
+    options.window_size = 17;
+    options.index_batch_size = index_size;
+    options.mm2_preset = "ava-ont";
+    options.bandwidth = 150;
+    options.bandwidth_long = 2000;
+    options.min_chain_score = 4000;
+    options.zdrop = options.zdrop_inv = 200;
+    options.occ_dist = 200;
+    options.cs = "short";
+    options.dual = "yes";
+    options.cap_kalloc = std::nullopt;
+    options.max_sw_mat = std::nullopt;
 
-    // --dual yes
-    alignment::mm2::apply_dual_option(options, "yes");
+    m_index = std::make_shared<alignment::Minimap2Index>();
+    if (!m_index->initialise(options)) {
+        throw std::runtime_error("Failed to initialize with options.");
+    } else {
+        spdlog::debug("Initialized index options.");
+    }
 
     // reset to larger minimap2 defaults
+    mm_mapopt_t& mapping_options = m_index->mapping_options();
     mm_mapopt_t minimap_default_mapopt;
     mm_mapopt_init(&minimap_default_mapopt);
     mapping_options.cap_kalloc = minimap_default_mapopt.cap_kalloc;
     mapping_options.max_sw_mat = minimap_default_mapopt.max_sw_mat;
+    mapping_options.flag |= MM_F_EQX;
 
-    m_index = std::make_shared<alignment::Minimap2Index>();
-    if (!m_index->initialise(options)) {
-        throw std::runtime_error("Failed to initialize with options.");
-    } else {
-        spdlog::debug("Initialized index options.");
-    }
     spdlog::debug("Loading index...");
     if (m_index->load(index_file, threads, true) != alignment::IndexLoadResult::success) {
         throw std::runtime_error("Failed to load index file " + index_file);