From 1f709666d06ed9eb2cbf3f5ada347b81b01da89c Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Wed, 13 Nov 2024 17:46:08 -0600 Subject: [PATCH 1/9] refactor: Improve k-mer frequency filtering logic in winSketch.hpp --- src/map/include/winSketch.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index ee02632a..f04dd37f 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -256,15 +256,16 @@ namespace skch continue; // Should never happen } - uint64_t freq_cutoff; + uint64_t freq = freq_it->second; + bool should_filter; if (param.max_kmer_freq <= 1.0) { - // Calculate cutoff based on fraction of total windows - freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq)); + // Filter if frequency exceeds fraction of total windows + should_filter = freq > std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq)); } else { - // Use direct count cutoff - freq_cutoff = (uint64_t)param.max_kmer_freq; + // Filter if frequency exceeds absolute count + should_filter = freq > (uint64_t)param.max_kmer_freq; } - if (freq_it->second > freq_cutoff) { + if (should_filter) { filtered_kmers++; continue; } From d259f86b1933fe3fe30e4b0ec6792cf8ae4e3531 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Wed, 13 Nov 2024 17:58:51 -0600 Subject: [PATCH 2/9] refactor: Align minimizer frequency filtering with minimap2 implementation --- src/map/include/winSketch.hpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index f04dd37f..2db555c0 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -257,15 +257,26 @@ namespace skch } uint64_t freq = freq_it->second; - bool should_filter; + uint64_t min_occ = 10; // minimap2's minimum occurrence threshold + uint64_t max_occ = 1000000; // minimap2's maximum occurrence threshold + uint64_t count_threshold; + if (param.max_kmer_freq <= 1.0) { - // Filter if frequency exceeds fraction of total windows - should_filter = freq > std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq)); + // Calculate threshold based on fraction, but respect min/max bounds + count_threshold = std::min(max_occ, + std::max(min_occ, + (uint64_t)(total_windows * param.max_kmer_freq))); } else { - // Filter if frequency exceeds absolute count - should_filter = freq > (uint64_t)param.max_kmer_freq; + // Use direct count threshold, but respect min/max bounds + count_threshold = std::min(max_occ, + std::max(min_occ, + (uint64_t)param.max_kmer_freq)); } - if (should_filter) { + + // Filter only if BOTH conditions are met: + // 1. Frequency exceeds the calculated threshold + // 2. Count exceeds minimum occurrence threshold + if (freq > count_threshold && freq > min_occ) { filtered_kmers++; continue; } From 088e75b064ef39e3c20b0deab64b36dc253895d4 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Wed, 13 Nov 2024 18:02:29 -0600 Subject: [PATCH 3/9] docs: Update help text for minimizer frequency filter option --- src/interface/parse_args.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 5a4c4208..a470cd1f 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -98,7 +98,7 @@ void parse_args(int argc, args::ValueFlag kmer_complexity(mapping_opts, "FLOAT", "minimum k-mer complexity threshold", {'J', "kmer-cmplx"}); args::ValueFlag hg_filter(mapping_opts, "numer,ani-Δ,conf", "hypergeometric filter params [1.0,0.0,99.9]", {"hg-filter"}); args::ValueFlag min_hits(mapping_opts, "INT", "minimum number of hits for L1 filtering [auto]", {'H', "l1-hits"}); - args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter out top FLOAT fraction of repetitive minimizers [0.0002]", {'F', "filter-freq"}); + args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter minimizers occurring > FLOAT fraction of total [0.0002]", {'F', "filter-freq"}); args::Group alignment_opts(options_group, "Alignment:"); args::ValueFlag input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"}); From 5e531758724962768ce84c89d47991ab14ad6dad Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Wed, 13 Nov 2024 18:05:15 -0600 Subject: [PATCH 4/9] refactor: Remove 'fraction' from max_kmer_freq help text for clarity --- src/interface/parse_args.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index a470cd1f..44599562 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -98,7 +98,7 @@ void parse_args(int argc, args::ValueFlag kmer_complexity(mapping_opts, "FLOAT", "minimum k-mer complexity threshold", {'J', "kmer-cmplx"}); args::ValueFlag hg_filter(mapping_opts, "numer,ani-Δ,conf", "hypergeometric filter params [1.0,0.0,99.9]", {"hg-filter"}); args::ValueFlag min_hits(mapping_opts, "INT", "minimum number of hits for L1 filtering [auto]", {'H', "l1-hits"}); - args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter minimizers occurring > FLOAT fraction of total [0.0002]", {'F', "filter-freq"}); + args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter minimizers occurring > FLOAT of total [0.0002]", {'F', "filter-freq"}); args::Group alignment_opts(options_group, "Alignment:"); args::ValueFlag input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"}); From 0aa05cf3d35003686673bc8f66ef90929e2f9d59 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Wed, 13 Nov 2024 18:15:30 -0600 Subject: [PATCH 5/9] refactor: Simplify k-mer frequency filtering by removing arbitrary thresholds --- src/map/include/winSketch.hpp | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 2db555c0..2549a6a0 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -257,26 +257,18 @@ namespace skch } uint64_t freq = freq_it->second; - uint64_t min_occ = 10; // minimap2's minimum occurrence threshold - uint64_t max_occ = 1000000; // minimap2's maximum occurrence threshold uint64_t count_threshold; if (param.max_kmer_freq <= 1.0) { - // Calculate threshold based on fraction, but respect min/max bounds - count_threshold = std::min(max_occ, - std::max(min_occ, - (uint64_t)(total_windows * param.max_kmer_freq))); + // Calculate threshold based on fraction of total windows + count_threshold = (uint64_t)(total_windows * param.max_kmer_freq); } else { - // Use direct count threshold, but respect min/max bounds - count_threshold = std::min(max_occ, - std::max(min_occ, - (uint64_t)param.max_kmer_freq)); + // Use direct count threshold + count_threshold = (uint64_t)param.max_kmer_freq; } - // Filter only if BOTH conditions are met: - // 1. Frequency exceeds the calculated threshold - // 2. Count exceeds minimum occurrence threshold - if (freq > count_threshold && freq > min_occ) { + // Filter if frequency exceeds the threshold + if (freq > count_threshold) { filtered_kmers++; continue; } From 15883082f2081134d3f9bddac6a0fc67c464ec77 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 14 Nov 2024 10:42:32 -0600 Subject: [PATCH 6/9] Revert "refactor: Simplify k-mer frequency filtering by removing arbitrary thresholds" This reverts commit 0aa05cf3d35003686673bc8f66ef90929e2f9d59. --- src/map/include/winSketch.hpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 2549a6a0..2db555c0 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -257,18 +257,26 @@ namespace skch } uint64_t freq = freq_it->second; + uint64_t min_occ = 10; // minimap2's minimum occurrence threshold + uint64_t max_occ = 1000000; // minimap2's maximum occurrence threshold uint64_t count_threshold; if (param.max_kmer_freq <= 1.0) { - // Calculate threshold based on fraction of total windows - count_threshold = (uint64_t)(total_windows * param.max_kmer_freq); + // Calculate threshold based on fraction, but respect min/max bounds + count_threshold = std::min(max_occ, + std::max(min_occ, + (uint64_t)(total_windows * param.max_kmer_freq))); } else { - // Use direct count threshold - count_threshold = (uint64_t)param.max_kmer_freq; + // Use direct count threshold, but respect min/max bounds + count_threshold = std::min(max_occ, + std::max(min_occ, + (uint64_t)param.max_kmer_freq)); } - // Filter if frequency exceeds the threshold - if (freq > count_threshold) { + // Filter only if BOTH conditions are met: + // 1. Frequency exceeds the calculated threshold + // 2. Count exceeds minimum occurrence threshold + if (freq > count_threshold && freq > min_occ) { filtered_kmers++; continue; } From 3a7c4da92ede1f90f3aad033050f85fe940bebca Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Thu, 14 Nov 2024 10:43:22 -0600 Subject: [PATCH 7/9] refactor: Change max_occ to numeric limits max while keeping min_occ at 10 --- src/map/include/winSketch.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 2db555c0..136b961a 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -258,7 +258,7 @@ namespace skch uint64_t freq = freq_it->second; uint64_t min_occ = 10; // minimap2's minimum occurrence threshold - uint64_t max_occ = 1000000; // minimap2's maximum occurrence threshold + uint64_t max_occ = std::numeric_limits::max(); // no upper limit on occurrences uint64_t count_threshold; if (param.max_kmer_freq <= 1.0) { From b0544290d7856179a07642f5d55ffc76cdea2653 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Thu, 14 Nov 2024 10:43:53 -0600 Subject: [PATCH 8/9] docs: Update comment to clarify minimum occurrence threshold purpose --- src/map/include/winSketch.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 136b961a..d196b8f0 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -257,7 +257,7 @@ namespace skch } uint64_t freq = freq_it->second; - uint64_t min_occ = 10; // minimap2's minimum occurrence threshold + uint64_t min_occ = 10; // minimum occurrence threshold to prevent over-filtering in small datasets uint64_t max_occ = std::numeric_limits::max(); // no upper limit on occurrences uint64_t count_threshold; From f659baac658e51e5f4ccd5f30cb2ee33ef88b394 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Thu, 14 Nov 2024 10:54:46 -0600 Subject: [PATCH 9/9] feat: Add empty optionsString to parser help parameters --- src/interface/parse_args.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 44599562..ec2cb2ab 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -65,6 +65,7 @@ void parse_args(int argc, parser.helpParams.flagindent = 2; parser.helpParams.helpindent = 35; parser.helpParams.eachgroupindent = 2; + parser.helpParams.optionsString = ""; args::Group options_group(parser, ""); args::Positional target_sequence_file(options_group, "target.fa", "target sequences (required, default: self-map)");