diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 5a4c4208..ec2cb2ab 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -65,6 +65,7 @@ void parse_args(int argc, parser.helpParams.flagindent = 2; parser.helpParams.helpindent = 35; parser.helpParams.eachgroupindent = 2; + parser.helpParams.optionsString = ""; args::Group options_group(parser, ""); args::Positional target_sequence_file(options_group, "target.fa", "target sequences (required, default: self-map)"); @@ -98,7 +99,7 @@ void parse_args(int argc, args::ValueFlag kmer_complexity(mapping_opts, "FLOAT", "minimum k-mer complexity threshold", {'J', "kmer-cmplx"}); args::ValueFlag hg_filter(mapping_opts, "numer,ani-Δ,conf", "hypergeometric filter params [1.0,0.0,99.9]", {"hg-filter"}); args::ValueFlag min_hits(mapping_opts, "INT", "minimum number of hits for L1 filtering [auto]", {'H', "l1-hits"}); - args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter out top FLOAT fraction of repetitive minimizers [0.0002]", {'F', "filter-freq"}); + args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter minimizers occurring > FLOAT of total [0.0002]", {'F', "filter-freq"}); args::Group alignment_opts(options_group, "Alignment:"); args::ValueFlag input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"}); diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index ee02632a..d196b8f0 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -256,15 +256,27 @@ namespace skch continue; // Should never happen } - uint64_t freq_cutoff; + uint64_t freq = freq_it->second; + uint64_t min_occ = 10; // minimum occurrence threshold to prevent over-filtering in small datasets + uint64_t max_occ = std::numeric_limits::max(); // no upper limit on occurrences + uint64_t count_threshold; + if (param.max_kmer_freq <= 1.0) { - // Calculate cutoff based on fraction of total windows - freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq)); + // Calculate threshold based on fraction, but respect min/max bounds + count_threshold = std::min(max_occ, + std::max(min_occ, + (uint64_t)(total_windows * param.max_kmer_freq))); } else { - // Use direct count cutoff - freq_cutoff = (uint64_t)param.max_kmer_freq; + // Use direct count threshold, but respect min/max bounds + count_threshold = std::min(max_occ, + std::max(min_occ, + (uint64_t)param.max_kmer_freq)); } - if (freq_it->second > freq_cutoff) { + + // Filter only if BOTH conditions are met: + // 1. Frequency exceeds the calculated threshold + // 2. Count exceeds minimum occurrence threshold + if (freq > count_threshold && freq > min_occ) { filtered_kmers++; continue; }