diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index cf61348a..5a4c4208 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -98,7 +98,7 @@ void parse_args(int argc, args::ValueFlag kmer_complexity(mapping_opts, "FLOAT", "minimum k-mer complexity threshold", {'J', "kmer-cmplx"}); args::ValueFlag hg_filter(mapping_opts, "numer,ani-Δ,conf", "hypergeometric filter params [1.0,0.0,99.9]", {"hg-filter"}); args::ValueFlag min_hits(mapping_opts, "INT", "minimum number of hits for L1 filtering [auto]", {'H', "l1-hits"}); - args::ValueFlag max_kmer_freq(mapping_opts, "INT", "maximum allowed k-mer frequency [unlimited]", {'F', "max-kmer-freq"}); + args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter out top FLOAT fraction of repetitive minimizers [0.0002]", {'F', "filter-freq"}); args::Group alignment_opts(options_group, "Alignment:"); args::ValueFlag input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"}); @@ -557,7 +557,7 @@ void parse_args(int argc, if (max_kmer_freq) { map_parameters.max_kmer_freq = args::get(max_kmer_freq); } else { - map_parameters.max_kmer_freq = std::numeric_limits::max(); // unlimited + map_parameters.max_kmer_freq = 0.0002; // default filter fraction } //if (window_minimizers) { diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index e022878e..b0f72286 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -87,7 +87,7 @@ struct Parameters //std::unordered_set high_freq_kmers; // int64_t index_by_size = std::numeric_limits::max(); // Target total size of sequences for each index subset int minimum_hits = -1; // Minimum number of hits required for L1 filtering (-1 means auto) - uint64_t max_kmer_freq = std::numeric_limits::max(); // Maximum allowed k-mer frequency + double max_kmer_freq = 0.0002; // Maximum allowed k-mer frequency fraction (0-1) or count (>1) }; diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 60099e6e..ee02632a 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -256,7 +256,15 @@ namespace skch continue; // Should never happen } - if (freq_it->second > param.max_kmer_freq) { + uint64_t freq_cutoff; + if (param.max_kmer_freq <= 1.0) { + // Calculate cutoff based on fraction of total windows + freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq)); + } else { + // Use direct count cutoff + freq_cutoff = (uint64_t)param.max_kmer_freq; + } + if (freq_it->second > freq_cutoff) { filtered_kmers++; continue; } @@ -282,12 +290,24 @@ namespace skch // Finish second progress meter index_progress.finish(); - double filtered_pct = (filtered_kmers * 100.0) / total_kmers; + uint64_t freq_cutoff; + if (param.max_kmer_freq <= 1.0) { + freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq)); + } else { + freq_cutoff = (uint64_t)param.max_kmer_freq; + } std::cerr << "[wfmash::mashmap] Processed " << totalSeqProcessed << " sequences (" << totalSeqSkipped << " skipped, " << total_seq_length << " total bp), " << minmerPosLookupIndex.size() << " unique hashes, " << minmerIndex.size() << " windows" << std::endl << "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers - << " k-mers (" << std::fixed << std::setprecision(2) << filtered_pct << "%) exceeding frequency threshold of " - << param.max_kmer_freq << std::endl; + << " k-mers occurring > " << freq_cutoff << " times" + << " (target: " << (param.max_kmer_freq <= 1.0 ? + ([&]() { + std::stringstream ss; + ss << std::fixed << std::setprecision(2) << (param.max_kmer_freq * 100); + return ss.str(); + })() + "%" : + ">" + std::to_string((int)param.max_kmer_freq) + " occurrences") + << ")" << std::endl; } std::chrono::duration timeRefSketch = skch::Time::now() - t0;