Skip to content

Commit

Permalink
Merge pull request #294 from waveygang/fix-minmer-filt
Browse files Browse the repository at this point in the history
Fix minmer filt
  • Loading branch information
ekg authored Nov 14, 2024
2 parents b054426 + f659baa commit f8d4446
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 7 deletions.
3 changes: 2 additions & 1 deletion src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ void parse_args(int argc,
parser.helpParams.flagindent = 2;
parser.helpParams.helpindent = 35;
parser.helpParams.eachgroupindent = 2;
parser.helpParams.optionsString = "";

args::Group options_group(parser, "");
args::Positional<std::string> target_sequence_file(options_group, "target.fa", "target sequences (required, default: self-map)");
Expand Down Expand Up @@ -98,7 +99,7 @@ void parse_args(int argc,
args::ValueFlag<double> kmer_complexity(mapping_opts, "FLOAT", "minimum k-mer complexity threshold", {'J', "kmer-cmplx"});
args::ValueFlag<std::string> hg_filter(mapping_opts, "numer,ani-Δ,conf", "hypergeometric filter params [1.0,0.0,99.9]", {"hg-filter"});
args::ValueFlag<int> min_hits(mapping_opts, "INT", "minimum number of hits for L1 filtering [auto]", {'H', "l1-hits"});
args::ValueFlag<double> max_kmer_freq(mapping_opts, "FLOAT", "filter out top FLOAT fraction of repetitive minimizers [0.0002]", {'F', "filter-freq"});
args::ValueFlag<double> max_kmer_freq(mapping_opts, "FLOAT", "filter minimizers occurring > FLOAT of total [0.0002]", {'F', "filter-freq"});

args::Group alignment_opts(options_group, "Alignment:");
args::ValueFlag<std::string> input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"});
Expand Down
24 changes: 18 additions & 6 deletions src/map/include/winSketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,15 +256,27 @@ namespace skch
continue; // Should never happen
}

uint64_t freq_cutoff;
uint64_t freq = freq_it->second;
uint64_t min_occ = 10; // minimum occurrence threshold to prevent over-filtering in small datasets
uint64_t max_occ = std::numeric_limits<uint64_t>::max(); // no upper limit on occurrences
uint64_t count_threshold;

if (param.max_kmer_freq <= 1.0) {
// Calculate cutoff based on fraction of total windows
freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq));
// Calculate threshold based on fraction, but respect min/max bounds
count_threshold = std::min(max_occ,
std::max(min_occ,
(uint64_t)(total_windows * param.max_kmer_freq)));
} else {
// Use direct count cutoff
freq_cutoff = (uint64_t)param.max_kmer_freq;
// Use direct count threshold, but respect min/max bounds
count_threshold = std::min(max_occ,
std::max(min_occ,
(uint64_t)param.max_kmer_freq));
}
if (freq_it->second > freq_cutoff) {

// Filter only if BOTH conditions are met:
// 1. Frequency exceeds the calculated threshold
// 2. Count exceeds minimum occurrence threshold
if (freq > count_threshold && freq > min_occ) {
filtered_kmers++;
continue;
}
Expand Down

0 comments on commit f8d4446

Please sign in to comment.