Skip to content

Commit

Permalink
Merge pull request #293 from waveygang/freq-filter-yes
Browse files Browse the repository at this point in the history
Freq filter yes
  • Loading branch information
ekg authored Nov 12, 2024
2 parents fb42892 + 2427c75 commit b054426
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 7 deletions.
4 changes: 2 additions & 2 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ void parse_args(int argc,
args::ValueFlag<double> kmer_complexity(mapping_opts, "FLOAT", "minimum k-mer complexity threshold", {'J', "kmer-cmplx"});
args::ValueFlag<std::string> hg_filter(mapping_opts, "numer,ani-Δ,conf", "hypergeometric filter params [1.0,0.0,99.9]", {"hg-filter"});
args::ValueFlag<int> min_hits(mapping_opts, "INT", "minimum number of hits for L1 filtering [auto]", {'H', "l1-hits"});
args::ValueFlag<uint64_t> max_kmer_freq(mapping_opts, "INT", "maximum allowed k-mer frequency [unlimited]", {'F', "max-kmer-freq"});
args::ValueFlag<double> max_kmer_freq(mapping_opts, "FLOAT", "filter out top FLOAT fraction of repetitive minimizers [0.0002]", {'F', "filter-freq"});

args::Group alignment_opts(options_group, "Alignment:");
args::ValueFlag<std::string> input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"});
Expand Down Expand Up @@ -557,7 +557,7 @@ void parse_args(int argc,
if (max_kmer_freq) {
map_parameters.max_kmer_freq = args::get(max_kmer_freq);
} else {
map_parameters.max_kmer_freq = std::numeric_limits<uint64_t>::max(); // unlimited
map_parameters.max_kmer_freq = 0.0002; // default filter fraction
}

//if (window_minimizers) {
Expand Down
2 changes: 1 addition & 1 deletion src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ struct Parameters
//std::unordered_set<std::string> high_freq_kmers; //
int64_t index_by_size = std::numeric_limits<int64_t>::max(); // Target total size of sequences for each index subset
int minimum_hits = -1; // Minimum number of hits required for L1 filtering (-1 means auto)
uint64_t max_kmer_freq = std::numeric_limits<uint64_t>::max(); // Maximum allowed k-mer frequency
double max_kmer_freq = 0.0002; // Maximum allowed k-mer frequency fraction (0-1) or count (>1)
};


Expand Down
28 changes: 24 additions & 4 deletions src/map/include/winSketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,15 @@ namespace skch
continue; // Should never happen
}

if (freq_it->second > param.max_kmer_freq) {
uint64_t freq_cutoff;
if (param.max_kmer_freq <= 1.0) {
// Calculate cutoff based on fraction of total windows
freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq));
} else {
// Use direct count cutoff
freq_cutoff = (uint64_t)param.max_kmer_freq;
}
if (freq_it->second > freq_cutoff) {
filtered_kmers++;
continue;
}
Expand All @@ -282,12 +290,24 @@ namespace skch
// Finish second progress meter
index_progress.finish();

double filtered_pct = (filtered_kmers * 100.0) / total_kmers;
uint64_t freq_cutoff;
if (param.max_kmer_freq <= 1.0) {
freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq));
} else {
freq_cutoff = (uint64_t)param.max_kmer_freq;
}
std::cerr << "[wfmash::mashmap] Processed " << totalSeqProcessed << " sequences (" << totalSeqSkipped << " skipped, " << total_seq_length << " total bp), "
<< minmerPosLookupIndex.size() << " unique hashes, " << minmerIndex.size() << " windows" << std::endl
<< "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers
<< " k-mers (" << std::fixed << std::setprecision(2) << filtered_pct << "%) exceeding frequency threshold of "
<< param.max_kmer_freq << std::endl;
<< " k-mers occurring > " << freq_cutoff << " times"
<< " (target: " << (param.max_kmer_freq <= 1.0 ?
([&]() {
std::stringstream ss;
ss << std::fixed << std::setprecision(2) << (param.max_kmer_freq * 100);
return ss.str();
})() + "%" :
">" + std::to_string((int)param.max_kmer_freq) + " occurrences")
<< ")" << std::endl;
}

std::chrono::duration<double> timeRefSketch = skch::Time::now() - t0;
Expand Down

0 comments on commit b054426

Please sign in to comment.