waveygang · ekg · Oct 21, 2024 · Oct 14, 2024 · Oct 14, 2024 · Oct 14, 2024
diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp
@@ -74,7 +74,6 @@ void parse_args(int argc,
     args::ValueFlag<uint32_t> num_mappings_for_segments(mapping_opts, "N", "number of mappings to retain for each query/reference pair [default: 1]", {'n', "num-mappings-for-segment"});
     args::ValueFlag<uint32_t> num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"});
     args::ValueFlag<int> kmer_size(mapping_opts, "N", "kmer size [default: 15]", {'k', "kmer"});
-    args::ValueFlag<float> kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"});
     args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"});
     args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
     args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"});
@@ -93,10 +92,14 @@ void parse_args(int argc,
     args::ValueFlag<double> map_sparsification(mapping_opts, "FACTOR", "keep this fraction of mappings", {'x', "sparsify-mappings"});
     //ToFix: args::Flag keep_ties(mapping_opts, "", "keep all mappings with equal score even if it results in more than n mappings", {'D', "keep-ties"});
     args::ValueFlag<int64_t> sketch_size(mapping_opts, "N", "sketch size for sketching.", {'w', "sketch-size"});
+    args::ValueFlag<double> hg_numerator(mapping_opts, "N", 
+        "Set the numerator for the hypergeometric filter's Jaccard similarity calculation. "
+        "Higher values increase speed at the cost of sensitivity. [default: 1.0]", 
+        {"hg-numerator"});
     args::ValueFlag<double> kmer_complexity(mapping_opts, "F", "Drop segments w/ predicted kmer complexity below this cutoff. Kmer complexity defined as #kmers / (s - k + 1)", {'J', "kmer-complexity"});
-    args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {'1', "no-hg-filter"});
-    args::ValueFlag<double> hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {'2', "hg-filter-ani-diff"});
-    args::ValueFlag<double> hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {'3', "hg-filter-conf"});
+    args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {"no-hg-filter"});
+    args::ValueFlag<double> hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {"hg-filter-ani-diff"});
+    args::ValueFlag<double> hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {"hg-filter-conf"});
     //args::Flag window_minimizers(mapping_opts, "", "Use window minimizers rather than world minimizers", {'U', "window-minimizers"});
     //args::ValueFlag<std::string> path_high_frequency_kmers(mapping_opts, "FILE", " input file containing list of high frequency kmers", {'H', "high-freq-kmers"});
     //args::ValueFlag<std::string> spaced_seed_params(mapping_opts, "spaced-seeds", "Params to generate spaced seeds <weight_of_seed> <number_of_seeds> <similarity> <region_length> e.g \"10 5 0.75 20\"", {'e', "spaced-seeds"});
@@ -471,12 +474,6 @@ void parse_args(int argc,
         map_parameters.kmerSize = 15;
     }
 
-    if (kmer_pct_threshold) {
-        map_parameters.kmer_pct_threshold = args::get(kmer_pct_threshold);
-    } else {
-        map_parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers
-    }
-
     //if (spaced_seed_params) {
         //const std::string foobar = args::get(spaced_seed_params);
 
@@ -613,9 +610,34 @@ void parse_args(int argc,
         map_parameters.kmerComplexityThreshold = 0;
     }
 
+    if (hg_numerator) {
+        double value = args::get(hg_numerator);
+        if (value < 1.0) {
+            std::cerr << "[wfmash] ERROR: hg-numerator must be >= 1.0." << std::endl;
+            exit(1);
+        }
+        map_parameters.hgNumerator = value;
+    } else {
+        map_parameters.hgNumerator = 1.0;  // Default value
+    }
+
+    // Set the total reference size
+    map_parameters.totalReferenceSize = skch::CommonFunc::getReferenceSize(map_parameters.refSequences);
+
+    // Estimate total unique k-mers using information theoretic approach
+    map_parameters.estimatedUniqueKmers = skch::CommonFunc::estimateUniqueKmers(
+        map_parameters.totalReferenceSize, 
+        map_parameters.kmerSize
+    );
+
+    std::cerr << "[wfmash] Estimated unique " << map_parameters.kmerSize << "-mers: " 
+              << map_parameters.estimatedUniqueKmers 
+              << " (based on total reference size: " << map_parameters.totalReferenceSize << " bp)" 
+              << std::endl;
+
     map_parameters.filterLengthMismatches = true;
 
-    map_parameters.stage1_topANI_filter = !bool(no_hg_filter); 
+    map_parameters.stage1_topANI_filter = !bool(no_hg_filter);
     map_parameters.stage2_full_scan = true;
 
     if (hg_filter_ani_diff)

diff --git a/src/map/include/commonFunc.hpp b/src/map/include/commonFunc.hpp
@@ -617,6 +617,20 @@ namespace skch {
             split(s, delim, std::back_inserter(elems));
             return elems;
         }
+
+        uint64_t estimateUniqueKmers(uint64_t sequenceLength, int kmerSize) {
+            // Calculate the theoretical maximum number of k-mers
+            uint64_t maxKmers = sequenceLength - kmerSize + 1;
+
+            // Calculate the probability of a k-mer being unique
+            // This uses the "Birthday Problem" approach
+            double probabilityUnique = std::exp(-static_cast<double>(maxKmers) / std::pow(4, kmerSize));
+
+            // Estimate the number of unique k-mers
+            uint64_t estimatedUnique = maxKmers * (1 - probabilityUnique);
+
+            return estimatedUnique;
+        }
     }
 }
 

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
@@ -200,17 +200,6 @@ namespace skch
             p.query_list,
             p.target_list))
           {
-              std::cerr << "Initializing Map with parameters:" << std::endl;
-              std::cerr << "Query sequences: " << p.querySequences.size() << std::endl;
-              std::cerr << "Reference sequences: " << p.refSequences.size() << std::endl;
-              std::cerr << "Query prefix: " << (p.query_prefix.empty() ? "None" : p.query_prefix[0]) << std::endl;
-              std::cerr << "Target prefix: " << (p.target_prefix.empty() ? "None" : p.target_prefix) << std::endl;
-              std::cerr << "Prefix delimiter: '" << p.prefix_delim << "'" << std::endl;
-              std::cerr << "Query list: " << (p.query_list.empty() ? "None" : p.query_list) << std::endl;
-              std::cerr << "Target list: " << (p.target_list.empty() ? "None" : p.target_list) << std::endl;
-
-              idManager->dumpState();
-
               if (p.stage1_topANI_filter) {
                   this->setProbs();
               }
@@ -319,6 +308,7 @@ namespace skch
           );
           sketchCutoffs[cmax] = ci;
 
+
           // For really high min_p values and some values of cmax, there are no values of
           // ci that satisfy the cutoff, so we just set to the max
           if (sketchCutoffs[cmax] == 0) {
@@ -478,35 +468,43 @@ namespace skch
 
         std::unordered_map<seqno_t, MappingResultsVector_t> combinedMappings;
 
+        // Build index for the current subset
+        // Open the index file once
+        std::ifstream indexStream;
+        if (!param.indexFilename.empty() && !param.create_index_only) {
+            indexStream.open(param.indexFilename.string(), std::ios::binary);
+            if (!indexStream) {
+                std::cerr << "Error: Unable to open index file: " << param.indexFilename << std::endl;
+                exit(1);
+            }
+        }
+
         // For each subset of target sequences
         uint64_t subset_count = 0;
+        std::cerr << "[mashmap::skch::Map::mapQuery] Number of target subsets: " << target_subsets.size() << std::endl;
         for (const auto& target_subset : target_subsets) {
-            std::cerr << "processing subset " << subset_count << " of " << target_subsets.size() << std::endl;
-            std::cerr << "entries: ";
-            for (const auto& seqName : target_subset) {
-                std::cerr << seqName << " ";
-            }
-            std::cerr << std::endl;
             if (target_subset.empty()) {
                 continue;  // Skip empty subsets
             }
 
-            // Build index for the current subset
-            if (!param.indexFilename.empty() && !param.create_index_only) {
-                // load index from file
-                std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count);
-                refSketch = new skch::Sketch(param, *idManager, target_subset, indexFilename);
-            } else {
-                refSketch = new skch::Sketch(param, *idManager, target_subset);
-            }
-
             if (param.create_index_only) {
                 // Save the index to a file
-                std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count);
-                refSketch->writeIndex(indexFilename);
+                std::cerr << "[mashmap::skch::Map::mapQuery] Building and saving index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
+                refSketch = new skch::Sketch(param, *idManager, target_subset);
+                std::string indexFilename = param.indexFilename.string();
+                bool append = (subset_count != 0); // Append if not the first subset
+                refSketch->writeIndex(target_subset, indexFilename, append);
                 std::cerr << "[mashmap::skch::Map::mapQuery] Index created for subset " << subset_count 
                           << " and saved to " << indexFilename << std::endl;
             } else {
+                if (!param.indexFilename.empty()) {
+                    // Load index from file
+                    std::cerr << "[mashmap::skch::Map::mapQuery] Loading index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
+                    refSketch = new skch::Sketch(param, *idManager, target_subset, &indexStream);
+                } else {
+                    std::cerr << "[mashmap::skch::Map::mapQuery] Building index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
+                    refSketch = new skch::Sketch(param, *idManager, target_subset);
+                }
                 processSubset(subset_count, target_subsets.size(), total_seq_length, input_queue, merged_queue, 
                               fragment_queue, reader_done, workers_done, fragments_done, combinedMappings);
             }
@@ -517,6 +515,10 @@ namespace skch
             ++subset_count;
         }
 
+        if (indexStream.is_open()) {
+            indexStream.close();
+        }
+
         if (param.create_index_only) {
             std::cerr << "[mashmap::skch::Map::mapQuery] All indices created successfully. Exiting." << std::endl;
             exit(0);
@@ -1021,11 +1023,7 @@ namespace skch
           const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits<hash_t>::max();
           Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);
 
-          // TODO remove them from the original sketch instead of removing for each read
-          auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) {
-            return refSketch->isFreqSeed(mi.hash);
-          });
-          Q.minmerTableQuery.erase(new_end, Q.minmerTableQuery.end());
+          // Removed frequent kmer filtering
 
           Q.sketchSize = Q.minmerTableQuery.size();
 #ifdef DEBUG
@@ -1144,6 +1142,12 @@ namespace skch
           std::unordered_map<hash_t, int> hash_to_freq;
 
           if (param.stage1_topANI_filter) {
+            double maxJaccard = refSketch->hgNumerator / static_cast<double>(Q.sketchSize);
+            double cutoff_j = Stat::md2j(1 - param.percentageIdentity + param.ANIDiff, param.kmerSize);
+            int minIntersectionSize = std::max(
+                static_cast<int>(cutoff_j * Q.sketchSize),
+                minimumHits);
+
             while (leadingIt != ip_end)
             {
               // Catch the trailing iterator up to the leading iterator - windowLen
@@ -1182,7 +1186,7 @@ namespace skch
 
             // Only go back through to find local opts if we know that there are some that are 
             // large enough
-            if (bestIntersectionSize < minimumHits) 
+            if (bestIntersectionSize < minIntersectionSize) 
             {
               return;
             } else 
@@ -1192,7 +1196,7 @@ namespace skch
                     int(std::min(bestIntersectionSize, Q.sketchSize) 
                       / std::max<double>(1, param.sketchSize / skch::fixed::ss_table_max))
                   ],
-                  minimumHits);
+                  minIntersectionSize);
             }
           } 
 
@@ -1438,17 +1442,20 @@ namespace skch
 
             if (param.stage1_topANI_filter)
             {
-              // If using HG filter, don't consider any mappings which have no chance of being 
-              // within param.ANIDiff of the best mapping seen so far
-              double cutoff_ani = std::max(0.0, double((1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff));
+              // Use the global Jaccard numerator here
+              double jaccardSimilarity = refSketch->hgNumerator / Q.sketchSize;
+              double mash_dist = Stat::j2md(jaccardSimilarity, param.kmerSize);
+              double cutoff_ani = std::max(0.0, (1 - mash_dist) - param.ANIDiff);
               double cutoff_j = Stat::md2j(1 - cutoff_ani, param.kmerSize);
-              if (double(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j) 
+
+              double candidateJaccard = static_cast<double>(candidateLocus.intersectionSize) / Q.sketchSize;
+
+              if (candidateJaccard < cutoff_j) 
               {
                 break;
               }
             }
 
-
             l2_vec.clear();
             computeL2MappedRegions(Q, candidateLocus, l2_vec);
 

diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp
@@ -32,7 +32,6 @@ struct ales_params {
 struct Parameters
 {
     int kmerSize;                                     //kmer size for sketching
-    float kmer_pct_threshold;                         //use only kmers not in the top kmer_pct_threshold %-ile
     offset_t segLength;                                //For split mapping case, this represents the fragment length
                                                       //for noSplit, it represents minimum read length to multimap
     offset_t block_length;                             // minimum (potentially merged) block to keep if we aren't split
@@ -73,6 +72,9 @@ struct Parameters
     std::vector<std::string> query_prefix;            // prefix for query sequences to use
 
     int sketchSize;
+    double hgNumerator = 1.0;                         // Numerator for the hypergeometric filter's Jaccard similarity calculation
+    uint64_t totalReferenceSize = 0;                  // Total size of all reference sequences
+    uint64_t estimatedUniqueKmers = 0;                // Estimate of total unique k-mers
     bool use_spaced_seeds;                            //
     ales_params spaced_seed_params;                   //
     double spaced_seed_sensitivity;                   //

diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp
@@ -94,7 +94,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
     cmd.defineOption("kmer", "kmer size [default : 19]", ArgvParser::OptionRequiresValue);
     cmd.defineOptionAlternative("kmer","k");
 
-    cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue);
     cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue);
 
 
@@ -491,14 +490,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
         parameters.keep_low_pct_id = true;
     }
 
-    if (cmd.foundOption("kmerThreshold")) {
-        str << cmd.optionValue("kmerThreshold");
-        str >> parameters.kmer_pct_threshold;
-    } else {
-        parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers
-    }
-    str.clear();
-
     if (cmd.foundOption("numMappingsForSegment")) {
       uint32_t n;
       str << cmd.optionValue("numMappingsForSegment");

diff --git a/src/map/include/sequenceIds.hpp b/src/map/include/sequenceIds.hpp
@@ -35,35 +35,6 @@ class SequenceIdManager {
         allPrefixes.insert(allPrefixes.end(), targetPrefixes.begin(), targetPrefixes.end());
         populateFromFiles(queryFiles, targetFiles, queryPrefixes, targetPrefixes, prefixDelim, queryList, targetList);
         buildRefGroups();
-        dumpState(); // Add this line to dump the state after initialization
-    }
-
-    // Add this method to dump the state of SequenceIdManager
-    void dumpState() const {
-        std::cerr << "SequenceIdManager State:" << std::endl;
-        std::cerr << "Total sequences: " << metadata.size() << std::endl;
-        std::cerr << "Query sequences: " << querySequenceNames.size() << std::endl;
-        std::cerr << "Target sequences: " << targetSequenceNames.size() << std::endl;
-        std::cerr << "\nSequence details:" << std::endl;
-        for (size_t i = 0; i < metadata.size(); ++i) {
-            std::cerr << "ID: " << i 
-                      << ", Name: " << metadata[i].name 
-                      << ", Length: " << metadata[i].len 
-                      << ", Group: " << metadata[i].groupId 
-                      << ", Type: " << (std::find(querySequenceNames.begin(), querySequenceNames.end(), metadata[i].name) != querySequenceNames.end() ? "Query" : "Target")
-                      << std::endl;
-        }
-        std::cerr << "\nGroup details:" << std::endl;
-        std::unordered_map<int, std::vector<std::string>> groupToSequences;
-        for (const auto& info : metadata) {
-            groupToSequences[info.groupId].push_back(info.name);
-        }
-        for (const auto& [groupId, sequences] : groupToSequences) {
-            std::cerr << "Group " << groupId << ": " << sequences.size() << " sequences" << std::endl;
-            for (const auto& seq : sequences) {
-                std::cerr << "  " << seq << std::endl;
-            }
-        }
     }
 
     seqno_t getSequenceId(const std::string& sequenceName) const {