diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 6cbadd7e..d6ac4b27 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -64,7 +64,7 @@ void parse_args(int argc, args::Group mandatory_opts(parser, "[ MANDATORY OPTIONS ]"); args::Positional target_sequence_file(mandatory_opts, "target", "alignment target/reference sequence file"); - args::Group io_opts(parser, "[ Files IO Options ]"); + args::Group io_opts(parser, "[ Files IO Options ]"); args::Positional query_sequence_file(io_opts, "query", "query sequence file (optional)"); args::Group mapping_opts(parser, "[ Mapping Options ]"); @@ -75,14 +75,14 @@ void parse_args(int argc, args::ValueFlag num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"}); args::ValueFlag kmer_size(mapping_opts, "N", "kmer size [default: 19]", {'k', "kmer"}); args::ValueFlag kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"}); - args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"}); + args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"}); args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"}); args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"}); args::ValueFlag skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"}); - args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"}); - args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"}); - args::ValueFlag query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with these prefixes (comma delimited)", {'Q', "query-prefix"}); - args::ValueFlag query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"}); + args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"}); + args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"}); + args::ValueFlag query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with these prefixes (comma delimited)", {'Q', "query-prefix"}); + args::ValueFlag query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"}); args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"}); args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"}); args::ValueFlag chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 4*segment_length, up to 20k]", {'c', "chain-gap"}); @@ -99,6 +99,8 @@ void parse_args(int argc, //args::ValueFlag path_high_frequency_kmers(mapping_opts, "FILE", " input file containing list of high frequency kmers", {'H', "high-freq-kmers"}); //args::ValueFlag spaced_seed_params(mapping_opts, "spaced-seeds", "Params to generate spaced seeds e.g \"10 5 0.75 20\"", {'e', "spaced-seeds"}); args::Flag no_merge(mapping_opts, "no-merge", "don't merge consecutive segment-level mappings", {'M', "no-merge"}); + args::ValueFlag mashmap_index(mapping_opts, "FILE", "Use MashMap index if FILE exists, else create one and save as FILE", {'4', "mm-index"}); + args::Flag overwrite_mashmap_index(mapping_opts, "", "Confidence value for the hypergeometric filtering [default: 99.9%]", {'5', "overwrite-mm-index"}); args::Group alignment_opts(parser, "[ Alignment Options ]"); args::ValueFlag align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"}); @@ -604,6 +606,15 @@ void parse_args(int argc, //map_parameters.world_minimizers = true; //} + if (mashmap_index) + { + map_parameters.indexFilename = args::get(mashmap_index); + } else { + map_parameters.indexFilename = ""; + } + + map_parameters.overwrite_index = overwrite_mashmap_index; + if (approx_mapping) { map_parameters.outFileName = "/dev/stdout"; yeet_parameters.approx_mapping = true; diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index 8bd5f8ec..77343c4f 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -52,8 +52,8 @@ struct Parameters std::vector refSequences; //reference sequence(s) std::vector querySequences; //query sequence(s) std::string outFileName; //output file name - stdfs::path saveIndexFilename; //output file name of index - stdfs::path loadIndexFilename; //input file name of index + stdfs::path indexFilename; //output file name of index + bool overwrite_index; //overwrite index if it exists bool split; //Split read mapping (done if this is true) bool lower_triangular; // set to true if we should filter out half of the mappings bool skip_self; //skip self mappings diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 328dbac3..4b3c5db3 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -73,8 +73,8 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOption("numMappingsForShortSeq", "number of mappings to retain for each sequence shorter than segment length [default: 1]", ArgvParser::OptionRequiresValue); - cmd.defineOption("saveIndex", "Prefix of index files to save. PREFIX.map and PREFIX.index files will be created", ArgvParser::OptionRequiresValue); - cmd.defineOption("loadIndex", "Prefix of index files to load, where PREFIX.map and PREFIX.index are the files to be loaded", ArgvParser::OptionRequiresValue); + cmd.defineOption("index", "Writes index to provided filename if it doesn't exist, otherwise reads the index", ArgvParser::OptionRequiresValue); + cmd.defineOption("overwriteIndex", "Overwrites provided index filename"); cmd.defineOption("noSplit", "disable splitting of input sequences during mapping [enabled by default]"); @@ -370,19 +370,14 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir } - if (cmd.foundOption("saveIndex")) { - str << cmd.optionValue("saveIndex"); - str >> parameters.saveIndexFilename; + if (cmd.foundOption("index")) { + str << cmd.optionValue("index"); + str >> parameters.indexFilename; } else { - parameters.saveIndexFilename = ""; + parameters.indexFilename = ""; } - if (cmd.foundOption("loadIndex")) { - str << cmd.optionValue("loadIndex"); - str >> parameters.loadIndexFilename; - } else { - parameters.loadIndexFilename = ""; - } - str.clear(); + + parameters.overwrite_index = cmd.foundOption("overwriteIndex"); parameters.alphabetSize = 4; //Do not expose the option to set protein alphabet in mashmap diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 1bd960cf..3a07f337 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -62,7 +62,7 @@ namespace skch const skch::Parameters ¶m; //Minmers that occur this or more times will be ignored (computed based on percentageThreshold) - int freqThreshold = std::numeric_limits::max(); + uint64_t freqThreshold = std::numeric_limits::max(); //Set of frequent seeds to be ignored ankerl::unordered_dense::set frequentSeeds; @@ -72,8 +72,12 @@ namespace skch public: - typedef std::vector< MinmerInfo > MI_Type; + using MI_Type = std::vector< MinmerInfo >; using MIIter_t = MI_Type::const_iterator; + using HF_Map_t = ankerl::unordered_dense::map; + + // Frequency of each hash + HF_Map_t hashFreq; //Keep sequence length, name that appear in the sequence (for printing the mappings later) std::vector< ContigInfo > metadata; @@ -111,7 +115,7 @@ namespace skch //Frequency histogram of minmers //[... ,x -> y, ...] implies y number of minmers occur x times - std::map minmerFreqHistogram; + std::map minmerFreqHistogram; public: @@ -122,29 +126,37 @@ namespace skch Sketch(const skch::Parameters &p) : param(p) { - this->build(); - this->index(); - if (!param.saveIndexFilename.empty()) { - if (param.saveIndexFilename.extension() == ".tsv") { - this->saveIndexTSV(); - } else { - this->saveIndexBinary(); + if (param.indexFilename.empty() + || !stdfs::exists(param.indexFilename) + || param.overwrite_index) + { + this->build(true); + this->computeFreqHist(); + this->computeFreqSeedSet(); + this->dropFreqSeedSet(); + this->hashFreq.clear(); + if (!param.indexFilename.empty()) + { + this->writeIndex(); } - this->savePosListBinary(); + } else { + this->build(false); + this->readIndex(); } - this->computeFreqHist(); - this->computeFreqSeedSet(); - this->dropFreqSeedSet(); + std::cerr << "[mashmap::skch::Sketch] Unique minmer hashes after pruning = " << (minmerPosLookupIndex.size() - this->frequentSeeds.size()) << std::endl; + std::cerr << "[mashmap::skch::Sketch] Total minmer windows after pruning = " << minmerIndex.size() << std::endl; } private: /** - * @brief build the sketch table - * @details compute and save minmers from the reference sequence(s) + * @brief Get sequence metadata and optionally build the sketch table + * + * @details Iterate through ref sequences to get metadata and + * optionally compute and save minmers from the reference sequence(s) * assuming a fixed window size */ - void build() + void build(bool compute_seeds) { // allowed set of targets @@ -163,13 +175,6 @@ namespace skch //Create the thread pool ThreadPool threadPool( [this](InputSeqContainer* e) {return buildHelper(e);}, param.threads); - if (!param.loadIndexFilename.empty()) { - if (param.loadIndexFilename.extension() == ".tsv") { - this->loadIndexTSV(); - } else { - this->loadIndexBinary(); - } - } for(const auto &fileName : param.refSequences) { @@ -199,7 +204,7 @@ namespace skch } else { - if (param.loadIndexFilename.empty()) { + if (compute_seeds) { threadPool.runWhenThreadAvailable(new InputSeqContainer(seq, seq_name, seqCounter)); //Collect output if available @@ -219,14 +224,13 @@ namespace skch exit(1); } - if (param.loadIndexFilename.empty()) { + if (compute_seeds) { //Collect remaining output objects while ( threadPool.running() ) this->buildHandleThreadOutput(threadPool.popOutputWhenAvailable()); + std::cerr << "[mashmap::skch::Sketch::build] Unique minmer hashes before pruning = " << minmerPosLookupIndex.size() << std::endl; + std::cerr << "[mashmap::skch::Sketch::build] Total minmer windows before pruning = " << minmerIndex.size() << std::endl; } - - std::cerr << "[mashmap::skch::Sketch::build] minmer windows picked from reference = " << minmerIndex.size() << std::endl; - } /** @@ -257,20 +261,38 @@ namespace skch * @brief routine to handle thread's local minmer index * @param[in] output thread local minmer output */ - void buildHandleThreadOutput(MI_Type* output) + void buildHandleThreadOutput(MI_Type* contigMinmerIndex) { - this->minmerIndex.insert(this->minmerIndex.end(), output->begin(), output->end()); - delete output; + for (MinmerInfo& mi : *contigMinmerIndex) + { + this->hashFreq[mi.hash]++; + if (minmerPosLookupIndex[mi.hash].size() == 0 + || minmerPosLookupIndex[mi.hash].back().hash != mi.hash + || minmerPosLookupIndex[mi.hash].back().pos != mi.wpos) + { + minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos, mi.hash, mi.seqId, side::OPEN}); + minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos_end, mi.hash, mi.seqId, side::CLOSE}); + } else { + minmerPosLookupIndex[mi.hash].back().pos = mi.wpos_end; + } + } + + this->minmerIndex.insert( + this->minmerIndex.end(), + std::make_move_iterator(contigMinmerIndex->begin()), + std::make_move_iterator(contigMinmerIndex->end())); + + delete contigMinmerIndex; } /** - * @brief Save index. TSV indexing is slower but can be debugged easier + * @brief Write sketch as tsv. TSV indexing is slower but can be debugged easier */ - void saveIndexTSV() + void writeSketchTSV() { std::ofstream outStream; - outStream.open(param.saveIndexFilename); + outStream.open(std::string(param.indexFilename) + ".tsv"); outStream << "seqId" << "\t" << "strand" << "\t" << "start" << "\t" << "end" << "\t" << "hash\n"; for (auto& mi : this->minmerIndex) { outStream << mi.seqId << "\t" << std::to_string(mi.strand) << "\t" << mi.wpos << "\t" << mi.wpos_end << "\t" << mi.hash << "\n"; @@ -278,29 +300,22 @@ namespace skch outStream.close(); } + /** - * @brief Save index for quick loading + * @brief Write sketch for quick loading */ - void saveIndexBinary() + void writeSketchBinary(std::ofstream& outStream) { - fs::path indexFilename = fs::path(param.saveIndexFilename); - indexFilename += ".index"; - std::ofstream outStream; - outStream.open(indexFilename, std::ios::binary); typename MI_Type::size_type size = minmerIndex.size(); outStream.write((char*)&size, sizeof(size)); outStream.write((char*)&minmerIndex[0], minmerIndex.size() * sizeof(MinmerInfo)); } /** - * @brief Save posList for quick loading + * @brief Write posList for quick loading */ - void savePosListBinary() + void writePosListBinary(std::ofstream& outStream) { - fs::path posListFilename = fs::path(param.saveIndexFilename); - posListFilename += ".map"; - std::ofstream outStream; - outStream.open(posListFilename, std::ios::binary); typename MI_Map_t::size_type size = minmerPosLookupIndex.size(); outStream.write((char*)&size, sizeof(size)); @@ -316,11 +331,54 @@ namespace skch /** - * @brief Load index from TSV file + * @brief Write posList for quick loading + */ + void writeFreqKmersBinary(std::ofstream& outStream) + { + typename MI_Map_t::size_type size = frequentSeeds.size(); + outStream.write((char*)&size, sizeof(size)); + + for (hash_t kmerHash : frequentSeeds) + { + MinmerMapKeyType key = kmerHash; + outStream.write((char*)&key, sizeof(key)); + } + } + + + /** + * @brief Write parameters + */ + void writeParameters(std::ofstream& outStream) + { + // Write segment length, sketch size, and kmer size + outStream.write((char*) ¶m.segLength, sizeof(param.segLength)); + outStream.write((char*) ¶m.sketchSize, sizeof(param.sketchSize)); + outStream.write((char*) ¶m.kmerSize, sizeof(param.kmerSize)); + } + + + /** + * @brief Write all index data structures to disk + */ + void writeIndex() + { + fs::path freqListFilename = fs::path(param.indexFilename); + std::ofstream outStream; + outStream.open(freqListFilename, std::ios::binary); + + writeParameters(outStream); + writeSketchBinary(outStream); + writePosListBinary(outStream); + writeFreqKmersBinary(outStream); + } + + /** + * @brief Read sketch from TSV file */ - void loadIndexTSV() + void readSketchTSV() { - io::CSVReader<5, io::trim_chars<' '>, io::no_quote_escape<'\t'>> inReader(param.loadIndexFilename); + io::CSVReader<5, io::trim_chars<' '>, io::no_quote_escape<'\t'>> inReader(std::string(param.indexFilename) + ".tsv"); inReader.read_header(io::ignore_missing_column, "seqId", "strand", "start", "end", "hash"); hash_t hash; offset_t start, end; @@ -333,14 +391,10 @@ namespace skch } /** - * @brief Load index from binary file + * @brief Read sketch from binary file */ - void loadIndexBinary() + void readSketchBinary(std::ifstream& inStream) { - fs::path indexFilename = fs::path(param.loadIndexFilename); - indexFilename += ".index"; - std::ifstream inStream; - inStream.open(indexFilename, std::ios::binary); typename MI_Type::size_type size = 0; inStream.read((char*)&size, sizeof(size)); minmerIndex.resize(size); @@ -348,14 +402,10 @@ namespace skch } /** - * @brief Save posList for quick loading + * @brief Save posList for quick reading */ - void loadPosListBinary() + void readPosListBinary(std::ifstream& inStream) { - fs::path posListFilename = fs::path(param.loadIndexFilename); - posListFilename += ".map"; - std::ifstream inStream; - inStream.open(posListFilename, std::ios::binary); typename MI_Map_t::size_type numKeys = 0; inStream.read((char*)&numKeys, sizeof(numKeys)); minmerPosLookupIndex.reserve(numKeys); @@ -369,51 +419,82 @@ namespace skch minmerPosLookupIndex[key].resize(size); inStream.read((char*)&minmerPosLookupIndex[key][0], size * sizeof(MinmerMapValueType::value_type)); - } } + /** - * @brief build the index for fast lookups using minmer table + * @brief read frequent kmers from file */ - void index() + void readFreqKmersBinary(std::ifstream& inStream) { - //Parse all the minmers and push into the map - //minmerPosLookupIndex.set_empty_key(0); - if (param.loadIndexFilename.empty()) + typename MI_Map_t::size_type numKeys = 0; + inStream.read((char*)&numKeys, sizeof(numKeys)); + frequentSeeds.reserve(numKeys); + + for (auto idx = 0; idx < numKeys; idx++) { - for(auto &mi : minmerIndex) - { - // [hash value -> info about minmer] - if (minmerPosLookupIndex[mi.hash].size() == 0 - || minmerPosLookupIndex[mi.hash].back().hash != mi.hash - || minmerPosLookupIndex[mi.hash].back().pos != mi.wpos) - { - minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos, mi.hash, mi.seqId, side::OPEN}); - minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos_end, mi.hash, mi.seqId, side::CLOSE}); - } else { - minmerPosLookupIndex[mi.hash].back().pos = mi.wpos_end; - } - } + MinmerMapKeyType key = 0; + inStream.read((char*)&key, sizeof(key)); + frequentSeeds.insert(key); } - else + } + + + /** + * @brief Read parameters and compare to CLI params + */ + void readParameters(std::ifstream& inStream) + { + // Read segment length, sketch size, and kmer size + decltype(param.segLength) index_segLength; + decltype(param.sketchSize) index_sketchSize; + decltype(param.kmerSize) index_kmerSize; + + inStream.read((char*) &index_segLength, sizeof(index_segLength)); + inStream.read((char*) &index_sketchSize, sizeof(index_sketchSize)); + inStream.read((char*) &index_kmerSize, sizeof(index_kmerSize)); + + if (param.segLength != index_segLength + || param.sketchSize != index_sketchSize + || param.kmerSize != index_kmerSize) { - this->loadPosListBinary(); + std::cerr << "[mashmap::skch::Sketch::build] ERROR: Parameters of indexed sketch differ from CLI parameters" << std::endl; + std::cerr << "[mashmap::skch::Sketch::build] ERROR: Index --> segLength=" << index_segLength + << " sketchSize=" << index_sketchSize << " kmerSize=" << index_kmerSize << std::endl; + std::cerr << "[mashmap::skch::Sketch::build] ERROR: CLI --> segLength=" << param.segLength + << " sketchSize=" << param.sketchSize << " kmerSize=" << param.kmerSize << std::endl; + exit(1); } - std::cerr << "[mashmap::skch::Sketch::index] unique minmers = " << minmerPosLookupIndex.size() << std::endl; } + + /** + * @brief Read all index data structures from file + */ + void readIndex() + { + fs::path indexFilename = fs::path(param.indexFilename); + std::ifstream inStream; + inStream.open(indexFilename, std::ios::binary); + readParameters(inStream); + readSketchBinary(inStream); + readPosListBinary(inStream); + readFreqKmersBinary(inStream); + } + + /** * @brief report the frequency histogram of minmers using position lookup index * and compute which high frequency minmers to ignore */ void computeFreqHist() { - if (!minmerPosLookupIndex.empty()) { + if (!this->minmerPosLookupIndex.empty()) { //1. Compute histogram - for (auto &e : this->minmerPosLookupIndex) - this->minmerFreqHistogram[e.second.size()] += 1; + for (auto& e : this->minmerPosLookupIndex) + this->minmerFreqHistogram[e.second.size()]++; std::cerr << "[mashmap::skch::Sketch::computeFreqHist] Frequency histogram of minmer interval points = " << *this->minmerFreqHistogram.begin() << " ... " << *this->minmerFreqHistogram.rbegin() @@ -440,13 +521,13 @@ namespace skch } } - if (this->freqThreshold != std::numeric_limits::max()) + if (this->freqThreshold != std::numeric_limits::max()) std::cerr << "[mashmap::skch::Sketch::computeFreqHist] With threshold " << this->param.kmer_pct_threshold - << "\%, ignore minmers occurring >= " << this->freqThreshold << " times during lookup." + << "\%, ignore minmers with more than >= " << this->freqThreshold << " interval points during mapping." << std::endl; else std::cerr << "[mashmap::skch::Sketch::computeFreqHist] With threshold " << this->param.kmer_pct_threshold - << "\%, consider all minmers during lookup." << std::endl; + << "\%, consider all minmers during mapping." << std::endl; } else { std::cerr << "[mashmap::skch::Sketch::computeFreqHist] No minmers." << std::endl; }