From c54bd944938c2eb30a8919670a3fdadc089d3102 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 24 Apr 2024 19:45:23 -0500 Subject: [PATCH 1/5] Handle thread output incrementally --- src/map/include/winSketch.hpp | 142 +++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 47 deletions(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 44a8fe44..51a8ae8e 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -62,7 +62,7 @@ namespace skch const skch::Parameters ¶m; //Minmers that occur this or more times will be ignored (computed based on percentageThreshold) - int freqThreshold = std::numeric_limits::max(); + uint64_t freqThreshold = std::numeric_limits::max(); //Set of frequent seeds to be ignored ankerl::unordered_dense::set frequentSeeds; @@ -72,8 +72,12 @@ namespace skch public: - typedef std::vector< MinmerInfo > MI_Type; + using MI_Type = std::vector< MinmerInfo >; using MIIter_t = MI_Type::const_iterator; + using HF_Map_t = ankerl::unordered_dense::map; + + // Frequency of each hash + HF_Map_t hashFreq; //Keep sequence length, name that appear in the sequence (for printing the mappings later) std::vector< ContigInfo > metadata; @@ -111,7 +115,7 @@ namespace skch //Frequency histogram of minmers //[... ,x -> y, ...] implies y number of minmers occur x times - std::map minmerFreqHistogram; + std::map minmerFreqHistogram; public: @@ -123,7 +127,19 @@ namespace skch : param(p) { this->build(); - this->index(); + if (param.loadIndexFilename.empty()) + { + this->computeFreqHist(); + this->computeFreqSeedSet(); + this->dropFreqSeedSet(); + this->hashFreq.clear(); + } else { + this->loadPosListBinary(); + this->loadFreqKmersBinary(); + } + std::cerr << "[mashmap::skch::Sketch] Unique minmer hashes after pruning = " << minmerPosLookupIndex.size() << std::endl; + std::cerr << "[mashmap::skch::Sketch] Total minmer windows after pruning = " << minmerIndex.size() << std::endl; + if (!param.saveIndexFilename.empty()) { if (param.saveIndexFilename.extension() == ".tsv") { this->saveIndexTSV(); @@ -131,10 +147,8 @@ namespace skch this->saveIndexBinary(); } this->savePosListBinary(); + this->saveFreqKmersBinary(); } - this->computeFreqHist(); - this->computeFreqSeedSet(); - this->dropFreqSeedSet(); } private: @@ -224,9 +238,8 @@ namespace skch while ( threadPool.running() ) this->buildHandleThreadOutput(threadPool.popOutputWhenAvailable()); } - - std::cerr << "[mashmap::skch::Sketch::build] minmer windows picked from reference = " << minmerIndex.size() << std::endl; - + std::cerr << "[mashmap::skch::Sketch::build] Unique minmer hashes before pruning = " << minmerPosLookupIndex.size() << std::endl; + std::cerr << "[mashmap::skch::Sketch::build] Total minmer windows before pruning = " << minmerIndex.size() << std::endl; } /** @@ -257,10 +270,28 @@ namespace skch * @brief routine to handle thread's local minmer index * @param[in] output thread local minmer output */ - void buildHandleThreadOutput(MI_Type* output) + void buildHandleThreadOutput(MI_Type* contigMinmerIndex) { - this->minmerIndex.insert(this->minmerIndex.end(), output->begin(), output->end()); - delete output; + for (MinmerInfo& mi : *contigMinmerIndex) + { + if (minmerPosLookupIndex[mi.hash].size() == 0 + || minmerPosLookupIndex[mi.hash].back().hash != mi.hash + || minmerPosLookupIndex[mi.hash].back().pos != mi.wpos) + { + this->hashFreq[mi.hash]++; + minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos, mi.hash, mi.seqId, side::OPEN}); + minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos_end, mi.hash, mi.seqId, side::CLOSE}); + } else { + minmerPosLookupIndex[mi.hash].back().pos = mi.wpos_end; + } + } + + this->minmerIndex.insert( + this->minmerIndex.end(), + std::make_move_iterator(contigMinmerIndex->begin()), + std::make_move_iterator(contigMinmerIndex->end())); + + delete contigMinmerIndex; } @@ -315,6 +346,26 @@ namespace skch } + /** + * @brief Save posList for quick loading + */ + void saveFreqKmersBinary() + { + fs::path freqListFilename = fs::path(param.saveIndexFilename); + freqListFilename += ".freq"; + std::ofstream outStream; + outStream.open(freqListFilename, std::ios::binary); + typename MI_Map_t::size_type size = frequentSeeds.size(); + outStream.write((char*)&size, sizeof(size)); + + for (hash_t kmerHash : frequentSeeds) + { + MinmerMapKeyType key = kmerHash; + outStream.write((char*)&key, sizeof(key)); + } + } + + /** * @brief Load index from TSV file */ @@ -373,55 +424,48 @@ namespace skch } } + /** - * @brief build the index for fast lookups using minmer table + * @brief Load frequent kmers from file */ - void index() + void loadFreqKmersBinary() { - //Parse all the minmers and push into the map - //minmerPosLookupIndex.set_empty_key(0); - if (param.loadIndexFilename.empty()) - { - for(auto &mi : minmerIndex) - { - // [hash value -> info about minmer] - if (minmerPosLookupIndex[mi.hash].size() == 0 - || minmerPosLookupIndex[mi.hash].back().hash != mi.hash - || minmerPosLookupIndex[mi.hash].back().pos != mi.wpos) - { - minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos, mi.hash, mi.seqId, side::OPEN}); - minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos_end, mi.hash, mi.seqId, side::CLOSE}); - } else { - minmerPosLookupIndex[mi.hash].back().pos = mi.wpos_end; - } - } - } - else + fs::path freqListFilename = fs::path(param.loadIndexFilename); + freqListFilename += ".freq"; + std::ifstream inStream; + inStream.open(freqListFilename, std::ios::binary); + typename MI_Map_t::size_type numKeys = 0; + inStream.read((char*)&numKeys, sizeof(numKeys)); + frequentSeeds.reserve(numKeys); + + for (auto idx = 0; idx < numKeys; idx++) { - this->loadPosListBinary(); + MinmerMapKeyType key = 0; + inStream.read((char*)&key, sizeof(key)); + frequentSeeds.insert(key); } - std::cerr << "[mashmap::skch::Sketch::index] unique minmers = " << minmerPosLookupIndex.size() << std::endl; } + /** * @brief report the frequency histogram of minmers using position lookup index * and compute which high frequency minmers to ignore */ void computeFreqHist() { - if (!minmerPosLookupIndex.empty()) { + if (!this->minmerPosLookupIndex.empty()) { //1. Compute histogram - for (auto &e : this->minmerPosLookupIndex) - this->minmerFreqHistogram[e.second.size()] += 1; + for (auto& [kmerHash, freq] : this->hashFreq) + this->minmerFreqHistogram[freq]++; - std::cerr << "[mashmap::skch::Sketch::computeFreqHist] Frequency histogram of minmer interval points = " + std::cerr << "[mashmap::skch::Sketch::computeFreqHist] Frequency histogram of minmer intervals = " << *this->minmerFreqHistogram.begin() << " ... " << *this->minmerFreqHistogram.rbegin() << std::endl; //2. Compute frequency threshold to ignore most frequent minmers - int64_t totalUniqueMinmers = this->minmerPosLookupIndex.size(); + int64_t totalUniqueMinmers = this->hashFreq.size(); int64_t minmerToIgnore = totalUniqueMinmers * param.kmer_pct_threshold / 100; int64_t sum = 0; @@ -442,11 +486,11 @@ namespace skch if (this->freqThreshold != std::numeric_limits::max()) std::cerr << "[mashmap::skch::Sketch::computeFreqHist] With threshold " << this->param.kmer_pct_threshold - << "\%, ignore minmers occurring >= " << this->freqThreshold << " times during lookup." + << "\%, ignore minmers with more than >= " << this->freqThreshold << " intervals during mapping." << std::endl; else std::cerr << "[mashmap::skch::Sketch::computeFreqHist] With threshold " << this->param.kmer_pct_threshold - << "\%, consider all minmers during lookup." << std::endl; + << "\%, consider all minmers during mapping." << std::endl; } else { std::cerr << "[mashmap::skch::Sketch::computeFreqHist] No minmers." << std::endl; } @@ -487,9 +531,9 @@ namespace skch void computeFreqSeedSet() { - for(auto &e : this->minmerPosLookupIndex) { - if (e.second.size() >= this->freqThreshold) { - this->frequentSeeds.insert(e.first); + for(auto& [kmerHash, posList] : this->minmerPosLookupIndex) { + if (posList.size() / 2 >= this->freqThreshold) { + this->frequentSeeds.insert(kmerHash); } } } @@ -498,9 +542,13 @@ namespace skch { this->minmerIndex.erase( std::remove_if(minmerIndex.begin(), minmerIndex.end(), [&] - (auto& mi) {return this->frequentSeeds.find(mi.hash) != this->frequentSeeds.end();} + (auto& mi) {return isFreqSeed(mi.hash);} ), minmerIndex.end() ); + for (hash_t kmer : this->frequentSeeds) + { + this->minmerPosLookupIndex.erase(kmer); + } } bool isFreqSeed(hash_t h) const From 6bbb4bff02f7fd04a263459e1ab38c479923b617 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 24 Apr 2024 19:53:03 -0500 Subject: [PATCH 2/5] Use hashFreq to check if seed is frequent --- src/map/include/winSketch.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 51a8ae8e..bbb9cb10 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -531,8 +531,8 @@ namespace skch void computeFreqSeedSet() { - for(auto& [kmerHash, posList] : this->minmerPosLookupIndex) { - if (posList.size() / 2 >= this->freqThreshold) { + for(auto& [kmerHash, frequency] : this->hashFreq) { + if (frequency >= this->freqThreshold) { this->frequentSeeds.insert(kmerHash); } } From 1b770a20be5084639038d4be21e4713b5ee220d8 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 24 Apr 2024 20:19:55 -0500 Subject: [PATCH 3/5] Split windows for frequency counting --- src/map/include/winSketch.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index bbb9cb10..50f88125 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -274,11 +274,11 @@ namespace skch { for (MinmerInfo& mi : *contigMinmerIndex) { + this->hashFreq[mi.hash]++; if (minmerPosLookupIndex[mi.hash].size() == 0 || minmerPosLookupIndex[mi.hash].back().hash != mi.hash || minmerPosLookupIndex[mi.hash].back().pos != mi.wpos) { - this->hashFreq[mi.hash]++; minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos, mi.hash, mi.seqId, side::OPEN}); minmerPosLookupIndex[mi.hash].push_back(IntervalPoint {mi.wpos_end, mi.hash, mi.seqId, side::CLOSE}); } else { From 51c05d322d5ca0e78745894d22cd57287c403bc3 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 19 Jun 2024 15:35:47 -0500 Subject: [PATCH 4/5] Fix spacing --- src/interface/parse_args.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 6cbadd7e..31bcb69e 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -64,7 +64,7 @@ void parse_args(int argc, args::Group mandatory_opts(parser, "[ MANDATORY OPTIONS ]"); args::Positional target_sequence_file(mandatory_opts, "target", "alignment target/reference sequence file"); - args::Group io_opts(parser, "[ Files IO Options ]"); + args::Group io_opts(parser, "[ Files IO Options ]"); args::Positional query_sequence_file(io_opts, "query", "query sequence file (optional)"); args::Group mapping_opts(parser, "[ Mapping Options ]"); @@ -75,14 +75,14 @@ void parse_args(int argc, args::ValueFlag num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"}); args::ValueFlag kmer_size(mapping_opts, "N", "kmer size [default: 19]", {'k', "kmer"}); args::ValueFlag kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"}); - args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"}); + args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"}); args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"}); args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"}); args::ValueFlag skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"}); - args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"}); - args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"}); - args::ValueFlag query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with these prefixes (comma delimited)", {'Q', "query-prefix"}); - args::ValueFlag query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"}); + args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"}); + args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"}); + args::ValueFlag query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with these prefixes (comma delimited)", {'Q', "query-prefix"}); + args::ValueFlag query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"}); args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"}); args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"}); args::ValueFlag chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 4*segment_length, up to 20k]", {'c', "chain-gap"}); From eb9eb510615d4e77e8215a13a3331763caf27d26 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 19 Jun 2024 16:24:55 -0500 Subject: [PATCH 5/5] Overhaul index reading/writing --- src/interface/parse_args.hpp | 11 +++ src/map/include/map_parameters.hpp | 4 +- src/map/include/parseCmdArgs.hpp | 21 ++--- src/map/include/winSketch.hpp | 139 ++++++++++++++--------------- 4 files changed, 88 insertions(+), 87 deletions(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 31bcb69e..d6ac4b27 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -99,6 +99,8 @@ void parse_args(int argc, //args::ValueFlag path_high_frequency_kmers(mapping_opts, "FILE", " input file containing list of high frequency kmers", {'H', "high-freq-kmers"}); //args::ValueFlag spaced_seed_params(mapping_opts, "spaced-seeds", "Params to generate spaced seeds e.g \"10 5 0.75 20\"", {'e', "spaced-seeds"}); args::Flag no_merge(mapping_opts, "no-merge", "don't merge consecutive segment-level mappings", {'M', "no-merge"}); + args::ValueFlag mashmap_index(mapping_opts, "FILE", "Use MashMap index if FILE exists, else create one and save as FILE", {'4', "mm-index"}); + args::Flag overwrite_mashmap_index(mapping_opts, "", "Confidence value for the hypergeometric filtering [default: 99.9%]", {'5', "overwrite-mm-index"}); args::Group alignment_opts(parser, "[ Alignment Options ]"); args::ValueFlag align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"}); @@ -604,6 +606,15 @@ void parse_args(int argc, //map_parameters.world_minimizers = true; //} + if (mashmap_index) + { + map_parameters.indexFilename = args::get(mashmap_index); + } else { + map_parameters.indexFilename = ""; + } + + map_parameters.overwrite_index = overwrite_mashmap_index; + if (approx_mapping) { map_parameters.outFileName = "/dev/stdout"; yeet_parameters.approx_mapping = true; diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index 8bd5f8ec..77343c4f 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -52,8 +52,8 @@ struct Parameters std::vector refSequences; //reference sequence(s) std::vector querySequences; //query sequence(s) std::string outFileName; //output file name - stdfs::path saveIndexFilename; //output file name of index - stdfs::path loadIndexFilename; //input file name of index + stdfs::path indexFilename; //output file name of index + bool overwrite_index; //overwrite index if it exists bool split; //Split read mapping (done if this is true) bool lower_triangular; // set to true if we should filter out half of the mappings bool skip_self; //skip self mappings diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 328dbac3..4b3c5db3 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -73,8 +73,8 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOption("numMappingsForShortSeq", "number of mappings to retain for each sequence shorter than segment length [default: 1]", ArgvParser::OptionRequiresValue); - cmd.defineOption("saveIndex", "Prefix of index files to save. PREFIX.map and PREFIX.index files will be created", ArgvParser::OptionRequiresValue); - cmd.defineOption("loadIndex", "Prefix of index files to load, where PREFIX.map and PREFIX.index are the files to be loaded", ArgvParser::OptionRequiresValue); + cmd.defineOption("index", "Writes index to provided filename if it doesn't exist, otherwise reads the index", ArgvParser::OptionRequiresValue); + cmd.defineOption("overwriteIndex", "Overwrites provided index filename"); cmd.defineOption("noSplit", "disable splitting of input sequences during mapping [enabled by default]"); @@ -370,19 +370,14 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir } - if (cmd.foundOption("saveIndex")) { - str << cmd.optionValue("saveIndex"); - str >> parameters.saveIndexFilename; + if (cmd.foundOption("index")) { + str << cmd.optionValue("index"); + str >> parameters.indexFilename; } else { - parameters.saveIndexFilename = ""; + parameters.indexFilename = ""; } - if (cmd.foundOption("loadIndex")) { - str << cmd.optionValue("loadIndex"); - str >> parameters.loadIndexFilename; - } else { - parameters.loadIndexFilename = ""; - } - str.clear(); + + parameters.overwrite_index = cmd.foundOption("overwriteIndex"); parameters.alphabetSize = 4; //Do not expose the option to set protein alphabet in mashmap diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index c2d186f3..98ddda03 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -126,39 +126,37 @@ namespace skch Sketch(const skch::Parameters &p) : param(p) { - this->build(); - if (param.loadIndexFilename.empty()) + if (param.indexFilename.empty() + || !stdfs::exists(param.indexFilename) + || param.overwrite_index) { + this->build(true); this->computeFreqHist(); this->computeFreqSeedSet(); this->dropFreqSeedSet(); this->hashFreq.clear(); + if (!param.indexFilename.empty()) + { + this->writeIndex(); + } } else { - this->loadPosListBinary(); - this->loadFreqKmersBinary(); + this->build(false); + this->readIndex(); } std::cerr << "[mashmap::skch::Sketch] Unique minmer hashes after pruning = " << minmerPosLookupIndex.size() << std::endl; std::cerr << "[mashmap::skch::Sketch] Total minmer windows after pruning = " << minmerIndex.size() << std::endl; - - if (!param.saveIndexFilename.empty()) { - if (param.saveIndexFilename.extension() == ".tsv") { - this->saveIndexTSV(); - } else { - this->saveIndexBinary(); - } - this->savePosListBinary(); - this->saveFreqKmersBinary(); - } } private: /** - * @brief build the sketch table - * @details compute and save minmers from the reference sequence(s) + * @brief Get sequence metadata and optionally build the sketch table + * + * @details Iterate through ref sequences to get metadata and + * optionally compute and save minmers from the reference sequence(s) * assuming a fixed window size */ - void build() + void build(bool compute_seeds) { // allowed set of targets @@ -177,13 +175,6 @@ namespace skch //Create the thread pool ThreadPool threadPool( [this](InputSeqContainer* e) {return buildHelper(e);}, param.threads); - if (!param.loadIndexFilename.empty()) { - if (param.loadIndexFilename.extension() == ".tsv") { - this->loadIndexTSV(); - } else { - this->loadIndexBinary(); - } - } for(const auto &fileName : param.refSequences) { @@ -213,7 +204,7 @@ namespace skch } else { - if (param.loadIndexFilename.empty()) { + if (compute_seeds) { threadPool.runWhenThreadAvailable(new InputSeqContainer(seq, seq_name, seqCounter)); //Collect output if available @@ -233,13 +224,13 @@ namespace skch exit(1); } - if (param.loadIndexFilename.empty()) { + if (compute_seeds) { //Collect remaining output objects while ( threadPool.running() ) this->buildHandleThreadOutput(threadPool.popOutputWhenAvailable()); + std::cerr << "[mashmap::skch::Sketch::build] Unique minmer hashes before pruning = " << minmerPosLookupIndex.size() << std::endl; + std::cerr << "[mashmap::skch::Sketch::build] Total minmer windows before pruning = " << minmerIndex.size() << std::endl; } - std::cerr << "[mashmap::skch::Sketch::build] Unique minmer hashes before pruning = " << minmerPosLookupIndex.size() << std::endl; - std::cerr << "[mashmap::skch::Sketch::build] Total minmer windows before pruning = " << minmerIndex.size() << std::endl; } /** @@ -296,12 +287,12 @@ namespace skch /** - * @brief Save index. TSV indexing is slower but can be debugged easier + * @brief Write sketch as tsv. TSV indexing is slower but can be debugged easier */ - void saveIndexTSV() + void writeSketchTSV() { std::ofstream outStream; - outStream.open(param.saveIndexFilename); + outStream.open(std::string(param.indexFilename) + ".tsv"); outStream << "seqId" << "\t" << "strand" << "\t" << "start" << "\t" << "end" << "\t" << "hash\n"; for (auto& mi : this->minmerIndex) { outStream << mi.seqId << "\t" << std::to_string(mi.strand) << "\t" << mi.wpos << "\t" << mi.wpos_end << "\t" << mi.hash << "\n"; @@ -309,29 +300,22 @@ namespace skch outStream.close(); } + /** - * @brief Save index for quick loading + * @brief Write sketch for quick loading */ - void saveIndexBinary() + void writeSketchBinary(std::ofstream& outStream) { - fs::path indexFilename = fs::path(param.saveIndexFilename); - indexFilename += ".index"; - std::ofstream outStream; - outStream.open(indexFilename, std::ios::binary); typename MI_Type::size_type size = minmerIndex.size(); outStream.write((char*)&size, sizeof(size)); outStream.write((char*)&minmerIndex[0], minmerIndex.size() * sizeof(MinmerInfo)); } /** - * @brief Save posList for quick loading + * @brief Write posList for quick loading */ - void savePosListBinary() + void writePosListBinary(std::ofstream& outStream) { - fs::path posListFilename = fs::path(param.saveIndexFilename); - posListFilename += ".map"; - std::ofstream outStream; - outStream.open(posListFilename, std::ios::binary); typename MI_Map_t::size_type size = minmerPosLookupIndex.size(); outStream.write((char*)&size, sizeof(size)); @@ -347,14 +331,10 @@ namespace skch /** - * @brief Save posList for quick loading + * @brief Write posList for quick loading */ - void saveFreqKmersBinary() + void writeFreqKmersBinary(std::ofstream& outStream) { - fs::path freqListFilename = fs::path(param.saveIndexFilename); - freqListFilename += ".freq"; - std::ofstream outStream; - outStream.open(freqListFilename, std::ios::binary); typename MI_Map_t::size_type size = frequentSeeds.size(); outStream.write((char*)&size, sizeof(size)); @@ -367,11 +347,25 @@ namespace skch /** - * @brief Load index from TSV file + * @brief Write all index data structures to disk + */ + void writeIndex() + { + fs::path freqListFilename = fs::path(param.indexFilename); + std::ofstream outStream; + outStream.open(freqListFilename, std::ios::binary); + + writeSketchBinary(outStream); + writePosListBinary(outStream); + writeFreqKmersBinary(outStream); + } + + /** + * @brief Read sketch from TSV file */ - void loadIndexTSV() + void readSketchTSV() { - io::CSVReader<5, io::trim_chars<' '>, io::no_quote_escape<'\t'>> inReader(param.loadIndexFilename); + io::CSVReader<5, io::trim_chars<' '>, io::no_quote_escape<'\t'>> inReader(std::string(param.indexFilename) + ".tsv"); inReader.read_header(io::ignore_missing_column, "seqId", "strand", "start", "end", "hash"); hash_t hash; offset_t start, end; @@ -384,14 +378,10 @@ namespace skch } /** - * @brief Load index from binary file + * @brief Read sketch from binary file */ - void loadIndexBinary() + void readSketchBinary(std::ifstream& inStream) { - fs::path indexFilename = fs::path(param.loadIndexFilename); - indexFilename += ".index"; - std::ifstream inStream; - inStream.open(indexFilename, std::ios::binary); typename MI_Type::size_type size = 0; inStream.read((char*)&size, sizeof(size)); minmerIndex.resize(size); @@ -399,14 +389,10 @@ namespace skch } /** - * @brief Save posList for quick loading + * @brief Save posList for quick reading */ - void loadPosListBinary() + void readPosListBinary(std::ifstream& inStream) { - fs::path posListFilename = fs::path(param.loadIndexFilename); - posListFilename += ".map"; - std::ifstream inStream; - inStream.open(posListFilename, std::ios::binary); typename MI_Map_t::size_type numKeys = 0; inStream.read((char*)&numKeys, sizeof(numKeys)); minmerPosLookupIndex.reserve(numKeys); @@ -420,20 +406,15 @@ namespace skch minmerPosLookupIndex[key].resize(size); inStream.read((char*)&minmerPosLookupIndex[key][0], size * sizeof(MinmerMapValueType::value_type)); - } } /** - * @brief Load frequent kmers from file + * @brief read frequent kmers from file */ - void loadFreqKmersBinary() + void readFreqKmersBinary(std::ifstream& inStream) { - fs::path freqListFilename = fs::path(param.loadIndexFilename); - freqListFilename += ".freq"; - std::ifstream inStream; - inStream.open(freqListFilename, std::ios::binary); typename MI_Map_t::size_type numKeys = 0; inStream.read((char*)&numKeys, sizeof(numKeys)); frequentSeeds.reserve(numKeys); @@ -447,6 +428,20 @@ namespace skch } + /** + * @brief Read all index data structures from file + */ + void readIndex() + { + fs::path indexFilename = fs::path(param.indexFilename); + std::ifstream inStream; + inStream.open(indexFilename, std::ios::binary); + readSketchBinary(inStream); + readPosListBinary(inStream); + readFreqKmersBinary(inStream); + } + + /** * @brief report the frequency histogram of minmers using position lookup index * and compute which high frequency minmers to ignore @@ -484,7 +479,7 @@ namespace skch } } - if (this->freqThreshold != std::numeric_limits::max()) + if (this->freqThreshold != std::numeric_limits::max()) std::cerr << "[mashmap::skch::Sketch::computeFreqHist] With threshold " << this->param.kmer_pct_threshold << "\%, ignore minmers with more than >= " << this->freqThreshold << " intervals during mapping." << std::endl;