diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index 01362c67..2a9c8273 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -1,10 +1,10 @@ on: push: - branches: [ master ] + branches: [ main ] paths-ignore: - '**/*.md' pull_request: - branches: [ master ] + branches: [ main ] paths-ignore: - '**/*.md' @@ -38,9 +38,9 @@ jobs: - name: Test mapping coverage with 8 yeast genomes (PAF output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y '#' > scerevisiae8.paf; scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92 - name: Test mapping+alignment with a subset of the LPA dataset (PAF output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -T wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -L > LPA.subset.paf && head LPA.subset.paf - name: Test mapping+alignment with a subset of the LPA dataset (SAM output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -T wflign_info. -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 - name: Test mapping+alignment with short reads (500 bps) to a reference (SAM output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head - name: Test mapping+alignment with short reads (255bps) (PAF output) diff --git a/scripts/all2all_jobs.py b/scripts/all2all_jobs.py new file mode 100644 index 00000000..37a8b901 --- /dev/null +++ b/scripts/all2all_jobs.py @@ -0,0 +1,90 @@ +import argparse +import gzip +import itertools + +def parse_fasta_index(fasta_file): + fai_file = fasta_file + '.fai' + sequences = [] + with open(fai_file, 'r') as file: + for line in file: + sequence_name = line.strip().split('\t')[0] + sequences.append(sequence_name) + return sequences + +def group_sequences(sequences, grouping): + grouped_sequences = {} + for sequence in sequences: + if '#' in sequence: + fields = sequence.split('#') + if grouping in ['g', 'genome']: + group_key = fields[0] + elif grouping in ['h', 'haplotype']: + group_key = '#'.join(fields[:2]) + elif grouping in ['c', 'contig']: + group_key = sequence + else: + raise ValueError(f"Invalid grouping: {grouping}") + else: + group_key = sequence + + if group_key not in grouped_sequences: + grouped_sequences[group_key] = [] + grouped_sequences[group_key].append(sequence) + + return grouped_sequences + +def generate_pairings(target_grouped_sequences, query_grouped_sequences, num_queries): + pairings = [] + target_groups = list(target_grouped_sequences.keys()) + query_groups = list(query_grouped_sequences.keys()) + + for target_group in target_groups: + query_pool = [group for group in query_groups if group != target_group] + + for query_chunk in itertools.zip_longest(*[iter(query_pool)] * num_queries): + query_chunk = [q for q in query_chunk if q is not None] + pairings.append((target_group, query_chunk)) + + return pairings + +def main(): + parser = argparse.ArgumentParser(description='Generate pairings or wfmash command lines for all-to-all alignment using PanSN format.') + parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)') + parser.add_argument('-n', '--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)') + parser.add_argument('-t', '--target-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for targets: g/genome, h/haplotype, or c/contig (default: haplotype)') + parser.add_argument('-q', '--query-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for queries: g/genome, h/haplotype, or c/contig (default: haplotype)') + parser.add_argument('-o', '--output', help='Output file to save the pairings or command lines') + + args, wfmash_args = parser.parse_known_args() + + # Parse the FASTA index file + sequences = parse_fasta_index(args.fasta_file) + + # Group sequences based on the specified grouping levels + target_grouped_sequences = group_sequences(sequences, args.target_grouping) + query_grouped_sequences = group_sequences(sequences, args.query_grouping) + + # Generate pairings + pairings = generate_pairings(target_grouped_sequences, query_grouped_sequences, args.num_queries) + + # Save or print the pairings or command lines + if wfmash_args: + wfmash_options = ' '.join(wfmash_args) + if args.output: + with open(args.output, 'w') as file: + for target_group, query_groups in pairings: + file.write(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}\n") + else: + for target_group, query_groups in pairings: + print(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}") + else: + if args.output: + with open(args.output, 'w') as file: + for target_group, query_groups in pairings: + file.write(f"{target_group}\t{','.join(query_groups)}\n") + else: + for target_group, query_groups in pairings: + print(f"{target_group}\t{','.join(query_groups)}") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/make_source_targball.sh b/scripts/make_source_targball.sh new file mode 100644 index 00000000..313d2f92 --- /dev/null +++ b/scripts/make_source_targball.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# run from root of the repository + +mkdir source-tarball +cd source-tarball +git clone --recursive https://github.com/waveygang/wfmash +cd wfmash +git fetch --tags origin +LATEST_TAG="$(git describe --tags `git rev-list --tags --max-count=1`)" +git checkout "${LATEST_TAG}" +git submodule update --init --recursive +bash scripts/generate_git_version.sh $PWD/src +sed 's/execute_process(COMMAND bash/#execute_process(COMMAND bash/g' CMakeLists.txt -i +rm -Rf .git +find src/common -name ".git" -exec rm -Rf "{}" \; +cd .. +mv wfmash "wfmash-${LATEST_TAG}" +tar -czf "wfmash-${LATEST_TAG}.tar.gz" "wfmash-${LATEST_TAG}" +rm -Rf "wfmash-${LATEST_TAG}" diff --git a/src/common/seqiter.hpp b/src/common/seqiter.hpp index 1c0c62fd..9aa38c09 100644 --- a/src/common/seqiter.hpp +++ b/src/common/seqiter.hpp @@ -106,4 +106,57 @@ void for_each_seq_in_file( } } +void for_each_seq_in_file( + faidx_t* fai, + const std::vector& seq_names, + const std::function& func) { + for (const auto& seq_name : seq_names) { + int len; + char* seq = fai_fetch(fai, seq_name.c_str(), &len); + if (seq != nullptr) { + func(seq_name, std::string(seq)); + free(seq); + } + } +} + +void for_each_seq_in_file_filtered( + const std::string& filename, + const std::vector& query_prefix, + const std::unordered_set& query_list, + const std::function& func) { + faidx_t* fai = fai_load(filename.c_str()); + if (fai == nullptr) { + std::cerr << "Error: Failed to load FASTA index for file " << filename << std::endl; + return; + } + + std::vector query_seq_names; + int num_seqs = faidx_nseq(fai); + for (int i = 0; i < num_seqs; i++) { + const char* seq_name = faidx_iseq(fai, i); + bool prefix_skip = true; + for (const auto& prefix : query_prefix) { + if (strncmp(seq_name, prefix.c_str(), prefix.size()) == 0) { + prefix_skip = false; + break; + } + } + if (!query_prefix.empty() && prefix_skip) { + continue; + } + if (!query_list.empty() && query_list.count(seq_name) == 0) { + continue; + } + query_seq_names.push_back(seq_name); + } + + for_each_seq_in_file( + fai, + query_seq_names, + func); + + fai_destroy(fai); +} + } // namespace seqiter diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index 67cd7bf9..84144896 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -695,6 +695,7 @@ void WFlign::wflign_affine_wavefront( // Free delete wflambda_aligner; + delete wf_aligner; #ifdef WFA_PNG_TSV_TIMING if (extend_data.emit_png) { @@ -972,9 +973,6 @@ void WFlign::wflign_affine_wavefront( } if (merge_alignments) { - // Free old aligner - delete wf_aligner; - // use biWFA for all patching wfa::WFAlignerGapAffine2Pieces* wf_aligner = new wfa::WFAlignerGapAffine2Pieces( @@ -1029,7 +1027,9 @@ void WFlign::wflign_affine_wavefront( emit_patching_tsv, out_patching_tsv #endif - ); + ); + + delete wf_aligner; } else { // todo old implementation (and SAM format is not supported) for (auto x = trace.rbegin(); x != trace.rend(); ++x) { @@ -1050,9 +1050,6 @@ void WFlign::wflign_affine_wavefront( } } } - - // Free - delete wf_aligner; } } diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index d866cd81..49090124 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -64,9 +64,9 @@ void parse_args(int argc, args::Group mandatory_opts(parser, "[ MANDATORY OPTIONS ]"); args::Positional target_sequence_file(mandatory_opts, "target", "alignment target/reference sequence file"); - args::Group io_opts(parser, "[ Files IO Options ]"); - args::PositionalList query_sequence_files(io_opts, "queries", "query sequences file"); - args::ValueFlag query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"}); + args::Group io_opts(parser, "[ Files IO Options ]"); + args::PositionalList query_sequence_files(io_opts, "queries", "query sequence file(s)"); + //args::ValueFlag query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"}); args::Group mapping_opts(parser, "[ Mapping Options ]"); args::ValueFlag map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"}); @@ -80,8 +80,10 @@ void parse_args(int argc, args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"}); args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"}); args::ValueFlag skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"}); - args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose name starts with this prefix", {'P', "target-prefix"}); - args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'A', "target-list"}); + args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"}); + args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"}); + args::ValueFlag query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with these prefixes (comma delimited)", {'Q', "query-prefix"}); + args::ValueFlag query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"}); args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"}); args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"}); args::ValueFlag chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 4*segment_length, up to 20k]", {'c', "chain-gap"}); @@ -102,19 +104,19 @@ void parse_args(int argc, args::Group alignment_opts(parser, "[ Alignment Options ]"); args::ValueFlag align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"}); args::Flag invert_filtering(alignment_opts, "A", "if an input PAF is specified, remove alignments with gap-compressed identity below --map-pct-id x 0.8, else keep all alignments " - "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]", + "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]", {'O', "invert-filtering"}); args::ValueFlag wflambda_segment_length(alignment_opts, "N", "wflambda segment length: size (in bp) of segment mapped in hierarchical WFA problem [default: 256]", {'W', "wflamda-segment"}); args::ValueFlag wfa_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]", - {"wfa-params"}); + "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]", + {"wfa-params"}); args::ValueFlag wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2", - "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]", - {"wfa-patching-params"}); + "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]", + {"wfa-patching-params"}); //wflign parameters args::ValueFlag wflign_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]", - {"wflign-params"}); + "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]", + {"wflign-params"}); args::ValueFlag wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"}); args::ValueFlag wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"}); args::ValueFlag wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-distance"}); @@ -139,7 +141,7 @@ void parse_args(int argc, #ifdef WFA_PNG_TSV_TIMING args::Group debugging_opts(parser, "[ Debugging Options ]"); - args::ValueFlag prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'T', "tsv"}); + args::ValueFlag prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'G', "tsv"}); args::ValueFlag prefix_wavefront_plot_in_png(parser, "PREFIX", " write wavefronts' plot for each alignment in PNG format files with this PREFIX", {'u', "prefix-png"}); args::ValueFlag wfplot_max_size(parser, "N", "max size of the wfplot [default: 1500]", {'z', "wfplot-max-size"}); args::ValueFlag path_patching_info_in_tsv(parser, "FILE", " write patching information for each alignment in TSV format in FILE", {"path-patching-tsv"}); @@ -202,6 +204,14 @@ void parse_args(int argc, if (target_prefix) { map_parameters.target_prefix = args::get(target_prefix); } + + if (query_list) { + map_parameters.query_list = args::get(query_list); + } + + if (query_prefix) { + map_parameters.query_prefix = skch::CommonFunc::split(args::get(query_prefix), ','); + } if (target_sequence_file) { map_parameters.refSequences.push_back(args::get(target_sequence_file)); @@ -215,10 +225,6 @@ void parse_args(int argc, align_parameters.querySequences.push_back(q); } } - if (query_sequence_file_list) { - skch::parseFileList(args::get(query_sequence_file_list), map_parameters.querySequences); - skch::parseFileList(args::get(query_sequence_file_list), align_parameters.querySequences); - } if (target_sequence_file && map_parameters.querySequences.empty() && map_parameters.refSequences.size() == 1 diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 715ad59d..1b7d42aa 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1,3 +1,4 @@ + /** * @file computeMap.hpp * @brief implements the sequence mapping logic @@ -274,48 +275,56 @@ namespace skch //Create the thread pool ThreadPool threadPool( [this](InputSeqProgContainer* e){return mapModule(e);}, param.threads); - // kind've expensive if the fasta index is not available for the query sequences, - // but it can help people know how long we're going to take - uint64_t total_seqs = 0; - uint64_t total_seq_length = 0; - for(const auto &fileName : param.querySequences) { - // check if there is a .fai - std::string fai_name = fileName + ".fai"; - if (fs::exists(fai_name)) { - // if so, process the .fai to determine our sequence length - std::string line; - std::ifstream in(fai_name.c_str()); + // allowed set of queries + std::unordered_set allowed_query_names; + if (!param.query_list.empty()) { + std::ifstream filter_list(param.query_list); + std::string name; + while (getline(filter_list, name)) { + allowed_query_names.insert(name); + } + } + + // Count the total number of sequences and sequence length + uint64_t total_seqs = 0; + uint64_t total_seq_length = 0; + for (const auto& fileName : param.querySequences) { + // Check if there is a .fai file + std::string fai_name = fileName + ".fai"; + if (fs::exists(fai_name)) { + std::string line; + std::ifstream in(fai_name.c_str()); while (std::getline(in, line)) { auto line_split = CommonFunc::split(line, '\t'); // if we have a param.target_prefix and the sequence name matches, skip it - if (param.skip_self - && param.target_prefix != "" - && line_split[0].substr(0, param.target_prefix.size()) == param.target_prefix) { - continue; - } - ++total_seqs; - total_seq_length += std::stoul(line_split[1]); - } - } else { - // if not, warn that this is expensive - std::cerr << "[mashmap::skch::Map::mapQuery] WARNING, no .fai index found for " << fileName << ", reading the file to sum sequence length (slow)" << std::endl; - seqiter::for_each_seq_in_file( - fileName, {}, "", - [&](const std::string& seq_name, - const std::string& seq) { - // if we have a param.target_prefix and the sequence name matches, skip it - if (param.skip_self - && param.target_prefix != "" - && seq_name.substr(0, param.target_prefix.size()) == param.target_prefix) { - // skip - } else { - ++total_seqs; - total_seq_length += seq.size(); + auto seq_name = line_split[0]; + bool prefix_skip = true; + for (const auto& prefix : param.query_prefix) { + if (seq_name.substr(0, prefix.size()) == prefix) { + prefix_skip = false; + break; } - }); - } - } - + } + if (!allowed_query_names.empty() && allowed_query_names.find(seq_name) != allowed_query_names.end() + || !param.query_prefix.empty() && !prefix_skip) { + total_seqs++; + total_seq_length += std::stoul(line_split[1]); + } + } + } else { + // If .fai file doesn't exist, warn and use the for_each_seq_in_file_filtered function + std::cerr << "[mashmap::skch::Map::mapQuery] WARNING, no .fai index found for " << fileName << ", reading the file to filter query sequences (slow)" << std::endl; + seqiter::for_each_seq_in_file_filtered( + fileName, + param.query_prefix, + allowed_query_names, + [&](const std::string& seq_name, const std::string& seq) { + ++total_seqs; + total_seq_length += seq.size(); + }); + } + } + progress_meter::ProgressMeter progress(total_seq_length, "[mashmap::skch::Map::mapQuery] mapped"); for(const auto &fileName : param.querySequences) @@ -325,8 +334,10 @@ namespace skch std::cerr << "[mashmap::skch::Map::mapQuery] mapping reads in " << fileName << std::endl; #endif - seqiter::for_each_seq_in_file( - fileName, {}, "", + seqiter::for_each_seq_in_file_filtered( + fileName, + param.query_prefix, + allowed_query_names, [&](const std::string& seq_name, const std::string& seq) { // todo: offset_t is an 32-bit integer, which could cause problems diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index a8da7f6a..8bd5f8ec 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -67,6 +67,8 @@ struct Parameters bool filterLengthMismatches; //true if filtering out length mismatches float kmerComplexityThreshold; //minimum kmer complexity to consider (default 0) + std::string query_list; // file containing list of query sequence names + std::vector query_prefix; // prefix for query sequences to use int sketchSize; bool use_spaced_seeds; // diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 44a8fe44..1bd960cf 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -156,7 +156,7 @@ namespace skch allowed_target_names.insert(name); } } - + //sequence counter while parsing file seqno_t seqCounter = 0;