From 5a045710b3faec83bfc62f776657d7b6b958871d Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 23 May 2024 11:46:02 -0500 Subject: [PATCH 01/11] rework query and target specification --- src/interface/parse_args.hpp | 24 +++++++++++++++--------- src/map/include/computeMap.hpp | 12 ++++++++++++ src/map/include/map_parameters.hpp | 2 ++ src/map/include/winSketch.hpp | 2 +- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index d866cd81..754b985e 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -64,9 +64,9 @@ void parse_args(int argc, args::Group mandatory_opts(parser, "[ MANDATORY OPTIONS ]"); args::Positional target_sequence_file(mandatory_opts, "target", "alignment target/reference sequence file"); - args::Group io_opts(parser, "[ Files IO Options ]"); - args::PositionalList query_sequence_files(io_opts, "queries", "query sequences file"); - args::ValueFlag query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"}); + args::Group io_opts(parser, "[ Files IO Options ]"); + args::PositionalList query_sequence_files(io_opts, "queries", "query sequence file(s)"); + //args::ValueFlag query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"}); args::Group mapping_opts(parser, "[ Mapping Options ]"); args::ValueFlag map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"}); @@ -80,8 +80,10 @@ void parse_args(int argc, args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"}); args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"}); args::ValueFlag skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"}); - args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose name starts with this prefix", {'P', "target-prefix"}); - args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'A', "target-list"}); + args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"}); + args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"}); + args::ValueFlag query_prefix(mapping_opts, "pfx", "use only queries whose names start with this prefix", {'Q', "query-prefix"}); + args::ValueFlag query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"}); args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"}); args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"}); args::ValueFlag chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 4*segment_length, up to 20k]", {'c', "chain-gap"}); @@ -202,6 +204,14 @@ void parse_args(int argc, if (target_prefix) { map_parameters.target_prefix = args::get(target_prefix); } + + if (query_list) { + map_parameters.query_list = args::get(query_list); + } + + if (query_prefix) { + 
map_parameters.query_prefix = args::get(query_prefix); + } if (target_sequence_file) { map_parameters.refSequences.push_back(args::get(target_sequence_file)); @@ -215,10 +225,6 @@ void parse_args(int argc, align_parameters.querySequences.push_back(q); } } - if (query_sequence_file_list) { - skch::parseFileList(args::get(query_sequence_file_list), map_parameters.querySequences); - skch::parseFileList(args::get(query_sequence_file_list), align_parameters.querySequences); - } if (target_sequence_file && map_parameters.querySequences.empty() && map_parameters.refSequences.size() == 1 diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 715ad59d..955a9502 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -274,6 +274,16 @@ namespace skch //Create the thread pool ThreadPool threadPool( [this](InputSeqProgContainer* e){return mapModule(e);}, param.threads); + // allowed set of queries + std::unordered_set allowed_query_names; + if (!param.query_list.empty()) { + std::ifstream filter_list(param.query_list); + std::string name; + while (getline(filter_list, name)) { + allowed_query_names.insert(name); + } + } + // kind've expensive if the fasta index is not available for the query sequences, // but it can help people know how long we're going to take uint64_t total_seqs = 0; @@ -288,6 +298,7 @@ namespace skch while (std::getline(in, line)) { auto line_split = CommonFunc::split(line, '\t'); // if we have a param.target_prefix and the sequence name matches, skip it + // XXXXX TODO this must be updated to filter query sequences if (param.skip_self && param.target_prefix != "" && line_split[0].substr(0, param.target_prefix.size()) == param.target_prefix) { @@ -299,6 +310,7 @@ namespace skch } else { // if not, warn that this is expensive std::cerr << "[mashmap::skch::Map::mapQuery] WARNING, no .fai index found for " << fileName << ", reading the file to sum sequence length (slow)" << std::endl; + // XXXXX TODO this must be updated to filter query sequences seqiter::for_each_seq_in_file( fileName, {}, "", [&](const std::string& seq_name, diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index a8da7f6a..3f6c57b8 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -67,6 +67,8 @@ struct Parameters bool filterLengthMismatches; //true if filtering out length mismatches float kmerComplexityThreshold; //minimum kmer complexity to consider (default 0) + std::string query_list; // file containing list of query sequence names + std::string query_prefix; // prefix for query sequences to use int sketchSize; bool use_spaced_seeds; // diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 44a8fe44..1bd960cf 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -156,7 +156,7 @@ namespace skch allowed_target_names.insert(name); } } - + //sequence counter while parsing file seqno_t seqCounter = 0; From 2fb5e1b3782740062ee4eb456deb21bb47f82f84 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 23 May 2024 15:28:31 -0500 Subject: [PATCH 02/11] query filtering --- src/common/seqiter.hpp | 46 ++++++++++++++++++++ src/map/include/computeMap.hpp | 76 +++++++++++++++------------------- 2 files changed, 80 insertions(+), 42 deletions(-) diff --git a/src/common/seqiter.hpp b/src/common/seqiter.hpp index 1c0c62fd..b2fd558a 100644 --- a/src/common/seqiter.hpp +++ b/src/common/seqiter.hpp @@ -106,4 +106,50 @@ void for_each_seq_in_file( } 
} +void for_each_seq_in_file( + faidx_t* fai, + const std::vector& seq_names, + const std::function& func) { + for (const auto& seq_name : seq_names) { + int len; + char* seq = fai_fetch(fai, seq_name.c_str(), &len); + if (seq != nullptr) { + func(seq_name, std::string(seq)); + free(seq); + } + } +} + +void for_each_seq_in_file_filtered( + const std::string& filename, + const std::string& query_prefix, + const std::unordered_set& query_list, + const std::function& func) { + faidx_t* fai = fai_load(filename.c_str()); + if (fai == nullptr) { + std::cerr << "Error: Failed to load FASTA index for file " << filename << std::endl; + return; + } + + std::vector query_seq_names; + int num_seqs = faidx_nseq(fai); + for (int i = 0; i < num_seqs; i++) { + const char* seq_name = faidx_iseq(fai, i); + if (!query_prefix.empty() && strncmp(seq_name, query_prefix.c_str(), query_prefix.size()) != 0) { + continue; + } + if (!query_list.empty() && query_list.count(seq_name) == 0) { + continue; + } + query_seq_names.push_back(seq_name); + } + + for_each_seq_in_file( + fai, + query_seq_names, + func); + + fai_destroy(fai); +} + } // namespace seqiter diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 955a9502..6c3c8486 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1,3 +1,4 @@ + /** * @file computeMap.hpp * @brief implements the sequence mapping logic @@ -284,50 +285,39 @@ namespace skch } } - // kind've expensive if the fasta index is not available for the query sequences, - // but it can help people know how long we're going to take - uint64_t total_seqs = 0; - uint64_t total_seq_length = 0; - for(const auto &fileName : param.querySequences) { - // check if there is a .fai - std::string fai_name = fileName + ".fai"; - if (fs::exists(fai_name)) { - // if so, process the .fai to determine our sequence length - std::string line; - std::ifstream in(fai_name.c_str()); + // Count the total number of sequences and sequence length + uint64_t total_seqs = 0; + uint64_t total_seq_length = 0; + for (const auto& fileName : param.querySequences) { + // Check if there is a .fai file + std::string fai_name = fileName + ".fai"; + if (fs::exists(fai_name)) { + std::string line; + std::ifstream in(fai_name.c_str()); while (std::getline(in, line)) { auto line_split = CommonFunc::split(line, '\t'); // if we have a param.target_prefix and the sequence name matches, skip it - // XXXXX TODO this must be updated to filter query sequences - if (param.skip_self - && param.target_prefix != "" - && line_split[0].substr(0, param.target_prefix.size()) == param.target_prefix) { - continue; + auto seq_name = line_split[0]; + if (!allowed_query_names.empty() && allowed_query_names.find(seq_name) != allowed_query_names.end() + || !param.query_prefix.empty() && seq_name.substr(0, param.query_prefix.size()) == param.query_prefix) { + total_seqs++; + total_seq_length += std::stoul(line_split[1]); } - ++total_seqs; - total_seq_length += std::stoul(line_split[1]); - } - } else { - // if not, warn that this is expensive - std::cerr << "[mashmap::skch::Map::mapQuery] WARNING, no .fai index found for " << fileName << ", reading the file to sum sequence length (slow)" << std::endl; - // XXXXX TODO this must be updated to filter query sequences - seqiter::for_each_seq_in_file( - fileName, {}, "", - [&](const std::string& seq_name, - const std::string& seq) { - // if we have a param.target_prefix and the sequence name matches, skip it - if (param.skip_self - && param.target_prefix != 
"" - && seq_name.substr(0, param.target_prefix.size()) == param.target_prefix) { - // skip - } else { - ++total_seqs; - total_seq_length += seq.size(); - } - }); - } - } - + } + } else { + // If .fai file doesn't exist, warn and use the for_each_seq_in_file_filtered function + std::cerr << "[mashmap::skch::Map::mapQuery] WARNING, no .fai index found for " << fileName << ", reading the file to filter query sequences (slow)" << std::endl; + seqiter::for_each_seq_in_file_filtered( + fileName, + param.query_prefix, + allowed_query_names, + [&](const std::string& seq_name, const std::string& seq) { + ++total_seqs; + total_seq_length += seq.size(); + }); + } + } + progress_meter::ProgressMeter progress(total_seq_length, "[mashmap::skch::Map::mapQuery] mapped"); for(const auto &fileName : param.querySequences) @@ -337,8 +327,10 @@ namespace skch std::cerr << "[mashmap::skch::Map::mapQuery] mapping reads in " << fileName << std::endl; #endif - seqiter::for_each_seq_in_file( - fileName, {}, "", + seqiter::for_each_seq_in_file_filtered( + fileName, + param.query_prefix, + allowed_query_names, [&](const std::string& seq_name, const std::string& seq) { // todo: offset_t is an 32-bit integer, which could cause problems From a6f15f147fd731ff029cf48bf08f3f0420a0f839 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 23 May 2024 15:54:30 -0500 Subject: [PATCH 03/11] list of query prefixes now possible --- src/common/seqiter.hpp | 11 +++++++++-- src/interface/parse_args.hpp | 18 +++++++++--------- src/map/include/computeMap.hpp | 9 ++++++++- src/map/include/map_parameters.hpp | 2 +- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/common/seqiter.hpp b/src/common/seqiter.hpp index b2fd558a..9aa38c09 100644 --- a/src/common/seqiter.hpp +++ b/src/common/seqiter.hpp @@ -122,7 +122,7 @@ void for_each_seq_in_file( void for_each_seq_in_file_filtered( const std::string& filename, - const std::string& query_prefix, + const std::vector& query_prefix, const std::unordered_set& query_list, const std::function& func) { faidx_t* fai = fai_load(filename.c_str()); @@ -135,7 +135,14 @@ void for_each_seq_in_file_filtered( int num_seqs = faidx_nseq(fai); for (int i = 0; i < num_seqs; i++) { const char* seq_name = faidx_iseq(fai, i); - if (!query_prefix.empty() && strncmp(seq_name, query_prefix.c_str(), query_prefix.size()) != 0) { + bool prefix_skip = true; + for (const auto& prefix : query_prefix) { + if (strncmp(seq_name, prefix.c_str(), prefix.size()) == 0) { + prefix_skip = false; + break; + } + } + if (!query_prefix.empty() && prefix_skip) { continue; } if (!query_list.empty() && query_list.count(seq_name) == 0) { diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 754b985e..f48c4d89 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -82,7 +82,7 @@ void parse_args(int argc, args::ValueFlag skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"}); args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"}); args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"}); - args::ValueFlag query_prefix(mapping_opts, "pfx", "use only queries whose names start with this prefix", {'Q', "query-prefix"}); + args::ValueFlag query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with 
these prefixes (comma delimited)", {'Q', "query-prefix"}); args::ValueFlag query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"}); args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"}); args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"}); @@ -104,19 +104,19 @@ void parse_args(int argc, args::Group alignment_opts(parser, "[ Alignment Options ]"); args::ValueFlag align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"}); args::Flag invert_filtering(alignment_opts, "A", "if an input PAF is specified, remove alignments with gap-compressed identity below --map-pct-id x 0.8, else keep all alignments " - "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]", + "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]", {'O', "invert-filtering"}); args::ValueFlag wflambda_segment_length(alignment_opts, "N", "wflambda segment length: size (in bp) of segment mapped in hierarchical WFA problem [default: 256]", {'W', "wflamda-segment"}); args::ValueFlag wfa_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]", - {"wfa-params"}); + "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]", + {"wfa-params"}); args::ValueFlag wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2", - "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]", - {"wfa-patching-params"}); + "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]", + {"wfa-patching-params"}); //wflign parameters args::ValueFlag wflign_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]", - {"wflign-params"}); + "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]", + {"wflign-params"}); args::ValueFlag wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"}); args::ValueFlag wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"}); args::ValueFlag wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-distance"}); @@ -210,7 +210,7 @@ void parse_args(int argc, } if (query_prefix) { - map_parameters.query_prefix = args::get(query_prefix); + map_parameters.query_prefix = skch::CommonFunc::split(args::get(query_prefix), ','); } if (target_sequence_file) { diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 6c3c8486..1b7d42aa 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -298,8 +298,15 @@ namespace skch auto line_split = CommonFunc::split(line, '\t'); // if we have a param.target_prefix and the sequence name matches, skip it auto seq_name = 
line_split[0]; + bool prefix_skip = true; + for (const auto& prefix : param.query_prefix) { + if (seq_name.substr(0, prefix.size()) == prefix) { + prefix_skip = false; + break; + } + } if (!allowed_query_names.empty() && allowed_query_names.find(seq_name) != allowed_query_names.end() - || !param.query_prefix.empty() && seq_name.substr(0, param.query_prefix.size()) == param.query_prefix) { + || !param.query_prefix.empty() && !prefix_skip) { total_seqs++; total_seq_length += std::stoul(line_split[1]); } diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index 3f6c57b8..8bd5f8ec 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -68,7 +68,7 @@ struct Parameters float kmerComplexityThreshold; //minimum kmer complexity to consider (default 0) std::string query_list; // file containing list of query sequence names - std::string query_prefix; // prefix for query sequences to use + std::vector query_prefix; // prefix for query sequences to use int sketchSize; bool use_spaced_seeds; // From 20897ac43578a4ddff8d5dba297697f1cc62ffea Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 15:38:08 -0500 Subject: [PATCH 04/11] add basic script to write jobs for all to all alignment or mapping --- scripts/all2all_jobs.py | 74 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 scripts/all2all_jobs.py diff --git a/scripts/all2all_jobs.py b/scripts/all2all_jobs.py new file mode 100644 index 00000000..80d117a0 --- /dev/null +++ b/scripts/all2all_jobs.py @@ -0,0 +1,74 @@ +import argparse +import gzip +import itertools + +def parse_fasta_index(fasta_file): + fai_file = fasta_file + '.fai' + sequences = [] + with open(fai_file, 'r') as file: + for line in file: + sequence_name = line.strip().split('\t')[0] + sequences.append(sequence_name) + return sequences + +def group_sequences(sequences, grouping): + grouped_sequences = {} + for sequence in sequences: + if '#' in sequence: + fields = sequence.split('#') + if grouping == 'genome': + group_key = fields[0] + elif grouping == 'haplotype': + group_key = '#'.join(fields[:2]) + else: + raise ValueError(f"Invalid grouping: {grouping}") + else: + group_key = sequence + + if group_key not in grouped_sequences: + grouped_sequences[group_key] = [] + grouped_sequences[group_key].append(sequence) + + return list(grouped_sequences.values()) + +def generate_pairings(grouped_sequences, num_queries): + pairings = [] + for target_group in grouped_sequences: + for query_chunk in itertools.combinations(grouped_sequences, num_queries): + query_chunk = [seq for group in query_chunk for seq in group if seq not in target_group] + pairings.append((target_group, query_chunk)) + return pairings + +def main(): + parser = argparse.ArgumentParser(description='Generate pairings for all-to-all alignment using PanSN format.') + parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)') + parser.add_argument('--num-queries', type=int, default=5, help='Number of query groups per target group (default: 5)') + parser.add_argument('--grouping', choices=['genome', 'haplotype'], default='haplotype', help='Grouping level: genome or haplotype (default: haplotype)') + parser.add_argument('--output', help='Output file to save the pairings') + + args = parser.parse_args() + + # Parse the FASTA index file + sequences = parse_fasta_index(args.fasta_file) + + # Group sequences based on the specified grouping level + grouped_sequences = 
group_sequences(sequences, args.grouping) + + # Generate pairings + pairings = generate_pairings(grouped_sequences, args.num_queries) + + # Save or print the pairings + if args.output: + with open(args.output, 'w') as file: + for target_group, query_groups in pairings: + file.write(f"Target Group: {', '.join(target_group)}\n") + file.write(f"Query Groups: {', '.join([seq for group in query_groups for seq in group])}\n") + file.write("\n") + else: + for target_group, query_groups in pairings: + print(f"Target Group: {', '.join(target_group)}") + print(f"Query Groups: {', '.join([seq for group in query_groups for seq in group])}") + print() + +if __name__ == '__main__': + main() From fe693e7876b0009edd21533b2dbe7b9d1f96ebc7 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 16:14:05 -0500 Subject: [PATCH 05/11] now sort of correct --- scripts/all2all_jobs.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/scripts/all2all_jobs.py b/scripts/all2all_jobs.py index 80d117a0..143c6f50 100644 --- a/scripts/all2all_jobs.py +++ b/scripts/all2all_jobs.py @@ -29,20 +29,27 @@ def group_sequences(sequences, grouping): grouped_sequences[group_key] = [] grouped_sequences[group_key].append(sequence) - return list(grouped_sequences.values()) + return grouped_sequences def generate_pairings(grouped_sequences, num_queries): pairings = [] - for target_group in grouped_sequences: - for query_chunk in itertools.combinations(grouped_sequences, num_queries): - query_chunk = [seq for group in query_chunk for seq in group if seq not in target_group] + groups = list(grouped_sequences.keys()) + num_groups = len(groups) + + for i in range(num_groups): + target_group = groups[i] + query_groups = [group for j, group in enumerate(groups) if j != i] + + for query_chunk in itertools.zip_longest(*[iter(query_groups)] * num_queries): + query_chunk = [q for q in query_chunk if q is not None] pairings.append((target_group, query_chunk)) + return pairings def main(): parser = argparse.ArgumentParser(description='Generate pairings for all-to-all alignment using PanSN format.') parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)') - parser.add_argument('--num-queries', type=int, default=5, help='Number of query groups per target group (default: 5)') + parser.add_argument('--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)') parser.add_argument('--grouping', choices=['genome', 'haplotype'], default='haplotype', help='Grouping level: genome or haplotype (default: haplotype)') parser.add_argument('--output', help='Output file to save the pairings') @@ -61,14 +68,10 @@ def main(): if args.output: with open(args.output, 'w') as file: for target_group, query_groups in pairings: - file.write(f"Target Group: {', '.join(target_group)}\n") - file.write(f"Query Groups: {', '.join([seq for group in query_groups for seq in group])}\n") - file.write("\n") + file.write(f"{target_group},{','.join(query_groups)}\n") else: for target_group, query_groups in pairings: - print(f"Target Group: {', '.join(target_group)}") - print(f"Query Groups: {', '.join([seq for group in query_groups for seq in group])}") - print() + print(f"{target_group},{','.join(query_groups)}") if __name__ == '__main__': - main() + main() \ No newline at end of file From 5792920061869a54a1d7f2f15abb121432274f98 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 16:39:46 -0500 Subject: [PATCH 06/11] update all to all generation 
script --- scripts/all2all_jobs.py | 59 +++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/scripts/all2all_jobs.py b/scripts/all2all_jobs.py index 143c6f50..37a8b901 100644 --- a/scripts/all2all_jobs.py +++ b/scripts/all2all_jobs.py @@ -16,10 +16,12 @@ def group_sequences(sequences, grouping): for sequence in sequences: if '#' in sequence: fields = sequence.split('#') - if grouping == 'genome': + if grouping in ['g', 'genome']: group_key = fields[0] - elif grouping == 'haplotype': + elif grouping in ['h', 'haplotype']: group_key = '#'.join(fields[:2]) + elif grouping in ['c', 'contig']: + group_key = sequence else: raise ValueError(f"Invalid grouping: {grouping}") else: @@ -31,47 +33,58 @@ def group_sequences(sequences, grouping): return grouped_sequences -def generate_pairings(grouped_sequences, num_queries): +def generate_pairings(target_grouped_sequences, query_grouped_sequences, num_queries): pairings = [] - groups = list(grouped_sequences.keys()) - num_groups = len(groups) + target_groups = list(target_grouped_sequences.keys()) + query_groups = list(query_grouped_sequences.keys()) - for i in range(num_groups): - target_group = groups[i] - query_groups = [group for j, group in enumerate(groups) if j != i] + for target_group in target_groups: + query_pool = [group for group in query_groups if group != target_group] - for query_chunk in itertools.zip_longest(*[iter(query_groups)] * num_queries): + for query_chunk in itertools.zip_longest(*[iter(query_pool)] * num_queries): query_chunk = [q for q in query_chunk if q is not None] pairings.append((target_group, query_chunk)) return pairings def main(): - parser = argparse.ArgumentParser(description='Generate pairings for all-to-all alignment using PanSN format.') + parser = argparse.ArgumentParser(description='Generate pairings or wfmash command lines for all-to-all alignment using PanSN format.') parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)') - parser.add_argument('--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)') - parser.add_argument('--grouping', choices=['genome', 'haplotype'], default='haplotype', help='Grouping level: genome or haplotype (default: haplotype)') - parser.add_argument('--output', help='Output file to save the pairings') + parser.add_argument('-n', '--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)') + parser.add_argument('-t', '--target-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for targets: g/genome, h/haplotype, or c/contig (default: haplotype)') + parser.add_argument('-q', '--query-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for queries: g/genome, h/haplotype, or c/contig (default: haplotype)') + parser.add_argument('-o', '--output', help='Output file to save the pairings or command lines') - args = parser.parse_args() + args, wfmash_args = parser.parse_known_args() # Parse the FASTA index file sequences = parse_fasta_index(args.fasta_file) - # Group sequences based on the specified grouping level - grouped_sequences = group_sequences(sequences, args.grouping) + # Group sequences based on the specified grouping levels + target_grouped_sequences = group_sequences(sequences, args.target_grouping) + query_grouped_sequences = group_sequences(sequences, args.query_grouping) # Generate pairings - pairings = 
generate_pairings(grouped_sequences, args.num_queries) + pairings = generate_pairings(target_grouped_sequences, query_grouped_sequences, args.num_queries) - # Save or print the pairings - if args.output: - with open(args.output, 'w') as file: + # Save or print the pairings or command lines + if wfmash_args: + wfmash_options = ' '.join(wfmash_args) + if args.output: + with open(args.output, 'w') as file: + for target_group, query_groups in pairings: + file.write(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}\n") + else: for target_group, query_groups in pairings: - file.write(f"{target_group},{','.join(query_groups)}\n") + print(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}") else: - for target_group, query_groups in pairings: - print(f"{target_group},{','.join(query_groups)}") + if args.output: + with open(args.output, 'w') as file: + for target_group, query_groups in pairings: + file.write(f"{target_group}\t{','.join(query_groups)}\n") + else: + for target_group, query_groups in pairings: + print(f"{target_group}\t{','.join(query_groups)}") if __name__ == '__main__': main() \ No newline at end of file From dc5e4db91c2ddf6b400dce9cb63b8839144448bd Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 16:54:22 -0500 Subject: [PATCH 07/11] run PR tests on main --- .github/workflows/test_on_push.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index 01362c67..c13a1907 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -1,10 +1,10 @@ on: push: - branches: [ master ] + branches: [ main ] paths-ignore: - '**/*.md' pull_request: - branches: [ master ] + branches: [ main ] paths-ignore: - '**/*.md' From 57769c4cd441bc9dc16b41d17e55f9565dc91776 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 17:00:18 -0500 Subject: [PATCH 08/11] add a way to make the source tarball --- scripts/make_source_targball.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 scripts/make_source_targball.sh diff --git a/scripts/make_source_targball.sh b/scripts/make_source_targball.sh new file mode 100644 index 00000000..313d2f92 --- /dev/null +++ b/scripts/make_source_targball.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# run from root of the repository + +mkdir source-tarball +cd source-tarball +git clone --recursive https://github.com/waveygang/wfmash +cd wfmash +git fetch --tags origin +LATEST_TAG="$(git describe --tags `git rev-list --tags --max-count=1`)" +git checkout "${LATEST_TAG}" +git submodule update --init --recursive +bash scripts/generate_git_version.sh $PWD/src +sed 's/execute_process(COMMAND bash/#execute_process(COMMAND bash/g' CMakeLists.txt -i +rm -Rf .git +find src/common -name ".git" -exec rm -Rf "{}" \; +cd .. 
+mv wfmash "wfmash-${LATEST_TAG}" +tar -czf "wfmash-${LATEST_TAG}.tar.gz" "wfmash-${LATEST_TAG}" +rm -Rf "wfmash-${LATEST_TAG}" From 964eeb2ad598cb25c558a9b22ba567a95b6ab7dd Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 17:02:32 -0500 Subject: [PATCH 09/11] use G rather than T for the tsv output from wflign --- .github/workflows/test_on_push.yml | 4 ++-- src/interface/parse_args.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index c13a1907..c117960e 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -38,9 +38,9 @@ jobs: - name: Test mapping coverage with 8 yeast genomes (PAF output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y '#' > scerevisiae8.paf; scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92 - name: Test mapping+alignment with a subset of the LPA dataset (PAF output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -T wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -G wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv - name: Test mapping+alignment with a subset of the LPA dataset (SAM output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -T wflign_info. -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -G wflign_info. 
-L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 - name: Test mapping+alignment with short reads (500 bps) to a reference (SAM output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head - name: Test mapping+alignment with short reads (255bps) (PAF output) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index f48c4d89..49090124 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -141,7 +141,7 @@ void parse_args(int argc, #ifdef WFA_PNG_TSV_TIMING args::Group debugging_opts(parser, "[ Debugging Options ]"); - args::ValueFlag prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'T', "tsv"}); + args::ValueFlag prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'G', "tsv"}); args::ValueFlag prefix_wavefront_plot_in_png(parser, "PREFIX", " write wavefronts' plot for each alignment in PNG format files with this PREFIX", {'u', "prefix-png"}); args::ValueFlag wfplot_max_size(parser, "N", "max size of the wfplot [default: 1500]", {'z', "wfplot-max-size"}); args::ValueFlag path_patching_info_in_tsv(parser, "FILE", " write patching information for each alignment in TSV format in FILE", {"path-patching-tsv"}); From 13931c657255166277edd5cb9d16cd3020402f34 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 17:33:37 -0500 Subject: [PATCH 10/11] try a workaround --- .github/workflows/test_on_push.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index c117960e..2a9c8273 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -38,9 +38,9 @@ jobs: - name: Test mapping coverage with 8 yeast genomes (PAF output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y '#' > scerevisiae8.paf; scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92 - name: Test mapping+alignment with a subset of the LPA dataset (PAF output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -G wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -L > LPA.subset.paf && head LPA.subset.paf - name: Test mapping+alignment with a subset of the LPA dataset (SAM output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -G wflign_info. 
-L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 - name: Test mapping+alignment with short reads (500 bps) to a reference (SAM output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head - name: Test mapping+alignment with short reads (255bps) (PAF output) From 76d19caf9682bd4702c2b3e39ca8b95eb520dc3a Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 27 May 2024 17:43:55 -0500 Subject: [PATCH 11/11] fix `heap-use-after-free` error --- src/common/wflign/src/wflign.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index 67cd7bf9..84144896 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -695,6 +695,7 @@ void WFlign::wflign_affine_wavefront( // Free delete wflambda_aligner; + delete wf_aligner; #ifdef WFA_PNG_TSV_TIMING if (extend_data.emit_png) { @@ -972,9 +973,6 @@ void WFlign::wflign_affine_wavefront( } if (merge_alignments) { - // Free old aligner - delete wf_aligner; - // use biWFA for all patching wfa::WFAlignerGapAffine2Pieces* wf_aligner = new wfa::WFAlignerGapAffine2Pieces( @@ -1029,7 +1027,9 @@ void WFlign::wflign_affine_wavefront( emit_patching_tsv, out_patching_tsv #endif - ); + ); + + delete wf_aligner; } else { // todo old implementation (and SAM format is not supported) for (auto x = trace.rbegin(); x != trace.rend(); ++x) { @@ -1050,9 +1050,6 @@ void WFlign::wflign_affine_wavefront( } } } - - // Free - delete wf_aligner; } }
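
Usage sketch for the query/target selection options and the all-to-all job generator introduced in this series. This is an illustrative example rather than part of the patches: the input name assembly.fa.gz and the chosen parameter values are assumptions, while the flags (-T/--target-prefix, -R/--target-list, -Q/--query-prefix, -A/--query-list) and the scripts/all2all_jobs.py interface are taken from the diffs above.

    # ensure a .fai index exists; both wfmash and the job generator read sequence names from it
    samtools faidx assembly.fa.gz

    # all-vs-all within one file, restricting the target and query sets by PanSN prefix
    wfmash assembly.fa.gz -p 95 -m -X \
        -T 'sampleA#1' -Q 'sampleB#1,sampleC#1' > part.paf

    # or restrict queries to an explicit list of sequence names
    wfmash assembly.fa.gz -p 95 -m -X -A query_names.txt > part.paf

    # write one "target-group<TAB>query-groups" line per job, four query groups at a time;
    # any arguments the script does not recognize are instead forwarded into generated
    # "wfmash ... -T <target> -Q <queries>" command lines
    python3 scripts/all2all_jobs.py assembly.fa.gz -n 4 -t haplotype -q haplotype -o all2all.jobs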