From 5a045710b3faec83bfc62f776657d7b6b958871d Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 23 May 2024 11:46:02 -0500 Subject: [PATCH 01/11] rework query and target specification --- src/interface/parse_args.hpp | 24 +++++++++++++++--------- src/map/include/computeMap.hpp | 12 ++++++++++++ src/map/include/map_parameters.hpp | 2 ++ src/map/include/winSketch.hpp | 2 +- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index d866cd81..754b985e 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -64,9 +64,9 @@ void parse_args(int argc, args::Group mandatory_opts(parser, "[ MANDATORY OPTIONS ]"); args::Positional target_sequence_file(mandatory_opts, "target", "alignment target/reference sequence file"); - args::Group io_opts(parser, "[ Files IO Options ]"); - args::PositionalList query_sequence_files(io_opts, "queries", "query sequences file"); - args::ValueFlag query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"}); + args::Group io_opts(parser, "[ Files IO Options ]"); + args::PositionalList query_sequence_files(io_opts, "queries", "query sequence file(s)"); + //args::ValueFlag query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"}); args::Group mapping_opts(parser, "[ Mapping Options ]"); args::ValueFlag map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"}); @@ -80,8 +80,10 @@ void parse_args(int argc, args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"}); args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"}); args::ValueFlag skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"}); - args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose name starts with this prefix", {'P', "target-prefix"}); - args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'A', "target-list"}); + args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"}); + args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"}); + args::ValueFlag query_prefix(mapping_opts, "pfx", "use only queries whose names start with this prefix", {'Q', "query-prefix"}); + args::ValueFlag query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"}); args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"}); args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"}); args::ValueFlag chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 4*segment_length, up to 20k]", {'c', "chain-gap"}); @@ -202,6 +204,14 @@ void parse_args(int argc, if (target_prefix) { map_parameters.target_prefix = args::get(target_prefix); } + + if (query_list) { + map_parameters.query_list = args::get(query_list); + } + + if (query_prefix) { + 
map_parameters.query_prefix = args::get(query_prefix); + } if (target_sequence_file) { map_parameters.refSequences.push_back(args::get(target_sequence_file)); @@ -215,10 +225,6 @@ void parse_args(int argc, align_parameters.querySequences.push_back(q); } } - if (query_sequence_file_list) { - skch::parseFileList(args::get(query_sequence_file_list), map_parameters.querySequences); - skch::parseFileList(args::get(query_sequence_file_list), align_parameters.querySequences); - } if (target_sequence_file && map_parameters.querySequences.empty() && map_parameters.refSequences.size() == 1 diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 715ad59d..955a9502 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -274,6 +274,16 @@ namespace skch //Create the thread pool ThreadPool threadPool( [this](InputSeqProgContainer* e){return mapModule(e);}, param.threads); + // allowed set of queries + std::unordered_set allowed_query_names; + if (!param.query_list.empty()) { + std::ifstream filter_list(param.query_list); + std::string name; + while (getline(filter_list, name)) { + allowed_query_names.insert(name); + } + } + // kind've expensive if the fasta index is not available for the query sequences, // but it can help people know how long we're going to take uint64_t total_seqs = 0; @@ -288,6 +298,7 @@ namespace skch while (std::getline(in, line)) { auto line_split = CommonFunc::split(line, '\t'); // if we have a param.target_prefix and the sequence name matches, skip it + // XXXXX TODO this must be updated to filter query sequences if (param.skip_self && param.target_prefix != "" && line_split[0].substr(0, param.target_prefix.size()) == param.target_prefix) { @@ -299,6 +310,7 @@ namespace skch } else { // if not, warn that this is expensive std::cerr << "[mashmap::skch::Map::mapQuery] WARNING, no .fai index found for " << fileName << ", reading the file to sum sequence length (slow)" << std::endl; + // XXXXX TODO this must be updated to filter query sequences seqiter::for_each_seq_in_file( fileName, {}, "", [&](const std::string& seq_name, diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index a8da7f6a..3f6c57b8 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -67,6 +67,8 @@ struct Parameters bool filterLengthMismatches; //true if filtering out length mismatches float kmerComplexityThreshold; //minimum kmer complexity to consider (default 0) + std::string query_list; // file containing list of query sequence names + std::string query_prefix; // prefix for query sequences to use int sketchSize; bool use_spaced_seeds; // diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 44a8fe44..1bd960cf 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -156,7 +156,7 @@ namespace skch allowed_target_names.insert(name); } } - + //sequence counter while parsing file seqno_t seqCounter = 0; From 2fb5e1b3782740062ee4eb456deb21bb47f82f84 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 23 May 2024 15:28:31 -0500 Subject: [PATCH 02/11] query filtering --- src/common/seqiter.hpp | 46 ++++++++++++++++++++ src/map/include/computeMap.hpp | 76 +++++++++++++++------------------- 2 files changed, 80 insertions(+), 42 deletions(-) diff --git a/src/common/seqiter.hpp b/src/common/seqiter.hpp index 1c0c62fd..b2fd558a 100644 --- a/src/common/seqiter.hpp +++ b/src/common/seqiter.hpp @@ -106,4 +106,50 @@ void for_each_seq_in_file( } 
} +void for_each_seq_in_file( + faidx_t* fai, + const std::vector& seq_names, + const std::function& func) { + for (const auto& seq_name : seq_names) { + int len; + char* seq = fai_fetch(fai, seq_name.c_str(), &len); + if (seq != nullptr) { + func(seq_name, std::string(seq)); + free(seq); + } + } +} + +void for_each_seq_in_file_filtered( + const std::string& filename, + const std::string& query_prefix, + const std::unordered_set& query_list, + const std::function& func) { + faidx_t* fai = fai_load(filename.c_str()); + if (fai == nullptr) { + std::cerr << "Error: Failed to load FASTA index for file " << filename << std::endl; + return; + } + + std::vector query_seq_names; + int num_seqs = faidx_nseq(fai); + for (int i = 0; i < num_seqs; i++) { + const char* seq_name = faidx_iseq(fai, i); + if (!query_prefix.empty() && strncmp(seq_name, query_prefix.c_str(), query_prefix.size()) != 0) { + continue; + } + if (!query_list.empty() && query_list.count(seq_name) == 0) { + continue; + } + query_seq_names.push_back(seq_name); + } + + for_each_seq_in_file( + fai, + query_seq_names, + func); + + fai_destroy(fai); +} + } // namespace seqiter diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 955a9502..6c3c8486 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1,3 +1,4 @@ + /** * @file computeMap.hpp * @brief implements the sequence mapping logic @@ -284,50 +285,39 @@ namespace skch } } - // kind've expensive if the fasta index is not available for the query sequences, - // but it can help people know how long we're going to take - uint64_t total_seqs = 0; - uint64_t total_seq_length = 0; - for(const auto &fileName : param.querySequences) { - // check if there is a .fai - std::string fai_name = fileName + ".fai"; - if (fs::exists(fai_name)) { - // if so, process the .fai to determine our sequence length - std::string line; - std::ifstream in(fai_name.c_str()); + // Count the total number of sequences and sequence length + uint64_t total_seqs = 0; + uint64_t total_seq_length = 0; + for (const auto& fileName : param.querySequences) { + // Check if there is a .fai file + std::string fai_name = fileName + ".fai"; + if (fs::exists(fai_name)) { + std::string line; + std::ifstream in(fai_name.c_str()); while (std::getline(in, line)) { auto line_split = CommonFunc::split(line, '\t'); // if we have a param.target_prefix and the sequence name matches, skip it - // XXXXX TODO this must be updated to filter query sequences - if (param.skip_self - && param.target_prefix != "" - && line_split[0].substr(0, param.target_prefix.size()) == param.target_prefix) { - continue; + auto seq_name = line_split[0]; + if (!allowed_query_names.empty() && allowed_query_names.find(seq_name) != allowed_query_names.end() + || !param.query_prefix.empty() && seq_name.substr(0, param.query_prefix.size()) == param.query_prefix) { + total_seqs++; + total_seq_length += std::stoul(line_split[1]); } - ++total_seqs; - total_seq_length += std::stoul(line_split[1]); - } - } else { - // if not, warn that this is expensive - std::cerr << "[mashmap::skch::Map::mapQuery] WARNING, no .fai index found for " << fileName << ", reading the file to sum sequence length (slow)" << std::endl; - // XXXXX TODO this must be updated to filter query sequences - seqiter::for_each_seq_in_file( - fileName, {}, "", - [&](const std::string& seq_name, - const std::string& seq) { - // if we have a param.target_prefix and the sequence name matches, skip it - if (param.skip_self - && param.target_prefix != 
"" - && seq_name.substr(0, param.target_prefix.size()) == param.target_prefix) { - // skip - } else { - ++total_seqs; - total_seq_length += seq.size(); - } - }); - } - } - + } + } else { + // If .fai file doesn't exist, warn and use the for_each_seq_in_file_filtered function + std::cerr << "[mashmap::skch::Map::mapQuery] WARNING, no .fai index found for " << fileName << ", reading the file to filter query sequences (slow)" << std::endl; + seqiter::for_each_seq_in_file_filtered( + fileName, + param.query_prefix, + allowed_query_names, + [&](const std::string& seq_name, const std::string& seq) { + ++total_seqs; + total_seq_length += seq.size(); + }); + } + } + progress_meter::ProgressMeter progress(total_seq_length, "[mashmap::skch::Map::mapQuery] mapped"); for(const auto &fileName : param.querySequences) @@ -337,8 +327,10 @@ namespace skch std::cerr << "[mashmap::skch::Map::mapQuery] mapping reads in " << fileName << std::endl; #endif - seqiter::for_each_seq_in_file( - fileName, {}, "", + seqiter::for_each_seq_in_file_filtered( + fileName, + param.query_prefix, + allowed_query_names, [&](const std::string& seq_name, const std::string& seq) { // todo: offset_t is an 32-bit integer, which could cause problems From a6f15f147fd731ff029cf48bf08f3f0420a0f839 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 23 May 2024 15:54:30 -0500 Subject: [PATCH 03/11] list of query prefixes now possible --- src/common/seqiter.hpp | 11 +++++++++-- src/interface/parse_args.hpp | 18 +++++++++--------- src/map/include/computeMap.hpp | 9 ++++++++- src/map/include/map_parameters.hpp | 2 +- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/common/seqiter.hpp b/src/common/seqiter.hpp index b2fd558a..9aa38c09 100644 --- a/src/common/seqiter.hpp +++ b/src/common/seqiter.hpp @@ -122,7 +122,7 @@ void for_each_seq_in_file( void for_each_seq_in_file_filtered( const std::string& filename, - const std::string& query_prefix, + const std::vector& query_prefix, const std::unordered_set& query_list, const std::function& func) { faidx_t* fai = fai_load(filename.c_str()); @@ -135,7 +135,14 @@ void for_each_seq_in_file_filtered( int num_seqs = faidx_nseq(fai); for (int i = 0; i < num_seqs; i++) { const char* seq_name = faidx_iseq(fai, i); - if (!query_prefix.empty() && strncmp(seq_name, query_prefix.c_str(), query_prefix.size()) != 0) { + bool prefix_skip = true; + for (const auto& prefix : query_prefix) { + if (strncmp(seq_name, prefix.c_str(), prefix.size()) == 0) { + prefix_skip = false; + break; + } + } + if (!query_prefix.empty() && prefix_skip) { continue; } if (!query_list.empty() && query_list.count(seq_name) == 0) { diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 754b985e..f48c4d89 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -82,7 +82,7 @@ void parse_args(int argc, args::ValueFlag skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"}); args::ValueFlag target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"}); args::ValueFlag target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"}); - args::ValueFlag query_prefix(mapping_opts, "pfx", "use only queries whose names start with this prefix", {'Q', "query-prefix"}); + args::ValueFlag query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with 
these prefixes (comma delimited)", {'Q', "query-prefix"}); args::ValueFlag query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"}); args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"}); args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"}); @@ -104,19 +104,19 @@ void parse_args(int argc, args::Group alignment_opts(parser, "[ Alignment Options ]"); args::ValueFlag align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"}); args::Flag invert_filtering(alignment_opts, "A", "if an input PAF is specified, remove alignments with gap-compressed identity below --map-pct-id x 0.8, else keep all alignments " - "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]", + "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]", {'O', "invert-filtering"}); args::ValueFlag wflambda_segment_length(alignment_opts, "N", "wflambda segment length: size (in bp) of segment mapped in hierarchical WFA problem [default: 256]", {'W', "wflamda-segment"}); args::ValueFlag wfa_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]", - {"wfa-params"}); + "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]", + {"wfa-params"}); args::ValueFlag wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2", - "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]", - {"wfa-patching-params"}); + "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]", + {"wfa-patching-params"}); //wflign parameters args::ValueFlag wflign_score_params(alignment_opts, "mismatch,gap1,ext1", - "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]", - {"wflign-params"}); + "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]", + {"wflign-params"}); args::ValueFlag wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"}); args::ValueFlag wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"}); args::ValueFlag wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-distance"}); @@ -210,7 +210,7 @@ void parse_args(int argc, } if (query_prefix) { - map_parameters.query_prefix = args::get(query_prefix); + map_parameters.query_prefix = skch::CommonFunc::split(args::get(query_prefix), ','); } if (target_sequence_file) { diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 6c3c8486..1b7d42aa 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -298,8 +298,15 @@ namespace skch auto line_split = CommonFunc::split(line, '\t'); // if we have a param.target_prefix and the sequence name matches, skip it auto seq_name = 
line_split[0]; + bool prefix_skip = true; + for (const auto& prefix : param.query_prefix) { + if (seq_name.substr(0, prefix.size()) == prefix) { + prefix_skip = false; + break; + } + } if (!allowed_query_names.empty() && allowed_query_names.find(seq_name) != allowed_query_names.end() - || !param.query_prefix.empty() && seq_name.substr(0, param.query_prefix.size()) == param.query_prefix) { + || !param.query_prefix.empty() && !prefix_skip) { total_seqs++; total_seq_length += std::stoul(line_split[1]); } diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index 3f6c57b8..8bd5f8ec 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -68,7 +68,7 @@ struct Parameters float kmerComplexityThreshold; //minimum kmer complexity to consider (default 0) std::string query_list; // file containing list of query sequence names - std::string query_prefix; // prefix for query sequences to use + std::vector query_prefix; // prefix for query sequences to use int sketchSize; bool use_spaced_seeds; // From 20897ac43578a4ddff8d5dba297697f1cc62ffea Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 15:38:08 -0500 Subject: [PATCH 04/11] add basic script to write jobs for all to all alignment or mapping --- scripts/all2all_jobs.py | 74 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 scripts/all2all_jobs.py diff --git a/scripts/all2all_jobs.py b/scripts/all2all_jobs.py new file mode 100644 index 00000000..80d117a0 --- /dev/null +++ b/scripts/all2all_jobs.py @@ -0,0 +1,74 @@ +import argparse +import gzip +import itertools + +def parse_fasta_index(fasta_file): + fai_file = fasta_file + '.fai' + sequences = [] + with open(fai_file, 'r') as file: + for line in file: + sequence_name = line.strip().split('\t')[0] + sequences.append(sequence_name) + return sequences + +def group_sequences(sequences, grouping): + grouped_sequences = {} + for sequence in sequences: + if '#' in sequence: + fields = sequence.split('#') + if grouping == 'genome': + group_key = fields[0] + elif grouping == 'haplotype': + group_key = '#'.join(fields[:2]) + else: + raise ValueError(f"Invalid grouping: {grouping}") + else: + group_key = sequence + + if group_key not in grouped_sequences: + grouped_sequences[group_key] = [] + grouped_sequences[group_key].append(sequence) + + return list(grouped_sequences.values()) + +def generate_pairings(grouped_sequences, num_queries): + pairings = [] + for target_group in grouped_sequences: + for query_chunk in itertools.combinations(grouped_sequences, num_queries): + query_chunk = [seq for group in query_chunk for seq in group if seq not in target_group] + pairings.append((target_group, query_chunk)) + return pairings + +def main(): + parser = argparse.ArgumentParser(description='Generate pairings for all-to-all alignment using PanSN format.') + parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)') + parser.add_argument('--num-queries', type=int, default=5, help='Number of query groups per target group (default: 5)') + parser.add_argument('--grouping', choices=['genome', 'haplotype'], default='haplotype', help='Grouping level: genome or haplotype (default: haplotype)') + parser.add_argument('--output', help='Output file to save the pairings') + + args = parser.parse_args() + + # Parse the FASTA index file + sequences = parse_fasta_index(args.fasta_file) + + # Group sequences based on the specified grouping level + grouped_sequences = 
group_sequences(sequences, args.grouping) + + # Generate pairings + pairings = generate_pairings(grouped_sequences, args.num_queries) + + # Save or print the pairings + if args.output: + with open(args.output, 'w') as file: + for target_group, query_groups in pairings: + file.write(f"Target Group: {', '.join(target_group)}\n") + file.write(f"Query Groups: {', '.join([seq for group in query_groups for seq in group])}\n") + file.write("\n") + else: + for target_group, query_groups in pairings: + print(f"Target Group: {', '.join(target_group)}") + print(f"Query Groups: {', '.join([seq for group in query_groups for seq in group])}") + print() + +if __name__ == '__main__': + main() From fe693e7876b0009edd21533b2dbe7b9d1f96ebc7 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 16:14:05 -0500 Subject: [PATCH 05/11] now sort of correct --- scripts/all2all_jobs.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/scripts/all2all_jobs.py b/scripts/all2all_jobs.py index 80d117a0..143c6f50 100644 --- a/scripts/all2all_jobs.py +++ b/scripts/all2all_jobs.py @@ -29,20 +29,27 @@ def group_sequences(sequences, grouping): grouped_sequences[group_key] = [] grouped_sequences[group_key].append(sequence) - return list(grouped_sequences.values()) + return grouped_sequences def generate_pairings(grouped_sequences, num_queries): pairings = [] - for target_group in grouped_sequences: - for query_chunk in itertools.combinations(grouped_sequences, num_queries): - query_chunk = [seq for group in query_chunk for seq in group if seq not in target_group] + groups = list(grouped_sequences.keys()) + num_groups = len(groups) + + for i in range(num_groups): + target_group = groups[i] + query_groups = [group for j, group in enumerate(groups) if j != i] + + for query_chunk in itertools.zip_longest(*[iter(query_groups)] * num_queries): + query_chunk = [q for q in query_chunk if q is not None] pairings.append((target_group, query_chunk)) + return pairings def main(): parser = argparse.ArgumentParser(description='Generate pairings for all-to-all alignment using PanSN format.') parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)') - parser.add_argument('--num-queries', type=int, default=5, help='Number of query groups per target group (default: 5)') + parser.add_argument('--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)') parser.add_argument('--grouping', choices=['genome', 'haplotype'], default='haplotype', help='Grouping level: genome or haplotype (default: haplotype)') parser.add_argument('--output', help='Output file to save the pairings') @@ -61,14 +68,10 @@ def main(): if args.output: with open(args.output, 'w') as file: for target_group, query_groups in pairings: - file.write(f"Target Group: {', '.join(target_group)}\n") - file.write(f"Query Groups: {', '.join([seq for group in query_groups for seq in group])}\n") - file.write("\n") + file.write(f"{target_group},{','.join(query_groups)}\n") else: for target_group, query_groups in pairings: - print(f"Target Group: {', '.join(target_group)}") - print(f"Query Groups: {', '.join([seq for group in query_groups for seq in group])}") - print() + print(f"{target_group},{','.join(query_groups)}") if __name__ == '__main__': - main() + main() \ No newline at end of file From 5792920061869a54a1d7f2f15abb121432274f98 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 16:39:46 -0500 Subject: [PATCH 06/11] update all to all generation 
script --- scripts/all2all_jobs.py | 59 +++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/scripts/all2all_jobs.py b/scripts/all2all_jobs.py index 143c6f50..37a8b901 100644 --- a/scripts/all2all_jobs.py +++ b/scripts/all2all_jobs.py @@ -16,10 +16,12 @@ def group_sequences(sequences, grouping): for sequence in sequences: if '#' in sequence: fields = sequence.split('#') - if grouping == 'genome': + if grouping in ['g', 'genome']: group_key = fields[0] - elif grouping == 'haplotype': + elif grouping in ['h', 'haplotype']: group_key = '#'.join(fields[:2]) + elif grouping in ['c', 'contig']: + group_key = sequence else: raise ValueError(f"Invalid grouping: {grouping}") else: @@ -31,47 +33,58 @@ def group_sequences(sequences, grouping): return grouped_sequences -def generate_pairings(grouped_sequences, num_queries): +def generate_pairings(target_grouped_sequences, query_grouped_sequences, num_queries): pairings = [] - groups = list(grouped_sequences.keys()) - num_groups = len(groups) + target_groups = list(target_grouped_sequences.keys()) + query_groups = list(query_grouped_sequences.keys()) - for i in range(num_groups): - target_group = groups[i] - query_groups = [group for j, group in enumerate(groups) if j != i] + for target_group in target_groups: + query_pool = [group for group in query_groups if group != target_group] - for query_chunk in itertools.zip_longest(*[iter(query_groups)] * num_queries): + for query_chunk in itertools.zip_longest(*[iter(query_pool)] * num_queries): query_chunk = [q for q in query_chunk if q is not None] pairings.append((target_group, query_chunk)) return pairings def main(): - parser = argparse.ArgumentParser(description='Generate pairings for all-to-all alignment using PanSN format.') + parser = argparse.ArgumentParser(description='Generate pairings or wfmash command lines for all-to-all alignment using PanSN format.') parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)') - parser.add_argument('--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)') - parser.add_argument('--grouping', choices=['genome', 'haplotype'], default='haplotype', help='Grouping level: genome or haplotype (default: haplotype)') - parser.add_argument('--output', help='Output file to save the pairings') + parser.add_argument('-n', '--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)') + parser.add_argument('-t', '--target-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for targets: g/genome, h/haplotype, or c/contig (default: haplotype)') + parser.add_argument('-q', '--query-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for queries: g/genome, h/haplotype, or c/contig (default: haplotype)') + parser.add_argument('-o', '--output', help='Output file to save the pairings or command lines') - args = parser.parse_args() + args, wfmash_args = parser.parse_known_args() # Parse the FASTA index file sequences = parse_fasta_index(args.fasta_file) - # Group sequences based on the specified grouping level - grouped_sequences = group_sequences(sequences, args.grouping) + # Group sequences based on the specified grouping levels + target_grouped_sequences = group_sequences(sequences, args.target_grouping) + query_grouped_sequences = group_sequences(sequences, args.query_grouping) # Generate pairings - pairings = 
generate_pairings(grouped_sequences, args.num_queries) + pairings = generate_pairings(target_grouped_sequences, query_grouped_sequences, args.num_queries) - # Save or print the pairings - if args.output: - with open(args.output, 'w') as file: + # Save or print the pairings or command lines + if wfmash_args: + wfmash_options = ' '.join(wfmash_args) + if args.output: + with open(args.output, 'w') as file: + for target_group, query_groups in pairings: + file.write(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}\n") + else: for target_group, query_groups in pairings: - file.write(f"{target_group},{','.join(query_groups)}\n") + print(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}") else: - for target_group, query_groups in pairings: - print(f"{target_group},{','.join(query_groups)}") + if args.output: + with open(args.output, 'w') as file: + for target_group, query_groups in pairings: + file.write(f"{target_group}\t{','.join(query_groups)}\n") + else: + for target_group, query_groups in pairings: + print(f"{target_group}\t{','.join(query_groups)}") if __name__ == '__main__': main() \ No newline at end of file From dc5e4db91c2ddf6b400dce9cb63b8839144448bd Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 16:54:22 -0500 Subject: [PATCH 07/11] run PR tests on main --- .github/workflows/test_on_push.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index 01362c67..c13a1907 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -1,10 +1,10 @@ on: push: - branches: [ master ] + branches: [ main ] paths-ignore: - '**/*.md' pull_request: - branches: [ master ] + branches: [ main ] paths-ignore: - '**/*.md' From 57769c4cd441bc9dc16b41d17e55f9565dc91776 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 17:00:18 -0500 Subject: [PATCH 08/11] add a way to make the source tarball --- scripts/make_source_targball.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 scripts/make_source_targball.sh diff --git a/scripts/make_source_targball.sh b/scripts/make_source_targball.sh new file mode 100644 index 00000000..313d2f92 --- /dev/null +++ b/scripts/make_source_targball.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# run from root of the repository + +mkdir source-tarball +cd source-tarball +git clone --recursive https://github.com/waveygang/wfmash +cd wfmash +git fetch --tags origin +LATEST_TAG="$(git describe --tags `git rev-list --tags --max-count=1`)" +git checkout "${LATEST_TAG}" +git submodule update --init --recursive +bash scripts/generate_git_version.sh $PWD/src +sed 's/execute_process(COMMAND bash/#execute_process(COMMAND bash/g' CMakeLists.txt -i +rm -Rf .git +find src/common -name ".git" -exec rm -Rf "{}" \; +cd .. 
+mv wfmash "wfmash-${LATEST_TAG}" +tar -czf "wfmash-${LATEST_TAG}.tar.gz" "wfmash-${LATEST_TAG}" +rm -Rf "wfmash-${LATEST_TAG}" From 964eeb2ad598cb25c558a9b22ba567a95b6ab7dd Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 17:02:32 -0500 Subject: [PATCH 09/11] use G rather than T for the tsv output from wflign --- .github/workflows/test_on_push.yml | 4 ++-- src/interface/parse_args.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index c13a1907..c117960e 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -38,9 +38,9 @@ jobs: - name: Test mapping coverage with 8 yeast genomes (PAF output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y '#' > scerevisiae8.paf; scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92 - name: Test mapping+alignment with a subset of the LPA dataset (PAF output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -T wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -G wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv - name: Test mapping+alignment with a subset of the LPA dataset (SAM output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -T wflign_info. -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -G wflign_info. 
-L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 - name: Test mapping+alignment with short reads (500 bps) to a reference (SAM output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head - name: Test mapping+alignment with short reads (255bps) (PAF output) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index f48c4d89..49090124 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -141,7 +141,7 @@ void parse_args(int argc, #ifdef WFA_PNG_TSV_TIMING args::Group debugging_opts(parser, "[ Debugging Options ]"); - args::ValueFlag prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'T', "tsv"}); + args::ValueFlag prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'G', "tsv"}); args::ValueFlag prefix_wavefront_plot_in_png(parser, "PREFIX", " write wavefronts' plot for each alignment in PNG format files with this PREFIX", {'u', "prefix-png"}); args::ValueFlag wfplot_max_size(parser, "N", "max size of the wfplot [default: 1500]", {'z', "wfplot-max-size"}); args::ValueFlag path_patching_info_in_tsv(parser, "FILE", " write patching information for each alignment in TSV format in FILE", {"path-patching-tsv"}); From 13931c657255166277edd5cb9d16cd3020402f34 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 25 May 2024 17:33:37 -0500 Subject: [PATCH 10/11] try a workaround --- .github/workflows/test_on_push.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index c117960e..2a9c8273 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -38,9 +38,9 @@ jobs: - name: Test mapping coverage with 8 yeast genomes (PAF output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y '#' > scerevisiae8.paf; scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92 - name: Test mapping+alignment with a subset of the LPA dataset (PAF output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -G wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -L > LPA.subset.paf && head LPA.subset.paf - name: Test mapping+alignment with a subset of the LPA dataset (SAM output) - run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -G wflign_info. 
-L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 + run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9 - name: Test mapping+alignment with short reads (500 bps) to a reference (SAM output) run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head - name: Test mapping+alignment with short reads (255bps) (PAF output) From 76d19caf9682bd4702c2b3e39ca8b95eb520dc3a Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 27 May 2024 17:43:55 -0500 Subject: [PATCH 11/11] fix `heap-use-after-free` error --- src/common/wflign/src/wflign.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index 67cd7bf9..84144896 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -695,6 +695,7 @@ void WFlign::wflign_affine_wavefront( // Free delete wflambda_aligner; + delete wf_aligner; #ifdef WFA_PNG_TSV_TIMING if (extend_data.emit_png) { @@ -972,9 +973,6 @@ void WFlign::wflign_affine_wavefront( } if (merge_alignments) { - // Free old aligner - delete wf_aligner; - // use biWFA for all patching wfa::WFAlignerGapAffine2Pieces* wf_aligner = new wfa::WFAlignerGapAffine2Pieces( @@ -1029,7 +1027,9 @@ void WFlign::wflign_affine_wavefront( emit_patching_tsv, out_patching_tsv #endif - ); + ); + + delete wf_aligner; } else { // todo old implementation (and SAM format is not supported) for (auto x = trace.rbegin(); x != trace.rend(); ++x) { @@ -1050,9 +1050,6 @@ void WFlign::wflign_affine_wavefront( } } } - - // Free - delete wf_aligner; } }
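
Usage sketch for the query/target selection options and the all-to-all job generator introduced in this series. This is an illustrative example rather than part of the patches: the input name assembly.fa.gz and the chosen parameter values are assumptions, while the flags (-T/--target-prefix, -R/--target-list, -Q/--query-prefix, -A/--query-list) and the scripts/all2all_jobs.py interface are taken from the diffs above.

    # ensure a .fai index exists; both wfmash and the job generator read sequence names from it
    samtools faidx assembly.fa.gz

    # all-vs-all within one file, restricting the target and query sets by PanSN prefix
    wfmash assembly.fa.gz -p 95 -m -X \
        -T 'sampleA#1' -Q 'sampleB#1,sampleC#1' > part.paf

    # or restrict queries to an explicit list of sequence names
    wfmash assembly.fa.gz -p 95 -m -X -A query_names.txt > part.paf

    # write one "target-group<TAB>query-groups" line per job, four query groups at a time;
    # any arguments the script does not recognize are instead forwarded into generated
    # "wfmash ... -T <target> -Q <queries>" command lines
    python3 scripts/all2all_jobs.py assembly.fa.gz -n 4 -t haplotype -q haplotype -o all2all.jobs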