Merge pull request #242 from waveygang/specify-queries

Specify queries
waveygang · May 28, 2024 · 517e1bc · 517e1bc
2 parents 251f4e1 + 76d19ca
commit 517e1bc
Show file tree

Hide file tree

Showing 9 changed files with 248 additions and 69 deletions.
diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml
@@ -1,10 +1,10 @@
 on:
   push:
-    branches: [ master ]
+    branches: [ main ]
     paths-ignore:
       - '**/*.md'
   pull_request:
-    branches: [ master ]
+    branches: [ main ]
     paths-ignore:
       - '**/*.md'
 
@@ -38,9 +38,9 @@ jobs:
       - name: Test mapping coverage with 8 yeast genomes (PAF output)
         run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y '#' > scerevisiae8.paf; scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92
       - name: Test mapping+alignment with a subset of the LPA dataset (PAF output)
-        run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -T wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv
+        run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -L > LPA.subset.paf && head LPA.subset.paf
       - name: Test mapping+alignment with a subset of the LPA dataset (SAM output)
-        run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -T wflign_info. -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9
+        run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9
       - name: Test mapping+alignment with short reads (500 bps) to a reference (SAM output)
         run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head
       - name: Test mapping+alignment with short reads (255bps) (PAF output)

diff --git a/scripts/all2all_jobs.py b/scripts/all2all_jobs.py
@@ -0,0 +1,90 @@
+import argparse
+import gzip
+import itertools
+
+def parse_fasta_index(fasta_file):
+    """Return the sequence names listed in the FASTA index of *fasta_file*.
+
+    The index is read from ``fasta_file + '.fai'``; the name is the first
+    tab-separated column of each (whitespace-stripped) line.  Blank lines,
+    e.g. a trailing empty line, are skipped so they never produce an empty
+    sequence name.
+
+    Args:
+        fasta_file: Path to the FASTA file whose ``.fai`` index is read.
+
+    Returns:
+        A list of sequence names in index order.
+
+    Raises:
+        OSError: If the ``.fai`` file cannot be opened.
+    """
+    fai_file = fasta_file + '.fai'
+    sequences = []
+    with open(fai_file, 'r') as file:
+        for line in file:
+            record = line.strip()
+            if not record:
+                # An empty record would otherwise become the name ''.
+                continue
+            sequences.append(record.split('\t')[0])
+    return sequences
+
+def group_sequences(sequences, grouping):
+    """Group PanSN-style sequence names by genome, haplotype or contig.
+
+    Names are split on ``#``.  The group key is the first field for genome
+    grouping, the first two fields joined by ``#`` for haplotype grouping,
+    and the full name for contig grouping.  A name without ``#`` always
+    forms its own group.
+
+    Args:
+        sequences: Iterable of sequence names.
+        grouping: One of ``g``/``genome``, ``h``/``haplotype`` or
+            ``c``/``contig``.
+
+    Returns:
+        A dict mapping each group key to the list of its sequence names, in
+        first-seen order.
+
+    Raises:
+        ValueError: If *grouping* is not one of the accepted values.  This is
+            checked up front, so it no longer depends on whether any name
+            happens to contain ``#``.
+    """
+    if grouping in ('g', 'genome'):
+        key_fields = 1
+    elif grouping in ('h', 'haplotype'):
+        key_fields = 2
+    elif grouping in ('c', 'contig'):
+        key_fields = None  # the whole name is the key
+    else:
+        raise ValueError(f"Invalid grouping: {grouping}")
+
+    grouped_sequences = {}
+    for sequence in sequences:
+        if key_fields is None or '#' not in sequence:
+            group_key = sequence
+        else:
+            group_key = '#'.join(sequence.split('#')[:key_fields])
+        grouped_sequences.setdefault(group_key, []).append(sequence)
+
+    return grouped_sequences
+
+def generate_pairings(target_grouped_sequences, query_grouped_sequences, num_queries):
+    """Pair every target group with chunks of the other query groups.
+
+    For each target group, the query groups whose key differs from the
+    target key are split, in order, into chunks of at most *num_queries*
+    groups; one ``(target_group, query_chunk)`` pairing is emitted per chunk.
+    A target with no other query group yields no pairing.
+
+    Args:
+        target_grouped_sequences: Mapping whose keys are the target groups.
+        query_grouped_sequences: Mapping whose keys are the query groups.
+        num_queries: Maximum number of query groups per pairing (>= 1).
+
+    Returns:
+        A list of ``(target_group, [query_group, ...])`` tuples.
+
+    Raises:
+        ValueError: If *num_queries* is smaller than 1.  Such a value used to
+            produce an empty job list without any diagnostic.
+    """
+    if num_queries < 1:
+        raise ValueError(f"num_queries must be a positive integer, got {num_queries}")
+
+    query_groups = list(query_grouped_sequences)
+    pairings = []
+    for target_group in target_grouped_sequences:
+        query_pool = [group for group in query_groups if group != target_group]
+        for start in range(0, len(query_pool), num_queries):
+            pairings.append((target_group, query_pool[start:start + num_queries]))
+
+    return pairings
+
+def main():
+    """Command-line entry point: emit all-to-all pairings or wfmash commands.
+
+    The first positional argument is the FASTA file whose ``.fai`` index is
+    read.  Arguments not recognised by this script are treated as wfmash
+    options: when any are present, one ``wfmash ... -T <target> -Q <queries>``
+    command line is produced per pairing; otherwise one
+    ``<target>\\t<queries>`` line is produced per pairing.  Output goes to the
+    file given with ``-o/--output`` or to standard output.
+
+    NOTE(review): the FASTA path is consumed by this script and is not
+    inserted into the generated commands -- confirm whether callers are
+    expected to repeat it among the pass-through arguments.
+    """
+    import shlex
+
+    parser = argparse.ArgumentParser(description='Generate pairings or wfmash command lines for all-to-all alignment using PanSN format.')
+    parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)')
+    parser.add_argument('-n', '--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)')
+    parser.add_argument('-t', '--target-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for targets: g/genome, h/haplotype, or c/contig (default: haplotype)')
+    parser.add_argument('-q', '--query-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for queries: g/genome, h/haplotype, or c/contig (default: haplotype)')
+    parser.add_argument('-o', '--output', help='Output file to save the pairings or command lines')
+
+    # Anything argparse does not know is passed through to wfmash.
+    args, wfmash_args = parser.parse_known_args()
+
+    # Parse the FASTA index file
+    sequences = parse_fasta_index(args.fasta_file)
+
+    # Group sequences based on the specified grouping levels
+    target_grouped_sequences = group_sequences(sequences, args.target_grouping)
+    query_grouped_sequences = group_sequences(sequences, args.query_grouping)
+
+    # Generate pairings
+    pairings = generate_pairings(target_grouped_sequences, query_grouped_sequences, args.num_queries)
+
+    # Render one line per pairing: a wfmash command or a plain TSV record.
+    if wfmash_args:
+        # The invoking shell already stripped the user's quoting, so re-quote
+        # each argument; otherwise e.g. -Y '#' would be written as "-Y #" and
+        # the remainder of the generated command would be read as a comment.
+        wfmash_options = ' '.join(shlex.quote(arg) for arg in wfmash_args)
+        lines = [
+            f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}"
+            for target_group, query_groups in pairings
+        ]
+    else:
+        lines = [
+            f"{target_group}\t{','.join(query_groups)}"
+            for target_group, query_groups in pairings
+        ]
+
+    # Save or print the pairings or command lines
+    if args.output:
+        with open(args.output, 'w') as file:
+            file.writelines(f"{line}\n" for line in lines)
+    else:
+        for line in lines:
+            print(line)
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/make_source_targball.sh b/scripts/make_source_targball.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Build wfmash-<latest tag>.tar.gz inside ./source-tarball from a fresh
+# recursive clone of the upstream repository, with all git metadata removed.
+#
+# run from root of the repository
+
+# Stop at the first failing command. Without this, a failed mkdir/cd/clone
+# would let the "rm -Rf .git" below run in whatever directory we are still in,
+# i.e. the caller's own checkout.
+set -euo pipefail
+
+mkdir -p source-tarball
+cd source-tarball
+git clone --recursive https://github.com/waveygang/wfmash
+cd wfmash
+git fetch --tags origin
+# Most recently created tag reachable from any tagged commit.
+LATEST_TAG="$(git describe --tags "$(git rev-list --tags --max-count=1)")"
+git checkout "${LATEST_TAG}"
+git submodule update --init --recursive
+# Generate the version information while the git metadata is still present.
+bash scripts/generate_git_version.sh "$PWD/src"
+# Comment out the "execute_process(COMMAND bash" calls in CMakeLists.txt
+# (presumably the version generation, which cannot work without .git -- TODO confirm).
+sed -i 's/execute_process(COMMAND bash/#execute_process(COMMAND bash/g' CMakeLists.txt
+rm -Rf .git
+# -prune keeps find from descending into a .git directory it has just deleted,
+# which would make find exit non-zero.
+find src/common -name ".git" -prune -exec rm -Rf "{}" \;
+cd ..
+mv wfmash "wfmash-${LATEST_TAG}"
+tar -czf "wfmash-${LATEST_TAG}.tar.gz" "wfmash-${LATEST_TAG}"
+rm -Rf "wfmash-${LATEST_TAG}"
diff --git a/src/common/seqiter.hpp b/src/common/seqiter.hpp
@@ -106,4 +106,57 @@ void for_each_seq_in_file(
     }
 }
 
+// Fetch every sequence named in `seq_names` from the already loaded FASTA
+// index `fai`, in the given order, and call `func(name, sequence)` for each.
+// A sequence that cannot be fetched is reported on stderr and skipped.
+void for_each_seq_in_file(
+    faidx_t* fai,
+    const std::vector<std::string>& seq_names,
+    const std::function<void(const std::string&, const std::string&)>& func) {
+    for (const auto& seq_name : seq_names) {
+        int len = 0;  // out-parameter required by fai_fetch; not used otherwise
+        char* seq = fai_fetch(fai, seq_name.c_str(), &len);
+        if (seq == nullptr) {
+            // Do not drop a requested sequence without any trace.
+            std::cerr << "Warning: Failed to fetch sequence " << seq_name
+                      << " from the FASTA index" << std::endl;
+            continue;
+        }
+        // Copy into a std::string and release the C buffer before running the
+        // callback, so the buffer is not leaked if `func` throws and is not
+        // kept alive while `func` runs.
+        const std::string sequence(seq);
+        free(seq);
+        func(seq_name, sequence);
+    }
+}
+
+// Call `func(name, sequence)` for the sequences of the indexed FASTA file
+// `filename` that pass both name filters:
+//   - `query_prefix`: if non-empty, the name must start with one of the prefixes;
+//   - `query_list`:   if non-empty, the name must be a member of the set.
+// If the FASTA index cannot be loaded, an error is printed to stderr and the
+// function returns without calling `func`.
+void for_each_seq_in_file_filtered(
+    const std::string& filename,
+    const std::vector<std::string>& query_prefix,
+    const std::unordered_set<std::string>& query_list,
+    const std::function<void(const std::string&, const std::string&)>& func) {
+    faidx_t* fai = fai_load(filename.c_str());
+    if (fai == nullptr) {
+        std::cerr << "Error: Failed to load FASTA index for file " << filename << std::endl;
+        return;
+    }
+
+    // True when no prefixes were requested or `name` starts with one of them.
+    const auto passes_prefix_filter = [&query_prefix](const char* name) {
+        if (query_prefix.empty()) {
+            return true;
+        }
+        for (const auto& prefix : query_prefix) {
+            if (strncmp(name, prefix.c_str(), prefix.size()) == 0) {
+                return true;
+            }
+        }
+        return false;
+    };
+
+    // Collect the names to fetch, preserving index order.
+    std::vector<std::string> query_seq_names;
+    const int num_seqs = faidx_nseq(fai);
+    for (int i = 0; i < num_seqs; ++i) {
+        const char* seq_name = faidx_iseq(fai, i);
+        if (!passes_prefix_filter(seq_name)) {
+            continue;
+        }
+        const bool listed = query_list.empty() || query_list.count(seq_name) != 0;
+        if (listed) {
+            query_seq_names.push_back(seq_name);
+        }
+    }
+
+    for_each_seq_in_file(fai, query_seq_names, func);
+
+    fai_destroy(fai);
+}
+
 } // namespace seqiter
diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp
@@ -695,6 +695,7 @@ void WFlign::wflign_affine_wavefront(
 
         // Free
         delete wflambda_aligner;
+        delete wf_aligner;
 
 #ifdef WFA_PNG_TSV_TIMING
         if (extend_data.emit_png) {
@@ -972,9 +973,6 @@ void WFlign::wflign_affine_wavefront(
             }
 
             if (merge_alignments) {
-                // Free old aligner
-                delete wf_aligner;
-
                 // use biWFA for all patching
                 wfa::WFAlignerGapAffine2Pieces* wf_aligner =
                         new wfa::WFAlignerGapAffine2Pieces(
@@ -1029,7 +1027,9 @@ void WFlign::wflign_affine_wavefront(
                         emit_patching_tsv,
                         out_patching_tsv
 #endif
-                        );
+                );
+
+                delete wf_aligner;
             } else {
                 // todo old implementation (and SAM format is not supported)
                 for (auto x = trace.rbegin(); x != trace.rend(); ++x) {
@@ -1050,9 +1050,6 @@ void WFlign::wflign_affine_wavefront(
                 }
             }
         }
-
-        // Free
-        delete wf_aligner;
     }
 }
 

diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp
@@ -64,9 +64,9 @@ void parse_args(int argc,
     args::Group mandatory_opts(parser, "[ MANDATORY OPTIONS ]");
     args::Positional<std::string> target_sequence_file(mandatory_opts, "target", "alignment target/reference sequence file");
 
-    args::Group io_opts(parser, "[ Files IO Options ]");
-    args::PositionalList<std::string> query_sequence_files(io_opts, "queries", "query sequences file");
-    args::ValueFlag<std::string> query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"});
+	args::Group io_opts(parser, "[ Files IO Options ]");
+    args::PositionalList<std::string> query_sequence_files(io_opts, "queries", "query sequence file(s)");
+    //args::ValueFlag<std::string> query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"});
 
     args::Group mapping_opts(parser, "[ Mapping Options ]");
     args::ValueFlag<float> map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"});
@@ -80,8 +80,10 @@ void parse_args(int argc,
     args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
     args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"});
     args::ValueFlag<char> skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"});
-	args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose name starts with this prefix", {'P', "target-prefix"});
-	args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'A', "target-list"});
+	args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"});
+	args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"});
+	args::ValueFlag<std::string> query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with these prefixes (comma delimited)", {'Q', "query-prefix"});
+	args::ValueFlag<std::string> query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"});
     args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"});
     args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"});
     args::ValueFlag<std::string> chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 4*segment_length, up to 20k]", {'c', "chain-gap"});
@@ -102,19 +104,19 @@ void parse_args(int argc,
     args::Group alignment_opts(parser, "[ Alignment Options ]");
     args::ValueFlag<std::string> align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"});
     args::Flag invert_filtering(alignment_opts, "A", "if an input PAF is specified, remove alignments with gap-compressed identity below --map-pct-id x 0.8, else keep all alignments "
-                                                   "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]",
+								"[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]",
                                 {'O', "invert-filtering"});
     args::ValueFlag<uint16_t> wflambda_segment_length(alignment_opts, "N", "wflambda segment length: size (in bp) of segment mapped in hierarchical WFA problem [default: 256]", {'W', "wflamda-segment"});
     args::ValueFlag<std::string> wfa_score_params(alignment_opts, "mismatch,gap1,ext1",
-                                            "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]",
-                                            {"wfa-params"});
+												  "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]",
+												  {"wfa-params"});
     args::ValueFlag<std::string> wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2",
-                                            "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]",
-                                            {"wfa-patching-params"});
+														   "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]",
+														   {"wfa-patching-params"});
     //wflign parameters
     args::ValueFlag<std::string> wflign_score_params(alignment_opts, "mismatch,gap1,ext1",
-                                                       "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]",
-                                                       {"wflign-params"});
+													 "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]",
+													 {"wflign-params"});
     args::ValueFlag<float> wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"});
     args::ValueFlag<int> wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"});
     args::ValueFlag<int> wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-distance"});
@@ -139,7 +141,7 @@ void parse_args(int argc,
 
 #ifdef WFA_PNG_TSV_TIMING
     args::Group debugging_opts(parser, "[ Debugging Options ]");
-    args::ValueFlag<std::string> prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'T', "tsv"});
+    args::ValueFlag<std::string> prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'G', "tsv"});
     args::ValueFlag<std::string> prefix_wavefront_plot_in_png(parser, "PREFIX", " write wavefronts' plot for each alignment in PNG format files with this PREFIX", {'u', "prefix-png"});
     args::ValueFlag<uint64_t> wfplot_max_size(parser, "N", "max size of the wfplot [default: 1500]", {'z', "wfplot-max-size"});
     args::ValueFlag<std::string> path_patching_info_in_tsv(parser, "FILE", " write patching information for each alignment in TSV format in FILE", {"path-patching-tsv"});
@@ -202,6 +204,14 @@ void parse_args(int argc,
 	if (target_prefix) {
 		map_parameters.target_prefix = args::get(target_prefix);
 	}
+
+	if (query_list) {
+		map_parameters.query_list = args::get(query_list);
+	}
+
+	if (query_prefix) {
+		map_parameters.query_prefix = skch::CommonFunc::split(args::get(query_prefix), ',');
+	}
 
     if (target_sequence_file) {
         map_parameters.refSequences.push_back(args::get(target_sequence_file));
@@ -215,10 +225,6 @@ void parse_args(int argc,
             align_parameters.querySequences.push_back(q);
         }
     }
-    if (query_sequence_file_list) {
-        skch::parseFileList(args::get(query_sequence_file_list), map_parameters.querySequences);
-        skch::parseFileList(args::get(query_sequence_file_list), align_parameters.querySequences);
-    }
 
 	if (target_sequence_file && map_parameters.querySequences.empty()
 		&& map_parameters.refSequences.size() == 1