From cc89be65d65de13dc28274b5eb32078dd61198ed Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 1 Jun 2024 15:10:47 -0500 Subject: [PATCH 1/2] optionally force global biwfa alignment of mappings --- src/align/include/align_parameters.hpp | 2 ++ src/align/include/computeAlignments.hpp | 1 + src/common/wflign/src/wflign.cpp | 5 ++++- src/common/wflign/src/wflign.hpp | 2 ++ src/interface/parse_args.hpp | 2 ++ 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/align/include/align_parameters.hpp b/src/align/include/align_parameters.hpp index bda1c1eb..67548c08 100644 --- a/src/align/include/align_parameters.hpp +++ b/src/align/include/align_parameters.hpp @@ -25,6 +25,8 @@ struct Parameters { //wflambda uint16_t wflambda_segment_length; //segment length for wflambda + bool force_biwfa_alignment; //force biwfa alignment + int wfa_mismatch_score; int wfa_gap_opening_score; int wfa_gap_extension_score; diff --git a/src/align/include/computeAlignments.hpp b/src/align/include/computeAlignments.hpp index 1500644f..b2971653 100644 --- a/src/align/include/computeAlignments.hpp +++ b/src/align/include/computeAlignments.hpp @@ -496,6 +496,7 @@ namespace align wflign::wavefront::WFlign* wflign = new wflign::wavefront::WFlign( param.wflambda_segment_length, param.min_identity, + param.force_biwfa_alignment, param.wfa_mismatch_score, param.wfa_gap_opening_score, param.wfa_gap_extension_score, diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index 84144896..20195364 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -42,6 +42,7 @@ void clean_up_sketches(std::vector*> &sketches) { WFlign::WFlign( const uint16_t segment_length, const float min_identity, + const bool force_biwfa_alignment, const int wfa_mismatch_score, const int wfa_gap_opening_score, const int wfa_gap_extension_score, @@ -66,6 +67,8 @@ WFlign::WFlign( this->segment_length = segment_length; this->min_identity = min_identity; + this->force_biwfa_alignment = force_biwfa_alignment; + this->wfa_mismatch_score = wfa_mismatch_score; this->wfa_gap_opening_score = wfa_gap_opening_score; this->wfa_gap_extension_score = wfa_gap_extension_score; @@ -436,7 +439,7 @@ void WFlign::wflign_affine_wavefront( #ifdef WFA_PNG_TSV_TIMING const auto start_time = std::chrono::steady_clock::now(); #endif - if ( + if (force_biwfa_alignment || (query_length <= segment_length * 8 || target_length <= segment_length * 8) || (mashmap_estimated_identity >= 0.99 && query_length <= MAX_LEN_FOR_STANDARD_WFA && target_length <= MAX_LEN_FOR_STANDARD_WFA) diff --git a/src/common/wflign/src/wflign.hpp b/src/common/wflign/src/wflign.hpp index 5cb4149c..daec5770 100644 --- a/src/common/wflign/src/wflign.hpp +++ b/src/common/wflign/src/wflign.hpp @@ -91,10 +91,12 @@ namespace wflign { bool emit_md_tag; bool paf_format_else_sam; bool no_seq_in_sam; + bool force_biwfa_alignment; // Setup WFlign( const uint16_t segment_length, const float min_identity, + const bool force_biwfa_alignment, const int wfa_mismatch_score, const int wfa_gap_opening_score, const int wfa_gap_extension_score, diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index f21745e9..37e50069 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -102,6 +102,7 @@ void parse_args(int argc, args::Group alignment_opts(parser, "[ Alignment Options ]"); args::ValueFlag align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"}); + args::Flag force_biwfa_alignment(alignment_opts, "force-biwfa", "force alignment with biWFA for all sequence pairs", {'I', "force-biwfa"}); args::Flag invert_filtering(alignment_opts, "A", "if an input PAF is specified, remove alignments with gap-compressed identity below --map-pct-id x 0.8, else keep all alignments " "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]", {'O', "invert-filtering"}); @@ -365,6 +366,7 @@ void parse_args(int argc, align_parameters.emit_md_tag = args::get(emit_md_tag); align_parameters.sam_format = args::get(sam_format); align_parameters.no_seq_in_sam = args::get(no_seq_in_sam); + align_parameters.force_biwfa_alignment = args::get(force_biwfa_alignment); map_parameters.split = !args::get(no_split); map_parameters.dropRand = false;//ToFix: !args::get(keep_ties); align_parameters.split = !args::get(no_split); From 3e434f8e3d851041c0eb518cc24be91e2e5ed3f1 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 1 Jun 2024 15:54:45 -0500 Subject: [PATCH 2/2] script to make test cases for wflign --- scripts/make_align_test_case.sh | 78 +++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100755 scripts/make_align_test_case.sh diff --git a/scripts/make_align_test_case.sh b/scripts/make_align_test_case.sh new file mode 100755 index 00000000..6244595a --- /dev/null +++ b/scripts/make_align_test_case.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# Function to display usage +usage() { + echo "Usage: $0 -p -f -o " + echo " -p, --paf PAF file" + echo " -f, --fasta FASTA file" + echo " -o, --output Output prefix for the new FASTA and adjusted PAF files" + exit 1 +} + +# Parse command-line arguments +PARSED_ARGUMENTS=$(getopt -a -n "$0" -o p:f:o: --long paf:,fasta:,output: -- "$@") +VALID_ARGUMENTS=$? +if [ "$VALID_ARGUMENTS" != "0" ]; then + usage +fi + +eval set -- "$PARSED_ARGUMENTS" +while : +do + case "$1" in + -p | --paf) PAF_FILE="$2" ; shift 2 ;; + -f | --fasta) FASTA_FILE="$2" ; shift 2 ;; + -o | --output) OUTPUT_PREFIX="$2" ; shift 2 ;; + --) shift ; break ;; + *) usage ;; + esac +done + +# Validate arguments +if [ -z "$PAF_FILE" ] || [ -z "$FASTA_FILE" ] || [ -z "$OUTPUT_PREFIX" ]; then + usage +fi + +OUTPUT_FASTA="${OUTPUT_PREFIX}.fa" +OUTPUT_PAF="${OUTPUT_PREFIX}.paf" + +# Extract the first alignment from the PAF file +read -r line < "$PAF_FILE" + +# Parse the alignment fields +IFS=$'\t' read -r -a fields <<< "$line" + +# Extract query and target information +query_name="${fields[0]}" +query_start="${fields[2]}" +query_end="${fields[3]}" +target_name="${fields[5]}" +target_start="${fields[7]}" +target_end="${fields[8]}" + +# Extract sequences using samtools faidx +samtools faidx "$FASTA_FILE" "$query_name:$query_start-$query_end" > "$OUTPUT_FASTA" +samtools faidx "$FASTA_FILE" "$target_name:$target_start-$target_end" >> "$OUTPUT_FASTA" + +qname=$(grep ^">" "$OUTPUT_FASTA" | head -n 1 | sed 's/^>//') +tname=$(grep ^">" "$OUTPUT_FASTA" | tail -n 1 | sed 's/^>//') + +# Adjust the PAF file +awk -v qname=$qname \ + -v tname=$tname \ + '{ + if (NF > 0) { + query_length = $4 - $3; + target_length = $9 - $8; + $1 = qname; + $6 = tname; + $2 = query_length; + $3 = 0; + $4 = query_length; + $7 = target_length; + $8 = 0; + $9 = target_length; + print $0; + } + }' "$PAF_FILE" | tr ' ' '\t' > "$OUTPUT_PAF" +