From be24047e2b64d02a187824909b91d04bda6074d8 Mon Sep 17 00:00:00 2001
From: Michal Babinski <michal.babinski@theiagen.com>
Date: Wed, 9 Oct 2024 14:25:41 -0400
Subject: [PATCH] [TheiaCoV] Reorder flu segments from largest to smallest in
 irma task (#635)

* concat consensus fasta by segment

* Updated irma assembly info for reordered flu segments
---
 .../genomic_characterization/theiacov.md      |  4 ++--
 tasks/assembly/task_irma.wdl                  | 23 ++++++++++++++++---
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md
index f56dd285c..f877ba540 100644
--- a/docs/workflows/genomic_characterization/theiacov.md
+++ b/docs/workflows/genomic_characterization/theiacov.md
@@ -812,7 +812,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT
 
 ??? toggle "`irma`: Assembly and Characterization ==_for flu in TheiaCoV_Illumina_PE & TheiaCoV_ONT_=="
 
-    Cleaned reads are assembled using `irma` which does not use a reference due to the rapid evolution and high variability of influenza. `irma` also performs typing and subtyping as part of the assembly process.
+    Cleaned reads are assembled using `irma` which does not use a reference due to the rapid evolution and high variability of influenza. Assemblies produced by `irma` will be ordered from largest to smallest assembled flu segment. `irma` also performs typing and subtyping as part of the assembly process.
 
     General statistics about the assembly are generated with the `consensus_qc` task ([task_assembly_metrics.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl)).
 
@@ -959,7 +959,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch)
 | aligned_bam | File | Primer-trimmed BAM file; generated during consensus assembly process | CL, ONT, PE, SE |
 | artic_docker | String | Docker image utilized for read trimming and consensus genome assembly | CL, ONT |
 | artic_version | String | Version of the Artic software utilized for read trimming and conesnsus genome assembly | CL, ONT |
-| assembly_fasta | File | Consensus genome assembly; for lower quality flu samples, the output may state "Assembly could not be generated" when there is too little and/or too low quality data for IRMA to produce an assembly | CL, ONT, PE, SE |
+| assembly_fasta | File | Consensus genome assembly; for lower quality flu samples, the output may state "Assembly could not be generated" when there is too little and/or too low quality data for IRMA to produce an assembly. Contigs will be ordered from largest to smallest when IRMA is used. | CL, ONT, PE, SE |
 | assembly_length_unambiguous | Int | Number of unambiguous basecalls within the consensus assembly | CL, FASTA, ONT, PE, SE |
 | assembly_mean_coverage | Float | Mean sequencing depth throughout the consensus assembly. Generated after performing primer trimming and calculated using the SAMtools coverage command | CL, ONT, PE, SE |
 | assembly_method | String | Method employed to generate consensus assembly | CL, FASTA, ONT, PE, SE |
diff --git a/tasks/assembly/task_irma.wdl b/tasks/assembly/task_irma.wdl
index f0dddc0fb..f9c5478ab 100644
--- a/tasks/assembly/task_irma.wdl
+++ b/tasks/assembly/task_irma.wdl
@@ -87,9 +87,26 @@ task irma {
       echo "Type_"$(basename "$(echo "$(find ~{samplename}/*.fasta | head -n1)")" | cut -d_ -f1) > IRMA_TYPE
       # set irma_type bash variable which is used later
       irma_type=$(cat IRMA_TYPE)
-      # concatenate consensus assemblies into single file with all genome segments
-      echo "DEBUG: creating IRMA FASTA file containing all segments...."
-      cat ~{samplename}/*.fasta > ~{samplename}.irma.consensus.fasta
+      
+      # flu segments from largest to smallest
+      segments=("PB2" "PB1" "PA" "HA" "NP" "NA" "MP" "NS")
+
+      echo "DEBUG: creating IRMA FASTA file containing all segments in order (largest to smallest)...."
+      
+      # initialize an empty file
+      touch ~{samplename}.irma.consensus.fasta
+
+      # append each segment's top-level FASTA in array order; one sorted match per segment so cat gets a single path
+      for segment in "${segments[@]}"; do
+        segment_file=$(find "~{samplename}" -maxdepth 1 -name "*${segment}*.fasta" | sort | head -n1)
+        if [ -n "$segment_file" ]; then
+          echo "DEBUG: Adding $segment_file to consensus FASTA"
+          cat "$segment_file" >> ~{samplename}.irma.consensus.fasta
+        else
+          echo "WARNING: No file containing ${segment} found for ~{samplename}"
+        fi
+      done
+
       echo "DEBUG: editing IRMA FASTA file to include sample name in FASTA headers...."
       sed -i "s/>/>~{samplename}_/g" ~{samplename}.irma.consensus.fasta