sanger-tol
diff --git a/‎.github/workflows/branch.yml
+1-1 b/‎.github/workflows/branch.yml
+1-1
diff --git a/‎CHANGELOG.md
+6-6 b/‎CHANGELOG.md
+6-6
diff --git a/‎README.md
+5 b/‎README.md
+5
diff --git a/‎bin/ascc_shorten_fasta_headers.py
-95 b/‎bin/ascc_shorten_fasta_headers.py
-95
diff --git a/‎bin/create_btk_dataset.py
+19-6 b/‎bin/create_btk_dataset.py
+19-6
diff --git a/‎bin/generate_samplesheet.py
+5-1 b/‎bin/generate_samplesheet.py
+5-1
diff --git a/‎bin/sanitise_input_fasta_file.py
+164 b/‎bin/sanitise_input_fasta_file.py
+164
@@ -3,7 +3,7 @@ name: nf-core branch protection
 # It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev`
 on:
   pull_request_target:
-    branches: [master]
+    branches: [main]
 
 jobs:
   test:
 
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v0.1.0 - [date]
+## v0.1.0 - Red Book [14/02/2025]
 
 Initial release of sanger-tol/ascc, created with the [nf-core](https://nf-co.re/) template.
 
@@ -82,8 +82,8 @@ The intention of this pipeline is to succeed the currently in production Cobiont
 | FCSGX_RUNGX                             | -           | 0.5.4--h4ac6f70_1                                                                                                                 |
 | GNU_SORT                                | -           | 9.3                                                                                                                               |
 | GUNZIP                                  | -           | ubuntu:22.04                                                                                                                      |
-| KRAKEN2_KRAKEN2                         | -           |                                                                                                                                   |
-| MINIMAP2_ALIGN                          | -           |                                                                                                                                   |
+| KRAKEN2_KRAKEN2                         | -           | kraken2:2.1.3, pigz:2.8                                                                                                           |
+| MINIMAP2_ALIGN                          | -           | minimap2:2.28--he4a0461_0, samtools:1.20                                                                                          |
 | MINIMAP2_INDEX                          | -           | 2.28--he4a0461_0                                                                                                                  |
 | NCBITOOLS_VECSCREEN                     | -           | ncbi-tools-bin:6.1.20170106-6-deb_cv2                                                                                             |
 | SAMTOOLS\_\*                            | -           | 1.21--h50ea8bc_0                                                                                                                  |
@@ -98,11 +98,11 @@ The intention of this pipeline is to succeed the currently in production Cobiont
 | CHECK_BARCODE                           | -           | python:3.9. pacbio_barcode_check.py:1.0.0                                                                                         |
 | CHUNK_ASSEMBLY_FOR_VECSCREEN            | -           | biopython:1.81, chunk_assembly_for_vecscreen.py:1.0.0                                                                             |
 | CONVERT_TO_HITS_FILE                    | -           | python:3.9, convert_to_hits.py:1.0.0                                                                                              |
-| CREATE_BTK_DATASET                      | -           | blobtoolkit:4.3.9, python:3.9, create_btk_dataset.py:1.0.0                                                                        |
+| CREATE_BTK_DATASET                      | -           | blobtoolkit:4.3.9, python:3.9, create_btk_dataset.py:2.0.0                                                                        |
 | EXTRACT_CONTAMINANTS                    | -           | python:3.9, biopython:1.78, pybedtools:0.9.0, extract_contaminants_by_type.py:1.0.0                                               |
 | FILTER_BARCODE                          | -           | biopython:1.78, python:3.9, filter_barcode_blast_results.py:1.0.0                                                                 |
 | FILTER_COMMENTS                         | -           | coreutils:9.1                                                                                                                     |
-| FILTER_FASTA                            | -           | python:3.9, ascc_shorten_fasta_headers.py:1.0.0, filter_fasta_by_length.py:1.0.0                                                  |
+| FILTER_FASTA                            | -           | python:3.9, sanitise_input_fasta_file.py:1.2.0, filter_fasta_by_length.py:1.0.0                                                   |
 | FILTER_VECSCREEN_RESULTS                | -           | python:3.9, VSlistTo1HitPerLine.py:1.0.0                                                                                          |
 | REFORMAT_DIAMOND_OUTFMT6                | -           | python:3.9, reformat_diamond_outfmt6.py:1.0.0                                                                                     |
 | GC_CONTENT                              | -           | python:3.9, gc_content.py:1.0.0                                                                                                   |
@@ -113,7 +113,7 @@ The intention of this pipeline is to succeed the currently in production Cobiont
 | GET_LINEAGE_FOR_TOP                     | -           | python:3.9, get_lineage_for_top.py:1.0.0                                                                                          |
 | KMER_COUNT_DIM_REDUCTION_COMBINE_CSV    | -           | pandas:1.5.2, python:3.9, kmer_count_dim_reduction_combine_csv.py:1.0.0                                                           |
 | KMER_COUNT_DIM_REDUCTION                | -           | python:3.9, pandas:2.2.1, tensorlflow:2.15.0, scikit-learn:1.4.1, umap:0.5.5, matplotlib:3.8.0, kmer_count_dim_reduction.py:1.0.0 |
-| MERGE_BTK_DATASETS                      | -           | blobtoolkit:4.3.9, merge_btk_datasets.py:1.0.0                                                                                    |
+| MERGE_BTK_DATASETS                      | -           | blobtoolkit:4.3.9, merge_btk_datasets.py:2.0.0                                                                                    |
 | ORGANELLE_CONTAMINATION_RECOMMENDATIONS | -           | python:3.9, organelle_contamination_recommendation.py:1.0.0                                                                       |
 | PARSE_FCSGX_RESULT                      | -           | python:3.9, parse_fcsgx_result.py:1.0.0                                                                                           |
 | REFORMAT_FULL_OUTFMT6                   | -           | python:3.9, reformat_blast_outfmt6.py:1.0.0                                                                                       |
 
@@ -79,6 +79,11 @@ This setup assumes that you have an assembly where the primary contigs or scaffo
 It is okay to leave out assembly components from the run. E.g. if your assembly does not have a mitochondrial sequence, you can leave the row with the `MITO` tag out. If your assembly does not have a plastid sequence, you can leave the row with the `PLASTID` tag out.
 The params-input yaml will need to contain the following data will be detailed [here](./docs/usage.md).
 
+The kmers dimensionality reduction is documented in dedicated markdown files:
+
+- [Kmers Dimensionality Reduction](./docs/kmers_dim_reduction.md)
+- [Kmers Autoencoder](./docs/kmers_autoencoder.md)
+
 Now, you can run the pipeline using:
 
 <!-- TODO nf-core: update the following command to include all required parameters for a minimal example -->
 
@@ -111,19 +111,29 @@ def tiara_results_to_btk_format(tiara_results_path, outfile_path):
 def detect_dim_reduction_methods(kmers_dim_reduction_output_path):
     """
     Parses the header of the kmers dimensionality reduction report file to detect
-    which dimensionality reduction methods were used and how many dimensions each has
-    Returns a dictionary where keys are method names and values are number of dimensions
+    which dimensionality reduction methods were used and how many dimensions each has.
+    Returns a dictionary where keys are method names and values are number of dimensions.
+
+    The function extracts method names by removing the 'embedding_dim_X_' prefix from column names,
+    preserving the complete method name including any underscores it may contain.
     """
+    import re
+
     with open(kmers_dim_reduction_output_path) as f:
         header_string = f.readline().strip()
 
     split_header = header_string.split(",")
     dim_reduction_methods = {}
 
-    for method in set(col.split("_")[-1] for col in split_header if col.startswith("embedding_dim_")):
-        # Count how many dimensions exist for this method
-        dims = sum(1 for col in split_header if f"embedding_dim_" in col and col.endswith(f"_{method}"))
-        dim_reduction_methods[method] = dims
+    # Compile once: matches the 'embedding_dim_<digits>_' prefix of an embedding column
+    prefix_pattern = re.compile(r"^embedding_dim_\d+_")
+
+    # Single pass over the header: every column sharing a method name adds one dimension
+    for col in split_header:
+        if not col.startswith("embedding_dim_"):
+            continue
+        method = prefix_pattern.sub("", col)
+        dim_reduction_methods[method] = dim_reduction_methods.get(method, 0) + 1
 
     return dim_reduction_methods
 
 
@@ -33,7 +33,11 @@ def main():
 
     data_list.append("sample,datatype,datafile\n")
 
-    [data_list.append(f"{args.sample_name},pacbio,{args.path_to_reads}{file}\n") for file in os.listdir(args.path_to_reads) if file.endswith('.fasta.gz') or file.endswith('.fa.gz')]
+    [
+        data_list.append(f"{args.sample_name},pacbio,{args.path_to_reads}{file}\n")
+        for file in os.listdir(args.path_to_reads)
+        if file.endswith(".fasta.gz") or file.endswith(".fa.gz")
+    ]
 
     if len(data_list) <= 1:
         sys.exit("I was expecting at least one FASTA.GZ file")
 
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+# Script version: interpolated into DESCRIPTION below and reported by the -v/--version flag
+VERSION = "1.2.0"
+# Help text passed to argparse as the program description (dedented in parse_args)
+DESCRIPTION = f"""
+---
+Script for sanitising FASTA headers and sequences:
+- Shortens headers by splitting by whitespace and keeping only the first element
+- Replaces problematic characters in headers (commas, spaces, etc.) with underscores
+- Converts sequences to uppercase and replaces non-ATGC bases with N
+Version: {VERSION}
+---
+
+Written by Eerik Aunin (ea10)
+Modified by Damon-Lee Pointon (@dp24/@DLBPointon)
+Further modified by Eerik Aunin (ea10)
+
+"""
+
+# MIT License
+#
+# Copyright (c) 2020-2022 Genome Research Ltd.
+#
+# Author: Eerik Aunin (eeaunin@gmail.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import general_purpose_functions as gpf
+import argparse
+import textwrap
+import sys
+import tempfile
+import re
+
+
+def is_all_n_sequence(seq):
+    """Tell whether a sequence holds nothing but N bases.
+
+    Surrounding whitespace is ignored and the comparison is case-insensitive.
+    An empty (or whitespace-only) sequence also yields True.
+    """
+    normalised = seq.strip().upper()
+    for base in normalised:
+        if base != "N":
+            return False
+    return True
+
+
+def sanitise_sequence(seq):
+    """Convert a sequence line to uppercase and replace any non-ATGC bases with N.
+
+    Leading and trailing whitespace (including line terminators) is removed first,
+    matching what is_all_n_sequence() does. Characters inside the line that are not
+    A, T, G or C after uppercasing are each replaced by a single N.
+
+    Args:
+        seq: One line (or chunk) of sequence text.
+
+    Returns:
+        The cleaned, uppercase sequence string.
+    """
+    # Strip before substituting: otherwise a trailing "\n" or "\r" would match the
+    # non-ATGC pattern below and be turned into a spurious N base.
+    seq = seq.strip().upper()
+    return re.sub(r"[^ATGC]", "N", seq)
+
+
+def sanitise_header(header):
+    """Return the FASTA header with troublesome characters turned into underscores.
+
+    A single leading '>' is treated as the record marker and is not altered; the
+    returned string always starts with '>'.
+    """
+    # Work on the header text without its leading '>' marker, if it has one
+    body = header[1:] if header.startswith(">") else header
+    # Commas, semicolons, whitespace, pipes and colons all become underscores
+    return ">" + re.sub(r"[,;\s|:]", "_", body)
+
+
+def parse_args(argv=None):
+    """Build the command-line parser and parse `argv`.
+
+    Args:
+        argv: List of argument strings; None makes argparse read sys.argv[1:].
+
+    Returns:
+        argparse.Namespace with the attributes fasta_path, delimiter,
+        allow_duplicate_headers and keep_n_sequences.
+    """
+    parser = argparse.ArgumentParser(
+        prog="sanitise_input_fasta_file",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent(DESCRIPTION),
+    )
+    parser.add_argument("fasta_path", type=str, help="Path to input FASTA file")
+    parser.add_argument(
+        "--delimiter",
+        type=str,
+        help="Delimiter string for splitting FASTA headers. Default: any whitespace character",
+        default="",
+    )
+    parser.add_argument(
+        "--allow_duplicate_headers",
+        dest="allow_duplicate_headers",
+        action="store_true",
+        help="Do not exit with an error when headers are no longer unique after truncation and sanitisation (default: False)",
+    )
+    parser.add_argument(
+        "--keep_n_sequences", action="store_true", help="Keep sequences that are all Ns (default: False)"
+    )
+    parser.add_argument("-v", "--version", action="version", version=VERSION)
+    return parser.parse_args(argv)
+
+
+def main(fasta_path, delimiter, allow_duplicate_headers, keep_n_sequences=False):
+    """Sanitise a FASTA file and print the cleaned records to stdout.
+
+    Headers are truncated at the first occurrence of `delimiter` (or at the first
+    whitespace when `delimiter` is an empty string) and then passed through
+    sanitise_header(). Sequence lines are passed through sanitise_sequence() and
+    joined, so each record is printed as one header line and one sequence line.
+    Records for which no sequence text was collected are not printed.
+
+    Args:
+        fasta_path: Path to the input FASTA file. A path ending in ".gz" is
+            decompressed into a temporary directory before it is read.
+        delimiter: String used to split headers; "" means split on whitespace.
+        allow_duplicate_headers: When False, the script exits with status 1 as soon
+            as two headers are identical after truncation and sanitisation.
+        keep_n_sequences: When True, records whose sequence is all Ns are printed
+            too; otherwise they are skipped with a note on stderr.
+    """
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        input_file = fasta_path
+        if fasta_path.endswith(".gz") or fasta_path.endswith('.gz"'):
+            input_file = "{}/input_file.fa".format(tmp_dir)
+            gpf.run_system_command("gunzip -c {} > {}".format(fasta_path, input_file))
+
+        # A set keeps the duplicate check constant-time per header; a list lookup
+        # would make the whole run quadratic in the number of records.
+        seen_headers = set()
+        headers_with_commas = 0
+        # NOTE(review): gpf.ll is presumably a line-by-line reader of the file — confirm
+        in_data = gpf.ll(input_file)
+
+        current_header = None
+        current_sequence = []
+
+        def process_sequence():
+            # Emit the record collected so far, unless it is empty or an unwanted all-N record
+            if current_header and current_sequence:
+                sequence = "".join(current_sequence)
+                if keep_n_sequences or not is_all_n_sequence(sequence):
+                    print(current_header)
+                    print(sequence)
+                else:
+                    sys.stderr.write("Skipping all-N sequence: {}\n".format(current_header[1:].strip()))
+
+        for line in in_data:
+            if line.startswith(">"):
+                # Process previous sequence if it exists
+                process_sequence()
+
+                # Start new sequence
+                original_header = line.strip()
+                if delimiter == "":
+                    current_header = original_header.split()[0]
+                else:
+                    current_header = original_header.split(delimiter)[0]
+
+                # Check for commas in the original header
+                if "," in original_header:
+                    headers_with_commas += 1
+
+                # Sanitise the header
+                current_header = sanitise_header(current_header)
+
+                if current_header in seen_headers and allow_duplicate_headers is False:
+                    sys.stderr.write(
+                        "Duplicate FASTA headers ({}) were found in the input file ({}) after truncating the headers with a delimiter\n".format(
+                            current_header[1:], fasta_path
+                        )
+                    )
+                    sys.exit(1)
+                seen_headers.add(current_header)
+                current_sequence = []
+            else:
+                # Add sanitised sequence line
+                current_sequence.append(sanitise_sequence(line))
+
+        # Process the last sequence
+        process_sequence()
+
+        # Print warning about headers with commas
+        if headers_with_commas > 0:
+            sys.stderr.write(
+                "Warning: {} FASTA header(s) contained commas that were replaced with underscores\n".format(
+                    headers_with_commas
+                )
+            )
+
+
+if __name__ == "__main__":
+    # Parse the command line and hand each option to main() explicitly
+    cli = parse_args()
+    main(
+        fasta_path=cli.fasta_path,
+        delimiter=cli.delimiter,
+        allow_duplicate_headers=cli.allow_duplicate_headers,
+        keep_n_sequences=cli.keep_n_sequences,
+    )