From 39393ef4d61db58821dff09ec74549d1b0b2a7c5 Mon Sep 17 00:00:00 2001
From: eeaunin
Date: Wed, 7 Aug 2024 13:01:44 +0100
Subject: [PATCH 1/3] ea10 edits to dp24_btk_datasets_branch

---
 bin/ascc_merge_tables.py           | 313 +++++++++++++++++++++++++++++
 bin/autofilter.py                  |  23 ++-
 modules/local/ascc_merge_tables.nf |   6 +-
 workflows/ascc.nf                  |  21 +-
 4 files changed, 342 insertions(+), 21 deletions(-)
 create mode 100755 bin/ascc_merge_tables.py

diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py
new file mode 100755
index 0000000..932f505
--- /dev/null
+++ b/bin/ascc_merge_tables.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+
+VERSION = "2.0.0"
+DESCRIPTION = f"""
+Script for merging contaminant check results into one table
+Version: {VERSION}
+---
+Written by Eerik Aunin
+
+Re-written by Damon-Lee Pointon (dp24/DLBPointon)
+"""
+
+import argparse
+import pandas as pd
+import textwrap
+import os
+import sys
+import general_purpose_functions as gpf
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        prog="AsccMergeTables",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent(DESCRIPTION),
+    )
+    parser.add_argument("-gc", "--gc_cov", required=True, type=str, help="GC Coverage file")
+    parser.add_argument("-c", "--coverage", type=str, help="Coverage file")
+    parser.add_argument("-t", "--tiara", type=str, help="Tiara file")
+    parser.add_argument("-bk", "--bacterial_kraken", type=str, help="Bacterial Kraken file")
+    parser.add_argument("-nk", "--nt_kraken", type=str, help="NT Kraken file")
+    parser.add_argument("-nb", "--nt_blast", type=str, help="NT Blast file")
+    parser.add_argument("-dr", "--dim_reduction_embeddings", type=str, help="Dimensional Reduction file")
+    parser.add_argument("-nd", "--nr_diamond", type=str, help="NR Diamond file")
+    parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file")
+    parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file")
+    parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file")
+    parser.add_argument("-bb", "--busco_btk", type=str, help="Busco Blobtoolkit file")
+    parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file")
+    parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample")
+    parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file")
+    parser.add_argument("-v", "--version", action="version", version=VERSION)
+    return parser.parse_args()
+
+
+def check_paths(paths_dict, required_files):
+    """
+    Checks that each required input file exists and exits with an error message if it doesn't
+    """
+    out_dict = dict()
+
+    for data_type, input_file in paths_dict.items():
+        if data_type in required_files and (input_file is None or not os.path.isfile(input_file)):
+            sys.stderr.write(f"The required {data_type} input file ({input_file}) was not found\n")
+            sys.exit(1)
+        out_dict[data_type] = input_file
+
+    return out_dict
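Note that check_paths only enforces existence for the entries named in required_files; optional inputs pass through unchanged (None when not provided), leaving the `is not None` guards further down to decide what gets merged. A minimal sketch of a call (file names invented for illustration):

    paths = {"gc_content": "GC.txt", "coverage": None}
    checked = check_paths(paths, required_files=["gc_content"])
    # exits with an error message if GC.txt is missing; otherwise checked == paths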
+
+
+def load_and_merge_dataframes(paths_dict):
+    """
+    Loads the tables with individual variables (GC content, coverage, kmer counts etc.) and combines them into one table
+    """
+    gc_path = paths_dict["gc_content"]
+    df = pd.read_csv(gc_path, sep="\t", header=None)
+    if df.shape[0] > 0:
+        df.columns = ["scaff", "gc"]
+        df["gc"] = df["gc"] * 100
+    else:
+        sys.stderr.write("No rows were found in the GC content table ({})\n".format(gc_path))
+        sys.exit(1)
+
+    coverage_df = None
+    if paths_dict["coverage"] is not None:
+        coverage_df = pd.read_csv(paths_dict["coverage"], sep=",", header=None)
+        if coverage_df.shape[0] > 0:
+            coverage_df.columns = ["scaff", "coverage"]
+        else:
+            sys.stderr.write(f"No rows were found in the coverages table ({paths_dict['coverage']})\n")
+            coverage_df = None
+
+    tiara_df = None
+    if paths_dict["tiara"] is not None:
+        tiara_df = pd.read_csv(paths_dict["tiara"], sep="\t")
+        if tiara_df.shape[0] > 0:
+            tiara_df["tiara_classif"] = tiara_df["class_fst_stage"]
+            tiara_snd_stage_hits = tiara_df.index[tiara_df["class_snd_stage"].notnull()]
+            tiara_df.loc[tiara_snd_stage_hits, "tiara_classif"] = tiara_df.loc[tiara_snd_stage_hits, "class_snd_stage"]
+            tiara_df = tiara_df.iloc[:, [0, 3]]
+            tiara_df.columns = ["scaff", "tiara_classif"]
+        else:
+            sys.stderr.write("No rows were found in Tiara output table ({})\n".format(paths_dict["tiara"]))
+            tiara_df = None
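The Tiara step keeps the first-stage classification and overwrites it with the second-stage call wherever one exists. A self-contained sketch of that consolidation on a toy table (values invented; column names follow Tiara's output):

    import pandas as pd

    tiara_df = pd.DataFrame(
        {
            "sequence_id": ["scaff_1", "scaff_2", "scaff_3"],
            "class_fst_stage": ["bacteria", "organelle", "eukarya"],
            "class_snd_stage": [None, "mitochondrion", None],
        }
    )
    # Start from the first-stage call, then let any second-stage call win --
    # equivalent to the .loc assignment above
    tiara_df["tiara_classif"] = tiara_df["class_snd_stage"].fillna(tiara_df["class_fst_stage"])
    # scaff_2 becomes "mitochondrion"; the others keep their first-stage class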
"buscogenes_superkingdom", + "buscogenes_kingdom", + "buscogenes_phylum", + "buscogenes_class", + "buscogenes_order", + "buscogenes_family", + "buscogenes_genus", + "buscogenes_species", + "buscoregions_superkingdom", + "buscoregions_kingdom", + "buscoregions_phylum", + "buscoregions_class", + "buscoregions_order", + "buscoregions_family", + "buscoregions_genus", + "buscoregions_species", + ] + ] + if len(btk_busco_selected_cols) > 0: + btk_busco_df = btk_busco_df[btk_busco_selected_cols] + else: + btk_busco_df = None + + fcs_gx_df = None + if paths_dict["fcs_gx"] is not None: + fcs_gx_df = pd.read_csv(paths_dict["fcs_gx"], sep=",") + if fcs_gx_df.shape[0] == 0: + sys.stderr.write("No rows were found in FCS-GX output table ({})\n".format(paths_dict["fcs_gx"])) + fcs_gx_df = None + + nt_blast_df = None + if paths_dict["nt_blast"] is not None: + nt_blast_df = pd.read_csv(paths_dict["nt_blast"], sep=",") + if nt_blast_df.shape[0] == 0: + sys.stderr.write("No rows were found in nt BLAST output table ({})\n".format(paths_dict["nt_blast"])) + nt_blast_df = None + + nr_diamond_df = None + if paths_dict["nr_diamond"] is not None: + nr_diamond_df = pd.read_csv(paths_dict["nr_diamond"], sep=",") + if nr_diamond_df.shape[0] == 0: + sys.stderr.write("No rows were found in nr Diamond output table ({})\n".format(paths_dict["nr_diamond"])) + nr_diamond_df = None + + uniprot_diamond_df = None + if paths_dict["uniprot_diamond"] is not None: + uniprot_diamond_df = pd.read_csv(paths_dict["uniprot_diamond"], sep=",") + if uniprot_diamond_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in Uniprot Diamond output table ({})\n".format(paths_dict["uniprot_diamond"]) + ) + uniprot_diamond_df = None + + cobiontid_markerscan_df = None + if paths_dict["cobiontid_markerscan"] is not None: + cobiontid_markerscan_df = pd.read_csv(paths_dict["cobiontid_markerscan"], sep=",") + if cobiontid_markerscan_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in CobiontID MarkerScan output table ({})\n".format( + paths_dict["cobiontid_markerscan"] + ) + ) + uniprot_diamond_df = None + + contigviz_df = None + if paths_dict["contigviz"] is not None: + contigviz_df = pd.read_csv(paths_dict["contigviz"], sep=",") + if contigviz_df.shape[0] == 0: + sys.stderr.write("No rows were found in ContigViz output table ({})\n".format(paths_dict["contigviz"])) + contigviz_df = None + + if coverage_df is not None: + df = pd.merge(df, coverage_df, on="scaff", how="outer") + if tiara_df is not None: + df = pd.merge(df, tiara_df, on="scaff", how="outer") + if bacterial_kraken_df is not None: + df = pd.merge(df, bacterial_kraken_df, on="scaff", how="outer") + if nt_kraken_df is not None: + df = pd.merge(df, nt_kraken_df, on="scaff", how="outer") + if dim_reduction_df is not None: + df = pd.merge(df, dim_reduction_df, on="scaff", how="outer") + if nt_blast_df is not None: + df = pd.merge(df, nt_blast_df, on="scaff", how="outer") + if nr_diamond_df is not None: + df = pd.merge(df, nr_diamond_df, on="scaff", how="outer") + if uniprot_diamond_df is not None: + df = pd.merge(df, uniprot_diamond_df, on="scaff", how="outer") + if fcs_gx_df is not None: + df = pd.merge(df, fcs_gx_df, on="scaff", how="outer") + if cobiontid_markerscan_df is not None: + df = pd.merge(df, cobiontid_markerscan_df, on="scaff", how="outer") + if contigviz_df is not None: + df = pd.merge(df, contigviz_df, on="scaff", how="outer") + if btk_df is not None: + df = pd.merge(df, btk_df, on="scaff", how="outer") + if btk_busco_df is not None: + df = pd.merge(df, 
btk_busco_df, on="scaff", how="outer") + + return df + + +def main(args): + paths_dict = dict() + paths_dict["gc_content"] = args.gc_cov + paths_dict["coverage"] = args.coverage + paths_dict["tiara"] = args.tiara + paths_dict["bacterial_kraken"] = args.bacterial_kraken + paths_dict["nt_kraken"] = args.nt_kraken + paths_dict["nt_blast"] = args.nt_blast + paths_dict["dim_reduction_embeddings"] = args.dim_reduction_embeddings + paths_dict["nr_diamond"] = args.nr_diamond + paths_dict["uniprot_diamond"] = args.uniprot_diamond + paths_dict["cobiontid_markerscan"] = args.markerscan + paths_dict["contigviz"] = args.contigviz + paths_dict["blobtoolkit"] = args.blobtoolkit + paths_dict["btk_busco"] = args.busco_btk + paths_dict["fcs_gx"] = args.fcs_gx + + required_files = ["gc_content"] + + paths_dict = check_paths(paths_dict, required_files) + df = load_and_merge_dataframes(paths_dict) + df.to_csv(f"{args.sample_name}_contamination_check_merged_table.csv", index=False) + + if ( + paths_dict["nt_blast"] + and paths_dict["nr_diamond"] + and paths_dict["uniprot_diamond"] + and paths_dict["coverage"] + and paths_dict["tiara"] + and paths_dict["nt_kraken"] + ): + process_results_tables_command = f"process_result_tables.py . {args.sample_name}" + gpf.run_system_command(process_results_tables_command) + else: + sys.stderr.write( + f"Skipping generating the {args.sample_name}_phylum_counts_and_coverage.csv file, as the variables used in this run do not include all the required variables for this (nt_blast, nr_diamond, uniprot_diamond, coverage, tiara, nt_kraken)\n" + ) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/bin/autofilter.py b/bin/autofilter.py index 93849f6..d843308 100755 --- a/bin/autofilter.py +++ b/bin/autofilter.py @@ -42,9 +42,9 @@ def parse_args(): help="Path to the assembly_autofiltered.fasta file", default="autofiltered.fasta", ) - parser.add_argument( - "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" - ) + #parser.add_argument( + # "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" + #) parser.add_argument( "-r", "--rejected_seq", @@ -56,6 +56,9 @@ def parse_args(): parser.add_argument( "-n", "--ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy" ) + parser.add_argument( + "--tiara_action_mode", type=str, choices=["warn", "remove"], default="warn", help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). 
Default: warn" + ) parser.add_argument("-v", "--version", action="version", version=VERSION) return parser.parse_args() @@ -179,7 +182,7 @@ def main(): tiara_results_path = args.tiara fcs_gx_summary_path = args.fcsgx_summary filtered_assembly_path = args.output_auto_filtered - combined_summary = args.fcs_gx_and_tiara_summary + #combined_summary = args.fcs_gx_and_tiara_summary excluded_seq_list_path = args.rejected_seq ncbi_rankedlist = args.ncbi_rankedlineage_path @@ -187,7 +190,7 @@ def main(): for i in [ncbi_rankedlist, tiara_results_path, fcs_gx_summary_path, assembly_path]: if not os.path.isfile(i): - sys.stderr.write(f"{i} WAS NOT AT THE EXPECTED LOCATION\n") + sys.stderr.write(f"{i} was not at the expected location\n") sys.exit(1) target_domain = get_domain_from_taxid(args.taxid, ncbi_rankedlist) @@ -207,8 +210,12 @@ def main(): tiara_action = tiara_action_dict[scaff] combined_action = fcs_gx_action if fcs_gx_action == "NA" and tiara_action == "EXCLUDE": - combined_action = "EXCLUDE" - combined_action_source = "Tiara" + if args.tiara_action_mode == "remove": + combined_action = "EXCLUDE" + combined_action_source = "Tiara" + elif args.tiara_action_mode == "warn": + combined_action = "WARN" + combined_action_source = "Tiara" if fcs_gx_action == "EXCLUDE" and tiara_action == "EXCLUDE": combined_action_source = "FCS-GX_and_Tiara" if combined_action == "EXCLUDE": @@ -231,4 +238,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/modules/local/ascc_merge_tables.nf b/modules/local/ascc_merge_tables.nf index da7d59b..2dea7aa 100644 --- a/modules/local/ascc_merge_tables.nf +++ b/modules/local/ascc_merge_tables.nf @@ -2,10 +2,10 @@ process ASCC_MERGE_TABLES { tag "$meta.id" label 'process_low' - conda "conda-forge::python=3.9" + conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'biocontainers/python:3.9' }" + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(gc_content, stageAs: "GC.txt") diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 79d490d..bca98c6 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -79,6 +79,8 @@ workflow ASCC { include_workflow_steps = params.include ? params.include.split(",") : "" exclude_workflow_steps = params.exclude ? params.exclude.split(",") : "" + btk_busco_run_mode = params.btk_busco_run_mode ? 
diff --git a/modules/local/ascc_merge_tables.nf b/modules/local/ascc_merge_tables.nf
index da7d59b..2dea7aa 100644
--- a/modules/local/ascc_merge_tables.nf
+++ b/modules/local/ascc_merge_tables.nf
@@ -2,10 +2,10 @@ process ASCC_MERGE_TABLES {
     tag "$meta.id"
     label 'process_low'
 
-    conda "conda-forge::python=3.9"
+    conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/python:3.9' :
-        'biocontainers/python:3.9' }"
+        'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+        'quay.io/biocontainers/pandas:1.5.2' }"
 
     input:
     tuple val(meta), path(gc_content, stageAs: "GC.txt")

diff --git a/workflows/ascc.nf b/workflows/ascc.nf
index 79d490d..bca98c6 100644
--- a/workflows/ascc.nf
+++ b/workflows/ascc.nf
@@ -79,6 +79,8 @@ workflow ASCC {
     include_workflow_steps = params.include ? params.include.split(",") : ""
     exclude_workflow_steps = params.exclude ? params.exclude.split(",") : ""
 
+    btk_busco_run_mode = params.btk_busco_run_mode ? params.btk_busco_run_mode : "conditional"
+
     full_list = ["kmers", "tiara", "coverage", "nt_blast", "nr_diamond", "uniprot_diamond", "kraken", "fcs-gx", "fcs-adaptor", "vecscreen", "btk_busco", "pacbio_barcodes", "organellar_blast", "autofilter_assembly", "ALL", ""]
 
     if (!full_list.containsAll(include_workflow_steps) && !full_list.containsAll(exclude_workflow_steps)) {
@@ -290,7 +292,7 @@ workflow ASCC {
     //
    // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA
     //
-    if ( include_workflow_steps.contains('barcodes') || include_workflow_steps.contains('ALL') ) {
+    if ( include_workflow_steps.contains('pacbio_barcodes') || include_workflow_steps.contains('ALL') ) {
         PACBIO_BARCODE_CHECK (
             YAML_INPUT.out.reference_tuple,
             YAML_INPUT.out.pacbio_tuple,
@@ -315,7 +317,7 @@ workflow ASCC {
     //
     // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE
     //
-    if ( include_workflow_steps.contains('coverage') || include_workflow_steps.contains('busco_btk') || include_workflow_steps.contains('ALL') ) {
+    if ( include_workflow_steps.contains('coverage') || include_workflow_steps.contains('btk_busco') || include_workflow_steps.contains('ALL') ) {
         RUN_READ_COVERAGE (
             YAML_INPUT.out.reference_tuple,
             YAML_INPUT.out.assembly_path,
@@ -372,12 +374,12 @@ workflow ASCC {
             modified_input,
             YAML_INPUT.out.diamond_nr_database_path
         )
-        nt_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]}
-        nt_hits = NUCLEOT_DIAMOND.out.hits_file.map{it[1]}
+        nr_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]}
+        nr_hits = NUCLEOT_DIAMOND.out.hits_file.map{it[1]}
         ch_versions = ch_versions.mix(NUCLEOT_DIAMOND.out.versions)
     } else {
-        nt_hits = []
-        nt_full = []
+        nr_hits = []
+        nr_full = []
     }
 
     //
@@ -411,7 +413,7 @@ workflow ASCC {
             ch_kraken1,
             ch_kraken2,
             ch_kraken3,
-            nt_full,
+            nr_full,
             un_full,
             YAML_INPUT.out.ncbi_taxonomy_path.first()
         )
@@ -449,8 +451,7 @@ workflow ASCC {
     // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT
     // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING.
     // This will also eventually check for the above run_btk boolean from autofilter
-    if ( !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('busco_btk') && include_workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('ALL') ) {
-
+    if ( !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('btk_busco') && btk_busco_run_mode == "conditional" && include_workflow_steps.contains("autofilter_assembly") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('ALL') || btk_busco_run_mode == "mandatory" && !exclude_workflow_steps.contains('btk_busco') && include_workflow_steps.contains('btk_busco') ) {
         YAML_INPUT.out.reference_tuple
             .combine(ch_bam)
             .map{ meta, ref, bam ->
@@ -505,7 +506,7 @@ workflow ASCC {
             ch_kraken3,    // FROM -- RUN_NT_KRAKEN.out.lineage.map{it[1]}
             ch_nt_blast,   // FROM -- EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]}
             ch_kmers,      // FROM -- GET_KMERS_PROFILE.out.combined_csv
-            nt_hits,       // FROM -- NUCLEOT_DIAMOND.out.reformed.map{it[1]}
+            nr_hits,       // FROM -- NUCLEOT_DIAMOND.out.reformed.map{it[1]}
             un_hits,       // FROM -- UNIPROT_DIAMOND.out.reformed.map{it[1]}
             [],            // <-- MARKER SCAN -- NOT IN PIPELINE YET
             [],            // <-- CONTIGVIZ -- NOT IN PIPELINE YET
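The reworked btk_busco trigger is easier to audit when factored into its three routes: the step must not be excluded, and it then runs when 'ALL' is requested, when the run mode is 'mandatory' and the step is included, or when the run mode is 'conditional', autofiltering is included, and the autofilter flagged the assembly as abnormal. A plain-Python sketch of the same predicate (booleans stand in for the channel values):

    def should_run_btk_busco(include, exclude, run_mode="conditional", run_btk="NORMAL"):
        if "btk_busco" in exclude:
            return False
        if "ALL" in include:
            return True
        if "btk_busco" not in include:
            return False
        if run_mode == "mandatory":
            return True
        # conditional mode: run only when the autofilter step judged the assembly abnormal
        return run_mode == "conditional" and "autofilter_assembly" in include and run_btk == "ABNORMAL"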
From afc73ed90373687501c97a2eb919999ebdb6e662 Mon Sep 17 00:00:00 2001
From: eeaunin
Date: Wed, 7 Aug 2024 13:59:00 +0100
Subject: [PATCH 2/3] 07.08.2024 edits

---
 .github/workflows/ci.yml            | 2 +-
 bin/ascc_merge_tables.py            | 4 ++--
 modules/local/merge_btk_datasets.nf | 4 ++--
 modules/local/sanger_tol_btk.nf     | 8 +++-----
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 24d41b5..76fa33c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -139,4 +139,4 @@ jobs:
           # For example: adding multiple test runs with different parameters
           # Remember that you can parallelise this by using strategy.matrix
         run: |
-          nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude busco_btk
+          nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude btk_busco

diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py
index 932f505..6045600 100755
--- a/bin/ascc_merge_tables.py
+++ b/bin/ascc_merge_tables.py
@@ -35,7 +35,7 @@ def parse_args():
     parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file")
     parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file")
     parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file")
-    parser.add_argument("-bb", "--busco_btk", type=str, help="Busco Blobtoolkit file")
+    parser.add_argument("-bb", "--btk_busco", type=str, help="Busco Blobtoolkit file")
     parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file")
     parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample")
     parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file")
@@ -284,7 +284,7 @@ def main(args):
     paths_dict["cobiontid_markerscan"] = args.markerscan
     paths_dict["contigviz"] = args.contigviz
     paths_dict["blobtoolkit"] = args.blobtoolkit
-    paths_dict["btk_busco"] = args.busco_btk
+    paths_dict["btk_busco"] = args.btk_busco
     paths_dict["fcs_gx"] = args.fcs_gx
 
     required_files = ["gc_content"]

diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf
index 707c33a..7a81801 100644
--- a/modules/local/merge_btk_datasets.nf
+++ b/modules/local/merge_btk_datasets.nf
@@ -9,7 +9,7 @@ process MERGE_BTK_DATASETS {
 
     input:
     tuple val(meta), path(create_btk_datasets)
-    tuple val(meta2), path(busco_btk_datasets)
+    tuple val(meta2), path(btk_busco_datasets)
 
     output:
     tuple val(meta), path("merged_datasets"), emit: merged_datasets
@@ -29,7 +29,7 @@ process MERGE_BTK_DATASETS {
     merge_btk_datasets.py \\
         -m $create_btk_datasets \\
         -o ./merged_datasets \\
-        -b $busco_btk_datasets \\
+        -b $btk_busco_datasets \\
         $args
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf
index 009bb27..b73e326 100644
--- a/modules/local/sanger_tol_btk.nf
+++ b/modules/local/sanger_tol_btk.nf
@@ -32,7 +32,7 @@ process SANGER_TOL_BTK {
     def profiles = task.ext.profiles ?: ""
     def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET"
     def btk_config = btk_config_file ? "-c $btk_config_file" : ""
-    def pipeline_version = task.ext.version ?: "main"
+    def pipeline_version = task.ext.version ?: "draft_assemblies"
 
     // YAML used to avoid the use of GCA accession number
     // https://github.com/sanger-tol/blobtoolkit/issues/77
@@ -49,9 +49,7 @@ process SANGER_TOL_BTK {
         --input "\$(realpath $samplesheet_csv)" \\
         --outdir ${prefix}_btk_out \\
         --fasta "\$(realpath REFERENCE.fa)" \\
-        --yaml "\$(realpath BTK.yaml)" \\
-        --busco_lineages $busco_lineages \\
-        --accession draft \\
+        --busco_lineages eukaryota_odb10 \\
        --taxon $taxon \\
         --taxdump "\$(realpath $tax_dump)" \\
         --blastp "\$(realpath blastp.dmnd)" \\
@@ -72,7 +70,7 @@ process SANGER_TOL_BTK {
 
     stub:
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def pipeline_version = task.ext.version ?: "main"
+    def pipeline_version = task.ext.version ?: "draft_assemblies"
 
     """
     mkdir -p ${prefix}_btk_out/blobtoolkit/$gca_accession
required_files = ["gc_content"] diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index 707c33a..7a81801 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -9,7 +9,7 @@ process MERGE_BTK_DATASETS { input: tuple val(meta), path(create_btk_datasets) - tuple val(meta2), path(busco_btk_datasets) + tuple val(meta2), path(btk_busco_datasets) output: tuple val(meta), path("merged_datasets"), emit: merged_datasets @@ -29,7 +29,7 @@ process MERGE_BTK_DATASETS { merge_btk_datasets.py \\ -m $create_btk_datasets \\ -o ./merged_datasets \\ - -b $busco_btk_datasets \\ + -b $btk_busco_datasets \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 009bb27..b73e326 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -32,7 +32,7 @@ process SANGER_TOL_BTK { def profiles = task.ext.profiles ?: "" def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" def btk_config = btk_config_file ? "-c $btk_config_file" : "" - def pipeline_version = task.ext.version ?: "main" + def pipeline_version = task.ext.version ?: "draft_assemblies" // YAML used to avoid the use of GCA accession number // https://github.com/sanger-tol/blobtoolkit/issues/77 @@ -49,9 +49,7 @@ process SANGER_TOL_BTK { --input "\$(realpath $samplesheet_csv)" \\ --outdir ${prefix}_btk_out \\ --fasta "\$(realpath REFERENCE.fa)" \\ - --yaml "\$(realpath BTK.yaml)" \\ - --busco_lineages $busco_lineages \\ - --accession draft \\ + --busco_lineages eukaryota_odb10 \\ --taxon $taxon \\ --taxdump "\$(realpath $tax_dump)" \\ --blastp "\$(realpath blastp.dmnd)" \\ @@ -72,7 +70,7 @@ process SANGER_TOL_BTK { stub: def prefix = task.ext.prefix ?: "${meta.id}" - def pipeline_version = task.ext.version ?: "main" + def pipeline_version = task.ext.version ?: "draft_assemblies" """ mkdir -p ${prefix}_btk_out/blobtoolkit/$gca_accession From e3b8db9c75f553ab3aaab0848961f8f654e43b03 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Wed, 7 Aug 2024 14:09:12 +0100 Subject: [PATCH 3/3] ran linting with black --- bin/autofilter.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/autofilter.py b/bin/autofilter.py index d843308..8c1dc4e 100755 --- a/bin/autofilter.py +++ b/bin/autofilter.py @@ -42,9 +42,9 @@ def parse_args(): help="Path to the assembly_autofiltered.fasta file", default="autofiltered.fasta", ) - #parser.add_argument( + # parser.add_argument( # "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" - #) + # ) parser.add_argument( "-r", "--rejected_seq", @@ -57,7 +57,11 @@ def parse_args(): "-n", "--ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy" ) parser.add_argument( - "--tiara_action_mode", type=str, choices=["warn", "remove"], default="warn", help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). Default: warn" + "--tiara_action_mode", + type=str, + choices=["warn", "remove"], + default="warn", + help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). 
Default: warn", ) parser.add_argument("-v", "--version", action="version", version=VERSION) return parser.parse_args() @@ -182,7 +186,7 @@ def main(): tiara_results_path = args.tiara fcs_gx_summary_path = args.fcsgx_summary filtered_assembly_path = args.output_auto_filtered - #combined_summary = args.fcs_gx_and_tiara_summary + # combined_summary = args.fcs_gx_and_tiara_summary excluded_seq_list_path = args.rejected_seq ncbi_rankedlist = args.ncbi_rankedlineage_path @@ -238,4 +242,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()