From 39393ef4d61db58821dff09ec74549d1b0b2a7c5 Mon Sep 17 00:00:00 2001
From: eeaunin
Date: Wed, 7 Aug 2024 13:01:44 +0100
Subject: [PATCH 1/3] ea10 edits to dp24_btk_datasets_branch

---
 bin/ascc_merge_tables.py           | 313 +++++++++++++++++++++++++++++
 bin/autofilter.py                  |  23 ++-
 modules/local/ascc_merge_tables.nf |   6 +-
 workflows/ascc.nf                  |  21 +-
 4 files changed, 342 insertions(+), 21 deletions(-)
 create mode 100755 bin/ascc_merge_tables.py

diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py
new file mode 100755
index 0000000..932f505
--- /dev/null
+++ b/bin/ascc_merge_tables.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+
+VERSION = "2.0.0"
+DESCRIPTION = f"""
+Script for merging contaminant check results into one table
+Version: {VERSION}
+---
+Written by Eerik Aunin
+
+Re-written by Damon-Lee Pointon (dp24/DLBPointon)
+"""
+
+import argparse
+import pandas as pd
+import textwrap
+import os
+import sys
+import general_purpose_functions as gpf
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        prog="AsccMergeTables",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent(DESCRIPTION),
+    )
+    parser.add_argument("-gc", "--gc_cov", required=True, type=str, help="GC Coverage file")
+    parser.add_argument("-c", "--coverage", type=str, help="Coverage file")
+    parser.add_argument("-t", "--tiara", type=str, help="Tiara file")
+    parser.add_argument("-bk", "--bacterial_kraken", type=str, help="Bacterial Kraken file")
+    parser.add_argument("-nk", "--nt_kraken", type=str, help="NT Kraken file")
+    parser.add_argument("-nb", "--nt_blast", type=str, help="NT Blast file")
+    parser.add_argument("-dr", "--dim_reduction_embeddings", type=str, help="Dimensional Reduction file")
+    parser.add_argument("-nd", "--nr_diamond", type=str, help="NR Diamond file")
+    parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file")
+    parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file")
+    parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file")
+    parser.add_argument("-bb", "--busco_btk", type=str, help="Busco Blobtoolkit file")
+    parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file")
+    parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample")
+    parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file")
+    parser.add_argument("-v", "--version", action="version", version=VERSION)
+    return parser.parse_args()
+
+
+def check_paths(paths_dict, required_files):
+    """
+    Checks that each required input file exists and exits with an error message if it doesn't
+    """
+    out_dict = dict()
+
+    for data_type, input_file in paths_dict.items():
+        if data_type in required_files and (input_file is None or not os.path.isfile(input_file)):
+            sys.stderr.write(f"The required {data_type} input file ({input_file}) was not found\n")
+            sys.exit(1)
+        out_dict[data_type] = input_file
+
+    return out_dict
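Note that check_paths only enforces existence for the entries named in required_files; optional inputs pass through unchanged (None when not provided), leaving the `is not None` guards further down to decide what gets merged. A minimal sketch of a call (file names invented for illustration):

    paths = {"gc_content": "GC.txt", "coverage": None}
    checked = check_paths(paths, required_files=["gc_content"])
    # exits with an error message if GC.txt is missing; otherwise checked == paths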
+
+
+def load_and_merge_dataframes(paths_dict):
+    """
+    Loads the tables with individual variables (GC content, coverage, kmer counts etc.) and combines them into one table
+    """
+    gc_path = paths_dict["gc_content"]
+    df = pd.read_csv(gc_path, sep="\t", header=None)
+    if df.shape[0] > 0:
+        df.columns = ["scaff", "gc"]
+        df["gc"] = df["gc"] * 100
+    else:
+        sys.stderr.write("No rows were found in the GC content table ({})\n".format(gc_path))
+        sys.exit(1)
+
+    coverage_df = None
+    if paths_dict["coverage"] is not None:
+        coverage_df = pd.read_csv(paths_dict["coverage"], sep=",", header=None)
+        if coverage_df.shape[0] > 0:
+            coverage_df.columns = ["scaff", "coverage"]
+        else:
+            sys.stderr.write(f"No rows were found in the coverages table ({paths_dict['coverage']})\n")
+            coverage_df = None
+
+    tiara_df = None
+    if paths_dict["tiara"] is not None:
+        tiara_df = pd.read_csv(paths_dict["tiara"], sep="\t")
+        if tiara_df.shape[0] > 0:
+            tiara_df["tiara_classif"] = tiara_df["class_fst_stage"]
+            tiara_snd_stage_hits = tiara_df.index[tiara_df["class_snd_stage"].notnull()]
+            tiara_df.loc[tiara_snd_stage_hits, "tiara_classif"] = tiara_df.loc[tiara_snd_stage_hits, "class_snd_stage"]
+            tiara_df = tiara_df.iloc[:, [0, 3]]
+            tiara_df.columns = ["scaff", "tiara_classif"]
+        else:
+            sys.stderr.write("No rows were found in Tiara output table ({})\n".format(paths_dict["tiara"]))
+            tiara_df = None
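The Tiara step keeps the first-stage classification and overwrites it with the second-stage call wherever one exists. A self-contained sketch of that consolidation on a toy table (values invented; column names follow Tiara's output):

    import pandas as pd

    tiara_df = pd.DataFrame(
        {
            "sequence_id": ["scaff_1", "scaff_2", "scaff_3"],
            "class_fst_stage": ["bacteria", "organelle", "eukarya"],
            "class_snd_stage": [None, "mitochondrion", None],
        }
    )
    # Start from the first-stage call, then let any second-stage call win --
    # equivalent to the .loc assignment above
    tiara_df["tiara_classif"] = tiara_df["class_snd_stage"].fillna(tiara_df["class_fst_stage"])
    # scaff_2 becomes "mitochondrion"; the others keep their first-stage class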
"buscogenes_superkingdom", + "buscogenes_kingdom", + "buscogenes_phylum", + "buscogenes_class", + "buscogenes_order", + "buscogenes_family", + "buscogenes_genus", + "buscogenes_species", + "buscoregions_superkingdom", + "buscoregions_kingdom", + "buscoregions_phylum", + "buscoregions_class", + "buscoregions_order", + "buscoregions_family", + "buscoregions_genus", + "buscoregions_species", + ] + ] + if len(btk_busco_selected_cols) > 0: + btk_busco_df = btk_busco_df[btk_busco_selected_cols] + else: + btk_busco_df = None + + fcs_gx_df = None + if paths_dict["fcs_gx"] is not None: + fcs_gx_df = pd.read_csv(paths_dict["fcs_gx"], sep=",") + if fcs_gx_df.shape[0] == 0: + sys.stderr.write("No rows were found in FCS-GX output table ({})\n".format(paths_dict["fcs_gx"])) + fcs_gx_df = None + + nt_blast_df = None + if paths_dict["nt_blast"] is not None: + nt_blast_df = pd.read_csv(paths_dict["nt_blast"], sep=",") + if nt_blast_df.shape[0] == 0: + sys.stderr.write("No rows were found in nt BLAST output table ({})\n".format(paths_dict["nt_blast"])) + nt_blast_df = None + + nr_diamond_df = None + if paths_dict["nr_diamond"] is not None: + nr_diamond_df = pd.read_csv(paths_dict["nr_diamond"], sep=",") + if nr_diamond_df.shape[0] == 0: + sys.stderr.write("No rows were found in nr Diamond output table ({})\n".format(paths_dict["nr_diamond"])) + nr_diamond_df = None + + uniprot_diamond_df = None + if paths_dict["uniprot_diamond"] is not None: + uniprot_diamond_df = pd.read_csv(paths_dict["uniprot_diamond"], sep=",") + if uniprot_diamond_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in Uniprot Diamond output table ({})\n".format(paths_dict["uniprot_diamond"]) + ) + uniprot_diamond_df = None + + cobiontid_markerscan_df = None + if paths_dict["cobiontid_markerscan"] is not None: + cobiontid_markerscan_df = pd.read_csv(paths_dict["cobiontid_markerscan"], sep=",") + if cobiontid_markerscan_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in CobiontID MarkerScan output table ({})\n".format( + paths_dict["cobiontid_markerscan"] + ) + ) + uniprot_diamond_df = None + + contigviz_df = None + if paths_dict["contigviz"] is not None: + contigviz_df = pd.read_csv(paths_dict["contigviz"], sep=",") + if contigviz_df.shape[0] == 0: + sys.stderr.write("No rows were found in ContigViz output table ({})\n".format(paths_dict["contigviz"])) + contigviz_df = None + + if coverage_df is not None: + df = pd.merge(df, coverage_df, on="scaff", how="outer") + if tiara_df is not None: + df = pd.merge(df, tiara_df, on="scaff", how="outer") + if bacterial_kraken_df is not None: + df = pd.merge(df, bacterial_kraken_df, on="scaff", how="outer") + if nt_kraken_df is not None: + df = pd.merge(df, nt_kraken_df, on="scaff", how="outer") + if dim_reduction_df is not None: + df = pd.merge(df, dim_reduction_df, on="scaff", how="outer") + if nt_blast_df is not None: + df = pd.merge(df, nt_blast_df, on="scaff", how="outer") + if nr_diamond_df is not None: + df = pd.merge(df, nr_diamond_df, on="scaff", how="outer") + if uniprot_diamond_df is not None: + df = pd.merge(df, uniprot_diamond_df, on="scaff", how="outer") + if fcs_gx_df is not None: + df = pd.merge(df, fcs_gx_df, on="scaff", how="outer") + if cobiontid_markerscan_df is not None: + df = pd.merge(df, cobiontid_markerscan_df, on="scaff", how="outer") + if contigviz_df is not None: + df = pd.merge(df, contigviz_df, on="scaff", how="outer") + if btk_df is not None: + df = pd.merge(df, btk_df, on="scaff", how="outer") + if btk_busco_df is not None: + df = pd.merge(df, 
btk_busco_df, on="scaff", how="outer") + + return df + + +def main(args): + paths_dict = dict() + paths_dict["gc_content"] = args.gc_cov + paths_dict["coverage"] = args.coverage + paths_dict["tiara"] = args.tiara + paths_dict["bacterial_kraken"] = args.bacterial_kraken + paths_dict["nt_kraken"] = args.nt_kraken + paths_dict["nt_blast"] = args.nt_blast + paths_dict["dim_reduction_embeddings"] = args.dim_reduction_embeddings + paths_dict["nr_diamond"] = args.nr_diamond + paths_dict["uniprot_diamond"] = args.uniprot_diamond + paths_dict["cobiontid_markerscan"] = args.markerscan + paths_dict["contigviz"] = args.contigviz + paths_dict["blobtoolkit"] = args.blobtoolkit + paths_dict["btk_busco"] = args.busco_btk + paths_dict["fcs_gx"] = args.fcs_gx + + required_files = ["gc_content"] + + paths_dict = check_paths(paths_dict, required_files) + df = load_and_merge_dataframes(paths_dict) + df.to_csv(f"{args.sample_name}_contamination_check_merged_table.csv", index=False) + + if ( + paths_dict["nt_blast"] + and paths_dict["nr_diamond"] + and paths_dict["uniprot_diamond"] + and paths_dict["coverage"] + and paths_dict["tiara"] + and paths_dict["nt_kraken"] + ): + process_results_tables_command = f"process_result_tables.py . {args.sample_name}" + gpf.run_system_command(process_results_tables_command) + else: + sys.stderr.write( + f"Skipping generating the {args.sample_name}_phylum_counts_and_coverage.csv file, as the variables used in this run do not include all the required variables for this (nt_blast, nr_diamond, uniprot_diamond, coverage, tiara, nt_kraken)\n" + ) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/bin/autofilter.py b/bin/autofilter.py index 93849f6..d843308 100755 --- a/bin/autofilter.py +++ b/bin/autofilter.py @@ -42,9 +42,9 @@ def parse_args(): help="Path to the assembly_autofiltered.fasta file", default="autofiltered.fasta", ) - parser.add_argument( - "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" - ) + #parser.add_argument( + # "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" + #) parser.add_argument( "-r", "--rejected_seq", @@ -56,6 +56,9 @@ def parse_args(): parser.add_argument( "-n", "--ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy" ) + parser.add_argument( + "--tiara_action_mode", type=str, choices=["warn", "remove"], default="warn", help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). 
Default: warn" + ) parser.add_argument("-v", "--version", action="version", version=VERSION) return parser.parse_args() @@ -179,7 +182,7 @@ def main(): tiara_results_path = args.tiara fcs_gx_summary_path = args.fcsgx_summary filtered_assembly_path = args.output_auto_filtered - combined_summary = args.fcs_gx_and_tiara_summary + #combined_summary = args.fcs_gx_and_tiara_summary excluded_seq_list_path = args.rejected_seq ncbi_rankedlist = args.ncbi_rankedlineage_path @@ -187,7 +190,7 @@ def main(): for i in [ncbi_rankedlist, tiara_results_path, fcs_gx_summary_path, assembly_path]: if not os.path.isfile(i): - sys.stderr.write(f"{i} WAS NOT AT THE EXPECTED LOCATION\n") + sys.stderr.write(f"{i} was not at the expected location\n") sys.exit(1) target_domain = get_domain_from_taxid(args.taxid, ncbi_rankedlist) @@ -207,8 +210,12 @@ def main(): tiara_action = tiara_action_dict[scaff] combined_action = fcs_gx_action if fcs_gx_action == "NA" and tiara_action == "EXCLUDE": - combined_action = "EXCLUDE" - combined_action_source = "Tiara" + if args.tiara_action_mode == "remove": + combined_action = "EXCLUDE" + combined_action_source = "Tiara" + elif args.tiara_action_mode == "warn": + combined_action = "WARN" + combined_action_source = "Tiara" if fcs_gx_action == "EXCLUDE" and tiara_action == "EXCLUDE": combined_action_source = "FCS-GX_and_Tiara" if combined_action == "EXCLUDE": @@ -231,4 +238,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/modules/local/ascc_merge_tables.nf b/modules/local/ascc_merge_tables.nf index da7d59b..2dea7aa 100644 --- a/modules/local/ascc_merge_tables.nf +++ b/modules/local/ascc_merge_tables.nf @@ -2,10 +2,10 @@ process ASCC_MERGE_TABLES { tag "$meta.id" label 'process_low' - conda "conda-forge::python=3.9" + conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'biocontainers/python:3.9' }" + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(gc_content, stageAs: "GC.txt") diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 79d490d..bca98c6 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -79,6 +79,8 @@ workflow ASCC { include_workflow_steps = params.include ? params.include.split(",") : "" exclude_workflow_steps = params.exclude ? params.exclude.split(",") : "" + btk_busco_run_mode = params.btk_busco_run_mode ? 
diff --git a/modules/local/ascc_merge_tables.nf b/modules/local/ascc_merge_tables.nf
index da7d59b..2dea7aa 100644
--- a/modules/local/ascc_merge_tables.nf
+++ b/modules/local/ascc_merge_tables.nf
@@ -2,10 +2,10 @@ process ASCC_MERGE_TABLES {
     tag "$meta.id"
     label 'process_low'
 
-    conda "conda-forge::python=3.9"
+    conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/python:3.9' :
-        'biocontainers/python:3.9' }"
+        'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+        'quay.io/biocontainers/pandas:1.5.2' }"
 
     input:
     tuple val(meta), path(gc_content, stageAs: "GC.txt")

diff --git a/workflows/ascc.nf b/workflows/ascc.nf
index 79d490d..bca98c6 100644
--- a/workflows/ascc.nf
+++ b/workflows/ascc.nf
@@ -79,6 +79,8 @@ workflow ASCC {
     include_workflow_steps = params.include ? params.include.split(",") : ""
     exclude_workflow_steps = params.exclude ? params.exclude.split(",") : ""
 
+    btk_busco_run_mode = params.btk_busco_run_mode ? params.btk_busco_run_mode : "conditional"
+
     full_list = ["kmers", "tiara", "coverage", "nt_blast", "nr_diamond", "uniprot_diamond", "kraken", "fcs-gx", "fcs-adaptor", "vecscreen", "btk_busco", "pacbio_barcodes", "organellar_blast", "autofilter_assembly", "ALL", ""]
 
     if (!full_list.containsAll(include_workflow_steps) && !full_list.containsAll(exclude_workflow_steps)) {
@@ -290,7 +292,7 @@ workflow ASCC {
     //
    // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA
     //
-    if ( include_workflow_steps.contains('barcodes') || include_workflow_steps.contains('ALL') ) {
+    if ( include_workflow_steps.contains('pacbio_barcodes') || include_workflow_steps.contains('ALL') ) {
         PACBIO_BARCODE_CHECK (
             YAML_INPUT.out.reference_tuple,
             YAML_INPUT.out.pacbio_tuple,
@@ -315,7 +317,7 @@ workflow ASCC {
     //
     // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE
     //
-    if ( include_workflow_steps.contains('coverage') || include_workflow_steps.contains('busco_btk') || include_workflow_steps.contains('ALL') ) {
+    if ( include_workflow_steps.contains('coverage') || include_workflow_steps.contains('btk_busco') || include_workflow_steps.contains('ALL') ) {
         RUN_READ_COVERAGE (
             YAML_INPUT.out.reference_tuple,
             YAML_INPUT.out.assembly_path,
@@ -372,12 +374,12 @@ workflow ASCC {
             modified_input,
             YAML_INPUT.out.diamond_nr_database_path
         )
-        nt_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]}
-        nt_hits = NUCLEOT_DIAMOND.out.hits_file.map{it[1]}
+        nr_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]}
+        nr_hits = NUCLEOT_DIAMOND.out.hits_file.map{it[1]}
         ch_versions = ch_versions.mix(NUCLEOT_DIAMOND.out.versions)
     } else {
-        nt_hits = []
-        nt_full = []
+        nr_hits = []
+        nr_full = []
     }
 
     //
@@ -411,7 +413,7 @@ workflow ASCC {
             ch_kraken1,
             ch_kraken2,
             ch_kraken3,
-            nt_full,
+            nr_full,
             un_full,
             YAML_INPUT.out.ncbi_taxonomy_path.first()
         )
@@ -449,8 +451,7 @@ workflow ASCC {
     // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT
     // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING.
     // This will also eventually check for the above run_btk boolean from autofilter
-    if ( !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('busco_btk') && include_workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('ALL') ) {
-
+    if ( !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('btk_busco') && btk_busco_run_mode == "conditional" && include_workflow_steps.contains("autofilter_assembly") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('ALL') || btk_busco_run_mode == "mandatory" && !exclude_workflow_steps.contains('btk_busco') && include_workflow_steps.contains('btk_busco') ) {
         YAML_INPUT.out.reference_tuple
             .combine(ch_bam)
             .map{ meta, ref, bam ->
@@ -505,7 +506,7 @@ workflow ASCC {
             ch_kraken3,    // FROM -- RUN_NT_KRAKEN.out.lineage.map{it[1]}
             ch_nt_blast,   // FROM -- EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]}
             ch_kmers,      // FROM -- GET_KMERS_PROFILE.out.combined_csv
-            nt_hits,       // FROM -- NUCLEOT_DIAMOND.out.reformed.map{it[1]}
+            nr_hits,       // FROM -- NUCLEOT_DIAMOND.out.reformed.map{it[1]}
             un_hits,       // FROM -- UNIPROT_DIAMOND.out.reformed.map{it[1]}
             [],            // <-- MARKER SCAN -- NOT IN PIPELINE YET
             [],            // <-- CONTIGVIZ -- NOT IN PIPELINE YET
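The reworked btk_busco trigger is easier to audit when factored into its three routes: the step must not be excluded, and it then runs when 'ALL' is requested, when the run mode is 'mandatory' and the step is included, or when the run mode is 'conditional', autofiltering is included, and the autofilter flagged the assembly as abnormal. A plain-Python sketch of the same predicate (booleans stand in for the channel values):

    def should_run_btk_busco(include, exclude, run_mode="conditional", run_btk="NORMAL"):
        if "btk_busco" in exclude:
            return False
        if "ALL" in include:
            return True
        if "btk_busco" not in include:
            return False
        if run_mode == "mandatory":
            return True
        # conditional mode: run only when the autofilter step judged the assembly abnormal
        return run_mode == "conditional" and "autofilter_assembly" in include and run_btk == "ABNORMAL"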
From afc73ed90373687501c97a2eb919999ebdb6e662 Mon Sep 17 00:00:00 2001
From: eeaunin
Date: Wed, 7 Aug 2024 13:59:00 +0100
Subject: [PATCH 2/3] 07.08.2024 edits

---
 .github/workflows/ci.yml            | 2 +-
 bin/ascc_merge_tables.py            | 4 ++--
 modules/local/merge_btk_datasets.nf | 4 ++--
 modules/local/sanger_tol_btk.nf     | 8 +++-----
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 24d41b5..76fa33c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -139,4 +139,4 @@ jobs:
           # For example: adding multiple test runs with different parameters
           # Remember that you can parallelise this by using strategy.matrix
         run: |
-          nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude busco_btk
+          nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude btk_busco

diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py
index 932f505..6045600 100755
--- a/bin/ascc_merge_tables.py
+++ b/bin/ascc_merge_tables.py
@@ -35,7 +35,7 @@ def parse_args():
     parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file")
     parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file")
     parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file")
-    parser.add_argument("-bb", "--busco_btk", type=str, help="Busco Blobtoolkit file")
+    parser.add_argument("-bb", "--btk_busco", type=str, help="Busco Blobtoolkit file")
     parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file")
     parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample")
     parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file")
@@ -284,7 +284,7 @@ def main(args):
     paths_dict["cobiontid_markerscan"] = args.markerscan
     paths_dict["contigviz"] = args.contigviz
     paths_dict["blobtoolkit"] = args.blobtoolkit
-    paths_dict["btk_busco"] = args.busco_btk
+    paths_dict["btk_busco"] = args.btk_busco
     paths_dict["fcs_gx"] = args.fcs_gx
 
     required_files = ["gc_content"]

diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf
index 707c33a..7a81801 100644
--- a/modules/local/merge_btk_datasets.nf
+++ b/modules/local/merge_btk_datasets.nf
@@ -9,7 +9,7 @@ process MERGE_BTK_DATASETS {
 
     input:
     tuple val(meta), path(create_btk_datasets)
-    tuple val(meta2), path(busco_btk_datasets)
+    tuple val(meta2), path(btk_busco_datasets)
 
     output:
     tuple val(meta), path("merged_datasets"), emit: merged_datasets
@@ -29,7 +29,7 @@ process MERGE_BTK_DATASETS {
     merge_btk_datasets.py \\
         -m $create_btk_datasets \\
         -o ./merged_datasets \\
-        -b $busco_btk_datasets \\
+        -b $btk_busco_datasets \\
         $args
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf
index 009bb27..b73e326 100644
--- a/modules/local/sanger_tol_btk.nf
+++ b/modules/local/sanger_tol_btk.nf
@@ -32,7 +32,7 @@ process SANGER_TOL_BTK {
     def profiles = task.ext.profiles ?: ""
     def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET"
     def btk_config = btk_config_file ? "-c $btk_config_file" : ""
-    def pipeline_version = task.ext.version ?: "main"
+    def pipeline_version = task.ext.version ?: "draft_assemblies"
 
     // YAML used to avoid the use of GCA accession number
     // https://github.com/sanger-tol/blobtoolkit/issues/77
@@ -49,9 +49,7 @@ process SANGER_TOL_BTK {
         --input "\$(realpath $samplesheet_csv)" \\
         --outdir ${prefix}_btk_out \\
         --fasta "\$(realpath REFERENCE.fa)" \\
-        --yaml "\$(realpath BTK.yaml)" \\
-        --busco_lineages $busco_lineages \\
-        --accession draft \\
+        --busco_lineages eukaryota_odb10 \\
        --taxon $taxon \\
         --taxdump "\$(realpath $tax_dump)" \\
         --blastp "\$(realpath blastp.dmnd)" \\
@@ -72,7 +70,7 @@ process SANGER_TOL_BTK {
 
     stub:
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def pipeline_version = task.ext.version ?: "main"
+    def pipeline_version = task.ext.version ?: "draft_assemblies"
 
     """
     mkdir -p ${prefix}_btk_out/blobtoolkit/$gca_accession
required_files = ["gc_content"] diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index 707c33a..7a81801 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -9,7 +9,7 @@ process MERGE_BTK_DATASETS { input: tuple val(meta), path(create_btk_datasets) - tuple val(meta2), path(busco_btk_datasets) + tuple val(meta2), path(btk_busco_datasets) output: tuple val(meta), path("merged_datasets"), emit: merged_datasets @@ -29,7 +29,7 @@ process MERGE_BTK_DATASETS { merge_btk_datasets.py \\ -m $create_btk_datasets \\ -o ./merged_datasets \\ - -b $busco_btk_datasets \\ + -b $btk_busco_datasets \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 009bb27..b73e326 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -32,7 +32,7 @@ process SANGER_TOL_BTK { def profiles = task.ext.profiles ?: "" def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" def btk_config = btk_config_file ? "-c $btk_config_file" : "" - def pipeline_version = task.ext.version ?: "main" + def pipeline_version = task.ext.version ?: "draft_assemblies" // YAML used to avoid the use of GCA accession number // https://github.com/sanger-tol/blobtoolkit/issues/77 @@ -49,9 +49,7 @@ process SANGER_TOL_BTK { --input "\$(realpath $samplesheet_csv)" \\ --outdir ${prefix}_btk_out \\ --fasta "\$(realpath REFERENCE.fa)" \\ - --yaml "\$(realpath BTK.yaml)" \\ - --busco_lineages $busco_lineages \\ - --accession draft \\ + --busco_lineages eukaryota_odb10 \\ --taxon $taxon \\ --taxdump "\$(realpath $tax_dump)" \\ --blastp "\$(realpath blastp.dmnd)" \\ @@ -72,7 +70,7 @@ process SANGER_TOL_BTK { stub: def prefix = task.ext.prefix ?: "${meta.id}" - def pipeline_version = task.ext.version ?: "main" + def pipeline_version = task.ext.version ?: "draft_assemblies" """ mkdir -p ${prefix}_btk_out/blobtoolkit/$gca_accession From e3b8db9c75f553ab3aaab0848961f8f654e43b03 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Wed, 7 Aug 2024 14:09:12 +0100 Subject: [PATCH 3/3] ran linting with black --- bin/autofilter.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/autofilter.py b/bin/autofilter.py index d843308..8c1dc4e 100755 --- a/bin/autofilter.py +++ b/bin/autofilter.py @@ -42,9 +42,9 @@ def parse_args(): help="Path to the assembly_autofiltered.fasta file", default="autofiltered.fasta", ) - #parser.add_argument( + # parser.add_argument( # "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" - #) + # ) parser.add_argument( "-r", "--rejected_seq", @@ -57,7 +57,11 @@ def parse_args(): "-n", "--ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy" ) parser.add_argument( - "--tiara_action_mode", type=str, choices=["warn", "remove"], default="warn", help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). Default: warn" + "--tiara_action_mode", + type=str, + choices=["warn", "remove"], + default="warn", + help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). 
Default: warn", ) parser.add_argument("-v", "--version", action="version", version=VERSION) return parser.parse_args() @@ -182,7 +186,7 @@ def main(): tiara_results_path = args.tiara fcs_gx_summary_path = args.fcsgx_summary filtered_assembly_path = args.output_auto_filtered - #combined_summary = args.fcs_gx_and_tiara_summary + # combined_summary = args.fcs_gx_and_tiara_summary excluded_seq_list_path = args.rejected_seq ncbi_rankedlist = args.ncbi_rankedlineage_path @@ -238,4 +242,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()