diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2bc5487b..76fa33c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,6 +72,18 @@ jobs: run: | curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest_V2.tar.gz | tar xzf - + - name: Temporary ASCC Diamond Data + run: | + curl https://dp24.cog.sanger.ac.uk/ascc/diamond.dmnd -o diamond.dmnd + + - name: Temporary BLASTN Data + run: | + curl https://dp24.cog.sanger.ac.uk/blastn.tar.gz | tar xzf - + + - name: Temporary Accession2TaxID Data + run: | + curl https://dp24.cog.sanger.ac.uk/ascc/accession2taxid.tar.gz | tar -xzf - + - name: Download the NCBI taxdump database run: | mkdir ncbi_taxdump @@ -120,10 +132,11 @@ jobs: run: | mkdir vecscreen curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf - + ls -lh - name: Singularity - Run FULL pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,singularity --outdir ./results --steps ALL + nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude btk_busco diff --git a/.nf-core.yml b/.nf-core.yml index d1422793..5a60a58e 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -19,5 +19,4 @@ lint: nextflow_config: - manifest.name - manifest.homePage - multiqc_config: - - report_comment + multiqc_config: False diff --git a/assets/btk_draft.yaml b/assets/btk_draft.yaml new file mode 100644 index 00000000..0e023513 --- /dev/null +++ b/assets/btk_draft.yaml @@ -0,0 +1,17 @@ +assembly: + level: bar +settings: + foo: 0 +similarity: + diamond_blastx: + foo: 0 +taxon: + class: class_name + family: family_name + genus: genus_name + kingdom: kingdom_name + name: species_name + order: order_name + phylum: phylum_name + superkingdom: superkingdom_name + taxid: 0 diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index fde2e2e8..d1929eb7 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -12,18 +12,20 @@ kmer_len: 7 dimensionality_reduction_methods: "pca,random_trees" # all available methods # "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" -nt_database: /home/runner/work/ascc/ascc/NT_database/ -nt_database_prefix: 18S_fungal_sequences +nt_database: /home/runner/work/ascc/ascc/blastdb/ +nt_database_prefix: tiny_plasmodium_blastdb.fa nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2 -ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/ +ncbi_accessionids_folder: /home/runner/work/ascc/ascc/20240709_tiny_accession2taxid/ ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/ ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages +busco_lineages: "diptera_odb10,insecta_odb10" fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/ -diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd -diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd 
+diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond.dmnd +diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond.dmnd vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/ seqkit: - sliding: 6000 - window: 100000 + sliding: 100000 + window: 6000 n_neighbours: 13 +btk_yaml: /home/runner/work/ascc/ascc/assets/btk_draft.yaml diff --git a/assets/test.yaml b/assets/test.yaml index 122db327..3922933c 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -12,18 +12,20 @@ kmer_len: 7 dimensionality_reduction_methods: "pca,random_trees" # all available methods # "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" -nt_database: /data/blastdb/Supported/NT/202308/dbv4/ -nt_database_prefix: nt -nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt -ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/ -ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/ +nt_database: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_blast_tiny_testdb/blastdb/ +nt_database_prefix: tiny_plasmodium_blastdb.fa +nt_kraken_db_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/kraken2/kraken2/ +ncbi_accessionids_folder: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/20240709_tiny_accession2taxid/ +ncbi_taxonomy_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages -fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb +busco_lineages: "diptera_odb10,insecta_odb10" +fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb/ vecscreen_database_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/vecscreen/ -diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd -diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd +diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd +diamond_nr_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd seqkit: sliding: 100000 window: 6000 n_neighbours: 13 +btk_yaml: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/btk_draft.yaml diff --git a/bin/abnormal_contamination_check.py b/bin/abnormal_contamination_check.py new file mode 100755 index 00000000..7e5bbf74 --- /dev/null +++ b/bin/abnormal_contamination_check.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 + +VERSION = "V1.0.0" + +DESCRIPTION = """ +------------------------------------- + Abnormal Contamination Check + Version = {VERSION} +------------------------------------- +Written by James Torrance +Modified by Eerik Aunin +Modified by Damon-Lee Pointon +------------------------------------- + +Script for determining if there is +enough contamination found by FCS-GX +to warrant an abnormal contamination +report alarm. 
Partially based on code +written by James Torrance +------------------------------------- + +""" + +import general_purpose_functions as gpf +import sys +import os.path +import pathlib +import argparse +import textwrap + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="Abnormal Contamination Check", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("assembly", type=str, help="Path to the fasta assembly file") + parser.add_argument("summary_path", type=str, help="Path to the tiara summary file") + parser.add_argument("-v", "--version", action="version", version=VERSION) + return parser.parse_args() + + +def get_sequence_lengths(assembly_fasta_path): + """ + Gets sequence lengths of a FASTA file and returns them as a dictionary + """ + seq_lengths_dict = dict() + fasta_data = gpf.read_fasta_in_chunks(assembly_fasta_path) + for header, seq in fasta_data: + seq_len = len(seq) + seq_lengths_dict[header] = dict() + seq_lengths_dict[header]["seq_len"] = seq_len + return seq_lengths_dict + + +def load_fcs_gx_results(seq_dict, fcs_gx_and_tiara_summary_path): + """ + Loads FCS-GX actions from the FCS-GX and Tiara results summary file, adds them to the dictionary that contains sequence lengths + """ + fcs_gx_and_tiara_summary_data = gpf.l(fcs_gx_and_tiara_summary_path) + fcs_gx_and_tiara_summary_data = fcs_gx_and_tiara_summary_data[1 : len(fcs_gx_and_tiara_summary_data)] + for line in fcs_gx_and_tiara_summary_data: + split_line = line.split(",") + assert len(split_line) == 5 + seq_name = split_line[0] + fcs_gx_action = split_line[1] + seq_dict[seq_name]["fcs_gx_action"] = fcs_gx_action + return seq_dict + + +def main(): + args = parse_args() + if os.path.isfile(args.summary_path) is False: + sys.stderr.write( + f"The FCS-GX and Tiara results file was not found at the expected location ({args.summary_path})\n" + ) + sys.exit(1) + + if os.path.isfile(args.assembly) is False: + sys.stderr.write(f"The assembly FASTA file was not found at the expected location ({args.assembly})\n") + sys.exit(1) + + seq_dict = get_sequence_lengths(args.assembly) + seq_dict = load_fcs_gx_results(seq_dict, args.summary_path) + + total_assembly_length = 0 + lengths_removed = list() + scaffolds_removed = 0 + scaffold_count = len(seq_dict) + + for seq_name in seq_dict: + seq_len = seq_dict[seq_name]["seq_len"] + if seq_dict[seq_name]["fcs_gx_action"] == "EXCLUDE": + lengths_removed.append(seq_len) + scaffolds_removed += 1 + total_assembly_length += seq_len + + alarm_threshold_for_parameter = { + "TOTAL_LENGTH_REMOVED": 1e7, + "PERCENTAGE_LENGTH_REMOVED": 3, + "LARGEST_SCAFFOLD_REMOVED": 1.8e6, + } + + report_dict = { + "TOTAL_LENGTH_REMOVED": sum(lengths_removed), + "PERCENTAGE_LENGTH_REMOVED": 100 * sum(lengths_removed) / total_assembly_length, + "LARGEST_SCAFFOLD_REMOVED": max(lengths_removed, default=0), + "SCAFFOLDS_REMOVED": scaffolds_removed, + "PERCENTAGE_SCAFFOLDS_REMOVED": 100 * scaffolds_removed / scaffold_count, + } + + for param in report_dict: + sys.stderr.write(f"{param}: {report_dict[param]}\n") + + fcs_gx_alarm_indicator_path = f"fcs-gx_alarm_indicator_file.txt" + pathlib.Path(fcs_gx_alarm_indicator_path).unlink(missing_ok=True) + + alarm_list = [] + stage1_decon_pass_flag = True + for param in alarm_threshold_for_parameter: + param_value = report_dict[param] + alarm_threshold = alarm_threshold_for_parameter[param] + + # IF CONTAMINATING SEQ FOUND FILL FILE WITH ABNORMAL CONTAM + if param_value > 
alarm_threshold_for_parameter[param]: + stage1_decon_pass_flag = False + alarm_list.append( + f"YES_ABNORMAL_CONTAMINATION: Stage 1 decon alarm triggered for {param}: the value for this parameter in this assembly is {param_value} | alarm threshold is {alarm_threshold}\n" + ) + + # Seperated out to ensure that the file is written in one go and doesn't confuse Nextflow + with open(fcs_gx_alarm_indicator_path, "a") as f: + f.write("".join(alarm_list)) + + # IF NO CONTAM FILL FILE WITH NO CONTAM + if stage1_decon_pass_flag is True: + alarm_message = f"NO_ABNORMAL_CONTAMINATION: No scaffolds were tagged for removal by FCS-GX\n" + with open(fcs_gx_alarm_indicator_path, "a") as f: + f.write(alarm_message) + + +if __name__ == "__main__": + main() diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py new file mode 100755 index 00000000..6045600e --- /dev/null +++ b/bin/ascc_merge_tables.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 + +VERSION = "2.0.0" +DESCRIPTION = """ +Script for merging contaminant check results into one table +Version: {VERSION} +--- +Written by Eerik Anuin + +Re-Written by Damon-Lee Pointon (dp24/DLBPointon) +""" + +import argparse +import pandas as pd +import textwrap +import os +import sys +import general_purpose_functions as gpf + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="AsccMergeTables", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("-gc", "--gc_cov", required=True, type=str, help="GC Coverage file") + parser.add_argument("-c", "--coverage", type=str, help="Coverage file") + parser.add_argument("-t", "--tiara", type=str, help="Tiara file") + parser.add_argument("-bk", "--bacterial_kraken", type=str, help="Bacterial Kraken file") + parser.add_argument("-nk", "--nt_kraken", type=str, help="NT Kraken file") + parser.add_argument("-nb", "--nt_blast", type=str, help="NT Blast file") + parser.add_argument("-dr", "--dim_reduction_embeddings", type=str, help="Dimensional Reduction file") + parser.add_argument("-nd", "--nr_diamond", type=str, help="NR Diamond file") + parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file") + parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file") + parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file") + parser.add_argument("-bb", "--btk_busco", type=str, help="Busco Blobtoolkit file") + parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file") + parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample") + parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file") + parser.add_argument("-v", "--version", action="version", version=VERSION) + return parser.parse_args() + + +def check_paths(paths_dict, required_files): + """ + Checks if a required file exists and exits with an error message if it doesn't + """ + out_dict = dict() + + for data_type, input_file in paths_dict.items(): + if input == None: + pass + else: + out_dict[data_type] = input_file + + return out_dict + + +def load_and_merge_dataframes(paths_dict): + """ + Loads the tables with individual variables (GC content, coverage, kmer counts etc) and combines them into one table + """ + gc_path = paths_dict["gc_content"] + df = pd.read_csv(gc_path, sep="\t", header=None) + if df.shape[0] > 0: + df.columns = ["scaff", "gc"] + df["gc"] = df["gc"] * 100 + else: + sys.stderr.write("No rows were found in the GC content table ({})\n".format(gc_path)) + 
sys.exit(1) + + coverage_df = None + if paths_dict["coverage"] is not None: + coverage_df = pd.read_csv(paths_dict["coverage"], sep=",", header=None) + if coverage_df.shape[0] > 0: + coverage_df.columns = ["scaff", "coverage"] + else: + sys.stderr.write(f"No rows were found in the coverages table ({paths_dict['coverage']})\n") + coverage_df = None + + tiara_df = None + if paths_dict["tiara"] is not None: + tiara_df = pd.read_csv(paths_dict["tiara"], sep="\t") + if tiara_df.shape[0] > 0: + tiara_df["tiara_classif"] = tiara_df["class_fst_stage"] + tiara_snd_stage_hits = tiara_df.index[tiara_df["class_snd_stage"].notnull()] + tiara_df["tiara_classif"][tiara_snd_stage_hits] = tiara_df["class_snd_stage"][tiara_snd_stage_hits] + tiara_df = tiara_df.iloc[:, [0, 3]] + tiara_df.columns = ["scaff", "tiara_classif"] + else: + sys.stderr.write("No rows were found in Tiara output table ({})\n".format(paths_dict["tiara"])) + tiara_df = None + + bacterial_kraken_df = None + if paths_dict["bacterial_kraken"] is not None: + bacterial_kraken_df = pd.read_csv(paths_dict["bacterial_kraken"], sep=",") + if bacterial_kraken_df.shape[0] > 0: + bacterial_kraken_df.rename(columns={bacterial_kraken_df.columns[0]: "scaff"}, inplace=True) + bacterial_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write( + "No rows were found in bacterial Kraken output table ({})\n".format(paths_dict["bacterial_kraken"]) + ) + bacterial_kraken_df = None + + nt_kraken_df = None + if paths_dict["nt_kraken"] is not None: + nt_kraken_df = pd.read_csv(paths_dict["nt_kraken"], sep=",") + if nt_kraken_df.shape[0] > 0: + nt_kraken_df.rename(columns={nt_kraken_df.columns[0]: "scaff"}, inplace=True) + nt_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write("No rows were found in nt Kraken output table ({})\n".format(paths_dict["nt_kraken"])) + nt_kraken_df = None + + dim_reduction_df = None + if paths_dict["dim_reduction_embeddings"] is not None: + dim_reduction_df = pd.read_csv(paths_dict["dim_reduction_embeddings"], sep=",") + if dim_reduction_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in kmers dimensionality reduction output table ({})\n".format( + paths_dict["dim_reduction_embeddings"] + ) + ) + dim_reduction_df = None + + btk_df = None + if paths_dict["blobtoolkit"] is not None: + btk_df = pd.read_csv(paths_dict["blobtoolkit"], header=0, delimiter="\t") + if btk_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in the BlobToolKit results table ({})\n".format(paths_dict["blobtoolkit"]) + ) + sys.exit(1) + btk_renaming_dict = {"identifiers": "scaff", "bestsum_phylum": "btk_bestsum_phylum"} + if "mapped_hifi_reads_sorted_cov" in btk_df.columns: + btk_renaming_dict["mapped_hifi_reads_sorted_cov"] = "btk_cov" + if "bestsum_phylum" in btk_df.columns: + btk_renaming_dict["bestsum_phylum"] = "btk_bestsum_phylum" + # {"identifiers": "scaff", "mapped_hifi_reads_sorted_cov": "btk_cov", "bestsum_phylum": "btk_bestsum_phylum"} + + btk_df.rename(columns=btk_renaming_dict, inplace=True) + + btk_selected_cols = [ + col for col in btk_df.columns if col in ["scaff", "length", "btk_cov", "btk_bestsum_phylum"] + ] + if len(btk_selected_cols) > 0: + btk_df = btk_df[btk_selected_cols] + else: + btk_df = None + + btk_busco_df = None + if paths_dict["btk_busco"] is not None: + btk_busco_df = pd.read_csv(paths_dict["btk_busco"], header=0, delimiter="\t") + if btk_busco_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in the BUSCO-based BlobToolKit 
results table ({})\n".format(paths_dict["btk_busco"]) + ) + sys.exit(1) + btk_busco_renaming_dict = {"identifiers": "scaff"} + + btk_busco_df.rename(columns=btk_busco_renaming_dict, inplace=True) + + btk_busco_selected_cols = [ + col + for col in btk_busco_df.columns + if col + in [ + "scaff", + "buscogenes_superkingdom", + "buscogenes_kingdom", + "buscogenes_phylum", + "buscogenes_class", + "buscogenes_order", + "buscogenes_family", + "buscogenes_genus", + "buscogenes_species", + "buscoregions_superkingdom", + "buscoregions_kingdom", + "buscoregions_phylum", + "buscoregions_class", + "buscoregions_order", + "buscoregions_family", + "buscoregions_genus", + "buscoregions_species", + ] + ] + if len(btk_busco_selected_cols) > 0: + btk_busco_df = btk_busco_df[btk_busco_selected_cols] + else: + btk_busco_df = None + + fcs_gx_df = None + if paths_dict["fcs_gx"] is not None: + fcs_gx_df = pd.read_csv(paths_dict["fcs_gx"], sep=",") + if fcs_gx_df.shape[0] == 0: + sys.stderr.write("No rows were found in FCS-GX output table ({})\n".format(paths_dict["fcs_gx"])) + fcs_gx_df = None + + nt_blast_df = None + if paths_dict["nt_blast"] is not None: + nt_blast_df = pd.read_csv(paths_dict["nt_blast"], sep=",") + if nt_blast_df.shape[0] == 0: + sys.stderr.write("No rows were found in nt BLAST output table ({})\n".format(paths_dict["nt_blast"])) + nt_blast_df = None + + nr_diamond_df = None + if paths_dict["nr_diamond"] is not None: + nr_diamond_df = pd.read_csv(paths_dict["nr_diamond"], sep=",") + if nr_diamond_df.shape[0] == 0: + sys.stderr.write("No rows were found in nr Diamond output table ({})\n".format(paths_dict["nr_diamond"])) + nr_diamond_df = None + + uniprot_diamond_df = None + if paths_dict["uniprot_diamond"] is not None: + uniprot_diamond_df = pd.read_csv(paths_dict["uniprot_diamond"], sep=",") + if uniprot_diamond_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in Uniprot Diamond output table ({})\n".format(paths_dict["uniprot_diamond"]) + ) + uniprot_diamond_df = None + + cobiontid_markerscan_df = None + if paths_dict["cobiontid_markerscan"] is not None: + cobiontid_markerscan_df = pd.read_csv(paths_dict["cobiontid_markerscan"], sep=",") + if cobiontid_markerscan_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in CobiontID MarkerScan output table ({})\n".format( + paths_dict["cobiontid_markerscan"] + ) + ) + uniprot_diamond_df = None + + contigviz_df = None + if paths_dict["contigviz"] is not None: + contigviz_df = pd.read_csv(paths_dict["contigviz"], sep=",") + if contigviz_df.shape[0] == 0: + sys.stderr.write("No rows were found in ContigViz output table ({})\n".format(paths_dict["contigviz"])) + contigviz_df = None + + if coverage_df is not None: + df = pd.merge(df, coverage_df, on="scaff", how="outer") + if tiara_df is not None: + df = pd.merge(df, tiara_df, on="scaff", how="outer") + if bacterial_kraken_df is not None: + df = pd.merge(df, bacterial_kraken_df, on="scaff", how="outer") + if nt_kraken_df is not None: + df = pd.merge(df, nt_kraken_df, on="scaff", how="outer") + if dim_reduction_df is not None: + df = pd.merge(df, dim_reduction_df, on="scaff", how="outer") + if nt_blast_df is not None: + df = pd.merge(df, nt_blast_df, on="scaff", how="outer") + if nr_diamond_df is not None: + df = pd.merge(df, nr_diamond_df, on="scaff", how="outer") + if uniprot_diamond_df is not None: + df = pd.merge(df, uniprot_diamond_df, on="scaff", how="outer") + if fcs_gx_df is not None: + df = pd.merge(df, fcs_gx_df, on="scaff", how="outer") + if cobiontid_markerscan_df is not 
None: + df = pd.merge(df, cobiontid_markerscan_df, on="scaff", how="outer") + if contigviz_df is not None: + df = pd.merge(df, contigviz_df, on="scaff", how="outer") + if btk_df is not None: + df = pd.merge(df, btk_df, on="scaff", how="outer") + if btk_busco_df is not None: + df = pd.merge(df, btk_busco_df, on="scaff", how="outer") + + return df + + +def main(args): + paths_dict = dict() + paths_dict["gc_content"] = args.gc_cov + paths_dict["coverage"] = args.coverage + paths_dict["tiara"] = args.tiara + paths_dict["bacterial_kraken"] = args.bacterial_kraken + paths_dict["nt_kraken"] = args.nt_kraken + paths_dict["nt_blast"] = args.nt_blast + paths_dict["dim_reduction_embeddings"] = args.dim_reduction_embeddings + paths_dict["nr_diamond"] = args.nr_diamond + paths_dict["uniprot_diamond"] = args.uniprot_diamond + paths_dict["cobiontid_markerscan"] = args.markerscan + paths_dict["contigviz"] = args.contigviz + paths_dict["blobtoolkit"] = args.blobtoolkit + paths_dict["btk_busco"] = args.btk_busco + paths_dict["fcs_gx"] = args.fcs_gx + + required_files = ["gc_content"] + + paths_dict = check_paths(paths_dict, required_files) + df = load_and_merge_dataframes(paths_dict) + df.to_csv(f"{args.sample_name}_contamination_check_merged_table.csv", index=False) + + if ( + paths_dict["nt_blast"] + and paths_dict["nr_diamond"] + and paths_dict["uniprot_diamond"] + and paths_dict["coverage"] + and paths_dict["tiara"] + and paths_dict["nt_kraken"] + ): + process_results_tables_command = f"process_result_tables.py . {args.sample_name}" + gpf.run_system_command(process_results_tables_command) + else: + sys.stderr.write( + f"Skipping generating the {args.sample_name}_phylum_counts_and_coverage.csv file, as the variables used in this run do not include all the required variables for this (nt_blast, nr_diamond, uniprot_diamond, coverage, tiara, nt_kraken)\n" + ) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/bin/ascc_shorten_fasta_headers.py b/bin/ascc_shorten_fasta_headers.py new file mode 100755 index 00000000..f12f6e05 --- /dev/null +++ b/bin/ascc_shorten_fasta_headers.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +VERSION = "1.1.0" +DESCRIPTION = f""" +--- +Script for shortening FASTA headers, by splitting the header and keeping only the first element +Version: {VERSION} +--- + +Written by Eerik Aunin (ea10) + +Modified by Damon-Lee Pointon (@dp24/@DLBPointon) + +""" + +# MIT License +# +# Copyright (c) 2020-2022 Genome Research Ltd. +# +# Author: Eerik Aunin (ea10@sanger.ac.uk) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import general_purpose_functions as gpf +import argparse +import textwrap +import sys +import tempfile + + +def parse_args(argv=None): + parser = argparse.ArgumentParser( + prog="ascc_shorten_fasta_headers", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("fasta_path", type=str, help="Path to input FASTA file") + parser.add_argument( + "--delimiter", + type=str, + help="Delimiter string for splitting FASTA headers. Default: any whitespace character", + default="", + ) + parser.add_argument("--allow_duplicate_headers", dest="allow_duplicate_headers", action="store_true") + parser.add_argument("-v", "--version", action="version", version=VERSION) + return parser.parse_args(argv) + + +def main(fasta_path, delimiter, allow_duplicate_headers): + + with tempfile.TemporaryDirectory() as tmp_dir: + input_file = fasta_path + if fasta_path.endswith(".gz") or fasta_path.endswith('.gz"'): + input_file = "{}/input_file.fa".format(tmp_dir) + gpf.run_system_command("gunzip -c {} > {}".format(fasta_path, input_file)) + + headers_list = list() + in_data = gpf.ll(input_file) + + for line in in_data: + out_line = line + if line.startswith(">"): + if delimiter == "": + out_line = line.split()[0] + else: + out_line = line.split(delimiter)[0] + if out_line in headers_list and allow_duplicate_headers is False: + sys.stderr.write( + "Duplicate FASTA headers ({}) were found in the input file ({}) after truncating the headers with a delimiter\n".format( + out_line[1 : len(out_line)], fasta_path + ) + ) + sys.exit(1) + headers_list.append(out_line) + print(out_line) + + +if __name__ == "__main__": + args = parse_args() + main(args.fasta_path, args.delimiter, args.allow_duplicate_headers) diff --git a/bin/autofilter.py b/bin/autofilter.py new file mode 100755 index 00000000..8c1dc4e4 --- /dev/null +++ b/bin/autofilter.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 + +VERSION = "V1.0.0" + +DESCRIPTION = """ +------------------------------------- + Autofilter + Version = {VERSION} +------------------------------------- +Written by Eerik Aunin +Modified by Damon-Lee Pointon +------------------------------------- + +Script for filtering the assembly to +remove putative contaminants based on +FCS-GX and Tiara results. 
+------------------------------------- + +""" + +from pathlib import Path +import general_purpose_functions as gpf +import os +import sys +import argparse +import textwrap + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="Autofilter", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("fasta", type=str, help="Path to the assembly FASTA file") + parser.add_argument("-t", "--tiara", type=str, help="Path to the Tiara summary file") + parser.add_argument("-s", "--fcsgx_summary", type=str, help="Path to the fcs-gx_summary.csv file") + parser.add_argument( + "-o", + "--output_auto_filtered", + type=str, + help="Path to the assembly_autofiltered.fasta file", + default="autofiltered.fasta", + ) + # parser.add_argument( + # "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" + # ) + parser.add_argument( + "-r", + "--rejected_seq", + type=str, + help="Path to the assembly_filtering_removed_sequences.txt file", + default="assembly_filtering_removed_sequences.txt", + ) + parser.add_argument("-i", "--taxid", type=int, help="NCBI taxonomy ID of the species") + parser.add_argument( + "-n", "--ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy" + ) + parser.add_argument( + "--tiara_action_mode", + type=str, + choices=["warn", "remove"], + default="warn", + help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). Default: warn", + ) + parser.add_argument("-v", "--version", action="version", version=VERSION) + return parser.parse_args() + + +def get_domain_from_taxid(query_taxid, rankedlineage_path): + """ + Input: 1) a taxID, 2) path to the NCBI rankedlineage.dmp file + Output: domain classification corresponding to the taxID + """ + domain = None + query_taxid = str(query_taxid) + rankedlineage_data = gpf.ll(rankedlineage_path) + for line in rankedlineage_data: + split_line = line.split("|") + split_line = [n.strip() for n in split_line] + assert len(split_line) == 11 + taxid = split_line[0] + domain = split_line[9] + if taxid == query_taxid: + domain = split_line[9] + if domain not in ("", "Archaea", "Bacteria", "Eukaryota", "Viruses"): + sys.stderr.write(f"Unrecognised value for domain-level taxonomy: {domain}") + sys.exit(1) + break + if domain is None: + sys.stderr.write( + "The domain for taxid ({}) was not found in the NCBI rankedlineage.dmp file ({})\n".format( + query_taxid, rankedlineage_path + ) + ) + sys.exit(1) + return domain + + +def process_tiara_results(tiara_results_path, target_domain): + """ + Input: 1) path to the main output file of Tiara, 2) the domain of the target species + Output: dictionary where the keys are scaffold names and the values are the decontamination action based on Tiara results + ('keep' or 'exclude') + """ + tiara_action_dict = dict() + + allowed_classif_dict = dict() + allowed_classif_dict[""] = ["archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown"] + allowed_classif_dict["Archaea"] = ["archaea", "prokarya", "unknown"] + allowed_classif_dict["Bacteria"] = ["bacteria", "prokarya", "unknown"] + allowed_classif_dict["Eukaryota"] = ["eukarya", "organelle", "unknown"] + allowed_classif_dict["Viruses"] = ["archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown"] + allowed_classif_list = 
allowed_classif_dict[target_domain] + + tiara_output = gpf.ll(tiara_results_path) + for counter, line in enumerate(tiara_output): + if counter == 0: + continue + split_line = line.split() + assert len(split_line) == 3 + tiara_class_fst_stage = split_line[1] + assert tiara_class_fst_stage in ("archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown") + tiara_action = "KEEP" + if tiara_class_fst_stage not in allowed_classif_list: + tiara_action = "EXCLUDE" + scaff = split_line[0] + tiara_action_dict[scaff] = tiara_action + return tiara_action_dict + + +def get_fcs_gx_action_dict(fcs_gx_summary_path): + """ + Input: path to FCS-GX summary CSV file (produced by ascc_parse_fcsgx_results.py) + Output: dictionary where the keys are scaffold names and the values are the FCS-GX action values + """ + fcs_gx_action_dict = dict() + fcs_gx_summary_data = gpf.ll(fcs_gx_summary_path) + for counter, line in enumerate(fcs_gx_summary_data): + if counter == 0: + continue + split_line = line.split(",") + scaff = split_line[0] + fcs_gx_action = split_line[8] + fcs_gx_action_dict[scaff] = fcs_gx_action + return fcs_gx_action_dict + + +def get_scaff_names(assembly_path): + """ + Reads FASTA headers from a FASTA file and returns them as a list + """ + scaffs = list() + fasta_data = gpf.read_fasta_in_chunks(assembly_path) + for fasta_tuple in fasta_data: + scaffs.append(fasta_tuple[0]) + return scaffs + + +def filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path): + """ + Filters a genome assembly FASTA file to remove sequences that are listed in the scaffs_to_exclude list + """ + out_list = list() + fasta_data = gpf.read_fasta_in_chunks(assembly_path) + for header, seq in fasta_data: + if header not in scaffs_to_exclude: + out_list.append(">" + header) + split_seq = gpf.split_with_fixed_row_length(seq, 80) + out_list.extend(split_seq) + else: + sys.stderr.write( + f"Excluding the sequence {header} from the filtered assembly ({filtered_assembly_path}), as it appears to be a contaminant based on FCS-GX and/or Tiara results\n" + ) + gpf.export_list_as_line_break_separated_file(out_list, filtered_assembly_path) + + +def main(): + args = parse_args() + if args.taxid == -1: + sys.stderr.write( + "The filtering of assembly based on FCS-GX and Tiara results requires a taxID but a valid taxID has not been provided (the provided taxID is -1, which is a placeholder value)\n" + ) + + assembly_path = args.fasta + tiara_results_path = args.tiara + fcs_gx_summary_path = args.fcsgx_summary + filtered_assembly_path = args.output_auto_filtered + # combined_summary = args.fcs_gx_and_tiara_summary + excluded_seq_list_path = args.rejected_seq + ncbi_rankedlist = args.ncbi_rankedlineage_path + + Path(f"./fasta/filtered").mkdir(parents=True, exist_ok=True) + + for i in [ncbi_rankedlist, tiara_results_path, fcs_gx_summary_path, assembly_path]: + if not os.path.isfile(i): + sys.stderr.write(f"{i} was not at the expected location\n") + sys.exit(1) + + target_domain = get_domain_from_taxid(args.taxid, ncbi_rankedlist) + tiara_action_dict = process_tiara_results(tiara_results_path, target_domain) + fcs_gx_action_dict = get_fcs_gx_action_dict(fcs_gx_summary_path) + + combined_action_dict = dict() + scaffs_to_exclude = list() + scaffs = get_scaff_names(assembly_path) + for scaff in scaffs: + fcs_gx_action = "NA" + tiara_action = "NA" + if scaff in fcs_gx_action_dict: + fcs_gx_action = fcs_gx_action_dict[scaff] + combined_action_source = "FCS-GX" + if scaff in tiara_action_dict: + tiara_action = 
tiara_action_dict[scaff] + combined_action = fcs_gx_action + if fcs_gx_action == "NA" and tiara_action == "EXCLUDE": + if args.tiara_action_mode == "remove": + combined_action = "EXCLUDE" + combined_action_source = "Tiara" + elif args.tiara_action_mode == "warn": + combined_action = "WARN" + combined_action_source = "Tiara" + if fcs_gx_action == "EXCLUDE" and tiara_action == "EXCLUDE": + combined_action_source = "FCS-GX_and_Tiara" + if combined_action == "EXCLUDE": + scaffs_to_exclude.append(scaff) + combined_action_dict[scaff] = { + "fcs_gx_action": fcs_gx_action, + "tiara_action": tiara_action, + "combined_action": combined_action, + "combined_action_source": combined_action_source, + } + filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path) + gpf.export_list_as_line_break_separated_file(scaffs_to_exclude, excluded_seq_list_path) + + out_csv_list = list() + out_csv_list.append("scaff,fcs_gx_action,tiara_action,combined_action") + for scaff, scaff_properties in combined_action_dict.items(): + out_line = f"{scaff},{scaff_properties['fcs_gx_action']},{scaff_properties['tiara_action']},{scaff_properties['combined_action']},{scaff_properties['combined_action_source']}" + out_csv_list.append(out_line) + gpf.export_list_as_line_break_separated_file(out_csv_list, "ABNORMAL_CHECK.csv") + + +if __name__ == "__main__": + main() diff --git a/bin/convert_to_hits.py b/bin/convert_to_hits.py new file mode 100755 index 00000000..37915f2f --- /dev/null +++ b/bin/convert_to_hits.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +VERSION = "2.0.0" + +DESCRIPTION = """ +Script for getting the top hits of Diamond BLASTX against the nr database. Top hits per each scaffold are determined from the BLASTX results of scaffold chunks. +The output is reformatted into a CSV table +Argument1: path to Diamond BLASTX results, with the following columns: +"qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "end", "evalue", "bitscore", "staxids", "sscinames", "sskingdoms", "sphylums", "salltitles" + +Version: {VERSION} +--- +Written by Eerik Aunin + +Re-Written by Damon-Lee Pointon (dp24/DLBPointon) +""" +import general_purpose_functions as gpf +from collections import OrderedDict +import textwrap +import argparse + + +def save_file(output_list, name): + with open(f"{name}_diamond_blastx_top_hits.csv", "w") as f: + for line in output_list: + f.write(f"{line}\n") + + +def main(in_path, diamond_database_title): + in_data = gpf.ll(in_path) + + top_hits_dict = OrderedDict() + output = [] + + colnames = ( + "scaff", + "diamond_{0}_bitscore", + "diamond_{0}_staxids", + "diamond_{0}_sscinames", + "diamond_{0}_sskingdoms", + "diamond_{0}_sphylums", + "diamond_{0}_salltitles", + ) + colnames = [n.format(diamond_database_title) for n in colnames] + + for line in in_data: + line = line.replace(",", " ") + line = line.replace("<>", " ") + split_line = line.split("\t") + seq_name = split_line[0] + seq_name = seq_name.split("_sliding")[0] + bitscore = float(split_line[11]) + if seq_name not in top_hits_dict: + top_hits_dict[seq_name] = dict() + top_hits_dict[seq_name]["bitscore"] = bitscore + top_hits_dict[seq_name]["line"] = line + else: + if bitscore > top_hits_dict[seq_name]["bitscore"]: + top_hits_dict[seq_name]["bitscore"] = bitscore + top_hits_dict[seq_name]["line"] = line + + output.append(",".join(colnames)) + for seq_name in top_hits_dict: + out_line = top_hits_dict[seq_name]["line"] + out_line = out_line.split("\t") + out_line = ",".join(out_line[11 : len(out_line)]) + 
out_line = seq_name + "," + out_line + output.append(out_line) + + save_file(output, diamond_database_title) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Convert File to hits", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("in_path", type=str, help="Path to Diamond BLASTX results") + parser.add_argument("diamond_database_title", type=str, help="Name of the Diamond database (e.g. nr or Uniprot)") + parser.add_argument("-v", "--version", action="version", version=VERSION) + args = parser.parse_args() + main(args.in_path, args.diamond_database_title) diff --git a/bin/create_btk_dataset.py b/bin/create_btk_dataset.py new file mode 100755 index 00000000..69638fdf --- /dev/null +++ b/bin/create_btk_dataset.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 + +VERSION = "2.0.0" +DESCRIPTION = f""" +--- +Script for creating a BlobToolKit dataset from the ASCC output files +Version: {VERSION} +--- + +Written by Eerik Aunin (ea10) + +Modified by Damon-Lee Pointon (@dp24/@DLBPointon) + +""" + +import general_purpose_functions as gpf +import argparse +from pathlib import Path +import textwrap +import sys +import os.path + + +def parse_args(argv=None): + parser = argparse.ArgumentParser( + prog="createBTKdatasets", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("-n", "--name", required=True, type=str, help="Assembly name (for the output files)") + parser.add_argument( + "-tn", + "--taxon_name", + required=True, + type=str, + help="The Taxon name of the assembly (Scientific name of the species + subspecies if applicable)", + ) + parser.add_argument("-id", "--taxid", required=True, type=int, help="Taxon ID of the assembly") + parser.add_argument( + "-td", "--taxdump", required=True, type=str, help="Path to the directory containing the NCBI taxdump" + ) + parser.add_argument("-f", "--fasta", required=True, type=str, help="The path for the assembly fasta file") + parser.add_argument( + "-d", + "--dataset", + type=str, + required=True, + help="The folder containing the data generated throughout the pipeline", + ) + parser.add_argument("-bh", "--blastn_hits", default="N", type=str, help="Path to the BLASTN hits file") + parser.add_argument( + "-ud", "--uniprot_diamond_hits", default="N", type=str, help="Path to the UNIPROT diamond BlastX hits file" + ) + parser.add_argument("-nr", "--nr_diamond_hits", default="N", type=str, help="Path to the DIAMOND BlastX hits file") + parser.add_argument( + "-r", "--mapped_reads", default="N", type=str, help="Path to mapped reads BAM for coverage estimation" + ) + parser.add_argument("-t", "--tiara", default="N", type=str, help="Path to the tiara_out.txt file") + parser.add_argument( + "-p", "--pca", default="N", type=str, help="Path to the kmers_dim_reduction_embeddings.csv file" + ) + parser.add_argument("-fc", "--fcs_gx", default="N", type=str, help="Path to the fcs-gx_summary.csv.csv file") + parser.add_argument("-k", "--kraken", default="N", type=str, help="Path to the nt_kraken_lineage.txt file") + parser.add_argument("-ms", "--markerscan", default="N", type=str, help="Path to the cobiontid_markerscan.csv file") + parser.add_argument("-cv", "--contigviz", default="N", type=str, help="Path to the contigviz_results.csv file") + parser.add_argument("-o", "--output", default="btk_datasets", type=str, help="Output directory") + parser.add_argument("--threads", type=int, default=1, 
help="Number of threads to utilise") + parser.add_argument("--alias", type=str, default="", help="Assembly alias") + parser.add_argument( + "--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)" + ) + parser.add_argument("-v", "--version", action="version", version=VERSION) + + return parser.parse_args(argv) + + +def create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name): + """ + Creates the assembly YAML file for creating a BlobToolKit dataset + """ + if ".gz" in assembly_alias: + assembly_alias = assembly_alias.replace(".gz", "_gz") + out_string = "assembly:\n accession: NA\n alias: {}\n record_type: scaffold\n bioproject: NA\n biosample: NA\ntaxon:\n name: {}".format( + assembly_alias, taxon_name + ) + with open(assembly_yaml_path, "w") as f: + f.write(out_string) + + +def tiara_results_to_btk_format(tiara_results_path, outfile_path): + """ + Reformatting Tiara output file so that the summarised results of the first and second pass of Tiara can be + added to a BlobToolKit dataset + """ + tiara_data = gpf.l(tiara_results_path) + tiara_data = tiara_data[1 : len(tiara_data)] + with open(outfile_path, "w") as f: + f.write("identifier\ttiara\n") + for line in tiara_data: + split_line = line.split() + if len(split_line) != 3: + sys.stderr.write("Failed to parse the Tiara results file {}\n".format(tiara_results_path)) + sys.exit(1) + first_pass_result = split_line[1] + second_pass_result = split_line[2] + if second_pass_result != "n/a": + first_pass_result = second_pass_result + f.write(split_line[0] + "\t" + first_pass_result + "\n") + + +def detect_dim_reduction_methods(kmers_dim_reduction_output_path): + """ + Parses the header of the kmers dimensionality reduction report file to detect which dimensionality reduction methods were used + """ + header_string = None + with open(kmers_dim_reduction_output_path) as f: + header_string = f.readline() + header_string = header_string.strip() + split_header = header_string.split(",") + dim_reduction_methods = list() + for header_item in split_header: + if header_item.startswith("embedding_"): + if header_item.startswith("embedding_x_"): + header_item = header_item.split("embedding_x_")[1] + elif header_item.startswith("embedding_y_"): + header_item = header_item.split("embedding_y_")[1] + if header_item not in dim_reduction_methods: + dim_reduction_methods.append(header_item) + return dim_reduction_methods + + +def main(args): + command_list = [] + + assembly_alias = args.name if args.alias == "" else args.alias + + edited_assembly_title = args.name.replace(".", "_").replace(" ", "_") + + assembly_yaml_path = args.output + "/" + edited_assembly_title + "BTK_DS.yaml" + + if args.dry_run == False: + Path(args.dataset).mkdir(parents=True, exist_ok=True) + create_assembly_yaml(assembly_yaml_path, assembly_alias, args.taxon_name) + + # Base command for new BTK Dataset + blobtools_create_command = f"blobtools create --fasta {args.fasta} --meta {assembly_yaml_path} --taxid {args.taxid} --taxdump {args.taxdump} {args.output}" + gpf.run_system_command(blobtools_create_command, dry_run=args.dry_run) + + # ADDING BLAST HIT DATA TO BTK + hits_file_paths = [args.blastn_hits, args.uniprot_diamond_hits, args.nr_diamond_hits] + + hits_file = [n for n in hits_file_paths if n != "N" and os.path.isfile(n) is True and os.stat(n).st_size > 0] + + if len(hits_file) > 0: + add_hits_command = "blobtools add" + for file in hits_file_paths: + add_hits_command += f" --hits {file}" + add_hits_command += f" --taxrule 
bestsum --taxdump {args.taxdump} {args.output}" + command_list.append(add_hits_command) + + # ADDING MAPPED READS DATA TO BTK + if ( + args.mapped_reads != "N" + and os.path.isfile(args.mapped_reads) is True + and os.stat(args.mapped_reads).st_size > 0 + ): + add_cov_command = f"blobtools add --cov {args.mapped_reads} --threads {args.threads} {args.output}" + command_list.append(add_cov_command) + + # ADDING TIARA + if args.tiara != "N" and os.path.isfile(args.tiara) and os.stat(args.tiara).st_size > 0: + tiara_reformatted_output_path = args.dataset + "/tiara_out_btk_format.tsv" + tiara_results_to_btk_format(args.tiara, tiara_reformatted_output_path) + add_tiara_command = f"blobtools add --text {tiara_reformatted_output_path} --text-delimiter '\t' --text-cols 'identifier=identifiers,tiara=tiara' --text-header {args.output}" + command_list.append(add_tiara_command) + + # ADDING KMER DIM REDUCTION + if args.pca != "N" and os.path.isfile(args.pca) and os.stat(args.pca).st_size > 0: + used_dim_reduction_methods = detect_dim_reduction_methods(args.pca) + for dim_reduction_method in used_dim_reduction_methods: + add_embedding_command = f"blobtools add --text {args.pca} --text-delimiter ',' --text-cols scaff=identifiers,embedding_x_{dim_reduction_method}=embedding_x_{dim_reduction_method},embedding_y_{dim_reduction_method}=embedding_y_{dim_reduction_method} --text-header {args.output}" + command_list.append(add_embedding_command) + + # ADDIND KRAKEN DATA + if args.kraken != "N" and os.path.isfile(args.kraken) and os.stat(args.kraken).st_size > 0: + for taxonomy_level in ("species", "genus", "family", "order", "class", "phylum", "kingdom", "domain"): + add_kraken_command = f"blobtools add --text {args.kraken} --text-delimiter ',' --text-cols scaff=identifiers,nt_kraken_{taxonomy_level}=nt_kraken_{taxonomy_level} --text-header {args.output}" + command_list.append(add_kraken_command) + + # ADDING FCS_GX DATA + if args.fcs_gx != "N" and os.path.isfile(args.fcs_gx) and os.stat(args.fcs_gx).st_size > 0: + add_fcs_gx_results_command = f"blobtools add --text {args.fcs_gx} --text-delimiter ',' --text-cols 'scaff=identifiers,fcs_gx_top_tax_name=fcs_gx_top_tax_name,fcs_gx_div=fcs_gx_div,fcs_gx_action=fcs_gx_action' --text-header {args.output}" + command_list.append(add_fcs_gx_results_command) + + export_table_command = f"blobtools filter --table btk_summary_table_full.tsv {args.output}" + command_list.append(export_table_command) + + # EXECUTE ALL BTK COMMANDS + for i in command_list: + gpf.run_system_command(i, dry_run=args.dry_run) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/bin/filter_fasta_by_length.py b/bin/filter_fasta_by_length.py new file mode 100755 index 00000000..a7b03e12 --- /dev/null +++ b/bin/filter_fasta_by_length.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +VERSION = "1.1.0" +DESCRIPTION = f""" +--- +Script for filtering a FASTA file by sequence length. By default, sequences shorter than a cutoff value will be removed. 
+Version = {VERSION} +--- + +Written by Eerik Aunin (ea10) + +Modified by Damon-Lee Pointon (@dp24/@DLBPointon) + +""" + +import general_purpose_functions as gpf +import textwrap +import argparse +import sys +import os + + +def parse_args(argv=None): + parser = argparse.ArgumentParser( + prog="filter_fasta_by_length", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("in_path", type=str, help="Path to input FASTA file") + parser.add_argument("cutoff", type=int, help="Cutoff value for filtering") + parser.add_argument( + "-l", + "--low_pass", + dest="low_pass", + action="store_true", + help="Optional: low pass filtering mode (sequences longer than the cutoff value will be removed)", + ) + parser.add_argument( + "--remove_original_fasta", + action="store_true", + help="Optional: remove the input FASTA file after creating the filtered FASTA file", + ) + parser.add_argument("-v", "--version", action="version", version=VERSION) + + return parser.parse_args(argv) + + +def main(args): + fasta_path = os.path.abspath(args.in_path) + if ( + args.cutoff == -1 + ): # When this script is used as a part of a pipeline, -1 can be assigned as a value for the cutoff to indicate that no filtering should be done + sys.stderr.write(f"The input FASTA sequences ({fasta_path}) will not be filtered by length\n") + # sys.exit(0) + retained_seq_count = 0 + fasta_data = gpf.read_fasta_in_chunks(fasta_path) + for header, seq in fasta_data: + if args.cutoff == -1: + print(">" + header) + gpf.print_with_fixed_row_length(seq, 80) + else: + seq_len = len(seq) + seq_len_ok_flag = True + if args.low_pass == True: + if seq_len > args.cutoff: + seq_len_ok_flag = False + sys.stderr.write( + f"Low pass filtering of FASTA sequences by length: removing sequence {header} from the assembly because its length ({seq_len}) exceeds the length cutoff ({args.cutoff})\n" + ) + else: + if seq_len < args.cutoff: + seq_len_ok_flag = False + sys.stderr.write( + f"High pass filtering of FASTA sequences by length: removing sequence {header} from the assembly because its length ({seq_len}) is below the length cutoff ({args.cutoff})\n" + ) + if seq_len_ok_flag == True: + retained_seq_count += 1 + print(">" + header) + gpf.print_with_fixed_row_length(seq, 80) + # print(header, seq_len, seq_len_ok_flag) + if args.cutoff != -1 and retained_seq_count == 0: + sys.stderr.write( + f"No sequences remain in the FASTA file {fasta_path} after filtering the sequences by length (cutoff: {args.cutoff} bp)\n" + ) + sys.exit(1) + + if args.remove_original_fasta is True: + os.remove(fasta_path) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/bin/find_taxid_in_taxdump.py b/bin/find_taxid_in_taxdump.py new file mode 100755 index 00000000..930ee12c --- /dev/null +++ b/bin/find_taxid_in_taxdump.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 + +VERSION = "1.1.0" +DESCRIPTION = f""" +--- +Script for checking if the TaxID given by the user exists in the NCBI taxdump +Version: {VERSION} +--- + +Written by Eerik Aunin (ea10) + +Modified by Damon-Lee Pointon (@dp24/@DLBPointon) + +""" + +import argparse +import textwrap +import general_purpose_functions as gpf +import sys +import os + + +def parse_args(argv=None): + parser = argparse.ArgumentParser( + prog="find_taxid_in_taxdump", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("query_taxid", type=int, help="Query taxonomy ID") + parser.add_argument( + 
"taxdump_nodes_path", + type=str, + help="Path to the nodes.dmp file of NCBI taxdump (downloaded from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz)", + ) + parser.add_argument("-v", "--version", action="version", version=VERSION) + + return parser.parse_args(argv) + + +def main(query_taxid, taxdump_nodes_path): + if query_taxid == -1: + sys.exit(0) + query_taxid = str(query_taxid) + if os.path.isfile(taxdump_nodes_path) is False: + sys.stderr.write("The NCBI taxdump nodes file ({}) was not found\n".format(taxdump_nodes_path)) + sys.exit(1) + nodes_data = gpf.ll(taxdump_nodes_path) + taxid_found_flag = False + for counter, line in enumerate(nodes_data): + split_line = line.split("|") + if len(split_line) > 2: + taxid = split_line[0].strip() + if taxid == query_taxid: + taxid_found_flag = True + break + else: + sys.stderr.write( + "Failed to parse the NCBI taxdump nodes.dmp file ({}) at line {}:\n".format( + taxdump_nodes_path, counter + 1 + ) + ) + sys.stderr.write(line + "\n") + sys.exit(1) + + if taxid_found_flag is False: + sys.stderr.write( + "The TaxID given by the user ({}) was not found in the NCBI taxdump nodes.dmp file ({})\n".format( + query_taxid, taxdump_nodes_path + ) + ) + sys.exit(1) + + +if __name__ == "__main__": + args = parse_args() + main(args.query_taxid, args.taxdump_nodes_path) diff --git a/bin/generate_samplesheet.py b/bin/generate_samplesheet.py new file mode 100755 index 00000000..12af7059 --- /dev/null +++ b/bin/generate_samplesheet.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse + +""" +A simple script to generate a csv file required for the sanger-tol/blobtoolkit pipeline-module. + +Required input include the sample ID and the mapped BAM file generated with PacBio data and input FASTA assembly + +Written by Damon-Lee Pointon (dp24/DLBPointon) +""" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Generate a csv file for BTK") + parser.add_argument("sample_name", type=str, help="Name of sample") + parser.add_argument( + "mapped_bam_file", + type=str, + help="Path containing the mapped BAM generated with PacBio data and the ASCC input assembly", + ) + parser.add_argument("-v", "--version", action="version", version="1.0.0") + return parser.parse_args() + + +def main(): + args = parse_args() + + data_list = [] + + data_list.append("sample,datatype,datafile\n") + if args.mapped_bam_file.endswith(".bam"): + data_list.append(f"{args.sample_name},pacbio,{args.mapped_bam_file}\n") + else: + sys.exit("I was expecting a mapped BAM file") + + with open("samplesheet.csv", "w") as file: + file.write("".join(data_list)) + + +if __name__ == "__main__": + main() diff --git a/bin/merge_btk_datasets.py b/bin/merge_btk_datasets.py new file mode 100755 index 00000000..fcc6c9cb --- /dev/null +++ b/bin/merge_btk_datasets.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 + +VERSION = "2.0.0" +DESCRIPTION = f""" +--- +Script for merging BlobToolKit datasets from the createBTKdatasets output directory. 
+Version: {VERSION} +--- + +Written by Eerik Aunin (ea10) + +Modified by Damon-Lee Pointon (@dp24/@DLBPointon) + +""" + +import json +from pathlib import Path +import shutil +import os +import sys +import argparse +import textwrap +import general_purpose_functions as gpf +import gzip + + +def parse_args(argv=None): + parser = argparse.ArgumentParser( + prog="mergeBTKdatasets", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument( + "-m", "--main_btk_datasets", required=True, type=str, help="The btk_datasets generated by createBTKdatasets" + ) + parser.add_argument( + "-b", + "--btk_busco_datasets", + type=str, + help="Path to the BTK dataset (blobdir) created by the BUSCO-based BTK pipeline", + ) + parser.add_argument( + "-s", + "--btk_busco_summary_full", + type=str, + help="The btk_datasets generated by createBTKdatasets", + ) + parser.add_argument( + "-o", + "--new_output_directory", + default="merged_datasets", + type=str, + help="The new output directory for the merged datasets", + ) + parser.add_argument("-v", "--version", action="version", version=VERSION) + + return parser.parse_args(argv) + + +def load_json(filename): + """ + Loads a JSON file and returns it as a dictionary + """ + json_contents = None + if filename.endswith(".gz"): + with gzip.open(filename, "rt", encoding="UTF-8") as zipfile: + json_contents = json.load(zipfile) + else: + with open(filename) as f: + json_contents = json.load(f) + return json_contents + + +def create_meta_json_contents(main_btk_dataset_folder, btk_busco_dataset_folder): + """ + Creates the contents for the meta.json file for the new BTK dataset by combining the two meta.json files from the input directories + """ + for folder in (main_btk_dataset_folder, btk_busco_dataset_folder): + if os.path.isdir(folder) is False: + sys.stderr.write( + f"Skipping the merging of the main BTK dataset and the BUSCO-based BTK dataset, as directory {folder} was not found)\n" + ) + sys.exit(0) + + main_btk_json_path = f"{main_btk_dataset_folder}/meta.json" + btk_busco_json_path = f"{btk_busco_dataset_folder}/meta.json.gz" + for json_path in (main_btk_json_path, btk_busco_json_path): + if os.path.isfile(json_path) is False: + sys.stderr.write(f"File {json_path} not found)\n") + sys.exit(1) + + main_meta_dict = load_json(main_btk_json_path) + btk_busco_meta_dict = load_json(btk_busco_json_path) + + merged_dict = btk_busco_meta_dict.copy() + + keys_to_skip = [] + fields = main_meta_dict["fields"] + for field in fields: + field_id = field["id"] + + if field_id == "taxonomy": + btk_main_taxonomy_field = field.copy() + btk_main_taxonomy_field["id"] = "btk_main_taxonomy" + btk_main_taxonomy_field["name"] = "btk_main_taxonomy" + merged_dict["fields"].append(btk_main_taxonomy_field) + else: + if field_id not in keys_to_skip: + merged_dict["fields"].append(field) + return merged_dict + + +def detect_buscogenes_variables(merged_jsons_dict): + """ + Goes through the content of merged meta.json file (derived from both BTK datasets) and detects if buscogenes + variables are present + """ + buscogenes_present_flag = False + fields = merged_jsons_dict["fields"] + for field in fields: + field_id = field["id"] + if field_id == "taxonomy": + for item in field["children"]: + if item["id"] == "buscogenes": + buscogenes_present_flag = True + break + return buscogenes_present_flag + + +def main(args): + if os.path.isdir(args.main_btk_datasets) is False: + sys.stderr.write(f"The BlobToolKit dataset 
({args.main_btk_datasets}) was not found!\n") + sys.exit(1) + + if os.path.isdir(args.btk_busco_datasets) is False: + sys.stderr.write( + f"The blobdir of BUSCO-based BlobToolKit Snakemake pipeline run does not exist at {args.btk_busco_datasets}, skipping the merging of BTK datasets\n" + ) + sys.exit(0) + + not_copying_list = [ + "identifiers.json.gz", + "gc_data.json.gz", + "length_data.json.gz", + "ncount_data.json.gz", + "meta.json.gz", + ] + + Path(args.new_output_directory).mkdir(parents=True, exist_ok=True) + + main_btk_dataset_files = [ + f for f in os.listdir(args.main_btk_datasets) if os.path.isfile(os.path.join(args.main_btk_datasets, f)) + ] + main_btk_dataset_files = [f for f in main_btk_dataset_files if f not in not_copying_list] + for main_btk_dataset_file in main_btk_dataset_files: + main_btk_dataset_file_full_path = f"{args.main_btk_datasets}/{main_btk_dataset_file}" + copied_file_full_path = os.path.abspath(f"{args.new_output_directory}/{main_btk_dataset_file}") + shutil.copy(main_btk_dataset_file_full_path, copied_file_full_path) + + btk_busco_files = [ + f for f in os.listdir(args.btk_busco_datasets) if os.path.isfile(os.path.join(args.btk_busco_datasets, f)) + ] + for btk_busco_file in btk_busco_files: + btk_busco_file_full_path = f"{args.btk_busco_datasets}/{btk_busco_file}" + copied_file_full_path = os.path.abspath(f"{args.new_output_directory}/{btk_busco_file}") + shutil.copy(btk_busco_file_full_path, copied_file_full_path) + + merged_jsons_dict = create_meta_json_contents(args.main_btk_datasets, args.btk_busco_datasets) + meta_json_outpath = f"{args.new_output_directory}/meta.json" + + with open(meta_json_outpath, "w") as json_outfile: + json.dump(merged_jsons_dict, json_outfile, indent=1, sort_keys=True) + + buscogenes_present_flag = detect_buscogenes_variables(merged_jsons_dict) + + btk_busco_table_outpath = f"{args.new_output_directory}/btk_busco_summary_table_full.tsv" + + btk_busco_table_exporting_command = f"blobtools filter --table {btk_busco_table_outpath} --table-fields identifiers,buscoregions_superkingdom,buscoregions_kingdom,buscoregions_phylum,buscoregions_class,buscoregions_order,buscoregions_family,buscoregions_genus,buscoregions_species" + if buscogenes_present_flag == True: + btk_busco_table_exporting_command += ",buscogenes_superkingdom,buscogenes_kingdom,buscogenes_phylum,buscogenes_class,buscogenes_order,buscogenes_family,buscogenes_genus,buscogenes_species" + btk_busco_table_exporting_command += f" {args.new_output_directory}" + + gpf.run_system_command(btk_busco_table_exporting_command) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/bin/process_result_tables.py b/bin/process_result_tables.py new file mode 100755 index 00000000..dc1ca398 --- /dev/null +++ b/bin/process_result_tables.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +""" +Script processing the cobiont check result tables to add a combined classification ('merged_classif') column that is based + on the output of multiple tools. 
Also generates a table for estimated coverages per 'merged_classif' column +""" + +import pandas as pd +import numpy as np +import argparse +import sys +import os + + +def generate_counts_df(df, counts_df_output_path): + """ + Creates a table that shows the number of sequences, mean coverage and sequence span per phylum + """ + merged_classif_counts = df["merged_classif"].value_counts(dropna=False) + cov_list = list() + span_list = list() + for classif_item, _ in merged_classif_counts.iteritems(): + ind = list(np.where(df["merged_classif"] == classif_item)[0]) + cov_values = df.iloc[ind, df.columns.get_loc("coverage")] + length_values = df.iloc[ind, df.columns.get_loc("length")] + cov_list.append(round(np.mean(cov_values), 2)) + span_sum = sum(length_values) / 1000000 + span_list.append(round(span_sum, 2)) + + counts_df = merged_classif_counts.to_frame() + counts_df["mean_coverage"] = cov_list + counts_df["span_mb"] = span_list + + counts_df.rename(columns={"merged_classif": "number_of_sequences"}, inplace=True) + counts_df.index.name = "phylum" + counts_df.to_csv(counts_df_output_path, index=True) + + +def main(output_folder, sample_id): + main_results_table_path = "{}/{}_contamination_check_merged_table.csv".format(output_folder, sample_id) + btk_results_table_path = "{}/btk_summary_table_full.tsv".format(output_folder) + output_df_path = "{}/{}_contamination_check_merged_table_extended.csv".format(output_folder, sample_id) + counts_df_output_path = "{}/{}_phylum_counts_and_coverage.csv".format(output_folder, sample_id) + + if os.path.isdir(output_folder) == False: + sys.stderr.write( + "The directory with the output tables of the pipeline ({}) was not found\n".format(output_folder) + ) + sys.exit(1) + + if os.path.isfile(main_results_table_path) == False: + sys.stderr.write("The main results table file ({}) was not found\n".format(main_results_table_path)) + sys.exit(1) + + if os.path.isfile(btk_results_table_path) == False: + sys.stderr.write( + "Skipping the exporting of extended results table because the BlobToolKit dataset ({}) was not found\n".format( + btk_results_table_path + ) + ) + sys.exit(0) + + main_df = None + main_df = pd.read_csv(main_results_table_path) + if main_df.shape[0] == 0: + sys.stderr.write("No rows were found in cobiont check results table ({})\n".format(main_results_table_path)) + sys.exit(1) + + if "btk_bestsum_phylum" not in main_df.columns: + sys.stderr.write( + "Column 'btk_bestsum_phylum' was not found in results table ({})\n".format(main_results_table_path) + ) + sys.exit(1) + + df = main_df + df["merged_classif"] = df["btk_bestsum_phylum"] + df["merged_classif_source"] = "btk_bestsum_phylum" + + if "nt_kraken_phylum" in df.columns: + ind = list(np.where(df["merged_classif"] == "no-hit")[0]) + ind2 = list(np.where(df["nt_kraken_phylum"].isna())[0]) + ind3 = [n for n in ind if n not in ind2] + + df.iloc[ind3, df.columns.get_loc("merged_classif")] = df.iloc[ind3, df.columns.get_loc("nt_kraken_phylum")] + df.iloc[ind3, df.columns.get_loc("merged_classif_source")] = "nt_kraken_phylum" + + if "tiara_classif" in df.columns: + tiara_ind = list(np.where(df["merged_classif"] == "no-hit")[0]) + tiara_ind2 = list(np.where(df["tiara_classif"].isna())[0]) + tiara_ind3 = list(np.where(df["tiara_classif"] == "unknown")[0]) + tiara_ind = [n for n in tiara_ind if n not in tiara_ind2 and n not in tiara_ind3] + df.iloc[tiara_ind, df.columns.get_loc("merged_classif")] = df.iloc[ + tiara_ind, df.columns.get_loc("tiara_classif") + ] + df.iloc[tiara_ind, 
df.columns.get_loc("merged_classif_source")] = "tiara" + + df["merged_classif"] = df["merged_classif"].replace("bacteria", "Bacteria-undef") + df["merged_classif"] = df["merged_classif"].replace("eukarya", "Eukaryota-undef") + df["merged_classif"] = df["merged_classif"].replace("prokarya", "Prokaryota-undef") + df["merged_classif"] = df["merged_classif"].replace("archaea", "Archaea-undef") + + df.to_csv(output_df_path, index=False) + os.remove(main_results_table_path) + # os.rename(output_df_path, main_results_table_path) To remove Nextflow confussion + generate_counts_df(df, counts_df_output_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("output_folder", type=str, help="Path to the directory with the output tables of the pipeline") + parser.add_argument("sample_id", type=str, help="ToL ID of the sample") + args = parser.parse_args() + main(args.output_folder, args.sample_id) diff --git a/bin/sam_to_sorted_indexed_bam.py b/bin/sam_to_sorted_indexed_bam.py old mode 100644 new mode 100755 diff --git a/bin/trim_Ns.py b/bin/trim_Ns.py new file mode 100755 index 00000000..059cce1b --- /dev/null +++ b/bin/trim_Ns.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python +""" +Script for detecting trailing Ns that should be trimmed from an assembly, from James Torrance (jt8@sanger.ac.uk). Edited by Eerik Aunin (ea10@sanger.ac.uk) +""" + +import re +from Bio import SeqIO +import argparse + + +def main(fasta_file, output_file): + minleftover = 200 # after trimming start/end, at least this many bp should be left + winsize = 5000 # for sliding window analysis + minslidingBase = 0.4 # maximum fraction of Ns in sliding window before alarm sets off + + output_handle = open(output_file, "w") + with open(fasta_file, "r") as fasta_input_handle: + for record in SeqIO.parse(fasta_input_handle, "fasta"): + # output_handle.write('Testing ' + record.id + "\n") + + # Do this twice: once with the regular string, once with the reversed string + # The output should be as one or more variables that can be reversed for the second iteration + seq_string = str(record.seq) + + n_count = seq_string.count("N") + seq_string.count("n") + n_perc = n_count / len(seq_string) + if n_perc > 0.8: + output_handle.write( + "# WARNING: " + + record.id + + "\t" + + str(int(n_perc * 10000) / 100) + + " % Ns of total " + + str(len(seq_string)) + + "\n" + ) + + # Consider handling this via two separate regexps for start and end. 
Adding character-class might be escalating run-time + # terminal_n_match = re.match('^([Nn]*)(\S+?)([Nn]*)$', seq_string) + start_n_match = re.match("^([Nn]*)", seq_string) + end_n_match = re.search("^([Nn]*)", seq_string[::-1]) + + startseq = "" + if start_n_match: + startseq = start_n_match.group(1) + endseq = "" + if end_n_match: + endseq = end_n_match.group(1) + realseq_length = len(seq_string) - len(startseq) - len(endseq) + # Handle "all Ns exception" + if len(startseq) == len(seq_string): + realseq_length = 0 + endseq = "" + + if len(startseq) > 0 or len(endseq) > 0: + if len(startseq) > 0: + output_handle.write("\t".join(["TRIM:", record.id, "1", str(len(startseq))]) + "\t\n") + if len(endseq) > 0: + output_handle.write( + "\t".join(["TRIM:", record.id, str(len(startseq) + realseq_length + 1), str(len(seq_string))]) + + "\t\n" + ) + + if realseq_length <= minleftover: + output_handle.write( + "REMOVE: " + record.id + "\t" + str(realseq_length) + " bp leftover after trimming\n" + ) + + # Only attempt the windowing approach if there's an initial window that doesn't meet the trigger condition + for seq_string_loop, seq_string_for_window in enumerate([seq_string, seq_string[::-1]]): + if ( + len(seq_string_for_window) > winsize + and (seq_string_for_window[:winsize].count("N") + seq_string_for_window[:winsize].count("n")) + > winsize * minslidingBase + ): + non_n_regions = [] + non_n_iterator = re.finditer("[^Nn]+", seq_string_for_window) + for non_n_instance in non_n_iterator: + # output_handle.write(record.id + '\t' + str(non_n_instance.start(0)) + '\t' + str(non_n_instance.end(0)) + '\n') + + current_non_n_start = non_n_instance.start(0) + 1 + current_non_n_end = non_n_instance.end(0) + + non_n_regions.insert(0, [current_non_n_start, current_non_n_end]) + + # Does the *end* of this block satisfy the window condition? 
+ bases_in_window = 0 + start_of_first_base_block_in_window = None + for non_n_region in non_n_regions: + if non_n_region[1] >= current_non_n_end - winsize: + start_of_first_base_block_in_window = non_n_region[0] + if non_n_region[0] >= current_non_n_end - winsize: + bases_in_window += non_n_region[1] - non_n_region[0] + 1 + else: + bases_in_window += non_n_region[1] - non_n_region[0] + 1 + break + else: + break + + if bases_in_window >= minslidingBase * winsize: + # Because start positions have BED-style + + # Remember that the blocks are in *reverse* order along the sequence + trackback_to_start_flag = False + + for i, non_n_region in enumerate(non_n_regions): + if i == 0: + continue + bases_in_window_2 = 0 + if non_n_region[1] < non_n_regions[0][0] - winsize: + break + else: + current_window_start = max(non_n_region[0], non_n_regions[0][0] - winsize) + + # Count the bases from this block + bases_in_window_2 += ( + min(non_n_region[1], (current_window_start + winsize - 1)) + - current_window_start + + 1 + ) + + # Add up all the sequence in blocks tested thus far, but not the final block + for j in range(1, i): + bases_in_window_2 += non_n_regions[j][1] - non_n_regions[j][0] + 1 + + # Count the sequence in the final block that can contribute + # This is the region between the start of the final block and either the end of the block + # or the end of a window extending from the current test start point, whichever comes first + bases_in_window_2 += ( + min(non_n_regions[0][1], (current_window_start + winsize - 1)) + - non_n_regions[0][0] + + 1 + ) + + # output_handle.write('BIW: ' + str(i) + ' ' + str(bases_in_window_2) + "\n") + if bases_in_window_2 >= minslidingBase * winsize: + if current_window_start == non_n_region[0]: + start_of_first_base_block_in_window = current_window_start + else: + start_of_first_base_block_in_window = non_n_regions[i - 1][0] + else: + # We keep going back until the threshold condition is *not* met + break + + # If we find the break-point should be before the first block, then we don't want to trim at all! 
+ if i == len(non_n_regions) - 1: + trackback_to_start_flag = True + + # Only trim if the breakpoint isn't before the first block + # and if the breakpoint isn't at the start of the sequence + if not (trackback_to_start_flag) and start_of_first_base_block_in_window != 1: + if seq_string_loop == 0: + output_handle.write( + "FWDCLIP:\t" + + record.id + + "\t1\t" + + str(start_of_first_base_block_in_window - 1) + + "\n" + ) + else: + output_handle.write( + "REVCLIP:\t" + + record.id + + "\t" + + str(len(seq_string) - start_of_first_base_block_in_window + 2) + + "\t" + + str(len(seq_string)) + + "\n" + ) + + break + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("fasta_file", type=str, help="Path to input FASTA file") + parser.add_argument("output_file", type=str, help="Path for output report file") + parser.add_argument("-v", "--version", action="version", version="1.0") + args = parser.parse_args() + main(args.fasta_file, args.output_file) diff --git a/conf/base.config b/conf/base.config index 4e764169..01fb4653 100644 --- a/conf/base.config +++ b/conf/base.config @@ -15,11 +15,17 @@ process { memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } - withName: BLAST_BLASTN { + withName: BLAST_BLASTN_MOD { memory = { check_max( 50.GB * task.attempt, 'memory' ) } time = { check_max( 12.h * task.attempt, 'time' ) } } + withName: DIAMOND_BLASTX { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 50.GB * task.attempt, 'memory' ) } + time = { check_max( 20.h * task.attempt, 'time' ) } + } + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } maxRetries = 1 diff --git a/conf/modules.config b/conf/modules.config index 0820f4a9..530430c3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,18 +12,74 @@ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] + withName: SANGER_TOL_BTK { + ext.args = "--blastx_outext 'txt'" + ext.executor = "bsub -Is -tty -e test.e -o test.log -n 2 -q oversubscribed -M1400 -R'select[mem>1400] rusage[mem=1400] span[hosts=1]'" + ext.profiles = "singularity,sanger" + ext.get_versions = "lsid | head -n1 | cut -d ',' -f 1" + ext.version = "draft_assemblies" + publishDir = [ + path: { "${params.outdir}/sanger-tol-btk" }, + mode: params.publish_dir_mode, + ] + } + + withName: "AUTOFILTER_AND_CHECK_ASSEMBLY|CREATE_BTK_DATASET|ORGANELLE_CONTAMINATION_RECOMMENDATIONS|FILTER_BARCODE|SUMMARISE_VECSCREEN_OUTPUT|MERGE_BTK_DATASETS" { + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + + withName: AUTOFILTER_AND_CHECK_ASSEMBLY { + publishDir = [ + path: { "${params.outdir}/" }, + mode: params.publish_dir_mode, + pattern: "autofiltering_done_indicator_file.txt" + ] + } + + withName: GenIndicator { + publishDir = [ + path: { "${params.outdir}/" }, + mode: params.publish_dir_mode + ] + } + + withName: ASCC_MERGE_TABLES { + publishDir = [ + path: { "${params.outdir}/ASCC-main-output" }, + mode: params.publish_dir_mode + ] + } + + withName: FILTER_FASTA { + ext.args = "--low_pass --remove_original_fasta" + ext.cutoff = 1900000000 + } withName: SEQKIT_SLIDING { ext.args = {"-s ${meta.sliding} -W ${meta.window} "} } - withName: BLAST_CHUNK_TO_FULL { - ext.args = 'nucleotide' + withName: '.*:.*:EXTRACT_NT_BLAST:BLAST_CHUNK_TO_FULL' { + ext.args = "nucleotide" + } + + withName: '.*:.*:NUCLEOT_DIAMOND:DIAMOND_BLAST_CHUNK_TO_FULL' { + ext.args = "diamond" + } + + withName: '.*:.*:NUCLEOT_DIAMOND:CONVERT_TO_HITS_FILE' { + ext.args = "nr" + } + + withName: '.*:.*:UNIPROT_DIAMOND:DIAMOND_BLAST_CHUNK_TO_FULL' { + ext.args = "diamond" + } + + withName: '.*:.*:UNIPROT_DIAMOND:CONVERT_TO_HITS_FILE' { + ext.args = "Uniprot" } withName: BLAST_MAKEBLASTDB { @@ -36,6 +92,10 @@ process { ext.dbprefix = '*' } + withName: DIAMOND_BLASTX { + ext.args = { "--sensitive --max-target-seqs 3 --evalue 1e-25 --no-unlink --tmpdir ./" } + } + withName: '.*:EXTRACT_NT_BLAST:BLAST_BLASTN_MOD' { ext.args = { '-outfmt "6 qseqid staxids bitscore std" -max_target_seqs 10 -max_hsps 1 -evalue 1e-25 -dust yes -lcase_masking' } ext.dbprefix = { "${meta2.id}" } @@ -75,6 +135,13 @@ process { ext.prefix = { "${meta.id}_euk" } } + withName: "FCS_FCSADAPTOR_EUK|FCS_FCSADAPTOR_PROK" { + publishDir = [ + path: { "${params.outdir}/FCS-adaptor" }, + mode: params.publish_dir_mode, + ] + } + withName: SED_SED { ext.prefix = { "${meta.id}_fixed" } ext.args = " -e '/>/s/ //g' " diff --git a/conf/test_full.config b/conf/test_full.config deleted file mode 100644 index 338f7c8b..00000000 --- a/conf/test_full.config +++ /dev/null @@ -1,31 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running full-size tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a full size pipeline test. 
- - Use as follows: - nextflow run sanger-tol/ascc -profile test_full, --outdir - ----------------------------------------------------------------------------------------- -*/ -process { - maxForks = 1 -} - -executor { - queueSize=1 -} - -params { - config_profile_name = 'Full test profile' - config_profile_description = 'Full test dataset to check pipeline function' - - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' - - // Fasta references - fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz' -} diff --git a/docs/images/ascc-1.0.0-diamond-blast.drawio.png b/docs/images/ascc-1.0.0-diamond-blast.drawio.png new file mode 100644 index 00000000..e4ba9f9d Binary files /dev/null and b/docs/images/ascc-1.0.0-diamond-blast.drawio.png differ diff --git a/docs/images/ascc-1.0.0-extract-blast.drawio.png b/docs/images/ascc-1.0.0-extract-blast.drawio.png new file mode 100644 index 00000000..85a5818b Binary files /dev/null and b/docs/images/ascc-1.0.0-extract-blast.drawio.png differ diff --git a/docs/images/ascc-1.0.0-extract-tiara.drawio.png b/docs/images/ascc-1.0.0-extract-tiara.drawio.png new file mode 100644 index 00000000..9a502521 Binary files /dev/null and b/docs/images/ascc-1.0.0-extract-tiara.drawio.png differ diff --git a/docs/images/ascc-1.0.0-fcsadaptor.drawio.png b/docs/images/ascc-1.0.0-fcsadaptor.drawio.png new file mode 100644 index 00000000..1b4baf58 Binary files /dev/null and b/docs/images/ascc-1.0.0-fcsadaptor.drawio.png differ diff --git a/docs/images/ascc-1.0.0-generate-genome.drawio.png b/docs/images/ascc-1.0.0-generate-genome.drawio.png new file mode 100644 index 00000000..2110ff48 Binary files /dev/null and b/docs/images/ascc-1.0.0-generate-genome.drawio.png differ diff --git a/docs/images/ascc-1.0.0-get-kmer-profile.drawio.png b/docs/images/ascc-1.0.0-get-kmer-profile.drawio.png new file mode 100644 index 00000000..dbdcdad9 Binary files /dev/null and b/docs/images/ascc-1.0.0-get-kmer-profile.drawio.png differ diff --git a/docs/images/ascc-1.0.0-kraken.drawio.png b/docs/images/ascc-1.0.0-kraken.drawio.png new file mode 100644 index 00000000..afadcdf7 Binary files /dev/null and b/docs/images/ascc-1.0.0-kraken.drawio.png differ diff --git a/docs/images/ascc-1.0.0-mapping.drawio.png b/docs/images/ascc-1.0.0-mapping.drawio.png new file mode 100644 index 00000000..efcc26ae Binary files /dev/null and b/docs/images/ascc-1.0.0-mapping.drawio.png differ diff --git a/docs/images/ascc-1.0.0-organellar.drawio.png b/docs/images/ascc-1.0.0-organellar.drawio.png new file mode 100644 index 00000000..a5a966f8 Binary files /dev/null and b/docs/images/ascc-1.0.0-organellar.drawio.png differ diff --git a/docs/images/ascc-1.0.0-pacbio-check.drawio.png b/docs/images/ascc-1.0.0-pacbio-check.drawio.png new file mode 100644 index 00000000..00f5e748 Binary files /dev/null and b/docs/images/ascc-1.0.0-pacbio-check.drawio.png differ diff --git a/docs/images/ascc-1.0.0-read-coverage.drawio.png b/docs/images/ascc-1.0.0-read-coverage.drawio.png new file mode 100644 index 00000000..1e86f664 Binary files /dev/null and b/docs/images/ascc-1.0.0-read-coverage.drawio.png differ diff --git 
a/docs/images/ascc-1.0.0-trailing-ns.drawio.png b/docs/images/ascc-1.0.0-trailing-ns.drawio.png new file mode 100644 index 00000000..84f7d421 Binary files /dev/null and b/docs/images/ascc-1.0.0-trailing-ns.drawio.png differ diff --git a/docs/images/ascc-1.0.0-vecscreen.drawio.png b/docs/images/ascc-1.0.0-vecscreen.drawio.png new file mode 100644 index 00000000..c4b1d181 Binary files /dev/null and b/docs/images/ascc-1.0.0-vecscreen.drawio.png differ diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png deleted file mode 100755 index 361d0e47..00000000 Binary files a/docs/images/mqc_fastqc_adapter.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_counts.png b/docs/images/mqc_fastqc_counts.png deleted file mode 100755 index cb39ebb8..00000000 Binary files a/docs/images/mqc_fastqc_counts.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png deleted file mode 100755 index a4b89bf5..00000000 Binary files a/docs/images/mqc_fastqc_quality.png and /dev/null differ diff --git a/docs/output.md b/docs/output.md index 5a19f952..99d0d6b2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -12,46 +12,350 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- [YamlInput](#yamlinput) - +- [Validate TaxID](#validate-taxid) - +- [Filter Fasta](#filter-fasta) - +- [GC Content](#gc-content) - +- [Generate Genome](#generate-genome) - +- [Trailing Ns Check](#trailing-ns-check) - +- [Get KMERS profile](#get-kmers-profile) - +- [Extract Tiara Hits](#extract-tiara-hits) - +- [Mito organellar blast](#mito-organellar-blast) - +- [Plastid organellar blast](#plastid-organellar-blast) - +- [Run FCS Adaptor](#run-fcs-adaptor) - +- [Run FCS-GX](#run-fcs-gx) - +- [Pacbio Barcode Check](#pacbio-barcode-check) - +- [Run Read Coverage](#run-read-coverage) - +- [Run Vecscreen](#run-vecscreen) - +- [Run NT Kraken](#run-nt-kraken) - +- [Nucleotide Diamond Blast](#nucleotide-diamond-blast) - +- [Uniprot Diamond Blast](#uniprot-diamond-blast) - +- [Create BTK dataset](#create-btk-dataset) - +- [Autofilter and check assembly](#autofilter-and-check-assembly) - +- [Generate samplesheet](#generate-samplesheet) - +- [Sanger-TOL BTK](#sanger-tol-btk) - +- [Merge BTK datasets](#merge-btk-datasets) - +- [ASCC Merge Tables](#ascc-merge-tables) - - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +### YamlInput
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `NA`
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +YamlInput parses the input yaml into channels for later use in the pipeline. -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +### Validate TaxID -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +
+Output files + +- `NA` + +
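The check itself is simple; a rough manual equivalent is sketched below, assuming a standard `nodes.dmp` from the NCBI taxdump and using the TaxID from the example YAML (the pipeline performs this check with its own Python script).

```bash
# Sketch only: look the query TaxID up in the first (pipe-delimited) column of nodes.dmp;
# exit status 0 means the TaxID exists in the taxdump.
awk -F'|' -v taxid=352914 '{gsub(/[ \t]/, "", $1)} $1 == taxid {found=1} END {exit !found}' \
    ncbi_taxdump/nodes.dmp && echo "TaxID present" || echo "TaxID missing"
```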
+
+Validate TaxID scans through the taxdump to ensure that the input TaxID is present in the NCBI taxdump.
+
+### Filter Fasta
+
+
+Output files + +- `filter/` + `*filtered.fasta` - A fasta file that has been filtered for sequences below a given threshold. + +
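As an illustration of what this step does, an equivalent manual length filter is sketched below; the cutoff matches the default in `conf/modules.config`, and seqkit is used here only for the example (the pipeline runs its bundled `filter_fasta_by_length.py` script).

```bash
# Keep only sequences at or below the default 1.9 Gb cutoff (illustrative sketch).
seqkit seq --max-len 1900000000 assembly.fasta > assembly_filtered.fasta
```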
+
+By default, scaffolds above 1.9 Gb are removed from the assembly, as scaffolds of this size are unlikely to truly be contamination. Scaffolds larger than this also consume a significant amount of compute resources, which hinders production environments.
+
+### GC Content
+
+
+Output files + +- `gc/` + `*-GC_CONTENT.txt` - A text file describing the GC content of the input genome. + +
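A back-of-the-envelope version of the calculation is shown below; the pipeline uses its `gc_content.py` script, and this awk one-liner is only an approximation that ignores Ns and ambiguity codes.

```bash
# Approximate GC content of an assembly FASTA.
awk '!/^>/ {gc += gsub(/[GCgc]/, ""); at += gsub(/[ATat]/, "")}
     END   {printf "GC%% = %.2f\n", 100 * gc / (gc + at)}' assembly.fasta
```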
+ +Calculating the GC content of the input genome. + +### Generate Genome + +
+Output files + +- `generate/` + `*.genome` - An index-like file describing the input genome. + +
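The same kind of `.genome` file can be produced by hand as sketched below, assuming samtools is available; the module's exact implementation may differ.

```bash
samtools faidx assembly.fasta
cut -f1,2 assembly.fasta.fai > assembly.genome   # columns: scaffold name, scaffold length
```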
+
+An index-like file listing each scaffold of the input genome and its length.
+
+### Trailing Ns Check
+
+
+Output files + +- `trailingns/` + `*_trim_Ns` - A text file containing a report of the Ns found in the genome. + +
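This step runs the bundled `trim_Ns.py` script; a typical invocation (file names are placeholders) is:

```bash
# Report leading/trailing N runs and N-rich window clipping suggestions for the assembly.
trim_Ns.py assembly.fasta assembly_trim_Ns_report.txt
```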
+ +A text file containing a report of the Ns found in the genome. + +### Get KMERS profile + +
+Output files + +- `get/` + `*_KMER_COUNTS.csv` - A csv file containing kmers and their counts. -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +
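The counts are produced by the bundled `get_kmers_counts.py` script, roughly as below (k-mer size 7 as in the example YAML; file names are placeholders):

```bash
get_kmers_counts.py assembly.fasta sample_KMER_COUNTS.csv --kmer_size 7
```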
+ +A csv file containing kmers and their counts. + +### Extract Tiara Hits + +
+Output files + +- `tiara/` + `*.{txt,txt.gz}` - A text file containing classifications of potential contaminants. + `log_*.{txt,txt.gz}` - A log of the tiara run. + `*.{fasta,fasta.gz}` - An output fasta file. + +
+
+Tiara classifies each scaffold (for example as bacterial, archaeal, eukaryotic, organellar or unknown), and these classifications are used downstream to flag potential contaminants.
+
+### Mito Organellar Blast
-> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
+Output files + +- `blast/` + `*.tsv` - A tsv file containing potential contaminants. + +
+ +A BlastN based subworkflow used on the input genome to filter potential contaminants from the genome. -### MultiQC +### Chloro Organellar Blast
Output files -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `blast/` + `*.tsv` - A tsv file containing potential contaminants.
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +A BlastN based subworkflow used on the input genome to filter potential contaminants from the genome. + +### Run FCS Adaptor + +
+Output files + +- `fcs/` + `*.fcs_adaptor_report.txt` - A text file containing potential adaptor sequences and locations. + `*.cleaned_sequences.fa.gz` - Cleaned fasta file. + `*.fcs_adaptor.log` - Log of the fcs run. + `*.pipeline_args.yaml` - Arguments to FCS Adaptor + `*.skipped_trims.jsonl` - Skipped sequences + +
+
+FCS-adaptor identifies potential locations of retained adaptor sequences from the sequencing run.
+
+### Run FCS-GX
+
+
+Output files + +- `fcs/` + `*out/*.fcs_gx_report.txt` - A text file containing potential contaminant locations. + `out/*.taxonomy.rpt` - Taxonomy report of the potential contaminants. + +
+
+FCS-GX identifies potential locations of contaminant sequences.
+
+### Pacbio Barcode Check
+
+
+Output files + +- `filter/` + `*_filtered.txt` - Text file of barcodes found in the genome. + +
+ +Uses BlastN to identify where given barcode sequences may be in the genome. + +### Run Read Coverage + +
+Output files + +- `samtools/` + `*.bam` - Aligned BAM file. + `*_average_coverage.txt` - Text file containing the coverage information for the genome + +
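A sketch of the average-coverage calculation from a sorted, indexed BAM is shown below; the pipeline's own mapping and coverage modules may differ in detail.

```bash
samtools depth -a mapped.bam \
    | awk '{sum += $3; n++} END {printf "%.2f\n", sum / n}' > sample_average_coverage.txt
```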
+ +Mapping the read data to the input genome and calculating the average coverage across it. + +### Run Vecscreen + +
+Output files + +- `summarise/` + `*.vecscreen_contamination` - A text file containing potential vector contaminant locations. + +
+ +Vecscreen identifies vector contamination in the input sequence. + +### Run NT Kraken + +
+Output files + +- `kraken2/` + `*.classified{.,_}*'` - Fastq file containing classified sequence. + `*.unclassified{.,_}*'` - Fastq file containing unclassified sequence. + `*classifiedreads.txt` - A text file containing a report on reads which have been classified. + `*report.txt` - Report of Kraken2 run. +- `get/` + `*txt` - Text file containing lineage information of the reported meta genomic data. + +
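An indicative Kraken2 invocation against the nt Kraken database is shown below; the options are illustrative rather than copied from the module, and the paths are placeholders.

```bash
kraken2 --db /path/to/nt_kraken_db --threads 8 \
    --report sample_report.txt \
    --classified-out sample.classified.fasta \
    --unclassified-out sample.unclassified.fasta \
    --output sample_classifiedreads.txt \
    assembly.fasta
```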
+
+Kraken2 assigns taxonomic labels to metagenomic DNA sequences and can optionally output the classified and unclassified sequences as FASTQ.
+
+### Nucleotide Diamond Blast
+
+
+Output files
+
+- `diamond/`
+  `*.txt` - A text file containing the genomic locations of hits and scores.
+- `reformat/`
+  `*text` - A reformatted text file containing the full genomic locations of hits and scores.
+- `convert/`
+  `*.hits` - A file containing all hits above the cutoff.
+
+
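The DIAMOND settings come from `conf/modules.config`; the call made by the DIAMOND_BLASTX module is roughly equivalent to the sketch below (database and file names are placeholders).

```bash
diamond blastx --db nr.dmnd --query assembly_chunk.fasta \
    --outfmt 6 --out chunk_diamond.txt \
    --sensitive --max-target-seqs 3 --evalue 1e-25 --no-unlink --tmpdir ./
```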
+
+DIAMOND BLAST is a sequence aligner for translated DNA and protein sequences; here it is used to identify contamination using the NCBI nr database.
+
+### Uniprot Diamond Blast
+
+
+Output files
+
+- `diamond/`
+  `*.txt` - A text file containing the genomic locations of hits and scores.
+- `reformat/`
+  `*text` - A reformatted text file containing the full genomic locations of hits and scores.
+- `convert/`
+  `*.hits` - A file containing all hits above the cutoff.
+
+
+
+DIAMOND BLAST is a sequence aligner for translated DNA and protein sequences; here it is used to identify contamination using the UniProt database.
+
+### Create BTK dataset
+
+
+Output files + +- `create/` + `btk_datasets/` - A btk dataset folder containing data compatible with BTK viewer. + `btk_summary_table_full.tsv` - A TSV file summarising the dataset. + +
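The dataset is built by the bundled `create_btk_dataset.py` script; a reduced form of the module's call is sketched below, showing only some of the optional inputs and using placeholder values and the staged file names from the module.

```bash
create_btk_dataset.py -f assembly.fasta -n sample_id -tn "species_name" -id 352914 \
    -td TAXDUMP/ -bh BLAST_HITS.tsv -t TIARA.txt -fc FCSGX_parsed.csv
```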
+
+Create BTK dataset creates a BTK dataset folder compatible with BlobToolKit Viewer.
+
+### Autofilter and check assembly
+
+
+Output files + +- `autofilter/` + `autofiltered.fasta` - The decontaminated input genome. + `ABNORMAL_CHECK.csv` - Combined FCS and Tiara summary of contamination. + `assembly_filtering_removed_sequences.txt` - Sequences deemed contamination and removed from the above assembly. + `fcs-gx_alarm_indicator_file.txt` - Contains text to control the running of Blobtoolkit. + +
+ +Autofilter and check assembly returns a decontaminated genome file as well as summaries of the contamination found. + +### Generate samplesheet + +
+Output files + +- `generate/` + `*.csv` - A CSV file containing data locations, for use in Blobtoolkit. + +
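The CSV is written by the bundled `generate_samplesheet.py` script and always has the same three columns, for example (paths are placeholders):

```bash
generate_samplesheet.py sample_id /path/to/sample_id_mapped.bam
cat samplesheet.csv
# sample,datatype,datafile
# sample_id,pacbio,/path/to/sample_id_mapped.bam
```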
+ +This produces a CSV containing information on the read data for use in BlobToolKit. + +### Sanger-TOL BTK + +
+Output files + +- `sanger/` + `*_btk_out/blobtoolkit/${meta.id}*/` - The BTK dataset folder generated by BTK. + `*_btk_out/blobtoolkit/plots/` - The plots for display in BTK Viewer. + `*_btk_out/blobtoolkit/${meta.id}*/summary.json.gz` - The Summary.json file... + `*_btk_out/busco/*` - The BUSCO results returned by BTK. + `*_btk_out/multiqc/*` - The MultiQC results returned by BTK. + `blobtoolkit_pipeline_info` - The pipeline_info folder. + +
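The SANGER_TOL_BTK module assembles approximately the command below; the values are placeholders taken from the test configuration, and the module additionally wraps the call in the configured executor and resolves real paths.

```bash
# blastp and blastx point at the same DIAMOND database, as in the module.
nextflow run sanger-tol/blobtoolkit -r draft_assemblies -profile singularity,sanger \
    --input samplesheet.csv --outdir sample_id_btk_out --fasta REFERENCE.fa \
    --busco_lineages diptera_odb10,insecta_odb10 --taxon 352914 \
    --taxdump /path/to/new_taxdump \
    --blastp blastp.dmnd --blastx blastp.dmnd --blastn /path/to/blastn_db
```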
+
+sanger-tol/blobtoolkit is a Nextflow re-implementation of the Snakemake-based BlobToolKit pipeline and produces interactive plots used to identify true contamination and separate contaminant sequences from the main assembly.
+
+### Merge BTK datasets
+
+
+Output files + +- `merge/` + `merged_datasets` - A BTK dataset. + `merged_datasets/btk_busco_summary_table_full.tsv` - A TSV file containing a summary of the btk busco results. + +
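The merge is performed by the bundled `merge_btk_datasets.py` script, for example (paths are placeholders):

```bash
merge_btk_datasets.py -m btk_datasets -b sample_id_btk_out/blobtoolkit/sample_id_out -o merged_datasets
```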
+
+This module merges the Create BTK dataset folder with the sanger-tol/blobtoolkit dataset to create one unified dataset for use with BlobToolKit Viewer.
+
+### ASCC Merge Tables
+
+
+Output files
+
+- `ascc/`
+  `*_contamination_check_merged_table.csv` - A CSV table combining the per-scaffold results of the contamination checks.
+  `*_contamination_check_merged_table_extended.csv` - The merged table extended with the combined `merged_classif` classification columns.
+  `*_phylum_counts_and_coverage.csv` - A CSV report containing information on the hits per phylum and the coverage of the hits.
+
+
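The tables are combined by the bundled `ascc_merge_tables.py` script; a reduced example of the module's call is shown below. Each optional flag is only passed when the corresponding input exists, and the file names follow the staged names used in the module.

```bash
ascc_merge_tables.py --gc_cov GC.txt --sample_name sample_id \
    -c average_coverage.txt -t TIARA.txt -nk LINEAGE.txt \
    -nd NUCLEOT_DIAMOND_FULL.tsv -ud UP_DIAMOND.tsv -fg FCSGX_parsed.csv
```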
-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +Merge Tables merged the summary reports from a number of modules inorder to create a single set of reports. ### Pipeline information diff --git a/docs/usage.md b/docs/usage.md index 27730271..341980d8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,59 +6,51 @@ -## Samplesheet input +## Yaml input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +### Full yaml -```bash ---input '[path to samplesheet file]' -``` - -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` - -### Full samplesheet - -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. 
- -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +```yaml +assembly_path: PATH TO INPUT FASTA +assembly_title: NAME OF INPUT ORGANISM +sci_name: "{SCIENTIFIC NAME OF ORGANISM}" +taxid: 352914 +mito_fasta_path: PATH TO MITO FASTA +plastid_fasta_path: PATH TO PLASTID FASTA +reads_path: /path/to/pacbio/fasta/ +reads_type: "hifi" +pacbio_barcodes: FULL PATH TO /ascc/assets/pacbio_adaptors.fa +pacbio_multiplexing_barcode_names: "bc2008,bc2009" {BARCODES EXPECTED IN DATA} +kmer_len: 7 +dimensionality_reduction_methods: "pca,random_trees" A CSV OF THE BELOW METHODS +# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" +nt_database: PATH TO UPTO DATE BLASTDB NT DATABASE +nt_database_prefix: PREFIX FOR THE BLASTDB DATABASE +nt_kraken_db_path: PATH+PREFIX TO THE NT KRAKEN DATABASE +ncbi_accessionids_folder: PATH TO /accession2taxid/ +ncbi_taxonomy_path: PATH TO /taxdump/ +ncbi_rankedlineage_path: PATH TO /taxdump/rankedlineage.dmp +busco_lineages_folder: PATH TO THE BUSCO LINEAGES FOLDER +fcs_gx_database_path: PATH TO FOLDER CONTAINING THE FCS_GX DB +vecscreen_database_path: PATH TO VECSCREEN DB +diamond_uniprot_database_path: PATH TO uniprot_reference_proteomes_with_taxonnames.dmnd +diamond_nr_database_path: PATH TO /nr.dmnd +seqkit: + sliding: 100000 + window: 6000 +n_neighbours: 13 +btk_yaml: PATH TO /ascc/assets/btk_draft.yaml <- THIS IS DEFAULT AND ONLY SERVES TO BYPASS GCA REQUIREMENTS OF SANGER-TOL/BLOBTOOLKIT ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | - -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. - ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run sanger-tol/ascc --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run sanger-tol/ascc --input {INPUT YAML} --outdir {OUTDIR} --steps {CSV LIST OF STEPS TO RUN} -profile singularity ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `singularity` configuration profile. See below for more information about profiles. Note that the pipeline will create the following files in your working directory: @@ -69,29 +61,6 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. 
history of pipeline runs and old logs. ``` -If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. - -Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. - -> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). - -The above pipeline run specified with a params file in yaml format: - -```bash -nextflow run sanger-tol/ascc -profile docker -params-file params.yaml -``` - -with `params.yaml` containing: - -```yaml -input: './samplesheet.csv' -outdir: './results/' -genome: 'GRCh37' -<...> -``` - -You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). - ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/modules.json b/modules.json index f8c9ac15..0acbe5f1 100644 --- a/modules.json +++ b/modules.json @@ -7,9 +7,8 @@ "nf-core": { "blast/blastn": { "branch": "master", - "git_sha": "acacb4075ef46fa74630aa3f4b0684f1021d5930", - "installed_by": ["modules"], - "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" + "git_sha": "4262a04142431275e54e1f4b413628a2201ed6e6", + "installed_by": ["modules"] }, "blast/makeblastdb": { "branch": "master", diff --git a/modules/local/ascc_merge_tables.nf b/modules/local/ascc_merge_tables.nf new file mode 100644 index 00000000..2dea7aa2 --- /dev/null +++ b/modules/local/ascc_merge_tables.nf @@ -0,0 +1,92 @@ +process ASCC_MERGE_TABLES { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" + + input: + tuple val(meta), path(gc_content, stageAs: "GC.txt") + path coverage + path tiara, stageAs: "TIARA.txt" + path bacterial_kraken + path nt_kraken, stageAs: "LINEAGE.txt" + path nt_blast + path dim_reduction_embeddings + path nr_diamond + path uniprot_diamond, stageAs: "UP_DIAMOND.tsv" + path cobiontid_markerscan + path contigviz + path btk, stageAs: "BTK_summary_table_full.tsv" + path btk_busco + path fcs_gx, stageAs: "FCSGX_parsed.csv" + + output: + tuple val(meta), path("*_contamination_check_merged_table.csv") , emit: merged_table + tuple val(meta), path("*_contamination_check_merged_table_extended.csv"), optional: true, emit: extended_table + tuple val(meta), path("*_phylum_counts_and_coverage.csv") , optional: true, emit: phylum_counts + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def coverage = coverage ? "-c ${coverage}" : "" + def tiara = tiara ? "-t ${tiara}" : "" + def bacterial_kraken = bacterial_kraken ? 
"-bk ${bacterial_kraken}" : "" + def nt_kraken = nt_kraken ? "-nk ${nt_kraken}" : "" + def nt_blast = nt_blast ? "-nb ${nt_blast}" : "" + def dim_reduction_embeddings = dim_reduction_embeddings ? "-dr ${dim_reduction_embeddings}" : "" + def nr_diamond = nr_diamond ? "-nd ${nr_diamond}" : "" + def uniprot_diamond = uniprot_diamond ? "-ud ${uniprot_diamond}" : "" + def contigviz = contigviz ? "-cv ${contigviz}" : "" + def btk = btk ? "-btk ${btk}" : "" + def btk_busco = btk_busco ? "-bb ${btk_busco}" : "" + def fcs_gx = fcs_gx ? "-fg ${fcs_gx}" : "" + def cobiontid_markerscan = "" + + """ + ascc_merge_tables.py \\ + --gc_cov $gc_content \\ + --sample_name $meta.id \\ + $coverage \\ + $tiara \\ + $bacterial_kraken \\ + $nt_kraken \\ + $nt_blast \\ + $dim_reduction_embeddings \\ + $nr_diamond \\ + $uniprot_diamond \\ + $contigviz \\ + $btk \\ + $btk_busco \\ + $fcs_gx \\ + $cobiontid_markerscan \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + ascc_merge_tables: \$(ascc_merge_tables.py --version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_contamination_check_merged_table.csv + touch ${prefix}_contamination_check_merged_table_extended.csv + touch ${prefix}_phylum_counts_and_coverage.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + ascc_merge_tables: \$(ascc_merge_tables.py --version | cut -d' ' -f2) + END_VERSIONS + """ + +} diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf new file mode 100644 index 00000000..e86f0e95 --- /dev/null +++ b/modules/local/autofiltering.nf @@ -0,0 +1,64 @@ +process AUTOFILTER_AND_CHECK_ASSEMBLY { + tag "$meta.id" + label "process_medium" + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(reference) + tuple val(tiara_meta), path(tiara_txt) + tuple val(fcs_meta), path(fcs_csv) + path ncbi_rankedlineage + + output: + tuple val(meta), path("autofiltered.fasta"), emit: decontaminated_assembly + tuple val(meta), path("ABNORMAL_CHECK.csv"), emit: fcs_tiara_summary + tuple val(meta), path("assembly_filtering_removed_sequences.txt"), emit: removed_seqs + path("fcs-gx_alarm_indicator_file.txt"), emit: alarm_file + path("autofiltering_done_indicator_file.txt"), emit: indicator_file + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + """ + autofilter.py \\ + $reference \\ + --taxid $meta.taxid \\ + --tiara $tiara_txt \\ + --fcsgx_sum $fcs_csv \\ + --ncbi_rankedlineage_path $ncbi_rankedlineage \\ + + abnormal_contamination_check.py \\ + $reference \\ + ABNORMAL_CHECK.csv + + # The below indicator file is used in Sanger-Tol to allow for other processes + # to begin once generated. 
This allows us to speed up the overall flow of the + # Tol-engine + touch autofiltering_done_indicator_file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch autofiltered.fasta + touch ABNORMAL_CHECK.csv + touch assembly_filtering_removed_sequences.txt + touch fcs-gx_alarm_indicator_file.txt + touch autofiltering_done_indicator_file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/blast_chunk_to_full.nf b/modules/local/blast_chunk_to_full.nf index 6fdb04f2..f51b8b97 100644 --- a/modules/local/blast_chunk_to_full.nf +++ b/modules/local/blast_chunk_to_full.nf @@ -12,11 +12,11 @@ process BLAST_CHUNK_TO_FULL { output: tuple val(meta), path( "*.tsv" ) , emit: full - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" """ blast_hit_chunk_coords_to_full_coords.py ${chunked} ${args} > full_coords.tsv @@ -28,8 +28,10 @@ process BLAST_CHUNK_TO_FULL { """ stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ - touch full_coords.tsv + touch ${prefix}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/convert_to_hits_file.nf b/modules/local/convert_to_hits_file.nf new file mode 100644 index 00000000..0f04ad93 --- /dev/null +++ b/modules/local/convert_to_hits_file.nf @@ -0,0 +1,40 @@ +process CONVERT_TO_HITS_FILE { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(blast_full) + + output: + tuple val(meta), path("*csv"), emit: hits_file + path "versions.yml", emit: versions + + script: + def args = task.ext.args + """ + convert_to_hits.py $blast_full $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + convert_to_hits: \$(convert_to_hits.py --version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + convert_to_hits: \$(convert_to_hits.py -v) + END_VERSIONS + """ +} diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf new file mode 100644 index 00000000..bbad73a8 --- /dev/null +++ b/modules/local/create_btk_dataset.nf @@ -0,0 +1,89 @@ +process CREATE_BTK_DATASET { + tag "$meta.id" + label 'process_medium' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "CREATE_BTK_DATASET module does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + container "docker.io/genomehubs/blobtoolkit:4.3.9" + + input: + tuple val(meta), path(reference) + path dot_genome, stageAs: "SORTED.genome" + path kmers, stageAs: "KMERS_dim_reduction_embeddings_combined.csv" + path tiara, stageAs: "TIARA.txt" + path nt_blast, stageAs: "BLAST_HITS.tsv" + path fcsgx, stageAs: "FCSGX_parsed.csv" + path mapped_bam, stageAs: "MAPPED.bam" + path coverage, stageAs: "COVERAGE_AVERAGE.txt" + path kraken_class, stageAs: "KRAKEN_CLASSIFIED.txt" + path kraken_report, stageAs: "KRAKEN_REPORT.txt" + path kraken_lineage, stageAs: "KRAKEN_LINEAGE.txt" + path nt_diamond, stageAs: "NUCLEOT_DIAMOND_FULL.tsv" + path un_diamond, stageAs: "UNIPROT_DIAMOND_FULL.tsv" + path ncbi_taxdump, stageAs: "TAXDUMP" + + output: + tuple val(meta), path("btk_datasets"), emit: btk_datasets + tuple val(meta), path("btk_summary_table_full.tsv"), emit: create_summary + path "versions.yml", emit: versions + + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def blastn_arg = nt_blast ? "-bh ${nt_blast}" : "" + def nt_diamond_arg = nt_diamond ? "-nr ${nt_diamond}" : "" + def un_diamond_arg = un_diamond ? "-ud ${un_diamond}" : "" + def kraken_arg = kraken_lineage ? "-k ${kraken_lineage}": "" + def mapped_arg = mapped_bam ? "-r ${mapped_bam}" : "" + def tiara_arg = tiara ? "-t ${tiara}" : "" + def pca_arg = kmers ? "-p ${kmers}" : "" + def fcs_arg = fcsgx ? "-fc ${fcsgx}" : "" + + """ + mkdir -p btk_datasets/ + + create_btk_dataset.py \\ + -f ${reference} \\ + -d ./1/ \\ + -n "${prefix}" \\ + -tn "${meta.sci_name}" \\ + -id ${meta.taxid} \\ + -td ${ncbi_taxdump}/ \\ + $blastn_arg \\ + $nt_diamond_arg \\ + $un_diamond_arg \\ + $kraken_arg \\ + $mapped_arg \\ + $tiara_arg \\ + $pca_arg \\ + $fcs_arg \\ + $args + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + create_btk_dataset: \$(create_btk_dataset.py -v) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + mkdir btk_datasets + touch btk_datasets/${prefix}.txt + touch btk_summary_table_full.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + create_btk_dataset: \$(create_btk_dataset.py -v) + END_VERSIONS + """ +} diff --git a/modules/local/filter_fasta.nf b/modules/local/filter_fasta.nf new file mode 100644 index 00000000..2aec22ce --- /dev/null +++ b/modules/local/filter_fasta.nf @@ -0,0 +1,52 @@ +process FILTER_FASTA { + tag "${meta.id}" + label 'process_low' + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(input_fasta) + + output: + tuple val(meta), path("*_filtered.fasta"), emit: fasta + path "*_filtering_stderr.txt", emit: error_log + path "versions.yml", emit: versions + + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + def max_length = task.ext.cutoff ?: 1900000000 // This is the default value and maximum supported length of sequence per scaffold + """ + ascc_shorten_fasta_headers.py \\ + ${input_fasta} > ${prefix}_shortened.fasta + + filter_fasta_by_length.py \\ + ${args} \\ + ${prefix}_shortened.fasta \\ + ${max_length} > ${prefix}_filtered.fasta 2> ${prefix}_filtering_stderr.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + ascc_shorten_fasta_headers: \$(ascc_shorten_fasta_headers.py -v) + filter_fasta_by_length: \$(filter_fasta_by_length.py -v) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_filtered.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + ascc_shorten_fasta_headers: \$(ascc_shorten_fasta_headers.py -v) + filter_fasta_by_length: \$(filter_fasta_by_length.py -v) + END_VERSIONS + """ +} diff --git a/modules/local/filter_vecscreen_results.nf b/modules/local/filter_vecscreen_results.nf index cf1863e8..50331a1c 100644 --- a/modules/local/filter_vecscreen_results.nf +++ b/modules/local/filter_vecscreen_results.nf @@ -18,8 +18,8 @@ process FILTER_VECSCREEN_RESULTS { task.ext.when == null || task.ext.when script: - def prefix = args.ext.prefix ?: "${meta.id}" - def args = args.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' """ VSlistTo1HitPerLine.py ${args} ${vecscreen_outfile} > ${prefix}_vecscreen.grepped.out diff --git a/modules/local/format_diamond_outfmt6.nf b/modules/local/format_diamond_outfmt6.nf index e2acf1ca..0d916f21 100644 --- a/modules/local/format_diamond_outfmt6.nf +++ b/modules/local/format_diamond_outfmt6.nf @@ -1,4 +1,4 @@ -process REFORMAT_FULL_OUTFMT6 { +process REFORMAT_DIAMOND_OUTFMT6 { tag "${meta.id}" label 'process_low' @@ -11,8 +11,8 @@ process REFORMAT_FULL_OUTFMT6 { tuple val(meta), path(diamond_blast) output: - tuple val(meta), path( "*_diamond_outfmt6.tsv" ) , emit: full - path "versions.yml" , emit: versions + tuple val(meta), path( "*_diamond_outfmt6.tsv" ) , emit: full + path "versions.yml" , emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/gc_content.nf b/modules/local/gc_content.nf index 825b0ec8..d1fc5766 100644 --- a/modules/local/gc_content.nf +++ b/modules/local/gc_content.nf @@ -11,13 +11,13 @@ process GC_CONTENT { tuple val(meta), path(fasta) output: - tuple val(meta), path( "*-gc.txt" ) , emit: txt - path "versions.yml" , emit: versions + tuple val(meta), path( "*-GC_CONTENT.txt" ) , emit: txt + path "versions.yml" , emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" """ - gc_content.py ${fasta} > ${prefix}-gc.txt + gc_content.py ${fasta} > ${prefix}-GC_CONTENT.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/generate_samplesheet.nf b/modules/local/generate_samplesheet.nf new file mode 100644 index 00000000..018f7ec6 --- /dev/null +++ b/modules/local/generate_samplesheet.nf @@ -0,0 +1,43 @@ +process GENERATE_SAMPLESHEET { + tag "$meta.id" 
+ label "process_low" + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(pacbio_path) + + output: + tuple val(meta), path("*csv"), emit: csv + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + """ + generate_samplesheet.py \\ + $prefix \\ + $pacbio_path + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + generate_samplesheet: \$(generate_samplesheet.py -v) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + generate_samplesheet: \$(generate_samplesheet.py -v) + END_VERSIONS + """ +} diff --git a/modules/local/get_kmer_counts.nf b/modules/local/get_kmer_counts.nf index b0742fe2..a3ff0857 100755 --- a/modules/local/get_kmer_counts.nf +++ b/modules/local/get_kmer_counts.nf @@ -12,7 +12,7 @@ process GET_KMER_COUNTS { val kmer_size output: - tuple val(meta), path( "*_kmer_counts.csv" ) , emit: csv + tuple val(meta), path( "*_KMER_COUNTS.csv" ) , emit: csv path "versions.yml" , emit: versions when: @@ -20,11 +20,11 @@ process GET_KMER_COUNTS { script: def KCOUNTER_VERSION = "0.1.1" - def prefix = args.ext.prefix ?: "${meta.id}" + def prefix = task.ext.prefix ?: "${meta.id}" """ get_kmers_counts.py \\ $input_fasta \\ - ${prefix}_kmer_counts.csv \\ + ${prefix}_KMER_COUNTS.csv \\ --kmer_size $kmer_size cat <<-END_VERSIONS > versions.yml @@ -40,7 +40,7 @@ process GET_KMER_COUNTS { def KCOUNTER_VERSION = "0.1.1" def prefix = args.ext.prefix ?: "${meta.id}" """ - touch ${prefix}_kmer_counts.csv + touch ${prefix}_KMER_COUNTS.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/kmer_count_dim_reduction.nf b/modules/local/kmer_count_dim_reduction.nf index f80b06bc..b900dd92 100755 --- a/modules/local/kmer_count_dim_reduction.nf +++ b/modules/local/kmer_count_dim_reduction.nf @@ -15,8 +15,8 @@ process KMER_COUNT_DIM_REDUCTION { val autoencoder_epochs_count output: - path '*_kmers_dim_reduction_embeddings.csv', emit: csv - path "versions.yml", emit: versions + path '*_kmers_dim_reduction_embeddings.csv', emit: csv + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/kmer_count_dim_reduction_combine_csv.nf b/modules/local/kmer_count_dim_reduction_combine_csv.nf index cdd2d002..01998a76 100755 --- a/modules/local/kmer_count_dim_reduction_combine_csv.nf +++ b/modules/local/kmer_count_dim_reduction_combine_csv.nf @@ -11,8 +11,8 @@ process KMER_COUNT_DIM_REDUCTION_COMBINE_CSV { tuple val(meta), path(input_files) output: - path '*_kmers_dim_reduction_embeddings_combined.csv', emit: csv - path "versions.yml", emit: versions + path '*_kmers_dim_reduction_embeddings_combined.csv', emit: csv + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf new file mode 100644 index 00000000..7a818013 --- /dev/null +++ b/modules/local/merge_btk_datasets.nf @@ -0,0 +1,53 @@ +process MERGE_BTK_DATASETS { + tag "$meta.id" + label 'process_low' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "ASCC_MERGE_TABLES 
module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "docker.io/genomehubs/blobtoolkit:4.3.9" + + input: + tuple val(meta), path(create_btk_datasets) + tuple val(meta2), path(btk_busco_datasets) + + output: + tuple val(meta), path("merged_datasets"), emit: merged_datasets + tuple val(meta), path("merged_datasets/btk_busco_summary_table_full.tsv"), emit: busco_summary_tsv + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + + """ + mkdir -p merged_datasets/ + + merge_btk_datasets.py \\ + -m $create_btk_datasets \\ + -o ./merged_datasets \\ + -b $btk_busco_datasets \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + merge_btk_datasets: \$(merge_btk_datasets.py --version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + mkdir -p merged_datasets/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + merge_btk_datasets: \$(merge_btk_datasets.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/local/samtools_depth_average_coverage.nf b/modules/local/samtools_depth_average_coverage.nf index 9af43516..b04b9ab2 100644 --- a/modules/local/samtools_depth_average_coverage.nf +++ b/modules/local/samtools_depth_average_coverage.nf @@ -11,8 +11,8 @@ process SAMTOOLS_DEPTH_AVERAGE_COVERAGE { tuple val(meta), path(depth) output: - tuple val(meta), path( "*.txt" ), emit: average_coverage - path "versions.yml", emit: versions + tuple val(meta), path( "*.txt" ), emit: average_coverage + path "versions.yml", emit: versions script: def args = task.ext.args ?: '' diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf new file mode 100644 index 00000000..98c7e780 --- /dev/null +++ b/modules/local/sanger_tol_btk.nf @@ -0,0 +1,105 @@ +process SANGER_TOL_BTK { + tag "$meta.id" + label 'process_low' + + input: + tuple val(meta), path(reference, stageAs: "REFERENCE.fa") + tuple val(meta1), path(bam) // Name needs to remain the same as previous process as they are referenced in the samplesheet + tuple val(meta2), path(samplesheet_csv, stageAs: "SAMPLESHEET.csv") + path blastp, stageAs: "blastp.dmnd" + path blastn + path blastx + path btk_config_file + path tax_dump + path btk_yaml, stageAs: "BTK.yaml" + val busco_lineages + val taxon + val gca_accession + + output: + tuple val(meta), path("${meta.id}_btk_out/blobtoolkit/${meta.id}*"), emit: dataset + path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots + path("${meta.id}_btk_out/blobtoolkit/${meta.id}*/summary.json.gz"), emit: summary_json + path("${meta.id}_btk_out/busco"), emit: busco_data + path("${meta.id}_btk_out/multiqc"), emit: multiqc_report + path("blobtoolkit_pipeline_info"), emit: pipeline_info + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def executor = task.ext.executor ?: "" + def profiles = task.ext.profiles ?: "" + def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" + def btk_config = btk_config_file ? 
"-c $btk_config_file" : "" + def pipeline_version = task.ext.version ?: "draft_assemblies" + // YAML used to avoid the use of GCA accession number + // https://github.com/sanger-tol/blobtoolkit/issues/77 + + // Seems to be an issue where a nested pipeline can't see the files in the same directory + // Running realpath gets around this but the files copied into the folder are + // now just wasted space. Should be fixed with using Mahesh's method of nesting but + // this is proving a bit complicated with BTK + + // outdir should be an arg + + // blastx and blastp use the same database hence the StageAs + + + """ + $executor 'nextflow run sanger-tol/blobtoolkit \\ + -r $pipeline_version \\ + -profile $profiles \\ + --input "\$(realpath $samplesheet_csv)" \\ + --outdir ${prefix}_btk_out \\ + --fasta "\$(realpath REFERENCE.fa)" \\ + --busco_lineages $busco_lineages \\ + --taxon $taxon \\ + --taxdump "\$(realpath $tax_dump)" \\ + --blastp "\$(realpath blastp.dmnd)" \\ + --blastn "\$(realpath $blastn)" \\ + --blastx "\$(realpath $blastx)" \\ + $btk_config \\ + $args' + + mv ${prefix}_btk_out/pipeline_info blobtoolkit_pipeline_info + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + Blobtoolkit: $pipeline_version + Nextflow: \$(nextflow -v | cut -d " " -f3) + executor system: $get_version + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def pipeline_version = task.ext.version ?: "draft_assemblies" + + """ + mkdir -p ${meta.id}_btk_out/blobtoolkit/${meta.id}_out + touch ${meta.id}_btk_out/blobtoolkit/${meta.id}_out/test.json.gz + + mkdir ${meta.id}_btk_out/blobtoolkit/plots + touch ${meta.id}_btk_out/blobtoolkit/plots/test.png + + mkdir ${meta.id}_btk_out/busco + touch ${meta.id}_btk_out/busco/test.batch_summary.txt + touch ${meta.id}_btk_out/busco/test.fasta.txt + touch ${meta.id}_btk_out/busco/test.json + + mkdir ${meta.id}_btk_out/multiqc + mkdir ${meta.id}_btk_out/multiqc/multiqc_data + mkdir ${meta.id}_btk_out/multiqc/multiqc_plots + touch ${meta.id}_btk_out/multiqc/multiqc_report.html + + mv ${meta.id}_btk_out/pipeline_info blobtoolkit_pipeline_info + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + Blobtoolkit: $pipeline_version + Nextflow: \$(nextflow -v | cut -d " " -f3) + executor system: $get_version + END_VERSIONS + """ +} diff --git a/modules/local/trailingns.nf b/modules/local/trailingns.nf new file mode 100644 index 00000000..eb97af22 --- /dev/null +++ b/modules/local/trailingns.nf @@ -0,0 +1,42 @@ +process TRAILINGNS { + tag "$meta.id" + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py27_1':
+        'biocontainers/biopython:1.70--np112py27_1' }"
+
+    input:
+    tuple val(meta), path(fasta_input_file)
+
+    output:
+    tuple val(meta), path("*_trim_Ns"), emit: trailing_ns_report
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def args = task.ext.args ?: ""
+    """
+    trim_Ns.py $fasta_input_file ${prefix}_trim_Ns ${args}
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def args = task.ext.args ?: ""
+    """
+    touch ${prefix}_trim_Ns
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/validate_taxid.nf b/modules/local/validate_taxid.nf
new file mode 100644
index 00000000..4cc1e2b2
--- /dev/null
+++ b/modules/local/validate_taxid.nf
@@ -0,0 +1,40 @@
+process VALIDATE_TAXID {
+    tag "$taxid"
+    label 'process_single'
+
+    conda "conda-forge::python=3.9"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.9' :
+        'biocontainers/python:3.9' }"
+
+    input:
+    val(taxid)
+    path(ncbi_taxonomy_path)
+
+    output:
+    path "versions.yml", emit: versions
+
+    script:
+    """
+    find_taxid_in_taxdump.py \\
+        $taxid \\
+        ${ncbi_taxonomy_path}/nodes.dmp
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        find_taxid_in_taxdump: \$(find_taxid_in_taxdump.py -v)
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    OUTPUT="TAXID FOUND"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        find_taxid_in_taxdump: \$(find_taxid_in_taxdump.py -v)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff
deleted file mode 100644
index 320fc3a1..00000000
--- a/modules/nf-core/blast/blastn/blast-blastn.diff
+++ /dev/null
@@ -1,53 +0,0 @@
-Changes in module 'nf-core/blast/blastn'
---- modules/nf-core/blast/blastn/main.nf
-+++ modules/nf-core/blast/blastn/main.nf
-@@ -8,8 +8,8 @@
-         'biocontainers/blast:2.14.1--pl5321h6f7f691_0' }"
-
-     input:
--    tuple val(meta), path(fasta)
--    path db
-+    tuple val(meta), path(fasta)
-+    tuple val(meta2), path(db)
-
-     output:
-     tuple val(meta), path('*.txt'), emit: txt
-@@ -19,16 +19,17 @@
-     task.ext.when == null || task.ext.when
-
-     script:
--    def args = task.ext.args ?: ''
--    def prefix = task.ext.prefix ?: "${meta.id}"
-+    def args = task.ext.args ?: ''
-+    def prefix = task.ext.prefix ?: "${meta.id}"
-+    def db_prefix = task.ext.dbprefix ?: "${meta2.db_prefix}"
-     """
--    DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'`
-+    DB=`find -L ./ -name "${db_prefix}.nin" | sed 's/\\.nin\$//'`
-     blastn \\
-         -num_threads $task.cpus \\
-         -db \$DB \\
-         -query $fasta \\
-         $args \\
--        -out ${prefix}.txt
-+        -out ${prefix}-${db_prefix}.txt
-
-     cat <<-END_VERSIONS > versions.yml
-     "${task.process}":
-@@ -37,10 +38,11 @@
-     """
-
-     stub:
--    def args = task.ext.args ?: ''
--    def prefix = task.ext.prefix ?: "${meta.id}"
-+    def args = task.ext.args ?: ''
-+    def prefix = task.ext.prefix ?: "${meta.id}"
-+    def 
db_prefix = task.ext.dbprefix ?: "${meta2.db_prefix}" - """ -- touch ${prefix}.txt -+ touch ${prefix}-${db_prefix}.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - -************************************************************ diff --git a/modules/nf-core/blast/blastn/environment.yml b/modules/nf-core/blast/blastn/environment.yml index e2a15166..e4a72800 100644 --- a/modules/nf-core/blast/blastn/environment.yml +++ b/modules/nf-core/blast/blastn/environment.yml @@ -1,6 +1,7 @@ +name: blast_blastn channels: - conda-forge - bioconda - defaults dependencies: - - bioconda::blast=2.14.1 + - bioconda::blast=2.15.0 diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf index 9136b9fb..68b43ba4 100644 --- a/modules/nf-core/blast/blastn/main.nf +++ b/modules/nf-core/blast/blastn/main.nf @@ -4,11 +4,11 @@ process BLAST_BLASTN { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/blast:2.14.1--pl5321h6f7f691_0': - 'biocontainers/blast:2.14.1--pl5321h6f7f691_0' }" + 'https://depot.galaxyproject.org/singularity/blast:2.15.0--pl5321h6f7f691_1': + 'biocontainers/blast:2.15.0--pl5321h6f7f691_1' }" input: - tuple val(meta), path(fasta) + tuple val(meta) , path(fasta) tuple val(meta2), path(db) output: @@ -19,16 +19,27 @@ process BLAST_BLASTN { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def db_prefix = task.ext.dbprefix ?: "${meta2.db_prefix}" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? fasta.getBaseName() : fasta + """ - DB=`find -L ./ -name "${db_prefix}.nin" | sed 's/\\.nin\$//'` + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${fasta} > ${fasta_name} + fi + + DB=`find -L ./ -name "*.nal" | sed 's/\\.nal\$//'` + if [ -z "\$DB" ]; then + DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` + fi + echo Using \$DB + blastn \\ - -num_threads $task.cpus \\ + -num_threads ${task.cpus} \\ -db \$DB \\ - -query $fasta \\ - $args \\ + -query ${fasta_name} \\ + ${args} \\ -out ${prefix}.txt cat <<-END_VERSIONS > versions.yml @@ -38,9 +49,8 @@ process BLAST_BLASTN { """ stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def db_prefix = task.ext.dbprefix ?: "${meta2.db_prefix}" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.txt diff --git a/modules/nf-core/blast/blastn/meta.yml b/modules/nf-core/blast/blastn/meta.yml index 5fff6a7b..a0d64dd6 100644 --- a/modules/nf-core/blast/blastn/meta.yml +++ b/modules/nf-core/blast/blastn/meta.yml @@ -22,12 +22,22 @@ input: - fasta: type: file description: Input fasta file containing queries sequences - pattern: "*.{fa,fasta}" + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + - meta2: + type: map + description: | + Groovy Map containing db information + e.g. [ id:'test2', single_end:false ] - db: type: directory - description: Directory containing blast database + description: Directory containing the blast database pattern: "*" output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] - txt: type: file description: File containing blastn hits @@ -39,7 +49,6 @@ output: authors: - "@joseespinosa" - "@drpatelh" - - "@vagkaratzas" maintainers: - "@joseespinosa" - "@drpatelh" diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test b/modules/nf-core/blast/blastn/tests/main.nf.test index a6badbc4..02ecfab5 100644 --- a/modules/nf-core/blast/blastn/tests/main.nf.test +++ b/modules/nf-core/blast/blastn/tests/main.nf.test @@ -8,27 +8,52 @@ nextflow_process { tag "modules_nfcore" tag "blast" tag "blast/blastn" + tag "blast/makeblastdb" + + setup { + run("BLAST_MAKEBLASTDB") { + script "../../makeblastdb/main.nf" + process { + """ + input[0] = [ [id:'test2'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + } test("Should search for nucleotide hits against a blast db") { - setup { - run("BLAST_MAKEBLASTDB") { - script "../../makeblastdb/main.nf" - process { - """ - input[0] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] - """ - } + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[1] = BLAST_MAKEBLASTDB.out.db + """ } } + then { + assertAll( + { assert process.success }, + { assert path(process.out.txt[0][1]).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("Should search for zipped nucleotide hits against a blast db") { + when { params { outdir = "$outputDir" } process { """ - input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta_gz'], checkIfExists: true) ] input[1] = BLAST_MAKEBLASTDB.out.db """ } @@ -37,8 +62,8 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert path(process.out.txt.get(0).get(1)).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, - { assert process.out.versions } + { assert path(process.out.txt[0][1]).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, + { assert snapshot(process.out.versions).match("versions_zipped") } ) } diff --git a/nextflow.config b/nextflow.config index 422de162..01c68b33 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,7 @@ params { // Input options input = null outdir = "results" - tracedir = "${outdir}/pipeline_info/" + tracedir = "${params.outdir}/pipeline_info/" // MultiQC options multiqc_config = null diff --git a/subworkflows/local/extract_nt_blast.nf b/subworkflows/local/extract_nt_blast.nf index 16f97efb..ccbf0be4 100644 --- a/subworkflows/local/extract_nt_blast.nf +++ b/subworkflows/local/extract_nt_blast.nf @@ -24,56 +24,12 @@ workflow EXTRACT_NT_BLAST { SEQKIT_SLIDING ( input_genome ) ch_versions = ch_versions.mix(SEQKIT_SLIDING.out.versions) - // - // LOGIC: GLOB ALL *NIN FILES IN DIRECTORY AND SPLIT INTO CHANNELS - // - - blastn_db_path - .map( - it -> file("${it}*") // glob all files in directory - ) - .flatten() // flatten to file per channel - .map( - it -> - tuple ( - [ id: it.toString().split('/')[-1].split("\\....\$")[0] ], // get basename and trim off the extension, returns database prefix - it // list of files - ) - ) - .groupTuple() // group files by 
id (which = db prefix) - .map { - meta, files -> - tuple ( - [ id: meta.id, - file_count: files.size() ], // get number of files - files - ) - } - .filter { it[0].file_count >= 8 } // a database is made of 8 files, less than this means it is an accessory to the db - .set { databases_by_prefix } - - databases_by_prefix - .combine( blastn_db_path ) - .map { meta, files, rootpath -> - tuple( rootpath, meta.id ) - } - .combine ( SEQKIT_SLIDING.out.fastx ) - .multiMap { root, db_prefix, meta, ref -> - reference: tuple( [ id: meta.id ], - ref - ) - nin_db: tuple( [ id: db_prefix ], - root - ) - } - .set { nin } - // // MODULE: BLASTS THE INPUT GENOME AGAINST A LOCAL NCBI DATABASE // BLAST_BLASTN_MOD ( - nin.reference, - nin.nin_db + SEQKIT_SLIDING.out.fastx, + blastn_db_path ) ch_versions = ch_versions.mix(BLAST_BLASTN_MOD.out.versions) @@ -143,6 +99,8 @@ workflow EXTRACT_NT_BLAST { ch_versions = ch_versions.mix(GET_LINEAGE_FOR_TOP.out.versions) emit: + ch_top_lineages = GET_LINEAGE_FOR_TOP.out.full + ch_blast_hits = BLAST_CHUNK_TO_FULL.out.full versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/get_kmers_profile.nf b/subworkflows/local/get_kmers_profile.nf index de117122..6775e613 100755 --- a/subworkflows/local/get_kmers_profile.nf +++ b/subworkflows/local/get_kmers_profile.nf @@ -80,8 +80,6 @@ workflow GET_KMERS_PROFILE { } .set { collected_files_for_combine } - collected_files_for_combine.view() - // // MODULE: COMBINE OUTPUTS OF MULTIPLE METHODS // diff --git a/subworkflows/local/organellar_blast.nf b/subworkflows/local/organellar_blast.nf index fe4342e7..41fa9e17 100644 --- a/subworkflows/local/organellar_blast.nf +++ b/subworkflows/local/organellar_blast.nf @@ -120,7 +120,7 @@ workflow ORGANELLAR_BLAST { ch_versions = ch_versions.mix(ORGANELLE_CONTAMINATION_RECOMMENDATIONS.out.versions) emit: - organelle_report = ORGANELLE_CONTAMINATION_RECOMMENDATIONS.out.recommendations + organelle_report= ORGANELLE_CONTAMINATION_RECOMMENDATIONS.out.recommendations versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/pe_mapping.nf b/subworkflows/local/pe_mapping.nf index e4e2963a..032657ee 100644 --- a/subworkflows/local/pe_mapping.nf +++ b/subworkflows/local/pe_mapping.nf @@ -1,5 +1,5 @@ -include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_ILLUMINA } from '../../modules/nf-core/minimap2/align/main' -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_ILLUMINA } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' workflow PE_MAPPING { diff --git a/subworkflows/local/run_diamond.nf b/subworkflows/local/run_diamond.nf new file mode 100644 index 00000000..3bb107a4 --- /dev/null +++ b/subworkflows/local/run_diamond.nf @@ -0,0 +1,65 @@ +include { SEQKIT_SLIDING } from '../../modules/nf-core/seqkit/sliding/main' +include { DIAMOND_BLASTX } from '../../modules/nf-core/diamond/blastx/main' +include { BLAST_CHUNK_TO_FULL as DIAMOND_BLAST_CHUNK_TO_FULL } from '../../modules/local/blast_chunk_to_full' +include { CONVERT_TO_HITS_FILE } from '../../modules/local/convert_to_hits_file' +include { REFORMAT_DIAMOND_OUTFMT6 } from '../../modules/local/format_diamond_outfmt6' + +workflow RUN_DIAMOND { + take: + reference_tuple // tuple [[meta.id, meta.sliding, meta.window], reference] + diamond_db // val (path) + + main: + ch_versions = Channel.empty() + ch_ext = Channel.of("txt") + ch_columns = Channel.of("qseqid sseqid pident length mismatch gapopen 
qstart qend sstart send evalue bitscore staxids sscinames sskingdoms sphylums salltitles") + + // + // MODULE: CREATE SLIDING WINDOW OF THE INPUT ASSEMBLY + // + SEQKIT_SLIDING ( + reference_tuple + ) + ch_versions = ch_versions.mix(SEQKIT_SLIDING.out.versions) + + // + // MODULE: BLAST THE SLIDING WINDOW FASTA AGAINST THE DIAMOND DB + // + DIAMOND_BLASTX ( + SEQKIT_SLIDING.out.fastx, + diamond_db, + ch_ext, + ch_columns + ) + ch_versions = ch_versions.mix(DIAMOND_BLASTX.out.versions) + + // + // MODULE: COMBINE THE CHUNKS INTO THE FULL GENOME + // + DIAMOND_BLAST_CHUNK_TO_FULL ( + DIAMOND_BLASTX.out.txt + ) + ch_versions = ch_versions.mix(DIAMOND_BLAST_CHUNK_TO_FULL.out.versions) + + // + // MODULE: CONVERT THE FULL GENOME FILE INTO A HITS FILE + // + CONVERT_TO_HITS_FILE( + DIAMOND_BLAST_CHUNK_TO_FULL.out.full + ) + ch_versions = ch_versions.mix(CONVERT_TO_HITS_FILE.out.versions) + + // + // MODULE: REFORMAT THE DIAMOND OUTPUT + // + REFORMAT_DIAMOND_OUTFMT6 ( + DIAMOND_BLAST_CHUNK_TO_FULL.out.full + ) + ch_versions = ch_versions.mix(REFORMAT_DIAMOND_OUTFMT6.out.versions) + + emit: + full = DIAMOND_BLAST_CHUNK_TO_FULL.out.full + reformed = REFORMAT_DIAMOND_OUTFMT6.out.full + hits_file = CONVERT_TO_HITS_FILE.out.hits_file + versions = ch_versions.ifEmpty(null) +} \ No newline at end of file diff --git a/subworkflows/local/run_fcsadaptor.nf b/subworkflows/local/run_fcsadaptor.nf index 16b52759..0818140c 100755 --- a/subworkflows/local/run_fcsadaptor.nf +++ b/subworkflows/local/run_fcsadaptor.nf @@ -30,8 +30,8 @@ workflow RUN_FCSADAPTOR { ch_versions = ch_versions.mix(FCS_FCSADAPTOR_EUK.out.versions) emit: - FCS_FCSADAPTOR_EUK.out.adaptor_report - FCS_FCSADAPTOR_PROK.out.adaptor_report + ch_euk = FCS_FCSADAPTOR_EUK.out.adaptor_report + ch_prok = FCS_FCSADAPTOR_PROK.out.adaptor_report versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/run_fcsgx.nf b/subworkflows/local/run_fcsgx.nf index 3e8c6d41..78f220ad 100644 --- a/subworkflows/local/run_fcsgx.nf +++ b/subworkflows/local/run_fcsgx.nf @@ -1,5 +1,5 @@ -include { FCS_FCSGX } from '../../modules/nf-core/fcs/fcsgx/main' -include { PARSE_FCSGX_RESULT } from '../../modules/local/parse_fcsgx_result' +include { FCS_FCSGX } from '../../modules/nf-core/fcs/fcsgx/main' +include { PARSE_FCSGX_RESULT } from '../../modules/local/parse_fcsgx_result' workflow RUN_FCSGX { @@ -13,11 +13,17 @@ workflow RUN_FCSGX { ch_versions = Channel.empty() Channel - .of('all.gxi', 'all.gxs', 'all.taxa.tsv', 'all.meta.jsonl', 'all.blast_div.tsv.gz') - .combine(fcsgxpath) - .map {suxfix, dbpath -> [file(dbpath + '/' + suxfix)]} + .of( + 'all.gxi', 'all.gxs', 'all.taxa.tsv', 'all.meta.jsonl', 'all.blast_div.tsv.gz' + ) + .combine( + fcsgxpath + ) + .map {suxfix, dbpath -> + [file(dbpath + '/' + suxfix)] + } .collect() - .set {fcsgxdb} + .set { fcsgxdb } // // Create input channel for FCS_FCSGX, taxid is required to be the meta id. 
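A minimal standalone sketch of the suffix-combine channel pattern used in the reworked run_fcsgx.nf hunk above: a value channel of the expected FCS-GX database suffixes is paired with the database directory, mapped to concrete file objects, and collected into the single list consumed by FCS_FCSGX. The script name, the placeholder path and the .view() call are illustrative assumptions, not part of this changeset.

// fcsgx_channel_sketch.nf -- illustration only
nextflow.enable.dsl = 2

params.fcs_gx_database_path = '/path/to/gxdb'   // hypothetical placeholder path

workflow {
    Channel
        .of( 'all.gxi', 'all.gxs', 'all.taxa.tsv', 'all.meta.jsonl', 'all.blast_div.tsv.gz' )
        .combine( Channel.of( params.fcs_gx_database_path ) )      // pair every suffix with the db directory
        .map { suffix, dbpath -> file("${dbpath}/${suffix}") }     // build the concrete file path
        .collect()                                                 // gather into one list for FCS_FCSGX
        .view()                                                    // e.g. [/path/to/gxdb/all.gxi, ...]
}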
diff --git a/subworkflows/local/run_nt_kraken.nf b/subworkflows/local/run_nt_kraken.nf index db2f668b..6f5b5e6a 100755 --- a/subworkflows/local/run_nt_kraken.nf +++ b/subworkflows/local/run_nt_kraken.nf @@ -45,8 +45,8 @@ workflow RUN_NT_KRAKEN { ch_versions = ch_versions.mix(GET_LINEAGE_FOR_KRAKEN.out.versions) emit: - KRAKEN2_KRAKEN2.out.classified_reads_assignment - KRAKEN2_KRAKEN2.out.report - GET_LINEAGE_FOR_KRAKEN.out.txt + classified = KRAKEN2_KRAKEN2.out.classified_reads_assignment + report = KRAKEN2_KRAKEN2.out.report + lineage = GET_LINEAGE_FOR_KRAKEN.out.txt versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/run_read_coverage.nf b/subworkflows/local/run_read_coverage.nf index 001d2f4f..49ada33f 100644 --- a/subworkflows/local/run_read_coverage.nf +++ b/subworkflows/local/run_read_coverage.nf @@ -4,7 +4,7 @@ include { SAMTOOLS_MERGE } from '../../modules/nf include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_DEPTH } from '../../modules/nf-core/samtools/depth/main' -include { SAMTOOLS_DEPTH_AVERAGE_COVERAGE } from '../../modules/local/samtools_depth_average_coverage' +include { SAMTOOLS_DEPTH_AVERAGE_COVERAGE } from '../../modules/local/samtools_depth_average_coverage' workflow RUN_READ_COVERAGE { @@ -30,6 +30,7 @@ workflow RUN_READ_COVERAGE { platform ) ch_versions = ch_versions.mix(SE_MAPPING.out.versions) + ch_align_bam .mix( SE_MAPPING.out.mapped_bam ) .set { merged_bam } @@ -43,6 +44,7 @@ workflow RUN_READ_COVERAGE { platform ) ch_versions = ch_versions.mix(PE_MAPPING.out.versions) + ch_align_bam .mix( PE_MAPPING.out.mapped_bam ) .set { merged_bam } @@ -83,6 +85,7 @@ workflow RUN_READ_COVERAGE { ch_versions = ch_versions.mix( SAMTOOLS_DEPTH_AVERAGE_COVERAGE.out.versions ) emit: - versions = ch_versions.ifEmpty(null) - tsv_ch = SAMTOOLS_DEPTH_AVERAGE_COVERAGE.out.average_coverage + tsv_ch = SAMTOOLS_DEPTH_AVERAGE_COVERAGE.out.average_coverage + bam_ch = SAMTOOLS_SORT.out.bam + versions = ch_versions.ifEmpty(null) } \ No newline at end of file diff --git a/subworkflows/local/run_vecscreen.nf b/subworkflows/local/run_vecscreen.nf index fce4395f..db882680 100644 --- a/subworkflows/local/run_vecscreen.nf +++ b/subworkflows/local/run_vecscreen.nf @@ -24,8 +24,6 @@ workflow RUN_VECSCREEN { // // MODULE: RUNS NCBI VECSCREEN // - vecscreen_database_tuple.view() - NCBITOOLS_VECSCREEN( CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, vecscreen_database_tuple @@ -45,7 +43,7 @@ workflow RUN_VECSCREEN { ch_versions = ch_versions.mix( SUMMARISE_VECSCREEN_OUTPUT.out.versions ) emit: - vecscreen_contamination = SUMMARISE_VECSCREEN_OUTPUT.out.vecscreen_contamination + vecscreen_contam = SUMMARISE_VECSCREEN_OUTPUT.out.vecscreen_contamination versions = ch_versions.ifEmpty( null ) // channel: [ versions.yml ] } diff --git a/subworkflows/local/trailingns_check.nf b/subworkflows/local/trailingns_check.nf new file mode 100644 index 00000000..a63a9605 --- /dev/null +++ b/subworkflows/local/trailingns_check.nf @@ -0,0 +1,21 @@ +include { TRAILINGNS } from '../../modules/local/trailingns' + +workflow TRAILINGNS_CHECK { + + take: + reference_tuple // tuple [ val(meta), path(fasta) ] + + main: + + ch_versions = Channel.empty() + + // + // MODULE: TRIM LENGTHS OF N'S FROM THE INPUT GENOME AND GENERATE A REPORT ON LENGTH AND LOCATION + // + TRAILINGNS( reference_tuple ) + ch_versions = ch_versions.mix( TRAILINGNS.out.versions ) + + emit: + trailing_ns_report = 
TRAILINGNS.out.trailing_ns_report + versions = ch_versions.ifEmpty( null ) +} diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 6ed9ea4d..935e5033 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -29,6 +29,7 @@ workflow YAML_INPUT { taxid: ( data.taxid ) mito_fasta_path: ( data.mito_fasta_path ) plastid_fasta_path: ( data.plastid_fasta_path ) + nt_db_prefix: ( data.nt_database_prefix ) nt_database: ( data.nt_database ) reference_proteomes: ( data.reference_proteomes ) nt_kraken_db_path: ( data.nt_kraken_db_path ) @@ -36,14 +37,16 @@ workflow YAML_INPUT { dimensionality_reduction_methods: ( data.dimensionality_reduction_methods ) fcs_gx_database_path: ( data.fcs_gx_database_path ) ncbi_taxonomy_path: ( data.ncbi_taxonomy_path ) - ncbi_rankedlineage_path: ( data.ncbi_rankedlineage_path ) + ncbi_rankedlineage_path: ( file(data.ncbi_rankedlineage_path) ) ncbi_accessionids: ( data.ncbi_accessionids_folder ) busco_lineages_folder: ( data.busco_lineages_folder ) + busco_lineages: ( data.busco_lineages ) seqkit_values: ( data.seqkit ) diamond_uniprot_database_path: ( data.diamond_uniprot_database_path ) diamond_nr_database_path: ( data.diamond_nr_database_path ) vecscreen_database_path: ( data.vecscreen_database_path ) neighbours: ( data.n_neighbours ) + btk_yaml: ( file(data.btk_yaml) ) } .set{ group } @@ -51,15 +54,26 @@ workflow YAML_INPUT { .seqkit_values .flatten() .multiMap { data -> - sliding_value : ( data.sliding ) - window_value : ( data.window ) + sliding_value : ( data.sliding ) + window_value : ( data.window ) } .set { seqkit } group.assembly_title - .combine( group.assembly_path ) - .map { id, file -> - tuple( [ id: id ], + .combine( + group.assembly_path, + ) + .combine( + group.taxid, + ) + .combine( + group.sci_name + ) + .map { id, file, tax, sci -> + tuple( [ id: id, + taxid: tax, + sci_name: sci + ], file ) } @@ -67,8 +81,10 @@ workflow YAML_INPUT { group.assembly_title .combine( group.reads_path ) - .map { id, file -> - tuple( [ id: id ], + .combine( group.reads_type ) + .map { id, file, type -> + tuple( [ id: id, + type: type ], file ) } @@ -107,6 +123,15 @@ workflow YAML_INPUT { } .set{ ch_vecscreen } + group.nt_database + .combine( group.assembly_title ) + .map{ db, meta -> + tuple( [ id: meta ], + db + ) + } + .set{ ch_nt_db } + emit: reference_tuple = ch_reference pacbio_tuple = ch_pacbio @@ -116,12 +141,14 @@ workflow YAML_INPUT { assembly_title = group.assembly_title assembly_path = group.assembly_path taxid = group.taxid - nt_database = group.nt_database + nt_database = ch_nt_db + nt_db_prefix = group.nt_db_prefix nt_kraken_db_path = group.nt_kraken_db_path ncbi_accessions = group.ncbi_accessionids ncbi_taxonomy_path = group.ncbi_taxonomy_path ncbi_rankedlineage_path = group.ncbi_rankedlineage_path busco_lineages_folder = group.busco_lineages_folder + busco_lineages = group.busco_lineages fcs_gx_database_path = group.fcs_gx_database_path diamond_uniprot_database_path = group.diamond_uniprot_database_path diamond_nr_database_path = group.diamond_nr_database_path @@ -135,6 +162,7 @@ workflow YAML_INPUT { plastid_var = "plastid_genome" kmer_len = group.kmer_len n_neighbours = group.neighbours + btk_yaml = group.btk_yaml versions = ch_versions.ifEmpty(null) } diff --git a/workflows/ascc.nf b/workflows/ascc.nf index a80033d5..34d14855 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -37,11 +37,22 @@ include { RUN_READ_COVERAGE } from '../subworkflows/ include { RUN_VECSCREEN } from 
'../subworkflows/local/run_vecscreen' include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' +include { RUN_DIAMOND as NUCLEOT_DIAMOND } from '../subworkflows/local/run_diamond.nf' +include { RUN_DIAMOND as UNIPROT_DIAMOND } from '../subworkflows/local/run_diamond.nf' +include { TRAILINGNS_CHECK } from '../subworkflows/local/trailingns_check' // // MODULE: Local modules // include { GC_CONTENT } from '../modules/local/gc_content' +include { VALIDATE_TAXID } from '../modules/local/validate_taxid' +include { FILTER_FASTA } from '../modules/local/filter_fasta' +include { CREATE_BTK_DATASET } from '../modules/local/create_btk_dataset' +include { MERGE_BTK_DATASETS } from '../modules/local/merge_btk_datasets' +include { ASCC_MERGE_TABLES } from '../modules/local/ascc_merge_tables' +include { AUTOFILTER_AND_CHECK_ASSEMBLY } from '../modules/local/autofiltering' +include { SANGER_TOL_BTK } from '../modules/local/sanger_tol_btk' +include { GENERATE_SAMPLESHEET } from '../modules/local/generate_samplesheet' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -63,12 +74,21 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-co workflow ASCC { main: - ch_versions = Channel.empty() - ch_out_merge = Channel.empty() + ch_versions = Channel.empty() + ch_out_merge = Channel.empty() - workflow_steps = params.steps.split(",") + include_workflow_steps = params.include ? params.include.split(",") : "" + exclude_workflow_steps = params.exclude ? params.exclude.split(",") : "" - input_ch = Channel.fromPath(params.input, checkIfExists: true) + btk_busco_run_mode = params.btk_busco_run_mode ? 
params.btk_busco_run_mode : "conditional" + + full_list = ["kmers", "tiara", "coverage", "nt_blast", "nr_diamond", "uniprot_diamond", "kraken", "fcs-gx", "fcs-adaptor", "vecscreen", "btk_busco", "pacbio_barcodes", "organellar_blast", "autofilter_assembly", "ALL", ""] + + if (!full_list.containsAll(include_workflow_steps) && !full_list.containsAll(exclude_workflow_steps)) { + exit 1, "There is an extra argument given on Command Line: \n Check contents of: $include_workflow_steps\nAnd $exclude_workflow_steps\nMaster list is: $full_list" + } + + input_ch = Channel.fromPath(params.input, checkIfExists: true) // // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE @@ -76,31 +96,72 @@ workflow ASCC { YAML_INPUT ( input_ch ) - ch_versions = ch_versions.mix(YAML_INPUT.out.versions) + ch_versions = ch_versions.mix(YAML_INPUT.out.versions) + + // + // MODULE: ENSURE THAT THE TAXID FOR THE INPUT GENOME IS INDEED IN THE TAXDUMP + // + VALIDATE_TAXID( + YAML_INPUT.out.taxid, + YAML_INPUT.out.ncbi_taxonomy_path + ) + ch_versions = ch_versions.mix(VALIDATE_TAXID.out.versions) + + + // + // MODULE: FILTER THE INPUT FASTA FOR LENGTHS OF SEQUENCE BELOW A CONFIG VARIABLE + // + FILTER_FASTA( + YAML_INPUT.out.reference_tuple, + ) + ch_versions = ch_versions.mix(FILTER_FASTA.out.versions) + + + // + // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE + // + FILTER_FASTA.out.fasta + .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() ) + .combine ( YAML_INPUT.out.seqkit_window.toInteger() ) + .map { meta, ref, sliding, window -> + tuple([ id : meta.id, + sliding : sliding, + window : window + ], + file(ref) + )} + .set { modified_input } // // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA // GC_CONTENT ( - YAML_INPUT.out.reference_tuple + FILTER_FASTA.out.fasta ) - ch_out_merge = ch_out_merge.mix(GC_CONTENT.out.txt) - ch_versions = ch_versions.mix(GC_CONTENT.out.versions) + ch_versions = ch_versions.mix(GC_CONTENT.out.versions) // // SUBWORKFLOW: GENERATE GENOME FILE // GENERATE_GENOME ( - YAML_INPUT.out.reference_tuple, + FILTER_FASTA.out.fasta, YAML_INPUT.out.pacbio_barcodes ) - ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) + ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) + + // + // SUBWORKFLOW: GENERATE A REPORT ON LENGTHS OF N's IN THE INPUT GENOMe + // + TRAILINGNS_CHECK ( + FILTER_FASTA.out.fasta + ) + ch_versions = ch_versions.mix(TRAILINGNS_CHECK.out.versions) // // SUBWORKFLOW: COUNT KMERS, THEN REDUCE DIMENSIONS USING SELECTED METHODS // - if ( workflow_steps.contains('kmers') || workflow_steps.contains('ALL')) { + if ( include_workflow_steps.contains('kmers') || include_workflow_steps.contains('ALL')) { GENERATE_GENOME.out.reference_tuple .map { meta, file -> @@ -119,48 +180,49 @@ workflow ASCC { YAML_INPUT.out.n_neighbours, autoencoder_epochs_count.map{it -> it[2]} ) - ch_versions = ch_versions.mix(GET_KMERS_PROFILE.out.versions) + ch_versions = ch_versions.mix(GET_KMERS_PROFILE.out.versions) + ch_kmers = GET_KMERS_PROFILE.out.combined_csv + } else { + ch_kmers = [] } // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA // - if ( workflow_steps.contains('tiara') ) { + if ( include_workflow_steps.contains('tiara') || include_workflow_steps.contains('ALL')) { EXTRACT_TIARA_HITS ( GENERATE_GENOME.out.reference_tuple ) - ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) + ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) + ch_tiara = EXTRACT_TIARA_HITS.out.ch_tiara.map{it[1]} + } else { + ch_tiara = [] } - // - // LOGIC: INJECT 
SLIDING WINDOW VALUES INTO REFERENCE - // - YAML_INPUT.out.reference_tuple - .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() ) - .combine ( YAML_INPUT.out.seqkit_window.toInteger() ) - .map { meta, ref, sliding, window -> - tuple([ id : meta.id, - sliding : sliding, - window : window - ], - file(ref) - )} - .set { modified_input } - // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST // - if ( workflow_steps.contains('nt_blast') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('nt_blast') || include_workflow_steps.contains('ALL') ) { + // + // NOTE: ch_nt_blast needs to be set in two places incase it + // fails during the run + // + + ch_nt_blast = [] EXTRACT_NT_BLAST ( modified_input, YAML_INPUT.out.nt_database, YAML_INPUT.out.ncbi_accessions, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) + ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) + ch_nt_blast = EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} + + } else { + ch_nt_blast = [] } - if ( workflow_steps.contains('mito') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('organellar_blast') || include_workflow_steps.contains('ALL') ) { // // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH // @@ -176,14 +238,17 @@ workflow ASCC { // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME // MITO_ORGANELLAR_BLAST ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.mito_var, mito_check.valid ) - ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) + ch_mito = MITO_ORGANELLAR_BLAST.out.organelle_report.map{it[1]} + ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) + } else { + ch_mito = [] } - if ( workflow_steps.contains('chloro') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('organellar_blast') || include_workflow_steps.contains('ALL') ) { // // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH @@ -199,87 +264,293 @@ workflow ASCC { // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME // PLASTID_ORGANELLAR_BLAST ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.plastid_var, plastid_check.valid ) - ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) + ch_chloro = PLASTID_ORGANELLAR_BLAST.out.organelle_report.map{it[1]} + ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) + } else { + ch_chloro = [] } - // - // SUBWORKFLOW: + // SUBWORKFLOW: RUN FCS-ADAPTOR TO IDENTIDY ADAPTOR AND VECTORR CONTAMINATION // - if ( workflow_steps.contains('fcs_adapt') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('fcs-adaptor') || include_workflow_steps.contains('ALL') ) { RUN_FCSADAPTOR ( - YAML_INPUT.out.reference_tuple + GENERATE_GENOME.out.reference_tuple ) - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + RUN_FCSADAPTOR.out.ch_euk + .map{it[1]} + .combine( + RUN_FCSADAPTOR.out.ch_prok.map{it[1]} + ) + .set{ ch_fcsadapt } + ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + } else { + ch_fcsadapt = [] } + // - // SUBWORKFLOW: + // SUBWORKFLOW: RUN FCS-GX TO IDENTIFY CONTAMINATION IN THE ASSEMBLY // - if ( workflow_steps.contains('fcsgx') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('fcs-gx') || include_workflow_steps.contains('ALL') ) { RUN_FCSGX ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.fcs_gx_database_path, 
YAML_INPUT.out.taxid, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + + ch_fcsgx = RUN_FCSGX.out.fcsgxresult.map{it[1]} + ch_versions = ch_versions.mix(RUN_FCSGX.out.versions) + } else { + ch_fcsgx = [] } // // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA // - if ( workflow_steps.contains('barcodes') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('pacbio_barcodes') || include_workflow_steps.contains('ALL') ) { PACBIO_BARCODE_CHECK ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.pacbio_tuple, YAML_INPUT.out.pacbio_barcodes, YAML_INPUT.out.pacbio_multiplex_codes ) - ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) + + PACBIO_BARCODE_CHECK.out.filtered + .map{ + it[1] + } + .collect() + .set { + ch_barcode // Not in use + } + + ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) + } else { + ch_barcode = [] } // // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE // - if ( workflow_steps.contains('coverage') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('coverage') || include_workflow_steps.contains('btk_busco') || include_workflow_steps.contains('ALL') ) { RUN_READ_COVERAGE ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.assembly_path, YAML_INPUT.out.pacbio_tuple, YAML_INPUT.out.reads_type ) - ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) + ch_coverage = RUN_READ_COVERAGE.out.tsv_ch.map{it[1]} + ch_bam = RUN_READ_COVERAGE.out.bam_ch.map{it[1]} + ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) + } else { + ch_coverage = [] + ch_bam = [] } // // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS // - if ( workflow_steps.contains('vecscreen') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('vecscreen') || include_workflow_steps.contains('ALL') ) { RUN_VECSCREEN ( GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.vecscreen_database_path ) - ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) + ch_vecscreen = RUN_VECSCREEN.out.vecscreen_contam.map{it[1]} + ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) + } else { + ch_vecscreen = [] } // // SUBWORKFLOW: Run the kraken classifier // - if ( workflow_steps.contains('kraken') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('kraken') || include_workflow_steps.contains('ALL') ) { RUN_NT_KRAKEN( GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.nt_kraken_db_path, YAML_INPUT.out.ncbi_rankedlineage_path ) + ch_kraken1 = RUN_NT_KRAKEN.out.classified.map{it[1]} + ch_kraken2 = RUN_NT_KRAKEN.out.report.map{it[1]} + ch_kraken3 = RUN_NT_KRAKEN.out.lineage + + ch_versions = ch_versions.mix(RUN_NT_KRAKEN.out.versions) + } else { + ch_kraken1 = [] + ch_kraken2 = [] + ch_kraken3 = [] + } + + // + // SUBWORKFLOW: DIAMOND BLAST FOR INPUT ASSEMBLY + // + if ( include_workflow_steps.contains('nr_diamond') || include_workflow_steps.contains('ALL') ) { + NUCLEOT_DIAMOND ( + modified_input, + YAML_INPUT.out.diamond_nr_database_path + ) + nr_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]} + nr_hits = NUCLEOT_DIAMOND.out.hits_file.map{it[1]} + ch_versions = ch_versions.mix(NUCLEOT_DIAMOND.out.versions) + } else { + nr_hits = [] + nr_full = [] + } + + // + // SUBWORKFLOW: DIAMOND BLAST FOR INPUT ASSEMBLY + // + //qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sscinames sskingdoms sphylums salltitles + if ( 
include_workflow_steps.contains('uniprot_diamond') || include_workflow_steps.contains('ALL') ) { + UNIPROT_DIAMOND ( + modified_input, + YAML_INPUT.out.diamond_uniprot_database_path + ) + un_full = UNIPROT_DIAMOND.out.reformed.map{it[1]} + un_hits = UNIPROT_DIAMOND.out.hits_file.map{it[1]} + ch_versions = ch_versions.mix(UNIPROT_DIAMOND.out.versions) + } else { + un_hits = [] + un_full = [] + } + + ch_dot_genome = GENERATE_GENOME.out.dot_genome.map{it[1]} + + CREATE_BTK_DATASET ( + GENERATE_GENOME.out.reference_tuple, + ch_dot_genome, + ch_kmers, + ch_tiara, + ch_nt_blast, + ch_fcsgx, + ch_bam, + ch_coverage, + ch_kraken1, + ch_kraken2, + ch_kraken3, + nr_full, + un_full, + YAML_INPUT.out.ncbi_taxonomy_path.first() + ) + ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) + + + // + // MODULE: AUTOFILTER ASSEMBLY BY TIARA AND FCSGX RESULTS + // + if ( include_workflow_steps.contains('tiara') && include_workflow_steps.contains('fcs-gx') && include_workflow_steps.contains("autofilter_assembly") || include_workflow_steps.contains('ALL') ) { + AUTOFILTER_AND_CHECK_ASSEMBLY ( + GENERATE_GENOME.out.reference_tuple, + EXTRACT_TIARA_HITS.out.ch_tiara, + RUN_FCSGX.out.fcsgxresult, + YAML_INPUT.out.ncbi_rankedlineage_path + ) + ch_autofilt_assem = AUTOFILTER_AND_CHECK_ASSEMBLY.out.decontaminated_assembly.map{it[1]} + ch_autofilt_indicator = AUTOFILTER_AND_CHECK_ASSEMBLY.out.indicator_file + + AUTOFILTER_AND_CHECK_ASSEMBLY.out.alarm_file + .map { file -> file.text.trim() } + .branch { it -> + run_btk: "ABNORMAL" ? it.contains("YES_ABNORMAL"): false + dont_run: [] + } + .set { btk_bool } + + + ch_versions = ch_versions.mix(AUTOFILTER_AND_CHECK_ASSEMBLY.out.versions) + } else { + ch_autofilt_assem = [] + ch_autofilt_indicator = [] + } + + // + // PIPELINE: PREPARE THE DATA FOR USE IN THE SANGER-TOL/BLOBTOOLKIT PIPELINE + // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT + // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. 
+ // This will also eventually check for the above run_btk boolean from autofilter + if ( !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('btk_busco') && btk_busco_run_mode == "conditional" && include_workflow_steps.contains("autofilter_assembly") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('ALL') || btk_busco_run_mode == "mandatory" && !exclude_workflow_steps.contains('btk_busco') && include_workflow_steps.contains('btk_busco') ) { + GENERATE_GENOME.out.reference_tuple + .combine(ch_bam) + .map{ meta, ref, bam -> + tuple( [ id: meta.id ], + bam + ) + } + .set { new_bam } + + GENERATE_SAMPLESHEET ( + new_bam + ) + ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) + + SANGER_TOL_BTK ( + GENERATE_GENOME.out.reference_tuple, + new_bam, + GENERATE_SAMPLESHEET.out.csv, + YAML_INPUT.out.diamond_uniprot_database_path, + YAML_INPUT.out.nt_database.map{it -> it[1]}, + YAML_INPUT.out.diamond_uniprot_database_path, + [], + YAML_INPUT.out.ncbi_taxonomy_path, + YAML_INPUT.out.btk_yaml, + YAML_INPUT.out.busco_lineages, + YAML_INPUT.out.taxid, + 'GCA_0001' + ) + ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) + + + MERGE_BTK_DATASETS ( + CREATE_BTK_DATASET.out.btk_datasets, + SANGER_TOL_BTK.out.dataset + ) + ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) + busco_merge_btk = MERGE_BTK_DATASETS.out.busco_summary_tsv.map{it[1]} + + } else { + busco_merge_btk = [] } - // mix the outputs of the outpuutting process so that we can - // insert them into the one process to create the btk and the merged report - // much like the versions channel + + // + // SUBWORKFLOW: MERGES DATA THAT IS NOT USED IN THE CREATION OF THE BTK_DATASETS FOLDER + // + ASCC_MERGE_TABLES ( + GC_CONTENT.out.txt, // FROM -- GC_COVERAGE.out.tsv + ch_coverage, // FROM -- RUN_COVERAGE.out.tsv.map{it[1]} + ch_tiara, // FROM -- TIARA_TIARA.out.classifications.map{it[1]} + [], // <-- BACTERIAL KRAKEN -- NOT IN PIPELINE YET + ch_kraken3, // FROM -- RUN_NT_KRAKEN.out.lineage.map{it[1]} + ch_nt_blast, // FROM -- EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} + ch_kmers, // FROM -- GET_KMERS_PROFILE.out.combined_csv + nr_hits, // FROM -- NUCLEOT_DIAMOND.out.reformed.map{it[1]} + un_hits, // FROM -- UNIPROT_DIAMOND.out.reformed.map{it[1]} + [], // <-- MARKER SCAN -- NOT IN PIPELINE YET + [], // <-- CONTIGVIZ -- NOT IN PIPELINE YET + CREATE_BTK_DATASET.out.create_summary.map{it[1]}, // FROM -- CREATE_BTK_DATASET + busco_merge_btk, // FROM -- MERGE_BTK_DATASETS.out.busco_summary_tsv + ch_fcsgx // FROM -- PARSE_FCSGX_RESULT.out.fcsgxresult.map{it[1]} + ) + ch_versions = ch_versions.mix(ASCC_MERGE_TABLES.out.versions) + + + // + // MODULE: GENERATE AN INDICATOR FILE REQUIRED FOR SANGER INTERNAL PIPELINE TRACKING + // + GenIndicator ( + ch_autofilt_indicator, + ch_fcsgx, + ch_fcsadapt, + ch_tiara, + ch_vecscreen, + ch_barcode, + ) + // // SUBWORKFLOW: Collates version data from prior subworflows @@ -289,8 +560,50 @@ workflow ASCC { ) emit: - software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml - versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions + software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml + versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions +} + +// +// MODULE: GRAB FASTQ FILES IN DIRECTORY +// +process GrabFiles { + label 'process_tiny' + + tag "${meta.id}" + executor 'local' + + input: + tuple val(meta), path("in") + + output: + tuple val(meta), path("in/*.{fa,fasta}.{gz}") + + "true" +} + +// 
+// MODULE: GENERATE AN INDICATOR FILE REQUIRED FOR SANGER INTERNAL PIPELINE TRACKING +// +process GenIndicator { + label 'process_tiny' + + tag "Generating Phase 1 Indicator" + executor 'local' + + input: + val(a) + val(b) + val(c) + val(d) + val(e) + val(f) + + output: + path("decon_first_stage_done_indicator_file.txt") + + script: + "touch decon_first_stage_done_indicator_file.txt" } /* @@ -309,6 +622,18 @@ workflow.onComplete { } // TreeValProject.summary(workflow, reference_tuple, summary_params, projectDir) + if (workflow.success) { + // Generate a pipeline completion indicator file, for use in + // Sanger-ToL automation + def newFile = new File("${params.outdir}/pipeline_run_done_indicator_file.txt") + newFile.createNewFile() + } + +} + +workflow.onError = { + def newFile = new File("${params.outdir}/pipeline_run_ERROR_indicator_file.txt") + newFile.createNewFile() } /*
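The onComplete / onError handlers above create the run-status indicator files with java.io.File, which assumes --outdir is a local POSIX path. As a hedged alternative sketch only (not what this changeset implements), the same indicator-file pattern can be written with Nextflow's file() helper, which also resolves object-storage paths; the filenames are reused from the handlers above and params.outdir is assumed to be set.

// indicator_sketch.nf -- illustration of the indicator-file pattern
workflow.onComplete {
    if (workflow.success) {
        // touch the "done" indicator used by Sanger-ToL automation
        file("${params.outdir}/pipeline_run_done_indicator_file.txt").text = ''
    }
}

workflow.onError {
    // touch the error indicator so downstream automation can detect the failure
    file("${params.outdir}/pipeline_run_ERROR_indicator_file.txt").text = ''
}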