From 9513796e4b928ab8fd05110e9bcd2b9c77e06854 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Apr 2024 12:39:41 +0100 Subject: [PATCH 001/117] initial work on the merge modules --- bin/create_btk_dataset.py | 184 ++++++++++++++++++++++++ conf/base.config | 2 +- modules/local/create_btk_dataset.nf | 48 +++++++ modules/local/gc_content.nf | 4 +- modules/local/get_kmer_counts.nf | 6 +- subworkflows/local/extract_nt_blast.nf | 1 + subworkflows/local/get_kmers_profile.nf | 2 - subworkflows/local/organellar_blast.nf | 2 +- subworkflows/local/run_fcsadaptor.nf | 4 +- subworkflows/local/run_fcsgx.nf | 14 +- subworkflows/local/run_nt_kraken.nf | 6 +- subworkflows/local/run_read_coverage.nf | 2 +- subworkflows/local/run_vecscreen.nf | 4 +- subworkflows/local/yaml_input.nf | 17 ++- workflows/ascc.nf | 115 ++++++++++++--- 15 files changed, 367 insertions(+), 44 deletions(-) create mode 100644 bin/create_btk_dataset.py create mode 100644 modules/local/create_btk_dataset.nf diff --git a/bin/create_btk_dataset.py b/bin/create_btk_dataset.py new file mode 100644 index 00000000..6c83093b --- /dev/null +++ b/bin/create_btk_dataset.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +Script for creating a BlobToolKit dataset +""" + +import general_purpose_functions as gpf +import argparse +from pathlib import Path +import sys +import os.path + +#installing BlobToolKit dependencies: +#conda install -c tolkit tolkein +#conda install -c bioconda pysa + + +def create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name): + """ + Creates the assembly YAML file for creating a BlobToolKit dataset + """ + if ".gz" in assembly_alias: + assembly_alias = assembly_alias.replace(".gz", "_gz") + out_string = "assembly:\n accession: NA\n alias: {}\n record_type: scaffold\n bioproject: NA\n biosample: NA\ntaxon:\n name: {}".format(assembly_alias, taxon_name) + with open(assembly_yaml_path, "w") as f: + f.write(out_string) + + +def tiara_results_to_btk_format(tiara_results_path, 
outfile_path): + """ + Reformatting Tiara output file so that the summarised results of the first and second pass of Tiara can be + added to a BlobToolKit dataset + """ + tiara_data = gpf.l(tiara_results_path) + tiara_data = tiara_data[1:len(tiara_data)] + with open(outfile_path, "w") as f: + f.write("identifier\ttiara\n") + for line in tiara_data: + split_line = line.split() + if len(split_line) != 3: + sys.stderr.write("Failed to parse the Tiara results file {}\n".format(tiara_results_path)) + sys.exit(1) + first_pass_result = split_line[1] + second_pass_result = split_line[2] + if second_pass_result != "n/a": + first_pass_result = second_pass_result + f.write(split_line[0] + "\t" + first_pass_result + "\n") + + +def detect_dim_reduction_methods(kmers_dim_reduction_output_path): + """ + Parses the header of the kmers dimensionality reduction report file to detect which dimensionality reduction methods were used + """ + header_string = None + with open(kmers_dim_reduction_output_path) as f: + header_string = f.readline() + header_string = header_string.strip() + split_header = header_string.split(",") + dim_reduction_methods = list() + for header_item in split_header: + if header_item.startswith("embedding_"): + if header_item.startswith("embedding_x_"): + header_item = header_item.split("embedding_x_")[1] + elif header_item.startswith("embedding_y_"): + header_item = header_item.split("embedding_y_")[1] + if header_item not in dim_reduction_methods: + dim_reduction_methods.append(header_item) + return dim_reduction_methods + + +def add_custom_variables_to_btk_dataset(pipeline_run_folder, btk_dataset_folder): + """ + Script for adding custom variables (e.g. 
Tiara results and PCA results) to the BlobToolKit dataset + """ + pipeline_output_folder = pipeline_run_folder + "/collected_tables" + if os.path.isdir(pipeline_output_folder) == False: + sys.stderr.write("The directory for the output tables of the pipeline ({}) was not found\n".format(pipeline_output_folder)) + sys.exit(1) + if os.path.isdir(btk_dataset_folder) == False: + sys.stderr.write("The BlobToolKit dataset directory ({}) was not found\n".format(btk_dataset_folder)) + sys.exit(1) + tiara_raw_output_path = pipeline_output_folder + "/tiara_out.txt" + if os.path.isfile(tiara_raw_output_path) and os.stat(tiara_raw_output_path).st_size > 0: + tiara_reformatted_output_path = pipeline_output_folder + "/tiara_out_btk_format.tsv" + tiara_results_to_btk_format(tiara_raw_output_path, tiara_reformatted_output_path) + add_tiara_command = 'blobtools add --text {} --text-delimiter "\t" --text-cols "identifier=identifiers,tiara=tiara" --text-header {}'.format(tiara_reformatted_output_path, btk_dataset_folder) + gpf.run_system_command(add_tiara_command) + + kmers_dim_reduction_output_path = pipeline_output_folder + "/kmers_dim_reduction_embeddings.csv" + if os.path.isfile(kmers_dim_reduction_output_path) and os.stat(kmers_dim_reduction_output_path).st_size > 0: + used_dim_reduction_methods = detect_dim_reduction_methods(kmers_dim_reduction_output_path) + for dim_reduction_method in used_dim_reduction_methods: + add_embedding_command = 'blobtools add --text {path} --text-delimiter "," --text-cols scaff=identifiers,embedding_x_{dim_reduction_method}=embedding_x_{dim_reduction_method},embedding_y_{dim_reduction_method}=embedding_y_{dim_reduction_method} --text-header {btk_dataset_folder}'.format(path=kmers_dim_reduction_output_path, dim_reduction_method=dim_reduction_method, btk_dataset_folder=btk_dataset_folder) + gpf.run_system_command(add_embedding_command) + + + kraken_lineage_path = pipeline_output_folder + "/nt_kraken_lineage.txt" + if os.path.isfile(kraken_lineage_path) 
and os.stat(kraken_lineage_path).st_size > 0: + for taxonomy_level in ("species", "genus", "family", "order", "class", "phylum", "kingdom", "domain"): + add_kraken_command = 'blobtools add --text {} --text-delimiter "," --text-cols scaff=identifiers,nt_kraken_{}=nt_kraken_{} --text-header {}'.format(kraken_lineage_path, taxonomy_level, taxonomy_level, btk_dataset_folder) + gpf.run_system_command(add_kraken_command) + + fcs_gx_output_path = pipeline_output_folder + "/fcs-gx_summary.csv" + if os.path.isfile(fcs_gx_output_path) and os.stat(fcs_gx_output_path).st_size > 0: + add_fcs_gx_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,fcs_gx_top_tax_name=fcs_gx_top_tax_name,fcs_gx_div=fcs_gx_div,fcs_gx_action=fcs_gx_action" --text-header {}'.format(fcs_gx_output_path, btk_dataset_folder) + gpf.run_system_command(add_fcs_gx_results_command) + + #cobiontid_markerscan_json_file_path = run_folder + "/" + sample_id + ".json" + #cobiontid_scaffs_json_to_csv(json_file_path, out_folder + "/cobiontid_markerscan.csv") + cobiontid_markerscan_output_path = pipeline_output_folder + "/cobiontid_markerscan.csv" + if os.path.isfile(cobiontid_markerscan_output_path) and os.stat(cobiontid_markerscan_output_path).st_size > 0: + add_cobiontid_markerscan_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,CobiontID_MarkerScan_embl_ebi_ena=CobiontID_MarkerScan_embl_ebi_ena,CobiontID_MarkerScan_slv=CobiontID_MarkerScan_slv,CobiontID_MarkerScan_Cluster=CobiontID_MarkerScan_Cluster" --text-header {}'.format(cobiontid_markerscan_output_path, btk_dataset_folder) + gpf.run_system_command(add_cobiontid_markerscan_results_command) + + cobiontid_contigviz_output_path = pipeline_output_folder + "/contigviz_results.csv" + if os.path.isfile(cobiontid_contigviz_output_path) and os.stat(cobiontid_contigviz_output_path).st_size > 0: + add_cobiontid_contigviz_results_command = 'blobtools add --text {} --text-delimiter 
"," --text-cols "scaff=identifiers,ContigViz_UMAP1=ContigViz_UMAP1,ContigViz_UMAP2=ContigViz_UMAP2,ContigViz_Hexamer_continuous=ContigViz_Hexamer_continuous,ContigViz_Hexamer_digitized=ContigViz_Hexamer_digitized,ContigViz_FastK_continuous=ContigViz_FastK_continuous,ContigViz_FastK_digitized=ContigViz_FastK_digitized,ContigViz_Unique_15mers_continuous=ContigViz_Unique_15mers_continuous,ContigViz_Unique_15mers_digitized=ContigViz_Unique_15mers_digitized,ContigViz_Coverage_continuous=ContigViz_Coverage_continuous,ContigViz_Coverage_digitized=ContigViz_Coverage_digitized" --text-header {}'.format(cobiontid_contigviz_output_path, btk_dataset_folder) + gpf.run_system_command(add_cobiontid_contigviz_results_command) + + + +def main(assembly_fasta_path, dataset_folder, pipeline_run_folder, assembly_title, taxon_name, taxid, blastn_hits_path, uniprot_diamond_hits_path, nr_diamond_hits_path, mapped_reads_path, taxdump_path, threads, assembly_alias, dry_run_flag): + + #out_folder = pipeline_run_folder + "/collected_tables" + + if assembly_alias == "": + assembly_alias = assembly_title + + if dry_run_flag == False: + Path(dataset_folder).mkdir(parents=True, exist_ok=True) + + edited_assembly_title = assembly_title.replace(".", "_") + edited_assembly_title = edited_assembly_title.replace(" ", "_") + + assembly_yaml_path = dataset_folder + "/" + edited_assembly_title + ".yaml" + if dry_run_flag == False: + create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name) + + blobtools_create_command = "blobtools create --fasta {} --meta {} --taxid {} --taxdump {} {}".format(assembly_fasta_path, assembly_yaml_path, taxid, taxdump_path, dataset_folder) + gpf.run_system_command(blobtools_create_command, dry_run=dry_run_flag) + + + hits_file_paths = [blastn_hits_path, uniprot_diamond_hits_path, nr_diamond_hits_path] + hits_file_paths = [n for n in hits_file_paths if os.path.isfile(n) is True and os.stat(n).st_size > 0] + + if len(hits_file_paths) > 0: + add_hits_command = 
"blobtools add" + for hits_file_path in hits_file_paths: + add_hits_command += " --hits {}".format(hits_file_path) + add_hits_command += " --taxrule bestsum --taxdump {} {}".format(taxdump_path, dataset_folder) + gpf.run_system_command(add_hits_command, dry_run=dry_run_flag) + + + if os.path.isfile(mapped_reads_path) is True and os.stat(mapped_reads_path).st_size > 0: + add_cov_command = "blobtools add --cov {} --threads {} {}".format(mapped_reads_path, threads, dataset_folder) + gpf.run_system_command(add_cov_command, dry_run=dry_run_flag) + + #export_table_command = "blobtools filter --table {}/btk_summary_table_basic.tsv {}".format(out_folder, dataset_folder) + add_custom_variables_to_btk_dataset(pipeline_run_folder, dataset_folder) + export_table_command = "blobtools filter --table {}/collected_tables/btk_summary_table_full.tsv {}".format(pipeline_run_folder, dataset_folder) + + gpf.run_system_command(export_table_command, dry_run=dry_run_flag) + + # json_file_path = run_folder + "/" + sample_id + ".json" + #cobiontid_scaffs_json_to_csv(json_file_path, out_folder + "/cobiontid_markerscan.csv") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("assembly_fasta_path", type=str, help="assembly_fasta_path") + parser.add_argument("dataset_folder", type=str, help="Path for dataset folder") + parser.add_argument("pipeline_run_folder", type=str, help="Folder where this pipeline is run pipeline") + parser.add_argument("assembly_title", type=str, help="Assembly title") + parser.add_argument("taxon_name", type=str, help="Taxon name") + parser.add_argument("taxid", type=int, help="taxid") + parser.add_argument("blastn_hits_path", type=str, help="Path to blastn hits file") + parser.add_argument("uniprot_diamond_hits_path", type=str, help="Path to UNIPROT Diamond BLASTX hits file") + parser.add_argument("nr_diamond_hits_path", type=str, help="Path to nr Diamond BLASTX hits file") + 
parser.add_argument("mapped_reads_path", type=str, help="Path to the BAM file with mapped reads for coverage estimation") + parser.add_argument("taxdump_path", type=str, help="Path to the directory with NCBI taxdump files") + parser.add_argument("--threads", type=int, default=1, help="Number of CPU threads (default: 1)") + parser.add_argument("--assembly_alias", type=str, default="", help="Assembly alias") + parser.add_argument("--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)") + args = parser.parse_args() + main(args.assembly_fasta_path, args.dataset_folder, args.pipeline_run_folder, args.assembly_title, args.taxon_name, args.taxid, args.blastn_hits_path, args.uniprot_diamond_hits_path, args.nr_diamond_hits_path, args.mapped_reads_path, args.taxdump_path, args.threads, args.assembly_alias,args.dry_run) diff --git a/conf/base.config b/conf/base.config index 4e764169..e0e853b0 100644 --- a/conf/base.config +++ b/conf/base.config @@ -15,7 +15,7 @@ process { memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } - withName: BLAST_BLASTN { + withName: BLAST_BLASTN_MOD { memory = { check_max( 50.GB * task.attempt, 'memory' ) } time = { check_max( 12.h * task.attempt, 'time' ) } } diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf new file mode 100644 index 00000000..638a58b7 --- /dev/null +++ b/modules/local/create_btk_dataset.nf @@ -0,0 +1,48 @@ +process CREATE_BTK_DATASET { + label 'process_high' + + input: + tuple val(meta), path(reference) + path gc_content, stageAs: "?/GC_CONTENT.txt" + path dot_genome, stageAs: "?/SORTED.genome" + path kmers, stageAs: "?/KMERS_dim_reduction_embeddings_combined.csv" + path tiara, stageAs: "?/TIARA.txt" + path nt_blast, stageAs: "?/BLAST_with_LINEAGE.csv" + path mito, stageAs: "?/MITOCHO.contamination_recommendation" + path chloro, stageAs: "?/PLASTID.contamination_recommendation" + path 
fcs_adapt, stageAs: "?/*" + path fcsgx, stageAs: "?/FCSGX_parsed.csv" + path barcode, stageAs: "?/*" + path coverage, stageAs: "?/COVERAGE_AVERAGE.txt" + path vecscreen, stageAs: "?/VECSCREEN.vecscreen_contamination" + path kraken_class, stageAs: "?/KRAKEN_CLASSIFIED.txt" + path kraken_report, stageAs: "?/KRAKEN_REPORT.txt" + path kraken_lineage, stageAs: "?/KRAKEN_LINEAGE.txt" + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = args.ext.prefix ?: "${meta.id}" + def args = args.ext.args ?: "" + + """ + + mkdir -p btk_datasets/ + + create_btk_dataset.py \\ + ${reference} \\ + btk_datasets/ \\ + ./1/ \\ + ${meta.id} \\ + ${meta.sci_name} \\ + ${meta.taxid} \\ + ${nt_blast} \\ + NA \\ + NA \\ + ${coverage} \\ + $ + + + """ +} \ No newline at end of file diff --git a/modules/local/gc_content.nf b/modules/local/gc_content.nf index 825b0ec8..a2673d7d 100644 --- a/modules/local/gc_content.nf +++ b/modules/local/gc_content.nf @@ -11,13 +11,13 @@ process GC_CONTENT { tuple val(meta), path(fasta) output: - tuple val(meta), path( "*-gc.txt" ) , emit: txt + tuple val(meta), path( "*-GC_CONTENT.txt" ) , emit: txt path "versions.yml" , emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" """ - gc_content.py ${fasta} > ${prefix}-gc.txt + gc_content.py ${fasta} > ${prefix}-GC_CONTENT.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/get_kmer_counts.nf b/modules/local/get_kmer_counts.nf index b0742fe2..082d7de6 100755 --- a/modules/local/get_kmer_counts.nf +++ b/modules/local/get_kmer_counts.nf @@ -12,7 +12,7 @@ process GET_KMER_COUNTS { val kmer_size output: - tuple val(meta), path( "*_kmer_counts.csv" ) , emit: csv + tuple val(meta), path( "*_KMER_COUNTS.csv" ) , emit: csv path "versions.yml" , emit: versions when: @@ -24,7 +24,7 @@ process GET_KMER_COUNTS { """ get_kmers_counts.py \\ $input_fasta \\ - ${prefix}_kmer_counts.csv \\ + ${prefix}_KMER_COUNTS.csv \\ --kmer_size $kmer_size cat <<-END_VERSIONS > 
versions.yml @@ -40,7 +40,7 @@ process GET_KMER_COUNTS { def KCOUNTER_VERSION = "0.1.1" def prefix = args.ext.prefix ?: "${meta.id}" """ - touch ${prefix}_kmer_counts.csv + touch ${prefix}_KMER_COUNTS.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/extract_nt_blast.nf b/subworkflows/local/extract_nt_blast.nf index 16f97efb..26e5ee9f 100644 --- a/subworkflows/local/extract_nt_blast.nf +++ b/subworkflows/local/extract_nt_blast.nf @@ -143,6 +143,7 @@ workflow EXTRACT_NT_BLAST { ch_versions = ch_versions.mix(GET_LINEAGE_FOR_TOP.out.versions) emit: + ch_top_lineages = GET_LINEAGE_FOR_TOP.out.full versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/get_kmers_profile.nf b/subworkflows/local/get_kmers_profile.nf index de117122..6775e613 100755 --- a/subworkflows/local/get_kmers_profile.nf +++ b/subworkflows/local/get_kmers_profile.nf @@ -80,8 +80,6 @@ workflow GET_KMERS_PROFILE { } .set { collected_files_for_combine } - collected_files_for_combine.view() - // // MODULE: COMBINE OUTPUTS OF MULTIPLE METHODS // diff --git a/subworkflows/local/organellar_blast.nf b/subworkflows/local/organellar_blast.nf index fe4342e7..41fa9e17 100644 --- a/subworkflows/local/organellar_blast.nf +++ b/subworkflows/local/organellar_blast.nf @@ -120,7 +120,7 @@ workflow ORGANELLAR_BLAST { ch_versions = ch_versions.mix(ORGANELLE_CONTAMINATION_RECOMMENDATIONS.out.versions) emit: - organelle_report = ORGANELLE_CONTAMINATION_RECOMMENDATIONS.out.recommendations + organelle_report= ORGANELLE_CONTAMINATION_RECOMMENDATIONS.out.recommendations versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/run_fcsadaptor.nf b/subworkflows/local/run_fcsadaptor.nf index 16b52759..0818140c 100755 --- a/subworkflows/local/run_fcsadaptor.nf +++ b/subworkflows/local/run_fcsadaptor.nf @@ -30,8 +30,8 @@ workflow RUN_FCSADAPTOR { ch_versions = ch_versions.mix(FCS_FCSADAPTOR_EUK.out.versions) emit: - FCS_FCSADAPTOR_EUK.out.adaptor_report - 
FCS_FCSADAPTOR_PROK.out.adaptor_report + ch_euk = FCS_FCSADAPTOR_EUK.out.adaptor_report + ch_prok = FCS_FCSADAPTOR_PROK.out.adaptor_report versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/run_fcsgx.nf b/subworkflows/local/run_fcsgx.nf index 3e8c6d41..7c4e04bb 100644 --- a/subworkflows/local/run_fcsgx.nf +++ b/subworkflows/local/run_fcsgx.nf @@ -13,11 +13,17 @@ workflow RUN_FCSGX { ch_versions = Channel.empty() Channel - .of('all.gxi', 'all.gxs', 'all.taxa.tsv', 'all.meta.jsonl', 'all.blast_div.tsv.gz') - .combine(fcsgxpath) - .map {suxfix, dbpath -> [file(dbpath + '/' + suxfix)]} + .of( + 'all.gxi', 'all.gxs', 'all.taxa.tsv', 'all.meta.jsonl', 'all.blast_div.tsv.gz' + ) + .combine( + fcsgxpath + ) + .map {suxfix, dbpath -> + [file(dbpath + '/' + suxfix)] + } .collect() - .set {fcsgxdb} + .set { fcsgxdb } // // Create input channel for FCS_FCSGX, taxid is required to be the meta id. diff --git a/subworkflows/local/run_nt_kraken.nf b/subworkflows/local/run_nt_kraken.nf index db2f668b..6f5b5e6a 100755 --- a/subworkflows/local/run_nt_kraken.nf +++ b/subworkflows/local/run_nt_kraken.nf @@ -45,8 +45,8 @@ workflow RUN_NT_KRAKEN { ch_versions = ch_versions.mix(GET_LINEAGE_FOR_KRAKEN.out.versions) emit: - KRAKEN2_KRAKEN2.out.classified_reads_assignment - KRAKEN2_KRAKEN2.out.report - GET_LINEAGE_FOR_KRAKEN.out.txt + classified = KRAKEN2_KRAKEN2.out.classified_reads_assignment + report = KRAKEN2_KRAKEN2.out.report + lineage = GET_LINEAGE_FOR_KRAKEN.out.txt versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/run_read_coverage.nf b/subworkflows/local/run_read_coverage.nf index 001d2f4f..2a08c235 100644 --- a/subworkflows/local/run_read_coverage.nf +++ b/subworkflows/local/run_read_coverage.nf @@ -83,6 +83,6 @@ workflow RUN_READ_COVERAGE { ch_versions = ch_versions.mix( SAMTOOLS_DEPTH_AVERAGE_COVERAGE.out.versions ) emit: - versions = ch_versions.ifEmpty(null) tsv_ch = SAMTOOLS_DEPTH_AVERAGE_COVERAGE.out.average_coverage + versions = 
ch_versions.ifEmpty(null) } \ No newline at end of file diff --git a/subworkflows/local/run_vecscreen.nf b/subworkflows/local/run_vecscreen.nf index fce4395f..db882680 100644 --- a/subworkflows/local/run_vecscreen.nf +++ b/subworkflows/local/run_vecscreen.nf @@ -24,8 +24,6 @@ workflow RUN_VECSCREEN { // // MODULE: RUNS NCBI VECSCREEN // - vecscreen_database_tuple.view() - NCBITOOLS_VECSCREEN( CHUNK_ASSEMBLY_FOR_VECSCREEN.out.chunked_assembly, vecscreen_database_tuple @@ -45,7 +43,7 @@ workflow RUN_VECSCREEN { ch_versions = ch_versions.mix( SUMMARISE_VECSCREEN_OUTPUT.out.versions ) emit: - vecscreen_contamination = SUMMARISE_VECSCREEN_OUTPUT.out.vecscreen_contamination + vecscreen_contam = SUMMARISE_VECSCREEN_OUTPUT.out.vecscreen_contamination versions = ch_versions.ifEmpty( null ) // channel: [ versions.yml ] } diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 6ed9ea4d..8732de1d 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -57,9 +57,20 @@ workflow YAML_INPUT { .set { seqkit } group.assembly_title - .combine( group.assembly_path ) - .map { id, file -> - tuple( [ id: id ], + .combine( + group.assembly_path, + ) + .combine( + group.taxid, + ) + .combine( + group.sci_name + ) + .map { id, file, tax, sci -> + tuple( [ id: id, + taxid: tax, + sci_name: sci + ], file ) } diff --git a/workflows/ascc.nf b/workflows/ascc.nf index a80033d5..a0f625cd 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -42,6 +42,8 @@ include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST } from '../subworkflows/ // MODULE: Local modules // include { GC_CONTENT } from '../modules/local/gc_content' +include { CREATE_BTK_DATASET } from '../modules/local/create_btk_dataset' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -63,12 +65,12 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-co workflow ASCC { main: - ch_versions = Channel.empty() - ch_out_merge = 
Channel.empty() + ch_versions = Channel.empty() + ch_out_merge = Channel.empty() - workflow_steps = params.steps.split(",") + workflow_steps = params.steps.split(",") - input_ch = Channel.fromPath(params.input, checkIfExists: true) + input_ch = Channel.fromPath(params.input, checkIfExists: true) // // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE @@ -76,7 +78,7 @@ workflow ASCC { YAML_INPUT ( input_ch ) - ch_versions = ch_versions.mix(YAML_INPUT.out.versions) + ch_versions = ch_versions.mix(YAML_INPUT.out.versions) // // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA @@ -84,8 +86,7 @@ workflow ASCC { GC_CONTENT ( YAML_INPUT.out.reference_tuple ) - ch_out_merge = ch_out_merge.mix(GC_CONTENT.out.txt) - ch_versions = ch_versions.mix(GC_CONTENT.out.versions) + ch_versions = ch_versions.mix(GC_CONTENT.out.versions) // // SUBWORKFLOW: GENERATE GENOME FILE @@ -94,7 +95,7 @@ workflow ASCC { YAML_INPUT.out.reference_tuple, YAML_INPUT.out.pacbio_barcodes ) - ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) + ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) // // SUBWORKFLOW: COUNT KMERS, THEN REDUCE DIMENSIONS USING SELECTED METHODS @@ -119,17 +120,23 @@ workflow ASCC { YAML_INPUT.out.n_neighbours, autoencoder_epochs_count.map{it -> it[2]} ) - ch_versions = ch_versions.mix(GET_KMERS_PROFILE.out.versions) + ch_versions = ch_versions.mix(GET_KMERS_PROFILE.out.versions) + ch_kmers = GET_KMERS_PROFILE.out.combined_csv + } else { + ch_kmers = [] } // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA // - if ( workflow_steps.contains('tiara') ) { + if ( workflow_steps.contains('tiara') || workflow_steps.contains('ALL')) { EXTRACT_TIARA_HITS ( GENERATE_GENOME.out.reference_tuple ) - ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) + ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) + ch_tiara = EXTRACT_TIARA_HITS.out.ch_tiara.map{it[1]} + } else { + ch_tiara = [] } // @@ -157,7 +164,10 @@ workflow ASCC { 
YAML_INPUT.out.ncbi_accessions, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) + ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) + ch_nt_blast = EXTRACT_NT_BLAST.out.ch_top_lineages.map{it[1]} + } else { + ch_nt_blast = [] } if ( workflow_steps.contains('mito') || workflow_steps.contains('ALL') ) { @@ -180,7 +190,10 @@ workflow ASCC { YAML_INPUT.out.mito_var, mito_check.valid ) - ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) + ch_mito = MITO_ORGANELLAR_BLAST.out.organelle_report.map{it[1]} + ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) + } else { + ch_mito = [] } if ( workflow_steps.contains('chloro') || workflow_steps.contains('ALL') ) { @@ -203,7 +216,10 @@ workflow ASCC { YAML_INPUT.out.plastid_var, plastid_check.valid ) - ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) + ch_chloro = PLASTID_ORGANELLAR_BLAST.out.organelle_report.map{it[1]} + ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) + } else { + ch_chloro = [] } @@ -214,8 +230,18 @@ workflow ASCC { RUN_FCSADAPTOR ( YAML_INPUT.out.reference_tuple ) - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + RUN_FCSADAPTOR.out.ch_euk + .map{it[1]} + .combine( + RUN_FCSADAPTOR.out.ch_prok.map{it[1]} + ) + .set{ ch_fcsadapt } + ch_fcsadapt + ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + } else { + ch_fcsadapt = [] } + // // SUBWORKFLOW: // @@ -226,7 +252,10 @@ workflow ASCC { YAML_INPUT.out.taxid, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + ch_fcsgx = RUN_FCSGX.out.fcsgxresult.map{it[1]} + ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + } else { + ch_fcsgx = [] } // @@ -239,7 +268,19 @@ workflow ASCC { YAML_INPUT.out.pacbio_barcodes, YAML_INPUT.out.pacbio_multiplex_codes ) - ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) + + 
PACBIO_BARCODE_CHECK.out.filtered + .map{ + it[1] + } + .collect() + .set { + ch_barcode + } + + ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) + } else { + ch_barcode = [] } // @@ -252,7 +293,10 @@ workflow ASCC { YAML_INPUT.out.pacbio_tuple, YAML_INPUT.out.reads_type ) - ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) + ch_coverage = RUN_READ_COVERAGE.out.tsv_ch.map{it[1]} + ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) + } else { + ch_coverage = [] } // @@ -263,7 +307,10 @@ workflow ASCC { GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.vecscreen_database_path ) - ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) + ch_vecscreen = RUN_VECSCREEN.out.vecscreen_contam.map{it[1]} + ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) + } else { + ch_vecscreen = [] } // @@ -275,12 +322,42 @@ workflow ASCC { YAML_INPUT.out.nt_kraken_db_path, YAML_INPUT.out.ncbi_rankedlineage_path ) + ch_kraken1 = RUN_NT_KRAKEN.out.classified.map{it[1]} + ch_kraken2 = RUN_NT_KRAKEN.out.report.map{it[1]} + ch_kraken3 = RUN_NT_KRAKEN.out.lineage + + ch_versions = ch_versions.mix(RUN_NT_KRAKEN.out.versions) + } else { + ch_kraken1 = [] + ch_kraken2 = [] + ch_kraken3 = [] } // mix the outputs of the outpuutting process so that we can // insert them into the one process to create the btk and the merged report // much like the versions channel + GENERATE_GENOME.out.reference_tuple.view() + + CREATE_BTK_DATASET ( + GENERATE_GENOME.out.reference_tuple, + GC_CONTENT.out.txt.map{it[1]}, + GENERATE_GENOME.out.dot_genome.map{it[1]}, + ch_kmers, + ch_tiara, + ch_nt_blast, + ch_mito, + ch_chloro, + ch_fcsadapt, + ch_fcsgx, + ch_barcode, + ch_coverage, + ch_vecscreen, + ch_kraken1, + ch_kraken2, + ch_kraken3 + ) + // // SUBWORKFLOW: Collates version data from prior subworflows // From 3d4dbc754df64cc429feaf16bf6eada6e1d1888f Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Apr 2024 14:41:53 +0100 Subject: [PATCH 002/117] Updates to 
the create_btk_dataset process --- bin/create_btk_dataset.py | 30 ++++++++++------------ modules/local/create_btk_dataset.nf | 34 ++++++++++--------------- subworkflows/local/run_read_coverage.nf | 5 ++-- workflows/ascc.nf | 13 +++++----- 4 files changed, 36 insertions(+), 46 deletions(-) diff --git a/bin/create_btk_dataset.py b/bin/create_btk_dataset.py index 6c83093b..1447e5b5 100644 --- a/bin/create_btk_dataset.py +++ b/bin/create_btk_dataset.py @@ -9,10 +9,6 @@ import sys import os.path -#installing BlobToolKit dependencies: -#conda install -c tolkit tolkein -#conda install -c bioconda pysa - def create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name): """ @@ -166,19 +162,19 @@ def main(assembly_fasta_path, dataset_folder, pipeline_run_folder, assembly_titl if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("assembly_fasta_path", type=str, help="assembly_fasta_path") - parser.add_argument("dataset_folder", type=str, help="Path for dataset folder") - parser.add_argument("pipeline_run_folder", type=str, help="Folder where this pipeline is run pipeline") - parser.add_argument("assembly_title", type=str, help="Assembly title") - parser.add_argument("taxon_name", type=str, help="Taxon name") - parser.add_argument("taxid", type=int, help="taxid") - parser.add_argument("blastn_hits_path", type=str, help="Path to blastn hits file") - parser.add_argument("uniprot_diamond_hits_path", type=str, help="Path to UNIPROT Diamond BLASTX hits file") - parser.add_argument("nr_diamond_hits_path", type=str, help="Path to nr Diamond BLASTX hits file") - parser.add_argument("mapped_reads_path", type=str, help="Path to the BAM file with mapped reads for coverage estimation") - parser.add_argument("taxdump_path", type=str, help="Path to the directory with NCBI taxdump files") - parser.add_argument("--threads", type=int, default=1, help="Number of CPU threads (default: 1)") - parser.add_argument("--assembly_alias", 
type=str, default="", help="Assembly alias") + parser.add_argument("assembly_fasta_path", type=str, help="assembly_fasta_path") + parser.add_argument("dataset_folder", type=str, help="Path for dataset folder") + parser.add_argument("pipeline_run_folder", type=str, help="Folder where this pipeline is run pipeline") + parser.add_argument("assembly_title", type=str, help="Assembly title") + parser.add_argument("taxon_name", type=str, help="Taxon name") + parser.add_argument("taxid", type=int, help="taxid") + parser.add_argument("blastn_hits_path", type=str, help="Path to blastn hits file") + parser.add_argument("uniprot_diamond_hits_path", type=str, help="Path to UNIPROT Diamond BLASTX hits file") + parser.add_argument("nr_diamond_hits_path", type=str, help="Path to nr Diamond BLASTX hits file") + parser.add_argument("mapped_reads_path", type=str, help="Path to the BAM file with mapped reads for coverage estimation") + parser.add_argument("taxdump_path", type=str, help="Path to the directory with NCBI taxdump files") + parser.add_argument("--threads", type=int, default=1, help="Number of CPU threads (default: 1)") + parser.add_argument("--assembly_alias", type=str, default="", help="Assembly alias") parser.add_argument("--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)") args = parser.parse_args() main(args.assembly_fasta_path, args.dataset_folder, args.pipeline_run_folder, args.assembly_title, args.taxon_name, args.taxid, args.blastn_hits_path, args.uniprot_diamond_hits_path, args.nr_diamond_hits_path, args.mapped_reads_path, args.taxdump_path, args.threads, args.assembly_alias,args.dry_run) diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 638a58b7..bd3d550f 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -1,23 +1,19 @@ process CREATE_BTK_DATASET { - label 'process_high' + label 'process_medium' input: tuple val(meta), 
path(reference) - path gc_content, stageAs: "?/GC_CONTENT.txt" path dot_genome, stageAs: "?/SORTED.genome" path kmers, stageAs: "?/KMERS_dim_reduction_embeddings_combined.csv" path tiara, stageAs: "?/TIARA.txt" path nt_blast, stageAs: "?/BLAST_with_LINEAGE.csv" - path mito, stageAs: "?/MITOCHO.contamination_recommendation" - path chloro, stageAs: "?/PLASTID.contamination_recommendation" - path fcs_adapt, stageAs: "?/*" path fcsgx, stageAs: "?/FCSGX_parsed.csv" - path barcode, stageAs: "?/*" + path mapped_bam, stageAs: "?/MAPPED.bam" path coverage, stageAs: "?/COVERAGE_AVERAGE.txt" - path vecscreen, stageAs: "?/VECSCREEN.vecscreen_contamination" path kraken_class, stageAs: "?/KRAKEN_CLASSIFIED.txt" path kraken_report, stageAs: "?/KRAKEN_REPORT.txt" path kraken_lineage, stageAs: "?/KRAKEN_LINEAGE.txt" + path ncbi_taxdump when: task.ext.when == null || task.ext.when @@ -27,22 +23,20 @@ process CREATE_BTK_DATASET { def args = args.ext.args ?: "" """ - mkdir -p btk_datasets/ create_btk_dataset.py \\ - ${reference} \\ - btk_datasets/ \\ - ./1/ \\ - ${meta.id} \\ - ${meta.sci_name} \\ - ${meta.taxid} \\ - ${nt_blast} \\ - NA \\ - NA \\ - ${coverage} \\ - $ - + ${reference} \\ + btk_datasets/ \\ + ./1/ \\ + ${meta.id} \\ + ${meta.sci_name} \\ + ${meta.taxid} \\ + ${nt_blast} \\ + UNIPROT_HITS \\ + DIAMOND_HITS \\ + ${mapped_bam} \\ + ${ncbi_taxdump} """ } \ No newline at end of file diff --git a/subworkflows/local/run_read_coverage.nf b/subworkflows/local/run_read_coverage.nf index 2a08c235..f29aea6b 100644 --- a/subworkflows/local/run_read_coverage.nf +++ b/subworkflows/local/run_read_coverage.nf @@ -83,6 +83,7 @@ workflow RUN_READ_COVERAGE { ch_versions = ch_versions.mix( SAMTOOLS_DEPTH_AVERAGE_COVERAGE.out.versions ) emit: - tsv_ch = SAMTOOLS_DEPTH_AVERAGE_COVERAGE.out.average_coverage - versions = ch_versions.ifEmpty(null) + tsv_ch = SAMTOOLS_DEPTH_AVERAGE_COVERAGE.out.average_coverage + bam_ch = SAMTOOLS_SORT.out.bam + versions = ch_versions.ifEmpty(null) } \ No 
newline at end of file diff --git a/workflows/ascc.nf b/workflows/ascc.nf index a0f625cd..3119d867 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -294,9 +294,11 @@ workflow ASCC { YAML_INPUT.out.reads_type ) ch_coverage = RUN_READ_COVERAGE.out.tsv_ch.map{it[1]} + ch_bam = RUN_READ_COVERAGE.out.bam_ch.map{it[1]} ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) } else { ch_coverage = [] + ch_bam = [] } // @@ -341,21 +343,18 @@ workflow ASCC { CREATE_BTK_DATASET ( GENERATE_GENOME.out.reference_tuple, - GC_CONTENT.out.txt.map{it[1]}, GENERATE_GENOME.out.dot_genome.map{it[1]}, ch_kmers, ch_tiara, ch_nt_blast, - ch_mito, - ch_chloro, - ch_fcsadapt, ch_fcsgx, - ch_barcode, + ch_bam, ch_coverage, - ch_vecscreen, ch_kraken1, ch_kraken2, - ch_kraken3 + ch_kraken3, + YAML_INPUT.out.ncbi_taxonomy_path, + ) // From e040d3e95ceabae09e3d9448c68b813c61f2e427 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Apr 2024 14:55:40 +0100 Subject: [PATCH 003/117] Updates to the create_btk_dataset process --- modules/local/create_btk_dataset.nf | 30 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index bd3d550f..16cef4ac 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -24,19 +24,25 @@ process CREATE_BTK_DATASET { """ mkdir -p btk_datasets/ + ls -lh - create_btk_dataset.py \\ - ${reference} \\ - btk_datasets/ \\ - ./1/ \\ - ${meta.id} \\ - ${meta.sci_name} \\ - ${meta.taxid} \\ - ${nt_blast} \\ - UNIPROT_HITS \\ - DIAMOND_HITS \\ - ${mapped_bam} \\ - ${ncbi_taxdump} + create_btk_dataset.py \\ + ${reference} \\ + btk_datasets/ \\ + ./1/ \\ + ${meta.id} \\ + ${meta.sci_name} \\ + ${meta.taxid} \\ + ${nt_blast} \\ + UNIPROT_HITS \\ + DIAMOND_HITS \\ + ${mapped_bam} \\ + ${ncbi_taxdump} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + 
create_btk_dataset: \$(general_purpose_functions.py --version | cut -d' ' -f2) + END_VERSIONS """ } \ No newline at end of file From b3fc8f20174e1dfe85c794a4f66bb46a145eeb31 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Apr 2024 15:21:03 +0100 Subject: [PATCH 004/117] Adding Diamond Blast subworkflow --- modules.json | 90 ++++++++++++++++++------- modules/local/create_btk_dataset.nf | 3 +- modules/local/format_diamond_outfmt6.nf | 6 +- subworkflows/local/run_diamond.nf | 54 +++++++++++++++ workflows/ascc.nf | 20 +++++- 5 files changed, 143 insertions(+), 30 deletions(-) create mode 100644 subworkflows/local/run_diamond.nf diff --git a/modules.json b/modules.json index f8c9ac15..98d5341b 100644 --- a/modules.json +++ b/modules.json @@ -8,118 +8,162 @@ "blast/blastn": { "branch": "master", "git_sha": "acacb4075ef46fa74630aa3f4b0684f1021d5930", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" }, "blast/makeblastdb": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "custom/getchromsizes": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff" }, "diamond/blastx": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fcs/fcsadaptor": { "branch": "master", "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", - 
"installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fcs/fcsgx": { "branch": "master", "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gnu/sort": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "kraken2/kraken2": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff" }, "minimap2/align": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/index": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "ncbitools/vecscreen": { "branch": "master", "git_sha": "1e4ac4aa2c612f9547f79f02ef7c651ccc9f657b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/depth": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/faidx": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/index": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/merge": { "branch": "master", "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/sort": { "branch": "master", "git_sha": 
"8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "seqkit/sliding": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tiara/tiara": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 16cef4ac..17154ae3 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -13,6 +13,7 @@ process CREATE_BTK_DATASET { path kraken_class, stageAs: "?/KRAKEN_CLASSIFIED.txt" path kraken_report, stageAs: "?/KRAKEN_REPORT.txt" path kraken_lineage, stageAs: "?/KRAKEN_LINEAGE.txt" + path diamond_outfmt6, stageAs: "?/DIAMOND_OUTFMT6.tsv" path ncbi_taxdump when: @@ -35,7 +36,7 @@ process CREATE_BTK_DATASET { ${meta.taxid} \\ ${nt_blast} \\ UNIPROT_HITS \\ - DIAMOND_HITS \\ + ${diamond_outfmt6} \\ ${mapped_bam} \\ ${ncbi_taxdump} diff --git a/modules/local/format_diamond_outfmt6.nf b/modules/local/format_diamond_outfmt6.nf index e2acf1ca..0d916f21 100644 --- a/modules/local/format_diamond_outfmt6.nf +++ b/modules/local/format_diamond_outfmt6.nf @@ -1,4 +1,4 @@ -process REFORMAT_FULL_OUTFMT6 { +process REFORMAT_DIAMOND_OUTFMT6 { tag "${meta.id}" label 'process_low' @@ -11,8 +11,8 @@ process REFORMAT_FULL_OUTFMT6 { tuple val(meta), path(diamond_blast) output: - tuple val(meta), path( "*_diamond_outfmt6.tsv" ) , emit: full - path "versions.yml" , emit: versions + tuple val(meta), path( "*_diamond_outfmt6.tsv" ) , emit: full + path "versions.yml" , emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" 
diff --git a/subworkflows/local/run_diamond.nf b/subworkflows/local/run_diamond.nf new file mode 100644 index 00000000..1013df8a --- /dev/null +++ b/subworkflows/local/run_diamond.nf @@ -0,0 +1,54 @@ +include { SEQKIT_SLIDING } from '../../modules/nf-core/seqkit/sliding/main' +include { DIAMOND_BLASTX } from '../../modules/nf-core/diamond/blastx/main' +include { BLAST_CHUNK_TO_FULL as DIAMOND_BLAST_CHUNK_TO_FULL } from '../../modules/local/blast_chunk_to_full' +include { REFORMAT_DIAMOND_OUTFMT6 } from '../../modules/local/format_diamond_outfmt6' + +workflow RUN_DIAMOND { + take: + reference_tuple // tuple [[meta.id, meta.sliding, meta.window], reference] + diamond_db // val (path) + + main: + ch_versions = Channel.empty() + ch_ext = Channel.of(6) + ch_columns = Channel.of("qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sscinames sskingdoms sphylums salltitles") + + // + // MODULE: CREATE SLIDING WINDOW OF THE INPUT ASSEMBLY + // + SEQKIT_SLIDING ( + reference_tuple + ) + ch_versions = ch_versions.mix(SEQKIT_SLIDING.out.versions) + + // + // MODULE: BLAST THE SLIDING WINDOW FASTA AGAINST THE DIAMOND DB + // + DIAMOND_BLASTX ( + SEQKIT_SLIDING.out.fastx, + diamond_db, + ch_ext, + ch_columns + ) + ch_versions = ch_versions.mix(DIAMOND_BLASTX.out.versions) + + // + // MODULE: COMBINE THE CHUNKS INTO THE FULL GENOME + // + DIAMOND_BLAST_CHUNK_TO_FULL ( + DIAMOND_BLASTX.out.txt + ) + ch_versions = ch_versions.mix(DIAMOND_BLAST_CHUNK_TO_FULL.out.versions) + + // + // MODULE: + // + REFORMAT_DIAMOND_OUTFMT6 ( + DIAMOND_BLAST_CHUNK_TO_FULL.out.full + ) + ch_versions = ch_versions.mix(REFORMAT_DIAMOND_OUTFMT6.out.versions) + + emit: + reformed = REFORMAT_DIAMOND_OUTFMT6.out.full + versions = ch_versions.ifEmpty(null) +} \ No newline at end of file diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 3119d867..ae681179 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -37,6 +37,7 @@ include { RUN_READ_COVERAGE } from 
'../subworkflows/ include { RUN_VECSCREEN } from '../subworkflows/local/run_vecscreen' include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' +include { RUN_DIAMOND } from '../subworkflows/local/run_diamond.nf' // // MODULE: Local modules @@ -324,7 +325,7 @@ workflow ASCC { YAML_INPUT.out.nt_kraken_db_path, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_kraken1 = RUN_NT_KRAKEN.out.classified.map{it[1]} + ch_kraken1 = RUN_NT_KRAKEN.out.classifiedreformed ch_kraken2 = RUN_NT_KRAKEN.out.report.map{it[1]} ch_kraken3 = RUN_NT_KRAKEN.out.lineage @@ -335,12 +336,24 @@ workflow ASCC { ch_kraken3 = [] } + // + // SUBWORKFLOW: DIAMOND BLAST FOR INPUT ASSEMBLY + // + if ( workflow_steps.contains('diamond') || workflow_steps.contains('ALL') ) { + RUN_DIAMOND ( + modified_input, + YAML_INPUT.out.diamond_nr_database_path + ) + reform_out6tsv = RUN_DIAMOND.out.reformed.map{it[1]} + ch_versions = ch_versions.mix(RUN_DIAMOND.out.versions) + } else { + reform_out6tsv = [] + } + // mix the outputs of the outpuutting process so that we can // insert them into the one process to create the btk and the merged report // much like the versions channel - GENERATE_GENOME.out.reference_tuple.view() - CREATE_BTK_DATASET ( GENERATE_GENOME.out.reference_tuple, GENERATE_GENOME.out.dot_genome.map{it[1]}, @@ -353,6 +366,7 @@ workflow ASCC { ch_kraken1, ch_kraken2, ch_kraken3, + reform_out6tsv, YAML_INPUT.out.ncbi_taxonomy_path, ) From 992fe3f8523a47ae766a0d4710645027655b273d Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Apr 2024 15:55:48 +0100 Subject: [PATCH 005/117] Added the UNIPROT diamond blast subworkflow --- conf/modules.config | 4 ++++ modules/local/blast_chunk_to_full.nf | 2 +- modules/local/create_btk_dataset.nf | 9 ++++---- subworkflows/local/extract_nt_blast.nf | 1 + subworkflows/local/run_diamond.nf | 3 ++- workflows/ascc.nf | 30 
++++++++++++++++++++------ 6 files changed, 36 insertions(+), 13 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0820f4a9..fb4f9759 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -36,6 +36,10 @@ process { ext.dbprefix = '*' } + withName: DIAMOND_BLASTX { + ext.args = { "--sensitive --max-target-seqs 3 --evalue 1e-25 --no-unlink --tmpdir ./" } + } + withName: '.*:EXTRACT_NT_BLAST:BLAST_BLASTN_MOD' { ext.args = { '-outfmt "6 qseqid staxids bitscore std" -max_target_seqs 10 -max_hsps 1 -evalue 1e-25 -dust yes -lcase_masking' } ext.dbprefix = { "${meta2.id}" } diff --git a/modules/local/blast_chunk_to_full.nf b/modules/local/blast_chunk_to_full.nf index 6fdb04f2..318dd5fc 100644 --- a/modules/local/blast_chunk_to_full.nf +++ b/modules/local/blast_chunk_to_full.nf @@ -12,7 +12,7 @@ process BLAST_CHUNK_TO_FULL { output: tuple val(meta), path( "*.tsv" ) , emit: full - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 17154ae3..2b19d391 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -6,14 +6,15 @@ process CREATE_BTK_DATASET { path dot_genome, stageAs: "?/SORTED.genome" path kmers, stageAs: "?/KMERS_dim_reduction_embeddings_combined.csv" path tiara, stageAs: "?/TIARA.txt" - path nt_blast, stageAs: "?/BLAST_with_LINEAGE.csv" + path nt_blast, stageAs: "?/BLAST_HITS.tsv" path fcsgx, stageAs: "?/FCSGX_parsed.csv" path mapped_bam, stageAs: "?/MAPPED.bam" path coverage, stageAs: "?/COVERAGE_AVERAGE.txt" path kraken_class, stageAs: "?/KRAKEN_CLASSIFIED.txt" path kraken_report, stageAs: "?/KRAKEN_REPORT.txt" path kraken_lineage, stageAs: "?/KRAKEN_LINEAGE.txt" - path diamond_outfmt6, stageAs: "?/DIAMOND_OUTFMT6.tsv" + path nt_diamond, stageAs: "?/NUCLEOT_DIAMOND_FULL.tsv" + path un_diamond, stageAs: "?/UNIPROT_DIAMOND_FULL.tsv" path 
ncbi_taxdump when: @@ -35,8 +36,8 @@ process CREATE_BTK_DATASET { ${meta.sci_name} \\ ${meta.taxid} \\ ${nt_blast} \\ - UNIPROT_HITS \\ - ${diamond_outfmt6} \\ + ${nt_diamond} \\ + ${un_diamond} \\ ${mapped_bam} \\ ${ncbi_taxdump} diff --git a/subworkflows/local/extract_nt_blast.nf b/subworkflows/local/extract_nt_blast.nf index 26e5ee9f..c1956b1a 100644 --- a/subworkflows/local/extract_nt_blast.nf +++ b/subworkflows/local/extract_nt_blast.nf @@ -144,6 +144,7 @@ workflow EXTRACT_NT_BLAST { emit: ch_top_lineages = GET_LINEAGE_FOR_TOP.out.full + ch_blast_hits = BLAST_CHUNK_TO_FULL.out.full versions = ch_versions.ifEmpty(null) } diff --git a/subworkflows/local/run_diamond.nf b/subworkflows/local/run_diamond.nf index 1013df8a..13ae533c 100644 --- a/subworkflows/local/run_diamond.nf +++ b/subworkflows/local/run_diamond.nf @@ -41,7 +41,7 @@ workflow RUN_DIAMOND { ch_versions = ch_versions.mix(DIAMOND_BLAST_CHUNK_TO_FULL.out.versions) // - // MODULE: + // MODULE: REFORMAT THE DIAMOND OUTPUT // REFORMAT_DIAMOND_OUTFMT6 ( DIAMOND_BLAST_CHUNK_TO_FULL.out.full @@ -49,6 +49,7 @@ workflow RUN_DIAMOND { ch_versions = ch_versions.mix(REFORMAT_DIAMOND_OUTFMT6.out.versions) emit: + full = DIAMOND_BLAST_CHUNK_TO_FULL.out.full reformed = REFORMAT_DIAMOND_OUTFMT6.out.full versions = ch_versions.ifEmpty(null) } \ No newline at end of file diff --git a/workflows/ascc.nf b/workflows/ascc.nf index ae681179..6bf07d15 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -37,7 +37,8 @@ include { RUN_READ_COVERAGE } from '../subworkflows/ include { RUN_VECSCREEN } from '../subworkflows/local/run_vecscreen' include { ORGANELLAR_BLAST as PLASTID_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' include { ORGANELLAR_BLAST as MITO_ORGANELLAR_BLAST } from '../subworkflows/local/organellar_blast' -include { RUN_DIAMOND } from '../subworkflows/local/run_diamond.nf' +include { RUN_DIAMOND as NUCLEOT_DIAMOND } from '../subworkflows/local/run_diamond.nf' +include { RUN_DIAMOND as 
UNIPROT_DIAMOND } from '../subworkflows/local/run_diamond.nf' // // MODULE: Local modules @@ -166,7 +167,7 @@ workflow ASCC { YAML_INPUT.out.ncbi_rankedlineage_path ) ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) - ch_nt_blast = EXTRACT_NT_BLAST.out.ch_top_lineages.map{it[1]} + ch_nt_blast = EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} } else { ch_nt_blast = [] } @@ -339,15 +340,29 @@ workflow ASCC { // // SUBWORKFLOW: DIAMOND BLAST FOR INPUT ASSEMBLY // - if ( workflow_steps.contains('diamond') || workflow_steps.contains('ALL') ) { - RUN_DIAMOND ( + if ( workflow_steps.contains('nt_diamond') || workflow_steps.contains('ALL') ) { + NUCLEOT_DIAMOND ( modified_input, YAML_INPUT.out.diamond_nr_database_path ) - reform_out6tsv = RUN_DIAMOND.out.reformed.map{it[1]} + nt_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]} ch_versions = ch_versions.mix(RUN_DIAMOND.out.versions) } else { - reform_out6tsv = [] + nt_full = [] + } + + // + // SUBWORKFLOW: DIAMOND BLAST FOR INPUT ASSEMBLY + // + if ( workflow_steps.contains('uniprot_diamond') || workflow_steps.contains('ALL') ) { + UNIPROT_DIAMOND ( + modified_input, + YAML_INPUT.out.diamond_uniprot_database_path + ) + un_full = UNIPROT_DIAMOND.out.reformed.map{it[1]} + ch_versions = ch_versions.mix(RUN_DIAMOND.out.versions) + } else { + un_full = [] } // mix the outputs of the outpuutting process so that we can @@ -366,7 +381,8 @@ workflow ASCC { ch_kraken1, ch_kraken2, ch_kraken3, - reform_out6tsv, + nt_full, + un_full, YAML_INPUT.out.ncbi_taxonomy_path, ) From 5d28b475a5f35737e94fc525860b23d38a011e53 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Apr 2024 16:04:14 +0100 Subject: [PATCH 006/117] Typo --- conf/base.config | 6 ++++++ workflows/ascc.nf | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/conf/base.config b/conf/base.config index e0e853b0..4a214534 100644 --- a/conf/base.config +++ b/conf/base.config @@ -20,6 +20,12 @@ process { time = { check_max( 12.h * task.attempt, 'time' 
) } } + withName: DIAMOND_BLASTX { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 50.GB * task.attempt, 'memory' ) } + time = { check_max( 12.h * task.attempt, 'time' ) } + } + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } maxRetries = 1 diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 6bf07d15..c86e9683 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -346,7 +346,7 @@ workflow ASCC { YAML_INPUT.out.diamond_nr_database_path ) nt_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]} - ch_versions = ch_versions.mix(RUN_DIAMOND.out.versions) + ch_versions = ch_versions.mix(NUCLEOT_DIAMOND.out.versions) } else { nt_full = [] } @@ -360,7 +360,7 @@ workflow ASCC { YAML_INPUT.out.diamond_uniprot_database_path ) un_full = UNIPROT_DIAMOND.out.reformed.map{it[1]} - ch_versions = ch_versions.mix(RUN_DIAMOND.out.versions) + ch_versions = ch_versions.mix(UNIPROT_DIAMOND.out.versions) } else { un_full = [] } From 951c248725a67a0bb6fb1934ae8768554d48b2e7 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 8 Apr 2024 17:27:33 +0100 Subject: [PATCH 007/117] additions to scripts and a rewrite --- bin/ascc_merge_tables.py | 263 +++++++++++++++++++++ bin/create_btk_dataset.py | 8 +- bin/create_btk_dataset_V2.py | 301 +++++++++++++++++++++++++ bin/merge_btk_datasets.py | 107 +++++++++ bin/remove_fcs_gx_and_tiara_contams.py | 186 +++++++++++++++ bin/sam_to_sorted_indexed_bam.py | 0 6 files changed, 861 insertions(+), 4 deletions(-) create mode 100755 bin/ascc_merge_tables.py mode change 100644 => 100755 bin/create_btk_dataset.py create mode 100755 bin/create_btk_dataset_V2.py create mode 100755 bin/merge_btk_datasets.py create mode 100755 bin/remove_fcs_gx_and_tiara_contams.py mode change 100644 => 100755 bin/sam_to_sorted_indexed_bam.py diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py new file mode 100755 index 00000000..307812ff --- /dev/null +++ b/bin/ascc_merge_tables.py @@ -0,0 +1,263 @@ 
+#!/usr/bin/env python3 +""" +Script for merging contaminant check results into one table +""" + +import argparse +import pandas as pd +import os +import sys +import general_purpose_functions as gpf + + +def check_paths(paths_dict, required_files): + """ + Checks if a required file exists and exits with an error message if it doesn't + """ + out_dict = dict() + for data_type, input_file_path in paths_dict.items(): + out_dict[data_type] = None + if os.path.isfile(input_file_path) == False: + if data_type in required_files: + sys.stderr.write("Input file {} was not found\n".format(input_file_path)) + sys.exit(1) + else: + if os.stat(input_file_path).st_size == 0: + sys.stderr.write("Warning: the file {} is empty and will therefore not be included in the final results\n".format(input_file_path)) + else: + out_dict[data_type] = input_file_path + return out_dict + + +def load_and_merge_dataframes(paths_dict): + """ + Loads the tables with individual variables (GC content, coverage, kmer counts etc) and combines them into one table + """ + gc_path = paths_dict["gc_content"] + df = pd.read_csv(gc_path, sep="\t", header=None) + if df.shape[0] > 0: + df.columns = ["scaff", "gc"] + df["gc"] = df["gc"] * 100 + else: + sys.stderr.write("No rows were found in the GC content table ({})\n".format(gc_path)) + sys.exit(1) + + coverage_df = None + coverage_file_path = paths_dict["coverage"] + if coverage_file_path is not None: + if os.stat(coverage_file_path).st_size > 0: + coverage_df = pd.read_csv(coverage_file_path, sep=",", header=None) + if coverage_df.shape[0] > 0: + coverage_df.columns = ["scaff", "coverage"] + else: + sys.stderr.write("No rows were found in the coverages table ({})\n".format(coverage_file_path)) + coverage_df = None + else: + sys.stderr.write("Warning: the output file for PacBio coverage ({}) is empty\n".format(coverage_file_path)) + + tiara_df = None + if paths_dict["tiara"] is not None: + tiara_df = pd.read_csv(paths_dict["tiara"], sep="\t") + if 
tiara_df.shape[0] > 0: + tiara_df["tiara_classif"] = tiara_df["class_fst_stage"] + tiara_snd_stage_hits = tiara_df.index[tiara_df["class_snd_stage"].notnull()] + tiara_df["tiara_classif"][tiara_snd_stage_hits] = tiara_df["class_snd_stage"][tiara_snd_stage_hits] + tiara_df = tiara_df.iloc[:,[0, 3]] + tiara_df.columns = ["scaff", "tiara_classif"] + else: + sys.stderr.write("No rows were found in Tiara output table ({})\n".format(paths_dict["tiara"])) + tiara_df = None + + bacterial_kraken_df = None + if paths_dict["bacterial_kraken"] is not None: + bacterial_kraken_df = pd.read_csv(paths_dict["bacterial_kraken"], sep=",") + if bacterial_kraken_df.shape[0] > 0: + bacterial_kraken_df.rename(columns={bacterial_kraken_df.columns[0]: "scaff"}, inplace=True) + bacterial_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write("No rows were found in bacterial Kraken output table ({})\n".format(paths_dict["bacterial_kraken"])) + bacterial_kraken_df = None + + nt_kraken_df = None + if paths_dict["nt_kraken"] is not None: + nt_kraken_df = pd.read_csv(paths_dict["nt_kraken"], sep=",") + if nt_kraken_df.shape[0] > 0: + nt_kraken_df.rename(columns={nt_kraken_df.columns[0]: "scaff"}, inplace=True) + nt_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write("No rows were found in nt Kraken output table ({})\n".format(paths_dict["nt_kraken"])) + nt_kraken_df = None + + dim_reduction_df = None + if paths_dict["dim_reduction_embeddings"] is not None: + dim_reduction_df = pd.read_csv(paths_dict["dim_reduction_embeddings"], sep=",") + if dim_reduction_df.shape[0] == 0: + sys.stderr.write("No rows were found in kmers dimensionality reduction output table ({})\n".format(paths_dict["dim_reduction_embeddings"])) + dim_reduction_df = None + + btk_df = None + + if paths_dict["blobtoolkit"] is not None: + btk_df = pd.read_csv(paths_dict["blobtoolkit"], header=0, delimiter="\t") + if btk_df.shape[0] == 0: + 
sys.stderr.write("No rows were found in the BlobToolKit results table ({})\n".format(paths_dict["blobtoolkit"])) + sys.exit(1) + btk_renaming_dict = {"identifiers": "scaff", "bestsum_phylum": "btk_bestsum_phylum"} + if "mapped_hifi_reads_sorted_cov" in btk_df.columns: + btk_renaming_dict["mapped_hifi_reads_sorted_cov"] = "btk_cov" + if "bestsum_phylum" in btk_df.columns: + btk_renaming_dict["bestsum_phylum"] = "btk_bestsum_phylum" + #{"identifiers": "scaff", "mapped_hifi_reads_sorted_cov": "btk_cov", "bestsum_phylum": "btk_bestsum_phylum"} + + btk_df.rename(columns = btk_renaming_dict, inplace=True) + + btk_selected_cols = [col for col in btk_df.columns if col in ["scaff", "length", "btk_cov", "btk_bestsum_phylum"]] + if len(btk_selected_cols) > 0: + btk_df = btk_df[btk_selected_cols] + else: + btk_df = None + + btk_busco_df = None + + if paths_dict["btk_busco"] is not None: + btk_busco_df = pd.read_csv(paths_dict["btk_busco"], header=0, delimiter="\t") + if btk_busco_df.shape[0] == 0: + sys.stderr.write("No rows were found in the BUSCO-based BlobToolKit results table ({})\n".format(paths_dict["btk_busco"])) + sys.exit(1) + btk_busco_renaming_dict = {"identifiers": "scaff"} + #if "mapped_hifi_reads_sorted_cov" in btk_df.columns: + # btk_renaming_dict["mapped_hifi_reads_sorted_cov"] = "btk_cov" + #if "bestsum_phylum" in btk_df.columns: + # btk_renaming_dict["bestsum_phylum"] = "btk_bestsum_phylum" + #{"identifiers": "scaff", "mapped_hifi_reads_sorted_cov": "btk_cov", "bestsum_phylum": "btk_bestsum_phylum"} + + btk_busco_df.rename(columns = btk_busco_renaming_dict, inplace=True) + + btk_busco_selected_cols = [col for col in btk_busco_df.columns if col in ["scaff", "buscogenes_superkingdom", "buscogenes_kingdom", "buscogenes_phylum", "buscogenes_class", "buscogenes_order", "buscogenes_family", "buscogenes_genus", "buscogenes_species", "buscoregions_superkingdom", "buscoregions_kingdom", "buscoregions_phylum", "buscoregions_class", "buscoregions_order", 
"buscoregions_family", "buscoregions_genus", "buscoregions_species"]] + if len(btk_busco_selected_cols) > 0: + btk_busco_df = btk_busco_df[btk_busco_selected_cols] + else: + btk_busco_df = None + + #df = pd.merge(main_df, btk_df, on="scaff", how="outer") + + + #if paths_dict["blobtoolkit"] is not None: + # #if 'A' in df.columns: + # blobtoolkit_df = pd.read_csv(paths_dict["blobtoolkit"], header=0, delimiter="\t") + # if blobtoolkit_df.shape[0] > 0: + # blobtoolkit_df = blobtoolkit_df[["identifiers", "bestsum_phylum"]] + # blobtoolkit_df.columns = ["scaff", "btk_bestsum_phylum"] + # else: + # sys.stderr.write("No rows were found in BlobToolKit output table ({})\n".format(paths_dict["blobtoolkit"])) + # blobtoolkit_df = None + + fcs_gx_df = None + if paths_dict["fcs_gx"] is not None: + fcs_gx_df = pd.read_csv(paths_dict["fcs_gx"], sep=",") + if fcs_gx_df.shape[0] == 0: + sys.stderr.write("No rows were found in FCS-GX output table ({})\n".format(paths_dict["fcs_gx"])) + fcs_gx_df = None + + nt_blast_df = None + if paths_dict["nt_blast"] is not None: + nt_blast_df = pd.read_csv(paths_dict["nt_blast"], sep=",") + if nt_blast_df.shape[0] == 0: + sys.stderr.write("No rows were found in nt BLAST output table ({})\n".format(paths_dict["nt_blast"])) + nt_blast_df = None + nr_diamond_df = None + if paths_dict["nr_diamond"] is not None: + nr_diamond_df = pd.read_csv(paths_dict["nr_diamond"], sep=",") + if nr_diamond_df.shape[0] == 0: + sys.stderr.write("No rows were found in nr Diamond output table ({})\n".format(paths_dict["nr_diamond"])) + nr_diamond_df = None + uniprot_diamond_df = None + if paths_dict["uniprot_diamond"] is not None: + uniprot_diamond_df = pd.read_csv(paths_dict["uniprot_diamond"], sep=",") + if uniprot_diamond_df.shape[0] == 0: + sys.stderr.write("No rows were found in Uniprot Diamond output table ({})\n".format(paths_dict["uniprot_diamond"])) + uniprot_diamond_df = None + cobiontid_markerscan_df = None + if paths_dict["cobiontid_markerscan"] is not None: 
+ cobiontid_markerscan_df = pd.read_csv(paths_dict["cobiontid_markerscan"], sep=",") + if cobiontid_markerscan_df.shape[0] == 0: + sys.stderr.write("No rows were found in CobiontID MarkerScan output table ({})\n".format(paths_dict["cobiontid_markerscan"])) + uniprot_diamond_df = None + contigviz_df = None + if paths_dict["contigviz"] is not None: + contigviz_df = pd.read_csv(paths_dict["contigviz"], sep=",") + if contigviz_df.shape[0] == 0: + sys.stderr.write("No rows were found in ContigViz output table ({})\n".format(paths_dict["contigviz"])) + contigviz_df = None + + + if coverage_df is not None: + df = pd.merge(df, coverage_df, on="scaff", how="outer") + if tiara_df is not None: + df = pd.merge(df, tiara_df, on="scaff", how="outer") + if bacterial_kraken_df is not None: + df = pd.merge(df, bacterial_kraken_df, on="scaff", how="outer") + if nt_kraken_df is not None: + df = pd.merge(df, nt_kraken_df, on="scaff", how="outer") + if dim_reduction_df is not None: + df = pd.merge(df, dim_reduction_df, on="scaff", how="outer") + if nt_blast_df is not None: + df = pd.merge(df, nt_blast_df, on="scaff", how="outer") + + if nr_diamond_df is not None: + df = pd.merge(df, nr_diamond_df, on="scaff", how="outer") + if uniprot_diamond_df is not None: + df = pd.merge(df, uniprot_diamond_df, on="scaff", how="outer") + + if fcs_gx_df is not None: + df = pd.merge(df, fcs_gx_df, on="scaff", how="outer") + + if cobiontid_markerscan_df is not None: + df = pd.merge(df, cobiontid_markerscan_df, on="scaff", how="outer") + if contigviz_df is not None: + df = pd.merge(df, contigviz_df, on="scaff", how="outer") + if btk_df is not None: + df = pd.merge(df, btk_df, on="scaff", how="outer") + if btk_busco_df is not None: + df = pd.merge(df, btk_busco_df, on="scaff", how="outer") + + return df + + +def main(data_folder, out_path, sample_name): + paths_dict = dict() + paths_dict["gc_content"] = "{}/gc.txt".format(data_folder) + paths_dict["coverage"] = 
"{}/pacbio_reads_coverage.txt".format(data_folder) + paths_dict["tiara"] = "{}/tiara_out.txt".format(data_folder) + paths_dict["bacterial_kraken"] = "{}/bacterial_kraken_lineage.txt".format(data_folder) + paths_dict["nt_kraken"] = "{}/nt_kraken_lineage.txt".format(data_folder) + paths_dict["nt_blast"] = "{}/BLAST_results_with_lineage.csv".format(data_folder) + paths_dict["dim_reduction_embeddings"] = "{}/kmers_dim_reduction_embeddings.csv".format(data_folder) + paths_dict["nr_diamond"] = "{}/nr_diamond_blastx_top_hits.csv".format(data_folder) + paths_dict["uniprot_diamond"] = "{}/uniprot_diamond_blastx_top_hits.csv".format(data_folder) + paths_dict["cobiontid_markerscan"] = "{}/cobiontid_markerscan.csv".format(data_folder) + paths_dict["contigviz"] = "{}/contigviz_results.csv".format(data_folder) + paths_dict["blobtoolkit"] = "{}/btk_summary_table_full.tsv".format(data_folder) + paths_dict["btk_busco"] = "{}/btk_busco_summary_table_full.tsv".format(data_folder) + paths_dict["fcs_gx"] = "{}/fcs-gx_summary.csv".format(data_folder) + + required_files = ["gc_content"] + + paths_dict = check_paths(paths_dict, required_files) + df = load_and_merge_dataframes(paths_dict) + df.to_csv(out_path, index=False) + + if paths_dict["nt_blast"] is not None and paths_dict["nr_diamond"] is not None and paths_dict["uniprot_diamond"] is not None and paths_dict["coverage"] is not None and paths_dict["tiara"] is not None and paths_dict["nt_kraken"] is not None: + process_results_tables_command = "process_result_tables.py {} {}".format(data_folder, sample_name) + gpf.run_system_command(process_results_tables_command) + else: + sys.stderr.write("Skipping generating the {}_phylum_counts_and_coverage.csv file, as the variables used in this run do not include all the required variables for this (nt_blast, nr_diamond, uniprot_diamond, coverage, tiara, nt_kraken)\n".format(sample_name)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + 
parser.add_argument("data_folder", type=str, help="Path to folder with ASG contamination check result files of individual steps") + parser.add_argument("out_path", type=str, help="Path for output CSV file") + parser.add_argument("--sample_name", type=str, help="Sample name (e.g. ToLID)", default="unnamed") + args = parser.parse_args() + main(args.data_folder, args.out_path, args.sample_name) \ No newline at end of file diff --git a/bin/create_btk_dataset.py b/bin/create_btk_dataset.py old mode 100644 new mode 100755 index 1447e5b5..53b95455 --- a/bin/create_btk_dataset.py +++ b/bin/create_btk_dataset.py @@ -166,15 +166,15 @@ def main(assembly_fasta_path, dataset_folder, pipeline_run_folder, assembly_titl parser.add_argument("dataset_folder", type=str, help="Path for dataset folder") parser.add_argument("pipeline_run_folder", type=str, help="Folder where this pipeline is run pipeline") parser.add_argument("assembly_title", type=str, help="Assembly title") - parser.add_argument("taxon_name", type=str, help="Taxon name") - parser.add_argument("taxid", type=int, help="taxid") + parser.add_argument("-tn", "--taxon_name", type=str, help="Taxon name") + parser.add_argument("-ti", "--taxid", type=int, help="taxid") parser.add_argument("blastn_hits_path", type=str, help="Path to blastn hits file") parser.add_argument("uniprot_diamond_hits_path", type=str, help="Path to UNIPROT Diamond BLASTX hits file") parser.add_argument("nr_diamond_hits_path", type=str, help="Path to nr Diamond BLASTX hits file") parser.add_argument("mapped_reads_path", type=str, help="Path to the BAM file with mapped reads for coverage estimation") - parser.add_argument("taxdump_path", type=str, help="Path to the directory with NCBI taxdump files") + parser.add_argument("-td", "--taxdump_path", type=str, help="Path to the directory with NCBI taxdump files") parser.add_argument("--threads", type=int, default=1, help="Number of CPU threads (default: 1)") parser.add_argument("--assembly_alias", type=str, 
default="", help="Assembly alias") parser.add_argument("--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)") args = parser.parse_args() - main(args.assembly_fasta_path, args.dataset_folder, args.pipeline_run_folder, args.assembly_title, args.taxon_name, args.taxid, args.blastn_hits_path, args.uniprot_diamond_hits_path, args.nr_diamond_hits_path, args.mapped_reads_path, args.taxdump_path, args.threads, args.assembly_alias,args.dry_run) + main(args.assembly_fasta_path, args.dataset_folder, args.pipeline_run_folder, args.assembly_title, args.taxon_name, args.taxid, args.blastn_hits_path, args.uniprot_diamond_hits_path, args.nr_diamond_hits_path, args.mapped_reads_path, args.taxdump_path, args.threads, args.assembly_alias,args.dry_run) \ No newline at end of file diff --git a/bin/create_btk_dataset_V2.py b/bin/create_btk_dataset_V2.py new file mode 100755 index 00000000..e4185956 --- /dev/null +++ b/bin/create_btk_dataset_V2.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python3 + +VERSION = "2.0.0" +DESCRIPTION = f""" +--- +Script for creating a BlobToolKit dataset from the ASCC output files +Version: {VERSION} +--- + +Written by Eerik Aunin (ea10) + +Modified by Damon-Lee Pointon (@dp24/@DLBPointon) + +""" + +import general_purpose_functions as gpf +import argparse +from pathlib import Path +import textwrap +import sys +import os.path + + +def parse_args(argv=None): + parser = argparse.ArgumentParser( + prog="createBTKdatasets", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION) + + ) + parser.add_argument( + "-n", + "--name", + required=True, + type=str, + help="Assembly name (for the output files)" + ) + parser.add_argument( + "-tn", + "--taxon_name", + required=True, + type=str, + help="The Taxon name of the assembly (Scientific name of the species + subspecies if applicable)" + ) + parser.add_argument( + "-id", + "--taxid", + required=True, + type=int, + help="Taxon ID of the 
assembly" + ) + parser.add_argument( + "-td", + "--taxdump", + required=True, + type=str, + help="Path to the directory containing the NCBI taxdump" + ) + parser.add_argument( + "-f", + "--fasta", + required=True, + type=str, + help="The path for the assembly fasta file" + ) + parser.add_argument( + "-d", + "--dataset", + type=str, + required=True, + help="The folder containing the data generated throughout the pipeline" + ) + parser.add_argument( + "-bh", + "--blastn_hits", + default="N", + type=str, + help="Path to the BLASTN hits file" + ) + parser.add_argument( + "-ud", + "--uniprot_diamond_hits", + default="N", + type=str, + help="Path to the UNIPROT diamond BlastX hits file" + ) + parser.add_argument( + "-nr", + "--nr_diamond_hits", + default="N", + type=str, + help="Path to the DIAMOND BlastX hits file" + ) + parser.add_argument( + "-r", + "--mapped_reads", + default="N", + type=str, + help="Path to mapped reads BAM for coverage estimation" + ) + parser.add_argument( + "-t", + "--tiara", + default="N", + type=str, + help="Path to the tiara_out.txt file" + ) + parser.add_argument( + "-p", + "--pca", + default="N", + type=str, + help="Path to the kmers_dim_reduction_embeddings.csv file" + ) + parser.add_argument( + "-fc", + "--fcs_gx", + default="N", + type=str, + help="Path to the fcs-gx_summary.csv.csv file" + ) + parser.add_argument( + "-k", + "--kraken", + default="N", + type=str, + help="Path to the nt_kraken_lineage.txt file" + ) + parser.add_argument( + "-ms", + "--markerscan", + default="N", + type=str, + help="Path to the cobiontid_markerscan.csv file" + ) + parser.add_argument( + "-cv", + "--contigviz", + default="N", + type=str, + help="Path to the contigviz_results.csv file" + ) + parser.add_argument( + "-o", + "--output", + default="btk_datasets", + type=str, + help="Output directory" + ) + parser.add_argument( + "--threads", + type=int, + default=1, + help="Number of threads to utilise" + ) + parser.add_argument( + "--alias", + type=str, + 
default="", + help="Assembly alias" + ) + parser.add_argument( + "--dry_run", + dest="dry_run", + action="store_true", + help="Dry run (print commands without executing)" + ) + parser.add_argument("-v", "--version", action="version", version=VERSION) + + return parser.parse_args(argv) + + +def create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name): + """ + Creates the assembly YAML file for creating a BlobToolKit dataset + """ + if ".gz" in assembly_alias: + assembly_alias = assembly_alias.replace(".gz", "_gz") + out_string = "assembly:\n accession: NA\n alias: {}\n record_type: scaffold\n bioproject: NA\n biosample: NA\ntaxon:\n name: {}".format(assembly_alias, taxon_name) + with open(assembly_yaml_path, "w") as f: + f.write(out_string) + +def tiara_results_to_btk_format(tiara_results_path, outfile_path): + """ + Reformatting Tiara output file so that the summarised results of the first and second pass of Tiara can be + added to a BlobToolKit dataset + """ + tiara_data = gpf.l(tiara_results_path) + tiara_data = tiara_data[1:len(tiara_data)] + with open(outfile_path, "w") as f: + f.write("identifier\ttiara\n") + for line in tiara_data: + split_line = line.split() + if len(split_line) != 3: + sys.stderr.write("Failed to parse the Tiara results file {}\n".format(tiara_results_path)) + sys.exit(1) + first_pass_result = split_line[1] + second_pass_result = split_line[2] + if second_pass_result != "n/a": + first_pass_result = second_pass_result + f.write(split_line[0] + "\t" + first_pass_result + "\n") + + +def detect_dim_reduction_methods(kmers_dim_reduction_output_path): + """ + Parses the header of the kmers dimensionality reduction report file to detect which dimensionality reduction methods were used + """ + header_string = None + with open(kmers_dim_reduction_output_path) as f: + header_string = f.readline() + header_string = header_string.strip() + split_header = header_string.split(",") + dim_reduction_methods = list() + for header_item in 
split_header: + if header_item.startswith("embedding_"): + if header_item.startswith("embedding_x_"): + header_item = header_item.split("embedding_x_")[1] + elif header_item.startswith("embedding_y_"): + header_item = header_item.split("embedding_y_")[1] + if header_item not in dim_reduction_methods: + dim_reduction_methods.append(header_item) + return dim_reduction_methods + +def main(args): + command_list = [] + + assembly_alias = ( args.name if args.alias == "" else args.alias ) + + edited_assembly_title = args.name.replace(".", "_").replace(" ", "_") + + assembly_yaml_path = args.output + "/" + edited_assembly_title + "BTK_DS.yaml" + + if args.dry_run == False: + Path(args.dataset).mkdir(parents=True, exist_ok=True) + create_assembly_yaml(assembly_yaml_path, assembly_alias, args.taxon_name) + + # Base command for new BTK Dataset + blobtools_create_command = f"blobtools create --fasta {args.fasta} --meta {assembly_yaml_path} --taxid {args.taxid} --taxdump {args.taxdump} {args.output}" + gpf.run_system_command(blobtools_create_command, dry_run=args.dry_run) + + # ADDING BLAST HIT DATA TO BTK + hits_file_paths = [ + args.blastn_hits, + args.uniprot_diamond_hits, + args.nr_diamond_hits + ] + + hits_file = [ + n for n in hits_file_paths + if n != "N" and os.path.isfile(n) is True and os.stat(n).st_size > 0 + ] + + if len(hits_file) > 0: + add_hits_command = "blobtools add" + for file in hits_file_paths: + add_hits_command += f" --hits {file}" + add_hits_command += f" --taxrule bestsum --taxdump {args.taxdump} {args.output}" + command_list.append(add_hits_command) + + # ADDING MAPPED READS DATA TO BTK + if args.mapped_reads != "N" and os.path.isfile(args.mapped_reads) is True and os.stat(args.mapped_reads).st_size > 0: + add_cov_command = f"blobtools add --cov {args.mapped_reads} --threads {args.threads} {args.output}" + command_list.append(add_cov_command) + + # ADDING TIARA + if args.tiara != "N" and os.path.isfile(args.tiara) and os.stat(args.tiara).st_size > 0: + 
tiara_reformatted_output_path = args.dataset + "/tiara_out_btk_format.tsv" + tiara_results_to_btk_format(args.tiara, tiara_reformatted_output_path) + add_tiara_command = f"blobtools add --text {tiara_reformatted_output_path} --text-delimiter '\t' --text-cols 'identifier=identifiers,tiara=tiara' --text-header {args.output}" + command_list.append(add_tiara_command) + + # ADDING KMER DIM REDUCTION + if args.pca != "N" and os.path.isfile(args.pca) and os.stat(args.pca).st_size > 0: + used_dim_reduction_methods = detect_dim_reduction_methods(args.pca) + for dim_reduction_method in used_dim_reduction_methods: + add_embedding_command = f"blobtools add --text {args.pca} --text-delimiter ',' --text-cols scaff=identifiers,embedding_x_{dim_reduction_method}=embedding_x_{dim_reduction_method},embedding_y_{dim_reduction_method}=embedding_y_{dim_reduction_method} --text-header {args.output}" + command_list.append(add_embedding_command) + + # ADDING KRAKEN DATA + if args.kraken != "N" and os.path.isfile(args.kraken) and os.stat(args.kraken).st_size > 0: + for taxonomy_level in ("species", "genus", "family", "order", "class", "phylum", "kingdom", "domain"): + add_kraken_command = f"blobtools add --text {args.kraken} --text-delimiter ',' --text-cols scaff=identifiers,nt_kraken_{taxonomy_level}=nt_kraken_{taxonomy_level} --text-header {args.output}" + command_list.append(add_kraken_command) + + # ADDING FCS_GX DATA + if args.fcs_gx != "N" and os.path.isfile(args.fcs_gx) and os.stat(args.fcs_gx).st_size > 0: + add_fcs_gx_results_command = f"blobtools add --text {args.fcs_gx} --text-delimiter ',' --text-cols 'scaff=identifiers,fcs_gx_top_tax_name=fcs_gx_top_tax_name,fcs_gx_div=fcs_gx_div,fcs_gx_action=fcs_gx_action' --text-header {args.output}" + command_list.append(add_fcs_gx_results_command) + + + export_table_command = f"blobtools filter --table {args.dataset}/collected_tables/btk_summary_table_full.tsv {args.output}" + command_list.append(export_table_command) + + # EXECUTE ALL 
BTK COMMANDS + for i in command_list: + gpf.run_system_command(i, dry_run=args.dry_run) + +if __name__ == "__main__": + main( + parse_args() + ) \ No newline at end of file diff --git a/bin/merge_btk_datasets.py b/bin/merge_btk_datasets.py new file mode 100755 index 00000000..c3af0a1d --- /dev/null +++ b/bin/merge_btk_datasets.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +""" +Script for merging BTK datasets from the this pipeline and the BUSCO-based Snakemake BTK pipeline +""" + +import json +from pathlib import Path +import shutil +import os +import sys +import argparse +import general_purpose_functions as gpf + + +def load_json(filename): + """ Loads a JSON file and returns it as a dictionary """ + with open(filename) as f: + return json.load(f) + + +def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined_dataset_folder): + """ + Creates a meta.json file for the new BTK dataset by combining the two meta.json files from the input directories + """ + for folder in (main_btk_dataset_folder, btk_busco_dataset_folder): + if os.path.isdir(folder) is False: + sys.stderr.write(f"Skipping the merging of the main BTK dataset and the BUSCO-based BTK dataset, as directory {folder} was not found)\n") + sys.exit(0) + + main_btk_json_path = f"{main_btk_dataset_folder}/meta.json" + btk_busco_json_path = f"{btk_busco_dataset_folder}/meta.json" + for json_path in (main_btk_json_path, btk_busco_json_path): + if os.path.isfile(json_path) is False: + sys.stderr.write(f"File {json_path} not found)\n") + sys.exit(1) + main_meta_dict = load_json(main_btk_json_path) + btk_busco_meta_dict = load_json(btk_busco_json_path) + + merged_dict = btk_busco_meta_dict.copy() + + keys_to_skip = [] + fields = main_meta_dict["fields"] + for field in fields: + field_id = field["id"] + + if field_id == "taxonomy": + btk_main_taxonomy_field = field.copy() + btk_main_taxonomy_field["id"] = "btk_main_taxonomy" + btk_main_taxonomy_field["name"] = "btk_main_taxonomy" + 
merged_dict["fields"].append(btk_main_taxonomy_field) + else: + if field_id not in keys_to_skip: + merged_dict["fields"].append(field) + + + meta_json_outpath = f"{combined_dataset_folder}/meta.json" + with open(meta_json_outpath, "w") as json_outfile: + json.dump(merged_dict, json_outfile, indent=1, sort_keys=True) + + +def main(main_btk_dataset_folder, btk_busco_dataset_folder, combined_dataset_folder, pipeline_output_folder, skip_renaming_folders): + if os.path.isdir(main_btk_dataset_folder) is False: + sys.stderr.write(f"The BlobToolKit dataset ({main_btk_dataset_folder}) was not found\n") + sys.exit(1) + + if os.path.isdir(btk_busco_dataset_folder) is False: + sys.stderr.write(f"The blobdir of BUSCO-based BlobToolKit Snakemake pipeline run does not exist at {btk_busco_dataset_folder}, skipping the merging of BTK datasets\n") + sys.exit(0) + + not_copying_list = ["identifiers.json", "gc_data.json", "length_data.json", "ncount_data.json", "meta.json"] + + Path(combined_dataset_folder).mkdir(parents=True, exist_ok=True) + + main_btk_dataset_files = [f for f in os.listdir(main_btk_dataset_folder) if os.path.isfile(os.path.join(main_btk_dataset_folder, f))] + main_btk_dataset_files = [f for f in main_btk_dataset_files if f not in not_copying_list] + for main_btk_dataset_file in main_btk_dataset_files: + main_btk_dataset_file_full_path = f"{main_btk_dataset_folder}/{main_btk_dataset_file}" + copied_file_full_path = f"{combined_dataset_folder}/{main_btk_dataset_file}" + shutil.copy(main_btk_dataset_file_full_path, copied_file_full_path) + + btk_busco_files = [f for f in os.listdir(btk_busco_dataset_folder) if os.path.isfile(os.path.join(btk_busco_dataset_folder, f))] + for btk_busco_file in btk_busco_files: + btk_busco_file_full_path = f"{btk_busco_dataset_folder}/{btk_busco_file}" + copied_file_full_path = f"{combined_dataset_folder}/{btk_busco_file}" + shutil.copy(btk_busco_file_full_path, copied_file_full_path) + + create_meta_json(main_btk_dataset_folder, 
btk_busco_dataset_folder, combined_dataset_folder) + old_main_btk_dataset_folder = main_btk_dataset_folder + "_without_busco" + + if skip_renaming_folders is False: + os.rename(main_btk_dataset_folder, old_main_btk_dataset_folder) + os.rename(combined_dataset_folder, main_btk_dataset_folder) + + btk_busco_table_outpath = f"{pipeline_output_folder}/btk_busco_summary_table_full.tsv" + btk_busco_table_exporting_command = f"blobtools filter --table {btk_busco_table_outpath} --table-fields identifiers,buscogenes_superkingdom,buscogenes_kingdom,buscogenes_phylum,buscogenes_class,buscogenes_order,buscogenes_family,buscogenes_genus,buscogenes_species,buscoregions_superkingdom,buscoregions_kingdom,buscoregions_phylum,buscoregions_class,buscoregions_order,buscoregions_family,buscoregions_genus,buscoregions_species {main_btk_dataset_folder}" + gpf.run_system_command(btk_busco_table_exporting_command) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("main_btk_dataset_folder", type=str, help="Path to the BTK dataset (blobdir) created from the output of the steps of this pipeline") + parser.add_argument("btk_busco_dataset_folder", type=str, help="Path to the BTK dataset (blobdir) created by the BUSCO-based Snakemake BTK pipeline") + parser.add_argument("combined_dataset_folder", type=str, help="Path for creating a new BTK dataset (blobdir) that combines the two input BTK datasets") + parser.add_argument("pipeline_output_folder", type=str, help="Path to the directory with the output tables of the pipeline") + parser.add_argument("--skip_renaming_folders", dest="skip_renaming_folders", help="Optional boolean argument. 
If set to true, the script skips the renaming of the input BTK dataset directories after creating the merged BTK dataset", action="store_true") + args = parser.parse_args() + main(args.main_btk_dataset_folder, args.btk_busco_dataset_folder, args.combined_dataset_folder, args.pipeline_output_folder, args.skip_renaming_folders) \ No newline at end of file diff --git a/bin/remove_fcs_gx_and_tiara_contams.py b/bin/remove_fcs_gx_and_tiara_contams.py new file mode 100755 index 00000000..8a89f44f --- /dev/null +++ b/bin/remove_fcs_gx_and_tiara_contams.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +""" +Script for filtering the assembly to remove putative contaminants based on FGCS-GX and Tiara results +""" + +import general_purpose_functions as gpf +import os +import sys +import argparse +from pathlib import Path +import csv + +def get_domain_from_taxid(query_taxid, rankedlineage_path): + """ + Input: 1) a taxID, 2) path to the NCBI rankedlineage.dmp file + Output: domain classification corresponding to the taxID + """ + domain = None + query_taxid = str(query_taxid) + rankedlineage_data = gpf.ll(rankedlineage_path) + for line in rankedlineage_data: + split_line = line.split("|") + split_line = [n.strip() for n in split_line] + assert len(split_line) == 11 + taxid = split_line[0] + domain = split_line[9] + if taxid == query_taxid: + domain = split_line[9] + if domain not in ("", "Archaea", "Bacteria", "Eukaryota", "Viruses"): + sys.stderr.write(f"Unrecognised value for domain-level taxonomy: {domain}") + sys.exit(1) + break + if domain is None: + sys.stderr.write("The domain for taxid ({}) was not found in the NCBI rankedlineage.dmp file ({})\n".format(query_taxid, rankedlineage_path)) + sys.exit(1) + return domain + + +def process_tiara_results(tiara_results_path, target_domain): + """ + Input: 1) path to the main output file of Tiara, 2) the domain of the target species + Output: dictionary where the keys are scaffold names and the values are the decontamination action 
based on Tiara results + ('keep' or 'exclude') + """ + tiara_action_dict = dict() + + allowed_classif_dict = dict() + allowed_classif_dict[""] = ["archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown"] + allowed_classif_dict["Archaea"] = ["archaea", "prokarya", "unknown"] + allowed_classif_dict["Bacteria"] = ["bacteria", "prokarya", "unknown"] + allowed_classif_dict["Eukaryota"] = ["eukarya", "organelle", "unknown"] + allowed_classif_dict["Viruses"] = ["archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown"] + allowed_classif_list = allowed_classif_dict[target_domain] + + tiara_output = gpf.ll(tiara_results_path) + for counter, line in enumerate(tiara_output): + if counter == 0: + continue + split_line = line.split() + assert len(split_line) == 3 + tiara_class_fst_stage = split_line[1] + assert tiara_class_fst_stage in ("archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown") + tiara_action = "KEEP" + if tiara_class_fst_stage not in allowed_classif_list: + tiara_action = "EXCLUDE" + scaff = split_line[0] + tiara_action_dict[scaff] = tiara_action + return tiara_action_dict + + +def get_fcs_gx_action_dict(fcs_gx_summary_path): + """ + Input: path to FCS-GX summary CSV file (produced by ascc_parse_fcsgx_results.py) + Output: dictionary where the keys are scaffold names and the values are the FCS-GX action values + """ + fcs_gx_action_dict = dict() + fcs_gx_summary_data = gpf.ll(fcs_gx_summary_path) + for counter, line in enumerate(fcs_gx_summary_data): + if counter == 0: + continue + split_line = line.split(",") + scaff = split_line[0] + fcs_gx_action = split_line[8] + fcs_gx_action_dict[scaff] = fcs_gx_action + return fcs_gx_action_dict + + +def get_scaff_names(assembly_path): + """ + Reads FASTA headers from a FASTA file and returns them as a list + """ + scaffs = list() + fasta_data = gpf.read_fasta_in_chunks(assembly_path) + for fasta_tuple in fasta_data: + scaffs.append(fasta_tuple[0]) + return scaffs + + +def 
filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path): + """ + Filters a genome assembly FASTA file to remove sequences that are listed in the scaffs_to_exclude list + """ + out_list = list() + fasta_data = gpf.read_fasta_in_chunks(assembly_path) + for header, seq in fasta_data: + if header not in scaffs_to_exclude: + out_list.append(">" + header) + split_seq = gpf.split_with_fixed_row_length(seq, 80) + out_list.extend(split_seq) + else: + sys.stderr.write(f"Excluding the sequence {header} from the filtered assembly ({filtered_assembly_path}), as it appears to be a contaminant based on FCS-GX and/or Tiara results\n") + gpf.export_list_as_line_break_separated_file(out_list, filtered_assembly_path) + + + +def main(pipeline_run_folder, taxid, rankedlineage_path): + if taxid == -1: + sys.stderr.write("The filtering of assembly based on FCS-GX and Tiara results requires a taxID but a valid taxID has not been provided (the provided taxID is -1, which is a placeholder value)\n") + + assembly_path = f"{pipeline_run_folder}/fasta/assembly.fasta" + tiara_results_path = f"{pipeline_run_folder}/collected_tables/tiara_out.txt" + fcs_gx_summary_path = f"{pipeline_run_folder}/collected_tables/fcs-gx_summary.csv" + filtered_assembly_path = f"{pipeline_run_folder}/fasta/filtered/assembly_autofiltered.fasta" + assembly_filtering_summary_table_path = f"{pipeline_run_folder}/collected_tables/fcs-gx_and_tiara_combined_summary.csv" + excluded_seq_list_path = f"{pipeline_run_folder}/collected_tables/assembly_filtering_removed_sequences.txt" + + Path(f"{pipeline_run_folder}/fasta/filtered").mkdir(parents=True, exist_ok=True) + + if os.path.isfile(rankedlineage_path) is False: + sys.stderr.write(f"The NCBI rankedlineage.dmp file was not found at the expected location ({rankedlineage_path})\n") + sys.exit(1) + if os.path.isfile(tiara_results_path) is False: + sys.stderr.write(f"The Tiara output file was not found at the expected location ({tiara_results_path})\n") + 
sys.exit(1) + if os.path.isfile(fcs_gx_summary_path) is False: + sys.stderr.write(f"The FCS-GX results summary file was not found at the expected location ({fcs_gx_summary_path})\n") + sys.exit(1) + if os.path.isfile(assembly_path) is False: + sys.stderr.write(f"The assembly FASTA file was not found at the expected location ({assembly_path})\n") + sys.exit(1) + + target_domain = get_domain_from_taxid(taxid, rankedlineage_path) + tiara_action_dict = process_tiara_results(tiara_results_path, target_domain) + + fcs_gx_action_dict = get_fcs_gx_action_dict(fcs_gx_summary_path) + + combined_action_dict = dict() + scaffs_to_exclude = list() + scaffs = get_scaff_names(assembly_path) + for scaff in scaffs: + fcs_gx_action = "NA" + tiara_action = "NA" + if scaff in fcs_gx_action_dict: + fcs_gx_action = fcs_gx_action_dict[scaff] + if scaff in tiara_action_dict: + tiara_action = tiara_action_dict[scaff] + combined_action = fcs_gx_action + if fcs_gx_action == "NA" and tiara_action == "EXCLUDE": + combined_action = "EXCLUDE" + if combined_action == "EXCLUDE": + scaffs_to_exclude.append(scaff) + combined_action_dict[scaff] = {"fcs_gx_action": fcs_gx_action, "tiara_action": tiara_action, "combined_action": combined_action} + filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path) + gpf.export_list_as_line_break_separated_file(scaffs_to_exclude, excluded_seq_list_path) + + #csv_writer = csv.writer(open(assembly_filtering_summary_table_path, "w")) + #for key, value in combined_action_dict.items(): + # line = [key] + # for ik, iv in value.items(): + # line.append(ik) + # line.extend([v for v in iv]) + # csv_writer.writerow(line) + out_csv_list = list() + out_csv_list.append("scaff,fcs_gx_action,tiara_action,combined_action") + for scaff, scaff_properties in combined_action_dict.items(): + out_line = f"{scaff},{scaff_properties['fcs_gx_action']},{scaff_properties['tiara_action']},{scaff_properties['combined_action']}" + out_csv_list.append(out_line) + 
gpf.export_list_as_line_break_separated_file(out_csv_list, assembly_filtering_summary_table_path) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("pipeline_run_folder", type=str, help="Path to the directory where the pipeline is run") + parser.add_argument("taxid", type=int, help="NCBI taxonomy ID of the species") + parser.add_argument("ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy") + args = parser.parse_args() + main(args.pipeline_run_folder, args.taxid, args.ncbi_rankedlineage_path) \ No newline at end of file diff --git a/bin/sam_to_sorted_indexed_bam.py b/bin/sam_to_sorted_indexed_bam.py old mode 100644 new mode 100755 From c45bc2cccd52ae15473aac0849d0083acb15be01 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 8 Apr 2024 17:29:43 +0100 Subject: [PATCH 008/117] Adding and updating modules --- modules/local/autofiltering.nf | 33 +++++++++++++++++++ modules/local/create_btk_dataset.nf | 51 +++++++++++++++++++++-------- modules/local/merge_btk_datasets.nf | 0 3 files changed, 70 insertions(+), 14 deletions(-) create mode 100644 modules/local/autofiltering.nf create mode 100644 modules/local/merge_btk_datasets.nf diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf new file mode 100644 index 00000000..b95b36bf --- /dev/null +++ b/modules/local/autofiltering.nf @@ -0,0 +1,33 @@ +process AUTOFILTER_ASSEMBLY { + tag "$meta.id" + label "process_medium" + + container 'docker://quay.io/sanger-tol/ascc_main:0.001-c1' + + input: + tuple val(meta), path(reference) + tuple val(tiara_meta), path(tiara_txt) + tuple val(fcs_meta), path(fcs_csv) + + output: + tuple val(meta), path("*autofiltered.fasta"), emit: decontaminated_assembly + tuple val(meta), path("fcs-gx_and_tiara_combined_summary.csv"), emit: fcs_tiara_summary + tuple val(meta), path("assembly_filtering_removed_sequences.txt"), emit: removed_seqs + + script: + def prefix = 
task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + """ + remove_fcs_gx_and_tiara_contams.py \\ + $reference \\ + $meta.taxid \\ + $tiara_txt \\ + $fcs_csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + +} \ No newline at end of file diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 2b19d391..d809a1a5 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -1,6 +1,10 @@ process CREATE_BTK_DATASET { + tag "$meta.id" label 'process_medium' + container 'sanger-tol/ascc_btk:3.2.6-c1' + + input: tuple val(meta), path(reference) path dot_genome, stageAs: "?/SORTED.genome" @@ -17,29 +21,48 @@ process CREATE_BTK_DATASET { path un_diamond, stageAs: "?/UNIPROT_DIAMOND_FULL.tsv" path ncbi_taxdump + output: + tuple val(meta), path("btk_datasets") + when: task.ext.when == null || task.ext.when script: - def prefix = args.ext.prefix ?: "${meta.id}" - def args = args.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def blastn_arg = nt_blast ? "-bh ${nt_blast}" : "" + def nt_diamond_arg = nt_diamond ? "-nr ${nt_diamond}" : "" + def un_diamond_arg = un_diamond ? "-ud ${un_diamond}" : "" + def kraken_arg = kraken_lineage ? "-k ${kraken_lineage}": "" + def mapped_arg = mapped_bam ? "-r ${mapped_bam}" : "" + def tiara_arg = tiara ? "-t ${tiara}" : "" + def pca_arg = kmers ? "-p ${kmers}" : "" + def fcs_arg = fcsgx ? 
"-fc ${fcsgx}" : "" + def marker_arg = "" + def contigviz_arg = "" """ mkdir -p btk_datasets/ ls -lh - create_btk_dataset.py \\ - ${reference} \\ - btk_datasets/ \\ - ./1/ \\ - ${meta.id} \\ - ${meta.sci_name} \\ - ${meta.taxid} \\ - ${nt_blast} \\ - ${nt_diamond} \\ - ${un_diamond} \\ - ${mapped_bam} \\ - ${ncbi_taxdump} + create_btk_dataset_V2.py \\ + -f ${reference} \\ + -d ./1/ \\ + -n "${prefix}" \\ + -tn "${meta.sci_name}" \\ + -id ${meta.taxid} \\ + -td ${ncbi_taxdump}/ \\ + $blastn_arg \\ + $nt_diamond_arg \\ + $un_diamond_arg \\ + $kraken_arg \\ + $mapped_arg \\ + $tiara_arg \\ + $pca_arg \\ + $fcs_arg \\ + $args + + echo "merge_btk_dataset.py btk_datasets/ BTK_FOLDER_PATH BTK_WITH_BUSCO" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf new file mode 100644 index 00000000..e69de29b From d07cb6bfe90f1f19ca5a1ec02fd4d82aa6fb8cc3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 8 Apr 2024 17:31:16 +0100 Subject: [PATCH 009/117] Adding more modules --- workflows/ascc.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index c86e9683..94649653 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -326,7 +326,7 @@ workflow ASCC { YAML_INPUT.out.nt_kraken_db_path, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_kraken1 = RUN_NT_KRAKEN.out.classifiedreformed + ch_kraken1 = RUN_NT_KRAKEN.out.classified ch_kraken2 = RUN_NT_KRAKEN.out.report.map{it[1]} ch_kraken3 = RUN_NT_KRAKEN.out.lineage From 3c5102ca7d97644bf1dc272e2ef652b310c15fe3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 8 Apr 2024 17:32:12 +0100 Subject: [PATCH 010/117] Updating the modules conf for the diamond blastx --- conf/modules.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index fb4f9759..20851fe6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -26,6 +26,10 @@ process { 
ext.args = 'nucleotide' } + withName: '.*:.*:UNIPROT_DIAMOND:DIAMOND_BLAST_CHUNK_TO_FULL' { + ext.args = 'diamond' + } + withName: BLAST_MAKEBLASTDB { ext.args = { "-dbtype nucl" } } From 62f528e096387d3a38c64377bb28bad4d765d246 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 17 Apr 2024 11:50:21 +0100 Subject: [PATCH 011/117] updates to modules --- modules/local/blast_chunk_to_full.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/blast_chunk_to_full.nf b/modules/local/blast_chunk_to_full.nf index 318dd5fc..c010b678 100644 --- a/modules/local/blast_chunk_to_full.nf +++ b/modules/local/blast_chunk_to_full.nf @@ -15,8 +15,8 @@ process BLAST_CHUNK_TO_FULL { path "versions.yml" , emit: versions script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" """ blast_hit_chunk_coords_to_full_coords.py ${chunked} ${args} > full_coords.tsv From ee1bbdcbd6b33e583f3b118eadd700bf6fab216a Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 17 Apr 2024 11:50:54 +0100 Subject: [PATCH 012/117] Update to config --- conf/modules.config | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 20851fe6..8de42d34 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -22,14 +22,18 @@ process { ext.args = {"-s ${meta.sliding} -W ${meta.window} "} } - withName: BLAST_CHUNK_TO_FULL { - ext.args = 'nucleotide' + withName: '.*:.*:EXTRACT_NT_BLAST:BLAST_CHUNK_TO_FULL' { + ext.args = "nucleotide" } - withName: '.*:.*:UNIPROT_DIAMOND:DIAMOND_BLAST_CHUNK_TO_FULL' { - ext.args = 'diamond' + withName: '.*:.*:NUCLEOT_DIAMOND:DIAMOND_BLAST_CHUNK_TO_FULL' { + ext.args = "diamond" } + withName: '.*:.*:UNIPROT_DIAMOND:DIAMOND_BLAST_CHUNK_TO_FULL' { + ext.args = "diamond" + }| + withName: BLAST_MAKEBLASTDB { ext.args = { "-dbtype nucl" } } From 
c9dab6b4a4257e63f5bcce7ccabb4901aca8a2c6 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 18 Apr 2024 12:39:40 +0100 Subject: [PATCH 013/117] Addition of the V2 python scripts --- modules/local/create_btk_dataset.nf | 2 +- modules/local/merge_btk_datasets.nf | 45 +++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index d809a1a5..bdce181f 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -22,7 +22,7 @@ process CREATE_BTK_DATASET { path ncbi_taxdump output: - tuple val(meta), path("btk_datasets") + tuple val(meta), path("btk_datasets"), emit: btk_datasets when: task.ext.when == null || task.ext.when diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index e69de29b..614698e3 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -0,0 +1,45 @@ +process MERGE_BTK_DATASETS { + tag "$meta.id" + label 'process_low' + + container 'sanger-tol/ascc_btk:3.2.6-c1' + + + input: + tuple val(meta), path(create_btk_datasets) + tuple val(meta), path(busco_btk_datasets) + tuple val(meta), path(busco_summary_file) + + output: + tuple val(meta), path("merged_datasets"), emit: merged_datasets + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def busco_btk_datasets = busco_btk_datasets? "-b ${busco_btk_datasets}" : "" + def nt_diamond_arg = nt_diamond ? "-nr ${nt_diamond}" : "" + def un_diamond_arg = un_diamond ? 
"-ud ${un_diamond}" : "" + + """ + mkdir -p merged_datasets/ + ls -lh + + merge_btk_datasets_V2.py \\ + -m $create_btk_datasets \\ + -o ./merged_datasets \\ + $busco_btk_datasets \\ + // $busco_summary_file \\ + $args + + echo "merge_btk_dataset.py btk_datasets/ BTK_FOLDER_PATH BTK_WITH_BUSCO" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + create_btk_dataset: \$(general_purpose_functions.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} \ No newline at end of file From c2c76c34c64136c3d37ca8aef2f7c8fd9ce550ae Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 18 Apr 2024 12:40:34 +0100 Subject: [PATCH 014/117] Modifications for malformed channels --- workflows/ascc.nf | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 94649653..118816bd 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -220,6 +220,7 @@ workflow ASCC { ) ch_chloro = PLASTID_ORGANELLAR_BLAST.out.organelle_report.map{it[1]} ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) + ch_chloro.view() } else { ch_chloro = [] } @@ -238,7 +239,7 @@ workflow ASCC { RUN_FCSADAPTOR.out.ch_prok.map{it[1]} ) .set{ ch_fcsadapt } - ch_fcsadapt + ch_fcsadapt.view() ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) } else { ch_fcsadapt = [] @@ -326,7 +327,7 @@ workflow ASCC { YAML_INPUT.out.nt_kraken_db_path, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_kraken1 = RUN_NT_KRAKEN.out.classified + ch_kraken1 = RUN_NT_KRAKEN.out.classified.map{it[1]} ch_kraken2 = RUN_NT_KRAKEN.out.report.map{it[1]} ch_kraken3 = RUN_NT_KRAKEN.out.lineage @@ -360,6 +361,7 @@ workflow ASCC { YAML_INPUT.out.diamond_uniprot_database_path ) un_full = UNIPROT_DIAMOND.out.reformed.map{it[1]} + un_full.view() ch_versions = ch_versions.mix(UNIPROT_DIAMOND.out.versions) } else { un_full = [] @@ -387,6 +389,10 @@ workflow ASCC { ) + MERGE_BTK_DATASET ( + 
CREATE_BTK_DATASET.out.btk_datasets + ) + // // SUBWORKFLOW: Collates version data from prior subworflows // From 13eabccb8638da45265f4f0f4712bab628ecc371 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 18 Apr 2024 12:41:18 +0100 Subject: [PATCH 015/117] Updated --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 422de162..01c68b33 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,7 @@ params { // Input options input = null outdir = "results" - tracedir = "${outdir}/pipeline_info/" + tracedir = "${params.outdir}/pipeline_info/" // MultiQC options multiqc_config = null From 947b1c7a37bed2c37c741f40def0f497dd16c121 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 18 Apr 2024 12:41:43 +0100 Subject: [PATCH 016/117] Updated --- conf/base.config | 2 +- conf/modules.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/base.config b/conf/base.config index 4a214534..4f84e8a8 100644 --- a/conf/base.config +++ b/conf/base.config @@ -21,7 +21,7 @@ process { } withName: DIAMOND_BLASTX { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } + cpus = { check_max( 12 * task.attempt, 'cpus' ) } memory = { check_max( 50.GB * task.attempt, 'memory' ) } time = { check_max( 12.h * task.attempt, 'time' ) } } diff --git a/conf/modules.config b/conf/modules.config index 8de42d34..bc8389f9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,7 +32,7 @@ process { withName: '.*:.*:UNIPROT_DIAMOND:DIAMOND_BLAST_CHUNK_TO_FULL' { ext.args = "diamond" - }| + } withName: BLAST_MAKEBLASTDB { ext.args = { "-dbtype nucl" } From c5cd5f7b9312deb14a10c4d3124868c7b16a6e5e Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 18 Apr 2024 12:42:20 +0100 Subject: [PATCH 017/117] Addition of a V2 python script --- bin/merge_btk_datasets_V2.py | 149 +++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100755 bin/merge_btk_datasets_V2.py diff 
#!/usr/bin/env python3

VERSION = "2.0.0"
DESCRIPTION = f"""
---
Script for merging BlobToolKit datasets from the createBTKdatasets output directory.
Version: {VERSION}
---

Written by Eerik Aunin (ea10)

Modified by Damon-Lee Pointon (@dp24/@DLBPointon)

"""

import json
from pathlib import Path
import shutil
import os
import sys
import argparse
import textwrap


def parse_args(argv=None):
    """Parse command-line arguments (parses ``sys.argv`` when *argv* is None)."""
    parser = argparse.ArgumentParser(
        prog="mergeBTKdatasets",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(DESCRIPTION),
    )
    parser.add_argument(
        "-m", "--main_btk_datasets", required=True, type=str, help="The btk_datasets generated by createBTKdatasets"
    )
    parser.add_argument(
        "-b",
        "--btk_busco_datasets",
        type=str,
        help="Path to the BTK dataset (blobdir) created by the BUSCO-based BTK pipeline",
    )
    parser.add_argument(
        "-s",
        "--btk_busco_summary_full",
        type=str,
        help="The btk_datasets generated by createBTKdatasets",
    )
    parser.add_argument(
        "-o",
        "--new_output_directory",
        default="merged_datasets",
        type=str,
        help="The new output directory for the merged datasets",
    )
    parser.add_argument("-v", "--version", action="version", version=VERSION)

    return parser.parse_args(argv)


def load_json(filename):
    """Load a JSON file and return its contents as a dictionary."""
    with open(filename) as f:
        return json.load(f)


def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined_dataset_folder):
    """
    Creates a meta.json file for the new BTK dataset by combining the two
    meta.json files from the input directories.

    Exits with status 0 (soft skip) if either input blobdir is missing, and
    with status 1 if a blobdir exists but lacks a meta.json.
    """
    for folder in (main_btk_dataset_folder, btk_busco_dataset_folder):
        if not os.path.isdir(folder):
            sys.stderr.write(
                f"Skipping the merging of the main BTK dataset and the BUSCO-based BTK dataset, as directory {folder} was not found)\n"
            )
            sys.exit(0)

    main_btk_json_path = f"{main_btk_dataset_folder}/meta.json"
    btk_busco_json_path = f"{btk_busco_dataset_folder}/meta.json"
    for json_path in (main_btk_json_path, btk_busco_json_path):
        if not os.path.isfile(json_path):
            sys.stderr.write(f"File {json_path} not found)\n")
            sys.exit(1)

    main_meta_dict = load_json(main_btk_json_path)
    btk_busco_meta_dict = load_json(btk_busco_json_path)

    # The BUSCO-based meta.json is the base; fields from the main dataset are
    # appended to it. The main dataset's "taxonomy" field is renamed so it does
    # not clash with the BUSCO-based dataset's own taxonomy field.
    merged_dict = btk_busco_meta_dict.copy()

    keys_to_skip = []  # placeholder: currently no field ids are excluded
    fields = main_meta_dict["fields"]
    for field in fields:
        field_id = field["id"]

        if field_id == "taxonomy":
            btk_main_taxonomy_field = field.copy()
            btk_main_taxonomy_field["id"] = "btk_main_taxonomy"
            btk_main_taxonomy_field["name"] = "btk_main_taxonomy"
            merged_dict["fields"].append(btk_main_taxonomy_field)
        else:
            if field_id not in keys_to_skip:
                merged_dict["fields"].append(field)

    meta_json_outpath = f"{combined_dataset_folder}/meta.json"
    with open(meta_json_outpath, "w") as json_outfile:
        json.dump(merged_dict, json_outfile, indent=1, sort_keys=True)


def main(args):
    """Merge the main BTK dataset with the BUSCO-based one into a new blobdir."""
    # Imported here rather than at module level so this script can be imported
    # (e.g. for --version / argument parsing) without the pipeline helper
    # module being on PYTHONPATH.
    import general_purpose_functions as gpf

    if not os.path.isdir(args.main_btk_datasets):
        sys.stderr.write(f"The BlobToolKit dataset ({args.main_btk_datasets}) was not found!\n")
        sys.exit(1)

    # BUG FIX: -b is optional, so args.btk_busco_datasets may be None and
    # os.path.isdir(None) would raise TypeError. Treat an absent argument the
    # same as a missing directory: report and skip gracefully (exit 0).
    if args.btk_busco_datasets is None or not os.path.isdir(args.btk_busco_datasets):
        sys.stderr.write(
            f"The blobdir of BUSCO-based BlobToolKit Snakemake pipeline run does not exist at {args.btk_busco_datasets}, skipping the merging of BTK datasets\n"
        )
        sys.exit(0)

    # These per-assembly JSON files are taken from the BUSCO-based dataset
    # instead, so they are not copied over from the main dataset.
    not_copying_list = ["identifiers.json", "gc_data.json", "length_data.json", "ncount_data.json", "meta.json"]

    Path(args.new_output_directory).mkdir(parents=True, exist_ok=True)

    main_btk_dataset_files = [
        f for f in os.listdir(args.main_btk_datasets) if os.path.isfile(os.path.join(args.main_btk_datasets, f))
    ]
    main_btk_dataset_files = [f for f in main_btk_dataset_files if f not in not_copying_list]
    for main_btk_dataset_file in main_btk_dataset_files:
        main_btk_dataset_file_full_path = f"{args.main_btk_datasets}/{main_btk_dataset_file}"
        copied_file_full_path = f"{args.new_output_directory}/{main_btk_dataset_file}"
        shutil.copy(main_btk_dataset_file_full_path, copied_file_full_path)

    # Copy everything from the BUSCO-based blobdir last so its files win on
    # name collisions with the main dataset.
    btk_busco_files = [
        f for f in os.listdir(args.btk_busco_datasets) if os.path.isfile(os.path.join(args.btk_busco_datasets, f))
    ]
    for btk_busco_file in btk_busco_files:
        btk_busco_file_full_path = f"{args.btk_busco_datasets}/{btk_busco_file}"
        copied_file_full_path = f"{args.new_output_directory}/{btk_busco_file}"
        shutil.copy(btk_busco_file_full_path, copied_file_full_path)

    create_meta_json(args.main_btk_datasets, args.btk_busco_datasets, args.new_output_directory)

    btk_busco_table_outpath = f"{args.new_output_directory}/btk_busco_summary_table_full.tsv"
    # NOTE(review): the table is exported from the *input* main dataset
    # (args.main_btk_datasets), not from the freshly merged blobdir — confirm
    # this is intended.
    btk_busco_table_exporting_command = f"blobtools filter --table {btk_busco_table_outpath} --table-fields identifiers,buscogenes_superkingdom,buscogenes_kingdom,buscogenes_phylum,buscogenes_class,buscogenes_order,buscogenes_family,buscogenes_genus,buscogenes_species,buscoregions_superkingdom,buscoregions_kingdom,buscoregions_phylum,buscoregions_class,buscoregions_order,buscoregions_family,buscoregions_genus,buscoregions_species {args.main_btk_datasets}"
    gpf.run_system_command(btk_busco_table_exporting_command)


if __name__ == "__main__":
    main(parse_args())
'../subworkflows/ // include { GC_CONTENT } from '../modules/local/gc_content' include { CREATE_BTK_DATASET } from '../modules/local/create_btk_dataset' +include { MERGE_BTK_DATASETS } from '../modules/local/merge_btk_datasets' /* @@ -389,10 +390,30 @@ workflow ASCC { ) - MERGE_BTK_DATASET ( - CREATE_BTK_DATASET.out.btk_datasets + //SANGER_TOL_BTK.out.btk_datasets = [] + //SANGER_TOL_BTK.out.summary = [] + + ASCC_MERGE_TABLES ( + ) + // + // NOT TESTED AS WE NEED BTK INTEGRATED FIRST!!! + // + + if ( workflow_steps.contains('busco_btk') || workflow_steps.contains('ALL') ) { + //SANGER_TOL_BTK ( + // yaml_input.out.reference_tuple + //) + + MERGE_BTK_DATASETS ( + CREATE_BTK_DATASET.out.btk_datasets, + [[],[]], //SANGER_TOL_BTK.out.btk_datasets = [] + [[],[]] //SANGER_TOL_BTK.out.summary = [] + ) + } + + // // SUBWORKFLOW: Collates version data from prior subworflows // From c09a852b6493e0484ecbdb2b57152fdac7210175 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 18 Apr 2024 13:34:52 +0100 Subject: [PATCH 019/117] Remove ls -lh and unnecessary args --- modules/local/create_btk_dataset.nf | 1 - modules/local/merge_btk_datasets.nf | 18 +++++++----------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index bdce181f..54c2e28d 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -43,7 +43,6 @@ process CREATE_BTK_DATASET { """ mkdir -p btk_datasets/ - ls -lh create_btk_dataset_V2.py \\ -f ${reference} \\ diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index 614698e3..96fa0308 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -1,4 +1,4 @@ -rocess MERGE_BTK_DATASETS { +process MERGE_BTK_DATASETS { tag "$meta.id" label 'process_low' @@ -7,8 +7,8 @@ rocess MERGE_BTK_DATASETS { input: tuple val(meta), path(create_btk_datasets) - tuple val(meta), 
path(busco_btk_datasets) - tuple val(meta), path(busco_summary_file) + tuple val(meta2), path(busco_btk_datasets) + tuple val(meta3), path(busco_summary_file) output: tuple val(meta), path("merged_datasets"), emit: merged_datasets @@ -17,21 +17,17 @@ rocess MERGE_BTK_DATASETS { task.ext.when == null || task.ext.when script: - def prefix = args.ext.prefix ?: "${meta.id}" - def args = args.ext.args ?: "" - def busco_btk_datasets = busco_btk_datasets? "-b ${busco_btk_datasets}" : "" - def nt_diamond_arg = nt_diamond ? "-nr ${nt_diamond}" : "" - def un_diamond_arg = un_diamond ? "-ud ${un_diamond}" : "" + def prefix = args.ext.prefix ?: "${meta.id}" + def args = args.ext.args ?: "" """ mkdir -p merged_datasets/ - ls -lh merge_btk_datasets_V2.py \\ -m $create_btk_datasets \\ -o ./merged_datasets \\ - $busco_btk_datasets \\ - // $busco_summary_file \\ + -b $busco_btk_datasets \\ + -s $busco_summary_file \\ $args echo "merge_btk_dataset.py btk_datasets/ BTK_FOLDER_PATH BTK_WITH_BUSCO" From ddc627da827ee982b6e5b53ce460b2bcb7045b59 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 12:55:28 +0100 Subject: [PATCH 020/117] Addition of the trimNs subworkflow Co-authored-by: eeanuin --- bin/trim_Ns.py | 182 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100755 bin/trim_Ns.py diff --git a/bin/trim_Ns.py b/bin/trim_Ns.py new file mode 100755 index 00000000..4775a2b6 --- /dev/null +++ b/bin/trim_Ns.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python +""" +Script for detecting trailing Ns that should be trimmed from an assembly, from James Torrance (jt8@sanger.ac.uk). 
import re
from Bio import SeqIO
import argparse


def main(fasta_file, output_file):
    """Scan each FASTA record for N-runs that should be trimmed.

    Writes a plain-text report to *output_file* with one of these line types:
      # WARNING:  record is >80% Ns
      TRIM:       coordinates of a leading/trailing run of Ns
      REMOVE:     too little sequence would remain after trimming
      FWDCLIP:/REVCLIP:  clip point found by the N-density sliding window,
                  for the forward and reversed orientation respectively.

    NOTE(review): the body below is a reconstruction of whitespace-mangled
    source; the windowing control flow was re-indented from token order and
    should be diffed against the original bin/trim_Ns.py before merging.
    """
    minleftover = 200  # after trimming start/end, at least this many bp should be left
    winsize = 5000  # for sliding window analysis
    minslidingBase = 0.4  # maximum fraction of Ns in sliding window before alarm sets off

    # BUG FIX: the output handle was opened with open() and never closed;
    # both files are now managed by a single `with` block.
    with open(output_file, "w") as output_handle, open(fasta_file, "r") as fasta_input_handle:
        for record in SeqIO.parse(fasta_input_handle, "fasta"):
            # The same analysis is run on the string and on its reverse, so the
            # "leading Ns" logic below covers both ends of the sequence.
            seq_string = str(record.seq)

            n_count = seq_string.count("N") + seq_string.count("n")
            # BUG FIX: guard against zero-length records (ZeroDivisionError).
            n_perc = n_count / len(seq_string) if seq_string else 0.0
            if n_perc > 0.8:
                output_handle.write(
                    "# WARNING: "
                    + record.id
                    + "\t"
                    + str(int(n_perc * 10000) / 100)
                    + " % Ns of total "
                    + str(len(seq_string))
                    + "\n"
                )

            # Leading N-run, and (via the reversed string) the trailing N-run.
            start_n_match = re.match("^([Nn]*)", seq_string)
            end_n_match = re.search("^([Nn]*)", seq_string[::-1])

            startseq = ""
            if start_n_match:
                startseq = start_n_match.group(1)
            endseq = ""
            if end_n_match:
                endseq = end_n_match.group(1)
            realseq_length = len(seq_string) - len(startseq) - len(endseq)
            # Handle the "all Ns" exception: the leading-N match already covers
            # the whole sequence, so the trailing run must not be counted twice.
            if len(startseq) == len(seq_string):
                realseq_length = 0
                endseq = ""

            if len(startseq) > 0 or len(endseq) > 0:
                if len(startseq) > 0:
                    output_handle.write("\t".join(["TRIM:", record.id, "1", str(len(startseq))]) + "\t\n")
                if len(endseq) > 0:
                    output_handle.write(
                        "\t".join(["TRIM:", record.id, str(len(startseq) + realseq_length + 1), str(len(seq_string))])
                        + "\t\n"
                    )

            if realseq_length <= minleftover:
                output_handle.write(
                    "REMOVE: " + record.id + "\t" + str(realseq_length) + " bp leftover after trimming\n"
                )

            # Sliding-window analysis on the sequence and then on its reverse.
            # Only attempted if the initial window is already N-rich enough to
            # set off the trigger condition.
            for seq_string_loop, seq_string_for_window in enumerate([seq_string, seq_string[::-1]]):
                if (
                    len(seq_string_for_window) > winsize
                    and (seq_string_for_window[:winsize].count("N") + seq_string_for_window[:winsize].count("n"))
                    > winsize * minslidingBase
                ):
                    # Non-N blocks, most recent first (reverse order along the sequence).
                    non_n_regions = []
                    non_n_iterator = re.finditer("[^Nn]+", seq_string_for_window)
                    for non_n_instance in non_n_iterator:
                        current_non_n_start = non_n_instance.start(0) + 1  # 1-based
                        current_non_n_end = non_n_instance.end(0)
                        non_n_regions.insert(0, [current_non_n_start, current_non_n_end])

                        # Does the *end* of this block satisfy the window condition?
                        bases_in_window = 0
                        start_of_first_base_block_in_window = None
                        for non_n_region in non_n_regions:
                            if non_n_region[1] >= current_non_n_end - winsize:
                                start_of_first_base_block_in_window = non_n_region[0]
                                if non_n_region[0] >= current_non_n_end - winsize:
                                    bases_in_window += non_n_region[1] - non_n_region[0] + 1
                                else:
                                    bases_in_window += non_n_region[1] - non_n_region[0] + 1
                                    break
                            else:
                                break

                        if bases_in_window >= minslidingBase * winsize:
                            # Walk back block by block to find the earliest window
                            # start that still satisfies the density threshold.
                            # Remember: blocks are in *reverse* order along the sequence.
                            trackback_to_start_flag = False

                            for i, non_n_region in enumerate(non_n_regions):
                                if i == 0:
                                    continue
                                bases_in_window_2 = 0
                                if non_n_region[1] < non_n_regions[0][0] - winsize:
                                    break
                                else:
                                    current_window_start = max(non_n_region[0], non_n_regions[0][0] - winsize)

                                    # Count the bases from this block
                                    bases_in_window_2 += (
                                        min(non_n_region[1], (current_window_start + winsize - 1))
                                        - current_window_start
                                        + 1
                                    )

                                    # Add up all the sequence in blocks tested thus far,
                                    # but not the final block
                                    for j in range(1, i):
                                        bases_in_window_2 += non_n_regions[j][1] - non_n_regions[j][0] + 1

                                    # Count the sequence in the final block that can
                                    # contribute: the region between its start and either
                                    # its end or the end of a window from the current test
                                    # start point, whichever comes first.
                                    bases_in_window_2 += (
                                        min(non_n_regions[0][1], (current_window_start + winsize - 1))
                                        - non_n_regions[0][0]
                                        + 1
                                    )

                                    if bases_in_window_2 >= minslidingBase * winsize:
                                        if current_window_start == non_n_region[0]:
                                            start_of_first_base_block_in_window = current_window_start
                                        else:
                                            start_of_first_base_block_in_window = non_n_regions[i - 1][0]
                                    else:
                                        # We keep going back until the threshold
                                        # condition is *not* met
                                        break

                            # If the break-point should be before the first block,
                            # then we don't want to trim at all!
                            if i == len(non_n_regions) - 1:
                                trackback_to_start_flag = True

                            # Only trim if the breakpoint isn't before the first block
                            # and isn't at the start of the sequence.
                            if not (trackback_to_start_flag) and start_of_first_base_block_in_window != 1:
                                if seq_string_loop == 0:
                                    output_handle.write(
                                        "FWDCLIP:\t"
                                        + record.id
                                        + "\t1\t"
                                        + str(start_of_first_base_block_in_window - 1)
                                        + "\n"
                                    )
                                else:
                                    output_handle.write(
                                        "REVCLIP:\t"
                                        + record.id
                                        + "\t"
                                        + str(len(seq_string) - start_of_first_base_block_in_window + 2)
                                        + "\t"
                                        + str(len(seq_string))
                                        + "\n"
                                    )

                            break


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("fasta_file", type=str, help="Path to input FASTA file")
    parser.add_argument("output_file", type=str, help="Path for output report file")
    parser.add_argument("-v", "--version", action="version", version="1.0")
    args = parser.parse_args()
    main(args.fasta_file, args.output_file)
// Run bin/trim_Ns.py on the input assembly to produce a report of leading /
// trailing N runs and N-rich windows (TRIM:/REMOVE:/FWDCLIP:/REVCLIP: lines).
//
// BUG FIX: the first draft read `args.ext.prefix` / `args.ext.args`; `args`
// is undefined in that scope — per-task overrides live on `task.ext`
// (the same correction later applied to the other local modules).
process TRAILINGNS {
    tag "$meta.id"
    label 'process_single'

    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py27_1':
        'biocontainers/biopython:1.70--np112py27_1' }"

    input:
    tuple val(meta), path(fasta_input_file)  // [ meta, assembly FASTA ]

    output:
    tuple val(meta), path("*_trim_Ns"), emit: trailing_ns_report
    path "versions.yml"               , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def prefix  = task.ext.prefix   ?: "${meta.id}"
    def args    = task.ext.args     ?: ""
    """
    trim_Ns.py $fasta_input_file ${prefix}_trim_Ns ${args}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
    END_VERSIONS
    """

    stub:
    def prefix  = task.ext.prefix   ?: "${meta.id}"
    """
    touch ${prefix}_trim_Ns

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
        trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2)
    END_VERSIONS
    """
}
ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) - ch_chloro.view() } else { ch_chloro = [] } @@ -240,7 +249,6 @@ workflow ASCC { RUN_FCSADAPTOR.out.ch_prok.map{it[1]} ) .set{ ch_fcsadapt } - ch_fcsadapt.view() ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) } else { ch_fcsadapt = [] @@ -348,23 +356,27 @@ workflow ASCC { YAML_INPUT.out.diamond_nr_database_path ) nt_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]} + nt_hits = UNIPROT_DIAMOND.out.hits_file.map{it[1]} ch_versions = ch_versions.mix(NUCLEOT_DIAMOND.out.versions) } else { + nt_hits = [] nt_full = [] } // // SUBWORKFLOW: DIAMOND BLAST FOR INPUT ASSEMBLY // + //qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sscinames sskingdoms sphylums salltitles if ( workflow_steps.contains('uniprot_diamond') || workflow_steps.contains('ALL') ) { UNIPROT_DIAMOND ( modified_input, YAML_INPUT.out.diamond_uniprot_database_path ) un_full = UNIPROT_DIAMOND.out.reformed.map{it[1]} - un_full.view() + un_hits = UNIPROT_DIAMOND.out.hits_file.map{it[1]} ch_versions = ch_versions.mix(UNIPROT_DIAMOND.out.versions) } else { + un_hits = [] un_full = [] } @@ -384,8 +396,8 @@ workflow ASCC { ch_kraken1, ch_kraken2, ch_kraken3, - nt_full, - un_full, + nt_hits, + un_hits, YAML_INPUT.out.ncbi_taxonomy_path, ) @@ -393,10 +405,6 @@ workflow ASCC { //SANGER_TOL_BTK.out.btk_datasets = [] //SANGER_TOL_BTK.out.summary = [] - ASCC_MERGE_TABLES ( - - ) - // // NOT TESTED AS WE NEED BTK INTEGRATED FIRST!!! 
// Merge all per-tool result tables (GC, coverage, Tiara, Kraken, BLAST,
// kmer embeddings, Diamond, BTK summaries, FCS-GX, ...) into the final
// contamination-check CSVs for one sample.
//
// Fixes over the first draft:
//  - versions.yml was written by the script block but never declared as an
//    output; it is now emitted.
//  - the invocation (`ascc_m_tables.py`) and the versions query
//    (`ascc_merge_tables.py`) referenced different script names; unified to
//    `ascc_merge_tables.py` to match the versions query and this module's
//    name — TODO(review): confirm against the actual filename in bin/.
process ASCC_MERGE_TABLES {
    tag "$meta.id"
    label 'process_low'

    container 'sanger-tol/ascc_btk:3.2.6-c1'

    input:
    tuple val(meta), path(gc_content, stageAs: "GC.txt")
    path coverage
    path tiara, stageAs: "TIARA.txt"
    path bacterial_kraken
    path nt_kraken, stageAs: "LINEAGE.txt"
    path nt_blast
    path dim_reduction_embeddings
    path nr_diamond
    path uniprot_diamond, stageAs: "UP_DIAMOND.tsv"
    path cobiontid_markerscan
    path contigviz
    path btk, stageAs: "BTK_summary_table_full.tsv"
    path btk_busco
    path fcs_gx, stageAs: "FCSGX_parsed.csv"

    output:
    tuple val(meta), path("*_contamination_check_merged_table.csv")                          , emit: merged_table
    tuple val(meta), path("*_contamination_check_merged_table_extended.csv"), optional: true, emit: extended_table
    tuple val(meta), path("*_phylum_counts_and_coverage.csv")               , optional: true, emit: phylum_counts
    path "versions.yml"                                                                     , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def prefix = task.ext.prefix ?: "${meta.id}"
    def args = task.ext.args ?: ""
    // All inputs below are optional: pass each flag only when a file was staged.
    def coverage = coverage ? "-c ${coverage}" : ""
    def tiara = tiara ? "-t ${tiara}" : ""
    def bacterial_kraken = bacterial_kraken ? "-bk ${bacterial_kraken}" : ""
    def nt_kraken = nt_kraken ? "-nk ${nt_kraken}" : ""
    def nt_blast = nt_blast ? "-nb ${nt_blast}" : ""
    def dim_reduction_embeddings = dim_reduction_embeddings ? "-dr ${dim_reduction_embeddings}" : ""
    def nr_diamond = nr_diamond ? "-nd ${nr_diamond}" : ""
    def uniprot_diamond = uniprot_diamond ? "-ud ${uniprot_diamond}" : ""
    def contigviz = contigviz ? "-cv ${contigviz}" : ""
    def btk = btk ? "-btk ${btk}" : ""
    def btk_busco = btk_busco ? "-bb ${btk_busco}" : ""
    def fcs_gx = fcs_gx ? "-fg ${fcs_gx}" : ""
    // MarkerScan is not in the pipeline yet, so its flag is always empty.
    def cobiontid_markerscan = ""

    """
    ascc_merge_tables.py \\
        --gc_cov $gc_content \\
        --sample_name $meta.id \\
        $coverage \\
        $tiara \\
        $bacterial_kraken \\
        $nt_kraken \\
        $nt_blast \\
        $dim_reduction_embeddings \\
        $nr_diamond \\
        $uniprot_diamond \\
        $contigviz \\
        $btk \\
        $btk_busco \\
        $fcs_gx \\
        $cobiontid_markerscan \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
        ascc_merge_tables: \$(ascc_merge_tables.py --version | cut -d' ' -f2)
    END_VERSIONS
    """
}
modules/local/create_btk_dataset.nf | 9 ++++----- modules/local/filter_vecscreen_results.nf | 4 ++-- modules/local/gc_content.nf | 2 +- modules/local/get_kmer_counts.nf | 2 +- modules/local/merge_btk_datasets.nf | 6 ++---- 6 files changed, 12 insertions(+), 15 deletions(-) diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index b95b36bf..fcf58bc4 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -15,8 +15,8 @@ process AUTOFILTER_ASSEMBLY { tuple val(meta), path("assembly_filtering_removed_sequences.txt") emit: removed_seqs script: - def prefix = args.ext.prefix ?: "${meta.id}" - def args = args.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" """ remove_fcs_gx_and_tiara.py \\ $reference \\ diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 54c2e28d..c8546c45 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -22,14 +22,15 @@ process CREATE_BTK_DATASET { path ncbi_taxdump output: - tuple val(meta), path("btk_datasets"), emit: btk_datasets + tuple val(meta), path("btk_datasets"), emit: btk_datasets + tuple val(meta), path("btk_summary_table_full.tsv"), emit: create_summary when: task.ext.when == null || task.ext.when script: - def prefix = args.ext.prefix ?: "${meta.id}" - def args = args.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" def blastn_arg = nt_blast ? "-bh ${nt_blast}" : "" def nt_diamond_arg = nt_diamond ? "-nr ${nt_diamond}" : "" def un_diamond_arg = un_diamond ? 
"-ud ${un_diamond}" : "" @@ -61,8 +62,6 @@ process CREATE_BTK_DATASET { $fcs_arg \\ $args - echo "merge_btk_dataset.py btk_datasets/ BTK_FOLDER_PATH BTK_WITH_BUSCO" - cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') diff --git a/modules/local/filter_vecscreen_results.nf b/modules/local/filter_vecscreen_results.nf index cf1863e8..50331a1c 100644 --- a/modules/local/filter_vecscreen_results.nf +++ b/modules/local/filter_vecscreen_results.nf @@ -18,8 +18,8 @@ process FILTER_VECSCREEN_RESULTS { task.ext.when == null || task.ext.when script: - def prefix = args.ext.prefix ?: "${meta.id}" - def args = args.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' """ VSlistTo1HitPerLine.py ${args} ${vecscreen_outfile} > ${prefix}_vecscreen.grepped.out diff --git a/modules/local/gc_content.nf b/modules/local/gc_content.nf index a2673d7d..d1fc5766 100644 --- a/modules/local/gc_content.nf +++ b/modules/local/gc_content.nf @@ -12,7 +12,7 @@ process GC_CONTENT { output: tuple val(meta), path( "*-GC_CONTENT.txt" ) , emit: txt - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/get_kmer_counts.nf b/modules/local/get_kmer_counts.nf index 082d7de6..a3ff0857 100755 --- a/modules/local/get_kmer_counts.nf +++ b/modules/local/get_kmer_counts.nf @@ -20,7 +20,7 @@ process GET_KMER_COUNTS { script: def KCOUNTER_VERSION = "0.1.1" - def prefix = args.ext.prefix ?: "${meta.id}" + def prefix = task.ext.prefix ?: "${meta.id}" """ get_kmers_counts.py \\ $input_fasta \\ diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index 96fa0308..4b6904b2 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -17,8 +17,8 @@ process MERGE_BTK_DATASETS { task.ext.when == null || task.ext.when script: - def prefix = args.ext.prefix ?: 
"${meta.id}" - def args = args.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" """ mkdir -p merged_datasets/ @@ -30,8 +30,6 @@ process MERGE_BTK_DATASETS { -s $busco_summary_file \\ $args - echo "merge_btk_dataset.py btk_datasets/ BTK_FOLDER_PATH BTK_WITH_BUSCO" - cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') From 98a8bc168a9c591995ebe6cf0c682e5d417900d2 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 13:00:37 +0100 Subject: [PATCH 027/117] Updating modules config --- conf/modules.config | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index bc8389f9..69117d81 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -30,10 +30,18 @@ process { ext.args = "diamond" } + withName: '.*:.*:NUCLEOT_DIAMOND:CONVERT_TO_HITS_FILE' { + ext.args = "nr" + } + withName: '.*:.*:UNIPROT_DIAMOND:DIAMOND_BLAST_CHUNK_TO_FULL' { ext.args = "diamond" } + withName: '.*:.*:UNIPROT_DIAMOND:CONVERT_TO_HITS_FILE' { + ext.args = "Uniprot" + } + withName: BLAST_MAKEBLASTDB { ext.args = { "-dbtype nucl" } } From 3554bbf5e19d0a775f762467abaa515244d99c90 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 13:01:25 +0100 Subject: [PATCH 028/117] Addition of a new module to generate the hits file --- subworkflows/local/run_diamond.nf | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/run_diamond.nf b/subworkflows/local/run_diamond.nf index 13ae533c..3bb107a4 100644 --- a/subworkflows/local/run_diamond.nf +++ b/subworkflows/local/run_diamond.nf @@ -1,6 +1,7 @@ include { SEQKIT_SLIDING } from '../../modules/nf-core/seqkit/sliding/main' include { DIAMOND_BLASTX } from '../../modules/nf-core/diamond/blastx/main' include { BLAST_CHUNK_TO_FULL as DIAMOND_BLAST_CHUNK_TO_FULL } from '../../modules/local/blast_chunk_to_full' +include { CONVERT_TO_HITS_FILE } from 
'../../modules/local/convert_to_hits_file' include { REFORMAT_DIAMOND_OUTFMT6 } from '../../modules/local/format_diamond_outfmt6' workflow RUN_DIAMOND { @@ -10,7 +11,7 @@ workflow RUN_DIAMOND { main: ch_versions = Channel.empty() - ch_ext = Channel.of(6) + ch_ext = Channel.of("txt") ch_columns = Channel.of("qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sscinames sskingdoms sphylums salltitles") // @@ -40,6 +41,14 @@ workflow RUN_DIAMOND { ) ch_versions = ch_versions.mix(DIAMOND_BLAST_CHUNK_TO_FULL.out.versions) + // + // MODULE: CONVERT THE FULL GENOME FILE INTO A HITS FILE + // + CONVERT_TO_HITS_FILE( + DIAMOND_BLAST_CHUNK_TO_FULL.out.full + ) + ch_versions = ch_versions.mix(CONVERT_TO_HITS_FILE.out.versions) + // // MODULE: REFORMAT THE DIAMOND OUTPUT // @@ -51,5 +60,6 @@ workflow RUN_DIAMOND { emit: full = DIAMOND_BLAST_CHUNK_TO_FULL.out.full reformed = REFORMAT_DIAMOND_OUTFMT6.out.full + hits_file = CONVERT_TO_HITS_FILE.out.hits_file versions = ch_versions.ifEmpty(null) } \ No newline at end of file From dcc82bab0be6bc33a2b4e9c840bf8247290a57d9 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 13:03:57 +0100 Subject: [PATCH 029/117] Addition of re-written scripts --- bin/convert_to_hits.py | 83 +++++++++++++++++ bin/create_btk_dataset_V2.py | 173 ++++++++--------------------------- 2 files changed, 121 insertions(+), 135 deletions(-) create mode 100755 bin/convert_to_hits.py diff --git a/bin/convert_to_hits.py b/bin/convert_to_hits.py new file mode 100755 index 00000000..5cbe4439 --- /dev/null +++ b/bin/convert_to_hits.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +VERSION = "2.0.0" + +DESCRIPTION = """ +Script for getting the top hits of Diamond BLASTX against the nr database. Top hits per each scaffold are determined from the BLASTX results of scaffold chunks. 
+The output is reformatted into a CSV table
+Argument1: path to Diamond BLASTX results, with the following columns:
+"qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "staxids", "sscinames", "sskingdoms", "sphylums", "salltitles"
+
+Version: {VERSION}
+---
+Written by Eerik Aunin
+
+Re-Written by Damon-Lee Pointon (dp24/DLBPointon)
+"""
+import general_purpose_functions as gpf
+from collections import OrderedDict
+import textwrap
+import argparse
+
+
+def save_file(output_list, name):
+    with open(f"{name}_diamond_blastx_top_hits.csv", "w") as f:
+        for line in output_list:
+            f.write(f"{line}\n")
+
+
+def main(in_path, diamond_database_title):
+    in_data = gpf.ll(in_path)
+
+    top_hits_dict = OrderedDict()
+    output = []
+
+    colnames = (
+        "scaff",
+        "diamond_{0}_bitscore",
+        "diamond_{0}_staxids",
+        "diamond_{0}_sscinames",
+        "diamond_{0}_sskingdoms",
+        "diamond_{0}_sphylums",
+        "diamond_{0}_salltitles",
+    )
+    colnames = [n.format(diamond_database_title) for n in colnames]
+
+    for line in in_data:
+        line = line.replace(",", " ")
+        line = line.replace("<>", " ")
+        split_line = line.split("\t")
+        seq_name = split_line[0]
+        seq_name = seq_name.split("_sliding")[0]
+        bitscore = float(split_line[11])
+        if seq_name not in top_hits_dict:
+            top_hits_dict[seq_name] = dict()
+            top_hits_dict[seq_name]["bitscore"] = bitscore
+            top_hits_dict[seq_name]["line"] = line
+        else:
+            if bitscore > top_hits_dict[seq_name]["bitscore"]:
+                top_hits_dict[seq_name]["bitscore"] = bitscore
+                top_hits_dict[seq_name]["line"] = line
+
+    output.append(",".join(colnames))
+    for seq_name in top_hits_dict:
+        out_line = top_hits_dict[seq_name]["line"]
+        out_line = out_line.split("\t")
+        out_line = ",".join(out_line[11 : len(out_line)])
+        out_line = seq_name + "," + out_line
+        output.append(out_line)
+
+    save_file(output, diamond_database_title)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="Convert File to 
hits", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("in_path", type=str, help="Path to Diamond BLASTX results") + parser.add_argument("diamond_database_title", type=str, help="Name of the Diamond database (e.g. nr or Uniprot)") + parser.add_argument("-v", "--version", action="version", version=VERSION) + args = parser.parse_args() + main(args.in_path, args.diamond_database_title) diff --git a/bin/create_btk_dataset_V2.py b/bin/create_btk_dataset_V2.py index e4185956..69638fdf 100755 --- a/bin/create_btk_dataset_V2.py +++ b/bin/create_btk_dataset_V2.py @@ -25,145 +25,49 @@ def parse_args(argv=None): parser = argparse.ArgumentParser( prog="createBTKdatasets", formatter_class=argparse.RawDescriptionHelpFormatter, - description=textwrap.dedent(DESCRIPTION) - - ) - parser.add_argument( - "-n", - "--name", - required=True, - type=str, - help="Assembly name (for the output files)" + description=textwrap.dedent(DESCRIPTION), ) + parser.add_argument("-n", "--name", required=True, type=str, help="Assembly name (for the output files)") parser.add_argument( "-tn", "--taxon_name", required=True, type=str, - help="The Taxon name of the assembly (Scientific name of the species + subspecies if applicable)" + help="The Taxon name of the assembly (Scientific name of the species + subspecies if applicable)", ) + parser.add_argument("-id", "--taxid", required=True, type=int, help="Taxon ID of the assembly") parser.add_argument( - "-id", - "--taxid", - required=True, - type=int, - help="Taxon ID of the assembly" - ) - parser.add_argument( - "-td", - "--taxdump", - required=True, - type=str, - help="Path to the directory containing the NCBI taxdump" - ) - parser.add_argument( - "-f", - "--fasta", - required=True, - type=str, - help="The path for the assembly fasta file" + "-td", "--taxdump", required=True, type=str, help="Path to the directory containing the NCBI taxdump" ) + parser.add_argument("-f", 
"--fasta", required=True, type=str, help="The path for the assembly fasta file") parser.add_argument( "-d", "--dataset", type=str, required=True, - help="The folder containing the data generated throughout the pipeline" + help="The folder containing the data generated throughout the pipeline", ) + parser.add_argument("-bh", "--blastn_hits", default="N", type=str, help="Path to the BLASTN hits file") parser.add_argument( - "-bh", - "--blastn_hits", - default="N", - type=str, - help="Path to the BLASTN hits file" + "-ud", "--uniprot_diamond_hits", default="N", type=str, help="Path to the UNIPROT diamond BlastX hits file" ) + parser.add_argument("-nr", "--nr_diamond_hits", default="N", type=str, help="Path to the DIAMOND BlastX hits file") parser.add_argument( - "-ud", - "--uniprot_diamond_hits", - default="N", - type=str, - help="Path to the UNIPROT diamond BlastX hits file" + "-r", "--mapped_reads", default="N", type=str, help="Path to mapped reads BAM for coverage estimation" ) + parser.add_argument("-t", "--tiara", default="N", type=str, help="Path to the tiara_out.txt file") parser.add_argument( - "-nr", - "--nr_diamond_hits", - default="N", - type=str, - help="Path to the DIAMOND BlastX hits file" + "-p", "--pca", default="N", type=str, help="Path to the kmers_dim_reduction_embeddings.csv file" ) + parser.add_argument("-fc", "--fcs_gx", default="N", type=str, help="Path to the fcs-gx_summary.csv.csv file") + parser.add_argument("-k", "--kraken", default="N", type=str, help="Path to the nt_kraken_lineage.txt file") + parser.add_argument("-ms", "--markerscan", default="N", type=str, help="Path to the cobiontid_markerscan.csv file") + parser.add_argument("-cv", "--contigviz", default="N", type=str, help="Path to the contigviz_results.csv file") + parser.add_argument("-o", "--output", default="btk_datasets", type=str, help="Output directory") + parser.add_argument("--threads", type=int, default=1, help="Number of threads to utilise") + parser.add_argument("--alias", 
type=str, default="", help="Assembly alias") parser.add_argument( - "-r", - "--mapped_reads", - default="N", - type=str, - help="Path to mapped reads BAM for coverage estimation" - ) - parser.add_argument( - "-t", - "--tiara", - default="N", - type=str, - help="Path to the tiara_out.txt file" - ) - parser.add_argument( - "-p", - "--pca", - default="N", - type=str, - help="Path to the kmers_dim_reduction_embeddings.csv file" - ) - parser.add_argument( - "-fc", - "--fcs_gx", - default="N", - type=str, - help="Path to the fcs-gx_summary.csv.csv file" - ) - parser.add_argument( - "-k", - "--kraken", - default="N", - type=str, - help="Path to the nt_kraken_lineage.txt file" - ) - parser.add_argument( - "-ms", - "--markerscan", - default="N", - type=str, - help="Path to the cobiontid_markerscan.csv file" - ) - parser.add_argument( - "-cv", - "--contigviz", - default="N", - type=str, - help="Path to the contigviz_results.csv file" - ) - parser.add_argument( - "-o", - "--output", - default="btk_datasets", - type=str, - help="Output directory" - ) - parser.add_argument( - "--threads", - type=int, - default=1, - help="Number of threads to utilise" - ) - parser.add_argument( - "--alias", - type=str, - default="", - help="Assembly alias" - ) - parser.add_argument( - "--dry_run", - dest="dry_run", - action="store_true", - help="Dry run (print commands without executing)" + "--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)" ) parser.add_argument("-v", "--version", action="version", version=VERSION) @@ -176,17 +80,20 @@ def create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name): """ if ".gz" in assembly_alias: assembly_alias = assembly_alias.replace(".gz", "_gz") - out_string = "assembly:\n accession: NA\n alias: {}\n record_type: scaffold\n bioproject: NA\n biosample: NA\ntaxon:\n name: {}".format(assembly_alias, taxon_name) + out_string = "assembly:\n accession: NA\n alias: {}\n record_type: scaffold\n bioproject: 
NA\n biosample: NA\ntaxon:\n name: {}".format( + assembly_alias, taxon_name + ) with open(assembly_yaml_path, "w") as f: f.write(out_string) + def tiara_results_to_btk_format(tiara_results_path, outfile_path): """ Reformatting Tiara output file so that the summarised results of the first and second pass of Tiara can be added to a BlobToolKit dataset """ tiara_data = gpf.l(tiara_results_path) - tiara_data = tiara_data[1:len(tiara_data)] + tiara_data = tiara_data[1 : len(tiara_data)] with open(outfile_path, "w") as f: f.write("identifier\ttiara\n") for line in tiara_data: @@ -221,10 +128,11 @@ def detect_dim_reduction_methods(kmers_dim_reduction_output_path): dim_reduction_methods.append(header_item) return dim_reduction_methods + def main(args): command_list = [] - assembly_alias = ( args.name if args.alias == "" else args.alias ) + assembly_alias = args.name if args.alias == "" else args.alias edited_assembly_title = args.name.replace(".", "_").replace(" ", "_") @@ -239,16 +147,9 @@ def main(args): gpf.run_system_command(blobtools_create_command, dry_run=args.dry_run) # ADDING BLAST HIT DATA TO BTK - hits_file_paths = [ - args.blastn_hits, - args.uniprot_diamond_hits, - args.nr_diamond_hits - ] + hits_file_paths = [args.blastn_hits, args.uniprot_diamond_hits, args.nr_diamond_hits] - hits_file = [ - n for n in hits_file_paths - if n != "N" and os.path.isfile(n) is True and os.stat(n).st_size > 0 - ] + hits_file = [n for n in hits_file_paths if n != "N" and os.path.isfile(n) is True and os.stat(n).st_size > 0] if len(hits_file) > 0: add_hits_command = "blobtools add" @@ -258,7 +159,11 @@ def main(args): command_list.append(add_hits_command) # ADDING MAPPED READS DATA TO BTK - if args.mapped_reads != "N" and os.path.isfile(args.mapped_reads) is True and os.stat(args.mapped_reads).st_size > 0: + if ( + args.mapped_reads != "N" + and os.path.isfile(args.mapped_reads) is True + and os.stat(args.mapped_reads).st_size > 0 + ): add_cov_command = f"blobtools add --cov 
{args.mapped_reads} --threads {args.threads} {args.output}" command_list.append(add_cov_command) @@ -287,15 +192,13 @@ def main(args): add_fcs_gx_results_command = f"blobtools add --text {args.fcs_gx} --text-delimiter ',' --text-cols 'scaff=identifiers,fcs_gx_top_tax_name=fcs_gx_top_tax_name,fcs_gx_div=fcs_gx_div,fcs_gx_action=fcs_gx_action' --text-header {args.output}" command_list.append(add_fcs_gx_results_command) - - export_table_command = f"blobtools filter --table {args.dataset}/collected_tables/btk_summary_table_full.tsv {args.output}" + export_table_command = f"blobtools filter --table btk_summary_table_full.tsv {args.output}" command_list.append(export_table_command) # EXECUTE ALL BTK COMMANDS for i in command_list: gpf.run_system_command(i, dry_run=args.dry_run) + if __name__ == "__main__": - main( - parse_args() - ) \ No newline at end of file + main(parse_args()) From 90359a35c3e6dd52ad2710be229211492dc94bce Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 13:04:42 +0100 Subject: [PATCH 030/117] Addition of re-written scripts --- bin/process_result_tables.py | 111 +++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100755 bin/process_result_tables.py diff --git a/bin/process_result_tables.py b/bin/process_result_tables.py new file mode 100755 index 00000000..831adc67 --- /dev/null +++ b/bin/process_result_tables.py @@ -0,0 +1,111 @@ +""" +Script processing the cobiont check result tables to add a combined classification ('merged_classif') column that is based + on the output of multiple tools. 
Also generates a table for estimated coverages per 'merged_classif' column +""" + +import pandas as pd +import numpy as np +import argparse +import sys +import os + + +def generate_counts_df(df, counts_df_output_path): + """ + Creates a table that shows the number of sequences, mean coverage and sequence span per phylum + """ + merged_classif_counts = df["merged_classif"].value_counts(dropna=False) + cov_list = list() + span_list = list() + for classif_item, _ in merged_classif_counts.iteritems(): + ind = list(np.where(df["merged_classif"] == classif_item)[0]) + cov_values = df.iloc[ind, df.columns.get_loc("coverage")] + length_values = df.iloc[ind, df.columns.get_loc("length")] + cov_list.append(round(np.mean(cov_values), 2)) + span_sum = sum(length_values) / 1000000 + span_list.append(round(span_sum, 2)) + + counts_df = merged_classif_counts.to_frame() + counts_df["mean_coverage"] = cov_list + counts_df["span_mb"] = span_list + + counts_df.rename(columns={"merged_classif": "number_of_sequences"}, inplace=True) + counts_df.index.name = "phylum" + counts_df.to_csv(counts_df_output_path, index=True) + + +def main(output_folder, sample_id): + main_results_table_path = "{}/{}_contamination_check_merged_table.csv".format(output_folder, sample_id) + btk_results_table_path = "{}/btk_summary_table_full.tsv".format(output_folder) + output_df_path = "{}/{}_contamination_check_merged_table_extended.csv".format(output_folder, sample_id) + counts_df_output_path = "{}/{}_phylum_counts_and_coverage.csv".format(output_folder, sample_id) + + if os.path.isdir(output_folder) == False: + sys.stderr.write( + "The directory with the output tables of the pipeline ({}) was not found\n".format(output_folder) + ) + sys.exit(1) + + if os.path.isfile(main_results_table_path) == False: + sys.stderr.write("The main results table file ({}) was not found\n".format(main_results_table_path)) + sys.exit(1) + + if os.path.isfile(btk_results_table_path) == False: + sys.stderr.write( + "Skipping the 
exporting of extended results table because the BlobToolKit dataset ({}) was not found\n".format( + btk_results_table_path + ) + ) + sys.exit(0) + + main_df = None + main_df = pd.read_csv(main_results_table_path) + if main_df.shape[0] == 0: + sys.stderr.write("No rows were found in cobiont check results table ({})\n".format(main_results_table_path)) + sys.exit(1) + + if "btk_bestsum_phylum" not in main_df.columns: + sys.stderr.write( + "Column 'btk_bestsum_phylum' was not found in results table ({})\n".format(main_results_table_path) + ) + sys.exit(1) + + df = main_df + df["merged_classif"] = df["btk_bestsum_phylum"] + df["merged_classif_source"] = "btk_bestsum_phylum" + + if "nt_kraken_phylum" in df.columns: + ind = list(np.where(df["merged_classif"] == "no-hit")[0]) + ind2 = list(np.where(df["nt_kraken_phylum"].isna())[0]) + ind3 = [n for n in ind if n not in ind2] + + df.iloc[ind3, df.columns.get_loc("merged_classif")] = df.iloc[ind3, df.columns.get_loc("nt_kraken_phylum")] + df.iloc[ind3, df.columns.get_loc("merged_classif_source")] = "nt_kraken_phylum" + + if "tiara_classif" in df.columns: + tiara_ind = list(np.where(df["merged_classif"] == "no-hit")[0]) + tiara_ind2 = list(np.where(df["tiara_classif"].isna())[0]) + tiara_ind3 = list(np.where(df["tiara_classif"] == "unknown")[0]) + tiara_ind = [n for n in tiara_ind if n not in tiara_ind2 and n not in tiara_ind3] + df.iloc[tiara_ind, df.columns.get_loc("merged_classif")] = df.iloc[ + tiara_ind, df.columns.get_loc("tiara_classif") + ] + df.iloc[tiara_ind, df.columns.get_loc("merged_classif_source")] = "tiara" + + df["merged_classif"] = df["merged_classif"].replace("bacteria", "Bacteria-undef") + df["merged_classif"] = df["merged_classif"].replace("eukarya", "Eukaryota-undef") + df["merged_classif"] = df["merged_classif"].replace("prokarya", "Prokaryota-undef") + df["merged_classif"] = df["merged_classif"].replace("archaea", "Archaea-undef") + + df.to_csv(output_df_path, index=False) + 
os.remove(main_results_table_path)
+    # os.rename(output_df_path, main_results_table_path) To remove Nextflow confusion
+    generate_counts_df(df, counts_df_output_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("output_folder", type=str, help="Path to the directory with the output tables of the pipeline")
+    parser.add_argument("sample_id", type=str, help="ToL ID of the sample")
+    args = parser.parse_args()
+    main(args.output_folder, args.sample_id)

From bed9234cdd9c80b82e374eecfa19d24560a7aec2 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Fri, 19 Apr 2024 13:06:20 +0100
Subject: [PATCH 031/117] addition of ascc merge tables script

---
 bin/ascc_m_tables.py | 313 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 313 insertions(+)
 create mode 100755 bin/ascc_m_tables.py

diff --git a/bin/ascc_m_tables.py b/bin/ascc_m_tables.py
new file mode 100755
index 00000000..932f5059
--- /dev/null
+++ b/bin/ascc_m_tables.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+
+VERSION = "2.0.0"
+DESCRIPTION = """
+Script for merging contaminant check results into one table
+Version: {VERSION}
+---
+Written by Eerik Aunin
+
+Re-Written by Damon-Lee Pointon (dp24/DLBPointon)
+"""
+
+import argparse
+import pandas as pd
+import textwrap
+import os
+import sys
+import general_purpose_functions as gpf
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        prog="AsccMergeTables",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent(DESCRIPTION),
+    )
+    parser.add_argument("-gc", "--gc_cov", required=True, type=str, help="GC Coverage file")
+    parser.add_argument("-c", "--coverage", type=str, help="Coverage file")
+    parser.add_argument("-t", "--tiara", type=str, help="Tiara file")
+    parser.add_argument("-bk", "--bacterial_kraken", type=str, help="Bacterial Kraken file")
+    parser.add_argument("-nk", "--nt_kraken", type=str, help="NT Kraken file")
+    parser.add_argument("-nb", 
"--nt_blast", type=str, help="NT Blast file") + parser.add_argument("-dr", "--dim_reduction_embeddings", type=str, help="Dimensional Reduction file") + parser.add_argument("-nd", "--nr_diamond", type=str, help="NR Diamond file") + parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file") + parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file") + parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file") + parser.add_argument("-bb", "--busco_btk", type=str, help="Busco Blobtoolkit file") + parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file") + parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample") + parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file") + parser.add_argument("-v", "--version", action="version", version=VERSION) + return parser.parse_args() + + +def check_paths(paths_dict, required_files): + """ + Checks if a required file exists and exits with an error message if it doesn't + """ + out_dict = dict() + + for data_type, input_file in paths_dict.items(): + if input == None: + pass + else: + out_dict[data_type] = input_file + + return out_dict + + +def load_and_merge_dataframes(paths_dict): + """ + Loads the tables with individual variables (GC content, coverage, kmer counts etc) and combines them into one table + """ + gc_path = paths_dict["gc_content"] + df = pd.read_csv(gc_path, sep="\t", header=None) + if df.shape[0] > 0: + df.columns = ["scaff", "gc"] + df["gc"] = df["gc"] * 100 + else: + sys.stderr.write("No rows were found in the GC content table ({})\n".format(gc_path)) + sys.exit(1) + + coverage_df = None + if paths_dict["coverage"] is not None: + coverage_df = pd.read_csv(paths_dict["coverage"], sep=",", header=None) + if coverage_df.shape[0] > 0: + coverage_df.columns = ["scaff", "coverage"] + else: + sys.stderr.write(f"No rows were found in the coverages table ({paths_dict['coverage']})\n") + coverage_df = None 
+ + tiara_df = None + if paths_dict["tiara"] is not None: + tiara_df = pd.read_csv(paths_dict["tiara"], sep="\t") + if tiara_df.shape[0] > 0: + tiara_df["tiara_classif"] = tiara_df["class_fst_stage"] + tiara_snd_stage_hits = tiara_df.index[tiara_df["class_snd_stage"].notnull()] + tiara_df["tiara_classif"][tiara_snd_stage_hits] = tiara_df["class_snd_stage"][tiara_snd_stage_hits] + tiara_df = tiara_df.iloc[:, [0, 3]] + tiara_df.columns = ["scaff", "tiara_classif"] + else: + sys.stderr.write("No rows were found in Tiara output table ({})\n".format(paths_dict["tiara"])) + tiara_df = None + + bacterial_kraken_df = None + if paths_dict["bacterial_kraken"] is not None: + bacterial_kraken_df = pd.read_csv(paths_dict["bacterial_kraken"], sep=",") + if bacterial_kraken_df.shape[0] > 0: + bacterial_kraken_df.rename(columns={bacterial_kraken_df.columns[0]: "scaff"}, inplace=True) + bacterial_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write( + "No rows were found in bacterial Kraken output table ({})\n".format(paths_dict["bacterial_kraken"]) + ) + bacterial_kraken_df = None + + nt_kraken_df = None + if paths_dict["nt_kraken"] is not None: + nt_kraken_df = pd.read_csv(paths_dict["nt_kraken"], sep=",") + if nt_kraken_df.shape[0] > 0: + nt_kraken_df.rename(columns={nt_kraken_df.columns[0]: "scaff"}, inplace=True) + nt_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write("No rows were found in nt Kraken output table ({})\n".format(paths_dict["nt_kraken"])) + nt_kraken_df = None + + dim_reduction_df = None + if paths_dict["dim_reduction_embeddings"] is not None: + dim_reduction_df = pd.read_csv(paths_dict["dim_reduction_embeddings"], sep=",") + if dim_reduction_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in kmers dimensionality reduction output table ({})\n".format( + paths_dict["dim_reduction_embeddings"] + ) + ) + dim_reduction_df = None + + btk_df = None + if 
paths_dict["blobtoolkit"] is not None: + btk_df = pd.read_csv(paths_dict["blobtoolkit"], header=0, delimiter="\t") + if btk_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in the BlobToolKit results table ({})\n".format(paths_dict["blobtoolkit"]) + ) + sys.exit(1) + btk_renaming_dict = {"identifiers": "scaff", "bestsum_phylum": "btk_bestsum_phylum"} + if "mapped_hifi_reads_sorted_cov" in btk_df.columns: + btk_renaming_dict["mapped_hifi_reads_sorted_cov"] = "btk_cov" + if "bestsum_phylum" in btk_df.columns: + btk_renaming_dict["bestsum_phylum"] = "btk_bestsum_phylum" + # {"identifiers": "scaff", "mapped_hifi_reads_sorted_cov": "btk_cov", "bestsum_phylum": "btk_bestsum_phylum"} + + btk_df.rename(columns=btk_renaming_dict, inplace=True) + + btk_selected_cols = [ + col for col in btk_df.columns if col in ["scaff", "length", "btk_cov", "btk_bestsum_phylum"] + ] + if len(btk_selected_cols) > 0: + btk_df = btk_df[btk_selected_cols] + else: + btk_df = None + + btk_busco_df = None + if paths_dict["btk_busco"] is not None: + btk_busco_df = pd.read_csv(paths_dict["btk_busco"], header=0, delimiter="\t") + if btk_busco_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in the BUSCO-based BlobToolKit results table ({})\n".format(paths_dict["btk_busco"]) + ) + sys.exit(1) + btk_busco_renaming_dict = {"identifiers": "scaff"} + + btk_busco_df.rename(columns=btk_busco_renaming_dict, inplace=True) + + btk_busco_selected_cols = [ + col + for col in btk_busco_df.columns + if col + in [ + "scaff", + "buscogenes_superkingdom", + "buscogenes_kingdom", + "buscogenes_phylum", + "buscogenes_class", + "buscogenes_order", + "buscogenes_family", + "buscogenes_genus", + "buscogenes_species", + "buscoregions_superkingdom", + "buscoregions_kingdom", + "buscoregions_phylum", + "buscoregions_class", + "buscoregions_order", + "buscoregions_family", + "buscoregions_genus", + "buscoregions_species", + ] + ] + if len(btk_busco_selected_cols) > 0: + btk_busco_df = 
btk_busco_df[btk_busco_selected_cols] + else: + btk_busco_df = None + + fcs_gx_df = None + if paths_dict["fcs_gx"] is not None: + fcs_gx_df = pd.read_csv(paths_dict["fcs_gx"], sep=",") + if fcs_gx_df.shape[0] == 0: + sys.stderr.write("No rows were found in FCS-GX output table ({})\n".format(paths_dict["fcs_gx"])) + fcs_gx_df = None + + nt_blast_df = None + if paths_dict["nt_blast"] is not None: + nt_blast_df = pd.read_csv(paths_dict["nt_blast"], sep=",") + if nt_blast_df.shape[0] == 0: + sys.stderr.write("No rows were found in nt BLAST output table ({})\n".format(paths_dict["nt_blast"])) + nt_blast_df = None + + nr_diamond_df = None + if paths_dict["nr_diamond"] is not None: + nr_diamond_df = pd.read_csv(paths_dict["nr_diamond"], sep=",") + if nr_diamond_df.shape[0] == 0: + sys.stderr.write("No rows were found in nr Diamond output table ({})\n".format(paths_dict["nr_diamond"])) + nr_diamond_df = None + + uniprot_diamond_df = None + if paths_dict["uniprot_diamond"] is not None: + uniprot_diamond_df = pd.read_csv(paths_dict["uniprot_diamond"], sep=",") + if uniprot_diamond_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in Uniprot Diamond output table ({})\n".format(paths_dict["uniprot_diamond"]) + ) + uniprot_diamond_df = None + + cobiontid_markerscan_df = None + if paths_dict["cobiontid_markerscan"] is not None: + cobiontid_markerscan_df = pd.read_csv(paths_dict["cobiontid_markerscan"], sep=",") + if cobiontid_markerscan_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in CobiontID MarkerScan output table ({})\n".format( + paths_dict["cobiontid_markerscan"] + ) + ) + uniprot_diamond_df = None + + contigviz_df = None + if paths_dict["contigviz"] is not None: + contigviz_df = pd.read_csv(paths_dict["contigviz"], sep=",") + if contigviz_df.shape[0] == 0: + sys.stderr.write("No rows were found in ContigViz output table ({})\n".format(paths_dict["contigviz"])) + contigviz_df = None + + if coverage_df is not None: + df = pd.merge(df, coverage_df, 
on="scaff", how="outer") + if tiara_df is not None: + df = pd.merge(df, tiara_df, on="scaff", how="outer") + if bacterial_kraken_df is not None: + df = pd.merge(df, bacterial_kraken_df, on="scaff", how="outer") + if nt_kraken_df is not None: + df = pd.merge(df, nt_kraken_df, on="scaff", how="outer") + if dim_reduction_df is not None: + df = pd.merge(df, dim_reduction_df, on="scaff", how="outer") + if nt_blast_df is not None: + df = pd.merge(df, nt_blast_df, on="scaff", how="outer") + if nr_diamond_df is not None: + df = pd.merge(df, nr_diamond_df, on="scaff", how="outer") + if uniprot_diamond_df is not None: + df = pd.merge(df, uniprot_diamond_df, on="scaff", how="outer") + if fcs_gx_df is not None: + df = pd.merge(df, fcs_gx_df, on="scaff", how="outer") + if cobiontid_markerscan_df is not None: + df = pd.merge(df, cobiontid_markerscan_df, on="scaff", how="outer") + if contigviz_df is not None: + df = pd.merge(df, contigviz_df, on="scaff", how="outer") + if btk_df is not None: + df = pd.merge(df, btk_df, on="scaff", how="outer") + if btk_busco_df is not None: + df = pd.merge(df, btk_busco_df, on="scaff", how="outer") + + return df + + +def main(args): + paths_dict = dict() + paths_dict["gc_content"] = args.gc_cov + paths_dict["coverage"] = args.coverage + paths_dict["tiara"] = args.tiara + paths_dict["bacterial_kraken"] = args.bacterial_kraken + paths_dict["nt_kraken"] = args.nt_kraken + paths_dict["nt_blast"] = args.nt_blast + paths_dict["dim_reduction_embeddings"] = args.dim_reduction_embeddings + paths_dict["nr_diamond"] = args.nr_diamond + paths_dict["uniprot_diamond"] = args.uniprot_diamond + paths_dict["cobiontid_markerscan"] = args.markerscan + paths_dict["contigviz"] = args.contigviz + paths_dict["blobtoolkit"] = args.blobtoolkit + paths_dict["btk_busco"] = args.busco_btk + paths_dict["fcs_gx"] = args.fcs_gx + + required_files = ["gc_content"] + + paths_dict = check_paths(paths_dict, required_files) + df = load_and_merge_dataframes(paths_dict) + 
df.to_csv(f"{args.sample_name}_contamination_check_merged_table.csv", index=False) + + if ( + paths_dict["nt_blast"] + and paths_dict["nr_diamond"] + and paths_dict["uniprot_diamond"] + and paths_dict["coverage"] + and paths_dict["tiara"] + and paths_dict["nt_kraken"] + ): + process_results_tables_command = f"process_result_tables.py . {args.sample_name}" + gpf.run_system_command(process_results_tables_command) + else: + sys.stderr.write( + f"Skipping generating the {args.sample_name}_phylum_counts_and_coverage.csv file, as the variables used in this run do not include all the required variables for this (nt_blast, nr_diamond, uniprot_diamond, coverage, tiara, nt_kraken)\n" + ) + + +if __name__ == "__main__": + main(parse_args()) From 5b623512703347a797b858fddb5e90345488e2c6 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 13:57:53 +0100 Subject: [PATCH 032/117] Skeleton module for the sanger-tol_btk module/pipeline --- modules/local/sanger_tol_btk.nf | 77 +++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 modules/local/sanger_tol_btk.nf diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf new file mode 100644 index 00000000..3db52197 --- /dev/null +++ b/modules/local/sanger_tol_btk.nf @@ -0,0 +1,77 @@ +process SANGER_TOL_BTK { + tag "$meta.id" + label 'process_low' + + input: + tuple val(meta), path(reference, stageAs: "REFERENCE.fa") + path samplesheet_csv + path blastp + path blastn + path blastx + path btk_config + path tax_dump + val taxon + val gca_accession + + output: + path("blobtoolkit/$gca_accession"), emit: btk_results + path("blobtoolkit/plots"), emit: btk_plots + path("blobktoolkit/busco"), emit: btk_busco + path("blobktoolkit/multiqc"), emit: btk_multiqc + path("blobtoolkit_pipeline_info"), emit: btk_pipeline + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def executor = task.ext.executor ?: "" + def profiles = task.ext.profiles 
?: "" + def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" + """ + $executor 'nextflow run sanger-tol/blobtoolkit \\ + -profile $profiles \\ + --input $samplesheet_csv \\ + --outdir ${meta.id}_btk_out \\ + --fasta $reference \\ + --accession $gc_accession \\ + --taxon $taxon \\ + --taxdump $tax_dump \\ + --blastp $blastp \\ + --blastn $blastn \\ + --blastx $blastx \\ + -c $btk_config' + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + Nextflow: \$(nextflow -v | cut -d " " -f3) + executor system: $get_version + END_VERSIONS + """ + + stub: + """ + mkdir -p blobtoolkit/$gca_accession + touch blobtoolkit/$gca_accession/test.json.gz + + mkdir blobtoolkit/plots + touch blobtoolkit/plots/test.png + + mkdir blobktoolkit/busco + touch blobtoolkit/busco/test.batch_summary.txt + touch blobtoolkit/busco/test.fasta.txt + touch blobtoolkit/busco/test.json + + mkdir blobktoolkit/multiqc + mkdir blobktoolkit/multiqc/multiqc_data + mkdir blobktoolkit/multiqc/multiqc_plots + touch blobktoolkit/multiqc/multiqc_report.html + + mv pipeline_into blobtoolkit_pipeline_info + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + parse_fcsgx_result: \$(parse_fcsgx_result.py -v) + END_VERSIONS + """ +} \ No newline at end of file From 8f38b5b4ff009b292f31a26dd858e3d6d5cbe1a3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 14:03:44 +0100 Subject: [PATCH 033/117] Adding the expected values for the sanger-tol-blobltoolkit pipeline/module --- conf/modules.config | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index 69117d81..223be558 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,13 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] + withName: SANGER_TOL_BTK { + ext.args = "--blastx_outext 'txt'" + ext.executor = "bsub -Is -tty -e test.e -o test.log -n 2 -q oversubscribed -M1400 -R'select[mem>1400] rusage[mem=1400] span[hosts=1]'" + ext.profiles = "singularity,sanger" + ext.get_versions = "lsid | head -n1 | cut -d ',' -f 1" + } + withName: SEQKIT_SLIDING { ext.args = {"-s ${meta.sliding} -W ${meta.window} "} } From d95a6831b34039b6436a096b7c088a74bd82379d Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 14:04:16 +0100 Subject: [PATCH 034/117] adding args --- modules/local/sanger_tol_btk.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 3db52197..87c3d1d2 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -38,7 +38,8 @@ process SANGER_TOL_BTK { --blastp $blastp \\ --blastn $blastn \\ --blastx $blastx \\ - -c $btk_config' + -c $btk_config \\ + $args' cat <<-END_VERSIONS > versions.yml From 5be3f3f32ea33c46cafa086ac0e895371ff2a089 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 14:07:12 +0100 Subject: [PATCH 035/117] Adding IN-DEVELOPMENT banner --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index e4e96212..b5aaaafe 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,17 @@ [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-tol/ascc) +--- + +# THIS IS AN IN-DEVELOPMENT PIPELINE THAT IS CURRENTLY NOT READY FOR ANY USE + +AS SUCH YOU MAY FIND THAT THE DOCUMENTATION DOES NOT MATCH THE CODE AND IT MAY NOT WORK + +ONCE THE PIPELINE REACHES A USABLE STATE A TAGGED RELEASE/PRE-RELEASE WILL BE MADE + +--- + + ## Introduction 
**sanger-tol/ascc** is a bioinformatics pipeline that ... From a0d5fae5b977a4de77c026f091655f22987f02d1 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 14:21:25 +0100 Subject: [PATCH 036/117] linting fix --- bin/create_btk_dataset.py | 102 +++++++++++++++++++------ bin/merge_btk_datasets.py | 64 ++++++++++++---- bin/remove_fcs_gx_and_tiara_contams.py | 41 +++++++--- 3 files changed, 158 insertions(+), 49 deletions(-) diff --git a/bin/create_btk_dataset.py b/bin/create_btk_dataset.py index 53b95455..d904d2ed 100755 --- a/bin/create_btk_dataset.py +++ b/bin/create_btk_dataset.py @@ -16,7 +16,9 @@ def create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name): """ if ".gz" in assembly_alias: assembly_alias = assembly_alias.replace(".gz", "_gz") - out_string = "assembly:\n accession: NA\n alias: {}\n record_type: scaffold\n bioproject: NA\n biosample: NA\ntaxon:\n name: {}".format(assembly_alias, taxon_name) + out_string = "assembly:\n accession: NA\n alias: {}\n record_type: scaffold\n bioproject: NA\n biosample: NA\ntaxon:\n name: {}".format( + assembly_alias, taxon_name + ) with open(assembly_yaml_path, "w") as f: f.write(out_string) @@ -27,7 +29,7 @@ def tiara_results_to_btk_format(tiara_results_path, outfile_path): added to a BlobToolKit dataset """ tiara_data = gpf.l(tiara_results_path) - tiara_data = tiara_data[1:len(tiara_data)] + tiara_data = tiara_data[1 : len(tiara_data)] with open(outfile_path, "w") as f: f.write("identifier\ttiara\n") for line in tiara_data: @@ -69,7 +71,9 @@ def add_custom_variables_to_btk_dataset(pipeline_run_folder, btk_dataset_folder) """ pipeline_output_folder = pipeline_run_folder + "/collected_tables" if os.path.isdir(pipeline_output_folder) == False: - sys.stderr.write("The directory for the output tables of the pipeline ({}) was not found\n".format(pipeline_output_folder)) + sys.stderr.write( + "The directory for the output tables of the pipeline ({}) was not found\n".format(pipeline_output_folder) + ) 
sys.exit(1) if os.path.isdir(btk_dataset_folder) == False: sys.stderr.write("The BlobToolKit dataset directory ({}) was not found\n".format(btk_dataset_folder)) @@ -78,45 +82,72 @@ def add_custom_variables_to_btk_dataset(pipeline_run_folder, btk_dataset_folder) if os.path.isfile(tiara_raw_output_path) and os.stat(tiara_raw_output_path).st_size > 0: tiara_reformatted_output_path = pipeline_output_folder + "/tiara_out_btk_format.tsv" tiara_results_to_btk_format(tiara_raw_output_path, tiara_reformatted_output_path) - add_tiara_command = 'blobtools add --text {} --text-delimiter "\t" --text-cols "identifier=identifiers,tiara=tiara" --text-header {}'.format(tiara_reformatted_output_path, btk_dataset_folder) + add_tiara_command = 'blobtools add --text {} --text-delimiter "\t" --text-cols "identifier=identifiers,tiara=tiara" --text-header {}'.format( + tiara_reformatted_output_path, btk_dataset_folder + ) gpf.run_system_command(add_tiara_command) kmers_dim_reduction_output_path = pipeline_output_folder + "/kmers_dim_reduction_embeddings.csv" if os.path.isfile(kmers_dim_reduction_output_path) and os.stat(kmers_dim_reduction_output_path).st_size > 0: used_dim_reduction_methods = detect_dim_reduction_methods(kmers_dim_reduction_output_path) for dim_reduction_method in used_dim_reduction_methods: - add_embedding_command = 'blobtools add --text {path} --text-delimiter "," --text-cols scaff=identifiers,embedding_x_{dim_reduction_method}=embedding_x_{dim_reduction_method},embedding_y_{dim_reduction_method}=embedding_y_{dim_reduction_method} --text-header {btk_dataset_folder}'.format(path=kmers_dim_reduction_output_path, dim_reduction_method=dim_reduction_method, btk_dataset_folder=btk_dataset_folder) + add_embedding_command = 'blobtools add --text {path} --text-delimiter "," --text-cols scaff=identifiers,embedding_x_{dim_reduction_method}=embedding_x_{dim_reduction_method},embedding_y_{dim_reduction_method}=embedding_y_{dim_reduction_method} --text-header 
{btk_dataset_folder}'.format( + path=kmers_dim_reduction_output_path, + dim_reduction_method=dim_reduction_method, + btk_dataset_folder=btk_dataset_folder, + ) gpf.run_system_command(add_embedding_command) - kraken_lineage_path = pipeline_output_folder + "/nt_kraken_lineage.txt" if os.path.isfile(kraken_lineage_path) and os.stat(kraken_lineage_path).st_size > 0: for taxonomy_level in ("species", "genus", "family", "order", "class", "phylum", "kingdom", "domain"): - add_kraken_command = 'blobtools add --text {} --text-delimiter "," --text-cols scaff=identifiers,nt_kraken_{}=nt_kraken_{} --text-header {}'.format(kraken_lineage_path, taxonomy_level, taxonomy_level, btk_dataset_folder) + add_kraken_command = 'blobtools add --text {} --text-delimiter "," --text-cols scaff=identifiers,nt_kraken_{}=nt_kraken_{} --text-header {}'.format( + kraken_lineage_path, taxonomy_level, taxonomy_level, btk_dataset_folder + ) gpf.run_system_command(add_kraken_command) fcs_gx_output_path = pipeline_output_folder + "/fcs-gx_summary.csv" if os.path.isfile(fcs_gx_output_path) and os.stat(fcs_gx_output_path).st_size > 0: - add_fcs_gx_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,fcs_gx_top_tax_name=fcs_gx_top_tax_name,fcs_gx_div=fcs_gx_div,fcs_gx_action=fcs_gx_action" --text-header {}'.format(fcs_gx_output_path, btk_dataset_folder) + add_fcs_gx_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,fcs_gx_top_tax_name=fcs_gx_top_tax_name,fcs_gx_div=fcs_gx_div,fcs_gx_action=fcs_gx_action" --text-header {}'.format( + fcs_gx_output_path, btk_dataset_folder + ) gpf.run_system_command(add_fcs_gx_results_command) - #cobiontid_markerscan_json_file_path = run_folder + "/" + sample_id + ".json" - #cobiontid_scaffs_json_to_csv(json_file_path, out_folder + "/cobiontid_markerscan.csv") + # cobiontid_markerscan_json_file_path = run_folder + "/" + sample_id + ".json" + # cobiontid_scaffs_json_to_csv(json_file_path, 
out_folder + "/cobiontid_markerscan.csv") cobiontid_markerscan_output_path = pipeline_output_folder + "/cobiontid_markerscan.csv" if os.path.isfile(cobiontid_markerscan_output_path) and os.stat(cobiontid_markerscan_output_path).st_size > 0: - add_cobiontid_markerscan_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,CobiontID_MarkerScan_embl_ebi_ena=CobiontID_MarkerScan_embl_ebi_ena,CobiontID_MarkerScan_slv=CobiontID_MarkerScan_slv,CobiontID_MarkerScan_Cluster=CobiontID_MarkerScan_Cluster" --text-header {}'.format(cobiontid_markerscan_output_path, btk_dataset_folder) + add_cobiontid_markerscan_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,CobiontID_MarkerScan_embl_ebi_ena=CobiontID_MarkerScan_embl_ebi_ena,CobiontID_MarkerScan_slv=CobiontID_MarkerScan_slv,CobiontID_MarkerScan_Cluster=CobiontID_MarkerScan_Cluster" --text-header {}'.format( + cobiontid_markerscan_output_path, btk_dataset_folder + ) gpf.run_system_command(add_cobiontid_markerscan_results_command) cobiontid_contigviz_output_path = pipeline_output_folder + "/contigviz_results.csv" if os.path.isfile(cobiontid_contigviz_output_path) and os.stat(cobiontid_contigviz_output_path).st_size > 0: - add_cobiontid_contigviz_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,ContigViz_UMAP1=ContigViz_UMAP1,ContigViz_UMAP2=ContigViz_UMAP2,ContigViz_Hexamer_continuous=ContigViz_Hexamer_continuous,ContigViz_Hexamer_digitized=ContigViz_Hexamer_digitized,ContigViz_FastK_continuous=ContigViz_FastK_continuous,ContigViz_FastK_digitized=ContigViz_FastK_digitized,ContigViz_Unique_15mers_continuous=ContigViz_Unique_15mers_continuous,ContigViz_Unique_15mers_digitized=ContigViz_Unique_15mers_digitized,ContigViz_Coverage_continuous=ContigViz_Coverage_continuous,ContigViz_Coverage_digitized=ContigViz_Coverage_digitized" --text-header {}'.format(cobiontid_contigviz_output_path, 
btk_dataset_folder) + add_cobiontid_contigviz_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,ContigViz_UMAP1=ContigViz_UMAP1,ContigViz_UMAP2=ContigViz_UMAP2,ContigViz_Hexamer_continuous=ContigViz_Hexamer_continuous,ContigViz_Hexamer_digitized=ContigViz_Hexamer_digitized,ContigViz_FastK_continuous=ContigViz_FastK_continuous,ContigViz_FastK_digitized=ContigViz_FastK_digitized,ContigViz_Unique_15mers_continuous=ContigViz_Unique_15mers_continuous,ContigViz_Unique_15mers_digitized=ContigViz_Unique_15mers_digitized,ContigViz_Coverage_continuous=ContigViz_Coverage_continuous,ContigViz_Coverage_digitized=ContigViz_Coverage_digitized" --text-header {}'.format( + cobiontid_contigviz_output_path, btk_dataset_folder + ) gpf.run_system_command(add_cobiontid_contigviz_results_command) - -def main(assembly_fasta_path, dataset_folder, pipeline_run_folder, assembly_title, taxon_name, taxid, blastn_hits_path, uniprot_diamond_hits_path, nr_diamond_hits_path, mapped_reads_path, taxdump_path, threads, assembly_alias, dry_run_flag): - - #out_folder = pipeline_run_folder + "/collected_tables" +def main( + assembly_fasta_path, + dataset_folder, + pipeline_run_folder, + assembly_title, + taxon_name, + taxid, + blastn_hits_path, + uniprot_diamond_hits_path, + nr_diamond_hits_path, + mapped_reads_path, + taxdump_path, + threads, + assembly_alias, + dry_run_flag, +): + + # out_folder = pipeline_run_folder + "/collected_tables" if assembly_alias == "": assembly_alias = assembly_title @@ -131,10 +162,11 @@ def main(assembly_fasta_path, dataset_folder, pipeline_run_folder, assembly_titl if dry_run_flag == False: create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name) - blobtools_create_command = "blobtools create --fasta {} --meta {} --taxid {} --taxdump {} {}".format(assembly_fasta_path, assembly_yaml_path, taxid, taxdump_path, dataset_folder) + blobtools_create_command = "blobtools create --fasta {} --meta {} --taxid {} --taxdump {} 
{}".format( + assembly_fasta_path, assembly_yaml_path, taxid, taxdump_path, dataset_folder + ) gpf.run_system_command(blobtools_create_command, dry_run=dry_run_flag) - hits_file_paths = [blastn_hits_path, uniprot_diamond_hits_path, nr_diamond_hits_path] hits_file_paths = [n for n in hits_file_paths if os.path.isfile(n) is True and os.stat(n).st_size > 0] @@ -145,19 +177,20 @@ def main(assembly_fasta_path, dataset_folder, pipeline_run_folder, assembly_titl add_hits_command += " --taxrule bestsum --taxdump {} {}".format(taxdump_path, dataset_folder) gpf.run_system_command(add_hits_command, dry_run=dry_run_flag) - if os.path.isfile(mapped_reads_path) is True and os.stat(mapped_reads_path).st_size > 0: add_cov_command = "blobtools add --cov {} --threads {} {}".format(mapped_reads_path, threads, dataset_folder) gpf.run_system_command(add_cov_command, dry_run=dry_run_flag) - #export_table_command = "blobtools filter --table {}/btk_summary_table_basic.tsv {}".format(out_folder, dataset_folder) + # export_table_command = "blobtools filter --table {}/btk_summary_table_basic.tsv {}".format(out_folder, dataset_folder) add_custom_variables_to_btk_dataset(pipeline_run_folder, dataset_folder) - export_table_command = "blobtools filter --table {}/collected_tables/btk_summary_table_full.tsv {}".format(pipeline_run_folder, dataset_folder) + export_table_command = "blobtools filter --table {}/collected_tables/btk_summary_table_full.tsv {}".format( + pipeline_run_folder, dataset_folder + ) gpf.run_system_command(export_table_command, dry_run=dry_run_flag) # json_file_path = run_folder + "/" + sample_id + ".json" - #cobiontid_scaffs_json_to_csv(json_file_path, out_folder + "/cobiontid_markerscan.csv") + # cobiontid_scaffs_json_to_csv(json_file_path, out_folder + "/cobiontid_markerscan.csv") if __name__ == "__main__": @@ -171,10 +204,29 @@ def main(assembly_fasta_path, dataset_folder, pipeline_run_folder, assembly_titl parser.add_argument("blastn_hits_path", type=str, help="Path to 
blastn hits file") parser.add_argument("uniprot_diamond_hits_path", type=str, help="Path to UNIPROT Diamond BLASTX hits file") parser.add_argument("nr_diamond_hits_path", type=str, help="Path to nr Diamond BLASTX hits file") - parser.add_argument("mapped_reads_path", type=str, help="Path to the BAM file with mapped reads for coverage estimation") + parser.add_argument( + "mapped_reads_path", type=str, help="Path to the BAM file with mapped reads for coverage estimation" + ) parser.add_argument("-td", "--taxdump_path", type=str, help="Path to the directory with NCBI taxdump files") parser.add_argument("--threads", type=int, default=1, help="Number of CPU threads (default: 1)") parser.add_argument("--assembly_alias", type=str, default="", help="Assembly alias") - parser.add_argument("--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)") + parser.add_argument( + "--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)" + ) args = parser.parse_args() - main(args.assembly_fasta_path, args.dataset_folder, args.pipeline_run_folder, args.assembly_title, args.taxon_name, args.taxid, args.blastn_hits_path, args.uniprot_diamond_hits_path, args.nr_diamond_hits_path, args.mapped_reads_path, args.taxdump_path, args.threads, args.assembly_alias,args.dry_run) \ No newline at end of file + main( + args.assembly_fasta_path, + args.dataset_folder, + args.pipeline_run_folder, + args.assembly_title, + args.taxon_name, + args.taxid, + args.blastn_hits_path, + args.uniprot_diamond_hits_path, + args.nr_diamond_hits_path, + args.mapped_reads_path, + args.taxdump_path, + args.threads, + args.assembly_alias, + args.dry_run, + ) diff --git a/bin/merge_btk_datasets.py b/bin/merge_btk_datasets.py index c3af0a1d..4054d121 100755 --- a/bin/merge_btk_datasets.py +++ b/bin/merge_btk_datasets.py @@ -13,7 +13,7 @@ def load_json(filename): - """ Loads a JSON file and returns it as a dictionary """ + """Loads a JSON 
file and returns it as a dictionary""" with open(filename) as f: return json.load(f) @@ -24,7 +24,9 @@ def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined """ for folder in (main_btk_dataset_folder, btk_busco_dataset_folder): if os.path.isdir(folder) is False: - sys.stderr.write(f"Skipping the merging of the main BTK dataset and the BUSCO-based BTK dataset, as directory {folder} was not found)\n") + sys.stderr.write( + f"Skipping the merging of the main BTK dataset and the BUSCO-based BTK dataset, as directory {folder} was not found)\n" + ) sys.exit(0) main_btk_json_path = f"{main_btk_dataset_folder}/meta.json" @@ -52,33 +54,44 @@ def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined if field_id not in keys_to_skip: merged_dict["fields"].append(field) - meta_json_outpath = f"{combined_dataset_folder}/meta.json" with open(meta_json_outpath, "w") as json_outfile: json.dump(merged_dict, json_outfile, indent=1, sort_keys=True) -def main(main_btk_dataset_folder, btk_busco_dataset_folder, combined_dataset_folder, pipeline_output_folder, skip_renaming_folders): +def main( + main_btk_dataset_folder, + btk_busco_dataset_folder, + combined_dataset_folder, + pipeline_output_folder, + skip_renaming_folders, +): if os.path.isdir(main_btk_dataset_folder) is False: sys.stderr.write(f"The BlobToolKit dataset ({main_btk_dataset_folder}) was not found\n") sys.exit(1) if os.path.isdir(btk_busco_dataset_folder) is False: - sys.stderr.write(f"The blobdir of BUSCO-based BlobToolKit Snakemake pipeline run does not exist at {btk_busco_dataset_folder}, skipping the merging of BTK datasets\n") + sys.stderr.write( + f"The blobdir of BUSCO-based BlobToolKit Snakemake pipeline run does not exist at {btk_busco_dataset_folder}, skipping the merging of BTK datasets\n" + ) sys.exit(0) not_copying_list = ["identifiers.json", "gc_data.json", "length_data.json", "ncount_data.json", "meta.json"] Path(combined_dataset_folder).mkdir(parents=True, 
exist_ok=True) - main_btk_dataset_files = [f for f in os.listdir(main_btk_dataset_folder) if os.path.isfile(os.path.join(main_btk_dataset_folder, f))] + main_btk_dataset_files = [ + f for f in os.listdir(main_btk_dataset_folder) if os.path.isfile(os.path.join(main_btk_dataset_folder, f)) + ] main_btk_dataset_files = [f for f in main_btk_dataset_files if f not in not_copying_list] for main_btk_dataset_file in main_btk_dataset_files: main_btk_dataset_file_full_path = f"{main_btk_dataset_folder}/{main_btk_dataset_file}" copied_file_full_path = f"{combined_dataset_folder}/{main_btk_dataset_file}" shutil.copy(main_btk_dataset_file_full_path, copied_file_full_path) - btk_busco_files = [f for f in os.listdir(btk_busco_dataset_folder) if os.path.isfile(os.path.join(btk_busco_dataset_folder, f))] + btk_busco_files = [ + f for f in os.listdir(btk_busco_dataset_folder) if os.path.isfile(os.path.join(btk_busco_dataset_folder, f)) + ] for btk_busco_file in btk_busco_files: btk_busco_file_full_path = f"{btk_busco_dataset_folder}/{btk_busco_file}" copied_file_full_path = f"{combined_dataset_folder}/{btk_busco_file}" @@ -98,10 +111,35 @@ def main(main_btk_dataset_folder, btk_busco_dataset_folder, combined_dataset_fol if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("main_btk_dataset_folder", type=str, help="Path to the BTK dataset (blobdir) created from the output of the steps of this pipeline") - parser.add_argument("btk_busco_dataset_folder", type=str, help="Path to the BTK dataset (blobdir) created by the BUSCO-based Snakemake BTK pipeline") - parser.add_argument("combined_dataset_folder", type=str, help="Path for creating a new BTK dataset (blobdir) that combines the two input BTK datasets") - parser.add_argument("pipeline_output_folder", type=str, help="Path to the directory with the output tables of the pipeline") - parser.add_argument("--skip_renaming_folders", dest="skip_renaming_folders", help="Optional boolean 
argument. If set to true, the script skips the renaming of the input BTK dataset directories after creating the merged BTK dataset", action="store_true") + parser.add_argument( + "main_btk_dataset_folder", + type=str, + help="Path to the BTK dataset (blobdir) created from the output of the steps of this pipeline", + ) + parser.add_argument( + "btk_busco_dataset_folder", + type=str, + help="Path to the BTK dataset (blobdir) created by the BUSCO-based Snakemake BTK pipeline", + ) + parser.add_argument( + "combined_dataset_folder", + type=str, + help="Path for creating a new BTK dataset (blobdir) that combines the two input BTK datasets", + ) + parser.add_argument( + "pipeline_output_folder", type=str, help="Path to the directory with the output tables of the pipeline" + ) + parser.add_argument( + "--skip_renaming_folders", + dest="skip_renaming_folders", + help="Optional boolean argument. If set to true, the script skips the renaming of the input BTK dataset directories after creating the merged BTK dataset", + action="store_true", + ) args = parser.parse_args() - main(args.main_btk_dataset_folder, args.btk_busco_dataset_folder, args.combined_dataset_folder, args.pipeline_output_folder, args.skip_renaming_folders) \ No newline at end of file + main( + args.main_btk_dataset_folder, + args.btk_busco_dataset_folder, + args.combined_dataset_folder, + args.pipeline_output_folder, + args.skip_renaming_folders, + ) diff --git a/bin/remove_fcs_gx_and_tiara_contams.py b/bin/remove_fcs_gx_and_tiara_contams.py index 8a89f44f..aa0aadb8 100755 --- a/bin/remove_fcs_gx_and_tiara_contams.py +++ b/bin/remove_fcs_gx_and_tiara_contams.py @@ -10,6 +10,7 @@ from pathlib import Path import csv + def get_domain_from_taxid(query_taxid, rankedlineage_path): """ Input: 1) a taxID, 2) path to the NCBI rankedlineage.dmp file @@ -31,7 +32,11 @@ def get_domain_from_taxid(query_taxid, rankedlineage_path): sys.exit(1) break if domain is None: - sys.stderr.write("The domain for taxid ({}) was not 
found in the NCBI rankedlineage.dmp file ({})\n".format(query_taxid, rankedlineage_path)) + sys.stderr.write( + "The domain for taxid ({}) was not found in the NCBI rankedlineage.dmp file ({})\n".format( + query_taxid, rankedlineage_path + ) + ) sys.exit(1) return domain @@ -108,32 +113,41 @@ def filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path): split_seq = gpf.split_with_fixed_row_length(seq, 80) out_list.extend(split_seq) else: - sys.stderr.write(f"Excluding the sequence {header} from the filtered assembly ({filtered_assembly_path}), as it appears to be a contaminant based on FCS-GX and/or Tiara results\n") + sys.stderr.write( + f"Excluding the sequence {header} from the filtered assembly ({filtered_assembly_path}), as it appears to be a contaminant based on FCS-GX and/or Tiara results\n" + ) gpf.export_list_as_line_break_separated_file(out_list, filtered_assembly_path) - def main(pipeline_run_folder, taxid, rankedlineage_path): if taxid == -1: - sys.stderr.write("The filtering of assembly based on FCS-GX and Tiara results requires a taxID but a valid taxID has not been provided (the provided taxID is -1, which is a placeholder value)\n") + sys.stderr.write( + "The filtering of assembly based on FCS-GX and Tiara results requires a taxID but a valid taxID has not been provided (the provided taxID is -1, which is a placeholder value)\n" + ) assembly_path = f"{pipeline_run_folder}/fasta/assembly.fasta" tiara_results_path = f"{pipeline_run_folder}/collected_tables/tiara_out.txt" fcs_gx_summary_path = f"{pipeline_run_folder}/collected_tables/fcs-gx_summary.csv" filtered_assembly_path = f"{pipeline_run_folder}/fasta/filtered/assembly_autofiltered.fasta" - assembly_filtering_summary_table_path = f"{pipeline_run_folder}/collected_tables/fcs-gx_and_tiara_combined_summary.csv" + assembly_filtering_summary_table_path = ( + f"{pipeline_run_folder}/collected_tables/fcs-gx_and_tiara_combined_summary.csv" + ) excluded_seq_list_path = 
f"{pipeline_run_folder}/collected_tables/assembly_filtering_removed_sequences.txt" Path(f"{pipeline_run_folder}/fasta/filtered").mkdir(parents=True, exist_ok=True) if os.path.isfile(rankedlineage_path) is False: - sys.stderr.write(f"The NCBI rankedlineage.dmp file was not found at the expected location ({rankedlineage_path})\n") + sys.stderr.write( + f"The NCBI rankedlineage.dmp file was not found at the expected location ({rankedlineage_path})\n" + ) sys.exit(1) if os.path.isfile(tiara_results_path) is False: sys.stderr.write(f"The Tiara output file was not found at the expected location ({tiara_results_path})\n") sys.exit(1) if os.path.isfile(fcs_gx_summary_path) is False: - sys.stderr.write(f"The FCS-GX results summary file was not found at the expected location ({fcs_gx_summary_path})\n") + sys.stderr.write( + f"The FCS-GX results summary file was not found at the expected location ({fcs_gx_summary_path})\n" + ) sys.exit(1) if os.path.isfile(assembly_path) is False: sys.stderr.write(f"The assembly FASTA file was not found at the expected location ({assembly_path})\n") @@ -159,12 +173,16 @@ def main(pipeline_run_folder, taxid, rankedlineage_path): combined_action = "EXCLUDE" if combined_action == "EXCLUDE": scaffs_to_exclude.append(scaff) - combined_action_dict[scaff] = {"fcs_gx_action": fcs_gx_action, "tiara_action": tiara_action, "combined_action": combined_action} + combined_action_dict[scaff] = { + "fcs_gx_action": fcs_gx_action, + "tiara_action": tiara_action, + "combined_action": combined_action, + } filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path) gpf.export_list_as_line_break_separated_file(scaffs_to_exclude, excluded_seq_list_path) - #csv_writer = csv.writer(open(assembly_filtering_summary_table_path, "w")) - #for key, value in combined_action_dict.items(): + # csv_writer = csv.writer(open(assembly_filtering_summary_table_path, "w")) + # for key, value in combined_action_dict.items(): # line = [key] # for ik, iv in 
value.items(): # line.append(ik) @@ -177,10 +195,11 @@ def main(pipeline_run_folder, taxid, rankedlineage_path): out_csv_list.append(out_line) gpf.export_list_as_line_break_separated_file(out_csv_list, assembly_filtering_summary_table_path) + if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("pipeline_run_folder", type=str, help="Path to the directory where the pipeline is be run") parser.add_argument("taxid", type=int, help="NCBI taxonomy ID of the species") parser.add_argument("ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy") args = parser.parse_args() - main(args.pipeline_run_folder, args.taxid, args.ncbi_rankedlineage_path) \ No newline at end of file + main(args.pipeline_run_folder, args.taxid, args.ncbi_rankedlineage_path) From e6ea8adbe8d326fc5f66bba5ed0e7b9a1fe9cd16 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 14:23:03 +0100 Subject: [PATCH 037/117] Prettier Fix --- modules.json | 90 ++++++++++++++-------------------------------------- 1 file changed, 23 insertions(+), 67 deletions(-) diff --git a/modules.json b/modules.json index 98d5341b..f8c9ac15 100644 --- a/modules.json +++ b/modules.json @@ -8,162 +8,118 @@ "blast/blastn": { "branch": "master", "git_sha": "acacb4075ef46fa74630aa3f4b0684f1021d5930", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" }, "blast/makeblastdb": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "custom/getchromsizes": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": 
"modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff" }, "diamond/blastx": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fastqc": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fcs/fcsadaptor": { "branch": "master", "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fcs/fcsgx": { "branch": "master", "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gnu/sort": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "kraken2/kraken2": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff" }, "minimap2/align": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "minimap2/index": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "ncbitools/vecscreen": { "branch": "master", "git_sha": "1e4ac4aa2c612f9547f79f02ef7c651ccc9f657b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/depth": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", "git_sha": 
"8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/index": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/merge": { "branch": "master", "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/sort": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/view": { "branch": "master", "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "seqkit/sliding": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tiara/tiara": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } } } } -} \ No newline at end of file +} From 664eaf03a86379e94fc9c38e06fa56ebe2bd4278 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 14:24:17 +0100 Subject: [PATCH 038/117] Adding skeleton module --- modules/local/sanger_tol_btk.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 87c3d1d2..6fa27d66 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -75,4 +75,4 @@ process SANGER_TOL_BTK { parse_fcsgx_result: \$(parse_fcsgx_result.py -v) END_VERSIONS """ -} \ No newline at end of file +} From bdf0248347b044f5205f5050579780f94d5b67b8 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 14:26:30 +0100 Subject: [PATCH 039/117] Formatting and EditorConfig linting --- modules/local/autofiltering.nf | 2 +- modules/local/convert_to_hits_file.nf | 2 +- 
modules/local/create_btk_dataset.nf | 2 +- modules/local/merge_btk_datasets.nf | 2 +- modules/local/trailingns.nf | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index fcf58bc4..c9893fd5 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -30,4 +30,4 @@ process AUTOFILTER_ASSEMBLY { END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/local/convert_to_hits_file.nf b/modules/local/convert_to_hits_file.nf index 4b125444..c7b4e64b 100644 --- a/modules/local/convert_to_hits_file.nf +++ b/modules/local/convert_to_hits_file.nf @@ -22,4 +22,4 @@ process CONVERT_TO_HITS_FILE { convert_to_hits: \$(convert_to_hits.py --version | cut -d' ' -f2) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index c8546c45..2fc5ed49 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -68,4 +68,4 @@ process CREATE_BTK_DATASET { create_btk_dataset: \$(general_purpose_functions.py --version | cut -d' ' -f2) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index 4b6904b2..ee228b2d 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -36,4 +36,4 @@ process MERGE_BTK_DATASETS { create_btk_dataset: \$(general_purpose_functions.py --version | cut -d' ' -f2) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/local/trailingns.nf b/modules/local/trailingns.nf index e69bcbcf..55afa522 100644 --- a/modules/local/trailingns.nf +++ b/modules/local/trailingns.nf @@ -38,4 +38,4 @@ process TRAILINGNS { trim_Ns.py: \$(trim_Ns.py --version | cut -d' ' -f2) END_VERSIONS """ -} \ No newline at end of file +} From 2b892570e06b956393dfa820696650c3e2b59c49 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: 
Fri, 19 Apr 2024 14:27:46 +0100 Subject: [PATCH 040/117] updating container --- modules/local/autofiltering.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index c9893fd5..f8d60fcd 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -2,7 +2,7 @@ process AUTOFILTER_ASSEMBLY { tag "$meta.id" label "process_medium" - container 'docker://quay.io/sanger-tol/ascc_main:0.001-c1' + container 'sanger-tol/ascc_main:0.001-c1' input: tuple val(meta), path(reference) From 005e600237edb7895270d461da7f760f510d87de Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 14:32:07 +0100 Subject: [PATCH 041/117] removed --- bin/ascc_merge_tables.py | 263 --------------------------------------- 1 file changed, 263 deletions(-) delete mode 100755 bin/ascc_merge_tables.py diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py deleted file mode 100755 index 307812ff..00000000 --- a/bin/ascc_merge_tables.py +++ /dev/null @@ -1,263 +0,0 @@ -#!/usr/bin/env python3 -""" -Script for merging contaminant check results into one table -""" - -import argparse -import pandas as pd -import os -import sys -import general_purpose_functions as gpf - - -def check_paths(paths_dict, required_files): - """ - Checks if a required file exists and exits with an error message if it doesn't - """ - out_dict = dict() - for data_type, input_file_path in paths_dict.items(): - out_dict[data_type] = None - if os.path.isfile(input_file_path) == False: - if data_type in required_files: - sys.stderr.write("Input file {} was not found\n".format(input_file_path)) - sys.exit(1) - else: - if os.stat(input_file_path).st_size == 0: - sys.stderr.write("Warning: the file {} is empty and will therefore not be included in the final results\n".format(input_file_path)) - else: - out_dict[data_type] = input_file_path - return out_dict - - -def load_and_merge_dataframes(paths_dict): - """ - Loads 
the tables with individual variables (GC content, coverage, kmer counts etc) and combines them into one table - """ - gc_path = paths_dict["gc_content"] - df = pd.read_csv(gc_path, sep="\t", header=None) - if df.shape[0] > 0: - df.columns = ["scaff", "gc"] - df["gc"] = df["gc"] * 100 - else: - sys.stderr.write("No rows were found in the GC content table ({})\n".format(gc_path)) - sys.exit(1) - - coverage_df = None - coverage_file_path = paths_dict["coverage"] - if coverage_file_path is not None: - if os.stat(coverage_file_path).st_size > 0: - coverage_df = pd.read_csv(coverage_file_path, sep=",", header=None) - if coverage_df.shape[0] > 0: - coverage_df.columns = ["scaff", "coverage"] - else: - sys.stderr.write("No rows were found in the coverages table ({})\n".format(coverage_file_path)) - coverage_df = None - else: - sys.stderr.write("Warning: the output file for PacBio coverage ({}) is empty\n".format(coverage_file_path)) - - tiara_df = None - if paths_dict["tiara"] is not None: - tiara_df = pd.read_csv(paths_dict["tiara"], sep="\t") - if tiara_df.shape[0] > 0: - tiara_df["tiara_classif"] = tiara_df["class_fst_stage"] - tiara_snd_stage_hits = tiara_df.index[tiara_df["class_snd_stage"].notnull()] - tiara_df["tiara_classif"][tiara_snd_stage_hits] = tiara_df["class_snd_stage"][tiara_snd_stage_hits] - tiara_df = tiara_df.iloc[:,[0, 3]] - tiara_df.columns = ["scaff", "tiara_classif"] - else: - sys.stderr.write("No rows were found in Tiara output table ({})\n".format(paths_dict["tiara"])) - tiara_df = None - - bacterial_kraken_df = None - if paths_dict["bacterial_kraken"] is not None: - bacterial_kraken_df = pd.read_csv(paths_dict["bacterial_kraken"], sep=",") - if bacterial_kraken_df.shape[0] > 0: - bacterial_kraken_df.rename(columns={bacterial_kraken_df.columns[0]: "scaff"}, inplace=True) - bacterial_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) - else: - sys.stderr.write("No rows were found in bacterial Kraken output table 
({})\n".format(paths_dict["bacterial_kraken"])) - bacterial_kraken_df = None - - nt_kraken_df = None - if paths_dict["nt_kraken"] is not None: - nt_kraken_df = pd.read_csv(paths_dict["nt_kraken"], sep=",") - if nt_kraken_df.shape[0] > 0: - nt_kraken_df.rename(columns={nt_kraken_df.columns[0]: "scaff"}, inplace=True) - nt_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) - else: - sys.stderr.write("No rows were found in nt Kraken output table ({})\n".format(paths_dict["nt_kraken"])) - nt_kraken_df = None - - dim_reduction_df = None - if paths_dict["dim_reduction_embeddings"] is not None: - dim_reduction_df = pd.read_csv(paths_dict["dim_reduction_embeddings"], sep=",") - if dim_reduction_df.shape[0] == 0: - sys.stderr.write("No rows were found in kmers dimensionality reduction output table ({})\n".format(paths_dict["dim_reduction_embeddings"])) - dim_reduction_df = None - - btk_df = None - - if paths_dict["blobtoolkit"] is not None: - btk_df = pd.read_csv(paths_dict["blobtoolkit"], header=0, delimiter="\t") - if btk_df.shape[0] == 0: - sys.stderr.write("No rows were found in the BlobToolKit results table ({})\n".format(paths_dict["blobtoolkit"])) - sys.exit(1) - btk_renaming_dict = {"identifiers": "scaff", "bestsum_phylum": "btk_bestsum_phylum"} - if "mapped_hifi_reads_sorted_cov" in btk_df.columns: - btk_renaming_dict["mapped_hifi_reads_sorted_cov"] = "btk_cov" - if "bestsum_phylum" in btk_df.columns: - btk_renaming_dict["bestsum_phylum"] = "btk_bestsum_phylum" - #{"identifiers": "scaff", "mapped_hifi_reads_sorted_cov": "btk_cov", "bestsum_phylum": "btk_bestsum_phylum"} - - btk_df.rename(columns = btk_renaming_dict, inplace=True) - - btk_selected_cols = [col for col in btk_df.columns if col in ["scaff", "length", "btk_cov", "btk_bestsum_phylum"]] - if len(btk_selected_cols) > 0: - btk_df = btk_df[btk_selected_cols] - else: - btk_df = None - - btk_busco_df = None - - if paths_dict["btk_busco"] is not None: - btk_busco_df = 
pd.read_csv(paths_dict["btk_busco"], header=0, delimiter="\t") - if btk_busco_df.shape[0] == 0: - sys.stderr.write("No rows were found in the BUSCO-based BlobToolKit results table ({})\n".format(paths_dict["btk_busco"])) - sys.exit(1) - btk_busco_renaming_dict = {"identifiers": "scaff"} - #if "mapped_hifi_reads_sorted_cov" in btk_df.columns: - # btk_renaming_dict["mapped_hifi_reads_sorted_cov"] = "btk_cov" - #if "bestsum_phylum" in btk_df.columns: - # btk_renaming_dict["bestsum_phylum"] = "btk_bestsum_phylum" - #{"identifiers": "scaff", "mapped_hifi_reads_sorted_cov": "btk_cov", "bestsum_phylum": "btk_bestsum_phylum"} - - btk_busco_df.rename(columns = btk_busco_renaming_dict, inplace=True) - - btk_busco_selected_cols = [col for col in btk_busco_df.columns if col in ["scaff", "buscogenes_superkingdom", "buscogenes_kingdom", "buscogenes_phylum", "buscogenes_class", "buscogenes_order", "buscogenes_family", "buscogenes_genus", "buscogenes_species", "buscoregions_superkingdom", "buscoregions_kingdom", "buscoregions_phylum", "buscoregions_class", "buscoregions_order", "buscoregions_family", "buscoregions_genus", "buscoregions_species"]] - if len(btk_busco_selected_cols) > 0: - btk_busco_df = btk_busco_df[btk_busco_selected_cols] - else: - btk_busco_df = None - - #df = pd.merge(main_df, btk_df, on="scaff", how="outer") - - - #if paths_dict["blobtoolkit"] is not None: - # #if 'A' in df.columns: - # blobtoolkit_df = pd.read_csv(paths_dict["blobtoolkit"], header=0, delimiter="\t") - # if blobtoolkit_df.shape[0] > 0: - # blobtoolkit_df = blobtoolkit_df[["identifiers", "bestsum_phylum"]] - # blobtoolkit_df.columns = ["scaff", "btk_bestsum_phylum"] - # else: - # sys.stderr.write("No rows were found in BlobToolKit output table ({})\n".format(paths_dict["blobtoolkit"])) - # blobtoolkit_df = None - - fcs_gx_df = None - if paths_dict["fcs_gx"] is not None: - fcs_gx_df = pd.read_csv(paths_dict["fcs_gx"], sep=",") - if fcs_gx_df.shape[0] == 0: - sys.stderr.write("No rows were found 
in FCS-GX output table ({})\n".format(paths_dict["fcs_gx"])) - fcs_gx_df = None - - nt_blast_df = None - if paths_dict["nt_blast"] is not None: - nt_blast_df = pd.read_csv(paths_dict["nt_blast"], sep=",") - if nt_blast_df.shape[0] == 0: - sys.stderr.write("No rows were found in nt BLAST output table ({})\n".format(paths_dict["nt_blast"])) - nt_blast_df = None - nr_diamond_df = None - if paths_dict["nr_diamond"] is not None: - nr_diamond_df = pd.read_csv(paths_dict["nr_diamond"], sep=",") - if nr_diamond_df.shape[0] == 0: - sys.stderr.write("No rows were found in nr Diamond output table ({})\n".format(paths_dict["nr_diamond"])) - nr_diamond_df = None - uniprot_diamond_df = None - if paths_dict["uniprot_diamond"] is not None: - uniprot_diamond_df = pd.read_csv(paths_dict["uniprot_diamond"], sep=",") - if uniprot_diamond_df.shape[0] == 0: - sys.stderr.write("No rows were found in Uniprot Diamond output table ({})\n".format(paths_dict["uniprot_diamond"])) - uniprot_diamond_df = None - cobiontid_markerscan_df = None - if paths_dict["cobiontid_markerscan"] is not None: - cobiontid_markerscan_df = pd.read_csv(paths_dict["cobiontid_markerscan"], sep=",") - if cobiontid_markerscan_df.shape[0] == 0: - sys.stderr.write("No rows were found in CobiontID MarkerScan output table ({})\n".format(paths_dict["cobiontid_markerscan"])) - uniprot_diamond_df = None - contigviz_df = None - if paths_dict["contigviz"] is not None: - contigviz_df = pd.read_csv(paths_dict["contigviz"], sep=",") - if contigviz_df.shape[0] == 0: - sys.stderr.write("No rows were found in ContigViz output table ({})\n".format(paths_dict["contigviz"])) - contigviz_df = None - - - if coverage_df is not None: - df = pd.merge(df, coverage_df, on="scaff", how="outer") - if tiara_df is not None: - df = pd.merge(df, tiara_df, on="scaff", how="outer") - if bacterial_kraken_df is not None: - df = pd.merge(df, bacterial_kraken_df, on="scaff", how="outer") - if nt_kraken_df is not None: - df = pd.merge(df, nt_kraken_df, 
on="scaff", how="outer") - if dim_reduction_df is not None: - df = pd.merge(df, dim_reduction_df, on="scaff", how="outer") - if nt_blast_df is not None: - df = pd.merge(df, nt_blast_df, on="scaff", how="outer") - - if nr_diamond_df is not None: - df = pd.merge(df, nr_diamond_df, on="scaff", how="outer") - if uniprot_diamond_df is not None: - df = pd.merge(df, uniprot_diamond_df, on="scaff", how="outer") - - if fcs_gx_df is not None: - df = pd.merge(df, fcs_gx_df, on="scaff", how="outer") - - if cobiontid_markerscan_df is not None: - df = pd.merge(df, cobiontid_markerscan_df, on="scaff", how="outer") - if contigviz_df is not None: - df = pd.merge(df, contigviz_df, on="scaff", how="outer") - if btk_df is not None: - df = pd.merge(df, btk_df, on="scaff", how="outer") - if btk_busco_df is not None: - df = pd.merge(df, btk_busco_df, on="scaff", how="outer") - - return df - - -def main(data_folder, out_path, sample_name): - paths_dict = dict() - paths_dict["gc_content"] = "{}/gc.txt".format(data_folder) - paths_dict["coverage"] = "{}/pacbio_reads_coverage.txt".format(data_folder) - paths_dict["tiara"] = "{}/tiara_out.txt".format(data_folder) - paths_dict["bacterial_kraken"] = "{}/bacterial_kraken_lineage.txt".format(data_folder) - paths_dict["nt_kraken"] = "{}/nt_kraken_lineage.txt".format(data_folder) - paths_dict["nt_blast"] = "{}/BLAST_results_with_lineage.csv".format(data_folder) - paths_dict["dim_reduction_embeddings"] = "{}/kmers_dim_reduction_embeddings.csv".format(data_folder) - paths_dict["nr_diamond"] = "{}/nr_diamond_blastx_top_hits.csv".format(data_folder) - paths_dict["uniprot_diamond"] = "{}/uniprot_diamond_blastx_top_hits.csv".format(data_folder) - paths_dict["cobiontid_markerscan"] = "{}/cobiontid_markerscan.csv".format(data_folder) - paths_dict["contigviz"] = "{}/contigviz_results.csv".format(data_folder) - paths_dict["blobtoolkit"] = "{}/btk_summary_table_full.tsv".format(data_folder) - paths_dict["btk_busco"] = 
"{}/btk_busco_summary_table_full.tsv".format(data_folder) - paths_dict["fcs_gx"] = "{}/fcs-gx_summary.csv".format(data_folder) - - required_files = ["gc_content"] - - paths_dict = check_paths(paths_dict, required_files) - df = load_and_merge_dataframes(paths_dict) - df.to_csv(out_path, index=False) - - if paths_dict["nt_blast"] is not None and paths_dict["nr_diamond"] is not None and paths_dict["uniprot_diamond"] is not None and paths_dict["coverage"] is not None and paths_dict["tiara"] is not None and paths_dict["nt_kraken"] is not None: - process_results_tables_command = "process_result_tables.py {} {}".format(data_folder, sample_name) - gpf.run_system_command(process_results_tables_command) - else: - sys.stderr.write("Skipping generating the {}_phylum_counts_and_coverage.csv file, as the variables used in this run do not include all the required variables for this (nt_blast, nr_diamond, uniprot_diamond, coverage, tiara, nt_kraken)\n".format(sample_name)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("data_folder", type=str, help="Path to folder with ASG contamination check result files of individual steps") - parser.add_argument("out_path", type=str, help="Path for output CSV file") - parser.add_argument("--sample_name", type=str, help="Sample name (e.g. 
ToLID)", default="unnamed") - args = parser.parse_args() - main(args.data_folder, args.out_path, args.sample_name) \ No newline at end of file From ba4e00c22d2108ae58589e4d8b6279e10c3fafaa Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Apr 2024 14:32:20 +0100 Subject: [PATCH 042/117] formatting --- bin/create_btk_dataset.py | 1 - bin/trim_Ns.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/bin/create_btk_dataset.py b/bin/create_btk_dataset.py index d904d2ed..8cad4eb0 100755 --- a/bin/create_btk_dataset.py +++ b/bin/create_btk_dataset.py @@ -146,7 +146,6 @@ def main( assembly_alias, dry_run_flag, ): - # out_folder = pipeline_run_folder + "/collected_tables" if assembly_alias == "": diff --git a/bin/trim_Ns.py b/bin/trim_Ns.py index 4775a2b6..059cce1b 100755 --- a/bin/trim_Ns.py +++ b/bin/trim_Ns.py @@ -9,7 +9,6 @@ def main(fasta_file, output_file): - minleftover = 200 # after trimming start/end, at least this many bp should be left winsize = 5000 # for sliding window analysis minslidingBase = 0.4 # maximum fraction of Ns in sliding window before alarm sets off @@ -74,7 +73,6 @@ def main(fasta_file, output_file): and (seq_string_for_window[:winsize].count("N") + seq_string_for_window[:winsize].count("n")) > winsize * minslidingBase ): - non_n_regions = [] non_n_iterator = re.finditer("[^Nn]+", seq_string_for_window) for non_n_instance in non_n_iterator: From 39c3cd0cf96543c01c99168c02cf88e2dc88d412 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 23 Apr 2024 13:40:06 +0100 Subject: [PATCH 043/117] Updating from sanger-tol containers to public more suitable ones --- modules/local/autofiltering.nf | 5 ++++- modules/local/convert_to_hits_file.nf | 5 ++++- modules/local/create_btk_dataset.nf | 6 ++++-- modules/local/merge_btk_datasets.nf | 6 ++++-- modules/local/trailingns.nf | 1 + 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index f8d60fcd..8b7b0a09 100644 --- 
a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -2,7 +2,10 @@ process AUTOFILTER_ASSEMBLY { tag "$meta.id" label "process_medium" - container 'sanger-tol/ascc_main:0.001-c1' + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(reference) diff --git a/modules/local/convert_to_hits_file.nf b/modules/local/convert_to_hits_file.nf index c7b4e64b..0bb59016 100644 --- a/modules/local/convert_to_hits_file.nf +++ b/modules/local/convert_to_hits_file.nf @@ -2,7 +2,10 @@ process CONVERT_TO_HITS_FILE { tag "$meta.id" label 'process_low' - container 'sanger-tol/ascc_btk:3.2.6-c1' + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(blast_full) diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 2fc5ed49..60af4877 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -2,8 +2,10 @@ process CREATE_BTK_DATASET { tag "$meta.id" label 'process_medium' - container 'sanger-tol/ascc_btk:3.2.6-c1' - + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(reference) diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index ee228b2d..0592fc29 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -2,8 +2,10 @@ process MERGE_BTK_DATASETS { tag "$meta.id" label 'process_low' - container 'sanger-tol/ascc_btk:3.2.6-c1' - + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(create_btk_datasets) diff --git a/modules/local/trailingns.nf b/modules/local/trailingns.nf index 55afa522..eb97af22 100644 --- a/modules/local/trailingns.nf +++ b/modules/local/trailingns.nf @@ -1,6 +1,7 @@ process TRAILINGNS { tag "$meta.id" label 'process_single' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/biopython:1.70--np112py27_1': 'biocontainers/biopython:1.70--np112py27_1' }" From 1929298e75700892c990e9ed8bf7f3546ec639da Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 23 Apr 2024 15:05:46 +0100 Subject: [PATCH 044/117] spelling --- workflows/ascc.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index be9a96dd..67d03241 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -356,7 +356,7 @@ workflow ASCC { YAML_INPUT.out.diamond_nr_database_path ) nt_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]} - nt_hits = UNIPROT_DIAMOND.out.hits_file.map{it[1]} + nt_hits = NUCLEOT_DIAMOND.out.hits_file.map{it[1]} ch_versions = ch_versions.mix(NUCLEOT_DIAMOND.out.versions) } else { nt_hits = [] From 79d04d23fff13f9a5f1fbfbd97a3c1229f93aa42 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 30 Apr 2024 15:18:03 +0100 Subject: [PATCH 045/117] modified: modules/local/autofiltering.nf modified: workflows/ascc.nf --- modules/local/autofiltering.nf | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index 8b7b0a09..856f1ce4 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -15,18 +15,24 @@ process AUTOFILTER_ASSEMBLY { output: tuple val(meta), path("*autofiltered.fasta"), emit: decontaminated_assembly tuple val(meta), path("fcs-gx_and_tiara_combined_summary.csv"), emit: fcs_tiara_summary - tuple val(meta), path("assembly_filtering_removed_sequences.txt") emit: removed_seqs + tuple val(meta), path("assembly_filtering_removed_sequences.txt"), emit: removed_seqs + path("fcs-gx_alarm_indicator_file.txt"), emit: alarm_file script: def prefix = task.ext.prefix ?: "${meta.id}" def args = task.ext.args ?: "" """ - remove_fcs_gx_and_tiara.py \\ + autofilter.py \\ $reference \\ $meta.taxid \\ $tiara_txt \\ $fcs_csv + abnormal_contamination_check.py \\ + 
$reference \\ + fcs-gx_and_tiara_combined_summary.csv + + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') From 5335a8d3734ed544a9f10851e2564476be289f12 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 30 Apr 2024 15:18:36 +0100 Subject: [PATCH 046/117] new file: bin/abnormal_contamination_check.py new file: bin/autofilter.py --- bin/abnormal_contamination_check.py | 145 ++++++++++++++++++ bin/autofilter.py | 219 ++++++++++++++++++++++++++++ 2 files changed, 364 insertions(+) create mode 100644 bin/abnormal_contamination_check.py create mode 100755 bin/autofilter.py diff --git a/bin/abnormal_contamination_check.py b/bin/abnormal_contamination_check.py new file mode 100644 index 00000000..53d26683 --- /dev/null +++ b/bin/abnormal_contamination_check.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +VERSION = "V1.0.0" + +DESCRIPTION = """ +------------------------------------- + Abnormal Contamination Check + Version = {VERSION} +------------------------------------- +Written by James Torrance +Modified by Eerik Aunin +Modified by Damon-Lee Pointon +------------------------------------- + +Script for determining if there is +enough contamination found by FCS-GX +to warrant an abnormal contamination +report alarm. 
Partially based on code +written by James Torrance +------------------------------------- + +""" + +import general_purpose_functions as gpf +import sys +import os.path +import pathlib +import argparse +import textwrap + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="Abnormal Contamination Check", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("assembly", type=str, help="Path to the fasta assembly file") + parser.add_argument("summary_path", type=str, help="Path to the tiara summary file") + parser.add_argument("-v", "--version", action="version", version=VERSION) + return parser.parse_args() + + +def get_sequence_lengths(assembly_fasta_path): + """ + Gets sequence lengths of a FASTA file and returns them as a dictionary + """ + seq_lengths_dict = dict() + fasta_data = gpf.read_fasta_in_chunks(assembly_fasta_path) + for header, seq in fasta_data: + seq_len = len(seq) + seq_lengths_dict[header] = dict() + seq_lengths_dict[header]["seq_len"] = seq_len + return seq_lengths_dict + + +def load_fcs_gx_results(seq_dict, fcs_gx_and_tiara_summary_path): + """ + Loads FCS-GX actions from the FCS-GX and Tiara results summary file, adds them to the dictionary that contains sequence lengths + """ + fcs_gx_and_tiara_summary_data = gpf.l(fcs_gx_and_tiara_summary_path) + fcs_gx_and_tiara_summary_data = fcs_gx_and_tiara_summary_data[1 : len(fcs_gx_and_tiara_summary_data)] + for line in fcs_gx_and_tiara_summary_data: + # print(line) + split_line = line.split(",") + assert len(split_line) == 5 + seq_name = split_line[0] + fcs_gx_action = split_line[1] + seq_dict[seq_name]["fcs_gx_action"] = fcs_gx_action + return seq_dict + + +def main(): + args = parse_args() + if os.path.isfile(args.summary_path) is False: + sys.stderr.write( + f"The FCS-GX and Tiara results file was not found at the expected location ({fcs_gx_and_tiara_summary_path})\n" + ) + sys.exit(1) + + if 
os.path.isfile(args.assembly) is False: + sys.stderr.write(f"The assembly FASTA file was not found at the expected location ({assembly_fasta_path})\n") + sys.exit(1) + + seq_dict = get_sequence_lengths(args.assembly) + seq_dict = load_fcs_gx_results(seq_dict, args.summary_path) + + total_assembly_length = 0 + lengths_removed = list() + scaffolds_removed = 0 + scaffold_count = len(seq_dict) + + for seq_name in seq_dict: + seq_len = seq_dict[seq_name]["seq_len"] + if seq_dict[seq_name]["fcs_gx_action"] == "EXCLUDE": + lengths_removed.append(seq_len) + scaffolds_removed += 1 + total_assembly_length += seq_len + + alarm_threshold_for_parameter = { + "TOTAL_LENGTH_REMOVED": 1e7, + "PERCENTAGE_LENGTH_REMOVED": 3, + "LARGEST_SCAFFOLD_REMOVED": 1.8e6, + } + + report_dict = { + "TOTAL_LENGTH_REMOVED": sum(lengths_removed), + "PERCENTAGE_LENGTH_REMOVED": 100 * sum(lengths_removed) / total_assembly_length, + "LARGEST_SCAFFOLD_REMOVED": max(lengths_removed, default=0), + "SCAFFOLDS_REMOVED": scaffolds_removed, + "PERCENTAGE_SCAFFOLDS_REMOVED": 100 * scaffolds_removed / scaffold_count, + } + + for param in report_dict: + sys.stderr.write(f"{param}: {report_dict[param]}\n") + + fcs_gx_alarm_indicator_path = f"fcs-gx_alarm_indicator_file.txt" + pathlib.Path(fcs_gx_alarm_indicator_path).unlink(missing_ok=True) + + alarm_list = [] + stage1_decon_pass_flag = True + for param in alarm_threshold_for_parameter: + param_value = report_dict[param] + alarm_threshold = alarm_threshold_for_parameter[param] + + # IF CONTAMINATING SEQ FOUND FILL FILE WITH ABNORMAL CONTAM + if param_value > alarm_threshold_for_parameter[param]: + stage1_decon_pass_flag = False + alarm_list.append( + f"ABNORMAL_CONTAMINATION: Stage 1 decon alarm triggered for {param}: the value for this parameter in this assembly is {param_value} | alarm threshold is {alarm_threshold}\n" + ) + + # Seperated out to ensure that the file is written in one go and doesn't confuse Nextflow + with open(fcs_gx_alarm_indicator_path, 
"a") as f: + f.write("".join(alarm_list)) + + # IF NO CONTAM FILL FILE WITH NO CONTAM + if stage1_decon_pass_flag is True: + alarm_message = f"NO_ABNORMAL_CONTAMINATION: No scaffolds were tagged for removal by FCS-GX\n" + with open(fcs_gx_alarm_indicator_path, "a") as f: + f.write(alarm_message) + + +if __name__ == "__main__": + main() diff --git a/bin/autofilter.py b/bin/autofilter.py new file mode 100755 index 00000000..85b3ef2a --- /dev/null +++ b/bin/autofilter.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 + +VERSION = "V1.0.0" + +DESCRIPTION = """ +------------------------------------- + Autofilter + Version = {VERSION} +------------------------------------- +Written by Eerik Aunin +Modified by Damon-Lee Pointon +------------------------------------- + +Script for filtering the assembly to +remove putative contaminants based on +FGCS-GX and Tiara results. +------------------------------------- + +""" + +from pathlib import Path +import general_purpose_functions as gpf +import os +import sys +import argparse +import textwrap + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="Abnormal Contamination Check", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("fasta", type=str, help="Path to the fasta assembly file") + parser.add_argument("tiara", type=str, help="Path to the tiara summary file") + parser.add_argument("fcsgx_sum", type=str, help="Path to the fcs-gx_summary.csv file") + parser.add_argument("auto_filtered", type=str, help="Path to the assembly_autofiltered.fasta file") + parser.add_argument("combined_sum", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file") + parser.add_argument("rejected_seq", type=str, help="Path to the assembly_filtering_removed_sequences.txt file") + parser.add_argument("taxid", type=int, help="NCBI taxonomy ID of the species") + parser.add_argument("ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp 
of NCBI taxonomy") + parser.add_argument("-v", "--version", action="version", version=VERSION) + return parser.parse_args() + + +def get_domain_from_taxid(query_taxid, rankedlineage_path): + """ + Input: 1) a taxID, 2) path to the NCBI rankedlineage.dmp file + Output: domain classification corresponding to the taxID + """ + domain = None + query_taxid = str(query_taxid) + rankedlineage_data = gpf.ll(rankedlineage_path) + for line in rankedlineage_data: + split_line = line.split("|") + split_line = [n.strip() for n in split_line] + assert len(split_line) == 11 + taxid = split_line[0] + domain = split_line[9] + if taxid == query_taxid: + domain = split_line[9] + if domain not in ("", "Archaea", "Bacteria", "Eukaryota", "Viruses"): + sys.stderr.write(f"Unrecognised value for domain-level taxonomy: {domain}") + sys.exit(1) + break + if domain is None: + sys.stderr.write( + "The domain for taxid ({}) was not found in the NCBI rankedlineage.dmp file ({})\n".format( + query_taxid, rankedlineage_path + ) + ) + sys.exit(1) + return domain + + +def process_tiara_results(tiara_results_path, target_domain): + """ + Input: 1) path to the main output file of Tiara, 2) the domain of the target species + Output: dictionary where the keys are scaffold names and the values are the decontamination action based on Tiara results + ('keep' or 'exclude') + """ + tiara_action_dict = dict() + + allowed_classif_dict = dict() + allowed_classif_dict[""] = ["archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown"] + allowed_classif_dict["Archaea"] = ["archaea", "prokarya", "unknown"] + allowed_classif_dict["Bacteria"] = ["bacteria", "prokarya", "unknown"] + allowed_classif_dict["Eukaryota"] = ["eukarya", "organelle", "unknown"] + allowed_classif_dict["Viruses"] = ["archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown"] + allowed_classif_list = allowed_classif_dict[target_domain] + + tiara_output = gpf.ll(tiara_results_path) + for counter, line in 
enumerate(tiara_output): + if counter == 0: + continue + split_line = line.split() + assert len(split_line) == 3 + tiara_class_fst_stage = split_line[1] + assert tiara_class_fst_stage in ("archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown") + tiara_action = "KEEP" + if tiara_class_fst_stage not in allowed_classif_list: + tiara_action = "EXCLUDE" + scaff = split_line[0] + tiara_action_dict[scaff] = tiara_action + return tiara_action_dict + + +def get_fcs_gx_action_dict(fcs_gx_summary_path): + """ + Input: path to FCS-GX summary CSV file (produced by ascc_parse_fcsgx_results.py) + Output: dictionary where the keys are scaffold names and the values are the FCS-GX action values + """ + fcs_gx_action_dict = dict() + fcs_gx_summary_data = gpf.ll(fcs_gx_summary_path) + for counter, line in enumerate(fcs_gx_summary_data): + if counter == 0: + continue + split_line = line.split(",") + scaff = split_line[0] + fcs_gx_action = split_line[8] + fcs_gx_action_dict[scaff] = fcs_gx_action + return fcs_gx_action_dict + + +def get_scaff_names(assembly_path): + """ + Reads FASTA headers from a FASTA file and returns them as a list + """ + scaffs = list() + fasta_data = gpf.read_fasta_in_chunks(assembly_path) + for fasta_tuple in fasta_data: + scaffs.append(fasta_tuple[0]) + return scaffs + + +def filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path): + """ + Filters a genome assembly FASTA file to remove sequences that are listed in the scaffs_to_exclude list + """ + out_list = list() + fasta_data = gpf.read_fasta_in_chunks(assembly_path) + for header, seq in fasta_data: + if header not in scaffs_to_exclude: + out_list.append(">" + header) + split_seq = gpf.split_with_fixed_row_length(seq, 80) + out_list.extend(split_seq) + else: + sys.stderr.write( + f"Excluding the sequence {header} from the filtered assembly ({filtered_assembly_path}), as it appears to be a contaminant based on FCS-GX and/or Tiara results\n" + ) + 
gpf.export_list_as_line_break_separated_file(out_list, filtered_assembly_path) + + +def main(): + args = parse_args() + if args.taxid == -1: + sys.stderr.write( + "The filtering of assembly based on FCS-GX and Tiara results requires a taxID but a valid taxID has not been provided (the provided taxID is -1, which is a placeholder value)\n" + ) + + assembly_path = args.fasta + tiara_results_path = args.tiara + fcs_gx_summary_path = args.fcsgx_sum + filtered_assembly_path = args.auto_filtered + combined_summary = args.combined_sum + excluded_seq_list_path = args.rejected_seq + ncbi_rankedlist = args.ncbi_rankedlineage_path + + Path(f"{args.data_folder}/fasta/filtered").mkdir(parents=True, exist_ok=True) + + for i in [ncbi_rankedlist, tiara_results_path, fcs_gx_summary_path, assembly_path]: + if os.path.isfile(i) is False: + sys.stderr.write(f"{i} WAS NOT AT THE EXPECTED LOCATION\n") + sys.exit(1) + + target_domain = get_domain_from_taxid(args.taxid, ncbi_rankedlist) + tiara_action_dict = process_tiara_results(tiara_results_path, target_domain) + fcs_gx_action_dict = get_fcs_gx_action_dict(fcs_gx_summary_path) + + combined_action_dict = dict() + scaffs_to_exclude = list() + scaffs = get_scaff_names(assembly_path) + for scaff in scaffs: + fcs_gx_action = "NA" + tiara_action = "NA" + + if scaff in fcs_gx_action_dict: + fcs_gx_action = fcs_gx_action_dict[scaff] + + if scaff in tiara_action_dict: + tiara_action = tiara_action_dict[scaff] + + combined_action = fcs_gx_action + + if fcs_gx_action == "NA" and tiara_action == "EXCLUDE": + combined_action = "EXCLUDE" + + if combined_action == "EXCLUDE": + scaffs_to_exclude.append(scaff) + + combined_action_dict[scaff] = { + "fcs_gx_action": fcs_gx_action, + "tiara_action": tiara_action, + "combined_action": combined_action, + } + filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path) + gpf.export_list_as_line_break_separated_file(scaffs_to_exclude, excluded_seq_list_path) + + out_csv_list = list() + 
out_csv_list.append("scaff,fcs_gx_action,tiara_action,combined_action") + for scaff, scaff_properties in combined_action_dict.items(): + out_line = f"{scaff},{scaff_properties['fcs_gx_action']},{scaff_properties['tiara_action']},{scaff_properties['combined_action']}" + out_csv_list.append(out_line) + gpf.export_list_as_line_break_separated_file(out_csv_list, "ABNORMAL_CHECK.csv") + + +if __name__ == "__main__": + main() From 6f43fa889ac2f77987019d9cb65460cc087d2050 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 9 May 2024 10:50:13 +0100 Subject: [PATCH 047/117] Updates --- modules/local/sanger_tol_btk.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 6fa27d66..fa275d5d 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -8,7 +8,7 @@ process SANGER_TOL_BTK { path blastp path blastn path blastx - path btk_config + path btk_config_file path tax_dump val taxon val gca_accession @@ -26,6 +26,7 @@ process SANGER_TOL_BTK { def executor = task.ext.executor ?: "" def profiles = task.ext.profiles ?: "" def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" + def btk_config = ( if btk_config_file ? 
"-c $btk_config_file" : "") """ $executor 'nextflow run sanger-tol/blobtoolkit \\ -profile $profiles \\ @@ -38,7 +39,7 @@ process SANGER_TOL_BTK { --blastp $blastp \\ --blastn $blastn \\ --blastx $blastx \\ - -c $btk_config \\ + $btk_config \\ $args' From cbde4116507420d441796b9c4fda70d1d8da2749 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 9 May 2024 10:50:37 +0100 Subject: [PATCH 048/117] Updats --- subworkflows/local/yaml_input.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 8732de1d..7a2f31b4 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -78,8 +78,10 @@ workflow YAML_INPUT { group.assembly_title .combine( group.reads_path ) + .combine( group.reads_type ) .map { id, file -> - tuple( [ id: id ], + tuple( [ id: id, + type: type ], file ) } From 010f7c9d0e354e0e77e5dac5a31f5e9823c6c873 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 9 May 2024 10:51:10 +0100 Subject: [PATCH 049/117] Adding the sanger tol btk pipeline --- workflows/ascc.nf | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 67d03241..58f63f5b 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -45,11 +45,11 @@ include { TRAILINGNS_CHECK } from '../subworkflows/ // MODULE: Local modules // include { GC_CONTENT } from '../modules/local/gc_content' + include { CREATE_BTK_DATASET } from '../modules/local/create_btk_dataset' include { MERGE_BTK_DATASETS } from '../modules/local/merge_btk_datasets' include { ASCC_MERGE_TABLES } from '../modules/local/ascc_merge_tables' - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -264,6 +264,7 @@ workflow ASCC { YAML_INPUT.out.taxid, YAML_INPUT.out.ncbi_rankedlineage_path ) + ch_fcsgx = RUN_FCSGX.out.fcsgxresult.map{it[1]} 
ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) } else { @@ -379,6 +380,7 @@ workflow ASCC { un_hits = [] un_full = [] } +` // mix the outputs of the outpuutting process so that we can // insert them into the one process to create the btk and the merged report @@ -409,10 +411,25 @@ workflow ASCC { // NOT TESTED AS WE NEED BTK INTEGRATED FIRST!!! // + if ( workflow_steps.contains('busco_btk') || workflow_steps.contains('ALL') ) { - //SANGER_TOL_BTK ( - // yaml_input.out.reference_tuple - //) + + GENERATE_SAMPLESHEET ( + YAML_INPUT.out.reference_tuple, + YAML_INPUT.out.pacbio_tuple + ) + + SANGER_TOL_BTK ( + YAML_INPUT.out.reference_tuple, + GENERATE_SAMPLESHEET.out.csv, + YAML_INPUT.out.blastp, + YAML_INPUT.out.blastn, + YAML_INPUT.out.blastx, + [], + YAML_INPUT.out.tax_dump, + YAML_INPUT.out.taxon, + 'GCA_0001' + ) MERGE_BTK_DATASETS ( CREATE_BTK_DATASET.out.btk_datasets, @@ -453,6 +470,21 @@ workflow ASCC { versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions } +process GrabFiles { + label 'process_tiny' + + tag "${meta.id}" + executor 'local' + + input: + tuple val(meta), path("in") + + output: + tuple val(meta), path("in/*.{fa,fasta}.{gz}") + + "true" +} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ COMPLETION EMAIL AND SUMMARY From 1243eee0e005c8d30fddf32b74f22dcfd765bf48 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 9 May 2024 12:57:29 +0100 Subject: [PATCH 050/117] Fix the script --- modules/local/generate_samplesheet.nf | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 modules/local/generate_samplesheet.nf diff --git a/modules/local/generate_samplesheet.nf b/modules/local/generate_samplesheet.nf new file mode 100644 index 00000000..29e4f322 --- /dev/null +++ b/modules/local/generate_samplesheet.nf @@ -0,0 +1,31 @@ +process GENERATE_SAMPLESHEET { + tag "$meta.id" + label "process_low" + + conda "conda-forge::python=3.9" + container "${ 
workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(pacbio_path) + + output: + tuple val(meta), path("*csv"), emit: csv + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + """ + generate_samplesheet.py \\ + $prefix \\ + $pacbio_path + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + generate_samplesheet: \$(generate_samplesheet.py -v) + END_VERSIONS + """ + +} From 85271c082889c1c5f9b7f884f0bbfc67854520a8 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 9 May 2024 12:59:32 +0100 Subject: [PATCH 051/117] Add the script for generating the samplesheet --- bin/generate_samplesheet.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 bin/generate_samplesheet.py diff --git a/bin/generate_samplesheet.py b/bin/generate_samplesheet.py new file mode 100644 index 00000000..cc551b6a --- /dev/null +++ b/bin/generate_samplesheet.py @@ -0,0 +1,34 @@ +import os +import argparse + +""" +A simple script to generate csv file + +Written by Damon-Lee Pointon (dp24/DLBPointon) +""" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Generate a csv file for BTK") + parser.add_argument("sample_name", type=str, help="Name of sample") + parser.add_argument("pacbio_path", type=str, help="Path containing the pacbio files") + parser.add_argument("-v", "--version", action="version", version="1.0.0") + return parser.parse_args() + + +def main(): + args = parse_args() + + data_list = [] + + data_list.append("sample,datatype,datafile\n") + for file in os.listdir(args.pacbio_path): + if file.endswith(".fasta.gz"): + data_list.append(f"{args.sample_name},pacbio,{args.pacbio_path}/{file}\n") + + with open("samplesheet.csv") as file: + file.write("".join(data_list)) + + 
+if __name__ == "__main__": + main() From 63628945e0433f64de12df57687fda4e4201c582 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 9 May 2024 13:00:29 +0100 Subject: [PATCH 052/117] Adding the module for abnormal checks --- modules/local/abnormal_contam_check.nf | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 modules/local/abnormal_contam_check.nf diff --git a/modules/local/abnormal_contam_check.nf b/modules/local/abnormal_contam_check.nf new file mode 100644 index 00000000..e45e068e --- /dev/null +++ b/modules/local/abnormal_contam_check.nf @@ -0,0 +1,46 @@ +process ABNORMAL_CONTAM_CHECK { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta1), path(tiara) + tuple val(meta2), path(fcsgx_sum) + tuple val(meta3), path(auto_filtered) + tuple val(meta4), path(combined_sum) + tuple val(meta5), path(rejected_seq) + tuple val(meta6), path(taxid) + tuple val(meta7), path(ncbi_rankedlineage_path) + + output: + tuple val(meta), path("ABNORMAL_CHECK.csv"), emit: abnormal + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + abnormal_contam_check.py \\ + $fasta \\ + $tiara \\ + $fcsgx_sum \\ + $auto_filtered \\ + $combined_sum \\ + $rejected_seq \\ + $taxid \\ + $ncbi_rankedlineage_path + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + abnormal_contam_check: \$(abnormal_contam_check.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} From 00b2595bca145ee21fe156c6224cd1551b80fe36 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 9 May 2024 14:44:50 +0100 Subject: [PATCH 053/117] Updates to add contam check --- 
bin/abnormal_contamination_check.py | 7 +++--- modules/local/abnormal_contam_check.nf | 20 ++++----------- modules/local/autofiltering.nf | 3 ++- workflows/ascc.nf | 34 +++++++++++++++++++++++--- 4 files changed, 41 insertions(+), 23 deletions(-) diff --git a/bin/abnormal_contamination_check.py b/bin/abnormal_contamination_check.py index 53d26683..7e5bbf74 100644 --- a/bin/abnormal_contamination_check.py +++ b/bin/abnormal_contamination_check.py @@ -61,7 +61,6 @@ def load_fcs_gx_results(seq_dict, fcs_gx_and_tiara_summary_path): fcs_gx_and_tiara_summary_data = gpf.l(fcs_gx_and_tiara_summary_path) fcs_gx_and_tiara_summary_data = fcs_gx_and_tiara_summary_data[1 : len(fcs_gx_and_tiara_summary_data)] for line in fcs_gx_and_tiara_summary_data: - # print(line) split_line = line.split(",") assert len(split_line) == 5 seq_name = split_line[0] @@ -74,12 +73,12 @@ def main(): args = parse_args() if os.path.isfile(args.summary_path) is False: sys.stderr.write( - f"The FCS-GX and Tiara results file was not found at the expected location ({fcs_gx_and_tiara_summary_path})\n" + f"The FCS-GX and Tiara results file was not found at the expected location ({args.summary_path})\n" ) sys.exit(1) if os.path.isfile(args.assembly) is False: - sys.stderr.write(f"The assembly FASTA file was not found at the expected location ({assembly_fasta_path})\n") + sys.stderr.write(f"The assembly FASTA file was not found at the expected location ({args.assembly})\n") sys.exit(1) seq_dict = get_sequence_lengths(args.assembly) @@ -127,7 +126,7 @@ def main(): if param_value > alarm_threshold_for_parameter[param]: stage1_decon_pass_flag = False alarm_list.append( - f"ABNORMAL_CONTAMINATION: Stage 1 decon alarm triggered for {param}: the value for this parameter in this assembly is {param_value} | alarm threshold is {alarm_threshold}\n" + f"YES_ABNORMAL_CONTAMINATION: Stage 1 decon alarm triggered for {param}: the value for this parameter in this assembly is {param_value} | alarm threshold is 
{alarm_threshold}\n" ) # Seperated out to ensure that the file is written in one go and doesn't confuse Nextflow diff --git a/modules/local/abnormal_contam_check.nf b/modules/local/abnormal_contam_check.nf index e45e068e..0aa8458b 100644 --- a/modules/local/abnormal_contam_check.nf +++ b/modules/local/abnormal_contam_check.nf @@ -9,13 +9,7 @@ process ABNORMAL_CONTAM_CHECK { input: tuple val(meta), path(fasta) - tuple val(meta1), path(tiara) - tuple val(meta2), path(fcsgx_sum) - tuple val(meta3), path(auto_filtered) - tuple val(meta4), path(combined_sum) - tuple val(meta5), path(rejected_seq) - tuple val(meta6), path(taxid) - tuple val(meta7), path(ncbi_rankedlineage_path) + tuple val(meta1), path(fcsgx_tiara_sum) output: tuple val(meta), path("ABNORMAL_CHECK.csv"), emit: abnormal @@ -27,15 +21,11 @@ process ABNORMAL_CONTAM_CHECK { def prefix = task.ext.prefix ?: "${meta.id}" """ - abnormal_contam_check.py \\ + abnormal_contamination_check.py \\ $fasta \\ - $tiara \\ - $fcsgx_sum \\ - $auto_filtered \\ - $combined_sum \\ - $rejected_seq \\ - $taxid \\ - $ncbi_rankedlineage_path + $fcsgx_tiara_sum + + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index 856f1ce4..7c0fa543 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -1,4 +1,4 @@ -process AUTOFILTER_ASSEMBLY { +process AUTOFILTER_AND_CHECK_ASSEMBLY { tag "$meta.id" label "process_medium" @@ -17,6 +17,7 @@ process AUTOFILTER_ASSEMBLY { tuple val(meta), path("fcs-gx_and_tiara_combined_summary.csv"), emit: fcs_tiara_summary tuple val(meta), path("assembly_filtering_removed_sequences.txt"), emit: removed_seqs path("fcs-gx_alarm_indicator_file.txt"), emit: alarm_file + path "versions.yml", emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 58f63f5b..6fde4db8 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -49,6 +49,7 @@ 
include { GC_CONTENT } from '../modules/local include { CREATE_BTK_DATASET } from '../modules/local/create_btk_dataset' include { MERGE_BTK_DATASETS } from '../modules/local/merge_btk_datasets' include { ASCC_MERGE_TABLES } from '../modules/local/ascc_merge_tables' +include { AUTOFILTER_AND_CHECK_ASSEMBLY } from '../modules/local/autofiltering' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -380,7 +381,6 @@ workflow ASCC { un_hits = [] un_full = [] } -` // mix the outputs of the outpuutting process so that we can // insert them into the one process to create the btk and the merged report @@ -403,21 +403,44 @@ workflow ASCC { YAML_INPUT.out.ncbi_taxonomy_path, ) + ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) + //SANGER_TOL_BTK.out.btk_datasets = [] //SANGER_TOL_BTK.out.summary = [] + // - // NOT TESTED AS WE NEED BTK INTEGRATED FIRST!!! + // MODULE: AUTOFILTER ASSEMBLY BY TIARA AND FCSGX RESULTS // + if ( ( workflow_steps.contains('tiara') && workflow_steps.contains('fcsgx') ) && || workflow_steps.contains('ALL') ) { + AUTOFILTER_AND_CHECK_ASSEMBLY ( + YAML_INPUT.out.reference_tuple, + EXTRACT_TIARA_HITS.out.ch_tiara, + RUN_FCSGX.out.fcsgxresult + ) + ch_autofiltered_assembly = AUTOFILTER_AND_CHECK_ASSEMBLY.out.decontaminated_assembly.map{it[1]} + ch_versions = ch_versions.mix(AUTOFILTER_AND_CHECK_ASSEMBLY.out.versions) + } else { + ch_autofiltered_assembly = [] + } + ch_autofiltered_assembly + .branch{ + btk_run: ch_autofiltered_assembly.getText().contains("YES_ABNORMAL_CONTAMINATION") + skip: [] + } + .set { abnormal_flag } - if ( workflow_steps.contains('busco_btk') || workflow_steps.contains('ALL') ) { + + if ( ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && abnormal_flag ) || workflow_steps.contains('ALL') || workflow_steps.contains("force_btk") ) { GENERATE_SAMPLESHEET ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.pacbio_tuple ) + ch_versions = 
ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) + SANGER_TOL_BTK ( YAML_INPUT.out.reference_tuple, @@ -430,14 +453,19 @@ workflow ASCC { YAML_INPUT.out.taxon, 'GCA_0001' ) + //ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) + MERGE_BTK_DATASETS ( CREATE_BTK_DATASET.out.btk_datasets, [[],[]], //SANGER_TOL_BTK.out.btk_datasets = [] [[],[]] //SANGER_TOL_BTK.out.summary = [] ) + ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) + } + // // SUBWORKFLOW: MERGES DATA THAT IS NOT USED IN THE CREATION OF THE BTK_DATASETS FOLDER // From 85f216605bbc00be3f9ba378a7c5bae315145192 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 9 May 2024 14:54:57 +0100 Subject: [PATCH 054/117] Updates to add contam check --- workflows/ascc.nf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 6fde4db8..f1d13b85 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -413,7 +413,7 @@ workflow ASCC { // // MODULE: AUTOFILTER ASSEMBLY BY TIARA AND FCSGX RESULTS // - if ( ( workflow_steps.contains('tiara') && workflow_steps.contains('fcsgx') ) && || workflow_steps.contains('ALL') ) { + if ( ( workflow_steps.contains('tiara') && workflow_steps.contains('fcsgx') && workflow_steps.contains("autofilter") ) || workflow_steps.contains('ALL') ) { AUTOFILTER_AND_CHECK_ASSEMBLY ( YAML_INPUT.out.reference_tuple, EXTRACT_TIARA_HITS.out.ch_tiara, @@ -425,9 +425,13 @@ workflow ASCC { ch_autofiltered_assembly = [] } + // + // LOGIC: SCAN FILE FOR PRESENCE OF ABNORMAL CONTAMINATION + // IF FOUND THEN WE WANT TO RUN BTK + // ch_autofiltered_assembly .branch{ - btk_run: ch_autofiltered_assembly.getText().contains("YES_ABNORMAL_CONTAMINATION") + btk_run: { if ch_autofiltered_assembly.getText().contains("YES_ABNORMAL_CONTAMINATION") ? 
"PASS" : [] } skip: [] } .set { abnormal_flag } From 369e132ff9d31c5196e74fae8e544d871d33cd4e Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 9 May 2024 15:01:05 +0100 Subject: [PATCH 055/117] Updates to add contam check --- modules/local/abnormal_contam_check.nf | 36 -------------------------- workflows/ascc.nf | 5 ++++ 2 files changed, 5 insertions(+), 36 deletions(-) delete mode 100644 modules/local/abnormal_contam_check.nf diff --git a/modules/local/abnormal_contam_check.nf b/modules/local/abnormal_contam_check.nf deleted file mode 100644 index 0aa8458b..00000000 --- a/modules/local/abnormal_contam_check.nf +++ /dev/null @@ -1,36 +0,0 @@ -process ABNORMAL_CONTAM_CHECK { - tag "$meta.id" - label 'process_low' - - conda "conda-forge::python=3.9" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'biocontainers/python:3.9' }" - - input: - tuple val(meta), path(fasta) - tuple val(meta1), path(fcsgx_tiara_sum) - - output: - tuple val(meta), path("ABNORMAL_CHECK.csv"), emit: abnormal - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - - """ - abnormal_contamination_check.py \\ - $fasta \\ - $fcsgx_tiara_sum - - - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - abnormal_contam_check: \$(abnormal_contam_check.py --version | cut -d' ' -f2) - END_VERSIONS - """ -} diff --git a/workflows/ascc.nf b/workflows/ascc.nf index f1d13b85..ba0139b7 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -437,6 +437,11 @@ workflow ASCC { .set { abnormal_flag } + // + // PIPELINE: PREPARE THE DATA FOR USE IN THE SANGER-TOL/BLOBTOOLKIT PIPELINE + // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT + // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. 
+ // if ( ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && abnormal_flag ) || workflow_steps.contains('ALL') || workflow_steps.contains("force_btk") ) { GENERATE_SAMPLESHEET ( From 58561b5b0599f722d1b5a95b55ce6afd5f61345b Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 13 May 2024 10:05:35 +0100 Subject: [PATCH 056/117] Updates to all --- assets/test.yaml | 5 +- bin/abnormal_contamination_check.py | 0 bin/autofilter.py | 22 +++++--- bin/generate_samplesheet.py | 4 +- modules/local/autofiltering.nf | 6 +- modules/local/sanger_tol_btk.nf | 86 +++++++++++++++++------------ subworkflows/local/yaml_input.nf | 4 +- workflows/ascc.nf | 68 +++++++++++++---------- 8 files changed, 115 insertions(+), 80 deletions(-) mode change 100644 => 100755 bin/abnormal_contamination_check.py mode change 100644 => 100755 bin/generate_samplesheet.py diff --git a/assets/test.yaml b/assets/test.yaml index 122db327..1525975f 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -12,11 +12,11 @@ kmer_len: 7 dimensionality_reduction_methods: "pca,random_trees" # all available methods # "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" -nt_database: /data/blastdb/Supported/NT/202308/dbv4/ +nt_database: /data/blastdb/Supported/NT/current nt_database_prefix: nt nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/ -ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/ +ncbi_taxonomy_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages 
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb @@ -27,3 +27,4 @@ seqkit: sliding: 100000 window: 6000 n_neighbours: 13 +btk_yaml: "/nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/btk_draft.yaml" diff --git a/bin/abnormal_contamination_check.py b/bin/abnormal_contamination_check.py old mode 100644 new mode 100755 diff --git a/bin/autofilter.py b/bin/autofilter.py index 85b3ef2a..53b7cbcb 100755 --- a/bin/autofilter.py +++ b/bin/autofilter.py @@ -28,18 +28,22 @@ def parse_args(): parser = argparse.ArgumentParser( - prog="Abnormal Contamination Check", + prog="Autofilter", formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent(DESCRIPTION), ) parser.add_argument("fasta", type=str, help="Path to the fasta assembly file") - parser.add_argument("tiara", type=str, help="Path to the tiara summary file") - parser.add_argument("fcsgx_sum", type=str, help="Path to the fcs-gx_summary.csv file") - parser.add_argument("auto_filtered", type=str, help="Path to the assembly_autofiltered.fasta file") - parser.add_argument("combined_sum", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file") - parser.add_argument("rejected_seq", type=str, help="Path to the assembly_filtering_removed_sequences.txt file") - parser.add_argument("taxid", type=int, help="NCBI taxonomy ID of the species") - parser.add_argument("ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy") + parser.add_argument("-t", "--tiara", type=str, help="Path to the tiara summary file") + parser.add_argument("-s", "--fcsgx_sum", type=str, help="Path to the fcs-gx_summary.csv file") + parser.add_argument("-a", "--auto_filtered", type=str, help="Path to the assembly_autofiltered.fasta file") + parser.add_argument("-c", "--combined_sum", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file") + parser.add_argument( + "-r", "--rejected_seq", type=str, help="Path to the 
assembly_filtering_removed_sequences.txt file" + ) + parser.add_argument("-i", "--taxid", type=int, help="NCBI taxonomy ID of the species") + parser.add_argument( + "-n", "--ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy" + ) parser.add_argument("-v", "--version", action="version", version=VERSION) return parser.parse_args() @@ -167,7 +171,7 @@ def main(): excluded_seq_list_path = args.rejected_seq ncbi_rankedlist = args.ncbi_rankedlineage_path - Path(f"{args.data_folder}/fasta/filtered").mkdir(parents=True, exist_ok=True) + Path(f"./fasta/filtered").mkdir(parents=True, exist_ok=True) for i in [ncbi_rankedlist, tiara_results_path, fcs_gx_summary_path, assembly_path]: if os.path.isfile(i) is False: diff --git a/bin/generate_samplesheet.py b/bin/generate_samplesheet.py old mode 100644 new mode 100755 index cc551b6a..333eeacf --- a/bin/generate_samplesheet.py +++ b/bin/generate_samplesheet.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import os import argparse @@ -26,7 +28,7 @@ def main(): if file.endswith(".fasta.gz"): data_list.append(f"{args.sample_name},pacbio,{args.pacbio_path}/{file}\n") - with open("samplesheet.csv") as file: + with open("samplesheet.csv", "w") as file: file.write("".join(data_list)) diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index 7c0fa543..9f8ab5df 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -25,9 +25,9 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { """ autofilter.py \\ $reference \\ - $meta.taxid \\ - $tiara_txt \\ - $fcs_csv + --taxid $meta.taxid \\ + --tiara $tiara_txt \\ + --fcsgx_sum $fcs_csv abnormal_contamination_check.py \\ $reference \\ diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index fa275d5d..17e99cbd 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -4,76 +4,94 @@ process SANGER_TOL_BTK { input: tuple val(meta), path(reference, stageAs: 
"REFERENCE.fa") - path samplesheet_csv - path blastp + tuple val(meta1), path(samplesheet_csv, stageAs: "SAMPLESHEET.csv") + path blastp, stageAs: "blastp.dmnd" path blastn path blastx path btk_config_file path tax_dump + path btk_yaml, stageAs: "BTK.yaml" val taxon val gca_accession output: - path("blobtoolkit/$gca_accession"), emit: btk_results - path("blobtoolkit/plots"), emit: btk_plots - path("blobktoolkit/busco"), emit: btk_busco - path("blobktoolkit/multiqc"), emit: btk_multiqc - path("blobtoolkit_pipeline_info"), emit: btk_pipeline + path("${meta.id}_btk_out/plots"), emit: btk_plots + path("${meta.id}_btk_out/busco"), emit: btk_busco + path("${meta.id}_btk_out/multiqc"), emit: btk_multiqc + path("${meta.id}_btk_out/blobtoolkit_pipeline_info"), emit: btk_pipeline script: - def prefix = task.ext.prefix ?: "${meta.id}" - def args = task.ext.args ?: "" - def executor = task.ext.executor ?: "" - def profiles = task.ext.profiles ?: "" - def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" - def btk_config = ( if btk_config_file ? "-c $btk_config_file" : "") + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def executor = task.ext.executor ?: "" + def profiles = task.ext.profiles ?: "" + def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" + def btk_config = btk_config_file ? "-c $btk_config_file" : "" + def pipeline_version = task.ext.version ?: "main" + // YAML used to avoid the use of GCA accession number + // https://github.com/sanger-tol/blobtoolkit/issues/77 + + // Seems to be an issue where a nested pipeline can't see the files in the same directory + // Running realpath gets around this but the files copied into the folder are + // now just wasted space. 
+ + // outdir should be an arg + """ $executor 'nextflow run sanger-tol/blobtoolkit \\ + -r $pipeline_version \\ -profile $profiles \\ - --input $samplesheet_csv \\ - --outdir ${meta.id}_btk_out \\ - --fasta $reference \\ - --accession $gc_accession \\ + --input "\$(realpath $samplesheet_csv)" \\ + --outdir ${prefix}_btk_out \\ + --fasta "\$(realpath REFERENCE.fa)" \\ + --yaml "\$(realpath BTK.yaml)" \\ + --accession draft \\ --taxon $taxon \\ - --taxdump $tax_dump \\ - --blastp $blastp \\ - --blastn $blastn \\ - --blastx $blastx \\ + --taxdump "\$(realpath $tax_dump)" \\ + --blastp "\$(realpath blastp.dmnd)" \\ + --blastn "\$(realpath $blastn)" \\ + --blastx "\$(realpath $blastx)" \\ $btk_config \\ $args' + mv ${prefix}_btk_out/pipeline_info blobtoolkit_pipeline_info cat <<-END_VERSIONS > versions.yml "${task.process}": + Blobtoolkit: $pipeline_version Nextflow: \$(nextflow -v | cut -d " " -f3) executor system: $get_version END_VERSIONS """ stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def pipeline_version = task.ext.version ?: "main" + """ mkdir -p blobtoolkit/$gca_accession touch blobtoolkit/$gca_accession/test.json.gz - mkdir blobtoolkit/plots - touch blobtoolkit/plots/test.png + mkdir ${prefix}_btk_out/plots + touch ${prefix}_btk_out/plots/test.png - mkdir blobktoolkit/busco - touch blobtoolkit/busco/test.batch_summary.txt - touch blobtoolkit/busco/test.fasta.txt - touch blobtoolkit/busco/test.json + mkdir ${prefix}_btk_out/busco + touch ${prefix}_btk_out/busco/test.batch_summary.txt + touch ${prefix}_btk_out/busco/test.fasta.txt + touch ${prefix}_btk_out/busco/test.json - mkdir blobktoolkit/multiqc - mkdir blobktoolkit/multiqc/multiqc_data - mkdir blobktoolkit/multiqc/multiqc_plots - touch blobktoolkit/multiqc/multiqc_report.html + mkdir ${prefix}_btk_out/multiqc + mkdir ${prefix}_btk_out/multiqc/multiqc_data + mkdir ${prefix}_btk_out/multiqc/multiqc_plots + touch ${prefix}_btk_out/multiqc/multiqc_report.html - mv pipeline_into 
blobtoolkit_pipeline_info + mv ${prefix}_btk_out/pipeline_info blobtoolkit_pipeline_info cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version | sed 's/Python //g') - parse_fcsgx_result: \$(parse_fcsgx_result.py -v) + Blobtoolkit: $pipeline_version + Nextflow: \$(nextflow -v | cut -d " " -f3) + executor system: $get_version END_VERSIONS """ } diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 7a2f31b4..e4d4b3a3 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -44,6 +44,7 @@ workflow YAML_INPUT { diamond_nr_database_path: ( data.diamond_nr_database_path ) vecscreen_database_path: ( data.vecscreen_database_path ) neighbours: ( data.n_neighbours ) + btk_yaml: ( file(data.btk_yaml) ) } .set{ group } @@ -79,7 +80,7 @@ workflow YAML_INPUT { group.assembly_title .combine( group.reads_path ) .combine( group.reads_type ) - .map { id, file -> + .map { id, file, type -> tuple( [ id: id, type: type ], file @@ -148,6 +149,7 @@ workflow YAML_INPUT { plastid_var = "plastid_genome" kmer_len = group.kmer_len n_neighbours = group.neighbours + btk_yaml = group.btk_yaml versions = ch_versions.ifEmpty(null) } diff --git a/workflows/ascc.nf b/workflows/ascc.nf index ba0139b7..c408d08b 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -50,6 +50,8 @@ include { CREATE_BTK_DATASET } from '../modules/local include { MERGE_BTK_DATASETS } from '../modules/local/merge_btk_datasets' include { ASCC_MERGE_TABLES } from '../modules/local/ascc_merge_tables' include { AUTOFILTER_AND_CHECK_ASSEMBLY } from '../modules/local/autofiltering' +include { SANGER_TOL_BTK } from '../modules/local/sanger_tol_btk' +include { GENERATE_SAMPLESHEET } from '../modules/local/generate_samplesheet' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -267,7 +269,7 @@ workflow ASCC { ) ch_fcsgx = RUN_FCSGX.out.fcsgxresult.map{it[1]} - ch_versions = 
ch_versions.mix(RUN_FCSADAPTOR.out.versions) + ch_versions = ch_versions.mix(RUN_FCSGX.out.versions) } else { ch_fcsgx = [] } @@ -386,7 +388,7 @@ workflow ASCC { // insert them into the one process to create the btk and the merged report // much like the versions channel - CREATE_BTK_DATASET ( +/* CREATE_BTK_DATASET ( GENERATE_GENOME.out.reference_tuple, GENERATE_GENOME.out.dot_genome.map{it[1]}, ch_kmers, @@ -402,8 +404,8 @@ workflow ASCC { un_hits, YAML_INPUT.out.ncbi_taxonomy_path, - ) - ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) + )*/ + //ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) //SANGER_TOL_BTK.out.btk_datasets = [] @@ -413,64 +415,70 @@ workflow ASCC { // // MODULE: AUTOFILTER ASSEMBLY BY TIARA AND FCSGX RESULTS // - if ( ( workflow_steps.contains('tiara') && workflow_steps.contains('fcsgx') && workflow_steps.contains("autofilter") ) || workflow_steps.contains('ALL') ) { + run_btk = false + if ( workflow_steps.contains('tiara') && workflow_steps.contains('fcsgx') && workflow_steps.contains("autofilter") || workflow_steps.contains('ALL') ) { AUTOFILTER_AND_CHECK_ASSEMBLY ( YAML_INPUT.out.reference_tuple, EXTRACT_TIARA_HITS.out.ch_tiara, RUN_FCSGX.out.fcsgxresult ) ch_autofiltered_assembly = AUTOFILTER_AND_CHECK_ASSEMBLY.out.decontaminated_assembly.map{it[1]} + +/* ch_autofiltered_assembly + .view() + + ch_autofiltered_assembly + .splitText( by: 10 ) + .view() + + for (i in ch_autofiltered_assembly) { + if (i.contains("YES_ABNORMAL")) { + run_btk = true + break + } + } */ + ch_versions = ch_versions.mix(AUTOFILTER_AND_CHECK_ASSEMBLY.out.versions) } else { ch_autofiltered_assembly = [] } - // - // LOGIC: SCAN FILE FOR PRESENCE OF ABNORMAL CONTAMINATION - // IF FOUND THEN WE WANT TO RUN BTK - // - ch_autofiltered_assembly - .branch{ - btk_run: { if ch_autofiltered_assembly.getText().contains("YES_ABNORMAL_CONTAMINATION") ? 
"PASS" : [] } - skip: [] - } - .set { abnormal_flag } - - // // PIPELINE: PREPARE THE DATA FOR USE IN THE SANGER-TOL/BLOBTOOLKIT PIPELINE // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. // - if ( ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && abnormal_flag ) || workflow_steps.contains('ALL') || workflow_steps.contains("force_btk") ) { + if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") || workflow_steps.contains('ALL') ) { GENERATE_SAMPLESHEET ( - YAML_INPUT.out.reference_tuple, - YAML_INPUT.out.pacbio_tuple + YAML_INPUT.out.pacbio_tuple.collect() ) - ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) + //ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) + YAML_INPUT.out.reference_tuple.view() + GENERATE_SAMPLESHEET.out.csv.view() SANGER_TOL_BTK ( YAML_INPUT.out.reference_tuple, GENERATE_SAMPLESHEET.out.csv, - YAML_INPUT.out.blastp, - YAML_INPUT.out.blastn, - YAML_INPUT.out.blastx, + YAML_INPUT.out.diamond_uniprot_database_path, + YAML_INPUT.out.nt_database, + YAML_INPUT.out.diamond_uniprot_database_path, [], - YAML_INPUT.out.tax_dump, - YAML_INPUT.out.taxon, + YAML_INPUT.out.ncbi_taxonomy_path, + YAML_INPUT.out.btk_yaml, + YAML_INPUT.out.taxid, 'GCA_0001' ) //ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) - MERGE_BTK_DATASETS ( +/* MERGE_BTK_DATASETS ( CREATE_BTK_DATASET.out.btk_datasets, [[],[]], //SANGER_TOL_BTK.out.btk_datasets = [] [[],[]] //SANGER_TOL_BTK.out.summary = [] ) - ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) + ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) */ } @@ -478,7 +486,7 @@ workflow ASCC { // // SUBWORKFLOW: MERGES DATA THAT IS NOT USED IN THE CREATION OF THE BTK_DATASETS FOLDER // - ASCC_MERGE_TABLES ( +/* ASCC_MERGE_TABLES ( GC_CONTENT.out.txt, // FROM -- GC_COVERAGE.out.tsv ch_coverage, // FROM -- 
RUN_COVERAGE.out.tsv.map{it[1]} ch_tiara, // FROM -- TIARA_TIARA.out.classifications.map{it[1]} @@ -493,7 +501,7 @@ workflow ASCC { CREATE_BTK_DATASET.out.create_summary.map{it[1]}, [], // <-- BUSCO_BTK -- NOT IN PIPELINE YET ch_fcsgx // FROM -- PARSE_FCSGX_RESULT.out.fcsgxresult.map{it[1]} - ) + ) */ // // SUBWORKFLOW: Collates version data from prior subworflows From 5b7db2533c77b93ca5b269c9eb97f055ede85398 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 13 May 2024 10:05:53 +0100 Subject: [PATCH 057/117] Updates to all --- assets/btk_draft.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 assets/btk_draft.yaml diff --git a/assets/btk_draft.yaml b/assets/btk_draft.yaml new file mode 100644 index 00000000..34b1b3f0 --- /dev/null +++ b/assets/btk_draft.yaml @@ -0,0 +1,17 @@ +assembly: + level: bar +settings: + foo: 0 +similarity: + diamond_blastx: + foo: 0 +taxon: + class: class_name + family: family_name + genus: genus_name + kingdom: kingdom_name + name: species_name + order: order_name + phylum: phylum_name + superkingdom: superkingdom_name + taxid: 0 \ No newline at end of file From df53c2dd88ca2b4f60aefb37225ce2daa4a214fd Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 13 May 2024 10:07:34 +0100 Subject: [PATCH 058/117] Updates to all --- workflows/ascc.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index c408d08b..f1e648e5 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -447,7 +447,7 @@ workflow ASCC { // PIPELINE: PREPARE THE DATA FOR USE IN THE SANGER-TOL/BLOBTOOLKIT PIPELINE // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. 
- // + // This will also eventually check for the above run_btk boolean from autofilter if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") || workflow_steps.contains('ALL') ) { GENERATE_SAMPLESHEET ( From 57cbc06822caee706b8d3ddef70598a4933fef00 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 13 May 2024 10:16:52 +0100 Subject: [PATCH 059/117] Updates to all --- workflows/ascc.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index f1e648e5..fc8b9adc 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -428,7 +428,7 @@ workflow ASCC { .view() ch_autofiltered_assembly - .splitText( by: 10 ) + .text() .view() for (i in ch_autofiltered_assembly) { From e5563b05737e8f68dfff66f4b8acea14af74d02f Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 23 May 2024 09:55:02 +0100 Subject: [PATCH 060/117] Updates --- modules/local/autofiltering.nf | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index 9f8ab5df..7f47a81e 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -11,10 +11,11 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { tuple val(meta), path(reference) tuple val(tiara_meta), path(tiara_txt) tuple val(fcs_meta), path(fcs_csv) + path ncbi_rankedlineage output: - tuple val(meta), path("*autofiltered.fasta"), emit: decontaminated_assembly - tuple val(meta), path("fcs-gx_and_tiara_combined_summary.csv"), emit: fcs_tiara_summary + tuple val(meta), path("autofiltered.fasta"), emit: decontaminated_assembly + tuple val(meta), path("ABNORMAL_CHECK.csv"), emit: fcs_tiara_summary tuple val(meta), path("assembly_filtering_removed_sequences.txt"), emit: removed_seqs path("fcs-gx_alarm_indicator_file.txt"), emit: alarm_file path "versions.yml", emit: versions @@ -27,11 +28,12 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { $reference \\ --taxid $meta.taxid \\ --tiara 
$tiara_txt \\ - --fcsgx_sum $fcs_csv + --fcsgx_sum $fcs_csv \\ + --ncbi_rankedlineage_path $ncbi_rankedlineage \\ abnormal_contamination_check.py \\ $reference \\ - fcs-gx_and_tiara_combined_summary.csv + assembly_filtering_removed_sequences.txt cat <<-END_VERSIONS > versions.yml From 0116b8f7bd7d3ab5de08df7b04a9c00b1a473963 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 23 May 2024 09:55:15 +0100 Subject: [PATCH 061/117] Updates --- assets/test.yaml | 2 +- bin/abnormal_contamination_check.py | 2 +- bin/autofilter.py | 18 ++++-- docs/usage.md | 94 ++++++++++------------------- workflows/ascc.nf | 18 +++--- 5 files changed, 56 insertions(+), 78 deletions(-) diff --git a/assets/test.yaml b/assets/test.yaml index 1525975f..e80d4fb3 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -27,4 +27,4 @@ seqkit: sliding: 100000 window: 6000 n_neighbours: 13 -btk_yaml: "/nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/btk_draft.yaml" +btk_yaml: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/btk_draft.yaml diff --git a/bin/abnormal_contamination_check.py b/bin/abnormal_contamination_check.py index 7e5bbf74..bb434945 100755 --- a/bin/abnormal_contamination_check.py +++ b/bin/abnormal_contamination_check.py @@ -62,7 +62,7 @@ def load_fcs_gx_results(seq_dict, fcs_gx_and_tiara_summary_path): fcs_gx_and_tiara_summary_data = fcs_gx_and_tiara_summary_data[1 : len(fcs_gx_and_tiara_summary_data)] for line in fcs_gx_and_tiara_summary_data: split_line = line.split(",") - assert len(split_line) == 5 + assert len(split_line) == 4 # THIS WAS 5 seq_name = split_line[0] fcs_gx_action = split_line[1] seq_dict[seq_name]["fcs_gx_action"] = fcs_gx_action diff --git a/bin/autofilter.py b/bin/autofilter.py index 53b7cbcb..7d7de7aa 100755 --- a/bin/autofilter.py +++ b/bin/autofilter.py @@ -35,10 +35,20 @@ def parse_args(): parser.add_argument("fasta", type=str, help="Path to the fasta assembly file") parser.add_argument("-t", "--tiara", type=str, help="Path to the tiara summary 
file") parser.add_argument("-s", "--fcsgx_sum", type=str, help="Path to the fcs-gx_summary.csv file") - parser.add_argument("-a", "--auto_filtered", type=str, help="Path to the assembly_autofiltered.fasta file") + parser.add_argument( + "-o", + "--output_auto_filtered", + type=str, + help="Path to the assembly_autofiltered.fasta file", + default="autofiltered.fasta", + ) parser.add_argument("-c", "--combined_sum", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file") parser.add_argument( - "-r", "--rejected_seq", type=str, help="Path to the assembly_filtering_removed_sequences.txt file" + "-r", + "--rejected_seq", + type=str, + help="Path to the assembly_filtering_removed_sequences.txt file", + default="assembly_filtering_removed_sequences.txt", ) parser.add_argument("-i", "--taxid", type=int, help="NCBI taxonomy ID of the species") parser.add_argument( @@ -166,7 +176,7 @@ def main(): assembly_path = args.fasta tiara_results_path = args.tiara fcs_gx_summary_path = args.fcsgx_sum - filtered_assembly_path = args.auto_filtered + filtered_assembly_path = args.output_auto_filtered combined_summary = args.combined_sum excluded_seq_list_path = args.rejected_seq ncbi_rankedlist = args.ncbi_rankedlineage_path @@ -174,7 +184,7 @@ def main(): Path(f"./fasta/filtered").mkdir(parents=True, exist_ok=True) for i in [ncbi_rankedlist, tiara_results_path, fcs_gx_summary_path, assembly_path]: - if os.path.isfile(i) is False: + if not os.path.isfile(i): sys.stderr.write(f"{i} WAS NOT AT THE EXPECTED LOCATION\n") sys.exit(1) diff --git a/docs/usage.md b/docs/usage.md index 27730271..390973b8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,50 +6,43 @@ -## Samplesheet input +## Yaml input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. 
-```bash ---input '[path to samplesheet file]' -``` - -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` +### Full yaml -### Full samplesheet - -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. 
- -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +```yaml +assembly_path: PATH TO INPUT FASTA +assembly_title: NAME OF INPUT ORGANISM +sci_name: "{SCIENTIFIC NAME OF ORGANISM}" +taxid: 352914 +mito_fasta_path: PATH TO MITO FASTA +plastid_fasta_path: PATH TO PLASTID FASTA +reads_path: /path/to/pacbio/fasta/ +reads_type: "hifi" +pacbio_barcodes: FULL PATH TO /ascc/assets/pacbio_adaptors.fa +pacbio_multiplexing_barcode_names: "bc2008,bc2009" {BARCODES EXPECTED IN DATA} +kmer_len: 7 +dimensionality_reduction_methods: "pca,random_trees" A CSV OF THE BELOW METHODS +# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" +nt_database: PATH TO UPTO DATE BLASTDB NT DATABASE +nt_database_prefix: PREFIX FOR THE BLASTDB DATABASE +nt_kraken_db_path: PATH+PREFIX TO THE NT KRAKEN DATABASE +ncbi_accessionids_folder: PATH TO /accession2taxid/ +ncbi_taxonomy_path: PATH TO /taxdump/ +ncbi_rankedlineage_path: PATH TO /taxdump/rankedlineage.dmp +busco_lineages_folder: PATH TO THE BUSCO LINEAGES FOLDER +fcs_gx_database_path: PATH TO FOLDER CONTAINING THE FCS_GX DB +vecscreen_database_path: PATH TO VECSCREEN DB +diamond_uniprot_database_path: PATH TO uniprot_reference_proteomes_with_taxonnames.dmnd +diamond_nr_database_path: PATH TO /nr.dmnd +seqkit: + sliding: 100000 + window: 6000 +n_neighbours: 13 +btk_yaml: PATH TO /ascc/assets/btk_draft.yaml <- THIS IS DEFAULT AND ONLY SERVES TO BYPASS GCA REQUIREMENTS OF SANGER-TOL/BLOBTOOLKIT ``` -| 
Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | - -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. - ## Running the pipeline The typical command for running the pipeline is as follows: @@ -69,29 +62,6 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` -If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. - -Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. - -> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). - -The above pipeline run specified with a params file in yaml format: - -```bash -nextflow run sanger-tol/ascc -profile docker -params-file params.yaml -``` - -with `params.yaml` containing: - -```yaml -input: './samplesheet.csv' -outdir: './results/' -genome: 'GRCh37' -<...> -``` - -You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). 
- ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/workflows/ascc.nf b/workflows/ascc.nf index fc8b9adc..2ade2265 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -420,18 +420,16 @@ workflow ASCC { AUTOFILTER_AND_CHECK_ASSEMBLY ( YAML_INPUT.out.reference_tuple, EXTRACT_TIARA_HITS.out.ch_tiara, - RUN_FCSGX.out.fcsgxresult + RUN_FCSGX.out.fcsgxresult, + YAML_INPUT.out.ncbi_rankedlineage_path ) ch_autofiltered_assembly = AUTOFILTER_AND_CHECK_ASSEMBLY.out.decontaminated_assembly.map{it[1]} -/* ch_autofiltered_assembly - .view() - - ch_autofiltered_assembly - .text() + AUTOFILTER_AND_CHECK_ASSEMBLY.out.fcs_tiara_summary + .map { id, file -> file.text.trim() } .view() - for (i in ch_autofiltered_assembly) { +/* for (i in ch_autofiltered_assembly) { if (i.contains("YES_ABNORMAL")) { run_btk = true break @@ -448,7 +446,7 @@ workflow ASCC { // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. 
// This will also eventually check for the above run_btk boolean from autofilter - if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") || workflow_steps.contains('ALL') ) { +/* if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") || workflow_steps.contains('ALL') ) { GENERATE_SAMPLESHEET ( YAML_INPUT.out.pacbio_tuple.collect() @@ -469,7 +467,7 @@ workflow ASCC { YAML_INPUT.out.btk_yaml, YAML_INPUT.out.taxid, 'GCA_0001' - ) + ) */ //ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) @@ -480,7 +478,7 @@ workflow ASCC { ) ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) */ - } + //} // From e4bda7650641315692a1430b493fd80fb06d419f Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 28 May 2024 14:37:31 +0100 Subject: [PATCH 062/117] Fixing the autofilter, wrong file was being passed and booling the output was incomplete -- fixed --- bin/abnormal_contamination_check.py | 2 +- bin/autofilter.py | 13 +- bin/remove_fcs_gx_and_tiara_contams.py | 205 ------------------------- docs/usage.md | 4 +- modules/local/autofiltering.nf | 3 +- workflows/ascc.nf | 27 ++-- 6 files changed, 24 insertions(+), 230 deletions(-) delete mode 100755 bin/remove_fcs_gx_and_tiara_contams.py diff --git a/bin/abnormal_contamination_check.py b/bin/abnormal_contamination_check.py index bb434945..7e5bbf74 100755 --- a/bin/abnormal_contamination_check.py +++ b/bin/abnormal_contamination_check.py @@ -62,7 +62,7 @@ def load_fcs_gx_results(seq_dict, fcs_gx_and_tiara_summary_path): fcs_gx_and_tiara_summary_data = fcs_gx_and_tiara_summary_data[1 : len(fcs_gx_and_tiara_summary_data)] for line in fcs_gx_and_tiara_summary_data: split_line = line.split(",") - assert len(split_line) == 4 # THIS WAS 5 + assert len(split_line) == 5 seq_name = split_line[0] fcs_gx_action = split_line[1] seq_dict[seq_name]["fcs_gx_action"] = fcs_gx_action diff --git a/bin/autofilter.py b/bin/autofilter.py index 7d7de7aa..11ccc83d 100755 --- 
a/bin/autofilter.py +++ b/bin/autofilter.py @@ -198,25 +198,24 @@ def main(): for scaff in scaffs: fcs_gx_action = "NA" tiara_action = "NA" - if scaff in fcs_gx_action_dict: fcs_gx_action = fcs_gx_action_dict[scaff] - + combined_action_source = "FCS-GX" if scaff in tiara_action_dict: tiara_action = tiara_action_dict[scaff] - combined_action = fcs_gx_action - if fcs_gx_action == "NA" and tiara_action == "EXCLUDE": combined_action = "EXCLUDE" - + combined_action_source = "Tiara" + if fcs_gx_action == "EXCLUDE" and tiara_action == "EXCLUDE": + combined_action_source = "FCS-GX_and_Tiara" if combined_action == "EXCLUDE": scaffs_to_exclude.append(scaff) - combined_action_dict[scaff] = { "fcs_gx_action": fcs_gx_action, "tiara_action": tiara_action, "combined_action": combined_action, + "combined_action_source": combined_action_source, } filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path) gpf.export_list_as_line_break_separated_file(scaffs_to_exclude, excluded_seq_list_path) @@ -224,7 +223,7 @@ def main(): out_csv_list = list() out_csv_list.append("scaff,fcs_gx_action,tiara_action,combined_action") for scaff, scaff_properties in combined_action_dict.items(): - out_line = f"{scaff},{scaff_properties['fcs_gx_action']},{scaff_properties['tiara_action']},{scaff_properties['combined_action']}" + out_line = f"{scaff},{scaff_properties['fcs_gx_action']},{scaff_properties['tiara_action']},{scaff_properties['combined_action']},{scaff_properties['combined_action_source']}" out_csv_list.append(out_line) gpf.export_list_as_line_break_separated_file(out_csv_list, "ABNORMAL_CHECK.csv") diff --git a/bin/remove_fcs_gx_and_tiara_contams.py b/bin/remove_fcs_gx_and_tiara_contams.py deleted file mode 100755 index aa0aadb8..00000000 --- a/bin/remove_fcs_gx_and_tiara_contams.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env python3 -""" -Script for filtering the assembly to remove putative contaminants based on FGCS-GX and Tiara results -""" - -import 
general_purpose_functions as gpf -import os -import sys -import argparse -from pathlib import Path -import csv - - -def get_domain_from_taxid(query_taxid, rankedlineage_path): - """ - Input: 1) a taxID, 2) path to the NCBI rankedlineage.dmp file - Output: domain classification corresponding to the taxID - """ - domain = None - query_taxid = str(query_taxid) - rankedlineage_data = gpf.ll(rankedlineage_path) - for line in rankedlineage_data: - split_line = line.split("|") - split_line = [n.strip() for n in split_line] - assert len(split_line) == 11 - taxid = split_line[0] - domain = split_line[9] - if taxid == query_taxid: - domain = split_line[9] - if domain not in ("", "Archaea", "Bacteria", "Eukaryota", "Viruses"): - sys.stderr.write(f"Unrecognised value for domain-level taxonomy: {domain}") - sys.exit(1) - break - if domain is None: - sys.stderr.write( - "The domain for taxid ({}) was not found in the NCBI rankedlineage.dmp file ({})\n".format( - query_taxid, rankedlineage_path - ) - ) - sys.exit(1) - return domain - - -def process_tiara_results(tiara_results_path, target_domain): - """ - Input: 1) path to the main output file of Tiara, 2) the domain of the target species - Output: dictionary where the keys are scaffold names and the values are the decontamination action based on Tiara results - ('keep' or 'exclude') - """ - tiara_action_dict = dict() - - allowed_classif_dict = dict() - allowed_classif_dict[""] = ["archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown"] - allowed_classif_dict["Archaea"] = ["archaea", "prokarya", "unknown"] - allowed_classif_dict["Bacteria"] = ["bacteria", "prokarya", "unknown"] - allowed_classif_dict["Eukaryota"] = ["eukarya", "organelle", "unknown"] - allowed_classif_dict["Viruses"] = ["archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown"] - allowed_classif_list = allowed_classif_dict[target_domain] - - tiara_output = gpf.ll(tiara_results_path) - for counter, line in enumerate(tiara_output): - if 
counter == 0: - continue - split_line = line.split() - assert len(split_line) == 3 - tiara_class_fst_stage = split_line[1] - assert tiara_class_fst_stage in ("archaea", "bacteria", "prokarya", "eukarya", "organelle", "unknown") - tiara_action = "KEEP" - if tiara_class_fst_stage not in allowed_classif_list: - tiara_action = "EXCLUDE" - scaff = split_line[0] - tiara_action_dict[scaff] = tiara_action - return tiara_action_dict - - -def get_fcs_gx_action_dict(fcs_gx_summary_path): - """ - Input: path to FCS-GX summary CSV file (produced by ascc_parse_fcsgx_results.py) - Output: dictionary where the keys are scaffold names and the values are the FCS-GX action values - """ - fcs_gx_action_dict = dict() - fcs_gx_summary_data = gpf.ll(fcs_gx_summary_path) - for counter, line in enumerate(fcs_gx_summary_data): - if counter == 0: - continue - split_line = line.split(",") - scaff = split_line[0] - fcs_gx_action = split_line[8] - fcs_gx_action_dict[scaff] = fcs_gx_action - return fcs_gx_action_dict - - -def get_scaff_names(assembly_path): - """ - Reads FASTA headers from a FASTA file and returns them as a list - """ - scaffs = list() - fasta_data = gpf.read_fasta_in_chunks(assembly_path) - for fasta_tuple in fasta_data: - scaffs.append(fasta_tuple[0]) - return scaffs - - -def filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path): - """ - Filters a genome assembly FASTA file to remove sequences that are listed in the scaffs_to_exclude list - """ - out_list = list() - fasta_data = gpf.read_fasta_in_chunks(assembly_path) - for header, seq in fasta_data: - if header not in scaffs_to_exclude: - out_list.append(">" + header) - split_seq = gpf.split_with_fixed_row_length(seq, 80) - out_list.extend(split_seq) - else: - sys.stderr.write( - f"Excluding the sequence {header} from the filtered assembly ({filtered_assembly_path}), as it appears to be a contaminant based on FCS-GX and/or Tiara results\n" - ) - gpf.export_list_as_line_break_separated_file(out_list, 
filtered_assembly_path) - - -def main(pipeline_run_folder, taxid, rankedlineage_path): - if taxid == -1: - sys.stderr.write( - "The filtering of assembly based on FCS-GX and Tiara results requires a taxID but a valid taxID has not been provided (the provided taxID is -1, which is a placeholder value)\n" - ) - - assembly_path = f"{pipeline_run_folder}/fasta/assembly.fasta" - tiara_results_path = f"{pipeline_run_folder}/collected_tables/tiara_out.txt" - fcs_gx_summary_path = f"{pipeline_run_folder}/collected_tables/fcs-gx_summary.csv" - filtered_assembly_path = f"{pipeline_run_folder}/fasta/filtered/assembly_autofiltered.fasta" - assembly_filtering_summary_table_path = ( - f"{pipeline_run_folder}/collected_tables/fcs-gx_and_tiara_combined_summary.csv" - ) - excluded_seq_list_path = f"{pipeline_run_folder}/collected_tables/assembly_filtering_removed_sequences.txt" - - Path(f"{pipeline_run_folder}/fasta/filtered").mkdir(parents=True, exist_ok=True) - - if os.path.isfile(rankedlineage_path) is False: - sys.stderr.write( - f"The NCBI rankedlineage.dmp file was not found at the expected location ({rankedlineage_path})\n" - ) - sys.exit(1) - if os.path.isfile(tiara_results_path) is False: - sys.stderr.write(f"The Tiara output file was not found at the expected location ({tiara_results_path})\n") - sys.exit(1) - if os.path.isfile(fcs_gx_summary_path) is False: - sys.stderr.write( - f"The FCS-GX results summary file was not found at the expected location ({fcs_gx_summary_path})\n" - ) - sys.exit(1) - if os.path.isfile(assembly_path) is False: - sys.stderr.write(f"The assembly FASTA file was not found at the expected location ({assembly_path})\n") - sys.exit(1) - - target_domain = get_domain_from_taxid(taxid, rankedlineage_path) - tiara_action_dict = process_tiara_results(tiara_results_path, target_domain) - - fcs_gx_action_dict = get_fcs_gx_action_dict(fcs_gx_summary_path) - - combined_action_dict = dict() - scaffs_to_exclude = list() - scaffs = 
get_scaff_names(assembly_path) - for scaff in scaffs: - fcs_gx_action = "NA" - tiara_action = "NA" - if scaff in fcs_gx_action_dict: - fcs_gx_action = fcs_gx_action_dict[scaff] - if scaff in tiara_action_dict: - tiara_action = tiara_action_dict[scaff] - combined_action = fcs_gx_action - if fcs_gx_action == "NA" and tiara_action == "EXCLUDE": - combined_action = "EXCLUDE" - if combined_action == "EXCLUDE": - scaffs_to_exclude.append(scaff) - combined_action_dict[scaff] = { - "fcs_gx_action": fcs_gx_action, - "tiara_action": tiara_action, - "combined_action": combined_action, - } - filter_assembly(assembly_path, scaffs_to_exclude, filtered_assembly_path) - gpf.export_list_as_line_break_separated_file(scaffs_to_exclude, excluded_seq_list_path) - - # csv_writer = csv.writer(open(assembly_filtering_summary_table_path, "w")) - # for key, value in combined_action_dict.items(): - # line = [key] - # for ik, iv in value.items(): - # line.append(ik) - # line.extend([v for v in iv]) - # csv_writer.writerow(line) - out_csv_list = list() - out_csv_list.append("scaff,fcs_gx_action,tiara_action,combined_action") - for scaff, scaff_properties in combined_action_dict.items(): - out_line = f"{scaff},{scaff_properties['fcs_gx_action']},{scaff_properties['tiara_action']},{scaff_properties['combined_action']}" - out_csv_list.append(out_line) - gpf.export_list_as_line_break_separated_file(out_csv_list, assembly_filtering_summary_table_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("pipeline_run_folder", type=str, help="Path to the directory where the pipeline is be run") - parser.add_argument("taxid", type=int, help="NCBI taxonomy ID of the species") - parser.add_argument("ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy") - args = parser.parse_args() - main(args.pipeline_run_folder, args.taxid, args.ncbi_rankedlineage_path) diff --git a/docs/usage.md b/docs/usage.md index 
390973b8..bd4c33b4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -48,10 +48,10 @@ btk_yaml: PATH TO /ascc/assets/btk_draft.yaml <- THIS IS DEFAULT AND ONLY SERVES The typical command for running the pipeline is as follows: ```bash -nextflow run sanger-tol/ascc --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run sanger-tol/ascc --input {INPUT YAML} --outdir {OUTDIR} --steps {CSV LIST OF STEPS TO RUN} -profile singularity ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `singularity` configuration profile. See below for more information about profiles. Note that the pipeline will create the following files in your working directory: diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index 7f47a81e..7bc294b1 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -33,8 +33,7 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { abnormal_contamination_check.py \\ $reference \\ - assembly_filtering_removed_sequences.txt - + ABNORMAL_CHECK.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 2ade2265..2c69c90d 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -415,7 +415,6 @@ workflow ASCC { // // MODULE: AUTOFILTER ASSEMBLY BY TIARA AND FCSGX RESULTS // - run_btk = false if ( workflow_steps.contains('tiara') && workflow_steps.contains('fcsgx') && workflow_steps.contains("autofilter") || workflow_steps.contains('ALL') ) { AUTOFILTER_AND_CHECK_ASSEMBLY ( YAML_INPUT.out.reference_tuple, @@ -425,16 +424,18 @@ workflow ASCC { ) ch_autofiltered_assembly = AUTOFILTER_AND_CHECK_ASSEMBLY.out.decontaminated_assembly.map{it[1]} - AUTOFILTER_AND_CHECK_ASSEMBLY.out.fcs_tiara_summary - .map { id, file -> file.text.trim() } - .view() - -/* for (i in ch_autofiltered_assembly) { - if (i.contains("YES_ABNORMAL")) { - 
run_btk = true - break + AUTOFILTER_AND_CHECK_ASSEMBLY.out.alarm_file + .map { file -> file.text.trim() } + .branch { it -> + run_btk: "ABNORMAL" ? it.contains("YES_ABNORMAL"): false + dont_run: [] } - } */ + .set { btk_bool } + + btk_bool.run_btk.view{ it -> "ABNORMALS == $it" } + + btk_bool.dont_run.view{ it -> "NORMALS == $it" } + ch_versions = ch_versions.mix(AUTOFILTER_AND_CHECK_ASSEMBLY.out.versions) } else { @@ -446,7 +447,7 @@ workflow ASCC { // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. // This will also eventually check for the above run_btk boolean from autofilter -/* if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") || workflow_steps.contains('ALL') ) { + if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || workflow_steps.contains('ALL') ) { GENERATE_SAMPLESHEET ( YAML_INPUT.out.pacbio_tuple.collect() @@ -467,7 +468,7 @@ workflow ASCC { YAML_INPUT.out.btk_yaml, YAML_INPUT.out.taxid, 'GCA_0001' - ) */ + ) //ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) @@ -478,7 +479,7 @@ workflow ASCC { ) ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) */ - //} + } // From 61da0acd47fee2d9ce3acc7742fb402aabf9ea9a Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 11 Jun 2024 14:43:09 +0100 Subject: [PATCH 063/117] Updates --- modules/local/create_btk_dataset.nf | 8 ++--- modules/local/sanger_tol_btk.nf | 6 ++-- subworkflows/local/run_fcsgx.nf | 4 +-- workflows/ascc.nf | 55 ++++++++++++++--------------- 4 files changed, 35 insertions(+), 38 deletions(-) diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 60af4877..ec21e8a7 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -2,10 +2,10 @@ process CREATE_BTK_DATASET { tag "$meta.id" label 'process_medium' - conda 
"conda-forge::python=3.9" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'biocontainers/python:3.9' }" + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_CHUNK module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "genomehubs/blobtoolkit:4.3.9" input: tuple val(meta), path(reference) diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 17e99cbd..e83ad6a2 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -15,10 +15,10 @@ process SANGER_TOL_BTK { val gca_accession output: - path("${meta.id}_btk_out/plots"), emit: btk_plots + // path("${meta.id}_btk_out/plots"), emit: btk_plots path("${meta.id}_btk_out/busco"), emit: btk_busco path("${meta.id}_btk_out/multiqc"), emit: btk_multiqc - path("${meta.id}_btk_out/blobtoolkit_pipeline_info"), emit: btk_pipeline + path("blobtoolkit_pipeline_info"), emit: btk_pipeline script: def prefix = task.ext.prefix ?: "${meta.id}" @@ -26,7 +26,7 @@ process SANGER_TOL_BTK { def executor = task.ext.executor ?: "" def profiles = task.ext.profiles ?: "" def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" - def btk_config = btk_config_file ? "-c $btk_config_file" : "" + def btk_config = btk_config_file ? 
"-c $btk_config_file" : "" def pipeline_version = task.ext.version ?: "main" // YAML used to avoid the use of GCA accession number // https://github.com/sanger-tol/blobtoolkit/issues/77 diff --git a/subworkflows/local/run_fcsgx.nf b/subworkflows/local/run_fcsgx.nf index 7c4e04bb..78f220ad 100644 --- a/subworkflows/local/run_fcsgx.nf +++ b/subworkflows/local/run_fcsgx.nf @@ -1,5 +1,5 @@ -include { FCS_FCSGX } from '../../modules/nf-core/fcs/fcsgx/main' -include { PARSE_FCSGX_RESULT } from '../../modules/local/parse_fcsgx_result' +include { FCS_FCSGX } from '../../modules/nf-core/fcs/fcsgx/main' +include { PARSE_FCSGX_RESULT } from '../../modules/local/parse_fcsgx_result' workflow RUN_FCSGX { diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 2c69c90d..3d7a5b9e 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -88,6 +88,21 @@ workflow ASCC { ) ch_versions = ch_versions.mix(YAML_INPUT.out.versions) + // + // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE + // + YAML_INPUT.out.reference_tuple + .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() ) + .combine ( YAML_INPUT.out.seqkit_window.toInteger() ) + .map { meta, ref, sliding, window -> + tuple([ id : meta.id, + sliding : sliding, + window : window + ], + file(ref) + )} + .set { modified_input } + // // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA // @@ -155,25 +170,15 @@ workflow ASCC { ch_tiara = [] } - // - // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE - // - YAML_INPUT.out.reference_tuple - .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() ) - .combine ( YAML_INPUT.out.seqkit_window.toInteger() ) - .map { meta, ref, sliding, window -> - tuple([ id : meta.id, - sliding : sliding, - window : window - ], - file(ref) - )} - .set { modified_input } - // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST // if ( workflow_steps.contains('nt_blast') || workflow_steps.contains('ALL') ) { + // + // NOTE: ch_nt_blast needs to be set in two places incase it + // fails during the 
run + // + ch_nt_blast = [] EXTRACT_NT_BLAST ( modified_input, YAML_INPUT.out.nt_database, @@ -182,6 +187,7 @@ workflow ASCC { ) ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) ch_nt_blast = EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} + } else { ch_nt_blast = [] } @@ -238,7 +244,6 @@ workflow ASCC { ch_chloro = [] } - // // SUBWORKFLOW: // @@ -384,11 +389,11 @@ workflow ASCC { un_full = [] } - // mix the outputs of the outpuutting process so that we can + // mix the outputs of the outputting process so that we can // insert them into the one process to create the btk and the merged report // much like the versions channel -/* CREATE_BTK_DATASET ( + CREATE_BTK_DATASET ( GENERATE_GENOME.out.reference_tuple, GENERATE_GENOME.out.dot_genome.map{it[1]}, ch_kmers, @@ -403,15 +408,10 @@ workflow ASCC { nt_hits, un_hits, YAML_INPUT.out.ncbi_taxonomy_path, - - )*/ + ) //ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) - //SANGER_TOL_BTK.out.btk_datasets = [] - //SANGER_TOL_BTK.out.summary = [] - - // // MODULE: AUTOFILTER ASSEMBLY BY TIARA AND FCSGX RESULTS // @@ -454,9 +454,6 @@ workflow ASCC { ) //ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) - - YAML_INPUT.out.reference_tuple.view() - GENERATE_SAMPLESHEET.out.csv.view() SANGER_TOL_BTK ( YAML_INPUT.out.reference_tuple, GENERATE_SAMPLESHEET.out.csv, @@ -476,8 +473,8 @@ workflow ASCC { CREATE_BTK_DATASET.out.btk_datasets, [[],[]], //SANGER_TOL_BTK.out.btk_datasets = [] [[],[]] //SANGER_TOL_BTK.out.summary = [] - ) - ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) */ + ) */ + //ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) } From dd247e8c8eb2481648532662b70f2347d81d6148 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 20 Jun 2024 14:52:18 +0100 Subject: [PATCH 064/117] Updates --- modules.json | 5 +- modules/local/create_btk_dataset.nf | 70 ++++++++++--------- .../nf-core/blast/blastn/blast-blastn.diff | 53 -------------- 
modules/nf-core/blast/blastn/environment.yml | 3 +- modules/nf-core/blast/blastn/main.nf | 36 ++++++---- modules/nf-core/blast/blastn/meta.yml | 15 +++- .../nf-core/blast/blastn/tests/main.nf.test | 47 ++++++++++--- subworkflows/local/extract_nt_blast.nf | 50 ++----------- subworkflows/local/yaml_input.nf | 15 +++- workflows/ascc.nf | 55 ++++++++++----- 10 files changed, 167 insertions(+), 182 deletions(-) delete mode 100644 modules/nf-core/blast/blastn/blast-blastn.diff diff --git a/modules.json b/modules.json index f8c9ac15..0acbe5f1 100644 --- a/modules.json +++ b/modules.json @@ -7,9 +7,8 @@ "nf-core": { "blast/blastn": { "branch": "master", - "git_sha": "acacb4075ef46fa74630aa3f4b0684f1021d5930", - "installed_by": ["modules"], - "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" + "git_sha": "4262a04142431275e54e1f4b413628a2201ed6e6", + "installed_by": ["modules"] }, "blast/makeblastdb": { "branch": "master", diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index ec21e8a7..1037cddc 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -3,25 +3,43 @@ process CREATE_BTK_DATASET { label 'process_medium' if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "BLOBTOOLKIT_CHUNK module does not support Conda. Please use Docker / Singularity / Podman instead." + exit 1, "CREATE_BTK_DATASET module does not support Conda. Please use Docker / Singularity / Podman instead." 
} - container "genomehubs/blobtoolkit:4.3.9" + container "docker.io/genomehubs/blobtoolkit:4.3.9" input: tuple val(meta), path(reference) - path dot_genome, stageAs: "?/SORTED.genome" - path kmers, stageAs: "?/KMERS_dim_reduction_embeddings_combined.csv" - path tiara, stageAs: "?/TIARA.txt" - path nt_blast, stageAs: "?/BLAST_HITS.tsv" - path fcsgx, stageAs: "?/FCSGX_parsed.csv" - path mapped_bam, stageAs: "?/MAPPED.bam" - path coverage, stageAs: "?/COVERAGE_AVERAGE.txt" - path kraken_class, stageAs: "?/KRAKEN_CLASSIFIED.txt" - path kraken_report, stageAs: "?/KRAKEN_REPORT.txt" - path kraken_lineage, stageAs: "?/KRAKEN_LINEAGE.txt" - path nt_diamond, stageAs: "?/NUCLEOT_DIAMOND_FULL.tsv" - path un_diamond, stageAs: "?/UNIPROT_DIAMOND_FULL.tsv" - path ncbi_taxdump + path dot_genome, stageAs: "SORTED.genome" + path kmers, stageAs: "KMERS_dim_reduction_embeddings_combined.csv" + path tiara, stageAs: "TIARA.txt" + path nt_blast, stageAs: "BLAST_HITS.tsv" + path fcsgx, stageAs: "FCSGX_parsed.csv" + path mapped_bam, stageAs: "MAPPED.bam" + path coverage, stageAs: "COVERAGE_AVERAGE.txt" + path kraken_class, stageAs: "KRAKEN_CLASSIFIED.txt" + path kraken_report, stageAs: "KRAKEN_REPORT.txt" + path kraken_lineage, stageAs: "KRAKEN_LINEAGE.txt" + path nt_diamond, stageAs: "NUCLEOT_DIAMOND_FULL.tsv" + path un_diamond, stageAs: "UNIPROT_DIAMOND_FULL.tsv" + path ncbi_taxdump, stageAs: "TAXDUMP" + + + /* + -f ${reference} \\ + -d ./1/ \\ + -n "${prefix}" \\ + -tn "${meta.sci_name}" \\ + -id ${meta.taxid} \\ + -td ${ncbi_taxdump}/ \\ + $blastn_arg \\ + $nt_diamond_arg \\ + $un_diamond_arg \\ + $kraken_arg \\ + $mapped_arg \\ + $tiara_arg \\ + $pca_arg \\ + $fcs_arg \\ + $args */ output: tuple val(meta), path("btk_datasets"), emit: btk_datasets @@ -33,7 +51,7 @@ process CREATE_BTK_DATASET { script: def prefix = task.ext.prefix ?: "${meta.id}" def args = task.ext.args ?: "" - def blastn_arg = nt_blast ? "-bh ${nt_blast}" : "" +/* def blastn_arg = nt_blast ? 
"-bh ${nt_blast}" : "" def nt_diamond_arg = nt_diamond ? "-nr ${nt_diamond}" : "" def un_diamond_arg = un_diamond ? "-ud ${un_diamond}" : "" def kraken_arg = kraken_lineage ? "-k ${kraken_lineage}": "" @@ -42,27 +60,13 @@ process CREATE_BTK_DATASET { def pca_arg = kmers ? "-p ${kmers}" : "" def fcs_arg = fcsgx ? "-fc ${fcsgx}" : "" def marker_arg = "" - def contigviz_arg = "" + def contigviz_arg = "" */ """ mkdir -p btk_datasets/ - create_btk_dataset_V2.py \\ - -f ${reference} \\ - -d ./1/ \\ - -n "${prefix}" \\ - -tn "${meta.sci_name}" \\ - -id ${meta.taxid} \\ - -td ${ncbi_taxdump}/ \\ - $blastn_arg \\ - $nt_diamond_arg \\ - $un_diamond_arg \\ - $kraken_arg \\ - $mapped_arg \\ - $tiara_arg \\ - $pca_arg \\ - $fcs_arg \\ - $args + create_btk_dataset_V2.py -h\\ + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff deleted file mode 100644 index 320fc3a1..00000000 --- a/modules/nf-core/blast/blastn/blast-blastn.diff +++ /dev/null @@ -1,53 +0,0 @@ -Changes in module 'nf-core/blast/blastn' ---- modules/nf-core/blast/blastn/main.nf -+++ modules/nf-core/blast/blastn/main.nf -@@ -8,8 +8,8 @@ - 'biocontainers/blast:2.14.1--pl5321h6f7f691_0' }" - - input: -- tuple val(meta), path(fasta) -- path db -+ tuple val(meta), path(fasta) -+ tuple val(meta2), path(db) - - output: - tuple val(meta), path('*.txt'), emit: txt -@@ -19,16 +19,17 @@ - task.ext.when == null || task.ext.when - - script: -- def args = task.ext.args ?: '' -- def prefix = task.ext.prefix ?: "${meta.id}" -+ def args = task.ext.args ?: '' -+ def prefix = task.ext.prefix ?: "${meta.id}" -+ def db_prefix = task.ext.dbprefix ?: "${meta2.db_prefix}" - """ -- DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` -+ DB=`find -L ./ -name "${db_prefix}.nin" | sed 's/\\.nin\$//'` - blastn \\ - -num_threads $task.cpus \\ - -db \$DB \\ - -query $fasta \\ - $args \\ -- -out ${prefix}.txt -+ -out 
${prefix}-${db_prefix}.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": -@@ -37,10 +38,11 @@ - """ - - stub: -- def args = task.ext.args ?: '' -- def prefix = task.ext.prefix ?: "${meta.id}" -+ def args = task.ext.args ?: '' -+ def prefix = task.ext.prefix ?: "${meta.id}" -+ def db_prefix = task.ext.dbprefix ?: "${meta2.db_prefix}" - """ -- touch ${prefix}.txt -+ touch ${prefix}-${db_prefix}.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - -************************************************************ diff --git a/modules/nf-core/blast/blastn/environment.yml b/modules/nf-core/blast/blastn/environment.yml index e2a15166..e4a72800 100644 --- a/modules/nf-core/blast/blastn/environment.yml +++ b/modules/nf-core/blast/blastn/environment.yml @@ -1,6 +1,7 @@ +name: blast_blastn channels: - conda-forge - bioconda - defaults dependencies: - - bioconda::blast=2.14.1 + - bioconda::blast=2.15.0 diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf index 9136b9fb..68b43ba4 100644 --- a/modules/nf-core/blast/blastn/main.nf +++ b/modules/nf-core/blast/blastn/main.nf @@ -4,11 +4,11 @@ process BLAST_BLASTN { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/blast:2.14.1--pl5321h6f7f691_0': - 'biocontainers/blast:2.14.1--pl5321h6f7f691_0' }" + 'https://depot.galaxyproject.org/singularity/blast:2.15.0--pl5321h6f7f691_1': + 'biocontainers/blast:2.15.0--pl5321h6f7f691_1' }" input: - tuple val(meta), path(fasta) + tuple val(meta) , path(fasta) tuple val(meta2), path(db) output: @@ -19,16 +19,27 @@ process BLAST_BLASTN { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def db_prefix = task.ext.dbprefix ?: "${meta2.db_prefix}" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? fasta.getBaseName() : fasta + """ - DB=`find -L ./ -name "${db_prefix}.nin" | sed 's/\\.nin\$//'` + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${fasta} > ${fasta_name} + fi + + DB=`find -L ./ -name "*.nal" | sed 's/\\.nal\$//'` + if [ -z "\$DB" ]; then + DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` + fi + echo Using \$DB + blastn \\ - -num_threads $task.cpus \\ + -num_threads ${task.cpus} \\ -db \$DB \\ - -query $fasta \\ - $args \\ + -query ${fasta_name} \\ + ${args} \\ -out ${prefix}.txt cat <<-END_VERSIONS > versions.yml @@ -38,9 +49,8 @@ process BLAST_BLASTN { """ stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def db_prefix = task.ext.dbprefix ?: "${meta2.db_prefix}" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.txt diff --git a/modules/nf-core/blast/blastn/meta.yml b/modules/nf-core/blast/blastn/meta.yml index 5fff6a7b..a0d64dd6 100644 --- a/modules/nf-core/blast/blastn/meta.yml +++ b/modules/nf-core/blast/blastn/meta.yml @@ -22,12 +22,22 @@ input: - fasta: type: file description: Input fasta file containing queries sequences - pattern: "*.{fa,fasta}" + pattern: 
"*.{fa,fasta,fa.gz,fasta.gz}" + - meta2: + type: map + description: | + Groovy Map containing db information + e.g. [ id:'test2', single_end:false ] - db: type: directory - description: Directory containing blast database + description: Directory containing the blast database pattern: "*" output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] - txt: type: file description: File containing blastn hits @@ -39,7 +49,6 @@ output: authors: - "@joseespinosa" - "@drpatelh" - - "@vagkaratzas" maintainers: - "@joseespinosa" - "@drpatelh" diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test b/modules/nf-core/blast/blastn/tests/main.nf.test index a6badbc4..02ecfab5 100644 --- a/modules/nf-core/blast/blastn/tests/main.nf.test +++ b/modules/nf-core/blast/blastn/tests/main.nf.test @@ -8,27 +8,52 @@ nextflow_process { tag "modules_nfcore" tag "blast" tag "blast/blastn" + tag "blast/makeblastdb" + + setup { + run("BLAST_MAKEBLASTDB") { + script "../../makeblastdb/main.nf" + process { + """ + input[0] = [ [id:'test2'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + } test("Should search for nucleotide hits against a blast db") { - setup { - run("BLAST_MAKEBLASTDB") { - script "../../makeblastdb/main.nf" - process { - """ - input[0] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] - """ - } + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[1] = BLAST_MAKEBLASTDB.out.db + """ } } + then { + assertAll( + { assert process.success }, + { assert path(process.out.txt[0][1]).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("Should search for zipped 
nucleotide hits against a blast db") { + when { params { outdir = "$outputDir" } process { """ - input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta_gz'], checkIfExists: true) ] input[1] = BLAST_MAKEBLASTDB.out.db """ } @@ -37,8 +62,8 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert path(process.out.txt.get(0).get(1)).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, - { assert process.out.versions } + { assert path(process.out.txt[0][1]).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, + { assert snapshot(process.out.versions).match("versions_zipped") } ) } diff --git a/subworkflows/local/extract_nt_blast.nf b/subworkflows/local/extract_nt_blast.nf index c1956b1a..1b48fb69 100644 --- a/subworkflows/local/extract_nt_blast.nf +++ b/subworkflows/local/extract_nt_blast.nf @@ -24,56 +24,12 @@ workflow EXTRACT_NT_BLAST { SEQKIT_SLIDING ( input_genome ) ch_versions = ch_versions.mix(SEQKIT_SLIDING.out.versions) - // - // LOGIC: GLOB ALL *NIN FILES IN DIRECTORY AND SPLIT INTO CHANNELS - // - - blastn_db_path - .map( - it -> file("${it}*") // glob all files in directory - ) - .flatten() // flatten to file per channel - .map( - it -> - tuple ( - [ id: it.toString().split('/')[-1].split("\\....\$")[0] ], // get basename and trim off the extension, returns database prefix - it // list of files - ) - ) - .groupTuple() // group files by id (which = db prefix) - .map { - meta, files -> - tuple ( - [ id: meta.id, - file_count: files.size() ], // get number of files - files - ) - } - .filter { it[0].file_count >= 8 } // a database is made of 8 files, less than this means it is an accessory to the db - .set { databases_by_prefix } - - databases_by_prefix - .combine( blastn_db_path ) - .map { meta, files, rootpath 
-> - tuple( rootpath, meta.id ) - } - .combine ( SEQKIT_SLIDING.out.fastx ) - .multiMap { root, db_prefix, meta, ref -> - reference: tuple( [ id: meta.id ], - ref - ) - nin_db: tuple( [ id: db_prefix ], - root - ) - } - .set { nin } - // // MODULE: BLASTS THE INPUT GENOME AGAINST A LOCAL NCBI DATABASE // BLAST_BLASTN_MOD ( - nin.reference, - nin.nin_db + SEQKIT_SLIDING.out.fastx, + blastn_db_path ) ch_versions = ch_versions.mix(BLAST_BLASTN_MOD.out.versions) @@ -102,6 +58,8 @@ workflow EXTRACT_NT_BLAST { BLAST_CHUNK_TO_FULL ( blast_results ) ch_versions = ch_versions.mix(BLAST_CHUNK_TO_FULL.out.versions) + BLAST_CHUNK_TO_FULL.out.full.view{ $it -> "SUPPOSED TO BE-1: $it"} + // // MODULE: RE_ORDER THE DATA IN THE FULL_COORDINATE FILE // diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index e4d4b3a3..fdf4fc2c 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -29,6 +29,7 @@ workflow YAML_INPUT { taxid: ( data.taxid ) mito_fasta_path: ( data.mito_fasta_path ) plastid_fasta_path: ( data.plastid_fasta_path ) + nt_db_prefix: ( data.nt_database_prefix ) nt_database: ( data.nt_database ) reference_proteomes: ( data.reference_proteomes ) nt_kraken_db_path: ( data.nt_kraken_db_path ) @@ -36,7 +37,7 @@ workflow YAML_INPUT { dimensionality_reduction_methods: ( data.dimensionality_reduction_methods ) fcs_gx_database_path: ( data.fcs_gx_database_path ) ncbi_taxonomy_path: ( data.ncbi_taxonomy_path ) - ncbi_rankedlineage_path: ( data.ncbi_rankedlineage_path ) + ncbi_rankedlineage_path: ( file(data.ncbi_rankedlineage_path) ) ncbi_accessionids: ( data.ncbi_accessionids_folder ) busco_lineages_folder: ( data.busco_lineages_folder ) seqkit_values: ( data.seqkit ) @@ -121,6 +122,15 @@ workflow YAML_INPUT { } .set{ ch_vecscreen } + group.nt_database + .combine( group.assembly_title ) + .map{ db, meta -> + tuple( [ id: meta ], + db + ) + } + .set{ ch_nt_db } + emit: reference_tuple = ch_reference pacbio_tuple = 
ch_pacbio @@ -130,7 +140,8 @@ workflow YAML_INPUT { assembly_title = group.assembly_title assembly_path = group.assembly_path taxid = group.taxid - nt_database = group.nt_database + nt_database = ch_nt_db + nt_db_prefix = group.nt_db_prefix nt_kraken_db_path = group.nt_kraken_db_path ncbi_accessions = group.ncbi_accessionids ncbi_taxonomy_path = group.ncbi_taxonomy_path diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 3d7a5b9e..8ff9a1fc 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -103,6 +103,8 @@ workflow ASCC { )} .set { modified_input } + modified_input.view{it -> "MODINPUT: $it"} + // // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA // @@ -178,6 +180,10 @@ workflow ASCC { // NOTE: ch_nt_blast needs to be set in two places incase it // fails during the run // + YAML_INPUT.out.nt_database.view{it -> "NT_DB: $it"} + YAML_INPUT.out.ncbi_accessions.view{it -> "ACCESS: $it"} + YAML_INPUT.out.ncbi_rankedlineage_path.view{it -> "LINEAGE: $it"} + ch_nt_blast = [] EXTRACT_NT_BLAST ( modified_input, @@ -187,6 +193,7 @@ workflow ASCC { ) ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) ch_nt_blast = EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} + ch_nt_blast.view{ $it -> "SUPPOSED TO BE: $it"} } else { ch_nt_blast = [] @@ -393,21 +400,39 @@ workflow ASCC { // insert them into the one process to create the btk and the merged report // much like the versions channel +/* GENERATE_GENOME.out.reference_tuple.view{it -> "INPUT GENOME $it"} + GENERATE_GENOME.out.dot_genome.map{it[1]}.view{it -> "GENOME $it"} + ch_kmers.view{it -> "KMER $it"} + ch_tiara.view{it -> "TIARA $it"} + ch_nt_blast.view{it -> "NT $it"} + ch_fcsgx.view{it -> "FSCSCCSCS $it"} + ch_bam.view{it -> "BAM $it"} + ch_coverage.view{it -> "COVERAGE $it"} + ch_kraken1.view{it -> "KRAKEN1 $it"} + ch_kraken2.view{it -> "KRAKEN2 $it"} + ch_kraken3.view{it -> "KRAKEN3 $it"} + nt_hits.view{it -> "HITS $it"} + un_hits.view{it -> "UNHITS $it"} + 
YAML_INPUT.out.ncbi_taxonomy_path.view{it -> "TAXDUMP $it"} */ + + + ch_got_genome = GENERATE_GENOME.out.dot_genome.map{it[1]} + CREATE_BTK_DATASET ( - GENERATE_GENOME.out.reference_tuple, - GENERATE_GENOME.out.dot_genome.map{it[1]}, + GENERATE_GENOME.out.reference_tuple.first(), + ch_got_genome, ch_kmers, - ch_tiara, + ch_tiara.first(), ch_nt_blast, - ch_fcsgx, - ch_bam, - ch_coverage, - ch_kraken1, - ch_kraken2, - ch_kraken3, - nt_hits, - un_hits, - YAML_INPUT.out.ncbi_taxonomy_path, + ch_fcsgx.first(), + ch_bam.first(), + ch_coverage.first(), + ch_kraken1.first(), + ch_kraken2.first(), + ch_kraken3.first(), + nt_hits.first(), + un_hits.first(), + YAML_INPUT.out.ncbi_taxonomy_path.first() ) //ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) @@ -432,10 +457,6 @@ workflow ASCC { } .set { btk_bool } - btk_bool.run_btk.view{ it -> "ABNORMALS == $it" } - - btk_bool.dont_run.view{ it -> "NORMALS == $it" } - ch_versions = ch_versions.mix(AUTOFILTER_AND_CHECK_ASSEMBLY.out.versions) } else { @@ -458,7 +479,7 @@ workflow ASCC { YAML_INPUT.out.reference_tuple, GENERATE_SAMPLESHEET.out.csv, YAML_INPUT.out.diamond_uniprot_database_path, - YAML_INPUT.out.nt_database, + YAML_INPUT.out.nt_database.map{it -> it[1]}, YAML_INPUT.out.diamond_uniprot_database_path, [], YAML_INPUT.out.ncbi_taxonomy_path, From 60e5ae4f2b18dd6428836fbf9acea5b8bc7e1825 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 25 Jun 2024 16:56:07 +0100 Subject: [PATCH 065/117] Fixes that were stopping the pipeline completing --- modules/local/create_btk_dataset.nf | 21 ++++++++++++++++++--- subworkflows/local/extract_nt_blast.nf | 2 +- workflows/ascc.nf | 2 +- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 1037cddc..42045b11 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -51,7 +51,7 @@ process CREATE_BTK_DATASET { script: def prefix = task.ext.prefix ?: 
"${meta.id}" def args = task.ext.args ?: "" -/* def blastn_arg = nt_blast ? "-bh ${nt_blast}" : "" + def blastn_arg = nt_blast ? "-bh ${nt_blast}" : "" def nt_diamond_arg = nt_diamond ? "-nr ${nt_diamond}" : "" def un_diamond_arg = un_diamond ? "-ud ${un_diamond}" : "" def kraken_arg = kraken_lineage ? "-k ${kraken_lineage}": "" @@ -60,12 +60,27 @@ process CREATE_BTK_DATASET { def pca_arg = kmers ? "-p ${kmers}" : "" def fcs_arg = fcsgx ? "-fc ${fcsgx}" : "" def marker_arg = "" - def contigviz_arg = "" */ + def contigviz_arg = "" """ mkdir -p btk_datasets/ - create_btk_dataset_V2.py -h\\ + create_btk_dataset_V2.py \\ + -f ${reference} \\ + -d ./1/ \\ + -n "${prefix}" \\ + -tn "${meta.sci_name}" \\ + -id ${meta.taxid} \\ + -td ${ncbi_taxdump}/ \\ + $blastn_arg \\ + $nt_diamond_arg \\ + $un_diamond_arg \\ + $kraken_arg \\ + $mapped_arg \\ + $tiara_arg \\ + $pca_arg \\ + $fcs_arg \\ + $args\\ cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/extract_nt_blast.nf b/subworkflows/local/extract_nt_blast.nf index 1b48fb69..654548ce 100644 --- a/subworkflows/local/extract_nt_blast.nf +++ b/subworkflows/local/extract_nt_blast.nf @@ -58,7 +58,7 @@ workflow EXTRACT_NT_BLAST { BLAST_CHUNK_TO_FULL ( blast_results ) ch_versions = ch_versions.mix(BLAST_CHUNK_TO_FULL.out.versions) - BLAST_CHUNK_TO_FULL.out.full.view{ $it -> "SUPPOSED TO BE-1: $it"} + BLAST_CHUNK_TO_FULL.out.full.view{ it -> "SUPPOSED TO BE-1: it"} // // MODULE: RE_ORDER THE DATA IN THE FULL_COORDINATE FILE diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 8ff9a1fc..775d9789 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -193,7 +193,7 @@ workflow ASCC { ) ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) ch_nt_blast = EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} - ch_nt_blast.view{ $it -> "SUPPOSED TO BE: $it"} + ch_nt_blast.view{ it -> "SUPPOSED TO BE: it"} } else { ch_nt_blast = [] From 8e76bc16b98890f8a792b23cf2c2d2a9e07bdfb3 Mon Sep 17 00:00:00 2001 From: DLBPointon 
Date: Thu, 27 Jun 2024 13:30:25 +0100 Subject: [PATCH 066/117] Update for ea --- conf/base.config | 2 +- modules/local/sanger_tol_btk.nf | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 4f84e8a8..01fb4653 100644 --- a/conf/base.config +++ b/conf/base.config @@ -23,7 +23,7 @@ process { withName: DIAMOND_BLASTX { cpus = { check_max( 12 * task.attempt, 'cpus' ) } memory = { check_max( 50.GB * task.attempt, 'memory' ) } - time = { check_max( 12.h * task.attempt, 'time' ) } + time = { check_max( 20.h * task.attempt, 'time' ) } } diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index e83ad6a2..eae98ca7 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -17,6 +17,7 @@ process SANGER_TOL_BTK { output: // path("${meta.id}_btk_out/plots"), emit: btk_plots path("${meta.id}_btk_out/busco"), emit: btk_busco + path("${meta.id}_btk_out/blobtoolkit"), emit: btk_dataset path("${meta.id}_btk_out/multiqc"), emit: btk_multiqc path("blobtoolkit_pipeline_info"), emit: btk_pipeline From 1fadcc574514e3b680f7bf62310339017ad641d2 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 27 Jun 2024 14:49:17 +0100 Subject: [PATCH 067/117] Update to use the sorted bam file for generate_samplesheet --- bin/generate_samplesheet.py | 2 +- workflows/ascc.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/generate_samplesheet.py b/bin/generate_samplesheet.py index 333eeacf..84d11d41 100755 --- a/bin/generate_samplesheet.py +++ b/bin/generate_samplesheet.py @@ -25,7 +25,7 @@ def main(): data_list.append("sample,datatype,datafile\n") for file in os.listdir(args.pacbio_path): - if file.endswith(".fasta.gz"): + if file.endswith(".bam"): data_list.append(f"{args.sample_name},pacbio,{args.pacbio_path}/{file}\n") with open("samplesheet.csv", "w") as file: diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 775d9789..2e7603f2 100644 --- a/workflows/ascc.nf 
+++ b/workflows/ascc.nf @@ -471,7 +471,7 @@ workflow ASCC { if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || workflow_steps.contains('ALL') ) { GENERATE_SAMPLESHEET ( - YAML_INPUT.out.pacbio_tuple.collect() + bam_ch ) //ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) From b6519665a76e28c87c0dd2ab2367eeb115a8faa4 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 27 Jun 2024 14:50:43 +0100 Subject: [PATCH 068/117] Update to use the sorted bam file for generate_samplesheet --- workflows/ascc.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 2e7603f2..ffd1d359 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -471,7 +471,7 @@ workflow ASCC { if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || workflow_steps.contains('ALL') ) { GENERATE_SAMPLESHEET ( - bam_ch + ch_bam ) //ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) From 5f6d38455a1a0dc0252e25cd55d1bcf37b19e459 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 27 Jun 2024 14:52:07 +0100 Subject: [PATCH 069/117] Update to use the sorted bam file for generate_samplesheet --- workflows/ascc.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index ffd1d359..54357fbf 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -314,7 +314,7 @@ workflow ASCC { // // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE // - if ( workflow_steps.contains('coverage') || workflow_steps.contains('ALL') ) { + if ( workflow_steps.contains('coverage') || workflow_steps.contains('busco_btk') || workflow_steps.contains('ALL') ) { RUN_READ_COVERAGE ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.assembly_path, From 94251bbc7166956779382d811a601db3ca9765e3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 27 Jun 2024 15:04:49 +0100 Subject: 
[PATCH 070/117] Input Channel lacked meta and so failed the input check --- workflows/ascc.nf | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 54357fbf..352a4908 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -470,8 +470,17 @@ workflow ASCC { // This will also eventually check for the above run_btk boolean from autofilter if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || workflow_steps.contains('ALL') ) { + ch_bam + .combine(YAML_INPUT.out.reference_tuple) + .map{ bam, meta, ref -> + tuple( [ id: meta.id ] + bam + ) + } + .set { new_bam } + GENERATE_SAMPLESHEET ( - ch_bam + new_bam ) //ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) From 0223b94d5e4f886d0aad1a149e5ce4da832549c2 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 27 Jun 2024 15:23:56 +0100 Subject: [PATCH 071/117] Updates to channel paths and channels for modules --- bin/generate_samplesheet.py | 8 +++++--- modules/local/sanger_tol_btk.nf | 3 ++- workflows/ascc.nf | 18 ++++++++---------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/bin/generate_samplesheet.py b/bin/generate_samplesheet.py index 84d11d41..4691944e 100755 --- a/bin/generate_samplesheet.py +++ b/bin/generate_samplesheet.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os +import sys import argparse """ @@ -24,9 +25,10 @@ def main(): data_list = [] data_list.append("sample,datatype,datafile\n") - for file in os.listdir(args.pacbio_path): - if file.endswith(".bam"): - data_list.append(f"{args.sample_name},pacbio,{args.pacbio_path}/{file}\n") + if args.pacbio_path.endswith(".bam"): + data_list.append(f"{args.sample_name},pacbio,{args.pacbio_path}\n") + else: + sys.exit("I was expecting a mapped BAM file") with open("samplesheet.csv", "w") as file: file.write("".join(data_list)) diff --git a/modules/local/sanger_tol_btk.nf 
b/modules/local/sanger_tol_btk.nf index eae98ca7..03a9fe48 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -4,7 +4,8 @@ process SANGER_TOL_BTK { input: tuple val(meta), path(reference, stageAs: "REFERENCE.fa") - tuple val(meta1), path(samplesheet_csv, stageAs: "SAMPLESHEET.csv") + tuple val(meta1), path(bam) + tuple val(meta2), path(samplesheet_csv, stageAs: "SAMPLESHEET.csv") path blastp, stageAs: "blastp.dmnd" path blastn path blastx diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 352a4908..20283411 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -103,8 +103,6 @@ workflow ASCC { )} .set { modified_input } - modified_input.view{it -> "MODINPUT: $it"} - // // MODULE: CALCULATE GC CONTENT PER SCAFFOLD IN INPUT FASTA // @@ -180,9 +178,6 @@ workflow ASCC { // NOTE: ch_nt_blast needs to be set in two places incase it // fails during the run // - YAML_INPUT.out.nt_database.view{it -> "NT_DB: $it"} - YAML_INPUT.out.ncbi_accessions.view{it -> "ACCESS: $it"} - YAML_INPUT.out.ncbi_rankedlineage_path.view{it -> "LINEAGE: $it"} ch_nt_blast = [] EXTRACT_NT_BLAST ( @@ -193,7 +188,6 @@ workflow ASCC { ) ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) ch_nt_blast = EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} - ch_nt_blast.view{ it -> "SUPPOSED TO BE: it"} } else { ch_nt_blast = [] @@ -470,10 +464,13 @@ workflow ASCC { // This will also eventually check for the above run_btk boolean from autofilter if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || workflow_steps.contains('ALL') ) { - ch_bam - .combine(YAML_INPUT.out.reference_tuple) - .map{ bam, meta, ref -> - tuple( [ id: meta.id ] + YAML_INPUT.out.reference_tuple.view() + ch_bam.view() + + YAML_INPUT.out.reference_tuple + .combine(ch_bam) + .map{ meta, ref, bam -> + tuple( [ id: meta.id ], bam ) } @@ -486,6 +483,7 @@ workflow ASCC { SANGER_TOL_BTK ( YAML_INPUT.out.reference_tuple, + 
new_bam, GENERATE_SAMPLESHEET.out.csv, YAML_INPUT.out.diamond_uniprot_database_path, YAML_INPUT.out.nt_database.map{it -> it[1]}, From 0a4fd2c72520cbfd7a410d04bc32b7ef5dfa8d49 Mon Sep 17 00:00:00 2001 From: Eerik Aunin Date: Wed, 3 Jul 2024 11:15:56 +0100 Subject: [PATCH 072/117] Update based on work of ea12 --- bin/merge_btk_datasets_V2.py | 64 ++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/bin/merge_btk_datasets_V2.py b/bin/merge_btk_datasets_V2.py index d9317c02..fcc6c9cb 100755 --- a/bin/merge_btk_datasets_V2.py +++ b/bin/merge_btk_datasets_V2.py @@ -21,6 +21,7 @@ import argparse import textwrap import general_purpose_functions as gpf +import gzip def parse_args(argv=None): @@ -60,13 +61,19 @@ def load_json(filename): """ Loads a JSON file and returns it as a dictionary """ - with open(filename) as f: - return json.load(f) + json_contents = None + if filename.endswith(".gz"): + with gzip.open(filename, "rt", encoding="UTF-8") as zipfile: + json_contents = json.load(zipfile) + else: + with open(filename) as f: + json_contents = json.load(f) + return json_contents -def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined_dataset_folder): +def create_meta_json_contents(main_btk_dataset_folder, btk_busco_dataset_folder): """ - Creates a meta.json file for the new BTK dataset by combining the two meta.json files from the input directories + Creates the contents for the meta.json file for the new BTK dataset by combining the two meta.json files from the input directories """ for folder in (main_btk_dataset_folder, btk_busco_dataset_folder): if os.path.isdir(folder) is False: @@ -76,7 +83,7 @@ def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined sys.exit(0) main_btk_json_path = f"{main_btk_dataset_folder}/meta.json" - btk_busco_json_path = f"{btk_busco_dataset_folder}/meta.json" + btk_busco_json_path = f"{btk_busco_dataset_folder}/meta.json.gz" for json_path in 
(main_btk_json_path, btk_busco_json_path): if os.path.isfile(json_path) is False: sys.stderr.write(f"File {json_path} not found)\n") @@ -100,10 +107,24 @@ def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined else: if field_id not in keys_to_skip: merged_dict["fields"].append(field) + return merged_dict - meta_json_outpath = f"{combined_dataset_folder}/meta.json" - with open(meta_json_outpath, "w") as json_outfile: - json.dump(merged_dict, json_outfile, indent=1, sort_keys=True) + +def detect_buscogenes_variables(merged_jsons_dict): + """ + Goes through the content of merged meta.json file (derived from both BTK datasets) and detects if buscogenes + variables are present + """ + buscogenes_present_flag = False + fields = merged_jsons_dict["fields"] + for field in fields: + field_id = field["id"] + if field_id == "taxonomy": + for item in field["children"]: + if item["id"] == "buscogenes": + buscogenes_present_flag = True + break + return buscogenes_present_flag def main(args): @@ -117,7 +138,13 @@ def main(args): ) sys.exit(0) - not_copying_list = ["identifiers.json", "gc_data.json", "length_data.json", "ncount_data.json", "meta.json"] + not_copying_list = [ + "identifiers.json.gz", + "gc_data.json.gz", + "length_data.json.gz", + "ncount_data.json.gz", + "meta.json.gz", + ] Path(args.new_output_directory).mkdir(parents=True, exist_ok=True) @@ -127,7 +154,7 @@ def main(args): main_btk_dataset_files = [f for f in main_btk_dataset_files if f not in not_copying_list] for main_btk_dataset_file in main_btk_dataset_files: main_btk_dataset_file_full_path = f"{args.main_btk_datasets}/{main_btk_dataset_file}" - copied_file_full_path = f"{args.new_output_directory}/{main_btk_dataset_file}" + copied_file_full_path = os.path.abspath(f"{args.new_output_directory}/{main_btk_dataset_file}") shutil.copy(main_btk_dataset_file_full_path, copied_file_full_path) btk_busco_files = [ @@ -135,13 +162,24 @@ def main(args): ] for btk_busco_file in btk_busco_files: 
btk_busco_file_full_path = f"{args.btk_busco_datasets}/{btk_busco_file}" - copied_file_full_path = f"{args.new_output_directory}/{btk_busco_file}" + copied_file_full_path = os.path.abspath(f"{args.new_output_directory}/{btk_busco_file}") shutil.copy(btk_busco_file_full_path, copied_file_full_path) - create_meta_json(args.main_btk_datasets, args.btk_busco_datasets, args.new_output_directory) + merged_jsons_dict = create_meta_json_contents(args.main_btk_datasets, args.btk_busco_datasets) + meta_json_outpath = f"{args.new_output_directory}/meta.json" + + with open(meta_json_outpath, "w") as json_outfile: + json.dump(merged_jsons_dict, json_outfile, indent=1, sort_keys=True) + + buscogenes_present_flag = detect_buscogenes_variables(merged_jsons_dict) btk_busco_table_outpath = f"{args.new_output_directory}/btk_busco_summary_table_full.tsv" - btk_busco_table_exporting_command = f"blobtools filter --table {btk_busco_table_outpath} --table-fields identifiers,buscogenes_superkingdom,buscogenes_kingdom,buscogenes_phylum,buscogenes_class,buscogenes_order,buscogenes_family,buscogenes_genus,buscogenes_species,buscoregions_superkingdom,buscoregions_kingdom,buscoregions_phylum,buscoregions_class,buscoregions_order,buscoregions_family,buscoregions_genus,buscoregions_species {args.main_btk_datasets}" + + btk_busco_table_exporting_command = f"blobtools filter --table {btk_busco_table_outpath} --table-fields identifiers,buscoregions_superkingdom,buscoregions_kingdom,buscoregions_phylum,buscoregions_class,buscoregions_order,buscoregions_family,buscoregions_genus,buscoregions_species" + if buscogenes_present_flag == True: + btk_busco_table_exporting_command += ",buscogenes_superkingdom,buscogenes_kingdom,buscogenes_phylum,buscogenes_class,buscogenes_order,buscogenes_family,buscogenes_genus,buscogenes_species" + btk_busco_table_exporting_command += f" {args.new_output_directory}" + gpf.run_system_command(btk_busco_table_exporting_command) From 16bad9a103a8769f8d54d0812beb6e2c0dad4758 
Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 3 Jul 2024 11:19:22 +0100 Subject: [PATCH 073/117] Update to command --- modules/local/merge_btk_datasets.nf | 2 -- modules/local/sanger_tol_btk.nf | 21 +++++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index 0592fc29..1f772b93 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -10,7 +10,6 @@ process MERGE_BTK_DATASETS { input: tuple val(meta), path(create_btk_datasets) tuple val(meta2), path(busco_btk_datasets) - tuple val(meta3), path(busco_summary_file) output: tuple val(meta), path("merged_datasets"), emit: merged_datasets @@ -29,7 +28,6 @@ process MERGE_BTK_DATASETS { -m $create_btk_datasets \\ -o ./merged_datasets \\ -b $busco_btk_datasets \\ - -s $busco_summary_file \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 03a9fe48..2004bc8d 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -4,7 +4,7 @@ process SANGER_TOL_BTK { input: tuple val(meta), path(reference, stageAs: "REFERENCE.fa") - tuple val(meta1), path(bam) + tuple val(meta1), path(bam) // Name needs to remain the same as previous process as they are referenced in the samplesheet tuple val(meta2), path(samplesheet_csv, stageAs: "SAMPLESHEET.csv") path blastp, stageAs: "blastp.dmnd" path blastn @@ -16,11 +16,12 @@ process SANGER_TOL_BTK { val gca_accession output: - // path("${meta.id}_btk_out/plots"), emit: btk_plots - path("${meta.id}_btk_out/busco"), emit: btk_busco - path("${meta.id}_btk_out/blobtoolkit"), emit: btk_dataset - path("${meta.id}_btk_out/multiqc"), emit: btk_multiqc - path("blobtoolkit_pipeline_info"), emit: btk_pipeline + path("${meta.id}_btk_out/blobtoolkit/plots"), emit: btk_plots + path("${meta.id}_btk_out/blobtoolkit/*"), emit: btk_dataset + 
path("${meta.id}_btk_out/blobtoolkit/*/summary.json.gz"), emit: btk_summary + path("${meta.id}_btk_out/busco"), emit: btk_busco + path("${meta.id}_btk_out/multiqc"), emit: btk_multiqc + path("blobtoolkit_pipeline_info"), emit: btk_pipeline script: def prefix = task.ext.prefix ?: "${meta.id}" @@ -71,11 +72,11 @@ process SANGER_TOL_BTK { def pipeline_version = task.ext.version ?: "main" """ - mkdir -p blobtoolkit/$gca_accession - touch blobtoolkit/$gca_accession/test.json.gz + mkdir -p ${prefix}_btk_out/blobtoolkit/$gca_accession + touch ${prefix}_btk_out/blobtoolkit/$gca_accession/test.json.gz - mkdir ${prefix}_btk_out/plots - touch ${prefix}_btk_out/plots/test.png + mkdir ${prefix}_btk_out/blobtoolkit/plots + touch ${prefix}_btk_out/blobtoolkit/plots/test.png mkdir ${prefix}_btk_out/busco touch ${prefix}_btk_out/busco/test.batch_summary.txt From d1a571abe71c740fe632d735fb6ef18317eabc29 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 3 Jul 2024 11:23:43 +0100 Subject: [PATCH 074/117] Linting fix --- assets/btk_draft.yaml | 2 +- docs/usage.md | 1 - workflows/ascc.nf | 7 +++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/assets/btk_draft.yaml b/assets/btk_draft.yaml index 34b1b3f0..0e023513 100644 --- a/assets/btk_draft.yaml +++ b/assets/btk_draft.yaml @@ -14,4 +14,4 @@ taxon: order: order_name phylum: phylum_name superkingdom: superkingdom_name - taxid: 0 \ No newline at end of file + taxid: 0 diff --git a/docs/usage.md b/docs/usage.md index bd4c33b4..341980d8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,7 +8,6 @@ ## Yaml input - ### Full yaml ```yaml diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 20283411..2bdcc9db 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -497,11 +497,10 @@ workflow ASCC { //ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) -/* MERGE_BTK_DATASETS ( + MERGE_BTK_DATASETS ( CREATE_BTK_DATASET.out.btk_datasets, - [[],[]], //SANGER_TOL_BTK.out.btk_datasets = [] - [[],[]] 
//SANGER_TOL_BTK.out.summary = [] - ) */ + SANGER_TOL_BTK.out.btk_dataset + ) //ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) } From c96ba688c08963f971b28f58a0273dee036f95c3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 3 Jul 2024 13:24:12 +0100 Subject: [PATCH 075/117] Updates - fixed errpr in test.yaml --- assets/github_testing/test.yaml | 5 +++-- modules/local/create_btk_dataset.nf | 22 +--------------------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index fde2e2e8..6e548a92 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -24,6 +24,7 @@ diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1 diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/ seqkit: - sliding: 6000 - window: 100000 + sliding: 100000 + window: 6000 n_neighbours: 13 +btk_yaml: /home/runner/work/ascc/ascc/assets/btk_draft.yaml diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 42045b11..4a8a055d 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -23,24 +23,6 @@ process CREATE_BTK_DATASET { path un_diamond, stageAs: "UNIPROT_DIAMOND_FULL.tsv" path ncbi_taxdump, stageAs: "TAXDUMP" - - /* - -f ${reference} \\ - -d ./1/ \\ - -n "${prefix}" \\ - -tn "${meta.sci_name}" \\ - -id ${meta.taxid} \\ - -td ${ncbi_taxdump}/ \\ - $blastn_arg \\ - $nt_diamond_arg \\ - $un_diamond_arg \\ - $kraken_arg \\ - $mapped_arg \\ - $tiara_arg \\ - $pca_arg \\ - $fcs_arg \\ - $args */ - output: tuple val(meta), path("btk_datasets"), emit: btk_datasets tuple val(meta), path("btk_summary_table_full.tsv"), emit: create_summary @@ -59,8 +41,6 @@ process CREATE_BTK_DATASET { def tiara_arg = tiara ? "-t ${tiara}" : "" def pca_arg = kmers ? 
"-p ${kmers}" : "" def fcs_arg = fcsgx ? "-fc ${fcsgx}" : "" - def marker_arg = "" - def contigviz_arg = "" """ mkdir -p btk_datasets/ @@ -80,7 +60,7 @@ process CREATE_BTK_DATASET { $tiara_arg \\ $pca_arg \\ $fcs_arg \\ - $args\\ + $args cat <<-END_VERSIONS > versions.yml From 20c47019717dc41f4c358394454abd44d6e92840 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Jul 2024 16:59:02 +0100 Subject: [PATCH 076/117] Updates for testing --- .github/workflows/ci.yml | 11 ++++++++++- assets/github_testing/test.yaml | 4 ++-- assets/test.yaml | 8 ++++---- modules/local/sanger_tol_btk.nf | 12 ++++++------ subworkflows/local/extract_nt_blast.nf | 2 -- subworkflows/local/run_read_coverage.nf | 4 +++- subworkflows/local/yaml_input.nf | 4 ++-- workflows/ascc.nf | 23 ----------------------- 8 files changed, 27 insertions(+), 41 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2bc5487b..c1463406 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,6 +72,14 @@ jobs: run: | curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest_V2.tar.gz | tar xzf - + - name: Temporary ASCC Diamond Data + run: | + curl http://dp24.cog.sanger.ac.uk/ascc/diamond.dmnd + + - name: Temporary BLASTN Data + run: | + curl http://dp24.cog.sanger.ac.uk/blastn.tar.gz | tar xzf - + - name: Download the NCBI taxdump database run: | mkdir ncbi_taxdump @@ -120,10 +128,11 @@ jobs: run: | mkdir vecscreen curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf - + ls -lh - name: Singularity - Run FULL pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,singularity --outdir ./results --steps ALL + nextflow run ./sanger-treeval/${{ steps.branch-names.outputs.current_branch 
}}/main.nf -profile test,singularity --outdir ./results --steps ALL diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index 6e548a92..0fdf8031 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -12,8 +12,8 @@ kmer_len: 7 dimensionality_reduction_methods: "pca,random_trees" # all available methods # "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" -nt_database: /home/runner/work/ascc/ascc/NT_database/ -nt_database_prefix: 18S_fungal_sequences +nt_database: /home/runner/work/ascc/ascc/blastn/ +nt_database_prefix: tiny_plasmodium_blastdb.fa nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2 ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/ ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/ diff --git a/assets/test.yaml b/assets/test.yaml index e80d4fb3..3f6c1f4b 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -12,16 +12,16 @@ kmer_len: 7 dimensionality_reduction_methods: "pca,random_trees" # all available methods # "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" -nt_database: /data/blastdb/Supported/NT/current -nt_database_prefix: nt +nt_database: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_blast_tiny_testdb/blastdb/ +nt_database_prefix: tiny_plasmodium_blastdb.fa nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/ ncbi_taxonomy_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump ncbi_rankedlineage_path: 
/lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages -fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb +fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb/ vecscreen_database_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/vecscreen/ -diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd +diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd seqkit: sliding: 100000 diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 2004bc8d..4ce728de 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -16,12 +16,12 @@ process SANGER_TOL_BTK { val gca_accession output: - path("${meta.id}_btk_out/blobtoolkit/plots"), emit: btk_plots - path("${meta.id}_btk_out/blobtoolkit/*"), emit: btk_dataset - path("${meta.id}_btk_out/blobtoolkit/*/summary.json.gz"), emit: btk_summary - path("${meta.id}_btk_out/busco"), emit: btk_busco - path("${meta.id}_btk_out/multiqc"), emit: btk_multiqc - path("blobtoolkit_pipeline_info"), emit: btk_pipeline + path("${meta.id}_btk_out/blobtoolkit/plots"), emit: btk_plots + path("${meta.id}_btk_out/blobtoolkit/draft"), emit: btk_dataset + path("${meta.id}_btk_out/blobtoolkit/draft/summary.json.gz"), emit: btk_summary + path("${meta.id}_btk_out/busco"), emit: btk_busco + path("${meta.id}_btk_out/multiqc"), emit: btk_multiqc + path("blobtoolkit_pipeline_info"), emit: btk_pipeline script: def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/subworkflows/local/extract_nt_blast.nf b/subworkflows/local/extract_nt_blast.nf index 
654548ce..ccbf0be4 100644 --- a/subworkflows/local/extract_nt_blast.nf +++ b/subworkflows/local/extract_nt_blast.nf @@ -58,8 +58,6 @@ workflow EXTRACT_NT_BLAST { BLAST_CHUNK_TO_FULL ( blast_results ) ch_versions = ch_versions.mix(BLAST_CHUNK_TO_FULL.out.versions) - BLAST_CHUNK_TO_FULL.out.full.view{ it -> "SUPPOSED TO BE-1: it"} - // // MODULE: RE_ORDER THE DATA IN THE FULL_COORDINATE FILE // diff --git a/subworkflows/local/run_read_coverage.nf b/subworkflows/local/run_read_coverage.nf index f29aea6b..49ada33f 100644 --- a/subworkflows/local/run_read_coverage.nf +++ b/subworkflows/local/run_read_coverage.nf @@ -4,7 +4,7 @@ include { SAMTOOLS_MERGE } from '../../modules/nf include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_DEPTH } from '../../modules/nf-core/samtools/depth/main' -include { SAMTOOLS_DEPTH_AVERAGE_COVERAGE } from '../../modules/local/samtools_depth_average_coverage' +include { SAMTOOLS_DEPTH_AVERAGE_COVERAGE } from '../../modules/local/samtools_depth_average_coverage' workflow RUN_READ_COVERAGE { @@ -30,6 +30,7 @@ workflow RUN_READ_COVERAGE { platform ) ch_versions = ch_versions.mix(SE_MAPPING.out.versions) + ch_align_bam .mix( SE_MAPPING.out.mapped_bam ) .set { merged_bam } @@ -43,6 +44,7 @@ workflow RUN_READ_COVERAGE { platform ) ch_versions = ch_versions.mix(PE_MAPPING.out.versions) + ch_align_bam .mix( PE_MAPPING.out.mapped_bam ) .set { merged_bam } diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index fdf4fc2c..db5f2538 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -53,8 +53,8 @@ workflow YAML_INPUT { .seqkit_values .flatten() .multiMap { data -> - sliding_value : ( data.sliding ) - window_value : ( data.window ) + sliding_value : ( data.sliding ) + window_value : ( data.window ) } .set { seqkit } diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 
2bdcc9db..c4a93e56 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -390,26 +390,6 @@ workflow ASCC { un_full = [] } - // mix the outputs of the outputting process so that we can - // insert them into the one process to create the btk and the merged report - // much like the versions channel - -/* GENERATE_GENOME.out.reference_tuple.view{it -> "INPUT GENOME $it"} - GENERATE_GENOME.out.dot_genome.map{it[1]}.view{it -> "GENOME $it"} - ch_kmers.view{it -> "KMER $it"} - ch_tiara.view{it -> "TIARA $it"} - ch_nt_blast.view{it -> "NT $it"} - ch_fcsgx.view{it -> "FSCSCCSCS $it"} - ch_bam.view{it -> "BAM $it"} - ch_coverage.view{it -> "COVERAGE $it"} - ch_kraken1.view{it -> "KRAKEN1 $it"} - ch_kraken2.view{it -> "KRAKEN2 $it"} - ch_kraken3.view{it -> "KRAKEN3 $it"} - nt_hits.view{it -> "HITS $it"} - un_hits.view{it -> "UNHITS $it"} - YAML_INPUT.out.ncbi_taxonomy_path.view{it -> "TAXDUMP $it"} */ - - ch_got_genome = GENERATE_GENOME.out.dot_genome.map{it[1]} CREATE_BTK_DATASET ( @@ -464,9 +444,6 @@ workflow ASCC { // This will also eventually check for the above run_btk boolean from autofilter if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || workflow_steps.contains('ALL') ) { - YAML_INPUT.out.reference_tuple.view() - ch_bam.view() - YAML_INPUT.out.reference_tuple .combine(ch_bam) .map{ meta, ref, bam -> From b8ce7879e8ff7ebc4f06ebd516b257aff35b6d66 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Jul 2024 17:11:21 +0100 Subject: [PATCH 077/117] Updates for testing --- .github/workflows/ci.yml | 2 +- assets/github_testing/test.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c1463406..bee9ed54 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,7 +74,7 @@ jobs: - name: Temporary ASCC Diamond Data run: | - curl http://dp24.cog.sanger.ac.uk/ascc/diamond.dmnd + curl 
http://dp24.cog.sanger.ac.uk/ascc/diamond.dmnd -o diamond.dmnd - name: Temporary BLASTN Data run: | diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index 0fdf8031..b04f66f9 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -20,8 +20,8 @@ ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/ ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/ -diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd -diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd +diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond.dmnd +diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond.dmnd vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/ seqkit: sliding: 100000 From d21fd7c458f7a678e1618dbb3ab0c6100963299a Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Jul 2024 22:47:20 +0100 Subject: [PATCH 078/117] Adding https rather than http --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bee9ed54..0966a95a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,11 +74,11 @@ jobs: - name: Temporary ASCC Diamond Data run: | - curl http://dp24.cog.sanger.ac.uk/ascc/diamond.dmnd -o diamond.dmnd + curl https://dp24.cog.sanger.ac.uk/ascc/diamond.dmnd -o diamond.dmnd - name: Temporary BLASTN Data run: | - curl http://dp24.cog.sanger.ac.uk/blastn.tar.gz | tar xzf - + curl https://dp24.cog.sanger.ac.uk/blastn.tar.gz | tar xzf - - name: Download the NCBI taxdump database run: | From fb9cb828380504f1e320db02b4a44963836db3bc Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Jul 2024 22:51:05 +0100 Subject: [PATCH 079/117] 
add file exist arg to linting --- .nf-core.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.nf-core.yml b/.nf-core.yml index d1422793..9872485b 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,6 +1,7 @@ repository_type: pipeline lint: - files_exist: false + files_exist: + - assets/multiqc_config.yml files_unchanged: - CODE_OF_CONDUCT.md - assets/nf-core-ascc_logo_light.png From 32ad0b3fb65c652ff114494a0fcfa8789bbaf29a Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Jul 2024 22:54:43 +0100 Subject: [PATCH 080/117] add file exist arg to linting --- .nf-core.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.nf-core.yml b/.nf-core.yml index 9872485b..0a309c0e 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,7 +1,6 @@ repository_type: pipeline lint: - files_exist: - - assets/multiqc_config.yml + files_exist: false files_unchanged: - CODE_OF_CONDUCT.md - assets/nf-core-ascc_logo_light.png @@ -20,5 +19,3 @@ lint: nextflow_config: - manifest.name - manifest.homePage - multiqc_config: - - report_comment From 09bfcb4104de8b6651b7e5815abbfcbe7a8d54f6 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Jul 2024 22:59:25 +0100 Subject: [PATCH 081/117] add file exist arg to linting --- .nf-core.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.nf-core.yml b/.nf-core.yml index 0a309c0e..48d56ffa 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -19,3 +19,4 @@ lint: nextflow_config: - manifest.name - manifest.homePage + multiqc_config: False \ No newline at end of file From 8e111e1ba5b2015fa4215755584e4f9aa3b9425c Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 5 Jul 2024 23:01:09 +0100 Subject: [PATCH 082/117] add file exist arg to linting --- .nf-core.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.nf-core.yml b/.nf-core.yml index 48d56ffa..5a60a58e 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -19,4 +19,4 @@ lint: nextflow_config: - manifest.name - manifest.homePage - multiqc_config: 
False \ No newline at end of file + multiqc_config: False From c220c3c4b199d80c1f39890ba3e3b2c938e02b66 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 8 Jul 2024 09:45:18 +0100 Subject: [PATCH 083/117] Correct treeval to ascc --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0966a95a..6bf0ec1b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,4 +135,4 @@ jobs: # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ./sanger-treeval/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --steps ALL + nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --steps ALL From e4f4116d2b739244bcb5a33279aa94b54aa563f4 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 9 Jul 2024 10:36:37 +0100 Subject: [PATCH 084/117] correction --- assets/github_testing/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index b04f66f9..5344f343 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -12,7 +12,7 @@ kmer_len: 7 dimensionality_reduction_methods: "pca,random_trees" # all available methods # "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" -nt_database: /home/runner/work/ascc/ascc/blastn/ +nt_database: /home/runner/work/ascc/ascc/blastdb/ nt_database_prefix: tiny_plasmodium_blastdb.fa nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2 ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/ From 
fd55686b2f50a73b537fda6389e6400f7454fba3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 10 Jul 2024 12:55:58 +0100 Subject: [PATCH 085/117] Update workflow --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6bf0ec1b..5f6ee2c1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -80,6 +80,10 @@ jobs: run: | curl https://dp24.cog.sanger.ac.uk/blastn.tar.gz | tar xzf - + - name: Temporary ASCC Diamond Data + run: | + curl https://dp24.cog.sanger.ac.uk/ascc/accession2taxid.tar.gz | tar -C accession2taxid -xzf - + - name: Download the NCBI taxdump database run: | mkdir ncbi_taxdump From f32be17ed0ccb54807b2961679938a6d7b0fb104 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 10 Jul 2024 12:56:23 +0100 Subject: [PATCH 086/117] Update test.yaml --- assets/github_testing/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index 5344f343..a37718a1 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -15,7 +15,7 @@ dimensionality_reduction_methods: "pca,random_trees" nt_database: /home/runner/work/ascc/ascc/blastdb/ nt_database_prefix: tiny_plasmodium_blastdb.fa nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2 -ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/ +ncbi_accessionids_folder: /home/runner/work/ascc/ascc/accession2taxid/ ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/ ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages From 37c8847673fbad7ddbf57ab9b92267297216a6d9 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 10 Jul 2024 13:19:33 +0100 Subject: [PATCH 087/117] Updates to Workflows --- .github/workflows/ci.yml | 4 
++-- assets/github_testing/test.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f6ee2c1..5505fcc2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -80,9 +80,9 @@ jobs: run: | curl https://dp24.cog.sanger.ac.uk/blastn.tar.gz | tar xzf - - - name: Temporary ASCC Diamond Data + - name: Temporary Accession2TaxID Data run: | - curl https://dp24.cog.sanger.ac.uk/ascc/accession2taxid.tar.gz | tar -C accession2taxid -xzf - + curl https://dp24.cog.sanger.ac.uk/ascc/accession2taxid.tar.gz | tar -xzf - - name: Download the NCBI taxdump database run: | diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index a37718a1..9c258fcb 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -15,7 +15,7 @@ dimensionality_reduction_methods: "pca,random_trees" nt_database: /home/runner/work/ascc/ascc/blastdb/ nt_database_prefix: tiny_plasmodium_blastdb.fa nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2 -ncbi_accessionids_folder: /home/runner/work/ascc/ascc/accession2taxid/ +ncbi_accessionids_folder: /home/runner/work/ascc/ascc/20240709_tiny_accession2taxid/ ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/ ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages From 7e6eae70ea9412dbffdbac0d2a674cc19bb67c05 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 12 Jul 2024 14:46:23 +0100 Subject: [PATCH 088/117] Updates, changes to flags and ci to allow for turning off btk --- .github/workflows/ci.yml | 2 +- assets/test.yaml | 4 +-- workflows/ascc.nf | 53 +++++++++++++--------------------------- 3 files changed, 20 insertions(+), 39 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5505fcc2..86bc8d40 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml 
@@ -139,4 +139,4 @@ jobs: # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --steps ALL + nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude btk diff --git a/assets/test.yaml b/assets/test.yaml index 3f6c1f4b..850c2c2e 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -14,8 +14,8 @@ dimensionality_reduction_methods: "pca,random_trees" # "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" nt_database: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_blast_tiny_testdb/blastdb/ nt_database_prefix: tiny_plasmodium_blastdb.fa -nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt -ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/ +nt_kraken_db_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/kraken2/kraken2/ +ncbi_accessionids_folder: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/20240709_tiny_accession2taxid/ ncbi_taxonomy_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages diff --git a/workflows/ascc.nf b/workflows/ascc.nf index c4a93e56..4f348079 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -76,7 +76,8 @@ workflow ASCC { ch_versions = Channel.empty() ch_out_merge = Channel.empty() - workflow_steps = params.steps.split(",") + include_workflow_steps = params.include ? 
params.include.split(",") : "" + exclude_workflow_steps = params.exclude ? params.exclude.split(",") : "" input_ch = Channel.fromPath(params.input, checkIfExists: true) @@ -132,7 +133,7 @@ workflow ASCC { // SUBWORKFLOW: COUNT KMERS, THEN REDUCE DIMENSIONS USING SELECTED METHODS // - if ( workflow_steps.contains('kmers') || workflow_steps.contains('ALL')) { + if ( include_workflow_steps.contains('kmers') || include_workflow_steps.contains('ALL')) { GENERATE_GENOME.out.reference_tuple .map { meta, file -> @@ -160,7 +161,7 @@ workflow ASCC { // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM TIARA // - if ( workflow_steps.contains('tiara') || workflow_steps.contains('ALL')) { + if ( include_workflow_steps.contains('tiara') || include_workflow_steps.contains('ALL')) { EXTRACT_TIARA_HITS ( GENERATE_GENOME.out.reference_tuple ) @@ -173,7 +174,7 @@ workflow ASCC { // // SUBWORKFLOW: EXTRACT RESULTS HITS FROM NT-BLAST // - if ( workflow_steps.contains('nt_blast') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('nt_blast') || include_workflow_steps.contains('ALL') ) { // // NOTE: ch_nt_blast needs to be set in two places incase it // fails during the run @@ -193,7 +194,7 @@ workflow ASCC { ch_nt_blast = [] } - if ( workflow_steps.contains('mito') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('mito') || include_workflow_steps.contains('ALL') ) { // // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH // @@ -219,7 +220,7 @@ workflow ASCC { ch_mito = [] } - if ( workflow_steps.contains('chloro') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('chloro') || include_workflow_steps.contains('ALL') ) { // // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH @@ -248,7 +249,7 @@ workflow ASCC { // // SUBWORKFLOW: // - if ( workflow_steps.contains('fcs_adapt') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('fcs_adapt') || include_workflow_steps.contains('ALL') ) { 
RUN_FCSADAPTOR ( YAML_INPUT.out.reference_tuple ) @@ -266,7 +267,7 @@ workflow ASCC { // // SUBWORKFLOW: // - if ( workflow_steps.contains('fcsgx') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('fcsgx') || include_workflow_steps.contains('ALL') ) { RUN_FCSGX ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.fcs_gx_database_path, @@ -283,7 +284,7 @@ workflow ASCC { // // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA // - if ( workflow_steps.contains('barcodes') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('barcodes') || include_workflow_steps.contains('ALL') ) { PACBIO_BARCODE_CHECK ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.pacbio_tuple, @@ -308,7 +309,7 @@ workflow ASCC { // // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE // - if ( workflow_steps.contains('coverage') || workflow_steps.contains('busco_btk') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('coverage') || include_workflow_steps.contains('busco_btk') || include_workflow_steps.contains('ALL') ) { RUN_READ_COVERAGE ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.assembly_path, @@ -326,7 +327,7 @@ workflow ASCC { // // SUBWORKFLOW: COLLECT SOFTWARE VERSIONS // - if ( workflow_steps.contains('vecscreen') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('vecscreen') || include_workflow_steps.contains('ALL') ) { RUN_VECSCREEN ( GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.vecscreen_database_path @@ -340,7 +341,7 @@ workflow ASCC { // // SUBWORKFLOW: Run the kraken classifier // - if ( workflow_steps.contains('kraken') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('kraken') || include_workflow_steps.contains('ALL') ) { RUN_NT_KRAKEN( GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.nt_kraken_db_path, @@ -360,7 +361,7 @@ workflow ASCC { // // SUBWORKFLOW: DIAMOND BLAST FOR INPUT ASSEMBLY // - if ( workflow_steps.contains('nt_diamond') || 
workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('nt_diamond') || include_workflow_steps.contains('ALL') ) { NUCLEOT_DIAMOND ( modified_input, YAML_INPUT.out.diamond_nr_database_path @@ -377,7 +378,7 @@ workflow ASCC { // SUBWORKFLOW: DIAMOND BLAST FOR INPUT ASSEMBLY // //qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sscinames sskingdoms sphylums salltitles - if ( workflow_steps.contains('uniprot_diamond') || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('uniprot_diamond') || include_workflow_steps.contains('ALL') ) { UNIPROT_DIAMOND ( modified_input, YAML_INPUT.out.diamond_uniprot_database_path @@ -414,7 +415,7 @@ workflow ASCC { // // MODULE: AUTOFILTER ASSEMBLY BY TIARA AND FCSGX RESULTS // - if ( workflow_steps.contains('tiara') && workflow_steps.contains('fcsgx') && workflow_steps.contains("autofilter") || workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('tiara') && include_workflow_steps.contains('fcsgx') && include_workflow_steps.contains("autofilter") || include_workflow_steps.contains('ALL') ) { AUTOFILTER_AND_CHECK_ASSEMBLY ( YAML_INPUT.out.reference_tuple, EXTRACT_TIARA_HITS.out.ch_tiara, @@ -442,7 +443,7 @@ workflow ASCC { // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. 
// This will also eventually check for the above run_btk boolean from autofilter - if ( workflow_steps.contains('busco_btk') && workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || workflow_steps.contains('ALL') ) { + if ( !exclude_workflow_steps.contains("btk") && include_workflow_steps.contains('busco_btk') && include_workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("btk") && include_workflow_steps.contains('ALL') ) { YAML_INPUT.out.reference_tuple .combine(ch_bam) @@ -483,26 +484,6 @@ workflow ASCC { } - // - // SUBWORKFLOW: MERGES DATA THAT IS NOT USED IN THE CREATION OF THE BTK_DATASETS FOLDER - // -/* ASCC_MERGE_TABLES ( - GC_CONTENT.out.txt, // FROM -- GC_COVERAGE.out.tsv - ch_coverage, // FROM -- RUN_COVERAGE.out.tsv.map{it[1]} - ch_tiara, // FROM -- TIARA_TIARA.out.classifications.map{it[1]} - [], // <-- BACTERIAL KRAKEN -- NOT IN PIPELINE YET - ch_kraken3, // FROM -- RUN_NT_KRAKEN.out.lineage.map{it[1]} - ch_nt_blast, // FROM -- EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} - ch_kmers, // FROM -- GET_KMERS_PROFILE.out.combined_csv - nt_hits, // FROM -- NUCLEOT_DIAMOND.out.reformed.map{it[1]} - un_hits, // FROM -- UNIPROT_DIAMOND.out.reformed.map{it[1]} - [], // <-- MARKER SCAN -- NOT IN PIPELINE YET - [], // <-- CONTIGVIZ -- NOT IN PIPELINE YET - CREATE_BTK_DATASET.out.create_summary.map{it[1]}, - [], // <-- BUSCO_BTK -- NOT IN PIPELINE YET - ch_fcsgx // FROM -- PARSE_FCSGX_RESULT.out.fcsgxresult.map{it[1]} - ) */ - // // SUBWORKFLOW: Collates version data from prior subworflows // From cf544bbc1c5eef1d159cb75113b05f15c4f3d762 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 18 Jul 2024 12:55:44 +0100 Subject: [PATCH 089/117] Fixing reviewer comments --- bin/autofilter.py | 16 +++++++++------- bin/generate_samplesheet.py | 14 ++++++++++---- bin/process_result_tables.py | 2 ++ conf/test_full.config | 31 ------------------------------- modules/local/sanger_tol_btk.nf | 12 
++++++------ workflows/ascc.nf | 28 ++++++++++++++-------------- 6 files changed, 41 insertions(+), 62 deletions(-) delete mode 100644 conf/test_full.config diff --git a/bin/autofilter.py b/bin/autofilter.py index 11ccc83d..93849f67 100755 --- a/bin/autofilter.py +++ b/bin/autofilter.py @@ -13,7 +13,7 @@ Script for filtering the assembly to remove putative contaminants based on -FGCS-GX and Tiara results. +FCS-GX and Tiara results. ------------------------------------- """ @@ -32,9 +32,9 @@ def parse_args(): formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent(DESCRIPTION), ) - parser.add_argument("fasta", type=str, help="Path to the fasta assembly file") - parser.add_argument("-t", "--tiara", type=str, help="Path to the tiara summary file") - parser.add_argument("-s", "--fcsgx_sum", type=str, help="Path to the fcs-gx_summary.csv file") + parser.add_argument("fasta", type=str, help="Path to the assembly FASTA file") + parser.add_argument("-t", "--tiara", type=str, help="Path to the Tiara summary file") + parser.add_argument("-s", "--fcsgx_summary", type=str, help="Path to the fcs-gx_summary.csv file") parser.add_argument( "-o", "--output_auto_filtered", @@ -42,7 +42,9 @@ def parse_args(): help="Path to the assembly_autofiltered.fasta file", default="autofiltered.fasta", ) - parser.add_argument("-c", "--combined_sum", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file") + parser.add_argument( + "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" + ) parser.add_argument( "-r", "--rejected_seq", @@ -175,9 +177,9 @@ def main(): assembly_path = args.fasta tiara_results_path = args.tiara - fcs_gx_summary_path = args.fcsgx_sum + fcs_gx_summary_path = args.fcsgx_summary filtered_assembly_path = args.output_auto_filtered - combined_summary = args.combined_sum + combined_summary = args.fcs_gx_and_tiara_summary excluded_seq_list_path = args.rejected_seq ncbi_rankedlist = 
args.ncbi_rankedlineage_path diff --git a/bin/generate_samplesheet.py b/bin/generate_samplesheet.py index 4691944e..12af7059 100755 --- a/bin/generate_samplesheet.py +++ b/bin/generate_samplesheet.py @@ -5,7 +5,9 @@ import argparse """ -A simple script to generate csv file +A simple script to generate a csv file required for the sanger-tol/blobtoolkit pipeline-module. + +Required input include the sample ID and the mapped BAM file generated with PacBio data and input FASTA assembly Written by Damon-Lee Pointon (dp24/DLBPointon) """ @@ -14,7 +16,11 @@ def parse_args(): parser = argparse.ArgumentParser(description="Generate a csv file for BTK") parser.add_argument("sample_name", type=str, help="Name of sample") - parser.add_argument("pacbio_path", type=str, help="Path containing the pacbio files") + parser.add_argument( + "mapped_bam_file", + type=str, + help="Path containing the mapped BAM generated with PacBio data and the ASCC input assembly", + ) parser.add_argument("-v", "--version", action="version", version="1.0.0") return parser.parse_args() @@ -25,8 +31,8 @@ def main(): data_list = [] data_list.append("sample,datatype,datafile\n") - if args.pacbio_path.endswith(".bam"): - data_list.append(f"{args.sample_name},pacbio,{args.pacbio_path}\n") + if args.mapped_bam_file.endswith(".bam"): + data_list.append(f"{args.sample_name},pacbio,{args.mapped_bam_file}\n") else: sys.exit("I was expecting a mapped BAM file") diff --git a/bin/process_result_tables.py b/bin/process_result_tables.py index 831adc67..dc1ca398 100755 --- a/bin/process_result_tables.py +++ b/bin/process_result_tables.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + """ Script processing the cobiont check result tables to add a combined classification ('merged_classif') column that is based on the output of multiple tools. 
Also generates a table for estimated coverages per 'merged_classif' column diff --git a/conf/test_full.config b/conf/test_full.config deleted file mode 100644 index 338f7c8b..00000000 --- a/conf/test_full.config +++ /dev/null @@ -1,31 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running full-size tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a full size pipeline test. - - Use as follows: - nextflow run sanger-tol/ascc -profile test_full, --outdir - ----------------------------------------------------------------------------------------- -*/ -process { - maxForks = 1 -} - -executor { - queueSize=1 -} - -params { - config_profile_name = 'Full test profile' - config_profile_description = 'Full test dataset to check pipeline function' - - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' - - // Fasta references - fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz' -} diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 4ce728de..dbd9fc2a 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -16,12 +16,12 @@ process SANGER_TOL_BTK { val gca_accession output: - path("${meta.id}_btk_out/blobtoolkit/plots"), emit: btk_plots - path("${meta.id}_btk_out/blobtoolkit/draft"), emit: btk_dataset - path("${meta.id}_btk_out/blobtoolkit/draft/summary.json.gz"), emit: btk_summary - path("${meta.id}_btk_out/busco"), emit: btk_busco - path("${meta.id}_btk_out/multiqc"), emit: btk_multiqc - path("blobtoolkit_pipeline_info"), emit: btk_pipeline + path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots + path("${meta.id}_btk_out/blobtoolkit/draft"), emit: dataset + path("${meta.id}_btk_out/blobtoolkit/draft/summary.json.gz"), emit: summary_json + path("${meta.id}_btk_out/busco"), emit: busco_data + path("${meta.id}_btk_out/multiqc"), emit: multiqc_report + path("blobtoolkit_pipeline_info"), emit: pipeline_info script: def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 4f348079..f26aca64 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -391,22 +391,22 @@ workflow ASCC { un_full = [] } - ch_got_genome = GENERATE_GENOME.out.dot_genome.map{it[1]} + ch_dot_genome = GENERATE_GENOME.out.dot_genome.map{it[1]} CREATE_BTK_DATASET ( - GENERATE_GENOME.out.reference_tuple.first(), - ch_got_genome, + GENERATE_GENOME.out.reference_tuple, + ch_dot_genome, ch_kmers, - ch_tiara.first(), + ch_tiara, ch_nt_blast, - ch_fcsgx.first(), - 
ch_bam.first(), - ch_coverage.first(), - ch_kraken1.first(), - ch_kraken2.first(), - ch_kraken3.first(), - nt_hits.first(), - un_hits.first(), + ch_fcsgx, + ch_bam, + ch_coverage, + ch_kraken1, + ch_kraken2, + ch_kraken3, + nt_hits, + un_hits, YAML_INPUT.out.ncbi_taxonomy_path.first() ) //ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) @@ -443,7 +443,7 @@ workflow ASCC { // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. // This will also eventually check for the above run_btk boolean from autofilter - if ( !exclude_workflow_steps.contains("btk") && include_workflow_steps.contains('busco_btk') && include_workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("btk") && include_workflow_steps.contains('ALL') ) { + if ( !exclude_workflow_steps.contains("busbo_btk") && include_workflow_steps.contains('busco_btk') && include_workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('ALL') ) { YAML_INPUT.out.reference_tuple .combine(ch_bam) @@ -477,7 +477,7 @@ workflow ASCC { MERGE_BTK_DATASETS ( CREATE_BTK_DATASET.out.btk_datasets, - SANGER_TOL_BTK.out.btk_dataset + SANGER_TOL_BTK.out.dataset ) //ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) From 6277abbfcca2ed9791f534d2cf1476c117d03877 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 18 Jul 2024 13:45:37 +0100 Subject: [PATCH 090/117] Fixed Variable name for cicd --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 86bc8d40..24d41b55 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -139,4 +139,4 @@ jobs: # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using 
strategy.matrix run: | - nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude btk + nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude busco_btk From 560b0d35d8687cb827c4316a479e6d3671acb8ee Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 25 Jul 2024 17:11:52 +0100 Subject: [PATCH 091/117] Adding version output and stubs --- modules/local/ascc_merge_tables.nf | 29 ++++++++++++++----- modules/local/autofiltering.nf | 13 +++++++++ modules/local/blast_chunk_to_full.nf | 4 ++- modules/local/convert_to_hits_file.nf | 12 ++++++++ modules/local/create_btk_dataset.nf | 16 ++++++++++ modules/local/generate_samplesheet.nf | 12 ++++++++ modules/local/kmer_count_dim_reduction.nf | 4 +-- .../kmer_count_dim_reduction_combine_csv.nf | 4 +-- modules/local/merge_btk_datasets.nf | 15 +++++++++- .../local/samtools_depth_average_coverage.nf | 4 +-- modules/local/sanger_tol_btk.nf | 1 + 11 files changed, 99 insertions(+), 15 deletions(-) diff --git a/modules/local/ascc_merge_tables.nf b/modules/local/ascc_merge_tables.nf index d39057a4..22d6e714 100644 --- a/modules/local/ascc_merge_tables.nf +++ b/modules/local/ascc_merge_tables.nf @@ -5,25 +5,26 @@ process ASCC_MERGE_TABLES { container 'sanger-tol/ascc_btk:3.2.6-c1' input: - tuple val(meta), path(gc_content, stageAs: "GC.txt") + tuple val(meta), path(gc_content, stageAs: "GC.txt") path coverage - path tiara, stageAs: "TIARA.txt" + path tiara, stageAs: "TIARA.txt" path bacterial_kraken - path nt_kraken, stageAs: "LINEAGE.txt" + path nt_kraken, stageAs: "LINEAGE.txt" path nt_blast path dim_reduction_embeddings path nr_diamond - path uniprot_diamond, stageAs: "UP_DIAMOND.tsv" + path uniprot_diamond, stageAs: "UP_DIAMOND.tsv" path cobiontid_markerscan path contigviz - path btk, stageAs: "BTK_summary_table_full.tsv" + path btk, 
stageAs: "BTK_summary_table_full.tsv" path btk_busco - path fcs_gx, stageAs: "FCSGX_parsed.csv" + path fcs_gx, stageAs: "FCSGX_parsed.csv" output: tuple val(meta), path("*_contamination_check_merged_table.csv") , emit: merged_table tuple val(meta), path("*_contamination_check_merged_table_extended.csv"), optional: true, emit: extended_table tuple val(meta), path("*_phylum_counts_and_coverage.csv") , optional: true, emit: phylum_counts + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -46,7 +47,6 @@ process ASCC_MERGE_TABLES { def cobiontid_markerscan = "" """ - ascc_m_tables.py \\ --gc_cov $gc_content \\ --sample_name $meta.id \\ @@ -71,4 +71,19 @@ process ASCC_MERGE_TABLES { ascc_merge_tables: \$(ascc_merge_tables.py --version | cut -d' ' -f2) END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_contamination_check_merged_table.csv + touch ${prefix}_contamination_check_merged_table_extended.csv + touch ${prefix}_phylum_counts_and_coverage.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + ascc_merge_tables: \$(ascc_merge_tables.py --version | cut -d' ' -f2) + END_VERSIONS + """ + } diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index 7bc294b1..00dbc98d 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -41,4 +41,17 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { END_VERSIONS """ + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch autofiltered.fasta + touch ABNORMAL_CHECK.csv + touch assembly_filtering_removed_sequences.txt + touch fcs-gx_alarm_indicator_file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ } diff --git a/modules/local/blast_chunk_to_full.nf b/modules/local/blast_chunk_to_full.nf index c010b678..f51b8b97 100644 --- 
a/modules/local/blast_chunk_to_full.nf +++ b/modules/local/blast_chunk_to_full.nf @@ -28,8 +28,10 @@ process BLAST_CHUNK_TO_FULL { """ stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ - touch full_coords.tsv + touch ${prefix}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/convert_to_hits_file.nf b/modules/local/convert_to_hits_file.nf index 0bb59016..0f04ad93 100644 --- a/modules/local/convert_to_hits_file.nf +++ b/modules/local/convert_to_hits_file.nf @@ -25,4 +25,16 @@ process CONVERT_TO_HITS_FILE { convert_to_hits: \$(convert_to_hits.py --version | cut -d' ' -f2) END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + convert_to_hits: \$(convert_to_hits.py -v) + END_VERSIONS + """ } diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 4a8a055d..54b1eb4a 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -26,6 +26,8 @@ process CREATE_BTK_DATASET { output: tuple val(meta), path("btk_datasets"), emit: btk_datasets tuple val(meta), path("btk_summary_table_full.tsv"), emit: create_summary + path "versions.yaml", emit: versions + when: task.ext.when == null || task.ext.when @@ -69,4 +71,18 @@ process CREATE_BTK_DATASET { create_btk_dataset: \$(general_purpose_functions.py --version | cut -d' ' -f2) END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + mkdir btk_datasets + touch btk_datasets/${prefix}.txt + touch btk_summary_table_full.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + create_btk_dataset: \$(create_btk_dataset_V2.py -v) + END_VERSIONS + """ } diff --git a/modules/local/generate_samplesheet.nf b/modules/local/generate_samplesheet.nf index 29e4f322..e2c30de1 100644 --- a/modules/local/generate_samplesheet.nf +++ b/modules/local/generate_samplesheet.nf @@ -12,6 +12,7 @@ 
process GENERATE_SAMPLESHEET { output: tuple val(meta), path("*csv"), emit: csv + path "versions.yml", emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" @@ -28,4 +29,15 @@ process GENERATE_SAMPLESHEET { END_VERSIONS """ + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + generate_samplesheet: \$(generate_samplesheet.py -v) + END_VERSIONS + """ } diff --git a/modules/local/kmer_count_dim_reduction.nf b/modules/local/kmer_count_dim_reduction.nf index f80b06bc..b900dd92 100755 --- a/modules/local/kmer_count_dim_reduction.nf +++ b/modules/local/kmer_count_dim_reduction.nf @@ -15,8 +15,8 @@ process KMER_COUNT_DIM_REDUCTION { val autoencoder_epochs_count output: - path '*_kmers_dim_reduction_embeddings.csv', emit: csv - path "versions.yml", emit: versions + path '*_kmers_dim_reduction_embeddings.csv', emit: csv + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/kmer_count_dim_reduction_combine_csv.nf b/modules/local/kmer_count_dim_reduction_combine_csv.nf index cdd2d002..01998a76 100755 --- a/modules/local/kmer_count_dim_reduction_combine_csv.nf +++ b/modules/local/kmer_count_dim_reduction_combine_csv.nf @@ -11,8 +11,8 @@ process KMER_COUNT_DIM_REDUCTION_COMBINE_CSV { tuple val(meta), path(input_files) output: - path '*_kmers_dim_reduction_embeddings_combined.csv', emit: csv - path "versions.yml", emit: versions + path '*_kmers_dim_reduction_embeddings_combined.csv', emit: csv + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index 1f772b93..edbe1381 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -13,6 +13,7 @@ process MERGE_BTK_DATASETS { output: tuple val(meta), path("merged_datasets"), emit: merged_datasets + path "versions.yaml", 
emit: versions when: task.ext.when == null || task.ext.when @@ -33,7 +34,19 @@ process MERGE_BTK_DATASETS { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - create_btk_dataset: \$(general_purpose_functions.py --version | cut -d' ' -f2) + merge_btk_datasets_V2: \$(merge_btk_datasets_V2.py --version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + mkdir -p merged_datasets/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + merge_btk_datasets_V2: \$(merge_btk_datasets_V2.py -v) END_VERSIONS """ } diff --git a/modules/local/samtools_depth_average_coverage.nf b/modules/local/samtools_depth_average_coverage.nf index 9af43516..b04b9ab2 100644 --- a/modules/local/samtools_depth_average_coverage.nf +++ b/modules/local/samtools_depth_average_coverage.nf @@ -11,8 +11,8 @@ process SAMTOOLS_DEPTH_AVERAGE_COVERAGE { tuple val(meta), path(depth) output: - tuple val(meta), path( "*.txt" ), emit: average_coverage - path "versions.yml", emit: versions + tuple val(meta), path( "*.txt" ), emit: average_coverage + path "versions.yml", emit: versions script: def args = task.ext.args ?: '' diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index dbd9fc2a..48ab9ba9 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -22,6 +22,7 @@ process SANGER_TOL_BTK { path("${meta.id}_btk_out/busco"), emit: busco_data path("${meta.id}_btk_out/multiqc"), emit: multiqc_report path("blobtoolkit_pipeline_info"), emit: pipeline_info + path "versions.yml", emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" From 9afd0405eb9b200965cd1090c91b663250c45b0a Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 26 Jul 2024 11:46:10 +0100 Subject: [PATCH 092/117] updates and version outputs --- modules/local/ascc_merge_tables.nf | 5 ++++- modules/local/create_btk_dataset.nf | 2 +- modules/local/merge_btk_datasets.nf 
| 5 +++-- workflows/ascc.nf | 32 ++++++++++++++++++++++++++--- 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/modules/local/ascc_merge_tables.nf b/modules/local/ascc_merge_tables.nf index 22d6e714..6850f1d3 100644 --- a/modules/local/ascc_merge_tables.nf +++ b/modules/local/ascc_merge_tables.nf @@ -2,7 +2,10 @@ process ASCC_MERGE_TABLES { tag "$meta.id" label 'process_low' - container 'sanger-tol/ascc_btk:3.2.6-c1' + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "ASCC_MERGE_TABLES module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "docker.io/genomehubs/blobtoolkit:4.3.9" input: tuple val(meta), path(gc_content, stageAs: "GC.txt") diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 54b1eb4a..2a82cb39 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -26,7 +26,7 @@ process CREATE_BTK_DATASET { output: tuple val(meta), path("btk_datasets"), emit: btk_datasets tuple val(meta), path("btk_summary_table_full.tsv"), emit: create_summary - path "versions.yaml", emit: versions + path "versions.yml", emit: versions when: diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index edbe1381..df5d4f68 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -12,8 +12,9 @@ process MERGE_BTK_DATASETS { tuple val(meta2), path(busco_btk_datasets) output: - tuple val(meta), path("merged_datasets"), emit: merged_datasets - path "versions.yaml", emit: versions + tuple val(meta), path("merged_datasets"), emit: merged_datasets + tuple val(meta), path("merged_datasets/btk_busco_summary_table_full.tsv"), emit: busco_summary_tsv + path "versions.yaml", emit: versions when: task.ext.when == null || task.ext.when diff --git a/workflows/ascc.nf b/workflows/ascc.nf index f26aca64..414509a9 100644 --- a/workflows/ascc.nf +++ 
b/workflows/ascc.nf @@ -409,7 +409,7 @@ workflow ASCC { un_hits, YAML_INPUT.out.ncbi_taxonomy_path.first() ) - //ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) + ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) // @@ -472,18 +472,44 @@ workflow ASCC { YAML_INPUT.out.taxid, 'GCA_0001' ) - //ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) + ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) MERGE_BTK_DATASETS ( CREATE_BTK_DATASET.out.btk_datasets, SANGER_TOL_BTK.out.dataset ) - //ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) + ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) + busco_merge_btk = MERGE_BTK_DATASETS.out.busco_summary_tsv + } else { + busco_merge_btk = [] } + // + // SUBWORKFLOW: MERGES DATA THAT IS NOT USED IN THE CREATION OF THE BTK_DATASETS FOLDER + // + ASCC_MERGE_TABLES ( + GC_CONTENT.out.txt, // FROM -- GC_COVERAGE.out.tsv + ch_coverage, // FROM -- RUN_COVERAGE.out.tsv.map{it[1]} + ch_tiara, // FROM -- TIARA_TIARA.out.classifications.map{it[1]} + [], // <-- BACTERIAL KRAKEN -- NOT IN PIPELINE YET + ch_kraken3, // FROM -- RUN_NT_KRAKEN.out.lineage.map{it[1]} + ch_nt_blast, // FROM -- EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} + ch_kmers, // FROM -- GET_KMERS_PROFILE.out.combined_csv + nt_hits, // FROM -- NUCLEOT_DIAMOND.out.reformed.map{it[1]} + un_hits, // FROM -- UNIPROT_DIAMOND.out.reformed.map{it[1]} + [], // <-- MARKER SCAN -- NOT IN PIPELINE YET + [], // <-- CONTIGVIZ -- NOT IN PIPELINE YET + CREATE_BTK_DATASET.out.create_summary.map{it[1]}, // FROM -- CREATE_BTK_DATASET + busco_merge_btk, // FROM -- MERGE_BTK_DATASETS.out.busco_summary_tsv + ch_fcsgx // FROM -- PARSE_FCSGX_RESULT.out.fcsgxresult.map{it[1]} + ) + ch_versions = ch_versions.mix(ASCC_MERGE_TABLES.out.versions) + + + // // SUBWORKFLOW: Collates version data from prior subworflows // From a3b2d98ec27271a2cc2da99dd0eda7c1b3d34cf6 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 26 Jul 2024 
14:10:53 +0100 Subject: [PATCH 093/117] Adding Merge Tables - spelling error - tuple error --- modules/local/merge_btk_datasets.nf | 10 +++++----- modules/local/sanger_tol_btk.nf | 2 +- workflows/ascc.nf | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index df5d4f68..bc0df020 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -2,10 +2,10 @@ process MERGE_BTK_DATASETS { tag "$meta.id" label 'process_low' - conda "conda-forge::python=3.9" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'biocontainers/python:3.9' }" + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "ASCC_MERGE_TABLES module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "docker.io/genomehubs/blobtoolkit:4.3.9" input: tuple val(meta), path(create_btk_datasets) @@ -14,7 +14,7 @@ process MERGE_BTK_DATASETS { output: tuple val(meta), path("merged_datasets"), emit: merged_datasets tuple val(meta), path("merged_datasets/btk_busco_summary_table_full.tsv"), emit: busco_summary_tsv - path "versions.yaml", emit: versions + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 48ab9ba9..09518fcf 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -16,8 +16,8 @@ process SANGER_TOL_BTK { val gca_accession output: + tuple val(meta), path("${meta.id}_btk_out/blobtoolkit/draft"), emit: dataset path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots - path("${meta.id}_btk_out/blobtoolkit/draft"), emit: dataset path("${meta.id}_btk_out/blobtoolkit/draft/summary.json.gz"), emit: summary_json path("${meta.id}_btk_out/busco"), emit: 
busco_data path("${meta.id}_btk_out/multiqc"), emit: multiqc_report diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 414509a9..4ceff862 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -443,7 +443,7 @@ workflow ASCC { // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. // This will also eventually check for the above run_btk boolean from autofilter - if ( !exclude_workflow_steps.contains("busbo_btk") && include_workflow_steps.contains('busco_btk') && include_workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('ALL') ) { + if ( !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('busco_btk') && include_workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('ALL') ) { YAML_INPUT.out.reference_tuple .combine(ch_bam) @@ -480,7 +480,7 @@ workflow ASCC { SANGER_TOL_BTK.out.dataset ) ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) - busco_merge_btk = MERGE_BTK_DATASETS.out.busco_summary_tsv + busco_merge_btk = MERGE_BTK_DATASETS.out.busco_summary_tsv.map{it[1]} } else { busco_merge_btk = [] From 7bf73910b348f135274142d72d40337804d51824 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 6 Aug 2024 12:14:20 +0100 Subject: [PATCH 094/117] Updates to spelling, name corrections to match OLD_ASCC --- assets/github_testing/test.yaml | 1 + bin/ascc_m_tables.py | 313 -------------------------- bin/convert_to_hits.py | 2 +- bin/merge_btk_datasets.py | 186 +++++++++------ bin/merge_btk_datasets_V2.py | 187 --------------- modules/local/ascc_merge_tables.nf | 10 +- modules/local/generate_samplesheet.nf | 4 +- modules/local/merge_btk_datasets.nf | 6 +- modules/local/sanger_tol_btk.nf | 2 + subworkflows/local/pe_mapping.nf | 4 +- 
subworkflows/local/yaml_input.nf | 2 + workflows/ascc.nf | 15 +- 12 files changed, 140 insertions(+), 592 deletions(-) delete mode 100755 bin/ascc_m_tables.py delete mode 100755 bin/merge_btk_datasets_V2.py diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index 9c258fcb..d1929eb7 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -19,6 +19,7 @@ ncbi_accessionids_folder: /home/runner/work/ascc/ascc/20240709_tiny_accession2ta ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/ ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages +busco_lineages: "diptera_odb10,insecta_odb10" fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/ diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond.dmnd diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond.dmnd diff --git a/bin/ascc_m_tables.py b/bin/ascc_m_tables.py deleted file mode 100755 index 932f5059..00000000 --- a/bin/ascc_m_tables.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python3 - -VERSION = "2.0.0" -DESCRIPTION = """ -Script for merging contaminant check results into one table -Version: {VERSION} ---- -Written by Eerik Anuin - -Re-Written by Damon-Lee Pointon (dp24/DLBPointon) -""" - -import argparse -import pandas as pd -import textwrap -import os -import sys -import general_purpose_functions as gpf - - -def parse_args(): - parser = argparse.ArgumentParser( - prog="AsccMergeTables", - formatter_class=argparse.RawDescriptionHelpFormatter, - description=textwrap.dedent(DESCRIPTION), - ) - parser.add_argument("-gc", "--gc_cov", required=True, type=str, help="GC Coverage file") - parser.add_argument("-c", "--coverage", type=str, help="Coverage file") - parser.add_argument("-t", "--tiara", type=str, help="Tiara file") - parser.add_argument("-bk", "--bacterial_kraken", type=str, help="Bacterial Kraken file") - 
parser.add_argument("-nk", "--nt_kraken", type=str, help="NT Kraken file") - parser.add_argument("-nb", "--nt_blast", type=str, help="NT Blast file") - parser.add_argument("-dr", "--dim_reduction_embeddings", type=str, help="Dimensional Reduction file") - parser.add_argument("-nd", "--nr_diamond", type=str, help="NR Diamond file") - parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file") - parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file") - parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file") - parser.add_argument("-bb", "--busco_btk", type=str, help="Busco Blobtoolkit file") - parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file") - parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample") - parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file") - parser.add_argument("-v", "--version", action="version", version=VERSION) - return parser.parse_args() - - -def check_paths(paths_dict, required_files): - """ - Checks if a required file exists and exits with an error message if it doesn't - """ - out_dict = dict() - - for data_type, input_file in paths_dict.items(): - if input == None: - pass - else: - out_dict[data_type] = input_file - - return out_dict - - -def load_and_merge_dataframes(paths_dict): - """ - Loads the tables with individual variables (GC content, coverage, kmer counts etc) and combines them into one table - """ - gc_path = paths_dict["gc_content"] - df = pd.read_csv(gc_path, sep="\t", header=None) - if df.shape[0] > 0: - df.columns = ["scaff", "gc"] - df["gc"] = df["gc"] * 100 - else: - sys.stderr.write("No rows were found in the GC content table ({})\n".format(gc_path)) - sys.exit(1) - - coverage_df = None - if paths_dict["coverage"] is not None: - coverage_df = pd.read_csv(paths_dict["coverage"], sep=",", header=None) - if coverage_df.shape[0] > 0: - coverage_df.columns = ["scaff", "coverage"] - else: - 
sys.stderr.write(f"No rows were found in the coverages table ({paths_dict['coverage']})\n") - coverage_df = None - - tiara_df = None - if paths_dict["tiara"] is not None: - tiara_df = pd.read_csv(paths_dict["tiara"], sep="\t") - if tiara_df.shape[0] > 0: - tiara_df["tiara_classif"] = tiara_df["class_fst_stage"] - tiara_snd_stage_hits = tiara_df.index[tiara_df["class_snd_stage"].notnull()] - tiara_df["tiara_classif"][tiara_snd_stage_hits] = tiara_df["class_snd_stage"][tiara_snd_stage_hits] - tiara_df = tiara_df.iloc[:, [0, 3]] - tiara_df.columns = ["scaff", "tiara_classif"] - else: - sys.stderr.write("No rows were found in Tiara output table ({})\n".format(paths_dict["tiara"])) - tiara_df = None - - bacterial_kraken_df = None - if paths_dict["bacterial_kraken"] is not None: - bacterial_kraken_df = pd.read_csv(paths_dict["bacterial_kraken"], sep=",") - if bacterial_kraken_df.shape[0] > 0: - bacterial_kraken_df.rename(columns={bacterial_kraken_df.columns[0]: "scaff"}, inplace=True) - bacterial_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) - else: - sys.stderr.write( - "No rows were found in bacterial Kraken output table ({})\n".format(paths_dict["bacterial_kraken"]) - ) - bacterial_kraken_df = None - - nt_kraken_df = None - if paths_dict["nt_kraken"] is not None: - nt_kraken_df = pd.read_csv(paths_dict["nt_kraken"], sep=",") - if nt_kraken_df.shape[0] > 0: - nt_kraken_df.rename(columns={nt_kraken_df.columns[0]: "scaff"}, inplace=True) - nt_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) - else: - sys.stderr.write("No rows were found in nt Kraken output table ({})\n".format(paths_dict["nt_kraken"])) - nt_kraken_df = None - - dim_reduction_df = None - if paths_dict["dim_reduction_embeddings"] is not None: - dim_reduction_df = pd.read_csv(paths_dict["dim_reduction_embeddings"], sep=",") - if dim_reduction_df.shape[0] == 0: - sys.stderr.write( - "No rows were found in kmers dimensionality reduction output table ({})\n".format( - 
paths_dict["dim_reduction_embeddings"] - ) - ) - dim_reduction_df = None - - btk_df = None - if paths_dict["blobtoolkit"] is not None: - btk_df = pd.read_csv(paths_dict["blobtoolkit"], header=0, delimiter="\t") - if btk_df.shape[0] == 0: - sys.stderr.write( - "No rows were found in the BlobToolKit results table ({})\n".format(paths_dict["blobtoolkit"]) - ) - sys.exit(1) - btk_renaming_dict = {"identifiers": "scaff", "bestsum_phylum": "btk_bestsum_phylum"} - if "mapped_hifi_reads_sorted_cov" in btk_df.columns: - btk_renaming_dict["mapped_hifi_reads_sorted_cov"] = "btk_cov" - if "bestsum_phylum" in btk_df.columns: - btk_renaming_dict["bestsum_phylum"] = "btk_bestsum_phylum" - # {"identifiers": "scaff", "mapped_hifi_reads_sorted_cov": "btk_cov", "bestsum_phylum": "btk_bestsum_phylum"} - - btk_df.rename(columns=btk_renaming_dict, inplace=True) - - btk_selected_cols = [ - col for col in btk_df.columns if col in ["scaff", "length", "btk_cov", "btk_bestsum_phylum"] - ] - if len(btk_selected_cols) > 0: - btk_df = btk_df[btk_selected_cols] - else: - btk_df = None - - btk_busco_df = None - if paths_dict["btk_busco"] is not None: - btk_busco_df = pd.read_csv(paths_dict["btk_busco"], header=0, delimiter="\t") - if btk_busco_df.shape[0] == 0: - sys.stderr.write( - "No rows were found in the BUSCO-based BlobToolKit results table ({})\n".format(paths_dict["btk_busco"]) - ) - sys.exit(1) - btk_busco_renaming_dict = {"identifiers": "scaff"} - - btk_busco_df.rename(columns=btk_busco_renaming_dict, inplace=True) - - btk_busco_selected_cols = [ - col - for col in btk_busco_df.columns - if col - in [ - "scaff", - "buscogenes_superkingdom", - "buscogenes_kingdom", - "buscogenes_phylum", - "buscogenes_class", - "buscogenes_order", - "buscogenes_family", - "buscogenes_genus", - "buscogenes_species", - "buscoregions_superkingdom", - "buscoregions_kingdom", - "buscoregions_phylum", - "buscoregions_class", - "buscoregions_order", - "buscoregions_family", - "buscoregions_genus", - 
"buscoregions_species", - ] - ] - if len(btk_busco_selected_cols) > 0: - btk_busco_df = btk_busco_df[btk_busco_selected_cols] - else: - btk_busco_df = None - - fcs_gx_df = None - if paths_dict["fcs_gx"] is not None: - fcs_gx_df = pd.read_csv(paths_dict["fcs_gx"], sep=",") - if fcs_gx_df.shape[0] == 0: - sys.stderr.write("No rows were found in FCS-GX output table ({})\n".format(paths_dict["fcs_gx"])) - fcs_gx_df = None - - nt_blast_df = None - if paths_dict["nt_blast"] is not None: - nt_blast_df = pd.read_csv(paths_dict["nt_blast"], sep=",") - if nt_blast_df.shape[0] == 0: - sys.stderr.write("No rows were found in nt BLAST output table ({})\n".format(paths_dict["nt_blast"])) - nt_blast_df = None - - nr_diamond_df = None - if paths_dict["nr_diamond"] is not None: - nr_diamond_df = pd.read_csv(paths_dict["nr_diamond"], sep=",") - if nr_diamond_df.shape[0] == 0: - sys.stderr.write("No rows were found in nr Diamond output table ({})\n".format(paths_dict["nr_diamond"])) - nr_diamond_df = None - - uniprot_diamond_df = None - if paths_dict["uniprot_diamond"] is not None: - uniprot_diamond_df = pd.read_csv(paths_dict["uniprot_diamond"], sep=",") - if uniprot_diamond_df.shape[0] == 0: - sys.stderr.write( - "No rows were found in Uniprot Diamond output table ({})\n".format(paths_dict["uniprot_diamond"]) - ) - uniprot_diamond_df = None - - cobiontid_markerscan_df = None - if paths_dict["cobiontid_markerscan"] is not None: - cobiontid_markerscan_df = pd.read_csv(paths_dict["cobiontid_markerscan"], sep=",") - if cobiontid_markerscan_df.shape[0] == 0: - sys.stderr.write( - "No rows were found in CobiontID MarkerScan output table ({})\n".format( - paths_dict["cobiontid_markerscan"] - ) - ) - uniprot_diamond_df = None - - contigviz_df = None - if paths_dict["contigviz"] is not None: - contigviz_df = pd.read_csv(paths_dict["contigviz"], sep=",") - if contigviz_df.shape[0] == 0: - sys.stderr.write("No rows were found in ContigViz output table ({})\n".format(paths_dict["contigviz"])) 
- contigviz_df = None - - if coverage_df is not None: - df = pd.merge(df, coverage_df, on="scaff", how="outer") - if tiara_df is not None: - df = pd.merge(df, tiara_df, on="scaff", how="outer") - if bacterial_kraken_df is not None: - df = pd.merge(df, bacterial_kraken_df, on="scaff", how="outer") - if nt_kraken_df is not None: - df = pd.merge(df, nt_kraken_df, on="scaff", how="outer") - if dim_reduction_df is not None: - df = pd.merge(df, dim_reduction_df, on="scaff", how="outer") - if nt_blast_df is not None: - df = pd.merge(df, nt_blast_df, on="scaff", how="outer") - if nr_diamond_df is not None: - df = pd.merge(df, nr_diamond_df, on="scaff", how="outer") - if uniprot_diamond_df is not None: - df = pd.merge(df, uniprot_diamond_df, on="scaff", how="outer") - if fcs_gx_df is not None: - df = pd.merge(df, fcs_gx_df, on="scaff", how="outer") - if cobiontid_markerscan_df is not None: - df = pd.merge(df, cobiontid_markerscan_df, on="scaff", how="outer") - if contigviz_df is not None: - df = pd.merge(df, contigviz_df, on="scaff", how="outer") - if btk_df is not None: - df = pd.merge(df, btk_df, on="scaff", how="outer") - if btk_busco_df is not None: - df = pd.merge(df, btk_busco_df, on="scaff", how="outer") - - return df - - -def main(args): - paths_dict = dict() - paths_dict["gc_content"] = args.gc_cov - paths_dict["coverage"] = args.coverage - paths_dict["tiara"] = args.tiara - paths_dict["bacterial_kraken"] = args.bacterial_kraken - paths_dict["nt_kraken"] = args.nt_kraken - paths_dict["nt_blast"] = args.nt_blast - paths_dict["dim_reduction_embeddings"] = args.dim_reduction_embeddings - paths_dict["nr_diamond"] = args.nr_diamond - paths_dict["uniprot_diamond"] = args.uniprot_diamond - paths_dict["cobiontid_markerscan"] = args.markerscan - paths_dict["contigviz"] = args.contigviz - paths_dict["blobtoolkit"] = args.blobtoolkit - paths_dict["btk_busco"] = args.busco_btk - paths_dict["fcs_gx"] = args.fcs_gx - - required_files = ["gc_content"] - - paths_dict = 
check_paths(paths_dict, required_files) - df = load_and_merge_dataframes(paths_dict) - df.to_csv(f"{args.sample_name}_contamination_check_merged_table.csv", index=False) - - if ( - paths_dict["nt_blast"] - and paths_dict["nr_diamond"] - and paths_dict["uniprot_diamond"] - and paths_dict["coverage"] - and paths_dict["tiara"] - and paths_dict["nt_kraken"] - ): - process_results_tables_command = f"process_result_tables.py . {args.sample_name}" - gpf.run_system_command(process_results_tables_command) - else: - sys.stderr.write( - f"Skipping generating the {args.sample_name}_phylum_counts_and_coverage.csv file, as the variables used in this run do not include all the required variables for this (nt_blast, nr_diamond, uniprot_diamond, coverage, tiara, nt_kraken)\n" - ) - - -if __name__ == "__main__": - main(parse_args()) diff --git a/bin/convert_to_hits.py b/bin/convert_to_hits.py index 5cbe4439..37915f2f 100755 --- a/bin/convert_to_hits.py +++ b/bin/convert_to_hits.py @@ -10,7 +10,7 @@ Version: {VERSION} --- -Written by Eerik Anuin +Written by Eerik Aunin Re-Written by Damon-Lee Pointon (dp24/DLBPointon) """ diff --git a/bin/merge_btk_datasets.py b/bin/merge_btk_datasets.py index 4054d121..fcc6c9cb 100755 --- a/bin/merge_btk_datasets.py +++ b/bin/merge_btk_datasets.py @@ -1,6 +1,16 @@ #!/usr/bin/env python3 -""" -Script for merging BTK datasets from the this pipeline and the BUSCO-based Snakemake BTK pipeline + +VERSION = "2.0.0" +DESCRIPTION = f""" +--- +Script for merging BlobToolKit datasets from the createBTKdatasets output directory. 
+Version: {VERSION} +--- + +Written by Eerik Aunin (ea10) + +Modified by Damon-Lee Pointon (@dp24/@DLBPointon) + """ import json @@ -9,18 +19,61 @@ import os import sys import argparse +import textwrap import general_purpose_functions as gpf +import gzip + + +def parse_args(argv=None): + parser = argparse.ArgumentParser( + prog="mergeBTKdatasets", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument( + "-m", "--main_btk_datasets", required=True, type=str, help="The btk_datasets generated by createBTKdatasets" + ) + parser.add_argument( + "-b", + "--btk_busco_datasets", + type=str, + help="Path to the BTK dataset (blobdir) created by the BUSCO-based BTK pipeline", + ) + parser.add_argument( + "-s", + "--btk_busco_summary_full", + type=str, + help="The btk_datasets generated by createBTKdatasets", + ) + parser.add_argument( + "-o", + "--new_output_directory", + default="merged_datasets", + type=str, + help="The new output directory for the merged datasets", + ) + parser.add_argument("-v", "--version", action="version", version=VERSION) + + return parser.parse_args(argv) def load_json(filename): - """Loads a JSON file and returns it as a dictionary""" - with open(filename) as f: - return json.load(f) + """ + Loads a JSON file and returns it as a dictionary + """ + json_contents = None + if filename.endswith(".gz"): + with gzip.open(filename, "rt", encoding="UTF-8") as zipfile: + json_contents = json.load(zipfile) + else: + with open(filename) as f: + json_contents = json.load(f) + return json_contents -def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined_dataset_folder): +def create_meta_json_contents(main_btk_dataset_folder, btk_busco_dataset_folder): """ - Creates a meta.json file for the new BTK dataset by combining the two meta.json files from the input directories + Creates the contents for the meta.json file for the new BTK dataset by combining the two meta.json 
files from the input directories """ for folder in (main_btk_dataset_folder, btk_busco_dataset_folder): if os.path.isdir(folder) is False: @@ -30,11 +83,12 @@ def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined sys.exit(0) main_btk_json_path = f"{main_btk_dataset_folder}/meta.json" - btk_busco_json_path = f"{btk_busco_dataset_folder}/meta.json" + btk_busco_json_path = f"{btk_busco_dataset_folder}/meta.json.gz" for json_path in (main_btk_json_path, btk_busco_json_path): if os.path.isfile(json_path) is False: sys.stderr.write(f"File {json_path} not found)\n") sys.exit(1) + main_meta_dict = load_json(main_btk_json_path) btk_busco_meta_dict = load_json(btk_busco_json_path) @@ -53,93 +107,81 @@ def create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined else: if field_id not in keys_to_skip: merged_dict["fields"].append(field) + return merged_dict - meta_json_outpath = f"{combined_dataset_folder}/meta.json" - with open(meta_json_outpath, "w") as json_outfile: - json.dump(merged_dict, json_outfile, indent=1, sort_keys=True) - - -def main( - main_btk_dataset_folder, - btk_busco_dataset_folder, - combined_dataset_folder, - pipeline_output_folder, - skip_renaming_folders, -): - if os.path.isdir(main_btk_dataset_folder) is False: - sys.stderr.write(f"The BlobToolKit dataset ({main_btk_dataset_folder}) was not found\n") + +def detect_buscogenes_variables(merged_jsons_dict): + """ + Goes through the content of merged meta.json file (derived from both BTK datasets) and detects if buscogenes + variables are present + """ + buscogenes_present_flag = False + fields = merged_jsons_dict["fields"] + for field in fields: + field_id = field["id"] + if field_id == "taxonomy": + for item in field["children"]: + if item["id"] == "buscogenes": + buscogenes_present_flag = True + break + return buscogenes_present_flag + + +def main(args): + if os.path.isdir(args.main_btk_datasets) is False: + sys.stderr.write(f"The BlobToolKit dataset 
({args.main_btk_datasets}) was not found!\n") sys.exit(1) - if os.path.isdir(btk_busco_dataset_folder) is False: + if os.path.isdir(args.btk_busco_datasets) is False: sys.stderr.write( - f"The blobdir of BUSCO-based BlobToolKit Snakemake pipeline run does not exist at {btk_busco_dataset_folder}, skipping the merging of BTK datasets\n" + f"The blobdir of BUSCO-based BlobToolKit Snakemake pipeline run does not exist at {args.btk_busco_datasets}, skipping the merging of BTK datasets\n" ) sys.exit(0) - not_copying_list = ["identifiers.json", "gc_data.json", "length_data.json", "ncount_data.json", "meta.json"] + not_copying_list = [ + "identifiers.json.gz", + "gc_data.json.gz", + "length_data.json.gz", + "ncount_data.json.gz", + "meta.json.gz", + ] - Path(combined_dataset_folder).mkdir(parents=True, exist_ok=True) + Path(args.new_output_directory).mkdir(parents=True, exist_ok=True) main_btk_dataset_files = [ - f for f in os.listdir(main_btk_dataset_folder) if os.path.isfile(os.path.join(main_btk_dataset_folder, f)) + f for f in os.listdir(args.main_btk_datasets) if os.path.isfile(os.path.join(args.main_btk_datasets, f)) ] main_btk_dataset_files = [f for f in main_btk_dataset_files if f not in not_copying_list] for main_btk_dataset_file in main_btk_dataset_files: - main_btk_dataset_file_full_path = f"{main_btk_dataset_folder}/{main_btk_dataset_file}" - copied_file_full_path = f"{combined_dataset_folder}/{main_btk_dataset_file}" + main_btk_dataset_file_full_path = f"{args.main_btk_datasets}/{main_btk_dataset_file}" + copied_file_full_path = os.path.abspath(f"{args.new_output_directory}/{main_btk_dataset_file}") shutil.copy(main_btk_dataset_file_full_path, copied_file_full_path) btk_busco_files = [ - f for f in os.listdir(btk_busco_dataset_folder) if os.path.isfile(os.path.join(btk_busco_dataset_folder, f)) + f for f in os.listdir(args.btk_busco_datasets) if os.path.isfile(os.path.join(args.btk_busco_datasets, f)) ] for btk_busco_file in btk_busco_files: - 
btk_busco_file_full_path = f"{btk_busco_dataset_folder}/{btk_busco_file}" - copied_file_full_path = f"{combined_dataset_folder}/{btk_busco_file}" + btk_busco_file_full_path = f"{args.btk_busco_datasets}/{btk_busco_file}" + copied_file_full_path = os.path.abspath(f"{args.new_output_directory}/{btk_busco_file}") shutil.copy(btk_busco_file_full_path, copied_file_full_path) - create_meta_json(main_btk_dataset_folder, btk_busco_dataset_folder, combined_dataset_folder) - old_main_btk_dataset_folder = main_btk_dataset_folder + "_without_busco" + merged_jsons_dict = create_meta_json_contents(args.main_btk_datasets, args.btk_busco_datasets) + meta_json_outpath = f"{args.new_output_directory}/meta.json" - if skip_renaming_folders is False: - os.rename(main_btk_dataset_folder, old_main_btk_dataset_folder) - os.rename(combined_dataset_folder, main_btk_dataset_folder) + with open(meta_json_outpath, "w") as json_outfile: + json.dump(merged_jsons_dict, json_outfile, indent=1, sort_keys=True) + + buscogenes_present_flag = detect_buscogenes_variables(merged_jsons_dict) + + btk_busco_table_outpath = f"{args.new_output_directory}/btk_busco_summary_table_full.tsv" + + btk_busco_table_exporting_command = f"blobtools filter --table {btk_busco_table_outpath} --table-fields identifiers,buscoregions_superkingdom,buscoregions_kingdom,buscoregions_phylum,buscoregions_class,buscoregions_order,buscoregions_family,buscoregions_genus,buscoregions_species" + if buscogenes_present_flag == True: + btk_busco_table_exporting_command += ",buscogenes_superkingdom,buscogenes_kingdom,buscogenes_phylum,buscogenes_class,buscogenes_order,buscogenes_family,buscogenes_genus,buscogenes_species" + btk_busco_table_exporting_command += f" {args.new_output_directory}" - btk_busco_table_outpath = f"{pipeline_output_folder}/btk_busco_summary_table_full.tsv" - btk_busco_table_exporting_command = f"blobtools filter --table {btk_busco_table_outpath} --table-fields 
identifiers,buscogenes_superkingdom,buscogenes_kingdom,buscogenes_phylum,buscogenes_class,buscogenes_order,buscogenes_family,buscogenes_genus,buscogenes_species,buscoregions_superkingdom,buscoregions_kingdom,buscoregions_phylum,buscoregions_class,buscoregions_order,buscoregions_family,buscoregions_genus,buscoregions_species {main_btk_dataset_folder}" gpf.run_system_command(btk_busco_table_exporting_command) if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "main_btk_dataset_folder", - type=str, - help="Path to the BTK dataset (blobdir) created from the output of the steps of this pipeline", - ) - parser.add_argument( - "btk_busco_dataset_folder", - type=str, - help="Path to the BTK dataset (blobdir) created by the BUSCO-based Snakemake BTK pipeline", - ) - parser.add_argument( - "combined_dataset_folder", - type=str, - help="Path for creating a new BTK dataset (blobdir) that combines the two input BTK datasets", - ) - parser.add_argument( - "pipeline_output_folder", type=str, help="Path to the directory with the output tables of the pipeline" - ) - parser.add_argument( - "--skip_renaming_folders", - dest="skip_renaming_folders", - help="Optional boolean argument. If set to true, the script skips the renaming of the input BTK dataset directories after creating the merged BTK dataset", - action="store_true", - ) - args = parser.parse_args() - main( - args.main_btk_dataset_folder, - args.btk_busco_dataset_folder, - args.combined_dataset_folder, - args.pipeline_output_folder, - args.skip_renaming_folders, - ) + main(parse_args()) diff --git a/bin/merge_btk_datasets_V2.py b/bin/merge_btk_datasets_V2.py deleted file mode 100755 index fcc6c9cb..00000000 --- a/bin/merge_btk_datasets_V2.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python3 - -VERSION = "2.0.0" -DESCRIPTION = f""" ---- -Script for merging BlobToolKit datasets from the createBTKdatasets output directory. 
-Version: {VERSION} ---- - -Written by Eerik Aunin (ea10) - -Modified by Damon-Lee Pointon (@dp24/@DLBPointon) - -""" - -import json -from pathlib import Path -import shutil -import os -import sys -import argparse -import textwrap -import general_purpose_functions as gpf -import gzip - - -def parse_args(argv=None): - parser = argparse.ArgumentParser( - prog="mergeBTKdatasets", - formatter_class=argparse.RawDescriptionHelpFormatter, - description=textwrap.dedent(DESCRIPTION), - ) - parser.add_argument( - "-m", "--main_btk_datasets", required=True, type=str, help="The btk_datasets generated by createBTKdatasets" - ) - parser.add_argument( - "-b", - "--btk_busco_datasets", - type=str, - help="Path to the BTK dataset (blobdir) created by the BUSCO-based BTK pipeline", - ) - parser.add_argument( - "-s", - "--btk_busco_summary_full", - type=str, - help="The btk_datasets generated by createBTKdatasets", - ) - parser.add_argument( - "-o", - "--new_output_directory", - default="merged_datasets", - type=str, - help="The new output directory for the merged datasets", - ) - parser.add_argument("-v", "--version", action="version", version=VERSION) - - return parser.parse_args(argv) - - -def load_json(filename): - """ - Loads a JSON file and returns it as a dictionary - """ - json_contents = None - if filename.endswith(".gz"): - with gzip.open(filename, "rt", encoding="UTF-8") as zipfile: - json_contents = json.load(zipfile) - else: - with open(filename) as f: - json_contents = json.load(f) - return json_contents - - -def create_meta_json_contents(main_btk_dataset_folder, btk_busco_dataset_folder): - """ - Creates the contents for the meta.json file for the new BTK dataset by combining the two meta.json files from the input directories - """ - for folder in (main_btk_dataset_folder, btk_busco_dataset_folder): - if os.path.isdir(folder) is False: - sys.stderr.write( - f"Skipping the merging of the main BTK dataset and the BUSCO-based BTK dataset, as directory {folder} was not 
found)\n" - ) - sys.exit(0) - - main_btk_json_path = f"{main_btk_dataset_folder}/meta.json" - btk_busco_json_path = f"{btk_busco_dataset_folder}/meta.json.gz" - for json_path in (main_btk_json_path, btk_busco_json_path): - if os.path.isfile(json_path) is False: - sys.stderr.write(f"File {json_path} not found)\n") - sys.exit(1) - - main_meta_dict = load_json(main_btk_json_path) - btk_busco_meta_dict = load_json(btk_busco_json_path) - - merged_dict = btk_busco_meta_dict.copy() - - keys_to_skip = [] - fields = main_meta_dict["fields"] - for field in fields: - field_id = field["id"] - - if field_id == "taxonomy": - btk_main_taxonomy_field = field.copy() - btk_main_taxonomy_field["id"] = "btk_main_taxonomy" - btk_main_taxonomy_field["name"] = "btk_main_taxonomy" - merged_dict["fields"].append(btk_main_taxonomy_field) - else: - if field_id not in keys_to_skip: - merged_dict["fields"].append(field) - return merged_dict - - -def detect_buscogenes_variables(merged_jsons_dict): - """ - Goes through the content of merged meta.json file (derived from both BTK datasets) and detects if buscogenes - variables are present - """ - buscogenes_present_flag = False - fields = merged_jsons_dict["fields"] - for field in fields: - field_id = field["id"] - if field_id == "taxonomy": - for item in field["children"]: - if item["id"] == "buscogenes": - buscogenes_present_flag = True - break - return buscogenes_present_flag - - -def main(args): - if os.path.isdir(args.main_btk_datasets) is False: - sys.stderr.write(f"The BlobToolKit dataset ({args.main_btk_datasets}) was not found!\n") - sys.exit(1) - - if os.path.isdir(args.btk_busco_datasets) is False: - sys.stderr.write( - f"The blobdir of BUSCO-based BlobToolKit Snakemake pipeline run does not exist at {args.btk_busco_datasets}, skipping the merging of BTK datasets\n" - ) - sys.exit(0) - - not_copying_list = [ - "identifiers.json.gz", - "gc_data.json.gz", - "length_data.json.gz", - "ncount_data.json.gz", - "meta.json.gz", - ] - - 
Path(args.new_output_directory).mkdir(parents=True, exist_ok=True) - - main_btk_dataset_files = [ - f for f in os.listdir(args.main_btk_datasets) if os.path.isfile(os.path.join(args.main_btk_datasets, f)) - ] - main_btk_dataset_files = [f for f in main_btk_dataset_files if f not in not_copying_list] - for main_btk_dataset_file in main_btk_dataset_files: - main_btk_dataset_file_full_path = f"{args.main_btk_datasets}/{main_btk_dataset_file}" - copied_file_full_path = os.path.abspath(f"{args.new_output_directory}/{main_btk_dataset_file}") - shutil.copy(main_btk_dataset_file_full_path, copied_file_full_path) - - btk_busco_files = [ - f for f in os.listdir(args.btk_busco_datasets) if os.path.isfile(os.path.join(args.btk_busco_datasets, f)) - ] - for btk_busco_file in btk_busco_files: - btk_busco_file_full_path = f"{args.btk_busco_datasets}/{btk_busco_file}" - copied_file_full_path = os.path.abspath(f"{args.new_output_directory}/{btk_busco_file}") - shutil.copy(btk_busco_file_full_path, copied_file_full_path) - - merged_jsons_dict = create_meta_json_contents(args.main_btk_datasets, args.btk_busco_datasets) - meta_json_outpath = f"{args.new_output_directory}/meta.json" - - with open(meta_json_outpath, "w") as json_outfile: - json.dump(merged_jsons_dict, json_outfile, indent=1, sort_keys=True) - - buscogenes_present_flag = detect_buscogenes_variables(merged_jsons_dict) - - btk_busco_table_outpath = f"{args.new_output_directory}/btk_busco_summary_table_full.tsv" - - btk_busco_table_exporting_command = f"blobtools filter --table {btk_busco_table_outpath} --table-fields identifiers,buscoregions_superkingdom,buscoregions_kingdom,buscoregions_phylum,buscoregions_class,buscoregions_order,buscoregions_family,buscoregions_genus,buscoregions_species" - if buscogenes_present_flag == True: - btk_busco_table_exporting_command += ",buscogenes_superkingdom,buscogenes_kingdom,buscogenes_phylum,buscogenes_class,buscogenes_order,buscogenes_family,buscogenes_genus,buscogenes_species" - 
btk_busco_table_exporting_command += f" {args.new_output_directory}" - - gpf.run_system_command(btk_busco_table_exporting_command) - - -if __name__ == "__main__": - main(parse_args()) diff --git a/modules/local/ascc_merge_tables.nf b/modules/local/ascc_merge_tables.nf index 6850f1d3..da7d59b3 100644 --- a/modules/local/ascc_merge_tables.nf +++ b/modules/local/ascc_merge_tables.nf @@ -2,10 +2,10 @@ process ASCC_MERGE_TABLES { tag "$meta.id" label 'process_low' - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "ASCC_MERGE_TABLES module does not support Conda. Please use Docker / Singularity / Podman instead." - } - container "docker.io/genomehubs/blobtoolkit:4.3.9" + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), path(gc_content, stageAs: "GC.txt") @@ -50,7 +50,7 @@ process ASCC_MERGE_TABLES { def cobiontid_markerscan = "" """ - ascc_m_tables.py \\ + ascc_merge_tables.py \\ --gc_cov $gc_content \\ --sample_name $meta.id \\ $coverage \\ diff --git a/modules/local/generate_samplesheet.nf b/modules/local/generate_samplesheet.nf index e2c30de1..018f7ec6 100644 --- a/modules/local/generate_samplesheet.nf +++ b/modules/local/generate_samplesheet.nf @@ -11,8 +11,8 @@ process GENERATE_SAMPLESHEET { tuple val(meta), path(pacbio_path) output: - tuple val(meta), path("*csv"), emit: csv - path "versions.yml", emit: versions + tuple val(meta), path("*csv"), emit: csv + path "versions.yml", emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index bc0df020..707c33ab 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -26,7 +26,7 @@ process MERGE_BTK_DATASETS { """ mkdir -p merged_datasets/ - 
merge_btk_datasets_V2.py \\ + merge_btk_datasets.py \\ -m $create_btk_datasets \\ -o ./merged_datasets \\ -b $busco_btk_datasets \\ @@ -35,7 +35,7 @@ process MERGE_BTK_DATASETS { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - merge_btk_datasets_V2: \$(merge_btk_datasets_V2.py --version | cut -d' ' -f2) + merge_btk_datasets: \$(merge_btk_datasets.py --version | cut -d' ' -f2) END_VERSIONS """ @@ -47,7 +47,7 @@ process MERGE_BTK_DATASETS { cat <<-END_VERSIONS > versions.yml "${task.process}": - merge_btk_datasets_V2: \$(merge_btk_datasets_V2.py -v) + merge_btk_datasets: \$(merge_btk_datasets.py --version | cut -d' ' -f2) END_VERSIONS """ } diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 09518fcf..009bb27f 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -12,6 +12,7 @@ process SANGER_TOL_BTK { path btk_config_file path tax_dump path btk_yaml, stageAs: "BTK.yaml" + val busco_lineages val taxon val gca_accession @@ -49,6 +50,7 @@ process SANGER_TOL_BTK { --outdir ${prefix}_btk_out \\ --fasta "\$(realpath REFERENCE.fa)" \\ --yaml "\$(realpath BTK.yaml)" \\ + --busco_lineages $busco_lineages \\ --accession draft \\ --taxon $taxon \\ --taxdump "\$(realpath $tax_dump)" \\ diff --git a/subworkflows/local/pe_mapping.nf b/subworkflows/local/pe_mapping.nf index e4e2963a..032657ee 100644 --- a/subworkflows/local/pe_mapping.nf +++ b/subworkflows/local/pe_mapping.nf @@ -1,5 +1,5 @@ -include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_ILLUMINA } from '../../modules/nf-core/minimap2/align/main' -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_ILLUMINA } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' workflow PE_MAPPING { diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 
db5f2538..935e5033 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -40,6 +40,7 @@ workflow YAML_INPUT { ncbi_rankedlineage_path: ( file(data.ncbi_rankedlineage_path) ) ncbi_accessionids: ( data.ncbi_accessionids_folder ) busco_lineages_folder: ( data.busco_lineages_folder ) + busco_lineages: ( data.busco_lineages ) seqkit_values: ( data.seqkit ) diamond_uniprot_database_path: ( data.diamond_uniprot_database_path ) diamond_nr_database_path: ( data.diamond_nr_database_path ) @@ -147,6 +148,7 @@ workflow YAML_INPUT { ncbi_taxonomy_path = group.ncbi_taxonomy_path ncbi_rankedlineage_path = group.ncbi_rankedlineage_path busco_lineages_folder = group.busco_lineages_folder + busco_lineages = group.busco_lineages fcs_gx_database_path = group.fcs_gx_database_path diamond_uniprot_database_path = group.diamond_uniprot_database_path diamond_nr_database_path = group.diamond_nr_database_path diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 4ceff862..7c0e2f31 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -194,7 +194,7 @@ workflow ASCC { ch_nt_blast = [] } - if ( include_workflow_steps.contains('mito') || include_workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('organellar_blast') || include_workflow_steps.contains('ALL') ) { // // LOGIC: CHECK WHETHER THERE IS A MITO AND BRANCH // @@ -220,7 +220,7 @@ workflow ASCC { ch_mito = [] } - if ( include_workflow_steps.contains('chloro') || include_workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('organellar_blast') || include_workflow_steps.contains('ALL') ) { // // LOGIC: CHECK WHETHER THERE IS A PLASTID AND BRANCH @@ -249,7 +249,7 @@ workflow ASCC { // // SUBWORKFLOW: // - if ( include_workflow_steps.contains('fcs_adapt') || include_workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('fcs-adaptor') || include_workflow_steps.contains('ALL') ) { RUN_FCSADAPTOR ( YAML_INPUT.out.reference_tuple ) @@ -361,7 
+361,7 @@ workflow ASCC { // // SUBWORKFLOW: DIAMOND BLAST FOR INPUT ASSEMBLY // - if ( include_workflow_steps.contains('nt_diamond') || include_workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('nr_diamond') || include_workflow_steps.contains('ALL') ) { NUCLEOT_DIAMOND ( modified_input, YAML_INPUT.out.diamond_nr_database_path @@ -405,8 +405,8 @@ workflow ASCC { ch_kraken1, ch_kraken2, ch_kraken3, - nt_hits, - un_hits, + nt_full, + un_full, YAML_INPUT.out.ncbi_taxonomy_path.first() ) ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) @@ -415,7 +415,7 @@ workflow ASCC { // // MODULE: AUTOFILTER ASSEMBLY BY TIARA AND FCSGX RESULTS // - if ( include_workflow_steps.contains('tiara') && include_workflow_steps.contains('fcsgx') && include_workflow_steps.contains("autofilter") || include_workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('tiara') && include_workflow_steps.contains('fcs-gx') && include_workflow_steps.contains("autofilter_assembly") || include_workflow_steps.contains('ALL') ) { AUTOFILTER_AND_CHECK_ASSEMBLY ( YAML_INPUT.out.reference_tuple, EXTRACT_TIARA_HITS.out.ch_tiara, @@ -469,6 +469,7 @@ workflow ASCC { [], YAML_INPUT.out.ncbi_taxonomy_path, YAML_INPUT.out.btk_yaml, + YAML_INPUT.out.busco_lineages, YAML_INPUT.out.taxid, 'GCA_0001' ) From 060e997ded7ec89086bb79968aae39cc1c8290e8 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 6 Aug 2024 12:36:18 +0100 Subject: [PATCH 095/117] Adding include_exclude checker --- workflows/ascc.nf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 7c0e2f31..3b59f80a 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -79,6 +79,12 @@ workflow ASCC { include_workflow_steps = params.include ? params.include.split(",") : "" exclude_workflow_steps = params.exclude ? 
params.exclude.split(",") : "" + full_list = ["kmers", "tiara", "coverage", "nt_blast", "nr_diamond", "uniprot_diamond", "kraken", "fcs-gx", "fcs-adaptor", "vecscreen", "btk_busco", "pacbio_barcodes", "organellar_blast", "autofilter_assembly", "ALL", ""] + + if (!full_list.containsAll(include_workflow_steps) || !full_list.containsAll(exclude_workflow_steps)) { + exit 1, "There is an extra argument given on Command Line: \n Check contents of: $include_workflow_steps\nAnd $exclude_workflow_steps\nMaster list is: $full_list" + } + input_ch = Channel.fromPath(params.input, checkIfExists: true) // From b30dddb1918a2ebefac47d3c64507df9696118e7 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 6 Aug 2024 12:41:06 +0100 Subject: [PATCH 096/117] Removed extra script and renamed V2 --- bin/create_btk_dataset.py | 289 +++++++++++++--------------- bin/create_btk_dataset_V2.py | 204 -------------------- modules/local/create_btk_dataset.nf | 7 +- 3 files changed, 135 insertions(+), 365 deletions(-) delete mode 100755 bin/create_btk_dataset_V2.py diff --git a/bin/create_btk_dataset.py b/bin/create_btk_dataset.py index 8cad4eb0..69638fdf 100755 --- a/bin/create_btk_dataset.py +++ b/bin/create_btk_dataset.py @@ -1,15 +1,79 @@ #!/usr/bin/env python3 -""" -Script for creating a BlobToolKit dataset + +VERSION = "2.0.0" +DESCRIPTION = f""" +--- +Script for creating a BlobToolKit dataset from the ASCC output files +Version: {VERSION} +--- + +Written by Eerik Aunin (ea10) + +Modified by Damon-Lee Pointon (@dp24/@DLBPointon) + """ import general_purpose_functions as gpf import argparse from pathlib import Path +import textwrap import sys import os.path +def parse_args(argv=None): + parser = argparse.ArgumentParser( + prog="createBTKdatasets", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("-n", "--name", required=True, type=str, help="Assembly name (for the output files)") + parser.add_argument( + "-tn", + 
"--taxon_name", + required=True, + type=str, + help="The Taxon name of the assembly (Scientific name of the species + subspecies if applicable)", + ) + parser.add_argument("-id", "--taxid", required=True, type=int, help="Taxon ID of the assembly") + parser.add_argument( + "-td", "--taxdump", required=True, type=str, help="Path to the directory containing the NCBI taxdump" + ) + parser.add_argument("-f", "--fasta", required=True, type=str, help="The path for the assembly fasta file") + parser.add_argument( + "-d", + "--dataset", + type=str, + required=True, + help="The folder containing the data generated throughout the pipeline", + ) + parser.add_argument("-bh", "--blastn_hits", default="N", type=str, help="Path to the BLASTN hits file") + parser.add_argument( + "-ud", "--uniprot_diamond_hits", default="N", type=str, help="Path to the UNIPROT diamond BlastX hits file" + ) + parser.add_argument("-nr", "--nr_diamond_hits", default="N", type=str, help="Path to the DIAMOND BlastX hits file") + parser.add_argument( + "-r", "--mapped_reads", default="N", type=str, help="Path to mapped reads BAM for coverage estimation" + ) + parser.add_argument("-t", "--tiara", default="N", type=str, help="Path to the tiara_out.txt file") + parser.add_argument( + "-p", "--pca", default="N", type=str, help="Path to the kmers_dim_reduction_embeddings.csv file" + ) + parser.add_argument("-fc", "--fcs_gx", default="N", type=str, help="Path to the fcs-gx_summary.csv.csv file") + parser.add_argument("-k", "--kraken", default="N", type=str, help="Path to the nt_kraken_lineage.txt file") + parser.add_argument("-ms", "--markerscan", default="N", type=str, help="Path to the cobiontid_markerscan.csv file") + parser.add_argument("-cv", "--contigviz", default="N", type=str, help="Path to the contigviz_results.csv file") + parser.add_argument("-o", "--output", default="btk_datasets", type=str, help="Output directory") + parser.add_argument("--threads", type=int, default=1, help="Number of threads to 
utilise") + parser.add_argument("--alias", type=str, default="", help="Assembly alias") + parser.add_argument( + "--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)" + ) + parser.add_argument("-v", "--version", action="version", version=VERSION) + + return parser.parse_args(argv) + + def create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name): """ Creates the assembly YAML file for creating a BlobToolKit dataset @@ -65,167 +129,76 @@ def detect_dim_reduction_methods(kmers_dim_reduction_output_path): return dim_reduction_methods -def add_custom_variables_to_btk_dataset(pipeline_run_folder, btk_dataset_folder): - """ - Script for adding custom variables (e.g. Tiara results and PCA results) to the BlobToolKit dataset - """ - pipeline_output_folder = pipeline_run_folder + "/collected_tables" - if os.path.isdir(pipeline_output_folder) == False: - sys.stderr.write( - "The directory for the output tables of the pipeline ({}) was not found\n".format(pipeline_output_folder) - ) - sys.exit(1) - if os.path.isdir(btk_dataset_folder) == False: - sys.stderr.write("The BlobToolKit dataset directory ({}) was not found\n".format(btk_dataset_folder)) - sys.exit(1) - tiara_raw_output_path = pipeline_output_folder + "/tiara_out.txt" - if os.path.isfile(tiara_raw_output_path) and os.stat(tiara_raw_output_path).st_size > 0: - tiara_reformatted_output_path = pipeline_output_folder + "/tiara_out_btk_format.tsv" - tiara_results_to_btk_format(tiara_raw_output_path, tiara_reformatted_output_path) - add_tiara_command = 'blobtools add --text {} --text-delimiter "\t" --text-cols "identifier=identifiers,tiara=tiara" --text-header {}'.format( - tiara_reformatted_output_path, btk_dataset_folder - ) - gpf.run_system_command(add_tiara_command) - - kmers_dim_reduction_output_path = pipeline_output_folder + "/kmers_dim_reduction_embeddings.csv" - if os.path.isfile(kmers_dim_reduction_output_path) and 
os.stat(kmers_dim_reduction_output_path).st_size > 0: - used_dim_reduction_methods = detect_dim_reduction_methods(kmers_dim_reduction_output_path) - for dim_reduction_method in used_dim_reduction_methods: - add_embedding_command = 'blobtools add --text {path} --text-delimiter "," --text-cols scaff=identifiers,embedding_x_{dim_reduction_method}=embedding_x_{dim_reduction_method},embedding_y_{dim_reduction_method}=embedding_y_{dim_reduction_method} --text-header {btk_dataset_folder}'.format( - path=kmers_dim_reduction_output_path, - dim_reduction_method=dim_reduction_method, - btk_dataset_folder=btk_dataset_folder, - ) - gpf.run_system_command(add_embedding_command) - - kraken_lineage_path = pipeline_output_folder + "/nt_kraken_lineage.txt" - if os.path.isfile(kraken_lineage_path) and os.stat(kraken_lineage_path).st_size > 0: - for taxonomy_level in ("species", "genus", "family", "order", "class", "phylum", "kingdom", "domain"): - add_kraken_command = 'blobtools add --text {} --text-delimiter "," --text-cols scaff=identifiers,nt_kraken_{}=nt_kraken_{} --text-header {}'.format( - kraken_lineage_path, taxonomy_level, taxonomy_level, btk_dataset_folder - ) - gpf.run_system_command(add_kraken_command) - - fcs_gx_output_path = pipeline_output_folder + "/fcs-gx_summary.csv" - if os.path.isfile(fcs_gx_output_path) and os.stat(fcs_gx_output_path).st_size > 0: - add_fcs_gx_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,fcs_gx_top_tax_name=fcs_gx_top_tax_name,fcs_gx_div=fcs_gx_div,fcs_gx_action=fcs_gx_action" --text-header {}'.format( - fcs_gx_output_path, btk_dataset_folder - ) - gpf.run_system_command(add_fcs_gx_results_command) - - # cobiontid_markerscan_json_file_path = run_folder + "/" + sample_id + ".json" - # cobiontid_scaffs_json_to_csv(json_file_path, out_folder + "/cobiontid_markerscan.csv") - cobiontid_markerscan_output_path = pipeline_output_folder + "/cobiontid_markerscan.csv" - if 
os.path.isfile(cobiontid_markerscan_output_path) and os.stat(cobiontid_markerscan_output_path).st_size > 0: - add_cobiontid_markerscan_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,CobiontID_MarkerScan_embl_ebi_ena=CobiontID_MarkerScan_embl_ebi_ena,CobiontID_MarkerScan_slv=CobiontID_MarkerScan_slv,CobiontID_MarkerScan_Cluster=CobiontID_MarkerScan_Cluster" --text-header {}'.format( - cobiontid_markerscan_output_path, btk_dataset_folder - ) - gpf.run_system_command(add_cobiontid_markerscan_results_command) - - cobiontid_contigviz_output_path = pipeline_output_folder + "/contigviz_results.csv" - if os.path.isfile(cobiontid_contigviz_output_path) and os.stat(cobiontid_contigviz_output_path).st_size > 0: - add_cobiontid_contigviz_results_command = 'blobtools add --text {} --text-delimiter "," --text-cols "scaff=identifiers,ContigViz_UMAP1=ContigViz_UMAP1,ContigViz_UMAP2=ContigViz_UMAP2,ContigViz_Hexamer_continuous=ContigViz_Hexamer_continuous,ContigViz_Hexamer_digitized=ContigViz_Hexamer_digitized,ContigViz_FastK_continuous=ContigViz_FastK_continuous,ContigViz_FastK_digitized=ContigViz_FastK_digitized,ContigViz_Unique_15mers_continuous=ContigViz_Unique_15mers_continuous,ContigViz_Unique_15mers_digitized=ContigViz_Unique_15mers_digitized,ContigViz_Coverage_continuous=ContigViz_Coverage_continuous,ContigViz_Coverage_digitized=ContigViz_Coverage_digitized" --text-header {}'.format( - cobiontid_contigviz_output_path, btk_dataset_folder - ) - gpf.run_system_command(add_cobiontid_contigviz_results_command) - - -def main( - assembly_fasta_path, - dataset_folder, - pipeline_run_folder, - assembly_title, - taxon_name, - taxid, - blastn_hits_path, - uniprot_diamond_hits_path, - nr_diamond_hits_path, - mapped_reads_path, - taxdump_path, - threads, - assembly_alias, - dry_run_flag, -): - # out_folder = pipeline_run_folder + "/collected_tables" - - if assembly_alias == "": - assembly_alias = assembly_title - - if dry_run_flag == 
False: - Path(dataset_folder).mkdir(parents=True, exist_ok=True) - - edited_assembly_title = assembly_title.replace(".", "_") - edited_assembly_title = edited_assembly_title.replace(" ", "_") - - assembly_yaml_path = dataset_folder + "/" + edited_assembly_title + ".yaml" - if dry_run_flag == False: - create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name) - - blobtools_create_command = "blobtools create --fasta {} --meta {} --taxid {} --taxdump {} {}".format( - assembly_fasta_path, assembly_yaml_path, taxid, taxdump_path, dataset_folder - ) - gpf.run_system_command(blobtools_create_command, dry_run=dry_run_flag) +def main(args): + command_list = [] + + assembly_alias = args.name if args.alias == "" else args.alias + + edited_assembly_title = args.name.replace(".", "_").replace(" ", "_") + + assembly_yaml_path = args.output + "/" + edited_assembly_title + "BTK_DS.yaml" + + if args.dry_run == False: + Path(args.dataset).mkdir(parents=True, exist_ok=True) + create_assembly_yaml(assembly_yaml_path, assembly_alias, args.taxon_name) + + # Base command for new BTK Dataset + blobtools_create_command = f"blobtools create --fasta {args.fasta} --meta {assembly_yaml_path} --taxid {args.taxid} --taxdump {args.taxdump} {args.output}" + gpf.run_system_command(blobtools_create_command, dry_run=args.dry_run) + + # ADDING BLAST HIT DATA TO BTK + hits_file_paths = [args.blastn_hits, args.uniprot_diamond_hits, args.nr_diamond_hits] - hits_file_paths = [blastn_hits_path, uniprot_diamond_hits_path, nr_diamond_hits_path] - hits_file_paths = [n for n in hits_file_paths if os.path.isfile(n) is True and os.stat(n).st_size > 0] + hits_file = [n for n in hits_file_paths if n != "N" and os.path.isfile(n) is True and os.stat(n).st_size > 0] - if len(hits_file_paths) > 0: + if len(hits_file) > 0: add_hits_command = "blobtools add" - for hits_file_path in hits_file_paths: - add_hits_command += " --hits {}".format(hits_file_path) - add_hits_command += " --taxrule bestsum --taxdump {} 
{}".format(taxdump_path, dataset_folder) - gpf.run_system_command(add_hits_command, dry_run=dry_run_flag) - - if os.path.isfile(mapped_reads_path) is True and os.stat(mapped_reads_path).st_size > 0: - add_cov_command = "blobtools add --cov {} --threads {} {}".format(mapped_reads_path, threads, dataset_folder) - gpf.run_system_command(add_cov_command, dry_run=dry_run_flag) - - # export_table_command = "blobtools filter --table {}/btk_summary_table_basic.tsv {}".format(out_folder, dataset_folder) - add_custom_variables_to_btk_dataset(pipeline_run_folder, dataset_folder) - export_table_command = "blobtools filter --table {}/collected_tables/btk_summary_table_full.tsv {}".format( - pipeline_run_folder, dataset_folder - ) + for file in hits_file_paths: + add_hits_command += f" --hits {file}" + add_hits_command += f" --taxrule bestsum --taxdump {args.taxdump} {args.output}" + command_list.append(add_hits_command) + + # ADDING MAPPED READS DATA TO BTK + if ( + args.mapped_reads != "N" + and os.path.isfile(args.mapped_reads) is True + and os.stat(args.mapped_reads).st_size > 0 + ): + add_cov_command = f"blobtools add --cov {args.mapped_reads} --threads {args.threads} {args.output}" + command_list.append(add_cov_command) + + # ADDING TIARA + if args.tiara != "N" and os.path.isfile(args.tiara) and os.stat(args.tiara).st_size > 0: + tiara_reformatted_output_path = args.dataset + "/tiara_out_btk_format.tsv" + tiara_results_to_btk_format(args.tiara, tiara_reformatted_output_path) + add_tiara_command = f"blobtools add --text {tiara_reformatted_output_path} --text-delimiter '\t' --text-cols 'identifier=identifiers,tiara=tiara' --text-header {args.output}" + command_list.append(add_tiara_command) + + # ADDING KMER DIM REDUCTION + if args.pca != "N" and os.path.isfile(args.pca) and os.stat(args.pca).st_size > 0: + used_dim_reduction_methods = detect_dim_reduction_methods(args.pca) + for dim_reduction_method in used_dim_reduction_methods: + add_embedding_command = f"blobtools add 
--text {args.pca} --text-delimiter ',' --text-cols scaff=identifiers,embedding_x_{dim_reduction_method}=embedding_x_{dim_reduction_method},embedding_y_{dim_reduction_method}=embedding_y_{dim_reduction_method} --text-header {args.output}" + command_list.append(add_embedding_command) + + # ADDIND KRAKEN DATA + if args.kraken != "N" and os.path.isfile(args.kraken) and os.stat(args.kraken).st_size > 0: + for taxonomy_level in ("species", "genus", "family", "order", "class", "phylum", "kingdom", "domain"): + add_kraken_command = f"blobtools add --text {args.kraken} --text-delimiter ',' --text-cols scaff=identifiers,nt_kraken_{taxonomy_level}=nt_kraken_{taxonomy_level} --text-header {args.output}" + command_list.append(add_kraken_command) + + # ADDING FCS_GX DATA + if args.fcs_gx != "N" and os.path.isfile(args.fcs_gx) and os.stat(args.fcs_gx).st_size > 0: + add_fcs_gx_results_command = f"blobtools add --text {args.fcs_gx} --text-delimiter ',' --text-cols 'scaff=identifiers,fcs_gx_top_tax_name=fcs_gx_top_tax_name,fcs_gx_div=fcs_gx_div,fcs_gx_action=fcs_gx_action' --text-header {args.output}" + command_list.append(add_fcs_gx_results_command) - gpf.run_system_command(export_table_command, dry_run=dry_run_flag) + export_table_command = f"blobtools filter --table btk_summary_table_full.tsv {args.output}" + command_list.append(export_table_command) - # json_file_path = run_folder + "/" + sample_id + ".json" - # cobiontid_scaffs_json_to_csv(json_file_path, out_folder + "/cobiontid_markerscan.csv") + # EXECUTE ALL BTK COMMANDS + for i in command_list: + gpf.run_system_command(i, dry_run=args.dry_run) if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("assembly_fasta_path", type=str, help="assembly_fasta_path") - parser.add_argument("dataset_folder", type=str, help="Path for dataset folder") - parser.add_argument("pipeline_run_folder", type=str, help="Folder where this pipeline is run pipeline") - 
parser.add_argument("assembly_title", type=str, help="Assembly title") - parser.add_argument("-tn", "--taxon_name", type=str, help="Taxon name") - parser.add_argument("-ti", "--taxid", type=int, help="taxid") - parser.add_argument("blastn_hits_path", type=str, help="Path to blastn hits file") - parser.add_argument("uniprot_diamond_hits_path", type=str, help="Path to UNIPROT Diamond BLASTX hits file") - parser.add_argument("nr_diamond_hits_path", type=str, help="Path to nr Diamond BLASTX hits file") - parser.add_argument( - "mapped_reads_path", type=str, help="Path to the BAM file with mapped reads for coverage estimation" - ) - parser.add_argument("-td", "--taxdump_path", type=str, help="Path to the directory with NCBI taxdump files") - parser.add_argument("--threads", type=int, default=1, help="Number of CPU threads (default: 1)") - parser.add_argument("--assembly_alias", type=str, default="", help="Assembly alias") - parser.add_argument( - "--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)" - ) - args = parser.parse_args() - main( - args.assembly_fasta_path, - args.dataset_folder, - args.pipeline_run_folder, - args.assembly_title, - args.taxon_name, - args.taxid, - args.blastn_hits_path, - args.uniprot_diamond_hits_path, - args.nr_diamond_hits_path, - args.mapped_reads_path, - args.taxdump_path, - args.threads, - args.assembly_alias, - args.dry_run, - ) + main(parse_args()) diff --git a/bin/create_btk_dataset_V2.py b/bin/create_btk_dataset_V2.py deleted file mode 100755 index 69638fdf..00000000 --- a/bin/create_btk_dataset_V2.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/env python3 - -VERSION = "2.0.0" -DESCRIPTION = f""" ---- -Script for creating a BlobToolKit dataset from the ASCC output files -Version: {VERSION} ---- - -Written by Eerik Aunin (ea10) - -Modified by Damon-Lee Pointon (@dp24/@DLBPointon) - -""" - -import general_purpose_functions as gpf -import argparse -from pathlib import Path -import textwrap 
-import sys -import os.path - - -def parse_args(argv=None): - parser = argparse.ArgumentParser( - prog="createBTKdatasets", - formatter_class=argparse.RawDescriptionHelpFormatter, - description=textwrap.dedent(DESCRIPTION), - ) - parser.add_argument("-n", "--name", required=True, type=str, help="Assembly name (for the output files)") - parser.add_argument( - "-tn", - "--taxon_name", - required=True, - type=str, - help="The Taxon name of the assembly (Scientific name of the species + subspecies if applicable)", - ) - parser.add_argument("-id", "--taxid", required=True, type=int, help="Taxon ID of the assembly") - parser.add_argument( - "-td", "--taxdump", required=True, type=str, help="Path to the directory containing the NCBI taxdump" - ) - parser.add_argument("-f", "--fasta", required=True, type=str, help="The path for the assembly fasta file") - parser.add_argument( - "-d", - "--dataset", - type=str, - required=True, - help="The folder containing the data generated throughout the pipeline", - ) - parser.add_argument("-bh", "--blastn_hits", default="N", type=str, help="Path to the BLASTN hits file") - parser.add_argument( - "-ud", "--uniprot_diamond_hits", default="N", type=str, help="Path to the UNIPROT diamond BlastX hits file" - ) - parser.add_argument("-nr", "--nr_diamond_hits", default="N", type=str, help="Path to the DIAMOND BlastX hits file") - parser.add_argument( - "-r", "--mapped_reads", default="N", type=str, help="Path to mapped reads BAM for coverage estimation" - ) - parser.add_argument("-t", "--tiara", default="N", type=str, help="Path to the tiara_out.txt file") - parser.add_argument( - "-p", "--pca", default="N", type=str, help="Path to the kmers_dim_reduction_embeddings.csv file" - ) - parser.add_argument("-fc", "--fcs_gx", default="N", type=str, help="Path to the fcs-gx_summary.csv.csv file") - parser.add_argument("-k", "--kraken", default="N", type=str, help="Path to the nt_kraken_lineage.txt file") - parser.add_argument("-ms", "--markerscan", 
default="N", type=str, help="Path to the cobiontid_markerscan.csv file") - parser.add_argument("-cv", "--contigviz", default="N", type=str, help="Path to the contigviz_results.csv file") - parser.add_argument("-o", "--output", default="btk_datasets", type=str, help="Output directory") - parser.add_argument("--threads", type=int, default=1, help="Number of threads to utilise") - parser.add_argument("--alias", type=str, default="", help="Assembly alias") - parser.add_argument( - "--dry_run", dest="dry_run", action="store_true", help="Dry run (print commands without executing)" - ) - parser.add_argument("-v", "--version", action="version", version=VERSION) - - return parser.parse_args(argv) - - -def create_assembly_yaml(assembly_yaml_path, assembly_alias, taxon_name): - """ - Creates the assembly YAML file for creating a BlobToolKit dataset - """ - if ".gz" in assembly_alias: - assembly_alias = assembly_alias.replace(".gz", "_gz") - out_string = "assembly:\n accession: NA\n alias: {}\n record_type: scaffold\n bioproject: NA\n biosample: NA\ntaxon:\n name: {}".format( - assembly_alias, taxon_name - ) - with open(assembly_yaml_path, "w") as f: - f.write(out_string) - - -def tiara_results_to_btk_format(tiara_results_path, outfile_path): - """ - Reformatting Tiara output file so that the summarised results of the first and second pass of Tiara can be - added to a BlobToolKit dataset - """ - tiara_data = gpf.l(tiara_results_path) - tiara_data = tiara_data[1 : len(tiara_data)] - with open(outfile_path, "w") as f: - f.write("identifier\ttiara\n") - for line in tiara_data: - split_line = line.split() - if len(split_line) != 3: - sys.stderr.write("Failed to parse the Tiara results file {}\n".format(tiara_results_path)) - sys.exit(1) - first_pass_result = split_line[1] - second_pass_result = split_line[2] - if second_pass_result != "n/a": - first_pass_result = second_pass_result - f.write(split_line[0] + "\t" + first_pass_result + "\n") - - -def 
detect_dim_reduction_methods(kmers_dim_reduction_output_path): - """ - Parses the header of the kmers dimensionality reduction report file to detect which dimensionality reduction methods were used - """ - header_string = None - with open(kmers_dim_reduction_output_path) as f: - header_string = f.readline() - header_string = header_string.strip() - split_header = header_string.split(",") - dim_reduction_methods = list() - for header_item in split_header: - if header_item.startswith("embedding_"): - if header_item.startswith("embedding_x_"): - header_item = header_item.split("embedding_x_")[1] - elif header_item.startswith("embedding_y_"): - header_item = header_item.split("embedding_y_")[1] - if header_item not in dim_reduction_methods: - dim_reduction_methods.append(header_item) - return dim_reduction_methods - - -def main(args): - command_list = [] - - assembly_alias = args.name if args.alias == "" else args.alias - - edited_assembly_title = args.name.replace(".", "_").replace(" ", "_") - - assembly_yaml_path = args.output + "/" + edited_assembly_title + "BTK_DS.yaml" - - if args.dry_run == False: - Path(args.dataset).mkdir(parents=True, exist_ok=True) - create_assembly_yaml(assembly_yaml_path, assembly_alias, args.taxon_name) - - # Base command for new BTK Dataset - blobtools_create_command = f"blobtools create --fasta {args.fasta} --meta {assembly_yaml_path} --taxid {args.taxid} --taxdump {args.taxdump} {args.output}" - gpf.run_system_command(blobtools_create_command, dry_run=args.dry_run) - - # ADDING BLAST HIT DATA TO BTK - hits_file_paths = [args.blastn_hits, args.uniprot_diamond_hits, args.nr_diamond_hits] - - hits_file = [n for n in hits_file_paths if n != "N" and os.path.isfile(n) is True and os.stat(n).st_size > 0] - - if len(hits_file) > 0: - add_hits_command = "blobtools add" - for file in hits_file_paths: - add_hits_command += f" --hits {file}" - add_hits_command += f" --taxrule bestsum --taxdump {args.taxdump} {args.output}" - 
command_list.append(add_hits_command) - - # ADDING MAPPED READS DATA TO BTK - if ( - args.mapped_reads != "N" - and os.path.isfile(args.mapped_reads) is True - and os.stat(args.mapped_reads).st_size > 0 - ): - add_cov_command = f"blobtools add --cov {args.mapped_reads} --threads {args.threads} {args.output}" - command_list.append(add_cov_command) - - # ADDING TIARA - if args.tiara != "N" and os.path.isfile(args.tiara) and os.stat(args.tiara).st_size > 0: - tiara_reformatted_output_path = args.dataset + "/tiara_out_btk_format.tsv" - tiara_results_to_btk_format(args.tiara, tiara_reformatted_output_path) - add_tiara_command = f"blobtools add --text {tiara_reformatted_output_path} --text-delimiter '\t' --text-cols 'identifier=identifiers,tiara=tiara' --text-header {args.output}" - command_list.append(add_tiara_command) - - # ADDING KMER DIM REDUCTION - if args.pca != "N" and os.path.isfile(args.pca) and os.stat(args.pca).st_size > 0: - used_dim_reduction_methods = detect_dim_reduction_methods(args.pca) - for dim_reduction_method in used_dim_reduction_methods: - add_embedding_command = f"blobtools add --text {args.pca} --text-delimiter ',' --text-cols scaff=identifiers,embedding_x_{dim_reduction_method}=embedding_x_{dim_reduction_method},embedding_y_{dim_reduction_method}=embedding_y_{dim_reduction_method} --text-header {args.output}" - command_list.append(add_embedding_command) - - # ADDIND KRAKEN DATA - if args.kraken != "N" and os.path.isfile(args.kraken) and os.stat(args.kraken).st_size > 0: - for taxonomy_level in ("species", "genus", "family", "order", "class", "phylum", "kingdom", "domain"): - add_kraken_command = f"blobtools add --text {args.kraken} --text-delimiter ',' --text-cols scaff=identifiers,nt_kraken_{taxonomy_level}=nt_kraken_{taxonomy_level} --text-header {args.output}" - command_list.append(add_kraken_command) - - # ADDING FCS_GX DATA - if args.fcs_gx != "N" and os.path.isfile(args.fcs_gx) and os.stat(args.fcs_gx).st_size > 0: - 
add_fcs_gx_results_command = f"blobtools add --text {args.fcs_gx} --text-delimiter ',' --text-cols 'scaff=identifiers,fcs_gx_top_tax_name=fcs_gx_top_tax_name,fcs_gx_div=fcs_gx_div,fcs_gx_action=fcs_gx_action' --text-header {args.output}" - command_list.append(add_fcs_gx_results_command) - - export_table_command = f"blobtools filter --table btk_summary_table_full.tsv {args.output}" - command_list.append(export_table_command) - - # EXECUTE ALL BTK COMMANDS - for i in command_list: - gpf.run_system_command(i, dry_run=args.dry_run) - - -if __name__ == "__main__": - main(parse_args()) diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index 2a82cb39..a88ac0da 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -47,7 +47,7 @@ process CREATE_BTK_DATASET { """ mkdir -p btk_datasets/ - create_btk_dataset_V2.py \\ + create_btk_dataset.py \\ -f ${reference} \\ -d ./1/ \\ -n "${prefix}" \\ @@ -68,7 +68,7 @@ process CREATE_BTK_DATASET { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - create_btk_dataset: \$(general_purpose_functions.py --version | cut -d' ' -f2) + create_btk_dataset: \$(create_btk_dataset.py -v) END_VERSIONS """ @@ -82,7 +82,8 @@ process CREATE_BTK_DATASET { cat <<-END_VERSIONS > versions.yml "${task.process}": - create_btk_dataset: \$(create_btk_dataset_V2.py -v) + python: \$(python --version | sed 's/Python //g') + create_btk_dataset: \$(create_btk_dataset.py -v) END_VERSIONS """ } From 71f1f269064d8dcc8dbb881218907123c8d8ea03 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 6 Aug 2024 13:21:18 +0100 Subject: [PATCH 097/117] Updates to check --- workflows/ascc.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 3b59f80a..79d490d1 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -81,7 +81,7 @@ workflow ASCC { full_list = ["kmers", "tiara", 
"coverage", "nt_blast", "nr_diamond", "uniprot_diamond", "kraken", "fcs-gx", "fcs-adaptor", "vecscreen", "btk_busco", "pacbio_barcodes", "organellar_blast", "autofilter_assembly", "ALL", ""] - if (!full_list.containsAll(include_workflow_steps) || !full_list.containsAll(exclude_workflow_steps)) { + if (!full_list.containsAll(include_workflow_steps) && !full_list.containsAll(exclude_workflow_steps)) { exit 1, "There is an extra argument given on Command Line: \n Check contents of: $include_workflow_steps\nAnd $exclude_workflow_steps\nMaster list is: $full_list" } @@ -253,7 +253,7 @@ workflow ASCC { } // - // SUBWORKFLOW: + // SUBWORKFLOW: RUN FCS-ADAPTOR TO IDENTIDY ADAPTOR AND VECTORR CONTAMINATION // if ( include_workflow_steps.contains('fcs-adaptor') || include_workflow_steps.contains('ALL') ) { RUN_FCSADAPTOR ( @@ -271,9 +271,9 @@ workflow ASCC { } // - // SUBWORKFLOW: + // SUBWORKFLOW: RUN FCS-GX TO IDENTIFY CONTAMINATION IN THE ASSEMBLY // - if ( include_workflow_steps.contains('fcsgx') || include_workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('fcs-gx') || include_workflow_steps.contains('ALL') ) { RUN_FCSGX ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.fcs_gx_database_path, From 39393ef4d61db58821dff09ec74549d1b0b2a7c5 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Wed, 7 Aug 2024 13:01:44 +0100 Subject: [PATCH 098/117] ea10 edits to dp24_btk_datasets_branch --- bin/ascc_merge_tables.py | 313 +++++++++++++++++++++++++++++ bin/autofilter.py | 23 ++- modules/local/ascc_merge_tables.nf | 6 +- workflows/ascc.nf | 21 +- 4 files changed, 342 insertions(+), 21 deletions(-) create mode 100755 bin/ascc_merge_tables.py diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py new file mode 100755 index 00000000..932f5059 --- /dev/null +++ b/bin/ascc_merge_tables.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 + +VERSION = "2.0.0" +DESCRIPTION = """ +Script for merging contaminant check results into one table +Version: {VERSION} +--- 
+Written by Eerik Anuin + +Re-Written by Damon-Lee Pointon (dp24/DLBPointon) +""" + +import argparse +import pandas as pd +import textwrap +import os +import sys +import general_purpose_functions as gpf + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="AsccMergeTables", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("-gc", "--gc_cov", required=True, type=str, help="GC Coverage file") + parser.add_argument("-c", "--coverage", type=str, help="Coverage file") + parser.add_argument("-t", "--tiara", type=str, help="Tiara file") + parser.add_argument("-bk", "--bacterial_kraken", type=str, help="Bacterial Kraken file") + parser.add_argument("-nk", "--nt_kraken", type=str, help="NT Kraken file") + parser.add_argument("-nb", "--nt_blast", type=str, help="NT Blast file") + parser.add_argument("-dr", "--dim_reduction_embeddings", type=str, help="Dimensional Reduction file") + parser.add_argument("-nd", "--nr_diamond", type=str, help="NR Diamond file") + parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file") + parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file") + parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file") + parser.add_argument("-bb", "--busco_btk", type=str, help="Busco Blobtoolkit file") + parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file") + parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample") + parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file") + parser.add_argument("-v", "--version", action="version", version=VERSION) + return parser.parse_args() + + +def check_paths(paths_dict, required_files): + """ + Checks if a required file exists and exits with an error message if it doesn't + """ + out_dict = dict() + + for data_type, input_file in paths_dict.items(): + if input == None: + pass + else: + out_dict[data_type] = 
input_file + + return out_dict + + +def load_and_merge_dataframes(paths_dict): + """ + Loads the tables with individual variables (GC content, coverage, kmer counts etc) and combines them into one table + """ + gc_path = paths_dict["gc_content"] + df = pd.read_csv(gc_path, sep="\t", header=None) + if df.shape[0] > 0: + df.columns = ["scaff", "gc"] + df["gc"] = df["gc"] * 100 + else: + sys.stderr.write("No rows were found in the GC content table ({})\n".format(gc_path)) + sys.exit(1) + + coverage_df = None + if paths_dict["coverage"] is not None: + coverage_df = pd.read_csv(paths_dict["coverage"], sep=",", header=None) + if coverage_df.shape[0] > 0: + coverage_df.columns = ["scaff", "coverage"] + else: + sys.stderr.write(f"No rows were found in the coverages table ({paths_dict['coverage']})\n") + coverage_df = None + + tiara_df = None + if paths_dict["tiara"] is not None: + tiara_df = pd.read_csv(paths_dict["tiara"], sep="\t") + if tiara_df.shape[0] > 0: + tiara_df["tiara_classif"] = tiara_df["class_fst_stage"] + tiara_snd_stage_hits = tiara_df.index[tiara_df["class_snd_stage"].notnull()] + tiara_df["tiara_classif"][tiara_snd_stage_hits] = tiara_df["class_snd_stage"][tiara_snd_stage_hits] + tiara_df = tiara_df.iloc[:, [0, 3]] + tiara_df.columns = ["scaff", "tiara_classif"] + else: + sys.stderr.write("No rows were found in Tiara output table ({})\n".format(paths_dict["tiara"])) + tiara_df = None + + bacterial_kraken_df = None + if paths_dict["bacterial_kraken"] is not None: + bacterial_kraken_df = pd.read_csv(paths_dict["bacterial_kraken"], sep=",") + if bacterial_kraken_df.shape[0] > 0: + bacterial_kraken_df.rename(columns={bacterial_kraken_df.columns[0]: "scaff"}, inplace=True) + bacterial_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write( + "No rows were found in bacterial Kraken output table ({})\n".format(paths_dict["bacterial_kraken"]) + ) + bacterial_kraken_df = None + + nt_kraken_df = None + if 
paths_dict["nt_kraken"] is not None: + nt_kraken_df = pd.read_csv(paths_dict["nt_kraken"], sep=",") + if nt_kraken_df.shape[0] > 0: + nt_kraken_df.rename(columns={nt_kraken_df.columns[0]: "scaff"}, inplace=True) + nt_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write("No rows were found in nt Kraken output table ({})\n".format(paths_dict["nt_kraken"])) + nt_kraken_df = None + + dim_reduction_df = None + if paths_dict["dim_reduction_embeddings"] is not None: + dim_reduction_df = pd.read_csv(paths_dict["dim_reduction_embeddings"], sep=",") + if dim_reduction_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in kmers dimensionality reduction output table ({})\n".format( + paths_dict["dim_reduction_embeddings"] + ) + ) + dim_reduction_df = None + + btk_df = None + if paths_dict["blobtoolkit"] is not None: + btk_df = pd.read_csv(paths_dict["blobtoolkit"], header=0, delimiter="\t") + if btk_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in the BlobToolKit results table ({})\n".format(paths_dict["blobtoolkit"]) + ) + sys.exit(1) + btk_renaming_dict = {"identifiers": "scaff", "bestsum_phylum": "btk_bestsum_phylum"} + if "mapped_hifi_reads_sorted_cov" in btk_df.columns: + btk_renaming_dict["mapped_hifi_reads_sorted_cov"] = "btk_cov" + if "bestsum_phylum" in btk_df.columns: + btk_renaming_dict["bestsum_phylum"] = "btk_bestsum_phylum" + # {"identifiers": "scaff", "mapped_hifi_reads_sorted_cov": "btk_cov", "bestsum_phylum": "btk_bestsum_phylum"} + + btk_df.rename(columns=btk_renaming_dict, inplace=True) + + btk_selected_cols = [ + col for col in btk_df.columns if col in ["scaff", "length", "btk_cov", "btk_bestsum_phylum"] + ] + if len(btk_selected_cols) > 0: + btk_df = btk_df[btk_selected_cols] + else: + btk_df = None + + btk_busco_df = None + if paths_dict["btk_busco"] is not None: + btk_busco_df = pd.read_csv(paths_dict["btk_busco"], header=0, delimiter="\t") + if btk_busco_df.shape[0] == 0: + sys.stderr.write( 
+ "No rows were found in the BUSCO-based BlobToolKit results table ({})\n".format(paths_dict["btk_busco"]) + ) + sys.exit(1) + btk_busco_renaming_dict = {"identifiers": "scaff"} + + btk_busco_df.rename(columns=btk_busco_renaming_dict, inplace=True) + + btk_busco_selected_cols = [ + col + for col in btk_busco_df.columns + if col + in [ + "scaff", + "buscogenes_superkingdom", + "buscogenes_kingdom", + "buscogenes_phylum", + "buscogenes_class", + "buscogenes_order", + "buscogenes_family", + "buscogenes_genus", + "buscogenes_species", + "buscoregions_superkingdom", + "buscoregions_kingdom", + "buscoregions_phylum", + "buscoregions_class", + "buscoregions_order", + "buscoregions_family", + "buscoregions_genus", + "buscoregions_species", + ] + ] + if len(btk_busco_selected_cols) > 0: + btk_busco_df = btk_busco_df[btk_busco_selected_cols] + else: + btk_busco_df = None + + fcs_gx_df = None + if paths_dict["fcs_gx"] is not None: + fcs_gx_df = pd.read_csv(paths_dict["fcs_gx"], sep=",") + if fcs_gx_df.shape[0] == 0: + sys.stderr.write("No rows were found in FCS-GX output table ({})\n".format(paths_dict["fcs_gx"])) + fcs_gx_df = None + + nt_blast_df = None + if paths_dict["nt_blast"] is not None: + nt_blast_df = pd.read_csv(paths_dict["nt_blast"], sep=",") + if nt_blast_df.shape[0] == 0: + sys.stderr.write("No rows were found in nt BLAST output table ({})\n".format(paths_dict["nt_blast"])) + nt_blast_df = None + + nr_diamond_df = None + if paths_dict["nr_diamond"] is not None: + nr_diamond_df = pd.read_csv(paths_dict["nr_diamond"], sep=",") + if nr_diamond_df.shape[0] == 0: + sys.stderr.write("No rows were found in nr Diamond output table ({})\n".format(paths_dict["nr_diamond"])) + nr_diamond_df = None + + uniprot_diamond_df = None + if paths_dict["uniprot_diamond"] is not None: + uniprot_diamond_df = pd.read_csv(paths_dict["uniprot_diamond"], sep=",") + if uniprot_diamond_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in Uniprot Diamond output table 
({})\n".format(paths_dict["uniprot_diamond"]) + ) + uniprot_diamond_df = None + + cobiontid_markerscan_df = None + if paths_dict["cobiontid_markerscan"] is not None: + cobiontid_markerscan_df = pd.read_csv(paths_dict["cobiontid_markerscan"], sep=",") + if cobiontid_markerscan_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in CobiontID MarkerScan output table ({})\n".format( + paths_dict["cobiontid_markerscan"] + ) + ) + uniprot_diamond_df = None + + contigviz_df = None + if paths_dict["contigviz"] is not None: + contigviz_df = pd.read_csv(paths_dict["contigviz"], sep=",") + if contigviz_df.shape[0] == 0: + sys.stderr.write("No rows were found in ContigViz output table ({})\n".format(paths_dict["contigviz"])) + contigviz_df = None + + if coverage_df is not None: + df = pd.merge(df, coverage_df, on="scaff", how="outer") + if tiara_df is not None: + df = pd.merge(df, tiara_df, on="scaff", how="outer") + if bacterial_kraken_df is not None: + df = pd.merge(df, bacterial_kraken_df, on="scaff", how="outer") + if nt_kraken_df is not None: + df = pd.merge(df, nt_kraken_df, on="scaff", how="outer") + if dim_reduction_df is not None: + df = pd.merge(df, dim_reduction_df, on="scaff", how="outer") + if nt_blast_df is not None: + df = pd.merge(df, nt_blast_df, on="scaff", how="outer") + if nr_diamond_df is not None: + df = pd.merge(df, nr_diamond_df, on="scaff", how="outer") + if uniprot_diamond_df is not None: + df = pd.merge(df, uniprot_diamond_df, on="scaff", how="outer") + if fcs_gx_df is not None: + df = pd.merge(df, fcs_gx_df, on="scaff", how="outer") + if cobiontid_markerscan_df is not None: + df = pd.merge(df, cobiontid_markerscan_df, on="scaff", how="outer") + if contigviz_df is not None: + df = pd.merge(df, contigviz_df, on="scaff", how="outer") + if btk_df is not None: + df = pd.merge(df, btk_df, on="scaff", how="outer") + if btk_busco_df is not None: + df = pd.merge(df, btk_busco_df, on="scaff", how="outer") + + return df + + +def main(args): + 
paths_dict = dict() + paths_dict["gc_content"] = args.gc_cov + paths_dict["coverage"] = args.coverage + paths_dict["tiara"] = args.tiara + paths_dict["bacterial_kraken"] = args.bacterial_kraken + paths_dict["nt_kraken"] = args.nt_kraken + paths_dict["nt_blast"] = args.nt_blast + paths_dict["dim_reduction_embeddings"] = args.dim_reduction_embeddings + paths_dict["nr_diamond"] = args.nr_diamond + paths_dict["uniprot_diamond"] = args.uniprot_diamond + paths_dict["cobiontid_markerscan"] = args.markerscan + paths_dict["contigviz"] = args.contigviz + paths_dict["blobtoolkit"] = args.blobtoolkit + paths_dict["btk_busco"] = args.busco_btk + paths_dict["fcs_gx"] = args.fcs_gx + + required_files = ["gc_content"] + + paths_dict = check_paths(paths_dict, required_files) + df = load_and_merge_dataframes(paths_dict) + df.to_csv(f"{args.sample_name}_contamination_check_merged_table.csv", index=False) + + if ( + paths_dict["nt_blast"] + and paths_dict["nr_diamond"] + and paths_dict["uniprot_diamond"] + and paths_dict["coverage"] + and paths_dict["tiara"] + and paths_dict["nt_kraken"] + ): + process_results_tables_command = f"process_result_tables.py . 
{args.sample_name}" + gpf.run_system_command(process_results_tables_command) + else: + sys.stderr.write( + f"Skipping generating the {args.sample_name}_phylum_counts_and_coverage.csv file, as the variables used in this run do not include all the required variables for this (nt_blast, nr_diamond, uniprot_diamond, coverage, tiara, nt_kraken)\n" + ) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/bin/autofilter.py b/bin/autofilter.py index 93849f67..d843308e 100755 --- a/bin/autofilter.py +++ b/bin/autofilter.py @@ -42,9 +42,9 @@ def parse_args(): help="Path to the assembly_autofiltered.fasta file", default="autofiltered.fasta", ) - parser.add_argument( - "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" - ) + #parser.add_argument( + # "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" + #) parser.add_argument( "-r", "--rejected_seq", @@ -56,6 +56,9 @@ def parse_args(): parser.add_argument( "-n", "--ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy" ) + parser.add_argument( + "--tiara_action_mode", type=str, choices=["warn", "remove"], default="warn", help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). 
Default: warn" + ) parser.add_argument("-v", "--version", action="version", version=VERSION) return parser.parse_args() @@ -179,7 +182,7 @@ def main(): tiara_results_path = args.tiara fcs_gx_summary_path = args.fcsgx_summary filtered_assembly_path = args.output_auto_filtered - combined_summary = args.fcs_gx_and_tiara_summary + #combined_summary = args.fcs_gx_and_tiara_summary excluded_seq_list_path = args.rejected_seq ncbi_rankedlist = args.ncbi_rankedlineage_path @@ -187,7 +190,7 @@ def main(): for i in [ncbi_rankedlist, tiara_results_path, fcs_gx_summary_path, assembly_path]: if not os.path.isfile(i): - sys.stderr.write(f"{i} WAS NOT AT THE EXPECTED LOCATION\n") + sys.stderr.write(f"{i} was not at the expected location\n") sys.exit(1) target_domain = get_domain_from_taxid(args.taxid, ncbi_rankedlist) @@ -207,8 +210,12 @@ def main(): tiara_action = tiara_action_dict[scaff] combined_action = fcs_gx_action if fcs_gx_action == "NA" and tiara_action == "EXCLUDE": - combined_action = "EXCLUDE" - combined_action_source = "Tiara" + if args.tiara_action_mode == "remove": + combined_action = "EXCLUDE" + combined_action_source = "Tiara" + elif args.tiara_action_mode == "warn": + combined_action = "WARN" + combined_action_source = "Tiara" if fcs_gx_action == "EXCLUDE" and tiara_action == "EXCLUDE": combined_action_source = "FCS-GX_and_Tiara" if combined_action == "EXCLUDE": @@ -231,4 +238,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/modules/local/ascc_merge_tables.nf b/modules/local/ascc_merge_tables.nf index da7d59b3..2dea7aa2 100644 --- a/modules/local/ascc_merge_tables.nf +++ b/modules/local/ascc_merge_tables.nf @@ -2,10 +2,10 @@ process ASCC_MERGE_TABLES { tag "$meta.id" label 'process_low' - conda "conda-forge::python=3.9" + conda "conda-forge::python=3.9 conda-forge::pandas=1.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'biocontainers/python:3.9' }" + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(gc_content, stageAs: "GC.txt") diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 79d490d1..bca98c6f 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -79,6 +79,8 @@ workflow ASCC { include_workflow_steps = params.include ? params.include.split(",") : "" exclude_workflow_steps = params.exclude ? params.exclude.split(",") : "" + btk_busco_run_mode = params.btk_busco_run_mode ? params.btk_busco_run_mode : "conditional" + full_list = ["kmers", "tiara", "coverage", "nt_blast", "nr_diamond", "uniprot_diamond", "kraken", "fcs-gx", "fcs-adaptor", "vecscreen", "btk_busco", "pacbio_barcodes", "organellar_blast", "autofilter_assembly", "ALL", ""] if (!full_list.containsAll(include_workflow_steps) && !full_list.containsAll(exclude_workflow_steps)) { @@ -290,7 +292,7 @@ workflow ASCC { // // SUBWORKFLOW: IDENTITY PACBIO BARCODES IN INPUT DATA // - if ( include_workflow_steps.contains('barcodes') || include_workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('pacbio_barcodes') || include_workflow_steps.contains('ALL') ) { PACBIO_BARCODE_CHECK ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.pacbio_tuple, @@ -315,7 +317,7 @@ workflow ASCC { // // SUBWORKFLOW: CALCULATE AVERAGE READ COVERAGE // - if ( include_workflow_steps.contains('coverage') || include_workflow_steps.contains('busco_btk') || include_workflow_steps.contains('ALL') ) { + if ( include_workflow_steps.contains('coverage') || include_workflow_steps.contains('btk_busco') || include_workflow_steps.contains('ALL') ) { RUN_READ_COVERAGE ( YAML_INPUT.out.reference_tuple, YAML_INPUT.out.assembly_path, @@ -372,12 +374,12 @@ workflow ASCC { modified_input, YAML_INPUT.out.diamond_nr_database_path ) - nt_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]} - nt_hits = 
NUCLEOT_DIAMOND.out.hits_file.map{it[1]} + nr_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]} + nr_hits = NUCLEOT_DIAMOND.out.hits_file.map{it[1]} ch_versions = ch_versions.mix(NUCLEOT_DIAMOND.out.versions) } else { - nt_hits = [] - nt_full = [] + nr_hits = [] + nr_full = [] } // @@ -411,7 +413,7 @@ workflow ASCC { ch_kraken1, ch_kraken2, ch_kraken3, - nt_full, + nr_full, un_full, YAML_INPUT.out.ncbi_taxonomy_path.first() ) @@ -449,8 +451,7 @@ workflow ASCC { // WE ARE USING THE PIPELINE HERE AS A MODULE THIS REQUIRES IT // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. // This will also eventually check for the above run_btk boolean from autofilter - if ( !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('busco_btk') && include_workflow_steps.contains("autofilter") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("busco_btk") && include_workflow_steps.contains('ALL') ) { - + if ( !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('btk_busco') && btk_busco_run_mode == "conditional" && include_workflow_steps.contains("autofilter_assembly") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('ALL') || btk_busco_run_mode == "mandatory" && !exclude_workflow_steps.contains('btk_busco') && include_workflow_steps.contains('btk_busco') ) { YAML_INPUT.out.reference_tuple .combine(ch_bam) .map{ meta, ref, bam -> @@ -505,7 +506,7 @@ workflow ASCC { ch_kraken3, // FROM -- RUN_NT_KRAKEN.out.lineage.map{it[1]} ch_nt_blast, // FROM -- EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} ch_kmers, // FROM -- GET_KMERS_PROFILE.out.combined_csv - nt_hits, // FROM -- NUCLEOT_DIAMOND.out.reformed.map{it[1]} + nr_hits, // FROM -- NUCLEOT_DIAMOND.out.reformed.map{it[1]} un_hits, // FROM -- UNIPROT_DIAMOND.out.reformed.map{it[1]} [], // <-- MARKER SCAN -- NOT IN PIPELINE YET [], // <-- CONTIGVIZ -- NOT IN PIPELINE YET From 
afc73ed90373687501c97a2eb919999ebdb6e662 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Wed, 7 Aug 2024 13:59:00 +0100 Subject: [PATCH 099/117] 07.08.2024 edits --- .github/workflows/ci.yml | 2 +- bin/ascc_merge_tables.py | 4 ++-- modules/local/merge_btk_datasets.nf | 4 ++-- modules/local/sanger_tol_btk.nf | 8 +++----- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 24d41b55..76fa33c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -139,4 +139,4 @@ jobs: # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude busco_btk + nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude btk_busco diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py index 932f5059..6045600e 100755 --- a/bin/ascc_merge_tables.py +++ b/bin/ascc_merge_tables.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file") parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file") parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file") - parser.add_argument("-bb", "--busco_btk", type=str, help="Busco Blobtoolkit file") + parser.add_argument("-bb", "--btk_busco", type=str, help="Busco Blobtoolkit file") parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file") parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample") parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file") @@ -284,7 +284,7 @@ def main(args): paths_dict["cobiontid_markerscan"] = args.markerscan paths_dict["contigviz"] = args.contigviz 
paths_dict["blobtoolkit"] = args.blobtoolkit - paths_dict["btk_busco"] = args.busco_btk + paths_dict["btk_busco"] = args.btk_busco paths_dict["fcs_gx"] = args.fcs_gx required_files = ["gc_content"] diff --git a/modules/local/merge_btk_datasets.nf b/modules/local/merge_btk_datasets.nf index 707c33ab..7a818013 100644 --- a/modules/local/merge_btk_datasets.nf +++ b/modules/local/merge_btk_datasets.nf @@ -9,7 +9,7 @@ process MERGE_BTK_DATASETS { input: tuple val(meta), path(create_btk_datasets) - tuple val(meta2), path(busco_btk_datasets) + tuple val(meta2), path(btk_busco_datasets) output: tuple val(meta), path("merged_datasets"), emit: merged_datasets @@ -29,7 +29,7 @@ process MERGE_BTK_DATASETS { merge_btk_datasets.py \\ -m $create_btk_datasets \\ -o ./merged_datasets \\ - -b $busco_btk_datasets \\ + -b $btk_busco_datasets \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 009bb27f..b73e3261 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -32,7 +32,7 @@ process SANGER_TOL_BTK { def profiles = task.ext.profiles ?: "" def get_version = task.ext.version_data ?: "UNKNOWN - SETTING NOT SET" def btk_config = btk_config_file ? 
"-c $btk_config_file" : "" - def pipeline_version = task.ext.version ?: "main" + def pipeline_version = task.ext.version ?: "draft_assemblies" // YAML used to avoid the use of GCA accession number // https://github.com/sanger-tol/blobtoolkit/issues/77 @@ -49,9 +49,7 @@ process SANGER_TOL_BTK { --input "\$(realpath $samplesheet_csv)" \\ --outdir ${prefix}_btk_out \\ --fasta "\$(realpath REFERENCE.fa)" \\ - --yaml "\$(realpath BTK.yaml)" \\ - --busco_lineages $busco_lineages \\ - --accession draft \\ + --busco_lineages eukaryota_odb10 \\ --taxon $taxon \\ --taxdump "\$(realpath $tax_dump)" \\ --blastp "\$(realpath blastp.dmnd)" \\ @@ -72,7 +70,7 @@ process SANGER_TOL_BTK { stub: def prefix = task.ext.prefix ?: "${meta.id}" - def pipeline_version = task.ext.version ?: "main" + def pipeline_version = task.ext.version ?: "draft_assemblies" """ mkdir -p ${prefix}_btk_out/blobtoolkit/$gca_accession From e3b8db9c75f553ab3aaab0848961f8f654e43b03 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Wed, 7 Aug 2024 14:09:12 +0100 Subject: [PATCH 100/117] ran linting with black --- bin/autofilter.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/autofilter.py b/bin/autofilter.py index d843308e..8c1dc4e4 100755 --- a/bin/autofilter.py +++ b/bin/autofilter.py @@ -42,9 +42,9 @@ def parse_args(): help="Path to the assembly_autofiltered.fasta file", default="autofiltered.fasta", ) - #parser.add_argument( + # parser.add_argument( # "-c", "--fcs_gx_and_tiara_summary", type=str, help="Path to the fcs-gx_and_tiara_combined_summary.csv file" - #) + # ) parser.add_argument( "-r", "--rejected_seq", @@ -57,7 +57,11 @@ def parse_args(): "-n", "--ncbi_rankedlineage_path", type=str, help="Path to the rankedlineage.dmp of NCBI taxonomy" ) parser.add_argument( - "--tiara_action_mode", type=str, choices=["warn", "remove"], default="warn", help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. 
The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). Default: warn" + "--tiara_action_mode", + type=str, + choices=["warn", "remove"], + default="warn", + help="Action when Tiara detects a putative contaminant that is not reported as a contaminant by FCS-GX. The choices are 'warn' (print a warning) or 'remove' (remove this sequence from the assembly). Default: warn", ) parser.add_argument("-v", "--version", action="version", version=VERSION) return parser.parse_args() @@ -182,7 +186,7 @@ def main(): tiara_results_path = args.tiara fcs_gx_summary_path = args.fcsgx_summary filtered_assembly_path = args.output_auto_filtered - #combined_summary = args.fcs_gx_and_tiara_summary + # combined_summary = args.fcs_gx_and_tiara_summary excluded_seq_list_path = args.rejected_seq ncbi_rankedlist = args.ncbi_rankedlineage_path @@ -238,4 +242,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() From 6d6b6faec3d11da34a3fe0bce772d98fcbc45ac6 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 7 Aug 2024 14:21:58 +0100 Subject: [PATCH 101/117] Updating the version of BTK, spelling and bug fixes --- assets/test.yaml | 1 + bin/ascc_merge_tables.py | 313 +++++++++++++++++++++++++++++++ conf/modules.config | 1 + modules/local/sanger_tol_btk.nf | 6 +- subworkflows/local/yaml_input.nf | 2 +- workflows/ascc.nf | 2 +- 6 files changed, 322 insertions(+), 3 deletions(-) create mode 100755 bin/ascc_merge_tables.py diff --git a/assets/test.yaml b/assets/test.yaml index 850c2c2e..23766ae1 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -19,6 +19,7 @@ ncbi_accessionids_folder: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/20240709 ncbi_taxonomy_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages 
+busco_lineages: "diptera_odb10,insecta_odb10" fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb/ vecscreen_database_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/vecscreen/ diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd diff --git a/bin/ascc_merge_tables.py b/bin/ascc_merge_tables.py new file mode 100755 index 00000000..278ce409 --- /dev/null +++ b/bin/ascc_merge_tables.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 + +VERSION = "2.0.0" +DESCRIPTION = """ +Script for merging contaminant check results into one table +Version: {VERSION} +--- +Written by Eerik Aunin + +Re-Written by Damon-Lee Pointon (dp24/DLBPointon) +""" + +import argparse +import pandas as pd +import textwrap +import os +import sys +import general_purpose_functions as gpf + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="AsccMergeTables", + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(DESCRIPTION), + ) + parser.add_argument("-gc", "--gc_cov", required=True, type=str, help="GC Coverage file") + parser.add_argument("-c", "--coverage", type=str, help="Coverage file") + parser.add_argument("-t", "--tiara", type=str, help="Tiara file") + parser.add_argument("-bk", "--bacterial_kraken", type=str, help="Bacterial Kraken file") + parser.add_argument("-nk", "--nt_kraken", type=str, help="NT Kraken file") + parser.add_argument("-nb", "--nt_blast", type=str, help="NT Blast file") + parser.add_argument("-dr", "--dim_reduction_embeddings", type=str, help="Dimensional Reduction file") + parser.add_argument("-nd", "--nr_diamond", type=str, help="NR Diamond file") + parser.add_argument("-ud", "--uniprot_diamond", type=str, help="Uniprot Diamond file") + parser.add_argument("-cv", "--contigviz", type=str, help="Contigviz file") + parser.add_argument("-btk", "--blobtoolkit", type=str, help="Blobtoolkit file") + 
parser.add_argument("-bb", "--busco_btk", type=str, help="Busco Blobtoolkit file") + parser.add_argument("-fg", "--fcs_gx", type=str, help="FCS_GX file") + parser.add_argument("-n", "--sample_name", type=str, help="Name for the sample") + parser.add_argument("-m", "--markerscan", type=str, help="MarkerScan file") + parser.add_argument("-v", "--version", action="version", version=VERSION) + return parser.parse_args() + + +def check_paths(paths_dict, required_files): + """ + Checks if a required file exists and exits with an error message if it doesn't + """ + out_dict = dict() + + for data_type, input_file in paths_dict.items(): + if input == None: + pass + else: + out_dict[data_type] = input_file + + return out_dict + + +def load_and_merge_dataframes(paths_dict): + """ + Loads the tables with individual variables (GC content, coverage, kmer counts etc) and combines them into one table + """ + gc_path = paths_dict["gc_content"] + df = pd.read_csv(gc_path, sep="\t", header=None) + if df.shape[0] > 0: + df.columns = ["scaff", "gc"] + df["gc"] = df["gc"] * 100 + else: + sys.stderr.write("No rows were found in the GC content table ({})\n".format(gc_path)) + sys.exit(1) + + coverage_df = None + if paths_dict["coverage"] is not None: + coverage_df = pd.read_csv(paths_dict["coverage"], sep=",", header=None) + if coverage_df.shape[0] > 0: + coverage_df.columns = ["scaff", "coverage"] + else: + sys.stderr.write(f"No rows were found in the coverages table ({paths_dict['coverage']})\n") + coverage_df = None + + tiara_df = None + if paths_dict["tiara"] is not None: + tiara_df = pd.read_csv(paths_dict["tiara"], sep="\t") + if tiara_df.shape[0] > 0: + tiara_df["tiara_classif"] = tiara_df["class_fst_stage"] + tiara_snd_stage_hits = tiara_df.index[tiara_df["class_snd_stage"].notnull()] + tiara_df["tiara_classif"][tiara_snd_stage_hits] = tiara_df["class_snd_stage"][tiara_snd_stage_hits] + tiara_df = tiara_df.iloc[:, [0, 3]] + tiara_df.columns = ["scaff", "tiara_classif"] + else: + 
sys.stderr.write("No rows were found in Tiara output table ({})\n".format(paths_dict["tiara"])) + tiara_df = None + + bacterial_kraken_df = None + if paths_dict["bacterial_kraken"] is not None: + bacterial_kraken_df = pd.read_csv(paths_dict["bacterial_kraken"], sep=",") + if bacterial_kraken_df.shape[0] > 0: + bacterial_kraken_df.rename(columns={bacterial_kraken_df.columns[0]: "scaff"}, inplace=True) + bacterial_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write( + "No rows were found in bacterial Kraken output table ({})\n".format(paths_dict["bacterial_kraken"]) + ) + bacterial_kraken_df = None + + nt_kraken_df = None + if paths_dict["nt_kraken"] is not None: + nt_kraken_df = pd.read_csv(paths_dict["nt_kraken"], sep=",") + if nt_kraken_df.shape[0] > 0: + nt_kraken_df.rename(columns={nt_kraken_df.columns[0]: "scaff"}, inplace=True) + nt_kraken_df.rename(columns={"taxid": "kraken_taxid"}, inplace=True) + else: + sys.stderr.write("No rows were found in nt Kraken output table ({})\n".format(paths_dict["nt_kraken"])) + nt_kraken_df = None + + dim_reduction_df = None + if paths_dict["dim_reduction_embeddings"] is not None: + dim_reduction_df = pd.read_csv(paths_dict["dim_reduction_embeddings"], sep=",") + if dim_reduction_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in kmers dimensionality reduction output table ({})\n".format( + paths_dict["dim_reduction_embeddings"] + ) + ) + dim_reduction_df = None + + btk_df = None + if paths_dict["blobtoolkit"] is not None: + btk_df = pd.read_csv(paths_dict["blobtoolkit"], header=0, delimiter="\t") + if btk_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in the BlobToolKit results table ({})\n".format(paths_dict["blobtoolkit"]) + ) + sys.exit(1) + btk_renaming_dict = {"identifiers": "scaff", "bestsum_phylum": "btk_bestsum_phylum"} + if "mapped_hifi_reads_sorted_cov" in btk_df.columns: + btk_renaming_dict["mapped_hifi_reads_sorted_cov"] = "btk_cov" + if 
"bestsum_phylum" in btk_df.columns: + btk_renaming_dict["bestsum_phylum"] = "btk_bestsum_phylum" + # {"identifiers": "scaff", "mapped_hifi_reads_sorted_cov": "btk_cov", "bestsum_phylum": "btk_bestsum_phylum"} + + btk_df.rename(columns=btk_renaming_dict, inplace=True) + + btk_selected_cols = [ + col for col in btk_df.columns if col in ["scaff", "length", "btk_cov", "btk_bestsum_phylum"] + ] + if len(btk_selected_cols) > 0: + btk_df = btk_df[btk_selected_cols] + else: + btk_df = None + + btk_busco_df = None + if paths_dict["btk_busco"] is not None: + btk_busco_df = pd.read_csv(paths_dict["btk_busco"], header=0, delimiter="\t") + if btk_busco_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in the BUSCO-based BlobToolKit results table ({})\n".format(paths_dict["btk_busco"]) + ) + sys.exit(1) + btk_busco_renaming_dict = {"identifiers": "scaff"} + + btk_busco_df.rename(columns=btk_busco_renaming_dict, inplace=True) + + btk_busco_selected_cols = [ + col + for col in btk_busco_df.columns + if col + in [ + "scaff", + "buscogenes_superkingdom", + "buscogenes_kingdom", + "buscogenes_phylum", + "buscogenes_class", + "buscogenes_order", + "buscogenes_family", + "buscogenes_genus", + "buscogenes_species", + "buscoregions_superkingdom", + "buscoregions_kingdom", + "buscoregions_phylum", + "buscoregions_class", + "buscoregions_order", + "buscoregions_family", + "buscoregions_genus", + "buscoregions_species", + ] + ] + if len(btk_busco_selected_cols) > 0: + btk_busco_df = btk_busco_df[btk_busco_selected_cols] + else: + btk_busco_df = None + + fcs_gx_df = None + if paths_dict["fcs_gx"] is not None: + fcs_gx_df = pd.read_csv(paths_dict["fcs_gx"], sep=",") + if fcs_gx_df.shape[0] == 0: + sys.stderr.write("No rows were found in FCS-GX output table ({})\n".format(paths_dict["fcs_gx"])) + fcs_gx_df = None + + nt_blast_df = None + if paths_dict["nt_blast"] is not None: + nt_blast_df = pd.read_csv(paths_dict["nt_blast"], sep=",") + if nt_blast_df.shape[0] == 0: + 
sys.stderr.write("No rows were found in nt BLAST output table ({})\n".format(paths_dict["nt_blast"])) + nt_blast_df = None + + nr_diamond_df = None + if paths_dict["nr_diamond"] is not None: + nr_diamond_df = pd.read_csv(paths_dict["nr_diamond"], sep=",") + if nr_diamond_df.shape[0] == 0: + sys.stderr.write("No rows were found in nr Diamond output table ({})\n".format(paths_dict["nr_diamond"])) + nr_diamond_df = None + + uniprot_diamond_df = None + if paths_dict["uniprot_diamond"] is not None: + uniprot_diamond_df = pd.read_csv(paths_dict["uniprot_diamond"], sep=",") + if uniprot_diamond_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in Uniprot Diamond output table ({})\n".format(paths_dict["uniprot_diamond"]) + ) + uniprot_diamond_df = None + + cobiontid_markerscan_df = None + if paths_dict["cobiontid_markerscan"] is not None: + cobiontid_markerscan_df = pd.read_csv(paths_dict["cobiontid_markerscan"], sep=",") + if cobiontid_markerscan_df.shape[0] == 0: + sys.stderr.write( + "No rows were found in CobiontID MarkerScan output table ({})\n".format( + paths_dict["cobiontid_markerscan"] + ) + ) + uniprot_diamond_df = None + + contigviz_df = None + if paths_dict["contigviz"] is not None: + contigviz_df = pd.read_csv(paths_dict["contigviz"], sep=",") + if contigviz_df.shape[0] == 0: + sys.stderr.write("No rows were found in ContigViz output table ({})\n".format(paths_dict["contigviz"])) + contigviz_df = None + + if coverage_df is not None: + df = pd.merge(df, coverage_df, on="scaff", how="outer") + if tiara_df is not None: + df = pd.merge(df, tiara_df, on="scaff", how="outer") + if bacterial_kraken_df is not None: + df = pd.merge(df, bacterial_kraken_df, on="scaff", how="outer") + if nt_kraken_df is not None: + df = pd.merge(df, nt_kraken_df, on="scaff", how="outer") + if dim_reduction_df is not None: + df = pd.merge(df, dim_reduction_df, on="scaff", how="outer") + if nt_blast_df is not None: + df = pd.merge(df, nt_blast_df, on="scaff", how="outer") + if 
nr_diamond_df is not None: + df = pd.merge(df, nr_diamond_df, on="scaff", how="outer") + if uniprot_diamond_df is not None: + df = pd.merge(df, uniprot_diamond_df, on="scaff", how="outer") + if fcs_gx_df is not None: + df = pd.merge(df, fcs_gx_df, on="scaff", how="outer") + if cobiontid_markerscan_df is not None: + df = pd.merge(df, cobiontid_markerscan_df, on="scaff", how="outer") + if contigviz_df is not None: + df = pd.merge(df, contigviz_df, on="scaff", how="outer") + if btk_df is not None: + df = pd.merge(df, btk_df, on="scaff", how="outer") + if btk_busco_df is not None: + df = pd.merge(df, btk_busco_df, on="scaff", how="outer") + + return df + + +def main(args): + paths_dict = dict() + paths_dict["gc_content"] = args.gc_cov + paths_dict["coverage"] = args.coverage + paths_dict["tiara"] = args.tiara + paths_dict["bacterial_kraken"] = args.bacterial_kraken + paths_dict["nt_kraken"] = args.nt_kraken + paths_dict["nt_blast"] = args.nt_blast + paths_dict["dim_reduction_embeddings"] = args.dim_reduction_embeddings + paths_dict["nr_diamond"] = args.nr_diamond + paths_dict["uniprot_diamond"] = args.uniprot_diamond + paths_dict["cobiontid_markerscan"] = args.markerscan + paths_dict["contigviz"] = args.contigviz + paths_dict["blobtoolkit"] = args.blobtoolkit + paths_dict["btk_busco"] = args.busco_btk + paths_dict["fcs_gx"] = args.fcs_gx + + required_files = ["gc_content"] + + paths_dict = check_paths(paths_dict, required_files) + df = load_and_merge_dataframes(paths_dict) + df.to_csv(f"{args.sample_name}_contamination_check_merged_table.csv", index=False) + + if ( + paths_dict["nt_blast"] + and paths_dict["nr_diamond"] + and paths_dict["uniprot_diamond"] + and paths_dict["coverage"] + and paths_dict["tiara"] + and paths_dict["nt_kraken"] + ): + process_results_tables_command = f"process_result_tables.py . 
{args.sample_name}" + gpf.run_system_command(process_results_tables_command) + else: + sys.stderr.write( + f"Skipping generating the {args.sample_name}_phylum_counts_and_coverage.csv file, as the variables used in this run do not include all the required variables for this (nt_blast, nr_diamond, uniprot_diamond, coverage, tiara, nt_kraken)\n" + ) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/conf/modules.config b/conf/modules.config index 223be558..d4bce1a0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -23,6 +23,7 @@ process { ext.executor = "bsub -Is -tty -e test.e -o test.log -n 2 -q oversubscribed -M1400 -R'select[mem>1400] rusage[mem=1400] span[hosts=1]'" ext.profiles = "singularity,sanger" ext.get_versions = "lsid | head -n1 | cut -d ',' -f 1" + ext.version = "draft_assemblies" } withName: SEQKIT_SLIDING { diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 009bb27f..fec146c7 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -42,6 +42,11 @@ process SANGER_TOL_BTK { // outdir should be an arg + // --accession draft \\ + + // blastx and blastp use the same database hence the StageAs + + """ $executor 'nextflow run sanger-tol/blobtoolkit \\ -r $pipeline_version \\ @@ -51,7 +56,6 @@ process SANGER_TOL_BTK { --fasta "\$(realpath REFERENCE.fa)" \\ --yaml "\$(realpath BTK.yaml)" \\ --busco_lineages $busco_lineages \\ - --accession draft \\ --taxon $taxon \\ --taxdump "\$(realpath $tax_dump)" \\ --blastp "\$(realpath blastp.dmnd)" \\ diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 935e5033..6d21ea58 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -148,7 +148,7 @@ workflow YAML_INPUT { ncbi_taxonomy_path = group.ncbi_taxonomy_path ncbi_rankedlineage_path = group.ncbi_rankedlineage_path busco_lineages_folder = group.busco_lineages_folder - busco_lineages = group.busco_lineages + 
#!/usr/bin/env python3
VERSION = "1.1.0"
DESCRIPTION = f"""
---
Script for shortening FASTA headers, by splitting the header and keeping only the first element
Version: {VERSION}
---

Written by Eerik Aunin (ea10)

Modified by Damon-Lee Pointon (@dp24/@DLBPointon)

"""

import argparse
import sys
import tempfile
import textwrap

import general_purpose_functions as gpf


def parse_args(argv=None):
    """Define and parse the command line arguments."""
    parser = argparse.ArgumentParser(
        prog="ascc_shorten_fasta_headers",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(DESCRIPTION),
    )
    parser.add_argument("fasta_path", type=str, help="Path to input FASTA file")
    parser.add_argument(
        "--delimiter",
        type=str,
        help="Delimiter string for splitting FASTA headers. Default: any whitespace character",
        default="",
    )
    parser.add_argument("--allow_duplicate_headers", dest="allow_duplicate_headers", action="store_true")
    parser.add_argument("-v", "--version", action="version", version=VERSION)
    return parser.parse_args(argv)


def main(fasta_path, delimiter, allow_duplicate_headers):
    """
    Stream the FASTA file to stdout, truncating each header line at the first
    occurrence of *delimiter* (any whitespace when the delimiter is empty).

    Exits with an error when truncation produces duplicate headers, unless
    --allow_duplicate_headers was given. Gzipped input is decompressed into a
    temporary directory first.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        input_file = fasta_path
        # NOTE(review): the second test matches paths that literally end in '.gz"'
        # (a trailing quote character) — presumably to cope with pre-quoted
        # paths coming from the pipeline; confirm before removing.
        if fasta_path.endswith(".gz") or fasta_path.endswith('.gz"'):
            input_file = "{}/input_file.fa".format(tmp_dir)
            gpf.run_system_command("gunzip -c {} > {}".format(fasta_path, input_file))

        seen_headers = set()  # set rather than list: O(1) duplicate lookups
        for line in gpf.ll(input_file):
            out_line = line
            if line.startswith(">"):
                if delimiter == "":
                    out_line = line.split()[0]
                else:
                    out_line = line.split(delimiter)[0]
                if out_line in seen_headers and allow_duplicate_headers is False:
                    sys.stderr.write(
                        "Duplicate FASTA headers ({}) were found in the input file ({}) after truncating the headers with a delimiter\n".format(
                            out_line[1 : len(out_line)], fasta_path
                        )
                    )
                    sys.exit(1)
                seen_headers.add(out_line)
            # printed at loop level so sequence lines pass through unchanged
            # (previously only header lines were emitted)
            print(out_line)


if __name__ == "__main__":
    args = parse_args()
    main(args.fasta_path, args.delimiter, args.allow_duplicate_headers)
#!/usr/bin/env python3

VERSION = "1.1.0"
DESCRIPTION = f"""
---
Script for filtering a FASTA file by sequence length. By default, sequences shorter than a cutoff value will be removed.
Version = {VERSION}
---

Written by Eerik Aunin (ea10)

Modified by Damon-Lee Pointon (@dp24/@DLBPointon)

"""

import argparse
import os
import sys
import textwrap

import general_purpose_functions as gpf


def parse_args(argv=None):
    """Define and parse the command line arguments."""
    parser = argparse.ArgumentParser(
        prog="filter_fasta_by_length",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(DESCRIPTION),
    )
    parser.add_argument("in_path", type=str, help="Path to input FASTA file")
    parser.add_argument("cutoff", type=int, help="Cutoff value for filtering")
    parser.add_argument(
        "-l",
        "--low_pass",
        dest="low_pass",
        action="store_true",
        help="Optional: low pass filtering mode (sequences longer than the cutoff value will be removed)",
    )
    parser.add_argument(
        "--remove_original_fasta",
        action="store_true",
        help="Optional: remove the input FASTA file after creating the filtered FASTA file",
    )
    parser.add_argument("-v", "--version", action="version", version=VERSION)

    return parser.parse_args(argv)


def main(args):
    """
    Write the length-filtered FASTA to stdout.

    A cutoff of -1 disables filtering (sentinel used when this script runs as
    part of the pipeline); otherwise sequences shorter than the cutoff (or
    longer than it, in --low_pass mode) are dropped with a message on stderr.
    Exits non-zero when filtering removes every sequence.
    """
    fasta_path = os.path.abspath(args.in_path)
    no_filtering = args.cutoff == -1  # -1 means "pass everything through"
    if no_filtering:
        sys.stderr.write(f"The input FASTA sequences ({fasta_path}) will not be filtered by length\n")

    retained_seq_count = 0
    for header, seq in gpf.read_fasta_in_chunks(fasta_path):
        if no_filtering:
            print(">" + header)
            gpf.print_with_fixed_row_length(seq, 80)
            continue
        seq_len = len(seq)
        keep_seq = True
        if args.low_pass:
            if seq_len > args.cutoff:
                keep_seq = False
                sys.stderr.write(
                    f"Low pass filtering of FASTA sequences by length: removing sequence {header} from the assembly because its length ({seq_len}) exceeds the length cutoff ({args.cutoff})\n"
                )
        elif seq_len < args.cutoff:
            keep_seq = False
            sys.stderr.write(
                f"High pass filtering of FASTA sequences by length: removing sequence {header} from the assembly because its length ({seq_len}) is below the length cutoff ({args.cutoff})\n"
            )
        if keep_seq:
            retained_seq_count += 1
            print(">" + header)
            gpf.print_with_fixed_row_length(seq, 80)

    if not no_filtering and retained_seq_count == 0:
        sys.stderr.write(
            f"No sequences remain in the FASTA file {fasta_path} after filtering the sequences by length (cutoff: {args.cutoff} bp)\n"
        )
        sys.exit(1)

    if args.remove_original_fasta:
        os.remove(fasta_path)


if __name__ == "__main__":
    main(parse_args())


# --------------------------------------------------------------------------
# bin/find_taxid_in_taxdump.py — a SEPARATE script file in this patch; carried
# here in full because the flattened patch lines split it across the chunk
# boundary. Do not merge the two scripts into one module.
# --------------------------------------------------------------------------

#!/usr/bin/env python3

VERSION = "1.1.0"
DESCRIPTION = f"""
---
Script for checking if the TaxID given by the user exists in the NCBI taxdump
Version: {VERSION}
---

Written by Eerik Aunin (ea10)

Modified by Damon-Lee Pointon (@dp24/@DLBPointon)

"""


def parse_args(argv=None):
    """Define and parse the command line arguments."""
    parser = argparse.ArgumentParser(
        prog="find_taxid_in_taxdump",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(DESCRIPTION),
    )
    parser.add_argument("query_taxid", type=int, help="Query taxonomy ID")
    parser.add_argument(
        "taxdump_nodes_path",
        type=str,
        help="Path to the nodes.dmp file of NCBI taxdump (downloaded from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz)",
    )
    parser.add_argument("-v", "--version", action="version", version=VERSION)

    return parser.parse_args(argv)


def main(query_taxid, taxdump_nodes_path):
    """
    Exit 0 when *query_taxid* appears in the nodes.dmp file, non-zero
    otherwise. A query of -1 is the pipeline's "no taxid" sentinel and is
    accepted without checking.
    """
    if query_taxid == -1:
        sys.exit(0)
    query_taxid = str(query_taxid)
    if not os.path.isfile(taxdump_nodes_path):
        sys.stderr.write("The NCBI taxdump nodes file ({}) was not found\n".format(taxdump_nodes_path))
        sys.exit(1)

    taxid_found = False
    # nodes.dmp is pipe-delimited; the first field is the node's TaxID
    for counter, line in enumerate(gpf.ll(taxdump_nodes_path)):
        split_line = line.split("|")
        if len(split_line) > 2:
            if split_line[0].strip() == query_taxid:
                taxid_found = True
                break
        else:
            sys.stderr.write(
                "Failed to parse the NCBI taxdump nodes.dmp file ({}) at line {}:\n".format(
                    taxdump_nodes_path, counter + 1
                )
            )
            sys.stderr.write(line + "\n")
            sys.exit(1)

    if taxid_found is False:
        sys.stderr.write(
            "The TaxID given by the user ({}) was not found in the NCBI taxdump nodes.dmp file ({})\n".format(
                query_taxid, taxdump_nodes_path
            )
        )
        sys.exit(1)


if __name__ == "__main__":
    args = parse_args()
    main(args.query_taxid, args.taxdump_nodes_path)
sys.stderr.write("The NCBI taxdump nodes file ({}) was not found\n".format(taxdump_nodes_path)) + sys.exit(1) + nodes_data = gpf.ll(taxdump_nodes_path) + taxid_found_flag = False + for counter, line in enumerate(nodes_data): + split_line = line.split("|") + if len(split_line) > 2: + taxid = split_line[0].strip() + if taxid == query_taxid: + taxid_found_flag = True + break + else: + sys.stderr.write( + "Failed to parse the NCBI taxdump nodes.dmp file ({}) at line {}:\n".format( + taxdump_nodes_path, counter + 1 + ) + ) + sys.stderr.write(line + "\n") + sys.exit(1) + + if taxid_found_flag is False: + sys.stderr.write( + "The TaxID given by the user ({}) was not found in the NCBI taxdump nodes.dmp file ({})\n".format( + query_taxid, taxdump_nodes_path + ) + ) + sys.exit(1) + + +if __name__ == "__main__": + args = parse_args() + main(args.query_taxid, args.taxdump_nodes_path) From a599125659946d1e873324b1af28d0aa4f8ec055 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 8 Aug 2024 11:45:20 +0100 Subject: [PATCH 103/117] Adding new scripts for filtering and double checking data #56 --- modules/local/filter_fasta.nf | 52 +++++++++++++++++++++++++++++++++ modules/local/validate_taxid.nf | 40 +++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 modules/local/filter_fasta.nf create mode 100644 modules/local/validate_taxid.nf diff --git a/modules/local/filter_fasta.nf b/modules/local/filter_fasta.nf new file mode 100644 index 00000000..20f4f07a --- /dev/null +++ b/modules/local/filter_fasta.nf @@ -0,0 +1,52 @@ +process FILTER_FASTA { + tag "${meta.id}" + label 'process_low' + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + tuple val(meta), path(input_fasta) + + output: + tuple val(meta), path("*_filtered.fasta"), emit: fasta + path "*_filtering_stderr.txt", emit: error_log + path "versions.yml", emit: versions + + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + def max_length = task.ext.cutoff ?: 1900000000 // This is the default value and maximum supported length of sequence per scaffold + """ + ascc_shorten_fasta_headers.py \\ + ${input_fasta} > ${prefix}_shortened.fasta + + filter_fasta_by_length.py \\ + ${args} \\ + ${prefix}_shortened.fasta \\ + ${max_length} > ${prefix}_filtered.fasta 2> ${prefix}_filtering_stderr.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + ascc_shorten_fasta_headers: \$(ascc_shorten_fasta_headers.py -v) + filter_fasta_by_length: \$(filter_fasta_by_length.py -v) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_filtered.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + ascc_shorten_fasta_headers: \$(ascc_shorten_fasta_headers.py -v) + filter_fasta_by_length: \$(filter_fasta_by_length.py -v) + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/validate_taxid.nf b/modules/local/validate_taxid.nf new file mode 100644 index 00000000..4cc1e2b2 --- /dev/null +++ b/modules/local/validate_taxid.nf @@ -0,0 +1,40 @@ +process VALIDATE_TAXID { + tag "$taxid" + label 'process_single' + + conda "conda-forge::python=3.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" + + input: + val(taxid) + path(ncbi_taxonomy_path) + + output: + path "versions.yml", emit: versions + + script: + """ + find_taxid_in_taxdump.py \\ + $taxid \\ + ${ncbi_taxonomy_path}/nodes.dmp + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + find_taxid_in_taxdump: \$(find_taxid_in_taxdump.py -v) + END_VERSIONS + """ + + stub: + """ + OUTPUT="TAXID FOUND + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + find_taxid_in_taxdump: \$(find_taxid_in_taxdump.py -v) + END_VERSIONS + """ +} From aeb7097ea4bec52b244e62393bbd08dfdd942067 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 8 Aug 2024 11:48:49 +0100 Subject: [PATCH 104/117] Updates for #56 --- workflows/ascc.nf | 188 +++++++++++++++++++++++++--------------------- 1 file changed, 104 insertions(+), 84 deletions(-) diff --git a/workflows/ascc.nf b/workflows/ascc.nf index e4a4ec4b..9785dbae 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -45,7 +45,8 @@ include { TRAILINGNS_CHECK } from '../subworkflows/ // MODULE: Local modules // include { GC_CONTENT } from '../modules/local/gc_content' - +include { VALIDATE_TAXID } from '../modules/local/validate_taxid' +include { FILTER_FASTA } from '../modules/local/filter_fasta' include { CREATE_BTK_DATASET } from '../modules/local/create_btk_dataset' include { MERGE_BTK_DATASETS } from '../modules/local/merge_btk_datasets' include { ASCC_MERGE_TABLES } from '../modules/local/ascc_merge_tables' @@ -73,21 +74,21 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-co workflow ASCC { main: - ch_versions = Channel.empty() - ch_out_merge = Channel.empty() + ch_versions = Channel.empty() + ch_out_merge = Channel.empty() include_workflow_steps = params.include ? params.include.split(",") : "" exclude_workflow_steps = params.exclude ? 
params.exclude.split(",") : "" - btk_busco_run_mode = params.btk_busco_run_mode ? params.btk_busco_run_mode : "conditional" + btk_busco_run_mode = params.btk_busco_run_mode ? params.btk_busco_run_mode : "conditional" - full_list = ["kmers", "tiara", "coverage", "nt_blast", "nr_diamond", "uniprot_diamond", "kraken", "fcs-gx", "fcs-adaptor", "vecscreen", "btk_busco", "pacbio_barcodes", "organellar_blast", "autofilter_assembly", "ALL", ""] + full_list = ["kmers", "tiara", "coverage", "nt_blast", "nr_diamond", "uniprot_diamond", "kraken", "fcs-gx", "fcs-adaptor", "vecscreen", "btk_busco", "pacbio_barcodes", "organellar_blast", "autofilter_assembly", "ALL", ""] if (!full_list.containsAll(include_workflow_steps) && !full_list.containsAll(exclude_workflow_steps)) { exit 1, "There is an extra argument given on Command Line: \n Check contents of: $include_workflow_steps\nAnd $exclude_workflow_steps\nMaster list is: $full_list" } - input_ch = Channel.fromPath(params.input, checkIfExists: true) + input_ch = Channel.fromPath(params.input, checkIfExists: true) // // SUBWORKFLOW: DECODE YAML INTO PARAMETERS FOR PIPELINE @@ -95,12 +96,31 @@ workflow ASCC { YAML_INPUT ( input_ch ) - ch_versions = ch_versions.mix(YAML_INPUT.out.versions) + ch_versions = ch_versions.mix(YAML_INPUT.out.versions) + + // + // MODULE: ENSURE THAT THE TAXID FOR THE INPUT GENOME IS INDEED IN THE TAXDUMP + // + VALIDATE_TAXID( + YAML_INPUT.out.taxid, + YAML_INPUT.out.ncbi_taxonomy_path + ) + ch_versions = ch_versions.mix(VALIDATE_TAXID.out.versions) + + + // + // MODULE: + // + FILTER_FASTA( + YAML_INPUT.out.reference_tuple, + ) + ch_versions = ch_versions.mix(FILTER_FASTA.out.versions) + // // LOGIC: INJECT SLIDING WINDOW VALUES INTO REFERENCE // - YAML_INPUT.out.reference_tuple + FILTER_FASTA.out.fasta .combine ( YAML_INPUT.out.seqkit_sliding.toInteger() ) .combine ( YAML_INPUT.out.seqkit_window.toInteger() ) .map { meta, ref, sliding, window -> @@ -116,26 +136,26 @@ workflow ASCC { // MODULE: CALCULATE 
GC CONTENT PER SCAFFOLD IN INPUT FASTA // GC_CONTENT ( - YAML_INPUT.out.reference_tuple + FILTER_FASTA.out.fasta ) - ch_versions = ch_versions.mix(GC_CONTENT.out.versions) + ch_versions = ch_versions.mix(GC_CONTENT.out.versions) // // SUBWORKFLOW: GENERATE GENOME FILE // GENERATE_GENOME ( - YAML_INPUT.out.reference_tuple, + FILTER_FASTA.out.fasta, YAML_INPUT.out.pacbio_barcodes ) - ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) + ch_versions = ch_versions.mix(GENERATE_GENOME.out.versions) // // SUBWORKFLOW: GENERATE A REPORT ON LENGTHS OF N's IN THE INPUT GENOMe // TRAILINGNS_CHECK ( - YAML_INPUT.out.reference_tuple + FILTER_FASTA.out.fasta ) - ch_versions = ch_versions.mix(TRAILINGNS_CHECK.out.versions) + ch_versions = ch_versions.mix(TRAILINGNS_CHECK.out.versions) // // SUBWORKFLOW: COUNT KMERS, THEN REDUCE DIMENSIONS USING SELECTED METHODS @@ -160,10 +180,10 @@ workflow ASCC { YAML_INPUT.out.n_neighbours, autoencoder_epochs_count.map{it -> it[2]} ) - ch_versions = ch_versions.mix(GET_KMERS_PROFILE.out.versions) - ch_kmers = GET_KMERS_PROFILE.out.combined_csv + ch_versions = ch_versions.mix(GET_KMERS_PROFILE.out.versions) + ch_kmers = GET_KMERS_PROFILE.out.combined_csv } else { - ch_kmers = [] + ch_kmers = [] } // @@ -173,10 +193,10 @@ workflow ASCC { EXTRACT_TIARA_HITS ( GENERATE_GENOME.out.reference_tuple ) - ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) - ch_tiara = EXTRACT_TIARA_HITS.out.ch_tiara.map{it[1]} + ch_versions = ch_versions.mix(EXTRACT_TIARA_HITS.out.versions) + ch_tiara = EXTRACT_TIARA_HITS.out.ch_tiara.map{it[1]} } else { - ch_tiara = [] + ch_tiara = [] } // @@ -188,18 +208,18 @@ workflow ASCC { // fails during the run // - ch_nt_blast = [] + ch_nt_blast = [] EXTRACT_NT_BLAST ( modified_input, YAML_INPUT.out.nt_database, YAML_INPUT.out.ncbi_accessions, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) - ch_nt_blast = EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} + 
ch_versions = ch_versions.mix(EXTRACT_NT_BLAST.out.versions) + ch_nt_blast = EXTRACT_NT_BLAST.out.ch_blast_hits.map{it[1]} } else { - ch_nt_blast = [] + ch_nt_blast = [] } if ( include_workflow_steps.contains('organellar_blast') || include_workflow_steps.contains('ALL') ) { @@ -218,14 +238,14 @@ workflow ASCC { // SUBWORKFLOW: BLASTING FOR MITO ASSEMBLIES IN GENOME // MITO_ORGANELLAR_BLAST ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.mito_var, mito_check.valid ) - ch_mito = MITO_ORGANELLAR_BLAST.out.organelle_report.map{it[1]} - ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) + ch_mito = MITO_ORGANELLAR_BLAST.out.organelle_report.map{it[1]} + ch_versions = ch_versions.mix(MITO_ORGANELLAR_BLAST.out.versions) } else { - ch_mito = [] + ch_mito = [] } if ( include_workflow_steps.contains('organellar_blast') || include_workflow_steps.contains('ALL') ) { @@ -244,14 +264,14 @@ workflow ASCC { // SUBWORKFLOW: BLASTING FOR PLASTID ASSEMBLIES IN GENOME // PLASTID_ORGANELLAR_BLAST ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.plastid_var, plastid_check.valid ) - ch_chloro = PLASTID_ORGANELLAR_BLAST.out.organelle_report.map{it[1]} - ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) + ch_chloro = PLASTID_ORGANELLAR_BLAST.out.organelle_report.map{it[1]} + ch_versions = ch_versions.mix(PLASTID_ORGANELLAR_BLAST.out.versions) } else { - ch_chloro = [] + ch_chloro = [] } // @@ -259,7 +279,7 @@ workflow ASCC { // if ( include_workflow_steps.contains('fcs-adaptor') || include_workflow_steps.contains('ALL') ) { RUN_FCSADAPTOR ( - YAML_INPUT.out.reference_tuple + GENERATE_GENOME.out.reference_tuple ) RUN_FCSADAPTOR.out.ch_euk .map{it[1]} @@ -267,9 +287,9 @@ workflow ASCC { RUN_FCSADAPTOR.out.ch_prok.map{it[1]} ) .set{ ch_fcsadapt } - ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) + ch_versions = ch_versions.mix(RUN_FCSADAPTOR.out.versions) } else { - 
ch_fcsadapt = [] + ch_fcsadapt = [] } // @@ -277,16 +297,16 @@ workflow ASCC { // if ( include_workflow_steps.contains('fcs-gx') || include_workflow_steps.contains('ALL') ) { RUN_FCSGX ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.fcs_gx_database_path, YAML_INPUT.out.taxid, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_fcsgx = RUN_FCSGX.out.fcsgxresult.map{it[1]} - ch_versions = ch_versions.mix(RUN_FCSGX.out.versions) + ch_fcsgx = RUN_FCSGX.out.fcsgxresult.map{it[1]} + ch_versions = ch_versions.mix(RUN_FCSGX.out.versions) } else { - ch_fcsgx = [] + ch_fcsgx = [] } // @@ -294,7 +314,7 @@ workflow ASCC { // if ( include_workflow_steps.contains('pacbio_barcodes') || include_workflow_steps.contains('ALL') ) { PACBIO_BARCODE_CHECK ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.pacbio_tuple, YAML_INPUT.out.pacbio_barcodes, YAML_INPUT.out.pacbio_multiplex_codes @@ -309,9 +329,9 @@ workflow ASCC { ch_barcode } - ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) + ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) } else { - ch_barcode = [] + ch_barcode = [] } // @@ -319,17 +339,17 @@ workflow ASCC { // if ( include_workflow_steps.contains('coverage') || include_workflow_steps.contains('btk_busco') || include_workflow_steps.contains('ALL') ) { RUN_READ_COVERAGE ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.assembly_path, YAML_INPUT.out.pacbio_tuple, YAML_INPUT.out.reads_type ) - ch_coverage = RUN_READ_COVERAGE.out.tsv_ch.map{it[1]} - ch_bam = RUN_READ_COVERAGE.out.bam_ch.map{it[1]} - ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) + ch_coverage = RUN_READ_COVERAGE.out.tsv_ch.map{it[1]} + ch_bam = RUN_READ_COVERAGE.out.bam_ch.map{it[1]} + ch_versions = ch_versions.mix(RUN_READ_COVERAGE.out.versions) } else { - ch_coverage = [] - ch_bam = [] + ch_coverage = [] + ch_bam = [] } // @@ -340,10 +360,10 @@ 
workflow ASCC { GENERATE_GENOME.out.reference_tuple, YAML_INPUT.out.vecscreen_database_path ) - ch_vecscreen = RUN_VECSCREEN.out.vecscreen_contam.map{it[1]} - ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) + ch_vecscreen = RUN_VECSCREEN.out.vecscreen_contam.map{it[1]} + ch_versions = ch_versions.mix(RUN_VECSCREEN.out.versions) } else { - ch_vecscreen = [] + ch_vecscreen = [] } // @@ -355,15 +375,15 @@ workflow ASCC { YAML_INPUT.out.nt_kraken_db_path, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_kraken1 = RUN_NT_KRAKEN.out.classified.map{it[1]} - ch_kraken2 = RUN_NT_KRAKEN.out.report.map{it[1]} - ch_kraken3 = RUN_NT_KRAKEN.out.lineage + ch_kraken1 = RUN_NT_KRAKEN.out.classified.map{it[1]} + ch_kraken2 = RUN_NT_KRAKEN.out.report.map{it[1]} + ch_kraken3 = RUN_NT_KRAKEN.out.lineage - ch_versions = ch_versions.mix(RUN_NT_KRAKEN.out.versions) + ch_versions = ch_versions.mix(RUN_NT_KRAKEN.out.versions) } else { - ch_kraken1 = [] - ch_kraken2 = [] - ch_kraken3 = [] + ch_kraken1 = [] + ch_kraken2 = [] + ch_kraken3 = [] } // @@ -374,12 +394,12 @@ workflow ASCC { modified_input, YAML_INPUT.out.diamond_nr_database_path ) - nr_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]} - nr_hits = NUCLEOT_DIAMOND.out.hits_file.map{it[1]} - ch_versions = ch_versions.mix(NUCLEOT_DIAMOND.out.versions) + nr_full = NUCLEOT_DIAMOND.out.reformed.map{it[1]} + nr_hits = NUCLEOT_DIAMOND.out.hits_file.map{it[1]} + ch_versions = ch_versions.mix(NUCLEOT_DIAMOND.out.versions) } else { - nr_hits = [] - nr_full = [] + nr_hits = [] + nr_full = [] } // @@ -391,15 +411,15 @@ workflow ASCC { modified_input, YAML_INPUT.out.diamond_uniprot_database_path ) - un_full = UNIPROT_DIAMOND.out.reformed.map{it[1]} - un_hits = UNIPROT_DIAMOND.out.hits_file.map{it[1]} - ch_versions = ch_versions.mix(UNIPROT_DIAMOND.out.versions) + un_full = UNIPROT_DIAMOND.out.reformed.map{it[1]} + un_hits = UNIPROT_DIAMOND.out.hits_file.map{it[1]} + ch_versions = ch_versions.mix(UNIPROT_DIAMOND.out.versions) } else { - 
un_hits = [] - un_full = [] + un_hits = [] + un_full = [] } - ch_dot_genome = GENERATE_GENOME.out.dot_genome.map{it[1]} + ch_dot_genome = GENERATE_GENOME.out.dot_genome.map{it[1]} CREATE_BTK_DATASET ( GENERATE_GENOME.out.reference_tuple, @@ -417,7 +437,7 @@ workflow ASCC { un_full, YAML_INPUT.out.ncbi_taxonomy_path.first() ) - ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) + ch_versions = ch_versions.mix(CREATE_BTK_DATASET.out.versions) // @@ -425,12 +445,12 @@ workflow ASCC { // if ( include_workflow_steps.contains('tiara') && include_workflow_steps.contains('fcs-gx') && include_workflow_steps.contains("autofilter_assembly") || include_workflow_steps.contains('ALL') ) { AUTOFILTER_AND_CHECK_ASSEMBLY ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, EXTRACT_TIARA_HITS.out.ch_tiara, RUN_FCSGX.out.fcsgxresult, YAML_INPUT.out.ncbi_rankedlineage_path ) - ch_autofiltered_assembly = AUTOFILTER_AND_CHECK_ASSEMBLY.out.decontaminated_assembly.map{it[1]} + ch_autofilt_assem = AUTOFILTER_AND_CHECK_ASSEMBLY.out.decontaminated_assembly.map{it[1]} AUTOFILTER_AND_CHECK_ASSEMBLY.out.alarm_file .map { file -> file.text.trim() } @@ -441,9 +461,9 @@ workflow ASCC { .set { btk_bool } - ch_versions = ch_versions.mix(AUTOFILTER_AND_CHECK_ASSEMBLY.out.versions) + ch_versions = ch_versions.mix(AUTOFILTER_AND_CHECK_ASSEMBLY.out.versions) } else { - ch_autofiltered_assembly = [] + ch_autofilt_assem = [] } // @@ -452,7 +472,7 @@ workflow ASCC { // TO BE USED AS A AN INTERACTIVE JOB ON WHAT EVER EXECUTOR YOU ARE USING. 
// This will also eventually check for the above run_btk boolean from autofilter if ( !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('btk_busco') && btk_busco_run_mode == "conditional" && include_workflow_steps.contains("autofilter_assembly") && btk_bool.run_btk == "ABNORMAL" || !exclude_workflow_steps.contains("btk_busco") && include_workflow_steps.contains('ALL') || btk_busco_run_mode == "mandatory" && !exclude_workflow_steps.contains('btk_busco') && include_workflow_steps.contains('btk_busco') ) { - YAML_INPUT.out.reference_tuple + GENERATE_GENOME.out.reference_tuple .combine(ch_bam) .map{ meta, ref, bam -> tuple( [ id: meta.id ], @@ -464,10 +484,10 @@ workflow ASCC { GENERATE_SAMPLESHEET ( new_bam ) - ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) + ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions) SANGER_TOL_BTK ( - YAML_INPUT.out.reference_tuple, + GENERATE_GENOME.out.reference_tuple, new_bam, GENERATE_SAMPLESHEET.out.csv, YAML_INPUT.out.diamond_uniprot_database_path, @@ -480,18 +500,18 @@ workflow ASCC { YAML_INPUT.out.taxid, 'GCA_0001' ) - ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) + ch_versions = ch_versions.mix(SANGER_TOL_BTK.out.versions) MERGE_BTK_DATASETS ( CREATE_BTK_DATASET.out.btk_datasets, SANGER_TOL_BTK.out.dataset ) - ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) - busco_merge_btk = MERGE_BTK_DATASETS.out.busco_summary_tsv.map{it[1]} + ch_versions = ch_versions.mix(MERGE_BTK_DATASETS.out.versions) + busco_merge_btk = MERGE_BTK_DATASETS.out.busco_summary_tsv.map{it[1]} } else { - busco_merge_btk = [] + busco_merge_btk = [] } @@ -514,7 +534,7 @@ workflow ASCC { busco_merge_btk, // FROM -- MERGE_BTK_DATASETS.out.busco_summary_tsv ch_fcsgx // FROM -- PARSE_FCSGX_RESULT.out.fcsgxresult.map{it[1]} ) - ch_versions = ch_versions.mix(ASCC_MERGE_TABLES.out.versions) + ch_versions = ch_versions.mix(ASCC_MERGE_TABLES.out.versions) @@ -526,8 +546,8 @@ workflow 
ASCC { ) emit: - software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml - versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions + software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml + versions_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.versions } process GrabFiles { From 7405a9b77dfeb07ff5827e4b6f8b0fce4f57578b Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 8 Aug 2024 11:49:44 +0100 Subject: [PATCH 105/117] Updates for #56 --- conf/modules.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index d4bce1a0..0b12ec3e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -26,6 +26,11 @@ process { ext.version = "draft_assemblies" } + withName: FILTER_FASTA { + ext.args = "--low_pass --remove_original_fasta" + ext.cutoff = 1900000000 + } + withName: SEQKIT_SLIDING { ext.args = {"-s ${meta.sliding} -W ${meta.window} "} } From 600800673a7ff14f4fd0db5fc60043c315ee7ca9 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 8 Aug 2024 15:29:03 +0100 Subject: [PATCH 106/117] Updates closes #56 and additions to the output.md file --- docs/images/mqc_fastqc_adapter.png | Bin 23458 -> 0 bytes docs/images/mqc_fastqc_counts.png | Bin 33918 -> 0 bytes docs/images/mqc_fastqc_quality.png | Bin 55769 -> 0 bytes docs/output.md | 364 ++++++++++++++++++++++++++-- modules/local/create_btk_dataset.nf | 2 +- modules/local/filter_fasta.nf | 2 +- modules/local/sanger_tol_btk.nf | 17 +- subworkflows/local/yaml_input.nf | 2 +- workflows/ascc.nf | 2 +- 9 files changed, 359 insertions(+), 30 deletions(-) delete mode 100755 docs/images/mqc_fastqc_adapter.png delete mode 100755 docs/images/mqc_fastqc_counts.png delete mode 100755 docs/images/mqc_fastqc_quality.png diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png deleted file mode 100755 index 361d0e47acfb424dea1f326590d1eb2f6dfa26b5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 23458 
zcmeFZ2UJtryD!S#x<#o93es(Ww4k)maRbte0-+a?-g^xY-3myTE`8G_KvA54)F1tn})nJ5u%TA4Y;^!^{48eL_}p#q-Umo0M|F1 z74+PQh^X8N|9_jcWbq~ zzn+tZC9B75nKdz=gQ8wo9GJ$P{D~3knlI_`-PRhCw34f1oYDLr^;oEbgxa#A^J%*2 z>FfDE*(~JzKFs$t_oeLz))qDU?s}%Q?7b~3Y;lUi^Oy-2@3g?joA4Wkgb6-2=ih*jub)~7yZ`T=L=Z`B`{1jhkB-iSjea94&Eo9A zxN59pv1p_}RO1>EC^q}Z2)ZI;b7JV_x4lMr=Bker2+EK;8~!;JO7re*@ZkDmoV878S*N^yX(F@U1yqt?Is3nnV>7}#(5pk`V3C) zWhB8;CwWIwsVIjH+`<9=YA(j&3DgQdFOOGU~*`36wNC&QDv8> zr?h2PQgnHkp&t^S)q^K!68h~`$PjZW&-Wns;Zlw$M2sc z1xR!u{m|Kih*|Hht#M@eOMM#8O*={^6b9k5B5^eBsrnhVHD7XZ5BWO&F?q(>Y=QFl z`f>yQ9NCoxZCH-1F{#mz_j{QeyY~4h*VeyYZ#S@Z(Pnb7G=ud!RW)5svqM*&GI_za zzn;8LkOTT?``1Ygt6w!2;5arK*o5k15cdIJnMg)IQhF_zVK%!ma$z&jL zZt>Q{!PqKl^`Qw?nJUOEm@@qX(y(TwSJ~dqW&M@7-N4Wk_wC4izx(xJMrmNjsl$XR zCyK&INt}7@FzNAbbg-nW)sJ>3->I1+2~YdlPsaS}^X-H0GR_CEsw`PGjpq`uX}8VP zJ)HC34>D(z{KR9;E&z=@?@q_|I{NPOj~g>w!$gR?Tlu~F+L$Mk%}xQEm+{&T(5zkH zacVy0k3w!T9r*p2sgX@V;^+PfUYUrEde07XSV=KSDbkIZU!j!Rk3MQV=h-!y@kWVB zdYkmu^fiU~pp#ixe4hBEMx7^LdHa z_L*14aVIHtrsR)SO?=&kQS&JR#^AVvln=P=bUXEIy$QB&!s34znCV@y(C%j9V=}SU zoYLHn+-Lalm0$-=QQ}a(+2dR*{DPF+)J4y!ukiA_T%dF zVKEk;c?LWheG#A5{A20}CKjMw5G%2}cT5@Oce=wqdobHC70=kY7}dxt3diH9(Zcwr zCabx8yObHQ@#e_wjl%wp8s_!Wvxe5f-Duin@obgt>qOcqN$$@{X^C_rEDh3fmM;|X z$zu4;D`{YRbaJ?o!KkazII&|th9v5MG2Mao$ytOHtW+wo;XJJdtLuGjg;d020qT++ zpD}e&o?SeKSqR`}4`OdkWNC7K)Wltn zbwBrWGM;bBGm8uP_RiqfwvDD1f+uRX>b=nTH9Y%vpg{ka0e*E>%<+3!G3#s*-1D>q zHg~1@BT52a*L>mVcP>6y*0iX8@!3tDFJLE+sRlnU(cl``hF`0Q>e4i6P8|wKmqIqI zoY+a0V*Bib0`F9nG#sR(8$^!IWLR)cE8@7XZTN%L-ucJ{9yijy)w5Pom%XG7V<^PX z$Z$U82w0qgcGmld-O6*e)?pm$g@!6`Pps5SPKccjDf(|vX9zcLs7t!7cyyckZI#R* z#lj(HqfVeqyZ+Va{)>65sAb3IQ%a{9W^_F!5!;w=XD}ZUHFH$8=Xjw+VE)s$q(nt> zE2^aDYki5`e73RQ=DxaBNZ6CK?XKCv@V}=y(g?YHnFaHfXnl}Lo;36@?471W;&#Se z>pE*@M{Y?CevLG8il9#HXG#W3>;o$1``EYBY5i<;JlBqj2M8Y2!+6bPj1(S_bOksY z<34UQE;=Z>KiL``pYd}5fpOOT)GJQnXfNiAc5wgJ>F|$Eqw&D*Vmz+#mM0oFD^`-^ zB~SXe{T+5hd$gnKd7Afo9cy&Lii@syPDFDK)^V{iWEAEO@?xzx1bd`ta z;$(vG+=i3~9|D=GX%f~<>eOVjy~-yRAhLf2dR8V<@M_`C^ev(yOTg{uf=L3uyDb-w 
z&)l7KXS_HTo87BxI}fXF{ge&5p&IHk9M1}eNAwqw)`eZSOPFhqjS70{hyE@C{oSN$ zam*`-UH3RF-RWEP`^Su1q#n_J{AncekkV4m7YITf%QHBo60h@pk4N4O}hhf%rxuIZGiQpprVMal%h7?8+cY#L>pYnx6v!EnuIgInW` z)w!NuTp;fz9md^}*x@K9+`^2LO*bZp1^?BG#iS@(4i%AB6YP023T8Eb?M5K7ElSpe z9-wA22Mm}VwDkmECLd*}a=7bCf(}@SHs6UBe)Xvk(+hQ^^unj5JBeo$=><{4PBI%P z4_9XQ=XnE``;1Daa6f`~rGwNj9{YXY)eIw3G90Ip+QEWg0%?g=i$UHuQ?Qc0OR0!w zv?BvlQa!QMyI*IP!0>goBt$xo2^hlD&wRp?$=}}#?q~Yw z{**_|5&yL*Epz|4V#SJjg-lNaIx_{sCL3R=_VH&_;oOn5J2P=h!0enu-i%FAZ- zw`Hm*u6N*}&A7pAqr>-?%0(lveb{r8>hpDmex?Yo*8!-%1?YV0R~VEPBFp>)ba=mv+2(#>WEy0yxHZX=Cr2 zKmew%=^>HsD3BtRR*#H!@!TTGcI&fHrVh)P&|X;>)OHML+uWDn(dlsDjXa;5uBM$r zdt!r~ig?5iGbx!GpH+kdG8k0%;~)Q#0L6wFROJ}^Z%DvO3x#yNk13^&ccd&l)BP9h zD5cU-qZg-rV3Sg&?)`x}cI3`zw#zq{-eN4pNf(+?QuOG4oZ7zMGSVqOUe>`u=GfKM z{xPCciJFw9%Pk+uDSoormR&c=fS#hGOk=RGUtizBOoY^8P(>!Si|I9i=1ZCQbcc)5 zgE6UED;+b$4u&#dhZjdXwO3tpG0QaQwXrLOx5YP#TOaS@FP!h|G!z!Pbv?hTp0eQL zoUsiv4d@*Ck#ID9-ua|zPbQepcC4a>>9-bJApd()Wg%}hj#%A4pO-q{jIJ$f-SL7- zo&=keG_jhq$Ty4e|J^l6j6TQ=W)|~&Ei6gRn<{*^cFG*tS19#kHpMD7Y;wb~!3_%X zS_-3NQoGiWCX!M-Id;Nsg7oSi4VJ=Hi{bYNfjnmTq?IyK@@&_uacfb&8h@DIe70-Q zZ^KaT(4UX*vf7@A7CY;P!IVGIuXPRIe^&71Z1EyHO5&^=jUUKHF+h&m!4!dOA+!Ed zfA#uQ&p6vD7|O8(?5`bf8^gK)6p`>+$c*yG?Sw29;OD+tp}kDD9augDAEXWbSVoie zpHF1Wj8lWfIZ}mx%(2XREqF9!{fNd&iurAaoQDMCSNo!vRHE8wH%QLLZf9u;ADqnxOaAD#VE%Yg z?Gb?EmGbY}a0|vSZPlF3z6;Kf669Bf%h zlSGiY-}E4LFurm_CJN)(*l?=uX);o&R&qLuzENz?9I%S&YQ2>rVhx#c!hbvWLL!CI zA8mXM$zjnnJ#Me@-99}hjxCE!w8|9w{SBlj%Miq#dvS5GHP!DxO$sDx^4PF^#`;A! zb=bZ1pyj{R#9h$r7svB$QlJqeF1cp*ubT12UZ!deKFG%1N<@S2x&2UtqsVz zn=gF&$D4i3x7&vdoa#^cS?bQuP69OpspVPxm*%@DSWf!NG`o`y^R~o1Hvta;#!r%i zvEB~Jsi~sJ7Y35P!bf?OQin->fAk+TpU$Ow1st|l9|i2rrOneBP3&aDyoUj3K{a7! 
zOYpnJyYD#nr4GNJ;@$ce2dSN=eS7f-VptzM(|Ek^ze)mPVrpAEgrFs3mL>f(ZwriH zCZ65HdO0|W@2<+v9t?J=-4U9>bvM@@Ew4uVZy@c^Ovw9`k|$!+CTAn(u#4kC7TVTB zXuy#d+GC@RIMaPyp|Y2jS%RJkktCracCaLqfs^i^XFqK#3z+d}n02*VDF&My)vp)lNzWx<< zGB7hEAH?7_joYR?>+&+JIas*%Oiux%kr*X*B=8N8Ulowx0MkRK?pR)K1F_m8>dSe54 z)48k>#|F!OV#yOs7xQNQ@1iun5pl;py{tx+o044?r{W2O{f}3r{#QS#4bf(|f9R3y#6*0YY) z5Ey{M`dj)yHl)B{sdmvti^b0IE5xFx%jJM&5w69;`PGy0vGk2ztSW|5H3~zhXO?mn z+4mo>;Y7=4&gC}HifyMO`#70u3H6;0|| z!l=0lP|zVF`bfxm{%i98943^7y4Iz};Z9F$oY3iUI*FIsYa=o=nS^d`;3?*wDxi&| z=?oqs6uDcd1e_e5z7M5q(+I^PilSRE(T6%z<=U8%sq63V!wELY9Rj%#Y@2Y+TEJ8(f_Kh0ih?l6E6~wDl3~?-5%7>d{ zKs0XHUeORoi5+U#M{kE!Ae%|)^dabh1DsJI9N~LVXp*8$XlOfc6J+Cc?}SM zsc3N~L7hzcpXn2>b(_YN=J*C0N}$f_NINTiV!~L}nA{wn^XfBogd5hu!G?*THg^mF zFJm@9m{X~X3t5{7 z#lWIO++R8;BTByGl7U;fz|JBB^*4R|bLvm18x;DF*U`=kyxbH2nD*RIH5AWfJ4^5o z&Nr;*|NreNKo$fUI5}~n#Xcbjr0T-7MV;wZXA(QPt^`x;=ZK)5^`AFgQM?7ry_(Tm z0|EhWs&cYJW?|uvc3af(tfuyDf$28~R=HOa#}3Edru##Wwm0a$Vnk=_8+eQ; zfyq+GVt0Twr^QS*HtI+&&>_<%-Gq-!{iQr-3LYn-6bqW0VW)>%iat!2IP)Jd+LgnS zgI+jJ-I9HMJ8Z*$2FjwK1T0RpF%U`&x)S{3HqRJ z5^;r?VoA(k7*aP@tzB`O5Y26jv#x54xNH;E`KzzLxC)FEnQ<}IR#w*>9sq|zFzZq< zdM1%ynXvcLfZ{Xm=l(Op?=XGV8`BwRiQ%@@A-GnjD+y3K zN2Pm011b!s`3368%P&MapW-PDulXKfpeyRXNjN`lKKgC%CplwE#GrRw#0FE#Q4>R+ z23B4CmO%uy8Y@;F$hCHU6+oJ}_cKgm|4Amr{$`38ue-?+GX1T!hd$w@x=z{w30Z*W za@$MLl^=f#*oR+8(&a&`E@Bj{{1O;DPjj$g9U7~{m*?^Tj}Rrc^wc=(SycXVT?bW{ zUus*6{74fo{nOh@zQyv0g{)t}Qekl*>KXQYCI9m2jqge|&Ntj{V?gLs*_GkeODYhf zW39Q1L1~vk+#E^S!nCyO&z9Wh}2=K}`9#{=`j&)^}8=U|lz}DqgAteVsos){s zDhK`>&pK%cVuhO7tPu7@Y4|yXAdHs!(uKDuLL@i$Okc6Gs;2456Br??ZNZiONAe!~ zvY5w1(C)E9fRmpWgWU2Su0u6~9{@wIm<-lha;uuEN>&C^FJ#^|oopkg``l#i0&{OX z%rI6Q>l^9J++K19D;HrFU#V9o0M`MBTT#-(q&A{|n-`T~CgAFET=$E_&pIQTPE;J#&nrwf2N^I*d zH)ev~7d=Sy8<@syK<`PFvNtyfa#8^JceG^ua^o%!fl6R&j--jGkz8wS`EgfEZouOD zr97H059Dj(#$*$-!UQLvb92wS40!wJc!4K~lq-K2h2rXunCs?SjQERnvv9Fs?tF;y zWUTcQ&PtDMbsUY6_&np`UGMS0ZZIhnDh~p{`Bryj7XS~*R}%z6 zUO^hJn$_-CW(;$)hHu0ej1BNqv^o%*D2gR6zUvCZyw)ddNB6JE$;okhf7PEEz|dRN 
z$sP&o`MU(L_I8mDW33;)3!U*;HRm$zVV%%zaDn^*Qj~RdWdFNb;^fRhnF&{oeY-tv zq$p~pZw)Ls$EWKsEZubtx_9bpdCfsjdy*<8_Io8VtCIC+8kk@Qxdti>xnu}nRYJ-y zp8$3YP7u;u+YlPQ2`o_>S?mpXvd0-x!Z3=}>ceWDg*e)+#wQLE)Uwhneo z;*y`VfoY<#lwT^k4BP(ytfI;M`FoYsedi}L{1V|Ho}ciBs=`@vtgnieHdpWz%Vyy$ zlnn?k0KJWOnlJD9>6y64*X=G{lyl&%pV8Uo&>tXw%1za!6*YYVB$jR$Y0XhB#1mVx zvjd8N4X~{Dd&28RVEkCw9TLN9*Ng!?9F88l2Bl)w%7!97mtx5(Qx%1u6h+$OGa4#qGGGI{Pj4d)5yg8F4O2sfu61u0uM}?$_nH8=0St?`ogZ@1LAr@*uC4Z9(|dIQ z?OH<_%?PD56K*Kty@PQT;W#)tazY~|I7-aq)tQ($$#Q?{gEbJwJK3mnk)|l>XgmJQ z_POHzee+4NEWu0i0zUFmLTF(zvD3B%sp1_F7 z<|O7{-oZ2>t9k~zX0MDQ(4&(YZ#~baV{$ah?o_K1p$Ad`PAvgtuhW(xO{@bMjNb>Y z-k>lsDx?xX;x5*9RSpJe~BwLtb79%{p~+JTs5HZ&#({u>j3kAOLx*Y zW{7^+`OD%vhcxVW39F$jZ;I@H`3X?>Wwt@269f1o{V4-t-|dX4x7L3j zUHltoa@jqToWvn&=0CF%6%D0h50m^)qaXkRMC&Owv8iG~$}1PBgld3nBE#Rg(5)8n zga7!2@yjoBBoF_e3M$ongy7N1L_hT@!LUaCXX6QLZFKcq1r;;Z$sca}zfwaCji7PcbfW7H9p`7Eh$-j*7-=%{5f&}TidFWiMr=NYvc}Q@gh_z)<;^d&F zd@za3ugvK(BbprUX|)`Rk0&+6)#sm5S8a7;dzrqn*f)iXpvW$BVu6u)bR+ywtGne@B61Om=Q)yvb`45S}|LKt&5@)wSOfk;LhZ^UofjlQz0h zm)>a9f&40n$;-ndr=xntY3nOFGmA5POfiIsfgTzT*Cl zU{P;It;qo}n}IeEA1&?GRONCJp3=_!ce2$kKRZonNV+tS_uFPWzeS zhqSPws(Jp?TsgNT7yGtphSz=h2-}y#HTWNE#@LHFs^pseT#RfN*P8yLUm`jG1N5s* zfU25qv2akmjD=Q`s4SJxi@i`xIOCdT5B%W6wj1Fz8)Kuv*iB`}b^(em~z zz4~VcUB9M5@W}s3-SOWXu+*?)Al7p)Bw?jh8_#s)>lYp{{b%_vCY00=iC@I3$FcpY zYuOjg948l-C~}cDxL!%j&X1(H6ZC7U5?oVLQ<)zh*qg)k6HdNPB;PQcbVRXucl7>@ zE`Ga=^8RPrIRE!3E#e-v8MTy%%a1yk_k{s|V-=5ML7(Mg#S@LA3;rEyjF&X1w*^R&VJ>2%B@{=W9BD)oa@0!_Gl{G8Oe+Vki1QQWd~<<~Et zEV_YlJ=t8VXv>#L|FKXIJ)GZ1(d6xUoSPZVFOzMhM$6tgyhWq=@}=HzWm&b4o8R}L zQd7<0PV(LqaHYNNcXtTN4rc2ov$)VeRm&}XS-vamGB^G4tspa#HrPa5#22^pb?s&W zS%!p!fba6R+WLMjkeUo!qpKob}#cMpU4(`C+U6R8i>qlJ&Hbh52enW<`FmyjlhwlfIlxyu$Pg z3uS-Qau7K~%A$hBFocIe2<$LBIbEI!uddh9(JX=++R9aM|DO2#5*qKh#Zq^~O40f6 z0#s@~v{DPy=4^A}ieKe(Idu22Ex4~>p=#u?w_Lx>bHE@Z4Dh%iKrDJj2IJ+qNDIxj&WPRXRSaNz$JyFkpFK#gLAB6G;4KKql{+5w z{2yWKln-fjDCc()q_W&mmIx?JvpXPb{)hR&ok40*!M7lC!&?b|=efwVb@r0;FeD2( 
z*x!h~5OA8DEVr>6PS6o_oYt+7HY+d${lh@ruB?hP=`vq;@uLNGIb%@~*X54+`NY0- z35nZLFQArwtL~;t?sb(T6k;wi@v0FFLV}%b1@;p|R%u%8ROV= zRWO3*fG33>>}We#nQ5Vk3gY2ODY5fL+-E@ zvWG%=(;1n3UEEjqSDn9V_C*FMSXjR{uYKa`>$>D#@FacqRX4qmy{)y4&Gf)@V_BVr zvNEa@r<%e5HW?jhEb!SY6v|~N%22Y0992I>~ud8In`Lf`QStH3E)x@G=`2&AraN&V){PF%a=v)Pu{I zuQ7a;TZAlAgDiVUO+`B+z-8%M0kCiylcazP7I(w|^h*D4Sn6R#-jd7ZMN@iJo=6v2GyL zo;~Df{e7CCta*U4B1pD0lfi=EwI3CTf2}#(`mwSD-u-%XLU(&V?BTG?P-Fx}R5*E5 zcvSdpxqh`s3e`yRJ6%Efp|NYd2}SjJ)h@$9391YRLSU!qq4E=W9yx#}_KqRcG)(~r z!+&i&OckDJQ2El}fI8mdeCHPcJ2=byp-dT&ZFDzLuqc{lvh)^vKB2 zL}g}~j~QUN0Fo{!0BTTKwrDjx#j6KVb>MsCz=!G& z0?uz!q)+3>Q|KAM0zy>+^zjMt4}XE)t2HIfc*Tmi?$;KdI7B#Aw9_O-Zg>98L}4}% zna0Es9syWr5+f5RGVqawtNUt}*r|Zy#6ay+mEGaSGMmMOW%88u6mXzDD_wlGT6!zy zpLOrO442P{0J&IYJjqwrVrEF87ZDTT<9iz5xv)C#pUTTj+d73+z7GI`Ehx*q&zxS(F>^b?4*udLeSbU~XBKKi_PI+| z`R!s3tpv7gX^R3~Cce0vX(P9@UCS)XwG6mNX_eM`6X(`UW>OMp*nTlrcUU?`gCzDr zKR0P?yj9z#ME0=e!>GupM|%&t{Qcx)sN)wVzW*5E>yxt5g6NEc!GR+F(!Nysd6n&^ zN?K|Q@t>y$%H^ z1}}eMB%-GY`CK5%Pj}AkUNRem1zBUE6y}0KA;6;dZu&VyB`KCwPfdQ5Xri>Osl*$@qxi zNUlL!r3OOxC4C`xXPqL4Ec)b`ajpfaw12E4xMZ6=Yyb-WN0LL2RUzLj zAKS$6X%>ekm|3yQ$#-`3N8ah|B+0f4bxDc4nfJcHZ{dlBeXYRL5bY2afSAF|vcc%G!HPxGS8==1)_U|T zNvWWGt}f~OGmCtqW8>q3f@5Go0Rce)p>g@dgop$3UUF3))$Wn6gRX7M3GQ}?tC)i6 z5#2fg?U#)GsvTF-;w zY-Nw9hPGMC9F9(W5F-PUEmiuS(F06nlcE{I)}b=%A7_~A6cEH$BClS~DB|X6Z*IT2 zIpOX|#S?qiLR2Osk#^=DtNG&ym+&FR*Kv8P<@ep!ZLZtJSjcEO2t@V!3dE-*!yhNO z<`xWq;JT2z{)iLD9MQ;&^p<*B%Gv z9;zH_>TGtlGO@9MT_xDkFS4=QaZA)){{?|_B)8Hw-q)H3IPzKPiHM2|2?0GNX^+EI zRf5>q`4yE?GgaPuK8|(quyuVfv-aF(wlXs_w}4}Na=7tnIA2P*pcwxEhcBp%Q-6rI3Rc0j@jnbz>h=|(@M6C7U>fx%lJG+#q2Q4af?@H7>c`6Fw&JpwfW1WFvJ!J#H z%4DH$Nww@r6h6K-1K$M;1QOi8g)GMGRywKGssy2=E7s%k;ESt|W)#O-pRtb)vf8-D zxR2gI3De!E>)xMZTl>m(C!Tx|_c}u7mC!FmY~hT4&*t)mO76L0VQ$Zm)=+l7>+9FH zfQZjFC%h{enbPhuNz~lx(beZsjm#JG@8B$iw_cTSX-?0fRc}lkFJafCcF=wqJsUd8 zMn~$&N!wK2xp3mXuom2=TlzBdg~W^u`*x0IxUuITUpwpCCpIqO47DsRfB}i?8mn+k zO?VOK*oa)bFN6F7oN04eyGiZR6q#;01`nk`g-ro<5USFo8#dEMz{N 
z)FLtwpl>inBl;{0syyqD<@D`l$#Jfl)EJHXIv_2TJFdCbB1tJq2^~2}iq9XvxA^o{ zn0YLREmF;vJ(gM2^u>gGlpZOM>hd=@e@%v3L4CC$gdajz11>;t>9B37u4gN+c2EaN z7N{PzCO`Ov_B8QVS#5&Tgk_TYRF@xdXvUjab#=&lP?prpL~g4|3*W;OC@JF8+0RZoP6YS5=9t%X5j<@=9s zJZx5j1kEdx-027b#7vEm4TRT9soiaOv=y$Y#MT=^nhP%|fDdU^7Ez#Ft2I{)2fQ7` zW7SkW?%wkBWnL)w_~|{}hkUWMk@uEt@uS1%?(3-dK@CnX)?b$25^pIgnsh^HS!eiB z?gK|C)llrf;ga;b^r9EOF`p3yYRe*y*MIBz1Bd-qR8TlBdJn2ur@`?phF`DfaY8;D zCwmvCvRQoWVlI$tetKk}o?MNTX9H3!Y@C`PXWV>S%$VZ{%|p4jHr#UH_Ryyow;{{;KtygLxrG7(#ca)wTYK z-Y0sN6h;=V$f!GPone8y(zPnL+1N>PyLSs(y=`1y*FQ1lR8e`3s=cW#m$+c=3)Tb3 zN7!8_R~a%Ek8tTvTN6~|O}BoxmiKrt8Mkh0)vSD{hV=%yVvnL*%!|m2!23pSnTfsT zwQ-^GnI8{pLlWXKtGU!5h-Pk2LFIGB{oj=);~!Nlji{=PmP~Mqtb8I%bKzXfV~y`v zhZpp~H7qb%5D%?Sa5$&Vmvl)54qk6v;W{B~UlL4_ z81zf;L5bb3SJPuc^~%Ua_>tB)$VLK>FZvy&b%*eB+g)qdbU(k_R*eJS(gX< zJxL0apH$ji6sKDr)n`3{aNlN^Qwkhtd8DRdnV96&?L&8b5Co{7; zvmmb;3CdwVs8W1GMY~|zn1^&RO1t0hBt(ULtGJTf^IAMxRpD7HU;6{ij?XXdjHv`a zw9!c(a5cYpR_vk~eKYL+k6gM+5023LHvMEY_p}y=4k&Q!!C<*zC^2Ia3C3Ji zL1sbM+*p_j602gKXP|mF$s?~%_vnUv zj52~Vd_MWnLq+!(*+*-Lw~%K)_w>^_onjFhcBsl-1z4eAVzf$ZoD9yB+;Sysedi;%NXg8B1{e-#F_eG|zvUc4YC2OlIpARjmdsP@u05 zr*U3jsq00uHQh{r5KWSeeT?KjD!)FjzCJInzFM??L^jL9NcW`?Lr-^4X;Bzlu&Q?y z02M)ULBT=3$s#1Y9wAzg8-+0n||g$cI`eH$?LAzF9rpS6h3c^3UB*o~o`&^2bx~YDhrzULrno%G+^r zq3*RFmK+#R^m@8?svWLq){v0z;Az zxet5`c$dkiO>9f|6fbU>MAIx-Kjc(r4SckyK$1&9Ug3)mVCA8Y1>GV0bcjayWKU?1 z;d6`Ui1G&YLMmdtb&4SB(ffffFqD_1Okq%F3-y=7Xr$+V_G^RS{QgC zXKOBBq9L5K2Qnz3y##l~^f-q^dVo0JTO6ysmtjFF?tQ4=Mh9FhB)1vUcK2(Quo8ja4+LSJ)Y<8ba zuA}O{%Nltg%FD9=r+$Zri;I)XEgq8j;?A9Ap0;b5j5DIM+@eRt2of>UaXBan>ZY7* zVXIJgT25e+vU`n3vm9;wD-XX>S5Izts;k7?q0ifUbXFZ ztu890yFSO?daUUr!gp4FD4cm`X`a_ImZ)oY+O^`2sgS=Z-sfHvxbI807yFk_pf??D z)@elHpxFmUW>0G7ey-bx)DpdGO}*NS(z-#}PYqNxLg1@YN}fvhUtBLqKc+GUT;OW% zO_B<`R#rcqET`udx*1pLFro0I)_p#G&G^C(J)_;ph87-;WP@^*-yrWnJiD`bUJP4q znYR1%sd_A6GDQ|qpc%2A)KEGs;Y;857S{2jmRaCehP?GUgH%@%HTz-B?uYLBrVgP} zH@h;%V${F6+&AJkBG1T_xqmSr-oU0c++uF-EFD zir8XIv!Ke#t=O)W|8PyRa?ZUc=)2$4uI5;dauysN?Iuy7nk&-rwtj_ 
zbqWwtQli>QcMkpbLD<<#ef^2AtKAu7XV^+t%ng>C+4%Wb9$F58#E^h`#n9f!Ps zj#E`k*Ev&FK`3R|?l*-YBQmL)w`1e~thLbiWK69X#vg3g_b_#aGcF(hyvqEk72SD; zu~^e}9oE2m94b1C2NhicobMMlg}U1!FA|mJle8de9Xe&=-H(MvA(68kA0+z|@_;-# z&(b*W+h^U$FizY_L_j1L?db`Rywq|kJ8nKA;QjfTaq4P?Nw-t8PTt*s02E}f>sbOX zogFNsq@})oI`S|>iHp=g?5*Ri>{ zfB@dk5v}dqihux<=+%{)tOw&-*p;K#;k0?3?5LDv#-^~Bshk-i29xz)oSMVH0{UfE_@k=$Td6mLADmA5HCS>H;8Elg7$zuRGQ_PzI@ zO7f{m&I)ngat~(Q!A^05yQ_P6@m+rB1*YFo4Y=~o+^59v4+%;&=jKhGbUydp4sH`1 zy;I`gK$wj(W`yp3Yj2)F9^2eqVW8uZJUv^BWHR7|G0X^Vuta6p*nh6WK_UPW?g|4H zCB73}#_XrDiYLG?L;{a;A`xflU$&e61X|e>FFS;FXT~~Nej^;8D;T+(JOGZ)-YCl! zDic2c`~DhIAgQ(OXEkNRICxKJ<<&$(86$}P>l1x?yCEt=imFk`Pe$TW&4$L37fnx4(%*=smL>0uH114m_}1+sdfuU!A0Zqzr@~p)h_Rae)3fnObHlP6C?me#TrO zCzi%;E6iC);zLiV*o22GEXIF{NL2tM-wS{K&aCtKGNF+iOQ+JaXYw|H4%FRB?7R&T z1KbAY2p!11zb8icU0Q6TPkZCL#ztpG;uZYw`xg!FyJfa%ZgI;OhQyI`fsLCle_S+t z4uqjjj%#Gy0#Ipt92R{W{euP*jXIOxh~qaUFM9L1FgE=XM~3_=Bba|6C*-;_c4HdFiehcxh0 z3i5W02=DV{(OsRR{NTp{O}%1D0O?=QOrHWG;?)^(Uyagt?*2oVuw0Pnoh8{=0EzL^H|PjFP(dF&|L7WETT0GcVgY_ zx1oq}^k1#{aimB=*)HzvnsDIHm*|-4-oMfmwO_ThrZR-9o)Q(i2K8OOn)fj<5|I>i zrMN-NYx$b70)BeTtJLb1l@(5>DzdL{44E$Db`c|6v{j8rk`njaT(d`!Q+zvdV+~uc zwOi(`abOznKOr4><!y3?&Pn`#_&3l#Gef?)=p3_f^Ui;vfzaAOR#H0C- zC_m1^677NRcZrEQlhb%^AG}2eIicl$V9+BoV;Y&B{w1=n5~3`>l3tCJ_iei91O5sJ zlfRNrKdWsWxAWWhrxQmbuci*ftO7n7Oc}WO%lj>uVaUiDKPF^(#js~|dl-WEB(b%;R&%wBZo4s*Feg>11~T!zk!KqRO#H>GQupBCvQnt=r+5tC~|_jcwZextGmQ=bxnE*pJAI!;`6FR9y=}o5@Ho683hnm=2#mq1!K9 z;~t#M?%xqQa&ju$A*O`A5Y;)3bM=^-yRtSfb`+m*&?NHD1^&k_^1V`zUUp zBQjO}+aSl}wx4UqTg2FEd)wQlHv^*CRVd!3FhGRo(ku4))jpO12ugP&rZjKiwWfRW zYw>!=HK|cBWxk2w*r^o8&xo`u5~q#7C$1%JvzI7GnjkBxN}y~)MsK5FzthqT)I+i9 zLQUJe#tLyOp$}IIr$A@HkBqga9H3%Ak12)kQ{#!2%+*+9#70XhbyV%2UkvY~D0|mM zOicCza3cpNf8-DDqMQ{MkW2mhk21pBOx#yO@k>+nz1ZeIc+LzQXaBES&Mc^@EREx+ zqiBmVE)B9tyJ8C(1%!qWVxu&JY>L`J5QAF>)IcL^2uZMMRMdci4TdEsixgYJCJ-=e z(Lp2&ix5o$VGm(RSON)Tn;Yzh>4%xBd6>6bx9&ano^!tXf8ROv|DAg`e-7-iRZ8cm z=ml-2W49d)ss}v#)i{V&<{UK+J~DWlkr^ixT(|EP4_lGEv+7l6mX7 
z`rnoA>yKLGlLdp#ymRS3uTeX~bc`pDe>eR8u{uRKGM^xch?2hX5Bxxz6(kXw^chB# z#7h9KbJ}H`x6PI{mOk`b>sfNpaaH^>y|DfmqK}?)K;U6OD{UDN0WtzaUnVZ#(spqZ zVUr8UHtKKJjt*vN1d8xgpq!jad2C3(uDSb@6AQqAzw;SdN2f_9m=Y%6(PT^t2e zg=!ibR|V#v11NDo)>*m?5o>hTQnM~G5obZpgu!tGj(YQzF70x0uAV}pwc8nXX9bNO zbd)kXD!8@U4%A|o<87&s*`|`dnky@hr;;ZAo2~Bu2g7qn%3zfDbCVL7wu5 zo6Tn~<`BAK((ct9AG1D;F6BcA^^r>vEU%LrOxsOA%-~5M z#X&|sFPm7+R$g01eYw6pxAtP}a&bw{TPi%16;?Qf0?g2_F$#<3}XnXEmOcm0X z!{Mfdfq*I2fU-a1TZs929@5Rg{4M{z@?9Cko|M^ReIRLnw|jnGRaL}G1ibFOa|A7s z+co|6Dsuoxs)B@lW!!Fy@jnb5RF(!^gPXPin?1IG|04fYi3yRqp(DWls)4f1ZERc>4-}4==@QsXQg#VCX`Pjnxeb({{Mj4zJ&j-1gzqTJ&ZexJiN=qXShYkaMiouM$* zihdgSA>BBh>UG8sz{fP)%#B>6)ZZ=Zve3ylD#}%J_s_FUjp|p?zS5nme$D^s9D%?1 zd2a%1f&hF>jr5)w_Qg&=>>L|+n_ZGJ{}HuB-aWy6I|{a6W`Hnb;cfm6{HJ~AA5ZV+ zO^P4X_D8eT5KMzCi0L0n3XE^`Xqp2~J~>=whP^9u!!3KaNy^5JOLz)Qwu7R8tf2ks zjisRN+T82EvVNsTX1X}xJ+r&E1Ana8Qpn2QD&fVB#c4QXwtxn8H8-fA^k_PfU1K3X z>IqazcZf<=_}R)j8P@aQ7;I*x%o;+#m133p4|1XdRsx)DWgq8qRCq~o16CxrvV~U` z$2#Ub_snsmq87&UH8fBu1S$k8W-@S#nO1mvLoQ#oa#qzo1j5WsbiT7n#x9E6xctup zJJ%*Op$=MhR$JZqbv_dwGf|=jmqw4H=Qe2mw@dI%LXLx+E_G`7=_yvYv(qNF3xrZR3f^9WzweTrZ7WqEQ>&+*-xiy?FBw3-ZWJN4Th}bQmbtp<+ZqlYjQPJ zzNJfa4MuhJC8X&CS?MdFHTA9?=isQw$nkr*(2+Po!G*E?U$K}~)F4_CUzSe8@O3kZ^Er5IyP;Rw( z35J!UL`-m9!A;qPy7nr*dZ@-uSCrN8P)B_V9{n(?zi#F`+gKxs#*j zIH*Icy{ipTSyFy2@?sB~?5qc-cE2IAHt=n!gOV&jwpC}hxH_Kx% ztE2W0xmBmGr@cJg0cyO-?r1X(kr9xzu3+5V>1YzBtuK6Ra+RToix@7>2?<#qlBORE zbPI%~d_ybB0wTJa@)1vVt^ENOxF^N8TUJ5l82Ua|j9w5GM!ns$6;8y2MsryfV`-qN zEznw|%v2>{C)I{qY-dkz`?}Fkw&fQ zBN#PretyOeaJs1{;WawCpt=$SI;XBPp7InnGa1cDG>a+B>Gj%*6DIE9rWl)H8{q`X zVd*sdD=SM1z|Vy6zDVL-OqDUa_)7$Y%8SwTNc$fK$`(EpOnd?|qD%^KF$$pzZLs>; zv5g|58uwUn(Y{xXl&jn#G4$KyOX%KD$tr1&*MWVUnx;mKg3#9O_l|8-Q|n3o{>>eu z!`5^oYumbF>)9rC1!*L0!jnc)RWy#I)ou2c_^7-jK29i+|GW6{gJ3&?o*?PGQU4@` z$7-B=gU6FGBh1l6I?5Y{G*rvYh!1zuM?w70^DH5@`^PXicUM2_WGwV*Cy$rqr&KUs z;}joZDc2XLy+|3^isfRqI4kTS5mliCSf3Z_X+6tS(ggtRztKx~?*aru3zmUEkLmby!sE-ZloZO_Y`t>6Y$Ly1P@lk?ycSK)R&6OFD*7$sq=57)m6D?#^$`jN9!w 
z$Ftw}yzlq@^{wmjQf8PnYd!0E?%(f@$3O)+@w>P1Z=s-|+?A9NQ9?mM?L$Gi>i)-7 z;FZH#{oBA_R~(hZpP`gM2$z8$uA4oTeTsro7IypWIV$k;%@-1yjwmP?PVhfhrcFuQ zP*C1rN{T#HanoBrM|UIK_dfItqc6S?i^K#wb=ab?`wf!gEn-xkev5WY+aryTcai40c^)|>K>E+ec<8oTH!6Jvz?Pot=)BPAz*Z5>N7QUnkVti;^*btsSu9JUB@m~FS*n@cgXc6=9G3|4JYC@2aKBbRSEYonlO za7Xp=p9IuQxwVwM&PZnCJ#%x~OjH`hZAy4prD3VfDMm6~t%mQtl1`0vY z*HSSM%jBKyrWm|{+j6?LEI}Y3GvqKEDtH)kdJrmQRpWguolR0j=(SSeI_c4Jel05F zE(*$y81yR2r!Hccg3dmurS^Q(HErm&J9Lcb19agHm=hjsYU3Xc8JP81a5~KKILPL7JFyC z^*y&LQk#x%OoY^&&%X9NV8Xxp!e{Yo1&Fv(yp%lKzl_l9%%8x6n5Y`}aGHU!@%d=C z%jwtMQ?X)wPTTQXsI6($fxrBiWKUnp@$!V6r|EpIV72dz`))g5bBFxBNjs7q0h_?| z+eB8$4^{il7xeGQr?`&Hv+-V>O$Tf^Z*KOwdfAV%mO|c1H&BWl2sj+taB>rPpM2Ks zBTjfYnw03!%t6XgR&N&9DCQ*5^#-(%(Jz$S5s>P!v_TB(teM{aHrGek#kJFI=zD-| zcF#h8!oH(eZMS`5FU^Vlw!V6P zQzEMlGS7gS9xjcGDfav+vr-4~BAJaDGUC(`T{j2v{X^#xw?pNF?_27&6{QB-d@81T z-jvQ!gz*74P}1rns(}HmjXUJydQr5B-n6IgyBo%&<#RShWtQss{dV*2*RaN!muBb} zZBwb|QQl@PVS=EU>8^+Z)QZ_ATzx_hx8TNFo3PrwHnftOgs4nG#~VdD!^6)nyJlbO z60GZ^q1Vss__}XBJROZK>0Z}AUiyRIlw@c7XzjF`2{syyG6|e@>Q88&&ncr@ zyL*nFhnc(7S6a{Y@q4H*1@~P-uU$@Y??fFAT^^bIgMnpt^lYt6P)Fa+jKb4p zZ?a(y9I-9h^0XbT>Ehd`CI8bVkHh_97f{nGrvBL(!@$zC_yMt0=!XydN3CR@_mZc# zzSR&{_SqO)=z+GUr^3#2Z|8}7`RJTNUqcfKh?g2YU$bK6U3AHNE#Iz@u-ounY9?{0 z-hv)})tBIH+I?|E1_`mA!fP^WBqy3Y4a;XR(;wR(FXiVP^nw}5Q*d-Ej6L8FeIGK` z%;B=&-IU%>;#5Q2qwWxVl-YB)%VX;np!}q(Hrr5%~#e840K*K^J zXcHTx3)+WF6rWzaCOLOne!#;jc)rSiKz3TfJ8HH{jDli7`g34i??`x8>?ZHGakeMr ztT#S{d9E&*&kEl+Jr9sDc9uJ{rKTST%iDCs3SLZK9zkHq@v^LBWkl&IM4ozkJwiOb zFJ@BFr3c!#LQ)h73OTLoo<_E(o`IQKgW`QBL8B`n1TD=mdM|4BpF!RqRe0{f z!}sj9;oIzeC<8$;nc#j@&rR`xcC?El2&4SX+3Fm*)tPOw4vf0Cqe0)YKCS5&Gt~@r zw0Ch`M8b9}Ac`y5Jh^pQ;}Om0p;gUQhyK-E=%sI<`?H{G4fJCE8Bg0~Yw`eyyzlZ$ z0{*b26E)cV%nm-^VM5cm%T8daTZY4zIv?Z-=4^S0c1e}bT|tl0Q2xF!2)*JqxoqPu zzwg1BW^PPsEACOnTf)3YM2VZz=W7+7O@!6*ZcbkFflHf{n<}Jb=R0k%wKvp8K{95! 
z$pt;c_|DCr`-q29D}0Jo1$0`sIRo}!YjT$oixKNbi+kz)J?`?l;~g>YNifUW=0DG- zYBrDfcnL$m0;t6Onbp&hY^G8DV;IwC;Q3l8RRB%qZ4@Cjcp0VdUOW2yl8X4`m3NTNM5AZhNpzK~ z&uW>?=+MOHR+1U}-QJq1&EjV(W>ck82ABBmrymA;NF&-Rd0H%aM(Q(##X91M6JK1h zncX~}GIHf%?%Gl(hQdac_|HqCK*lo7_1hODTyeKpJCZ``dDdph+Zf*EjY@iNgKfUEl!h{(dmX0U zNbz!;kR{sBr3x_OwFRwzHcMjq+Qd^|;_NSb_QkcJeIirtLHIsFi9?W?mw5}-ntn@w zp8ke;z?rkP`_|2xrp?dKrxG{l6MPoj=vB_NSmHOjeCA(FV=LXNeov;i7%CAVc28G9 z@mmb6hyFD8B|rL1Rd%Mk%g!+s02W^9s-9O+^623Mj%Ds*tiBicI(O9ew4&MLXpmsU z^r71~MeXK;ldWsM2Wu6V=byFJqzATP#3zt}Dvptv`red+?eANkC&_Tz^}X6lIz4QT z=4|gqkA#pk4_}<`Z8htj)rv+ko*pr928n7rCSsBi*6(HW;cM+m29P2} z!v`B^9BA)Z01N_^hi#`)S9UH|+jgs0bD&Dk5vERZb3*!ZH>T|x0ZVYP*VcijfX(_@ zUGo`;5LO${U%N>I@>!{7n%wXrt*M;e83%!iq%TYl2Q6T%O|_HmG6MnCTs1}_o}a12 zmX_+frrnPAIVWAZxGn5czTuRDpLn{lWgd>$xrCl&94NcW4WeSC4<8m=z>K0w~a56+P1wDksK7nRmdn4Ee zq=bJC5eDh$Rl;@wG!s7z9W8A>EKEHl7uX-2KHbtCX+rmz6ZCCyq+AJ}JL=rJ9XaG> zc0_4LFR^}Nqu(@GPlJ{U<%~RiBSj!!U+O(`X~9)oy?SiFzO8#ni7%Pq)>~AwwRPmE ze_7!j-)1dPzAo*;;{0NBCUkzAQ$uN$Dg)j2qs!sZXqAq8_glj4a-dQO+U3WY9(o@K zpZe4dRjqQ`o(k4zxSoPv&Q{9ykqo5Z$7Yp)1U;p{WA(VZs*`H@nl$cjcABq(>)V z4s?5N_!w`pHsiSp$B%E%>iSm8TTbt6;YQAcua^$WT|6m2^lZuSvvmlU-t|Yju5Ca5Cb>mVJixq34`PMiwUGtt}AZ4}nLGr6Kod{&6Y zL23K+JOusXTZFb&$KkZ^W+s%0(kz*mg_oJfTo7q5DSX1X@*xE5(7!Q*j*vk2PPuCYwgK zvyhqQUV+>`k?(d+J}#z)d*3Qfo3=a9DO}4r_BxH4XV_0)Gl?0IWpq%Yub)OOVcJzs z@5FQn_}c7jruw>Kr>!mumWzMqYjm9{gbh+4*yAQFA z`s72sHv3!!_uuPgnCw$EZFA~3wt-&mR~@(I9$pBYf-i)lQkcnfn=dui!fKp`f=qMf zGFt>Mv~3KG=W#P_DMC)VM_j%4>g6vMd$p@|Mu$n8G62@#JE88MO+eyvu>Dd0q4p}r z*_wDCKkHd0uK2x1i}li`xrDIGkxl>2S{v!n?{=e@WS*C+Df7D1Zgah99)mCAHRME+#PX!(3lN1tyq=wT z4A#BN&r~(!hl?8D-(8q?pbPBoHJJs7`@|k~muzS?`<%BY3SNMFYl-# zSpNE*;$dCwjgys>^i6)kf_KLvz&kOo>VZ$g4^g2h;ERF7FZdOpHo%Xx4-x>mh95zJ z|G&Qk*S3oEGcz-Fb#*srb?`S+5oBUZl{ ztFc@4{$KCIbmON+V<1@XIkP&EV_d%Z0;RhHk5Kd@szVHg4sn+t6ke?YtZ=e*eNt@7uFX{LH`VP z^yuQ?DeNfC5hYr{6eFhO_!#y4>pYskSNdV*DC%HvK6rS&(8|h66ttI=%Cy&vI|72Om90UCr7>1mT5s8(#7L*CZeotBrN>eyyZ1y+y3kbcz4m? 
z-vfEW9v<~|b#Ecyu9c+N*w~Yk;0f+g-I}NLF)?J~p&BI4_yh!^1j|KeVf%`?#l^Cf zv(LTd?p?oHTwI)S7k&r8o%W^hPxSYbLb=HYu?J!Y7IGNu8gRMHF{b0PPqda(o9krR zfCnMf6Qi!TJs-u~PfeG_a3P`Xb)Ooz&ok_V>L=2FGr426Yed6D4eK>rI!RThXoL4Z zf2^+%$BEOJta5P6g<@7tw5Ju^!y9>3s}{sORA`w4DiS%(2m&pAJtZrv1$}_V7~jip zOlV{Z8)9#aa}htS_B@PZG!k5PB|W?gp&jRqcTImZWJBXR1eZCp-`6w51l2PLP|JP? zM$46ErF!W+LZau+=Gv}Q_oJR`^%63KCl{3lVv+O3mipCrU+{*qhztYzH!4Ls@KlV9 zp08Tsu#;Of1_r<4-;nw|U0ANUrWLkt`PuyYD>oUUo_8iJG~f_f*>(A;6&+44G*3=T zbFcz(rmCcU8N}ho36_>(W3DtVOQVP$Bs#|Z* zzeLHps63DlHS0g@i0LH|%|vN`Za4Nohl=1@0dJZp$=57}*hGUn2NtW5n!(AZ*Vktm zgb#drNEu4r#HCy(|6t@_DQD^g*UbT-8!9iDXT%o1zFtNZxGX%fxzTzQd37vPC2Qk_ zLtZd{996+m**lZV_Ps!9M#nrmp<4kB0ZJL(mKp;pt304=i3{bIYumgICnbo}q3k%= zLnN_OI8Z6hEj$$h`9sW&(#zf|)4A$uDQX)jgtU_L@|SfKiabuqpk*}sBu(z^6IGS& zVGu<$C;=?*AyPZ`c)55`TYzyxjnXG3D*#(2~YjfQBB=%Uc-N3od4ttKbpexVfi(dnjDP% zP)qx|aoO*D;_YcU(mOdDB9Dz$&}67?NX@m<*)uSEN{rrkFB&Lw@4G-`4dPsWuNcfI zBg&^zY{;aN#>#Us4ou&w3Nr6q^XFxvA=R`H4b%#FA1tlnsitVzCpKBH6?-hTqo#US zQmfRH!n0Ebx<;b*87&`E?4wSGru(E;y7_a1h~btRvq^RYgfcZD<`*=R~q$@dq?Wh%Bt%nbs1AI*a|w7 zm4RUOm;mts1-ZOP?fOaDIt19VbY`!y%b%Z7U9MYY0PibYEos;ZqDp-qD5jY%RU%k0 zf0A~;2pBOERR`qNsA0f|6F7vJ;leEZz{33b5<`tt32|_%Q`uU$a6!E)&g$#u&Sqis zjAgY}3tMtkROU4yPgRMY6rtJ|V;SYC56ie}1|EoFyY{CaiW}OyGFQ=o36(tAJ@tw6 ztvs04Ll0~YH<)zWeFiq4Z4e~I?>kj@U+>ZbVPZ^wLel_o!6A8pQE#O`*m*xGm2yt|-dK zogz9zqRwH56>=3Xpz*o*i)8CNc^iH>-a=8&G;LookL4Cin=-g;U{(gya0yHQBN*#V z-+9Djl$3?2p?)jnMYMI&ZTFvgu1Ol6gztlRnVYgu4ydv7d6NiN4Eq)WX+7u-$D5hG zzejcxt`LNOA>B-m&f|^isE63nL>{UhSZ^hY8QNd z%9wY=@rL0}Gm4O^7DVQ;35b6}ESjs#M4n=;_g0~g;S$;%PlI=3#T5TN(1vIx?RG|& ze?9D=$d!>9Kz$#HT;vNmrq7>$K4ItKfesHZloYtZd!?*Cneqz4G95ori}yN13AMYs zw@=c+oYS`n+4=%iskM8R1uwzArwQi34YnZPTKkws->Nji~nkb z-JKxW#*N=)Wo1kCrt}!YlB73}wlQU8L+;+ai|AZCw&yw$6A}pUS40VjfesufM~jO% zJXCarj#^q;E2~VlFdf&a8)YhLd6BDOKe4HUJCHUYvD(XAw|k|Uvh3E)k+~7JUI;{P zbwQ};*;OQkIPt1B?M0N7QYl{P~Z32{(ltt)fva$`&O@I;js25et z^u|d}?fNZ&B|_gU27y1YynqVGMFqIb!0}1ymy(7o9!I`}yT|?LvRaAB@yV_=Xo%l4 
zc?lGXp&^M;o&Jqo$9=ST3k1{%9j8m#E;|&?kFc>5r;=f58-FfQ9GaYLD5&n?feBtL zqZQx9J?999Xtt42MeV`4%QxS zvSxn6oF~cKdM|UzA~2LWuf6@t$S}R7#DE7TE~@8b%&SIqlZvq_;??0-{jI3mA9y}I z=r&f0BuGqvrgGJCXGuOdyt*1G`gG9nz;-B{QxrMhhcmV+MZ?;@M`Fm{VbG+f?v6~q zn|1Z3w}^WEF8(a3T?nOX;hQhz#`u9l?S!oJvOxp}ol}Vpn3zN12FD^2R@LN#~aAA#Z%DCzEEK4h?B5E47AWNEtgHd_*&qz=gnKjQADb(QFEGm z=k_MMV*S*9_G1JV*GIwaek=EA`_b5Fq8BLfUVB69jYkY&0#7~Ny2Beu93_J3W-B$N zeR`OMwW!P{pnPjYKU$V>TTNAmijMm<|E2)R3pki=YaH0gq}I-}1f1N+deP}gO##jI zr;x2Gsn8DMs(8O+7&a3z=t_b2I)M>89E!MRKTF4dtw7I%e^Y_L8MHScesK~fXOvdL z`=2Ozb0TD9L-K^B?@HSb5*`W#=Sp!`IlRVIIznnIDh(#t4B%IkuaXtBaMNNuZPnMb z>gxG@b3a8e0FAuo#Ut0rE=Zo?x_hqjEly%-I#sJMF)*P+#$m_aMjrpI_IxdZd-zaW zGc`q9xfmU*O%H4Pguzr9TjZp60LB_Y5@O>;=?#C+5|j%@{;B>rwE^`fWpT_*B#5rR za!?D|4jL=|Re#)ZjA4XA0c+?@7 zrL9%1YoxjaPml%ZLv8RuCq9{T0U2^&Cu3QoB*ty~svl6uS&zTQ^{lWSmUmzUI0I`G zH4RXH$_lev+b9b73#qHj$ZT~Py1gje3k&?oi$@zH`Hd-UTq2oFK&+{qbykpzK|3{Q zB@Ob#(f>ppxZ7+8%_td4ch)l=2>hNm9J8jV&3Mf@_XB6hV@W+xIl8U?E~wpsh}$8n zv9YnNOtCV;7EmmztE&-O1T#B3_8-@^w6zfs-W)|GpTh51otY_I=_rvyH~gVG`u0F< z5TcwEJhbSh5Q2VxE%X^!-=$wG7rrN50kSc`k*4*V2KYBG*~?`NETlx4Ygux6eYqg` zZ1q&@Lt=9A?dxj8(VB*NzL$mj&g>cX{XG!KjjJyc5`ulwSSp|J@`?jgA~CVBShvbj zwHQeqI61YowaxZJ5kEa|d_Fwf&pobc2|I(9Is;!59O8&^{H>A~UK5h8)H~E#bO(%7 z71>&06own{+sY2Et*uq+-D{;K2P(=U3|8D{W;Ie&CeR$DD&e}f)DI{*i;Jd6fydDB z%gKw8zgWun$ukL#+w$k;=Hx&pCRSJS z7UIDkZ9wVOYpidSA>oeuv^__akbqBsk1v9##B&{Cob2qJY(v2ud_Vyj931TJWdLfV z8mzLia%fcD09lwTb%t!V#iwvcqA9n5(vvA=yYON#_RlsZ534sy@DzM`j+{*Rz-0R1 zh@or!v&7~_A{)eyk$}!zc1e*j9Dh(HxYmnS2 zQ?TOqoZ+2SHlA=}foXlWR3%eEZScKDL5yHfaK5hOVmP#L{B%b`chJ+qwbBmc>buNx z5aoj#$vGD3UQxcaCugdTD8y0-6G)(9oV+V>Vq(T`rTEv1l(+=1Nbhl&{ZmF_ z%pZ4@l_tyRMfXl^JQIk1AraetCnEB?X9k#F@@By6NbZfeRO*SSr;(G6pvUn6js2L2 z^_XXkn#*wVj$e^_4L8NQJTu76fiJj8u*7?Eza&)LEAw_IN0vR2%Af*hI`-BQ|-sIu32GbNaWR!8W# z(^e18lCO$alRw7TJbpcCPsf`XR0T_xqnUK0FIFk$$ER@Y44ftz1ZBF6J;!ZUZFwp@ z(J1m+D_5$d%9X#Gt9MzRlGFW3fC!h!5R#C@(EP6}mRH|`b?R-&TlvSRtcdGQ%fJ$- z77Y{wt#4CZm_4n=d~o`o6fe-5t_%@MG$sGvHWgjoZV{Y1uvitC!9`TPX-tCpIJbYN{& 
zxKz6lvqs8lQ4!_EZDx-XA6ap^ml(rgL;Jc(kdfQOFf#U54)Wom=4)zbeDnzk4RvvL zt}CQXQC{QlHdUIAu^XhvpC!YsqTDz;d*x%k6LNSJt=G{In^tspzRzdJ*H;%VP!+W2 z3SeJ+!Oh4h(-99Pw6L?Yv$n>v$x2K~DJd?tv9iLnag&jiMZNlRWJC>t-JA2^D6_tl z^`)iz>x7ZZQtUYl3$H4(U%_jW---y-;b!>%f=Yd@j~%v=HN?g!>L|8INKQ_EDfE-U zTy#c|0Tm^`un@B_d}FCUlYxPux3?EboLXB&00%-D(@sMZC_hD`^MHm2@FpZ)DN>B0 zy*2O#ILvPW)}*Z`DP{MP+uZ{KUF%tE0P!Qnmil%U1D)yfryl#om;!>Ojprp}Sco^G z(E-hDa0FxNVqY$m#H3NzJGU&Q8A*;7-Z)~!Fdim}3@WwEVjj%=p?7=W%jBB1?xT+d z{%o|EfKjuaB;@TKqC%!dI<+=wU2O8B{yuk>OCIKQlH)+QFad+y&V_2*wkfE|b9Nh( zIsi!=7R}H_Z5O+^I7$Sv22GIho?vb+DH zJP6)BFnqZ)?mN;%hrh7QnpziCncZrC1I~ef=N9u9yERF!25LrxL^Gonyj(03v50h! zf6BQRZ>TD_7`|e=Dz)BfdMD`i@YBr|oxKkrXYyE=ImB6nu=Cc+7##W_O-*@^wcHgl zyh8zrqkyU-qNd>OTIX~KexxXJWvF19VwhyV5iVyloo5Y2`YfM!Xti09UN5ic1$l+Z3$%;>iTx!rb0 zULiG>g|rJ?byj@y33+{3zf&#nGG-MrT*_i!F-RHBhZoo~KrJ$1Fx)-ir~nwgo`;!Q z5#l#@-E`3!h0yS9#HP$_e=X8n7AOD zg^kMw-{3pMo77am+Wy6SH4i&4Ec+>N*E3`X)7JSQh2N(!li3Q8L7+hgnp615{MiP1 zHL#zx)Qz*UvlrqQ^*o>>=-xLOOMNQW@6ri!2U(>p{lEdJYE2fz89qVi=EyTW+zU zR>$w{Baxi7K>9eBVOu2xOPZchP5(Y%8FtSqTu}~p_zH-&_uevjA=h7;PW12BY}Z1$ z3l1wF?C*aG=tNwKU-@U53^uu#$-KwQWqZm**gXO*5mDp!s}S!hm`G^jC}${&26Y&A z_W>GtDdpRtXAuAEh<9nPTS#+Au|aKc?KJhK;k?*@>r38`E5!g7H=s_gf1!Je#&~j3 zOCF!FqT*+-^NAWr$pMFg?LXM~1wm%;ewq~j9)%^Y70p-%n;4^|>?G0#pRMzcn~ujW zgn#Z)O`Pjx?%}kjJez`mz-~P6W*y8iqwE>rd|!PjWMx%oPB!(A-t-S85)L|kufnUN zX#lTU-5mP2`&=??rI#I6tCMcAHTtXptNIP9#dBMiYR3B-s=|gJ0wLS8E^=v2O=1NP z3d3z(Y^z7g3)Cv%Yvm(PE@Xv(hl&6h7+6lKS1oko?0W^--mdWW6H)WHtH zqena(0y+4QqT_Fuhe=z5r={)Lm_;gy(N1O6c-`*q#sT~Rprp}TXfE>^1em^ z@ZuQlS6JF)dAM=;7+>@Ycc9k`C=mi=fXog2_$^WE;;~`&_aKY#(XAu|Xwm?$@w?cH zm$F1GZ3Rg^q{CAqG0?zXJQ-a)X?EYk{`1B2-dbgwZ|ro1btIzv72A5W9xd!w8ZM zfhDYjv{3U57gDQR|Ea2K<~(``s9Q9%^9nyc?F9UmQ?L?UiFu7iBVR^?jZDx%KL67) z7BHU5@JoZrG$|wlNb7nMMg2>m#c34GARf!YKrU1i{VaxHn*O}UZAR0W=nr38(wB(1 z9z1#d2jUWs$ZWu3@Fx5_!(%&UKzzGH^&0WmP&BUoS%X{e>AXL>LZ&&;mVVFSN6!+j z+xz9qt9>gcr^>>@Ze7*wB*PjD`@r&suA0Xok`clMS`CBPy?sne0hH){>kQiOs&4f*+X>FIii<^3Tg 
z#n#p~9Z?~(v$LC0AmEHIJh1vzj(6FQXOlz(xYptM9uhOZlAr6?`IlCEr28dcIP-LL zoSmITkcp2JX)3FC4AO#tvaFS=pO~14^dtfUZ?3jzDl13*(1|Fu_5WB-Dk_5fNgm*C z`OhSc{f(t^W=9XmC2W3~+p1!B*M$&itpNT@caWw=xSsdwo4!6PyXIAEczzW)gt$p< zG?{G}UT)}b?j0+ROprydSpH=&Pbk$-)-&W@l`SRVWl~f9h%f1Ywq1+;vUp+sl}Ug3 zer@=L6*88L-G$C)SZ5PNA?(>uDW4Sy55SRPauXINCgw z3`mG1^w{^1$_CZqYQ!y-QC!7s^u07KtHO_Ei$S)$ewJTkGKzjtNVH8{`|HW!_|kkP zGM;kBZ61iOfcYBcKOr?s1!ka+X6?9Rk(~5Sqv2M!+~4;Gu{09!42cvM_mIiWdJcom z^cPng;}I7u6i;_qnXMhIWiJY9TUmIpU}L0IDZhR*C`J-)7GBRhR(n-;yWs<=YA9eS6R?za z39lg~N7|b|+lL44!Q4Zf23!wi^!6@35dUJ5KDGfvxPvQn-9+Qa$$UOZ#5&pMy%sR@ z8vz_o@Q_MbaT~7`ag78RA%Z6-KI*9J zdk=3+U5c^=8UKe`GftW@f}3YNvZ-rD7S&s_+VIdQ{P@+*{Efr;^Q9kE($d;@CPI1F z5IYiQE$A!2z6&iS@8G68detTm4m4N}qdG%oYo_(s1s>zaEd2276sQm@1fUc3>FG@+ zp%5_8aoDd6<@@{J04O?7hxl7(h_0&*ru08l*k70f*yrzxrEusY4Frs56ICC;4QHC^LBg3uSO9cY?v)Fk{Rve4!L zIh|cfrhD932NcF)3`VmyM#wcjS$_T%A)Qm*fi4piK zNG%{dRY^vB&qq}ox7X-PXfGaT_BTq3h=O@zLPlyHW;iPKEFtw9g}ec2Z85`x%CuH% zAf+M{GB!YYy{_!t_@<6wH;-;7o`+UkeG539QTjzk_nVy*Zsbx4S8xD?=TQpfRe~PE zzzl0wx`MrYQdS(rfCk4`-^4gk1*g47muU8QIs zbl)W83cI?bw!0NMAzS5@zP71;k+-;YFc(o4^rd`yu`to0Yl%Z%892f4{75|UZgeM- z5q9d+jMxBjilqc(mGD_)mbHpQTt!vk`pVRCte>R9+7=~oH*5(x10G5-+mv-`51ZFy zbqtu@sdJKLO%89%wpLSO4I5ag0Q}R0e34y(;YhJS9&su=B#NQ}&R$!FwfZ`c7~J>+ z*C=l^KhH35S!yU{J<6cwRfbaDeegE1vQB(?TXq_e%VT&k5}EpsyeT}Odqv(#e}WNSLsXX|#4qM^5(OCX zv0;GRx4ym}5)zUT;sp3DRaI3sHZ~b|!+=b)(4((VC@maT&XW1uch<%$h=_r=(pqJ+(64TIjLi_UZ7fNiR_W; z>c*i^oPpsDQ99}sQO8zVF_p3r;=PjUJVH&c3 ztXlM}{=d>lkVy9ckz)RtX2_IcL_DD1Bsczw{lOr8pb13v^D7sEmPg8^B zu+-4tv2m-LI*y{CzP@3S%2lo5;T=xI+Dl7%fwUo){=}==4{E7Lha~3I@Lc`PV7F6lk0Dch*+& zLTjd`-XfCK71T6fA~P5v@ zwe}q)3=_{C|8D*ox=44fnHIz_`t7I(Sp-j)TCQfe%Z!yhoXf$Q%pzBcNqXOcDoVBZ zfwVX(j`Lb)cauBf8`Bb^^`I;m6}hMsrq|pbUbAeC-^kXGO!RcfD>FW6O^Vr6Pt_TL8bS*QSUbok1spKPn97(M zu`f@B3AS`5iDa>)>{qi0zbb3KCl1a-u z`W2{TSOklXmq1zlJ*FNo0<}+Bu?=G|CXauD>a#7X=oMW%Zydm|;bIMpEH~lg<}$N~ zIJ(K+@b=Y-l<94J8hRU#0@*Nj$^H`^eGf!YB@#WOiD%|*6!CvCV*YN4{NI2+9Ygpk zN;3?vR$(2$Awhbdm7+>PzrT=s?3)zTiIzJB*IeiB 
ze1%82N*XPlz0-g!_pAL{cG-%Gia`(VpRwo~fz)EnikyxsA zfiE#JTHH&z>;n%vj+nw=>s)sb6B8cTz^?fCsPSavW@_r_w9n}Hd*nVRKZj>XX=$o? zdU-dqs79Rn7f@8F$#$x9)|Nv}&=YjgE21}yIuB(p{Exzf_k;k z@|I*~`Sei{ovr|#!+zqSYAj%HWj*tCCQW4eSsW5ep2sepN89 zc8}AB`%lfQ>t%j^X0sQ<67;*}&_UEJ4pquW@K$8wp&|Jbn*XwjvQ=u@fIxMX0T3=Q zwgAG>8k3rv$Y^%RdudRn_r#PgB7eXW92q%j?*f^<(;uE?pfNQb#plPIS8(n7muwf~ zendM75555+qcUQ{i%>S8aiV5Ao~g=A;qWiY>Jd6ftV?&k*J}Tg-z_rq7?7zdg^Pk+ zs4(vfN~u_vXv};##Y{{TPQbEf`p5`25(ffo3M)7n1#I31$r=c3RmmQZ(SDyk{o$d~ zE zP~2h+p&5sT(E2>ry&!a>$>>*!(IN$rQTDZIeyxP8SZysRVW(Iab} zWu98km0)kVV2Txmyb1|rpl!vdTJ6TaW?3RtxicccWo~{gB^Z<$cqWVpfnW2W4emEW z(B;&;w(r1>5|^BgND2qcJs(%`AK?5+{+~Nfr3Gu&@nM(!4KL|W@AScWH;PI)@5WK1#JpZVwXm|XGO!w}s#Fnb+wUDa8fC;f$y3QckY`UL7=2`i?%yvE*DGCSWCqz=|Hr_5R5yxxG)E9x0Ig zF$Bn#KVz|_g@8-;r+=3Y_;*1F--_39QAW0x7J&!rC7|lSY!(qx4WyW@^3$aId#e3^ z&!qdEevXj!H->BEj?Nkm4nP0|LzI8P*~sZpjIC3PoD$^vSO}o4%kD0Y1i9Eu#5=MZ zV)IevQmWUK0=Wh3^;4=N?9$uGQ8B~ZK-ge^-$@SGRnr_FA5~RV$f&1zxLPvtD7Nc9 zGF!k!r3epuwK(2oYGkETOXtzS;mY>re+*v>Lg3oD(3xN)1S9AOkl99p%J25PDANqv zF#oTZdhLsRBF$gh-vS)?|A2*}kdQZ_^cg^QY-L~zqk9xC5FtCoV9AUvd$GdupbAjr zDA(_=W=sLQ>Nx)->DIRQER58zWRQLa2o(rW9rPj>`f%3& z3~7zmB?z9(D{!SU^B^8Z8cVbeG^4{AJalq{RXl@w0yA6T83JsCqqnmQBdBeUAaoCUQCy4(yz%qwVj~CIj|`+;wBz z2&LRXuaWDz!XMKH>_r6j3MR-88QK@jYw->mfidcCdNhMF&oXcvC7f9aGJcqrGXH%5 z?mg6j9Ndh_;wwBu5{oV+fLMr57l?r<_+tf(I>rt0i2KQtV!wU+_DE@ee}72{qw8=Ge2VrekHh((m8dC;yac0QM;ZTR;%GrGWi}$&nE;n6Zho9I#i~$S4!x zsvvi=Sn<~Z0>Xd2Veda>?q*see=&DJx`Wr9pB@=X?VIVdRi=k?Mu;tYlmaLHVSEQ; zHKJs8$XykPsqkCU{!3@5NTCkjDuIOvrj~VmFNta49ZpFDwd1X*vJdLUDorE`Tb7#E z(h)gGsMd7BMSVAQ?Pzm-l?UC+EH05gMv)+g!?lv0-o}O4$$;)_zz#tJ6NJneO;#|k zcV|I|Vw5k9DheyOY33$9Mh_`_20)v=C3&+19$1cH^-^67btEHpCk9sJ-lXw_$W%O3XhRC$M_ZTzqZTW1rMQrh;#tCrYJsL`$&n$ zV4xJnZ7Q*9ES8HLx@R$8Wikv7DY?15J5Q3iSH+tqInTZtJxF(@Hj)Vf_SH$wzPQkY zM_dg*Fh*Yy2&9J(r@+O%%eHY z{fdsKWLh=Vfau|*|J=&_@HZh0A!rggMZJi1)D#fHxR<{&l99~e@sAxG$|s7wMSWi| z9tkE~EN9v75A&HX>u6%YcL(y_KQ@JhI03PIKF~5#=u9;Mdjb&2 zi+Mx%rZ4$^ZUMO@uKuwxgo8W0o;-TlSj@aXgMlE)8II+=K4)&q%8tUqjR+KA=I5W9 
zoP34=2Vjq{H-B;zJPl~NXbfnLh%9|aPtW^(?vMCCT;2vigC~KJ7yJ+G-D9s~ zHhJvs>WP?|3OInj0&IYB>cw6c5LEa5nqr}8Wb>!asOlgcr%h2)cJ3`M$J}5NfeJ!4 z!v7|;#uMad=D5uRtAbso<_Ni)t^R&<7%=$2rJF&L^7A#@#+%ALHXB)iF0SDJly{zC zO{H7kcg9g%ac%cTYalgN&8m;+>7;sRAQzKcsL! z9pdSp-)^vD46y^}ZSo8jw7~|G+H&sxaLztL2KDbbZ0?mi)ClgWC9UwIH- z17CgkS`JW8#g)EVwxU^5+l4f*{DI-wYZ4s7KrOL2cH>;^Xnc(=#Kr}~2eBT{{rL|d z+T{I0lC7_u7L1*@nrq^;#*J{QMywSe;GdeohQ!z2&9Usb4zV2je%+=8FuN-Wo4osyaw zOG%I|3KuP~O(nBoAZKvJ6A99jOgB+t0cj4+Lo|*^>p>a>K0)hdeQ;2Wa;}St#?YC# zjqH^IvcbLR39D`;M=8&11eM|>vtMMy>F8U)yuzWf&YxuZ`#?v2-hm>X!;}?Q@tB8` z!fOmsT#}Re+TGXCMhEnH$C*(=;_j?TzK#I@Ha!F&iI-)cfvO?E8!?-H!PX~Qs5H>v`6bfxFdo14N~kp_>vNA47z9PSn7%X5y^mcq};(@5$Yu`t-EWoV}Nke?`&98vC<*d=66R>Ot`8# z&|CP-8zazRrzcgs{y+q9pK1zgX=wp%_ij|<3-f&wm;7*oWDp6(W09gQ^?%W3)zQ`@ zzb#zM(6}c2hLvGwM~6Y$Vc`5p7&xHw=!*Y~s(2_abuNrPxCD|&3ZLl?0n1h_W93W6 zFEtnb*4Fnm5r3wf;R3RsCNFa5`GaNrx3MNj=_*sq%2s7biEbNm29*0`N+J z?>wQ`W|IhmA&~T7V>k%FP@5# zIm6X<<~=8J)gLm7G<$|s_klLm>pVM&mt!%X>V{ z8OkVf2)fqC1ux?`7>>0(P8yDl9eONSW-J802x>U_D7SKUVN8OdWk4J=8-pFp!QLzd zQ%7n6R@!8d(e^m}AW)q8#|XNO65@Hx-2Y3)5!FR3g(cfI~Sf_55# z2s+Q)#^7fO;5k~N$-(_(>659=$+0#FiLsZUhdqwx`I<~ zHJ^Q!4_~#&g-4JXVg8$PBEVpu$lIAT^{I`@OmXtS5TUWE%kBwo!4fhe^S4{{(awhkNpg=`Jfxt7In5W3@)d7Pu!C9DL?p53ulWm`KA<$hwy zq|f8_?1?44Zy54Vm(HE2uSTB_I+peknNFArf~kp+JZ9*00w|{PTT3>oo<;tUdKP;E zy3bp;%Lhlg%MoWZ%*s8ohb!q*bw_O%fZ<+mo_x_QS2Ig97-(r{b~x1dX;w(Ahb3P@ zhB;Alm@+MXF1aLp@Qm?jd?)fPdg$v)W)C_WnY`pBO^y}|gCZsZQvLGB&i0}7jVtQ4 zJF#^&B;?E?-DxY9y?KP`1a+kHKbQ(h?p5%cI-ETT&0w^qwUaaj4qjZ2f1|$t&3}D0 z=~Qp!^=;k*bN=5r0H|vh{?%{)sc*Hc?H`6{zFYe$%gej})i-mCY?U-p=O-g_;x;c1 z`5Tfk0{;XE5c;eAZ%apj{E;*OJV&qN{r!zUqns`1R*`?yMtRU__9FUccfm@=5%t>o z?GxnE^u3F+rkLTd{Cg(8CbL<;l{g`}i)|vBn-57K zgG0xIe}6tAb`OVR+#5H$A-{lbmRKc1&N^fc4GkH!=M5*buiqLGE^I;Tj{?kcbTdyxjot~Y4)i{T@hjy<+1ZtZ6PrYMk#S__K>z!*sk7$GKuvkx z?Djz=T;wW-XPZA})EM)jR{O|pP}9628^AQ~KT|3*P(rZ--w8P$(%*a3&ZNbbSHVA= zSSGuu62hoS|SV#5o~d8Ie%3Kn`pAEv$wGmycK$6 ze2tBqH2Gep-~V1)3x<$uYp13^YwHA1TXQJD*?-6^4+O%+rmG?xOed7*-k1l0A%y=; 
zo+&mm`J)$+vXlK+AJ>@J-q3;xcxli~dtfOboSmlY92GpecZHh?CF9sl(lAfhRNWWM zS%{$~_s|hk3?4am*~o(9T@QU=P`KarDm_!i*_LDL%FD<{HfKPzgzMUSJ74=1`@zxV z$zvx=tug__=U0JRc+R9+5pkQ|S1`rD&hp@UF6ZZePd%IOY?4w>Go}>l*@NnwtOf?l zNfmKVC=2@BGUqJ4=s;c|>1}a3!>md^EtYnIogbdvoH@It#ZV)P(E0qw*=GJP)G$AF zNo#UDhNK1p>`?3tho8JH$#>;i7FThZyp{;Wn8=TSgW-^4?RQ#+;u0n4ORbwuGN?V& zW*`w|wo(VHzF8mtAtkMN&W-w^n(tU5k-g#!ov#Xj2@Cn>({ds{Y)Z@PWUO1W*0RWrMHS< znBh&n?wo%r=RcECC0y5m1D&HcJ|^j#>#_g;G++H4`2p&|1&=PJPlJSdw(L1z3E~^1 zeF2=%`h77B`~ZyTCXt=x*T*ByS<{=XHUM5n7UgQL)Z)5`>Yjm-b_L13+3FNOZ{DL` zN~Q*m$Ayp(+}AlOWUh8LBO~K{aslYufSv+iH+}-SC^;|1)(1xG0n+WW|Ji(Gz9$%e zKS#nT0^CdknSN%p)XG8T=afjZ8w<3PWlG=~KQOWyC_OpwKK>PIY5DNrYbq-WF88}D z=%5>{>1wlm&Gt2LAjGU0B^}<~|2DW|_Mct+|NU>}{s0=fkxOzeVt898QykPk8WzyC zN)(a`?^2$3WL45|84$tLP3Fx&)eG4o=bgqD%<~KP!{u4iFP#)~J`LgE7=y)&f*=9#d);a7Q8)-D$BoJ^VS zw)A8ajO299nwOo#LNTv>@nxfy+|-&&Y|Juq+c=H=RaWNdxL^ExT-==3J-$u%NR<0|q1J2|-=;+~ zZvV89e1rUh!wxsG3>03jkj!n}M;a9p+h!V#*OkUI-{2e1C3qKF))`H`pwXSmRZI8m zN!63M$~>)KK?NJ27VWY*W zQ)DezvXGXox+lf_XG3Y=;j-Q;AX9Fpc3lBjt^GyOe9CK!=1*F6+I%S)mnNLzBgdiW z5wRFv3J(0jCurDdnG4<#Se5veK#DPYDG#lEbGMmv-sbX81BaIQ6tv<-UF~T@P{n4x zdqIkQA zOodNJUK(13$SPhA9L3h7bd3rL{ z1}>QfUr6?f$HV>3vIIu>u_zfUYk3sixQ{=dyjyP)*-<>Rl-WpN;Dk@-#=pbd%1u;3 zI}77;buE^c4VC9g#%G%EG`Ky6xkT|SFxAOSJyz1}vVNK+j@;#k@1UGcsw;Np7(&b#e*M}=eAT-#<-voHLR(k94qFB!M`88NHLy&+9NzwOjvB}Dc^j3w*(SZ! 
z$>r%KIZ-I3PZ}Bm!Q#}d$##p4_|J~8xGT$(l(aiTeGJQ`=l@vfn_jb#F&cHx#281d zTV%aw&vzZvj?=#Pz9;X6=dy%dptg@S3bVx_!D5ioU43vZt5prXDPW-JTi^nY1 zduhn)cB})E7hrmc9eMY`%JodPjoov$CC*+P+7*}y&>@`DE7s{&`FQyYe25|qj*sh9 z`FJE?gKs#H-I-fS?fs&SLeXwLh5ls;$cD%L*3U**Whf>~YD1+`W=9V*;xM(IzwO*e z5MUNS69f8NQ{#1e#Q3Xh6%5qWu9#MPj#Ad)f=maFvUlyYhEMJz?Iq`e5U>r05PT={ zY;$ziZ&6YieT26!PTJ8DTg}E9DJf`ZDi)aZ|ImzJ-&8H8OCe&{N{F(&_|`l68AV9K z`~xF-A~F}$=&>=4Ma;DphRLhaC{9z&_a8s{jIhivFePR;dFWJ_8IM9Zz|%DwRQ82> zCe+sOMnYGIms+(lz9Zl|Sa;r}br;K=ZJ0JD-|iR3+2yX$xlGI`GTSN8mrKM~RL|3X zG_wFXTFzjlE>t6VXMfQK`6U;3x__y~qE~{gTXQ!hR#rM?njmwN_Z2jIP4C2BjheDf zalH&D&klP1KAXgJF~~+CJg&m&o}=_;*qPijdrEQ7hcGCywgBAV$TK6Sw>h7P=gNk% z#D$2sT8pYK`jcq*lw`tuvb?1HFJMKX*X<@bK2UUBR@ee3AC=bTM_FA2tCz0^D~h8n zsy7B*rI`Q5Y|MjxWxFU%rvEqlmp#5&#T3nOLuCGlU_i;MYLE!O`|@%;cLx>55t=*F z+@g(5+4YKAzx8%8V?-)@s_?{a?dL(3TLtE+C1+^cG50=E0P$`2?F%HXIh1-29v^_q zj9;xJ(r~x;A_M8}__gSs*rOSlQn#wL2)l6EuZJJqaCQs}m^$LnQyPn6@6YLprz!j< za9!FrVMslV2|VmfHJ*7mA}bAvQj!Ffw$~> z+aXTVb@q9_-aO<6ux|$DeWb~l;!U;xqWp%Qmg{M48sE^Bb!>@J1j0( znVzA#l=qu0x16mf!IOJL2%$BYL0u9h^BQ-RcTXNbY{Pokw}^jmrd{%i+D;ioXf6as zeF*`8h>S;x7i0qNZ0&Y*sA!Z2-$70HnrdRKelU?9)CqTQaP-o)kaPj?`n$1??|{_* zOkn+g^jmK&{duW1DX6-u<$$m5@lp(vzdVKw=p6S*o}D;aAgjr-;;Zedm*W?oavRyS zkxd4}w%V0#mO$C&k|hZk>BpO`iZ^Preg+8VGqsXjpc#<!dv!hWLF=PxZdsvP zxxdjp(oJ3Btv>~>HJNW8_X1;AW_8enh_2;GL)Qg_}dl$aoik?y6oCZzkgwBS*tGN zWq+e*&En@~`5T(W>VhE4hw~R=61r!`UueU#prxGCMG;es6dM89yOkjb&yJZH7VozX zVLHwAe~4XeGZPTi^}Wh17IOhOGCjMjKw)u&4C%B{QR?7qyNcjq6a!|;a;*%xrrnoE z1R+Y;N?E#XR^d2E!kOh_OiW#%WJ2jY=zV-3Pk?Y)SxRfFw#Qd8OgD#7X&simU$O}k ztavikwkFOkJb}D(UL+LR{l9Tfa<9Xskn%CEpK<|yb z%cMqs@~)iOIKvItCbOF!ze=7RLYtlAbcCqF6C_>QTRWvKC+4o)xaId{{bn_ZG!=^P zQXiZ4>vslir3*HSg}h)<98;`<#-iudnoVrEV}&l}KBd$H)By4W%;gCtY2xILTO{(G z9V!@4%}`SUgPL-~&e%&+$%f&=yG0(qIrl{3NbXKur)g?Kp-3=zf>Z9a=H_d(DS zW{09il11yfqvVbxD5jM)p55zRGO=cs@-E$WRZAkyq?Qj)jt)IJ23P}UGJhzH4yw0n zFTkb~RtJjie>}l_V9)#iXa|Ts%no$j^;Rcysx-s_n7VHaF)|0PPY_l2Cx4I&vp#G{p!F-iaeM|p}i^0f+VJ;eAR^MA{7~hUf+n)w> 
zh%sR>=|pTNdh`MV6sAw#d=>!&pErXCTY{uBricm=D+SU5939lkdQBS;liLVrnqB$~ zzKbZf-|0#iTIkJ|ml#9Ku;9lgs3Jh!{H34?MzMCMmKb@AaslO7un~1lx=N72_QfSF-e(t>6VS4+W?n1q(M(FE1yW)@S&9g@Z(#V-pv60ZT`MAxOH1}X9w(ma~ltK zkz#Rj)1Mh_edt51gJ#ui4Qe}LO7xfO^nbb8e|5bktt7}8veHbS7PmFrPDwMYzg#oD z{Lwx7k}B9bM2~mY!bil`bjC!SAJR1_Dk+ZHH)|V*jx}sXbcqXgjzbeuA6Y9<>z#z+ z7MqccdbWm3uQA?w{w!jxr?2)TC@k+@Q$y0t3O?O=FdV#OyJ8_AAnBj9XV8gf_yQd@ z%R_=3DvPA=X_y+F`_&ig=$vy}g}w=g!@oUhZ<;9NF6$rY)g8RbvX5A=)2Uuc{bJ)| z3R4)pNbC2EX-CC2v$4V$QHj`DHBOdY4wP0&XB&K^m@Lrevl@k5ZUhYnzRMnI_(uU_ z@tD_)%qc|;D#R?BLMOi&*m64}_$~f?P?)!mPk2_=r-6aW%F3{tgnpmdy~IoCj9N^lB3VLA*FFw0(l*lnVV+3&PuyJ2b3Y6J5D3U-^fXYjp#seSEaJ3C4sJw-vVrNw4Te&sQ3yZO^Uu;)9 zAkoki_0WebPq)Mm zw+dv!g$ix$!6Ns)bY*BcT7ZM_{lF+b{i`78Eb8@*2I$7x&9J_L``(FQCsZ~pt=&-8 zG3lSxqc|&->?wL5IhbRcDU0iflJtJaQj!lH%($2=@U{waSqxXb4(*mqoC)0Kv$IT_ zH42b{pfk^m2oIPrpCCrr%~aU;QZ;NEUyZo=Q;d*}OY7w|xnBguX2i_6SF^j4cVcUC zv0Jt5!Qceh(W-p@r{;o=&uqS_n}>nW4lJtR_ALgm8xVgJ41(Ks+NeR zFZ%UML6MR>1F+!~eh~zeOWoDxRGOcFEhzbap?;!mA_I)N(-f*5Wa#spDGU z3Fh>CdOyuNEHay*mGr@ibE_<_HH|RnnIE%xeQVGbp`_E%d85PA&_le>1J6Q4qFrlO z!Jy`liFaRU{Z2CxW_RXVTxvObOq4^VXYFw!B#RgsBjQ~TIFn&jR?QX;zqz@Wl1F1YlWBeEWsWBJj=nNkCOvK(k4cYPWYD_ot+aYV;7X+7 zI7P6x_gGy+_g3`nI=j7Lw=`%1U8VKSmuoph_9!QjQ8bFKc-wOX<~lSTM5Q+9W4wZ7mwpdC{~$5n#h%3)AK*U6)o} zdv&9DlP<~!DQE7Cq`u!{4>sRzV+;O50eO70dc@yf?>A4@&M&v|J)0Wz{s=8dMZ5Sli6wZCTqbg1 z?BgTW7>b_5IMlM(w#gCOTmjKko*bhE9Ko4htrr(dK@$AH!&{6=he+0th5;bg-KOZ98*t1i7d(5%nP=ag3FOAMZl+T8U$4nc->{a?L;C>flNRi zplitg`cJtJq_-!%{+56LU%uB5P9$3L+j40a9^aH9M%4`By43^kv@=3>r~GEIdz;(n zz;r8t0AeUIenpCf&ek_ zno^0AIi3)fg&{*e~y@EJqFwi!ipU__DEJ#qQ-16{S z|DA|a*G?q5O0iV7i(~(D6kl4E{cEYy_BBE@==cV8lj#gjFUXbf@>n=b zEJMbnZqy}v!6f+6%(8<2Y$UwDAFi~=Q&>wt8FfXri$1iOoABPdws zqp4Fuq@c@$;J8b5){re~y#^Ji-qxefjCD`a#-j2dMgkCus)7Z(^5Cq6TAati zYguGLr0DXY_ihR{LPF?m(?y&>3v5>+k&z4QeFnt0fC_ghUBafT%Md?QuNKo zai}G~GY-WHamRcpCBiEB4Trm4q!Nr~*^ zn{_>80{RM3`+JWeo5c%fb2krHP5;I@y)#h8>^)rSvV5H%^C7XhAmhoBj5M!dO?hl$ zBhL6Wfz5breR5*QV5vhDWmnw!$bGnYcIl3ZV_e{T-vLP3{=%$yj=& 
z!hNZ)8~fzwbtamRjIC`6b?s-EeiS)RguQhYmDf~jz_070-W;*v0~f)4uGx0kp^UC( zaV1p7ZL9Avn-3J>yfU*yk<412vaUdwZ9eQmInrKOwXeEw=uU<1nQMO#CX6;7sFxUt z)8iQE_Z#0y9AJzaDR?kku5*h$-zv*Ogs2TwOZ{9C6Ukjz7SmxEw^}zuoBQPlZl9PuT?ut@#>I4jtKjOCkMqHdziOPd>sSE(3jidh}P9 z&>ODr9aGYG!0lOlqs;yTgX-HLYii(20Dr>&;*%fYezh diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png deleted file mode 100755 index a4b89bf56ab2ba88cab87841916eb680a816deae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 55769 zcmeFZRal$t)-Fn+z*nS{Vx>rm6qiDAOL2F1cMtAuDNvx0;#Q!zyE_zjcbDMqmSlzR zn{)pEI@tSUUwdu2)&Y>bJb7fuJ?=5a1EER^lGqq;F_4guu%)HMRFIHRN0E?_z5hZ+ zJaJ}X&O!Wm=At4gf>b&}x`%l4+)`Lx7zwEYjQMDcig^FRNlM!V3F)=#)7P^V3xFpQ z(!7JTn6R3s!6EcTteK|QPPjx@DDOv5T2*CXB}Z%z@|SP-DsObzPh`FaVcdV&m0)j; zcZ>LN@}*RhsyUw6to^1IV&KrBgSL*D84<+V=b92tLUGmkCzrla{Dr!*h^X~IGAQjM zyD9lfz=>mTe@ql{QdCq_QdAt=(BA&2YBUsY=dfzD{{p(Xxaz)h;YCF8?Ul%1e}5}@ zO@0yZuh)nND%kn8|Na%lH#NLM=KqYOnC|MbCw}whr}=*yP7H-Y`-r9qwQ2rq9Dz|0 zBdN65Kl4A$DgS>m=QkV7|7=EzGh^Yu&HaDh$NCi3wnS$c$@$FVUp#HFss7?l0LJ~{ z!`SL7tNPPP=8^Kq8)3(i@(qbit!IaRj$Duu3h(VXaI4Sdu3~_@H&ak|A1shtFJP;$ z&Ff|ziaT$FS{aiU@Te#m;Cp!+I*IbJ@XxAqIeeeH<$>FQ&-YdyTH@a_&X?%>7*prF zp2!e%;=M(CLssc(k6U1h(+Z6N7fk4b1$pU zx+k}@k}uu*?&UWT+g}Y#gV?3_XQkIe!hs%Suq9Q))|Tlh`Wr-J#)v6)bNt9IQZ-?zd%Hw*=ZrCzD^f-D3r^0KBi$+ip$`A6Mk<3rtrZFNxAf zKk90T99Gb#t7ndaGJ(*jcpaOR-2zFV|0MH`0H4>cX|8kH-A>yB@PzO5QPgAAeG<9~ z(7IdVikhJ^RFhx&6*~Cd*30U>;FKs>ES%nYuI$%8RM=1({ChUX}X7!Wu zAA=&In$O5ezi+pM8LtJ8`oW`oa28+E!&*f>9{W97;k4XXkIS^H4+UAGvZx7D{UOIK zH$}ZEkpj2NC%)GxA>My-R{)`xdTyO1fcg{J)!T^@lJhkw=vrQzj&$^Qa(I7Cu2xl- zg5af(2k=sEQGeBmBNF1c9B_MFCIG7eR|`T^)>Jws({-d$>S9rNoIs$o1qKW1U(s7gPai5(qrX(&Um zwy;AI@AZ}{%d9#&PBP>zwc8=%jgWWGH2jQp`DWYPw4k^T`^Nvelzg_m4tOygvshAx zSic)*_56B2$iwR{sdtKA-$NW8Cffewvz4#abf1JwCg*y2X*Lu~6edkmydt&um&!Yh;0Fgz!I z8S zXW#cIlDgIR7Kgd*mV>IL1+VdR*KujmVe6Bnrwi2`nyj5h(N`umHB#h26X zt}BBFa)TAfq5C^R?mPC5nk4!GljuO$+PG#|*B4a_2>^!?m-qb{I`I10^!40&Ah?Xo 
z5pt;rAZdrM_}>Q86li@(J8)D#f?(9Br`@U}FA1>Jx%%}~}bmH|q8K|Y!jaNAu?dYM~6 zRZJc^eBV;Y!Mnx?kn&2<<#2q|Pp)+P>ZBPmqA2KkX?Et2s&9LqBzZimIWVsmGYatA zRXt~RY=fjB;A5x~rSrZ2e#S!_7>vCGqC{9lj*|V8LTb}g!H@mpp{+Rn_v>x&(6H+J z7}nKf@B4Ld%Z-a7|M0=og<;D>XSx@Y&lV$4Ekin}o2SXK^<>^M{r+%K-I&?XE$nJSn(xJK4qrH|bnqfPU>4jm=e=x!oc#?Jke&g(g- zUucQtw<$SVY?d~P}!t-c2Lo8mx6d`@70 zvP5TBSUX%%C7-WOwciMN4WbKqP5B%ow3f{Z-jx6kgNKYV|^tpbL^<*qZ-A^30n?FBY*Hn_q~jp%0Mg-<>UCF!!;rL{!Y{b z*3Cv>f1?;licgf`G`bG-zLl-3R|wc#Q538g0z$S#C86oCbHSjNy?ANChiOIVH2rMI zG5nGlT3Axtm$CYA3AoOV^jpuMy|ROZ?T(T^1UI_*!$t2I@DM>^@!2%tQ*2Px;zGGh z02fo5-BK-N3cz|cST76mXYkO_egPK}#MwY7cUixalk{5k7n=LGIBj3hTJKhyeXzl~ zGo3fkBcT7$3Q6oSx65M@pbZ+YC;(b=HY>1%!!mZp6Fqznq0rpI#0pXZU|dVnIlk9-%u>~`h}VhYjz zmPod{6t5ndj-zKD=!WOo(!>9dq!*2ld8_8dca!LG1x9m|yPCUXkoxbbV)V`B^QlP* z2QLUMxOI2m3%(x6c>7K);Oa-%C(!K#N~N9Ef%3qRq9J)~x4KpV>itdW?%7A43LDIa z8X^^jrZk!ojDyDSMXww70zLApJntoe%=xcBD#D>RDy64nfaU_M6Z)d7V4v3O7+UfM zI23&xL2-PqOi$oj<6nQBorePGYWBHH+x}3PF;m>1({p~`Te}(*tYP8JcKw|ZaIa3W z5|KeaW+a1}*~V9jOh9(L$~YKYYcNd}*`l$FOU6yA(HR-(cSZ&9*~&v1R}oErionDF zkmE|SIb~(H=VJ$DZ4b&-CQ)fO@a_a4)*zSnmv493+6k&S(%z0p_QJ>psX^O_V9lhrb>BAr9 z#!w93wGILaXkvaRP39@H;n)|GB8ih{1e-l>kB{FBn1qGHL%+#NzbvY3$Xf&5Ir5z2 zPG9!I*3-qPiSN%$8O#PHBV)1VD}P1)O~7Dhj2?72@pBcduzphsN8H)`k=p3Wh%;_$ zOeXLMp7o@Qaw@rwstN}`?{)X08s5C`DQlRw*eDrX7{@P}7d8#NUz6uvKJSkcQF?Ne z6pViyWiT|=e=Doa?LjcWpUG)555Bnx)chgcgWJ97&2EQZf!xal z)p2nI02nbGF^RF>u>$hlk&33=WQ-^JoI>Si0u8 zV07Zbz#>r^qAXD{lBu!00RKml^p=Cv64=~UMF`M+kogAK za9tvbFb_5Czmu~*!Wcf7X4}nlOhFn>z@2UYs5e8zXiDYQ=Ox))S3>&zy2o(u2h5!JvYvSsLq$lAJ%%c;J%Lb@e5mEkCW z?eZ|Dux0i&Si?wGLD+e^#G`KKbCx{u6gsr?6jUM?pE*3wAGiPuHc1MIvY4|WVosn|)%172v_ zuJ9qyLTdW=-$|n#8!G@V$$7Z3oifYzxs!m`vv;S}RV*&e|L#YrvkJalcR(jP&|ivp zdX?VXKmoSP&tSH<4&P*Xc=vJz77}8-1B8!d0cW#BxWLd8o=iJfUfU`0+(QVsx$4{8 zM%dD+!cq1`U^-K(q~!|)T~eLAZia5FB+I+)`mCM=ATeKEa>FyeeU0P0N(2$?H5_a% z1c?1K;t}s!d86fx%Dsml&FIN>)%>u!tJSay-_BD*KV3b8rOY0MRDF}8&W3rMO8Cvd zq4No{`UQOiAyeW&=;8TZg&{D6<%2^Z z!|qE6iY8+BPguq9y#O>n~H+h-giBAsF%%~f&;2z 
zHSJ9+elB|j$&@GebI=dtreMMQ&ghri{%!G?7SS%=%2G0KqHH#RkD(za3ny=Hi$(=p zLGvS3B|d!WGOoC}J8#If=~Y0uQMxBB0Dao47Ri8W79ysyRyY66Fcmx+Tm-DB zhy25cx=95+#qc?ToUlOnSSf2{HM2o=*VzYQSjU+-RrVoQq-g{FF4Zg zE~D2d*8doXY~?Q)$%+d%R^R5T*Ja|j(efj$qMbfNU$|`D4f(?#^kdi{t)k*vJRUdL zlxcwb4m#}66CTp`2n9CPSQhv#x;!Mn5l~6yO6GGaT9+UCvj-#Cg^PfUgy(9?6bFXL zpNb`ZMW&HB#=RloUUl{4T*WAYN0#{>9S=giO>#Fy+5dV^K*r~FnE~_`y9;cG`R|Z< zoOm=C`0i!|j9q)!?A~%82Uz7BM!4{L-9s2&lDz;lp6G%f*Hh2|EjuF*ZTdWkb~fij z6_P^E5528|&KH1y9o-vpP$5xCn_I}+iK{MC;6&BY+8Fs=m!-n;b%SD?b{UHjMD=vl z=|HehRp36=l!l{Nb=j)%E)c-p>$yu+7f<0NCv?~F0Cqtaf)`7bVV&u>BhZse9N&i(A3$x{)K4e9C)`q;|M{`52%Ol-Fg#F@RhIVC{{nI!7gqddBASWD!btp-(BBw zy3b`l5s_nR2<)6q^Y+vd*eWbZ{zSIO{;S}l*pU8|lJn$|PvBuKUqx7+=-R09e`&ej zfx{|HP3Z%AGj5jsR!`dCO19@yQ~>yvW;*!(X7#4zWHpB}1(BEfJf?t!{10!5-z-JJ zQX-eGqE>l9_7%!}cZXT{YORv&H@6?!P^VBI%uu6V6=U2bfK z-nUhXzIRgAtSRD^1sRqBr@J>`*yP8cp7G0o-9a4q`1%ZFqkHR25(W(nc!>F8Rev?+ z2p#E#0X>$-*t{U__3WWm|LRC(^ku5R)_I#q+`)twhDXu$zH2tK)}SV;F#zE0@2 zg?0JR?v@D90Hrb{11&%10Dztc$r&o2>~^QX>Hg!vk;( z#!o$oW+d2aJ3E!HTRLmi#ku04&fiTkl>~TQ=DSMO6nU&V@0^f&T|`G#xX*^A`Jd~q zJ}%Ne)$q(Ccl0IwAN0|Wt_{zb<)PfG{R#-xbxpIXTB^TSg|zin6u zSh5q{v1O+fzBxjo@#?QW1SARF$04v2_)CFv*=aWK_yOuc#x(QJ=Ett;&FUqs;sfxq zCIB|&O^N=5HrZJJV02Sr(xjsQLk19jeTIiI@V|PQ~{$B-zwT*x3pGviT$60%8 zCF!>divF-$D){m87X$&aRcy6G_WdbycC+L(o9?%>1B5-W24q|AHU&J)RiTV0+o^D# zT@WW6EHpXfOd)pp&5q{s?`;3C`S)0Y*FJT?+vbC9;6s04-B?QK(}F_(bAgv9`a9z3 z6M28iWc~@r|2+7AU-9?vZT>GSHUD2*%^6Xwe{?i5`rX!MSZEWDhZAtQj+cwo7%6a? 
zSLc=zv`#AoZy(3i_dRGaga;nDKI!IPS|BN(j!XSr`)E`qYOKB0Wf*X2oba7V#{I5) zk=%1laIo%)G5j-l9>dPfyf>2it=GmbYZG{h1;(^o*K*Rh-V5gQHTu_th|#qnsfD#z z@N=S0eaEKKL8ivW8}}v!0nvu1qUJx#E)FXw=}JTjohk=?^dIb7E2n>IU)7z^yXKN5>F_agCUG}=!;#J&CZeBX*c`T6-#zh=YC zndemokzv74zo3(!G~OKC6xP?%!8h!~ZNg_vh8nM8JRn4`F)hCQXDep(R~_D}48xI{ zy4B6+;dRhGlsf5MLde2Kp_-kt&0xj4>3R zhquhEz2pj?@1^q#2>W9fj)Lo|e>Qu;f1NoyY^u>Q{MwRUOwH>_4=8z=h;cgr9=^=* z?xGoVzo&BQKig6XySlGE%#IRELH|3M`R8%$1||7_>z7ob{BH;Pi(>l!kOxD5aw~vz80WD^z{{}CSKKBaMsdz*X zg6)>mlPEl1p-B3iKpQu{PzB-uPdhWO{u5Cs7TY70bf2c^q^bito#+l%nrww;wH*q9 z9^AY$9%^s&xgT$p@9X{}TC>IZXEuYUIBot@Zd+L=dt8Ib>xM9s`UCq}w*sdfH-c>$0J>4`lZ*J!KJWf!Y{KJ18 zO*eu+eRMMb1qB7s`&Lme!UCS%p^vnj9Q2HvZ-t@@!T%j}87W(a>}+UdXigJcB$4Fw!o$e+tk>*3^i~SJOF4C(3^hQo`+k zUHc7b-*l>D~O}$@DWtwNsB+WB=I-1wY3B z)aL(26^f6bcMLQ!gU#$v8OoT`dO;}%ZkQ@+oL)F*{Gtk~zA0_h*@O(Wo!zyFkK)04I`B2uMsXC_I zU!z7c!RhYhJk8D~`gE!0=iP>pQ1&?a zB!)_?vR+2ekCH#{3X(;%F)T=$KuNw;e-z^P__rCKy7~zHo4Nd6PA>hsiCK;Rkg$~!x* z1oZ}mhF_&o*#{n_Gl6O4`E5MaZ`8*?L(y-2KH65;x&P}1M}c~Nt(r)Z&EUbuGWgb` zq7h*-WJ2sQ%Gao%mg#yU&%gCFZGLyHw3wSiqxS1=ra7 zhfVM<(E_q=xL(ERoMH|F6v6KtK8Lk~#`=qi2h8)gZN zpyUxJ+PA&F!GFW~&t>#~6y)_7(HpW8GA#0Jj)JnO8cp|o$d$>=w7`eLBf~3W4w@?I z3W{(h>8dd`6ru&FGa6{(H&J8WF#<6i9@Pa!~XE?j?N_|er(s~ zoQnPL+2qvYPfp!VWX_=|XJ`LT_K`)B)Hpg6`5Jj1h*XuWGaakV^^5GAL8 z1<+W`_)7+Y9;rgWz7UMAb3^H0$qF~P}9YX$|(l68N)eOTs+-Qe#c_pox#H>9Hd=PVCb?037 zc_zYv+uwJQsXssy&e|r6osX(3gtZO%F+;}1ED_{DN(OKVGEW(OEgOHy`z;Y7edqUg zys_WA|GWh3p==edvj;U(>@0s)K za$RXeodzH`gT9(d)4eY`^}kKtGx+twpn!(!VK&>E+`yXpuh(v|Wpi(xTH=d7h;v5M zR!OVLI0!YPL@|EdV)~92GWb13R$pt`GEOT?Qb3x8FL#*Qs?^3PjDp30bwiH;|K&TnmI{XS_VTuIA^Xnk) zsnw>~BEwGBj$xwjGp_8r=GxpTbLY>4v$JC!E~~?Hz8N?^Ndu^6cq%-o7f>+JKkXTPIu#nTp1%Bf8oJEn+~#k zN$lGfo=h(}gTm<=NmRx#HWubhurWa9!z_j0mirhQKozcX)o-MCKS+U+)JmbYr=O&@ zqxm_+j`#c2m5$2FzBZCB1j*|si#Xvy3^!Fg04#vUxMh?he_JB87X1Pu^@Js}Al%lvRC}tTS?07wM`*eC|2fyacbu0nu1^PZ>k4AuS6p2pa8h}3!lXb z7r_gjW1#8@siJi4P7|_X)OLVfrXKQ1D=O4MjItz#=B=8o?40SD-1vq-P6EOgSr>U~Z9S?C>u(HvJCbLw4qC 
ztop8mY8GXcZ~_~n((s%NJy11JVUEbad`sQH;>i#eZ%GutbswFi`1%Pt)KH$zcr%DNDbV>DfG#DbOi8HOuFJpN&gT2;Iw>eOv}O#o z4R?4w{O&%K5Vb8@eB}{yeS>?T6RABQWkJM`{;QZIfGnGhyGq@IV*-6knvpw|-p9>L z8_Al3s`00QS`2aOB3S!KJ6PoClJHk*^e<9Ad|2h$i@?&-W7MU;?%kal^yz-r<+G^1 z3ePEaFu4kt4B8S>_b4Tog*3~bz8YIp2aKD9eM`&~kMoKBWiRy9>3*ex{3JikcJ}Fb z%F|>X-1Il#2ykyN?PknmKS5VQ>R)oG6|@i!HKt@e_*{`e6InENts%!y^}F{k;`8W< zOrqN3znhy>Y9D=`Y^b~%VAL%YTfa)04G_FL@T75=u?EDHHkKYcahGyN8oqe$#fkN- zL8ZX;gEHG~1>0NUj1-Y$rY3Fo=O%*5W=W@_?&iwRXu`HWXo{>Xyp@Hhxe!iZ?z&aD z4#nffwZ_Qzzrns#X;7I)Zjo{zoMhLa+xqy$Lg_DE<4d}V4`)a2&!Cd8UrIb`$7hQ~ z=rk3pL_>uShe-#nDQLLow4nimpL(^LXX95){J{Vs+#}lAx7hhMZKMAmM z@F@}Uj3|<`r$;{V-DHE@vA-qpGrh)EZ5nLHWL(KsXXqLi6M2tSeldQ*-*^A#+2(TN zh$e0D&p8p<0o2}CZ?Hhg*9_EEM8poNPOG1Aa2MN4ah2O+F;TTtw>uGr!H)Gh>J2rH zXFLlZh85r9yE4=+UxGnHePi3;6^A7(&UUa7E_@yVU?4Y_-Fl<@d%Quv-C`T%DQ|3``&(L^MPUn-q&sCZ zIsW1CvgOQcUB>3?@6N76^$4n~f@AH|@$r9Ikk}0E6n$%+>4bIhw}NC?o0k^zHGQCq zxp%a2gBW2V&eD+hK-KcNgv_rD{9j9$3M3nTudV&qOyVhqdTQ*bNTlgAZR#YREPi=I zfkqQU1+uZ!r~ zapTZw$fVK7r9vJg-B@Ml62+w5DO-4xdbOHw%~CT+&0R2hKK6+*aN;}#xCcXC8`-rj z#;6lm-Bt>#;*zI)V_WakvCNkFRBe|M;i6nIt8_Sqf)GD$y4Ebet;_EQ-h36+-}Hwi z*G}Fgdp~G<3==(#xp-|EIBy&Mupf-xtXVY1eM0f9a^eqffibJ*| zFeh(6S1byR5ldEw}h82UX3!s5W0g3eUd%q+f2x+?Q9?AJ$OF(NzRM^O0ul)+F&srRw4rpP9NNM zC+6g5Exi}AgJU;t`_6WH(mrCoZ3b*c%ri})d9Ihd2^NoS7gwNk za5jd{cQ*6X&O$wBl|Mpu%G zfG|V3AiCEMp;(0hIdu;xI$DRF-Q+5CzoEklgGPL8%wa`qXo-C(ae{e2;oprIn(;Y@Rg$=FML#BVB8#k+Rsl+tItuyeq~L*%@f2v&d2@{8TD zM4U=vKs?;y0D1T4AlMAjt@pZ4y~b5b@2%c%N=e{S-}#nshr*)&pdIT`hWpYx&!zQe zjQd!}?*!y1TmKrsOhSFkV0&vQpSUeJ3^??Yn_vhJE!C@OqdrT8p(8U?oK zh4%j8J@{vmM&n5g*a{t_Z9=H#&%@^O?8k?dY_{BgDp+AGs7eel>=}gdqYj%0RVi$( zsT+LAc6Q%axVf$PzQhzC+57B3hfK@;tUU~41cfVo{!Kj}NUffe)J3ZeQ!*z(w z>Yf&dPaI1$fq6}(4-q#NuR(Tjuk+8QT?>!Z%}?WO-j#B?w@`gzPQ`$y$X_?XzFGTR zq4hP-)!S%(Z9A9kK-iSIk7=8q-+i=TuFWi-ym*_>eUoPt=U@$W&Du0xolIbxFcuds z4|Sb9PnETL$71WkID^fx}bZ->Qs>AzZ!# z)c%0bGRnt2(({R^w`7S zQ7`JPVihS~JElzLcg&Jdd}{iZFO;O*+4PfZg117qLHd0iCL@#g)Gf`g%DXKUr@=Yy zaQwqceMb;fi5;K|T|B 
z`ANT$P7xM#`E`EtzTje-z>i*~rOcq&w0y=+5+UNB=7_ZR+xavh$!gMiy9+D2V)I5) zXmTO4S339dDqho((|)vpY7L~`^o1fNL?K(C>SAW7+0tP}5O6WnD~RdrArPuwYBrFn z0t9YDTYbmUanM0m#&K`|H1tT-76<{b^1V|*ZWLDqsJ;U0k+kIi?txp3rqAApczcKB zo-dSweIHV#%4W#2=aTn${B1Sv+UK<<0kN}qKR$ZB4bCuBx0k6_9x~vVoKV+ z&(}WQ=Jfd5nXXxN3SCvQlpXd}JoI-|b2eC!WgJd}PGeu$0!A_7d^#zIInYxi2_?*Ae@&^G z$PDnH`PPs*7BM*M79tWQTA8;<+CjnjahNS z)TAw}dr@;mwFV9luiSC7%1XKG3xtoE5sB2~ygqfPHmK?D`3S&-UbuAZDCpu%&f(5$ zZ=tm6>C+h!4NRlD7~_9!xK|Rw7kh7$EdN8&O|Q*;*ZCaD z4jJd=S~Xv{DiBm!zi9n!b0}i$`%OoeZgb9z_M07f<{%w$=I`(F7_&6GM`$zITB8MB8N6Ln8`vU|&v^H% zzlI7CK3Iehb#r8caRv?DU*F)1A3F@2*T^{A{zQd`>S=|uUQsZ&KA$%6(}JuU$Osz{88r^rp+Wi2e{`0T9QV1?p4 za~L#5T~1-Vhe|5^Tiu~ICc2J`73V*Tefm#B~4=bveHUwyMjMBL|;cX%8)=8 zoFo#i&)!T+)w-21=sR3;km9s1*flcnP%RDC*F=Tm+O94aEg_pD%leF8vta2*Az+P5 zADCIRacf?WQ5yN&B7R1q%5=w5DPM1NI*8FkNSjOkOD-biO1n=>Yb5tgEnr6RP3U8p z5Y3K}dS=;@c)-P$KCeSaK>{xIyvtA`@hFg}FUHmS*FTS48)2aw_y`Ge$ znPdOp^4YsOOpB;eHiXpO*`L}sIyT{J3b~>{{`Hm*>q&-6fwqLN*}Hm*SJZr0npYDr z?=PMOu;BO2GP-?w@jR;0&XjsqFWugHNL(Ya_7gUH7>j4_c5%P9E#H1=OZjV-#{l0u_)~I>-0fUVyiYkdf9XWUa zM1Xd3e6i;hJ1jx+30m4J7u2Est`0T%J8*(f$K%%KjgCZsHvMO3bvqCnPh3H|?xQma z4rSbdWu=z(`9a-Vy*y?Xf&ekh=h1@{dte9L4d-_~uQ60YMb*`Oc8Afv+%Yp?VF6=U zBVxaZSM8}7nHB{T5Ec5;B(df4+%q?_-G3OE5S=3EkUl8VV4L_ckv;LF(c9jrKJ0u# zcUAY~BU|YBk+VVlfiscRFj_~_Mj8R6yWmfL^BTYEytrmUr|}&luY{yq2gBhj`^c5Z z^S(cSkrU0?2?&(}>)0c{^rSVWrQMSY%$yc?UR!hrcSNmq+0&B!svJ0?5C~GA8}c>6 zj3N{*t4OCfKpu_^evK+tV7fprL3p;sL9(|iBI7Pia)v6MwpCc}&x=Mz?g403Xl<e;viOll%5G z0F13z2bFa2Hzg%Djq*8s(f={4DAR z_VYbC*mT3k8^YwXI%jshm2GBx>{5ieUdx1_gq9OvdT$5b@dmgLq=((RU{ZK6<-f+T zm}DK>i(S6*_7hf2xOTX|1-7HO4%Lop@E&^79{! 
z@9zg?%&B$Nbb{u$4&`iUl7ECne{W^Zt*<`qAxIkdiPu5@9OKNSobC�)v~C(0C)c zgd3@mu<_@wnt>uVJydQ~oz|jKOy0;^`Z?+o2D0^+hp!@j_=nH5zG^AYBuV|wimv<8 zJ-BGiO^XI}T+0%OK+mPa+&L+!)PYa5H}wL${$XzJBCc;XV=Co{g^!)F^tz?jpNo4b zH_VuCMYaCaZVyd48bC?#x#Q0K4CK%<=X&Zv)V@IQ!g5ZVK?zTp+C(vj*rq zre0*ZTR%sn9`4BUqa`iQwuwP$!iTu9y z*^Aa8nvPt{NV`}cy5l$vTGknczicBgdPa#+$B~_lxB0^l39bW-wL`u?WXo>LbCrxs zHO}TPn@o1wSYvVPGZi62B3}9ADk9<9rEQFD-?ViCJHyk~ulRlQ*z07+ zmqT0+dAd*&o$#ah@3U!@BqPvJ}Ns=MjBuIqf9PCEedGznEA@4tG^@#xdHP z5}hhW*p9vTm8p^F2zoA2iJy%YoUT99TiNM^!6xPDkXY%@^R6F7n4GGx+4V!RemOu` z=Bso5M|O}5LA6BSOdLB#UmR7s1}UL!yoSsl_4aP{66T2X(LM*|9)bk2fjUQG@;XV5 za7g2iD)Klhxr?NUp}g%l7S(du@pSRzjsod24a*3J?<_x#8}8QdV|kf7grum zMHRS^M;MRa{Q64RKHpz0W`#~YUyQ#oG(l?D10Z|E)=~C)c9e1bRQzl_KE8L*d#S4H zGq*7)2eRPeh6YhjH3bvBj1tQl|SyY`C6lvas01T(9PNZJK6 zP3wxPDqmT-KbA4>ntJkBD=r{uh>P2dKe_5iem*i@&Qi7(JIJESfjBKGU&VlMgWXOZ z+grrgAg-ko&vt-qp3qk_{Jyj{S5C8tp_aWI-lcFeqdCorB>t+{;r}X*a{YZ_D7jsx@3ZLF5~Y0 zEmA^FHl-=O@oYTk=b{3)f#6wrVMR^aAFkWt`K!X;*hkOEJ}h?qih1@jUzl5Auc6L~ zxmKdYX`}A(wIiw@Nvhre3EN-J<9T?KI85Pa#lXhN0pxf~!g)YyRJC$%aOPVO z1|N}Vm(EBijEx+5zwlamO7S~iGl_`D(3_AYNv=Tp-B zLfLb!LWW&-P|dCrm$Sp?uU4-Z9Z(L)Y`Z^8vKv;BwSQutkP{9P7Ks==4@J%CYWj*9 zM}5&B_xX$_jmo8fH#TZaygRjP#vD;JIFLu_3CL=zp!gk|koyVmeEXBMat*taN>zb& zg&Kq-YKy~J*#7QCz^h^O!Y`}mn!;bvx)sw2>M`%V$C^-PmWPOs%LdR>R9a zjk<;fPnjUHaeQF}hq2MN56#UAxS3c@3Q9#gOvfR69IJ)f)#IIsnP!H1MzFJ+M~v3H zm2atRwZuz(u=p#QW$W$iOXDKnfSyYt`5~>Wm|Mz|({I|E$#NdL=fer>#3u1y5dSj4 zhbTlcNm<$ZXDm5+&{w;^Vnmq)aShdk!HJ)q1*3!J?c7eue z4Ayl-cd=DH3Kr87G6hlUw+4yt%YStriba0x#%6h8yWB{-wpg`bEXk>vAuT`8CMCZ= z-ET)=GS~U_weHAuj!N8$QxriRCC_$2*OZ)z1s7+y0Y=tKL9QtIwdQO;E))*V`;X)q z!yVh(pIlUb7qE?K#Tiudee6%#>#9!n7viM7$pyuCMEsl%le^k_Q@40@a~s%d)S`(E zEoa4Rt!`>1A*l{oFdqaZ%8$Gp!HH!0fyIoqj-0fBJZJCd=cuTUbI%~>YWI-?Xf_iU z;p(r4yd|!ntJP(HtQYRCvJmF3CM-fcN?4UOu~xNlO#K4l9UutOL;i*TcD40HZNfNZ z48=KpV`9#O&p~l1lqXnxeu_{R(_Fy18x?Do2vyIpfsMNi==h3*DeaW9KFeGKVIEUk zFA=1Sbsa>aOw&?cN(-LAsQGLQI*QKv_J(QxZW9@`w79A$t3iTm_8RU}= zPk1~jn1_ubHVP*Y=ty%DSKZCk_LL+S4BZt3ps?hcWV7U@v&+g|tce!uuT 
zoaf$auXWTi2^OKA6T^5VDK+&=LRZ zh}nwN4f|Wi2H;M29qxDsS1;ds?$L2%vs&=*`}(}x?fu@t5*h?7mkz7o7{o ziz|$({9mgQP|Q^QNr%LsNmqXDY%h(Z4D5=5G#s8mXc;bGXjqNhviHGjue>Uo%4SRF z*bqwj7Nod}m)P&L4UmIEG5T06`^F6ydHyGsz7w|bSdf}FmmV{OAIoAn zvSLZ+%SiQOM*3+%Bp+W1Lg$l}=r{Uk#**4isDECH=%jX5K&c!$Byp5BG?w8J;=YkIeXoqkj znKUFjOl-m^nECRn!;La!Lg$gJIgh_m;Fm}zxFr*;hzA!C9k~v(P>w8rpF(hXh1ovr zzA%Rm`6u4?vDUSNLT~;c9KJVF;WP;$)M+Y!vNGWDe8gda@!UuX;bF}B<-Nf*2T4sj z3>#r!`)cWpK08bL@-hHE@LQROyQGIdK{mv!k;3mAV~Y*& zSx9%5c6=H`R2c<5TZom~S)T3I8*R!KE9Z zGy!Hum?_Ifj#-ah^FhR$lt)QpLd z4Z=r(dZzP@l^;2su|VZMmnmOEH~2N&6&pO_5y1FY{2%~AEy}vnB0qX?;I+BeKcB&f z|5-n=5l=bT!BIq+;RyxX6beD)7x>UAtobc61SA?P_ozwGiB-Aj_c@!Lx0)r0&$Q*; z7-Q3p>Q8fJ@t8ETi=ab%YjAt}qA~>G@Vs;N-`I%rADs}msjm0>eWY*01Gn@It7Gr) zvfk|JHY~V9eI(H5^?}anqY4?%?)Xku8F<& z>_)a|3WD-J7>6{IyHJ7Ny`sr%kPEeFA5=8sz8I;*LW|uf$ijVCB$3K8y`x{FJORg-`CT zC}*oRScJZ^5!az4e_~k*L8Kie5o|%0U=n+}6MSoXJV^q{avZhx_N7Rh6~0qzf$Y&r zdu6)*)REIY#^T(0%7wuvlqQEMvE;#rG+58^o-`ukh`jLP##HQy1~6-E4c@rB3Pqh8 zDUnBX7mjDFaBO-{#bn&eWY$}&K#}-hW>rwhHS7<%)64c=7yoZj1-pKq1+iGlPBJuV zKWWI?fcdcbKl5WJrm2fffh~(~uvkVjp*vVr(~|$L=|8=URvWRpUf6Lsh5vzbQvm?> zx`zl(i*xr!4lxhdG3~Y`Q1gGiOqdro9<4s_DQ8>s)cb318F(RE9jSx=U_oa)!&<@6 zW>xI-V$Y4~$-l&cpIC)?eD<+JdcA$LeW$*9XCE(FnjzJSg_7=*jN^W1@WeUBcjDH4 zDPL7o!srDPfz9aXRG;qPXHjo@CM^=WfXt`E4qzoma*pJ40+uSL4biBj23qPqe)@#A-O+O882J9sS zx^ICqC-ENXg873a)hiL?Yz@}dc-2eO3P(wUqi2Mlig-`}Xn^2<>c-!c)nYA2ANpSM zuX$`hTok?gLtX^Ds38~f)saMV)hGjY49J#-6JXcd)fmPuT>MU&!;gXb^H(>&Zpei{ zD6$?;nhRf>Cl)J|l?%H+@7`H_THjT#q2NZFv}4$jI?{y^AFw)t(<3NOQOC{@uK$`a zoPZm>!1K=HBz(h-CC8)qCeFF)q=Y?4W0+Y>aYM_;Ck3GXj6bx#QiT@aGiN1BTVkl{ z$_soMv^o*z|IS*ibD=5ke1x4mH+90p^=6jL+vCqdmy>bpw>AThce8)=@3y`C^n)S` z2As*5mQq-ZofZMgl3aFv4EY~!kc=DVgPk4%_|XB9(t z&pkSvEgC-Fd2cJ<#I~D^+)wy<2|Dc}KteTsyumg~<4T`RTwO73uT1x6b7?Nz2m-zv zqyOe#?uynui^nat&s)saS#K051fD3HM8_dfRsv_4@!qD$rGwLBE5@Z2j9$ta(Iy%Q zyI?(ek&`*!o}zI)2_mMe+s^6{Ncvh8eAY-1@6{vYFcn>k8*Sfm zy$cr$g*55TbyE3$Y-}MsJmS0A>(>=$`3LA|Pq1!y36T*z%Y;3sBPxQ9<3LzLbMRC2 
z^lI6cc)`I^f-xhbbhyc!6GZwVIRv`9)wSdf+(mLG-yGJyMG40l%UHu-3#%X;qlpQ4 zI#_zNF=lp0{;4(>6BbnpqPK82Py0fT!H1JSM(`6+d>88_BgyPd;`e|gGv!)&v8f|h zKFe}=GlJEsk%FxPR7!jXRBNR>!wcL`rav1Gca&M6@ZFqE% z`4Mh^%VfTB>88(OnS}XjA%!~1TgzdO3p7|7|926;mpc4??7wq26+B<|^nJ2fDzywu zFo?l1EdtXHOpk5ff@z1DS-<$rG(ZFiXuFs|}Y34Kpxiz9w9v)SYh`Qlsa!LK_OFPk$W_-wQcU; zqnMAG5Q$Prs$WQkS8`znPLX==kuQ7CiAW{Rl1k9zUL&)gL2Ky%RI6%ljx`3Lym78HOG_r#NWZ`h;UmT; z8Q;NB(OjT-ypxw`C{7rz=Ah6?Ilf*d)0!r@p+-^-rj8xi z_6SQ&${Rp@207;QK;#<376gviKcGm_O;|y6$pBqF&Tj(sX+L)PBhju%zN5&)Py{q84S1 z!u8GCK6^gp(|xu;h?PPKnUh7Lmhp+RzfjWm!UtOhw9(KveIW^uIn_ z_4XfElclN`*ZUd3r=6|g_*_mCYn{^noi)emliSaY^fz<49-|%;zdlvkVbJWlK+ewK zY*{HA(P$@!lXVkSTpg#-w&~WQVm=nA@QV~tjbwOd-7zb2C?(IOw{6?D(sBB$ncUFf zOE(5xIKJ9Pt&il#NG9BsH`1^QjnQt{9LJsje&!xuc&TL(@ zAuXdsJ#S?ulhXa4ohB~W21ju2HEmn9;Ale><}Dj~ZAt1pw2jd+HpPP}W)J-w1RDseHl7A;l`H-f zBR?QsBau>#e*U!E>9Dp@ArRa{F&#eiGa?C9X0D*u+HD^SnppyBly#h5H*jF%%7=!sw59c9vD zehhfcSO<-^K!2XtS}}-6ld)lbeq<@ttMA$#^BVn6O>T$3LxpcObE-NtEn)SH3DAgsjf%Hy@L@o z>)9|}Njhf6u=~m;LtCH0meC4`1j`X@*Usz5Oj(WAi)jVKP9?vMg6!#`W_aJeyzA9E z8Et=&jhAK;rplBlx~kENNni)V)@4o#6iK~r3DI>TTeDky--t|0k4HK@%pgO9xQ%UD zyh!gX7B7xtM3{)5K!6}U%CGpooZ#bwfJBA8TNJ|w2h=#+HMy)2qAkKu)x~cv^MTR5 zgRFZprT~ARVEa$0VJl_teYh6S_m})2e(B2S7D%gA2}!UY_BEL%&Tpl&tiC2nrB;xd z>BKo49MIQG#xbHH@XVM6HDxXHxI_x8HLWh^aO2<0Q|I4KOH9SCksvdzy{{R;Q_qkt zt6QqxbuiwIc%>4LsbH_z77CuZ(N3Eh{Hjl*tq**sjUxsbL00hB%O`K$_t@x|s{n4T zNd=a$$ae5z7;Rcbu!eQO`0qOBG$j8>tyuBKRunfzdwqI*M)DkXw4BTY9#k;h5lpSc zQ`n|Bngm4zP!!TzK$%?Z-G;AmCHO7HG zJ4a(MJnx8jrjb>P`5nQ+l}d5)GCk*Icu;gi*^oOINvafMb|ZIakvKmN9Bc9!zuX@| z8c!6fcJBtgI}cj%Z*hu}cIGcMT*eEDaRt3viG8Pz`YPlFCsx%E3 ze|0qp+oBM@_a-zIsY9^~(nq26QCP#uvzBLITT-Fz1pxTVGcnL9>X6Hfuvh0pCi`ERa%Md2+UxG~gfM-;9Wc)ekf>K{tXe9Mtf!(RFbeqz0o?=Tkh6Nvrj3gQ`mk*o^N zm!-*o=#C|``9cYa3e9*JN%R@qkelPrEPd#e)szjS?u45l-g~tSiv;RefFk~@$ll69Yelw0B?`5LzC;tmCJSyx_+HqT%Gc-2 zhqa7V;q8X$f6QtH%hylOT@X$Mzo#h71A{SUK$?cZ-d!_6boCTtWx6T|zRb+Ik5lZx zC5dG%G$-g=G*YM6F_`aAlH>GIDIqE;_y7oJh498JT}+&LXR4d;+c`H(r3h&!=?z9x 
z4Q9TKSxmY$n+qmpaZ(L5^RA7HmY@KNAqINP#5>dVozR%cDNn*ch4az#C??EvxggEz zsSOE4zWxw3&F#htFngbgdsT{RM~3V7uK!%; zSN!T%2CcRzG~5cBOfItKldRJy+p^9QA@i?}dZ znE+cDmfM=j?ciR(FH$XL?toJf-0P#?``x(7+V%+5_T&Q}4ryu>>On>|O2>w&hEpt* z5)Q%Yc&uncx(~56ht=CiOPu^_jEY%zk8Kpx8pu5Vbwy1^yuRo6Z{#hTke{V6p)&Tv=g`ZHv@IDp| z9-YRIOoK7?Vhu_H48|kcl8_9){<@Y7i_RF`qbV6-7s>n$_Pk7Q+O8Ny@3HclM47Ac z6zq|t>*>*jzQ1Q3l^j2@k0ZK+I`N0qp{^YV!oBYzZE5 zSvR>;F(^9oMiSA@_%a>wFdl#lN12STlFn`{Qmaf}rDn#9RS6j!Q3~}X zj=UMxLXAIWT*~kt-mDJCc)Cpz=ibFBQnyK#3pFG)Am4l|0PbQn#eT`Vij|AEU5G%h z$?8@IdZ=eNwR^{eh9<;Pjkqg_&CZ`Hvor z^fGvd$l6WXOdtBDp6J#m__((+#YK7r9MVZZf^jwc^VldYv>MnCwxEHmjCA-@!jTj?aPs5l^liizJ(^&FE1FpZ{Ym2#`r~ z3$WnCaEA?+aPxO%`B{1|`gSd*Ka{eb%NZ?ZKVE^@Xr40xBKY^cL=YK*9#^7FK>)h( zQSI76fgkV{B@bpHxC!faVCy9_0+fD8)Zyl>Oz5wZTeI&x21V>$btPM->8wm90k^yf zdoyGD<+a&Jz#pF3h!1alyPUX(tHDr~S87UyD+l>$24NU?oQO9D4|DnM<<{P-5v z0EfE~)@KAjemmaKTCM0`k3tG8krF!R2_~LbrBR2%teCVPh=veVmQB9mWCw` zRBgo9P5Zjdo9INN96~`85TLimeAWEwn27-7gW?#U5e%o(cE$*1-b}L?*H}@0i!8#D z>Uo|PP&r6F`v|C&?si$#j^150fj%x~5ONvfry{1>s%V^z?BIVI6%;awoqIAAE+1r% zr%okZN!tCI+p9joS~>M{6SzZ;3?!2Dhs9X!)6EG?W`;1=K2r-_=(Wi~M!Bb|OgmT_ z`2VC)SopD@PttM9_!%^JN0ir>nt%q^UFnwBe^6%XTT+3YDSb?Ycreb%B%%D&Nya3+ z2w8xJsD7FRj?pAvgW`tTb`Y4^yWJDg1&-?3wn>%6BsC2_CNkshL&e|3s0g6 zCp}stZhun&7%~}K)l7`s*HIU=ZT@Ig^~ciyxVAo{|#log(TGcqhFz2n>YD}PfA{!SqL*%27i3L zVt~5xwo(|dpyWNbTT%Xq90l-OjX0{cQ19gm4a+43;MeNTZ=^*pQErF466HVSl3n+B>}KhjI4M{vNuAyFoXS1WABDQ=ro#C9LHsinW@c$u zat7*s0VfDf|5M;;M0)rQl0tU8yk)AY$&F5i9w5cuIvS^~N4`8Er&8j=LloSD zIB@a!n7j^ZL*-A|ES~z_uESM3XAG>{e-s_b5@Y`0H<8?2V(vtNLcG>P#L70QDc=)3S59YTUZanCyxMgJ9IkJd@Js*GAR@QbFvEkyRt*ihX00jFbI`A{T@Hi7a>$ z9dv>9Zj5Nb)QrZRk2L02K06WlI?fU!y<7-R6wIRSDQm0??g)lKHj%zN!@_9%(a0V@-q0Y8JIgQw0k zW7KL3JY)7Dk5n5?r)jU5j0mN7vF}HdGu<)aLXMCHNd@t)OBd>dOcSQhVqu3=2eTsJ zgNs889adQocnYQEJQ%-no23VQ4pIz4bPKzPwc4-DLBR#uam?%N00hJ1njr|mOjTE{ zuR*ca{PW6n35vM9iK!*t8#DOOToBZaHj4?8k)~387a3NBLhj#R<;uK?z!bpJAS{wMPPYv6QFvJ; z1pm(5kCd0#WeWoFpwEhy?MR{TpwFJvXUtWgmeSGOP~>%i;$uC8L4s7CRaGSMz)fV7 
zUH@X6>SJwD$y@wy2ft<@D9oe0{#fa=1O4+V;?Bu0XBj9@M&lTPmY1jKr%$u)t-%0H z3-xW%={G`|GW$M+@#1R2?cK`Es+e7a%3W&Y1={ajI{pp38a*BZf*cLMk@lcca%YXg zlb1((z53>tdl)5ewLO~{@W(aPGbV;*m_@yq z!qTY3JAN1dwSq6%J#P}Te0+5klVk5cW$!ppnl4pN5rBxnk}NjD;mr^O8WxI(tuyk`0_N-ZINriG=?|u0V*1~khV8VY1|dGfHsb!! z+(Ui-?Et=|dkl0Y1P6cph=LaS8TfA9T!yz?PpqW;y^36HLg)!o#r+qiEHMP~Vi977 z$7(}MP96Xy$AJ4j@)5S$ z2snd)MC1dM)y=FAI%aa~((I9!l;V~J2~%)Ps1pnWdtN_h)#4y1#Z|)Fy9R6MzFoTe zsG`5SF9Og>19#F$6A!2U5?$CmJUloKIWH2K!Pd!8Gl`-1B`tWbEj% zwiRkjD6ZDTM|sd?csJIOZSX&P3A_*kqq5%5i_x!yzuk!p2uJdXg!FMp@@_6aB7IoK zTfZ~n1_C0XsCgX-MJnqGCJnx&_GY%K+A@wwo}wu?zoJ5#%SCTshjddm*NlVOA60_o!t^8= zI0W__5IW`8Nk&UmI_i37>*#cFxlw+_lofMOq0LpPidbt%JRf+;51US0iZ2wkzhXBU z{sXo$ZRM!4y-fB)6GIa>mYK;(pHg%hKn`sr{vXS;Aw-_P)O1OwGV)Fmp4(3wz9Z;JL^LazLgBqs3c>31Ete zkvJ1G`mg2RFVoXBnbHFFXWG}DO5nA2ddz$^Q8rNcLw=sroH}ESu(vXg%7D4dr20c9 zVNbh2>kz^V5OkSK&mtMk#;7y~;;>bHPfBU~h1=K)Dez%9_oT_M9oq@hXPaCI-KAEa zu{h^qo^D~8_;yJU*(bQ2%Oy5pYPXS<8wW+^w*v_EnVFo=7Mxz0CO69%AvIkDua;ml zz0U!d&tone{&(zC2X!Ary4j(iv_c8}woL+hqX_34lAb%E5GR|RK3+PiU)tc&EO!lKt<)6Q?q{01?$TSpi z38`d+Wo9~JQFS7;L2m6=S4)!eGXEzn&)k-^*? 
zd1y`4oT}4%G%!z%}xCXHc>M$mhmTVAT336kckoBel%Bj z)&g8&jvAf@O!Xhv1y`%@vuHDzBU2eIKJHE-d^ihaG#+dinEZ??qTvKcSlIFl81&S% zoHEM=3Op{yn%GAlOe-^MQu7mA{UvC{^itXKzvVGn(In#i#7D#%-g`5-t%^txqr;ss zRa0U@3P+4G!CJk))@m4Yv!C;=t6-d2%gT=&k-LlU|HZLBjegiyu>*aHJ!<&T@twR$ z^k4HAr3$u8`D~&vUEwT~q%_-kU^k{QgYV^l6xU@aP~?)2R7Ni$;PRB>bq>wO4x z2Q47emNCk?Js?qGe-5jolGaEsMPNIPaN$dtXL$dp|N+K@#;;e$!}L;e9} z9|)HU8%z}N04-t!fy*cV-| z&}2yI^chFepYwSOh4h{7N6VIfD{fU8et0cv8q!pPWz}4dDhN9|6I4wEbU6S->l0aK z?`%!J%XqGI<%f9I^uH^v<41c29XWsR#SV7|oO?9xCy>;&NqxDJX*3)v0PF5mQe}Es z@{;McY=s=QsWN-j8l0i~VYxwu_RW_Ls(MO$M{F8D_^*6~WTdgNv!&mSpEEAgV7HKY zTz%Wg9D9(mFuZm&NL&x$k&5rqgW!Yx@a3u(zOIv;Ue;XgsP!R%QYvY);a(757zH9- zc4Ud;32BE97bj;-a`!?>KVi0llNL>XV{9ku{Qmt2^8w^JR*d2BdNFU}#jr1+?>tXidnE0BuK=S-> z=h>P=fbRnz5T;}T#2o|*n;igrz#sHq*Bq9%ys)H0F?pyPCv1_YM@pkxZGk0jT@WbQ z5KDokY=z2KTuDMU4aqZi^4=l86&mO^S~CWqFJ#i%2anIL^fydaUH znXJV@%IYSNofgsOQP}Cg&4d09K3VJd-5y#GZ}o0}XOvHnK&sdphlZ&~#{|6}+ePr)l?$_|NKwLRKN(BdZ3 zo#DJ@U=>sU752Y!1jPp&lbVL#t1ET51sA7t1e0$u;%X|Ct*=X&mew+NwOB)Prz=`#`&@WnIu3xwe)a~C4 zL3v7x3@n3V8V#$U@_G!`_`vmnCMluP{oO7rK%lLl3x8yU+u<%d=vI7RcD(rIYmub< zT~sKdn`Pe^#RKp{qrZlIH+Iz?rGH+&5V9Psbt{^s~I1Ml@4D2Us9a; zf4SJtwo@OBo~(qNojBF^%Gy!d?!UHHei#89mXzm%#QE2`WDj{{{~$+0LOqi*%6P%0 z%3*@i?u*OGyVk3B*A@ywsLuGBl2XYGDBy!kJtwQF*UaS`^K4pW=iof1FET}khs3Pk z`NJ&y!b>98;h~${_Too$)x{x$R6!8lWcpKg1iM0@TPL@5L~j{1C5nuVnU4R5xHDw3 zqy^a<2LKeQ&$;g-_YXS^u5A2l7-&=BGi7NvGn(RPbh&U4IM@v9x)hMm*~+kBFCBdP zu4W6LX$?j_MX-4Jo@9aOZxENUak7i;55J?NPMBy`KM7T5ki?o8-nY?+u$qaWER8=g zX0`0P5AGVR99*~Hw`{`*p!!-^knJK}Mz1=QZU%3}(R)yvgcrj?|fbhq#uk$67 zMp4}MhtDq#SrBar_6ynA{zL$l`8iMX#AmJRP2+R3}^5MRaqpmbj8GW4!Z$hLkza1`zr z@k1u&zx9zVlB`!`#B2Lg5tCAMDrTA+UfcW6Nk5kMr}E;uAB)ID3+Z}V$xKiXWLCGu zb&@@Pb=!WfDCLy2e{fUTg0SW%7c@zmHGmJkn5=1dILIl&6ZLKPV0MRz{m^T^tnU0UCMJ`aMmWMX6AQLqmL;?q?P zsbsx@f@LdX-&7D>Q*qjpw6tK(m1T$qYAVZXr#d;VCrG*3N1uYBJ$*>h8d-xGYpn=o zUXj?>QLCMN@Z(K7T^8!Pfq%bg=|gHJDV*VtQ|Rre}=?E(~;cSh>N0a!&!`UV$bA_ zrNERQ=kmQr#)YKfW1eZN?^ZaROvEf+Yg$8b;+I~$(Pc$u*9{X-G#3IEkEt*`$QSVIog6J# 
zA`y-Qp5M6VpbaKYFu}LMRK3jUvBOu0mF2z1`>m?1rp5!TB?KT<)b`${2^}{Z=Kap0 z{@V3UP2Cu&xngy8UO?MRAL3Ui;OO2=NV3gbgfYwkP86@NxCxSNd?D*Z;Zxl1p2TPq zrfV*YYx>zPG-*J6HTk{i<}%v5b&p^5)+`-ncA=7+ncNZE0?ZkE3V~-}!vX1E{LVMpgh3KmU##d}~-$~?0L z!|)PA9W6o#giPgsU|Bd3WY?@A&mz2kBdC8gH59E4D;y?C1g*@8X)44>)LvUB+KSRrZn=Pa@>glXfFN%iKv9F#NG)hABKjwmrQf`7$ zE^WH##}=w5_T5xu{lMbWSxb-&^K6pkh!Q&d0xdri^MFOgdH#*LE+|n)iWM|pweW{VTV9CFXr9w? zT@lQL5&`5YX#i=(c#8(v!80ed^u*m4}!_GKMeCmXy@wwvgds+K#6l{NU|Do5{(O1B!Z{bv(e>!|OAEauS zFeCzQ!T5<^)IA>Yesp68z2Lp{xE_t0@12s0l`&0uW2#aSd@}jt+iIPR$@|wAI{##s zO~&Eqz$0ku7AcgPbRy%=czUPh9_h?#Y7j1-_uwi+$vayFT~X+LPFx#MV3UgN7xq*W zdRE@0<>|@hX2qG>alJKa2Lf$fQ{-%T4DfS`J5Uf9P!LYt8I`KK-+Y^67+c?upqH?A zbu+jCX>IsTy&Mr$c#Z{Qw{IN)7_C$@ll$C^JjFaM4UaBV3d+sjB%0sMUs6dF*N}-xms`V{CaT%m*h#p@O z>BQbq6`f=qyyS0ry8-B=tf6jBpPis4XrLe+l{eb)ECZnKA49`I8v$CsCnT;z#CU*a z3rJ6pN9ZOU#7HD0wcJsit~-$nq-<+5xq1!z^C_`6szx(sQ!bfJfwoLDM^!hV!6YSJ z+0L#W|7eCMNd}#2)Rrn)R4P|t<_mHSDlSf8mDcyxcR%pilbomaJVaG_erwu*dH6n; zqfkc$7&t{y139)h%fUV|pyCnKR07)+)&mzNl~E!yFB_feQ(|~4lV8CVewB`IK~pJV z&M*5ev^{b(giYFsq`_n9ZtN>{C@9!j#P?p^RxU&>uHm3yb=kO%=F>&qmOf-m(WdU_ z|GyTDdlZ_dFE9Y<2rhwQ#LPA(L4NcFlH`}C(gvI9b*L6E0yhqi4ydqdDEI}QbYJ#w z6s3BOr4oJ1EEBU=s*~`r&>xDG?ao@fK z-5cUhSAgf=s%@m1wL)&1?g>1;v`GxC45skT;j)yN7-vDMotdI z3OSDKnsivlGMbhGKdZ2B)r5|NC4od58dXW%bW&>Fm^=Eey|!iZb?s;alW-ume{ME6 z^-@gBV6DY|joezuIF0uoWhvV7FGr*jd;7XXF#8r@)E{3E0EdqiKw}A+tfszOT1xAM zI@Yp=1WjEk8mu1Q_};EU1QG6i8p@7^)KpTH<|>_KzF@VKS?)}5?*^>Muh{Dbomv}C zZ)MM%Wl3xss_PQ69Hptk8=e64H@5$<)w6K{ka$v-q*jkReP%Hpze^vX@;;S^oiF#p zP^ZC<|BZbn$a_rk_ND!%!^nzsbP&HxMfr4&>`&zRfbmN4n7}mH0brX_P`(N#XNl#< zmlf3~Eab19m+!$p{M;v`C0hYbGa_hx+LXnSpxzr-XRM%bQN=*EL!~-s>=JoHgqoiD zmVUtXU2Q0#koE<;u(ea_d7+7=)KNo`nZe3H+js%Zapby%dzMdg8Q?dPc>0LC=XW%$ zA&94IY=F+HD-W#y=xdOp2alN6y9Fl0=p-sQ1-ZEslOzb)HC zFhk+y8%GUGuIY{$8=Ly=tk*N+t09D{jR&g)Q+MN9*#U%VFjBCoYKH{i_rn4lrfa>o z|Ip`>IH&N+O+v3&tywmNYXlqo#0uK=MYXTRWm&c7fih5AWF1K^{7`h}&tQ%WMSXlH zROqnOkl9@Ep_(hq0c+Lm%78cqD5!7Hhd0}Sm(MfNEQPfILeGVu3nP>A1{j(9C!*9% 
ze%Y-f92R*nz*5!ps^FtUL*f%R2QFQZ?qg>85EhKo2PkKZ?fG5MUQ(OS#3l1T7ru+F zj{*hHy1JjQSmy((?D|kgxB4pGy3VpoV$y(Rb%Ou@QQXk+LK+jk1>2b~=1%HZh4Dy`vziB=x^Yls~C#>020lv-;?LpQ~-2kH;EQQ~}+TdG)vi3@3};f$5i3CQ3^ zYuR*OoV=rykE7K;8F2*>kUmk|ppqG+Wg5r&D9;dTq!bzT=#>%e^-IZIqXezVLBrT& z@UWkNe@2~93z#=99oN6=eT_z!x91M{2FA`8&61U;EHu_+{`Z+zQ}A4Ix8FtM{{Ptf z%BU*4w@*+36#)eWk$R*XrKLqWr8}j&J5&UuyG!Xt>KwYeI}aeufkSuCMxXyXGi%M4 zS!>pOdOykWu6^(O>iAtNOJpgMtw<0u=ihwTrl^KTyoGbW!|`F5VD^;|{;*Ck`6BwK z;R!>C7GoQZuIm}L!o>aW6XTd5)NV}ssjS7%Bne6|c$O3=(!|DcO2obc5h<%vtQa7IKA^Y(eaz^nI_J}jXD6Qbc0+zw*m zGAIlpF_r2+duF^JU?lZXDB#CXv2-iSNV9zV=2n^iF}4MD^%w0|x+=}D5%*+(Z+p)n zGcHG)kIj}gk@-va5Iz_UmCi7B(sM-TG9gZ}QMBu+aG7*L>S^TK`ae}ldtf4`t3`*4 zS+Go=c!Y$kP>Ok=f!pk;I~OzWHnjn_M&IKy?9^)CuV?9YyHgdXu4(;7Bd5 zQBNYajdS@nDLd2>L`LZ_uqL%P^s?e#6x`!(UOu7E#8ZB2dT(B!9;#i)q>$wuuwA^h z1As!TH~iTQ%?dE+i+}q5Ts+rXiQ4Zbt;Os7rw1K@bJs%jRGxR}QP$xyB(hl|UGzI{ z_&}Bl{<|`5m=#psfJY=E?{IQ)LLo3%Td_LJuKal7>!>LA_aF(-0WAGk`b#2n8oQuR zBXSrK%_V)B-RXe|Lo6jl_-`$PR(VcOtlCKd8NuQV~m%VsU#5A;sxAif^%f2W!v zV6na%<#KXl>0(A?!t>d|Xs6GdrDS?=5%hQbgnWqO&}rE3oN3R2{281Vn#d2EoVz@B zFNsQTDcvkO^}5C)G@p3%M-UpQ=)qV!vgOej0_~u zxVm?()qPlQu+IR^jSYtx)EOOxcHyV4N>Mx8W1m86nCC2Aq}jL3u;Zzt0>tq%$*_Zg z&GV8S1T?JU?YpbxzgXO#7f|@|2zNjV06!N&KF*F8sq|(Fg7m&tlTDpz=v;hi6_F}?!{@{|?Ly{}xL_P%Q^5Mf!3Uv<6(a-(z0BoMwi+9SaqTkg#>?mqAtcx z7Vh2pH*2+T)_C~?zp_=^DTZ1|e#lm#W1_Vlgs`z7dTFc5)y!=)yBXI-q93sE$jN)W zci(K*?77VK`%s(xh#R+Q~3K z_SwGZ*lrDT=#Mw+#TV5Lh&{A|&l%X$hAv(%Jbc;)oh`WA`CHg`HO0zn^yJ?xXia%> zY$BfiLyFS#=9dCN5Pa)_=e%*kN9L;KaGTbp9fi%{(1NmOTlM$WOpd2na~su$2FzP8YrqpiD@lmitMf1)uah)UIlDowLgx;4CIVWA`=~L--eODx>>w0 zq42Eoza~BAJ$%bJ8Q@=ev~=X5hW6KsUuq+grCk-ylG{ChyStG|2W^?vp5IkS1!|R| zJSPJ+XDyG$!`L6Bm17Q=bH6bt)CN0vhdsU=$w}W%*ORs^itINANY8Cb2CVGrJspQ` zb)d7%O^4T_1pw(B^m`ENeE5N!-7XZc0m)L83yNq5Ii!L#^uAxITrXC#pbdEI`eu*v z#E0BJaTx@Uo~e9t8hIOS_`46)_Yv|b{mzas8ou{kUhRy)ro0!yLl7r4i6TRolRV}n zz-b$y`%$$Iokcs&O|=MfK(P&vM=x10xL%c2mnubaFlTN1%ctRr)FX*W-I!^U`wo+i zI-^egAkap=9LUdqa}}h(l>NB8Yf;Z7cl&ARwr@Ayo=ud*FQ^{V<~}t`@2c&7K7)kz 
zyBVdYim}v8y6~A}!9RB7>w@1h#(aCtmq=hdK;2j1FUGnr_YR@HWSDx=ZKq)<6Hr6Q_OlXKN8P8$@+TzJM)aIEAUWv3 zRqdt7&kapo0e$O~MVW5fCL9lD+K$`%mK__~j;r%g3SKioa1-)p~6CIl7WCx&<1X52k`&E#vUN_LjxZ=#tYs}e7C}f@Xbwd?wN6I)TQcH2O z@5phbWfo`MPTKAqrfOkfq9=v|)5=zU=+cfCgud1f%5fmbfuHk`W((P-W)v1iwI)-# zTTw^evY{)a)4mqLo2YoA7YM3Gxm#068=i-tQ=<$RvO;o68E$ctQBJ1Sa@yiRVIdk} zL=b9xV0Un+?$XP$2Q1o(0S4>|1Npxj?(l%Ge|wek#Dct)dyLE%#oYoGJE@PoZ|C<; z@)J&;GVmBE7WbN<@i=`{Eg{7Dbq{hzio)Y-6WX=!z)WCDZV)D?Ctnk;_MI}L>ZwtX zq3*g$rM9E=EZfxURP~agWyVx(C)$<#uvSu-H&`7L~=IWbY`erWU!GmxK~32z&7iUb+4*)M{62<(fbyUL}X z;gLm}Me|4C>eTss;;XQP>xoXUeV5lBizj>0%{g1R)I0IYWtBK63}X;0EhH7hLQ8V% z&Om<@Nl(RSGmZ4NM3d2HhT)ech{7#I(Uv79d#if5Ql5nb4U;ciMlm(CS+y)@o4N&_ z{#9|!`p$5O@O?)9JeGu3iqbtzYq7Wpi&>&;f(%-8*3}2kD_Px)daZ;a znk{{2M~%;IcIhlz@B$u?f|ir$Ee}Uwu6A6X!*;bG+>FQSp%Jg5dz~>OjdfER!Hgc2 zT^048Zs#3gx&VRG(F35LS%gfHvX}iqLC+*XDfZHS&(dK__!}bD{u5%5pkn z7n#LZcQwzs7b~;B)y6MFzNeECGlF>$ce|L_o+43@7eQsrt6(qxD|?McH8|!+ zi~&PUPFv{vaG(@l1+Ui{n-B=zCyWgUsRQv~->GuKGC1xZjYvO^bI=im)K{aT(C@qA z#}k2~RC=rwBn4zh)Cy?h$VQQ>9B05SnMGgDWEh*k-}&|hnc&GufLcy76!=D+pO()y zOV6e(>{dC4K*$4dzk9CM>Y`JxWx|WBFFz^D&<{W;$)#;>9HC)^Y0^bktoQ4W>w!j6(8#7d2(>HFoYbWxPa;=9VaWbohWgh0wIqJUyA;R;LdJ;Q%B>TbjyysI8lR36tBt z*F(=XO&(Q%$)4OFQXseJpCeeXN$>+qW61gL^>!B8eBL!fr#{c7gZUD!vgLgBYtI!S zXjja|Ll6cT2_qA}pijQTowea`BG`{%3k?X@5@b$NY`xD?3ST+0FjMxUZ$JJg8^G?S zw~Ia13HUvWu(o;x88d}GgT)xtGEhbJ3XN_Og2@`3`$~T3kNiRX{E+Q^ne~<{-`lqr z{HS=iS}K7}2@P4>3@Yq8rqv9HtLpvr)HJtwVkF;*rWtefVj9t?7M#iwaZ`?h@=sv4 zwfFU}Ei5Trm~;xVn}N$)fwy;pv`aaXfTUMiW{s*NVx5xmAPT3tJHUh9NSUd%+&HY# zxTMlL&3Kp3e3wt5wzgX|WBPF24sXDiDOohs$f4-v{q{2Yiuo^+g*TFgl8lZVV-vqJ z7Tfl^6QX?fo4Z#GSaGz9l`X#EdP{n1-QLt(U$$Iw`J@aC(U!xf4@(c%m)9e7zU!zC z4}7VdAlTeSKR)(VGCPJQzMyDAKe6#Rvp^scd|8b3jk6U-jeLDjbz0~5vRKWi&9lSw=8yHd5Ypk-r=N=*>&*L`*@5vnFxto1Bx7H98)pfdGR2n=eWjXGX?eq@pEG%q4pLag@G(l6N7amC4vea^al|i&J zo8DR}R@#f7i!z1mpj9l$6W7y3u_#7*Ctk;1O@MHwe38G#PD zXK4WD6J!+7$M8do`F=p4;H%MORtoN>AL4I6m)cIUrudR*Z*#v^Lk%)SC<6O8lf z=qF5psNO-g+DoF4qNl#1s1Lt+F2)K-O6F$0n}TiVFnd0FZQuw7DND&}`x&?2VW+be 
zzom_~X4GoV_&^Em=ntJ`SqcO3YRfQCKr@#(V3pLi*Rls#8-&yhpP@}JOnGZ{I=Vbv zd}nWmSOJEUkv$!{Z0u}J-TA?XZU4QlmL)iRbc%RTHQM_$e?g0-YfP9o(q!~+csQI$ zK)aoBALEJpAlRWN8Ja5%5zs;@9Z@%L=!8y9IRmRQ-hL{9+*0rKv)e7a!eJVPt$%h8 zvxlwXPV%n=toc+k6kgGB)4uzZ16)oi(Els1D|9?|dNg+I;Kvyr2u66}yDMNz{W9!-8T&0< z9`tLV5LKyQC`jb%NvOiU<7S9Zx%z-+2|nS_vTw@MU-zVdrvN5Yxqn*2m`yO0H5hc< zo?Mjk8+8TMg;C2?Dz5B1Aqd_vuUx41yZq#^ROedQSyiDr%6|oXUUOqQldf`eBe+=* z1TPO#@lWWV%VIh;asl>;g0>-AZY#M92GUD^P`#CM{+3l=v?B??h9y~ zMbgEK3L|ktg{6D<(H}cSKkutKzK<>;y{_P=omYFkncFbMmzW3essXsRB-@|bErFiYvPPVZ!)vc1PQ;Jo_0&@kl0D?z9*FXtQcPj ztMzyy*Xeb2Z>yFNa}rRlp@L4rW1|zNHFNrboj@s2ULkLv-tte{ciH$CTWz48mk9vt z>3;gh*>45~RB=G?or>l4@9C)bya_rZli4?X!4%^{8G0Xra}r?vb}LqHx4`-lEfi1u z*B0crsH33Mi*5^f(#Zkxv0M=zRWJ)NKuSM`p!~TuZ)JF-ZpEN_Mx$H@R^oUJwq&PF zXqpF@7wo>n&Vy0BRkahDEeT^h_1*B*3BF1nqd!9mt0btk=9%&sqL0g78^dK&I$Un0 z)}&%VO>sHP=(L831;_M%{%hVcQo`WDr-<*=OcL+ER{NuA&u}OEo}J0LFz=b4z>`&#jB*MLq2J&h!&9@o{VO zwYu({G*vbgPE=Qxu5zJ}!VmFiJOnOx$?15~i*MoiUoSoRKq;xb{iFVkFColaGzrqN z@>(D)dGes>A7c6{*LM4&*F#VDg(nJR*}x2?IR?4DvV@+1ON zfuGxXg4k8DO-p573F@$PwK^6%qc6$Ol*>RS%d^KeDH`{ncFrpoa#ww_LfVm-dbo)! 
zN}KX_*Qg-eJhvCZzLrP|Y|~@X&Xq*6>Jb)Mo#-kBQwo)OzFd&Ne^R?l_YJ8F!jZ!` z7u8U~7G8(S~@urM;F z7b4B;``hMIlP^ua4Uc16d>O9n8Jv5w0y1}`4c~8jHO&SJHBd24L8k6Hn4Rr{AV|=S3HYCloaak< z`wC}VdCjdWA7_6SXq0pqgE?Y@A$+F?N4>(LU#-ufDpwli9}@v=&6tBABSl$mx6eSm zYym_5K>|URD$7U9KPr9aJq8;WH-ac_UusZI!9EqfaS+c$7YR^V5$QyFWeg$jR{B*H z4a?hwrRGJqS|j>0NanjXQn4K*Pu6f{_|1i_xjrH?!!ws9Lj9w`_=A z@pXIADP9D)JMFL(*+HgIoweJ3Hw*{pgB4)VKkK zdwNC9X6lE|b^zGsSGab(>>#KT*`tn^kqRQ~OSE#1W7Bc^u#Qo{gLZI!WnNyALdg9t z=FQ>IVr*mnYCcH#iPx>m$foh}*%2;;9_(sg*SPIRPiq)yx{(?5Y%xorkii72G zv$3bKYY4;r{q~+Yw0drlXJiJaPo;(TrJ7Pe-(pJ?vLR0#;$v0IykGro{+7<-2}dv8m)YC4 zsesa{czQQjDu9Ldmh99J%9}1_5ulTe#mTnV;5*2{f=w9Wn*A+_xGPUfk`r4GB;`aEQkpd)ZSj8EYN`#wd6z05IlD;7Z|)jhM^WA ztus>Vv$o>r%7U#>)(htR(8rRRcRmV^{mk*()>Zd;3{J*--*OC~DdMH*YW91nUu$@P zY3I@%DnXG!TGKa7Q{{)wyDpS`Z@6vP-JITVZ3N>4f7*HIjIf4zi!W0YT*=5h%tP6G zevw9YYww^pMsHrTRb!24C}pXeA&L8W{u3Av1j!`P!q8dIANx%jT=QRzea8yLL-H7O zg)YnEQE+IX6Mv1Rr)9RV=|VQvMQ)BwUXCSh{`?g`#N!jE`E{jFp(jq8Z$-5dcG%X>nL1+YPd`8n>(p}-c@!<}9T(=L#1zT=fIv`13~G>80;F0BH6%20Ep=KO z0GZ3ZQBrTNe&fA}fKA)muLqLW{dQM!iR-v7NV5DEzKtTAdi(B*e^7KV$q>Wpkf7E| zb50UPwrE`>jhn@}gT7YNGlI_}pRK~_pY0h14X1m5V~>LQq1Za8oiPYIDa-f;sd#Y zcDUVzqhptwmjsumY>2I*T{fjxgzSjoa(m+-%2-VIR*7s=SYwXYpqp_z#WxF#s#Rd< zcmwlq{S(??Ak?uDAm$*K*I~PSOeW-Zb-SpbcjKMsE~&Ebf96|>O94G0T`GR?Co%9X zoT16tY0BM7k%kE`yzlA7YUZW8;uPL99k*HO?e?$6l$-oT9@^m_*(*^F_^g*M=v=>eI2o^n9%Pr5?lmlmp>E{s5Nj~x!};_dDqpH0koFDG0kXL zOWPnD#(!R|Bc>!zdfifZ0}bhnRv_su>9P?TJUn@xx&A&>MiT@u~uqLW{da5j3+G9YU>3JeCn1OS>p0UCopmL8 z3)Va5{Yq;o;M3uCTO0t}RY&%wMoh~Sh?-)n+8XMApiyATWal=`dP8w(gb=MsFVnoT zyPj>(f0(eoiiNac<1>?3RvTWUwe8gK{6LVn$3CVkXcye|KCU}O{9@BW9FhXOr@k92 z$DPX>kV3QT=cdV|v-k;`e6-VCJzeysOfh3f5$LtUOm+$KsZ4Lu_Fgr*(a(bkX&MW& z3X`J>3-`@I8^j(6nA*G)9+5S!viDxTQ!GibBAY}ZA^OYq_C2zqW>#B`MNA`9hJs>6 zU#L0`aR$>~az_kgNyiXVAFZ8m=*&88qt1<*S&_>P2MZ-82E|DJjZ|l5+vKpI>~DZ=Kxi@a-b-h5%ME5J4XTS`&6 zZoq&RFO}Z-dwWjt-9z>F7N3>6E$oEZazGU>9TTV+`7({1d45!fbtSnpsc-`1EC1JqGzR>|7byEk!PP2vt36DJ<{bj?GRJu-Ds4qfdx1-m^^NoE`-XN2CT6~CW{)68e>}wpg-DpXx=y;3)#Prr 
zT?F!FlC3wq&qTT@3`8Rb*LA=^E4-!hi~CT z-&zk1$K0(dGS9I03{T=eGr=1MEJS;SNgMh)qtDWPFfIo|U5w&fjHgyMTYI*0Nyn<)KQ&tm=LitCT53i%K7fgfu<3Wf@sP2)f1t* zMJYz^w2-9yd&E#<*)YPk4EL-j=I2 zp{YK3I)Bny-&{u7csL1VgBG)wR{T;j>y`KvU}i=5tm*Iwk>8Vs|k+7eXO0ndvY&uPPR?yvQV4#3s%v-inRcYoC_suE5G3pt*+;hn$H zUP&!JAzC@W8O-vFiXzLSiHW3@U7<~Gdgub%`9&4qzrIwxBv2PSJ4#?u0{uE{apj@^ zwyKYp7pg^U6s;-fMC;QXaLcvNuN{V!VA$VW)3C7H&`%$o-Qa4SnWgNZG4^B#^g0ut zjn39cPK=@ctIinZ5ArI+us~YqRc}Z!Az|An>^FQ%xd;7#SBo)ivT$l~WqmCManNy& zX!1q)K2z9gBHGiqbT7K^UU)55pY62%CMtnMS~}=~&pi<2&`+t-D*n-#X1^L0nkQw! zb=}{k;epXO=~*xa0J<2L;R#e!Vf_5JeritDJ6o3mvOmV@qkm+B$RL*Y(Z+oG&ktt0 z!_{P!Yjgjmtqh!X+v1vsVJO?@%x~+zt_O8)!%dXRBz58{{hr&O1_%#~T7aO2s(yX8a?l*)v6m#lqT zDX6HNHn|CZ(<7;KDvZ5H5jTh#YJi3sGuS)bd?jf66en(W8*X(PcwqNqP^(eFCnh*6 zTPHBZ-E|Qrpidq*m@tD~HB2F8`%H3BJbFCsI-{NhaRA*g6YSdgN)|x-^{*HH5P+?C zXp^t?t{mAd&k{X0TNMs_H#56kT>DZ#d#!^qWye=gyiIiR@haS)Jc=Ys#TFSR^5OQGeh)Gwp3p0MdYBY7OnJZB0jKGQeSC zNcN<0+8LknO^1iTe#OM*nFr4bb`@uxjKvZm|JCkK%VZ7$6i>!k;5rTAu5d?%tWw6g zt=b*h-Jd>Ijf09>^zqdp15Zd-73lirKx>XCbE{klcSS4ZxEBN8*+EP7Xz5`_o~eRT z)AET}A0FWCGV}k10K~FZJ_Q_g$1yj0=ygBu&-E{Ra{O+|K_d|j^yd7TjDFJYZ+ZGBG0$k9r!7sDI7{D8-G?mk-p+JcU(&G z!QapOtm(dwXu}N}8*Y{FzXUM-rn)=fsJwB2=TzUyXh3n%mz(fN+kMD+E(Qn=vw@_b zXUSDXb-Ch|af_yA;SXyiT;Uchm29$HX|4?HE?iDGljz24%o1`JV+~l9myD4}yx+nd z3^ zuvtE%$N_pOfkL z=U^?Ts`-NT6!z?2f>=qXit4W0OMHwt*u>A-_zk#3%QUpP9B zBT#hpp_x_2jrPJ%Ivy?Vj&@(IL-Bd{tf1qKqMf7lFrp{%Jwb`WtE+t|Ig?=_Ia$M_v!=(6YVI{W z?lmyvMz!}3U(ZU12zQTf2GZc!o@_f~#$m^Qs6{*?l}_b&u{r5$SpyXz%DuVOtz1u%iCx0XpHy*s>u=Yz`Y6ztlGP zP#8gf893Kf%1AwWn}P%>vHCu zf@Snh=Wv6Gv{AYLHTxA6XNW|G2x z!x&&kMEPoT@6`rN#ph?aBoag)jEutJ!t;w(!SOHfcwJSjB!YlIEXNbE`;bA0>S0?w zmkKe;k~(&RCoiGD&g>b>y(^pHzu03^`gwVRM(iSMDcq&>pS!aOSh?_U^TZM)bYX_9 z`gI(lzb)6N*|GVE!V2F$a&T6yCrUlRE!W2jPl_MF2r(QCGZ@6m2$wA;Z}@KiG||L5 z%-EXa@g2MvZ5HJiZdOs%&h-UJylPb|zsK({o#+u7W(qbx|D=>b9xu$p;Wal;s)DK1 zi;ir~>SVR`rtMQ8_t*}^^4_Er)l$#wv?)5-up0B+2|^fO+AEt1Xy?qV<@T1X=w{zz z!G|K`@y($20XwMgiMTG{06`lW;-NzRlTDCNpm0 
zYznetu>CM{(X4iP63P%pvt??2qFrEsXCB6xzDvohwz_BMMV@mMw+LGa&U5})TF}quF=FDk_9~}1H!*++63B)oqR6uKBMi^jtx;&0q5a!%L z)9^DTb;1vsL&x<&$PVTpN%3d5SJEldB#gCP80E0I$Lq3$t1l%fxT~ZboJi5zGZUeG|2~}-vVCAX*hvN3qS~h zMehJS4r3iR-s>y6={U6H#IM{Nr`onn?#G4`FVHx@ib%H?`4M6CT8L&(tUjK*zC9s^ zwL9Uwu6>!$@Z$YnKjs^P`2g;4vWiSmTX*Efw`#Mx=T;xLd#G(+eVQ)`dwpR`U1scG zw(e)=^Qjr@s>FmuLGt0WG$?y~_#a_58QE>5?L~HYMVAn#ql2w9xm=2gi0BT6MQ|yI zgEfP3OaJw>a0~Xs9(?euGxeL>h57pS4#)LVWd6DhtC?7aX_j;;joJpwIz}gf5`+;> z#v?nL4Iu}1VYv+PFA(Z(l)#gp+mdqM$bJZa{2}YQfjOR&ju{}8v_6cVtk+#RUx zmRN|<8#@_jD9!>gkYu-1!;2iXH^TJ)AW=cFD%=0_=v)A4&~UBK=7x*KzTxWD`<96@ zli-t<++b7ad?)edwFZ{6HJd224P7Ke6VDVK38^B%b87=}>u!J2pT-!Vm7eR~$y?8V z_`9Z)I2dn48VUM2G>0K(#3V10vBUt*Bdqq1B{I_I-u_AB1y?5c_CW{t@nBqE1gzfD ze0LeE^VaQRSDFJER#(hs3AZY~kAy@&IX8Z}cb~xfP{r!fd1034;B=DrxTtuRo#V7G zjn95x7Axhl{`TbD`-%yV^44PK+RUCCsZ@zrT#+WE;bNsttbk0i&TFH)(9t3QK6?)d zNyT_)V}E)wO!J~!<5-qYl7r1*!PR|ccJ+n`PWd^hz4F8oPJJdnfu!98X-05cRc5OB&^lXja+EC#W7c^H>wi%$U2Lz zfGaZBsW6t2p|r&a2}u_N4sUdBExCckdLM^Duadl9F;zUS>PtI6TDm>oufDzF=f9jA z@xAtDc0O{6KFUF>@+~x*i6rP!>Rm{)AZS)g@z^hr*Z}WrE^!Je+VbAd>%U!sT3{Z%lE!-mbJ#Mc^u55O4I@4XN(QPDEuWK0M`aec5DA4mo z$*M35&fy{omtLyG4rY@Rd1iWTd^X4$DG^)I$k@xZ<;yjFBoCC78yy1+T7-n_86kmYk+H5-72Z}ir-B<=&(2iZeqiNL;rD)B-+blaxpsISMKVzDcrX(p0r{mq0s9yb;o}a5Mf_L1wG4rdzcyi#FUt{Vlsj=)l?Y4FH=DHDf zP;%Ryy+Eve8zg(|wY;U}3^|T$WaW0Qb28ne!t1%c)P$e%U#2WvUOAt7?(5wCZn?c^ zEVr&>xgDN9GD6~jZHAIx>~%KYQmv<+abt;!YI~hWiF#iL6n8IqyPcOe8{baru2Ftr zk9>%PRF-Gno4w<{v*T%_I|pqjy;)EDetXP!AmDskKL=fy7@yO+UGiY%U#K&@zVba+ zFkTBKPP^`Hjl*nkg8x23M4YbipHT-|ms@E~W{31AA!`;$g^-(tQm9YFQSjG6Iin?2 z%38!ok&sj~HjmF0NCs78+0aP(mG}$257cVR^NOVjYMtk2N7Jsh<`cFWwhEY%krK-| z?mJkPacaxZtujhUMZfz)LTco^nxWoroJr3)yz3w%;pxR8TeZ8rr-(iZHaB0UrnsK} z(D`plC4O()8zIZ$h(-^!voco&S#RvxOkN$xeCiHTm+H(&VidL3Amg3Xg}sX0TXnfR zlYFtaGcA)lR-z>?MH~_NjcK2M5gj(e90RG4y-K$Hvjz%^*3fxtUnY{iG_}_r(-o!b zUv5Gcu2+j^ttB~-p^?EMHJD*0AQAx&!@c%%qqMl{<;rs$aM?NQ-0&|r z^yG-|#-`>TOoEvs(quYV2xGbcO!o$ok1^^S(=JtMFYI!>*s-4A7L=b%9A{sC*66Ox 
zW|-@DL_$J}h0j!!o-U$I+_pp|-3*r#q+PPfq1(jt0Sp>z@JdL(?s)=kM?&I)qbhbY zsEo$oI^O;M%tof*sgWPG(8yy3o`h7DP;`+jB)4`^su^%c&`3>>na817dn>v%55O;* zAk{hAYTt;`T*c(VtOD>qNF4RQ$pRvWKg2k=Qsl1y34~D5uTSj#CsNe0LX)^6~hn zT=`cFp75@pEvn27)RKMTcgrvQhs+-PZZ)uUZe}|)=6`VEXYMy5$dAzdJCNd7sGqZC3$#y8`^$&>> zX274XAfxfY6wHQgOk7}rA^PRHOC4YzKlQ+8#C-z5)t@nYy<%Y5naWm{vZZHI>g3Qe z>k5bTdXt?40?j11`ipsUI5Rj;AW0fJXTJ`)9Epjk9Eqt6hm27MEw93+gbKb&7P|dV zO`fTbhiJmtCw09VE}GH)y=XpY9lCHkUfTUiLPL3@BC?H6q4pHlKQT)qQbTx>2tw|u zftiT>3Ou0d>ntkj1*%m({tw9**xttKvX9+|R-f^M8zU{)=1NeEviRM%`i$A*vJjiu z+cOg2_t=t1H9u;(-OfHWy}2|XqVfGy`d@BaI z{-KzM;&=KC>1kvI3i#(A@;_$@h~4oV(&z9yMnXb*E&hk71tTGMzrK>RQ)@v5_Dg`ufZviPSX%1&>B?v&`<+Pgu47RqDZjZR`I_<_;2tLBUS2mlH#ZK3hD8pBMcE7? zE{0~O^GhGg!Gvj6^}u3o3-OWINo~ovJ7G6tQL~=Py<5wqr8Yeys}YI+g8;c#tgeXb zUFwko4WGSlKzfNpy*97Qo4+@=pKTIYXcDL?D^sp1^Vtl{k`}7^?@>F3bN>xf-KNc6W!Fa|*OeI{8D1d27rki`TN*e*RIUS}^Wt z>*C43`W0|&crRQ2;N$}5fnJSZtY*Hmv*>YZ@rpOi^jnSH&?Ez`Nsk&Cqqc2qsEq7n z9W}3cU6SF1Ca)LM)`4HFv`n%^;A|FMpj!&tG!93%W<9r6V%3+f#Et-k-DAJlx8=uG z;>9QCP1%malZ{T+e>qcmG*+aJxzgR*Hdn1C3s^hClLQcP$w;BT}X=w$Mm+Z%xTLvOmRww&?h!p7Y38yLZ8p60diT$X}+62y(V7n-P9fWSb zuNGAtMPY1Y1hqh@?Y4Et4>rUHmAvAxK4SaF-e`R*&4b!1nD?5w#xnY)1J3l`h3sIPwc+dzEWS7j zpCpA>hxfXjg9Mfc7U}J{vYc{iRlRkB0q2_D+u4_$JU)TN%|?PV*9Qh0T#pb?;_6x| zxR(%w@ZAY~Erj>_l+(5>%k2Wzw;o5_a2x8t`|VE7WmL9^*`5iRvdYn)h6SkKkrTb@ zC{e<}2X`uYajZXf%>awV6L8@F&K42Oc64^kl584>&(<+&kxEXSUNrR=A8%F2h*)Ya zL@^?(bWS35g%-Qj6W?;W9c>hA)g~r^ryx}+7dZ&e2>K~vJrBAp*cbG=GyWQ?OYyo`5ss3_VGD*ZV_mbtXwQTA6Jy zd#YnjpXy=ivEqzLKi5xNKz!y^ARGx%H3^Q-h8J#r*$?pTP@Q1iFOJy1Ki*-d!D8z} zu`XPAJvPKjY+b+6y*{us z4ptt$GOq2iidT{HUNXtFdy@^SK&SQgV*;W;ra`rP7vG99sA=_2eL5c|o@(-t1)X9{%$!Bf5wnAB<&)?;)41Iew<|Ie(j}@j>7L}M2>34Yp7#VrO%BV9;4+se zC*-d>V?i1`S5fWcR+T1?QslWOHougZmSvWeD5_m)mJlXd-A=>|o{Em=1!5f%&^0(| z)={ecFlCkmi#Rr5=-FmuEfI(v0*~W;Be!E+Ut*dVDye-ak;j?f!D0SDZ;<^^LV8pW zNIV_Hl>lG9Qk2mMEB?sC_8C6sNTYm0GtC}y6;_`h@2RC4v)A(F4 zPW?Se;W38>;0=uSn}ZFL!x9Y#?Zd&wNyU#L1Qh%gP}dQu;N!TUB1yM0-5Q6D+5Qe1 
z%yrtV6VBi#-%DO*@MgdtJ}mnQoGZ@C+ISC+g4j;cppHxfp$uJHNAFU6VvEU%g|G~`=rPM9as(*y&Vi++ENO&a$J#4ne8d41GsHj$DnvW2UN78N5gd-+ue zbL^3Y^v#JpEUIKDP3&eT-Ly=1aaXUjl&EtFRZJc1tN2K1u2#mnoRw%@>9Ag-)=0^! z+W~N>65{9(14=pB8giZ^)5VrmWE_IW0=A3Gbs^c^#Vt`j+iVVz|Ijzq+H9vi(@cX{ ztCpS}yyeiexEf={&oHFP*s$ULJ^k^Kl!tq)<`fd@4%-P50%>_(L#KNl-HA0 z+K)U(%AGBC1tD&nBE}b)okXFDO{ao;`FI4k%v$`*My6GlKFvp~?*_?E$7T9yZvnei zcFPwG+Q@TzzTKup;19^gjeZf9?8zV1OQhs}<(rEu>1m#b8PvGM82ipddp2j($s}<= za&t*%5sNl4yZqID&r&dZ$kIRPlY!uZM4V!V=RAOXBMDv+Yi_)pKZBX}SJpVxY z2tL|0A5|)uTqY3>Bc7`?SFy)&P|RXYjE>b*-u)r>HuHR;{w-!%X?srG^VwQI(?l6{kK>ZP3$Q+O^AzCBPCPjUZzLBo znE2u`)HHD*UmCZw7kyzQ*6Z02Ys%P(mD4$gf%NFJ?q2O$1WJiaC|+;>p852;j61iM zlkLT-Iy~^NZ~IxfM*pu*@c-Gp70?~OpVh5i_Hmkni;GXq(xT2RW~4!)<{?s{G;p;4 z(a1*&%#e&O=6BDP?&wtCztL$ptpP$Y?~5R#R;`oo;>|&B6AIGAoeLlS-nTR$yHrq- zM$7&*90iEg<);`iBO50B0<#gZ2#hRw+Ht=|j%Znx649H4#TEw|k0%e1VAOZd>3!Vl zejvB4`bl%()kofs#Vby?7+ermibluP_O1SSq|Y)@z{58e{e&3&N|C}p(@DbMq^m|q zr%1!*rF=@oA!+@~gIsRp-0*#=noE}H&nt;7RJvpCJmu{C^EuyDA`RTMlO;U@Sx&xz zB_9Y0YaN3V^==&$s(GSm0g;w_s6MDwlHhxk?rGzv~s}vT<7f6k#!$Pyr zN@9W*!bAxCi3kc~J7>dQ@tYjR?~|?3WkJ4E0WUGX)4>Y)bLE|{YM=t*$mzMfrltuFev!U8<`6GHijVw!)&De8So2^o7;`?4a>x1fhe|5@$d?j?;mO z+|(~{x8RSL$wDewZ$|2DD|z_bSftW43ntQgQ7Mp-%)bGeR>fi5vKWcaGcgsPA1L{*R_Z=pk5kU7ucPZ%>U!a{-r#U1D<447=)Na`FF~eFg%5S|*TatjGp@5B*BEU9R7%jwSX9z3V@IDVlbo(R76 zyC787atv<4HhaNH#YoC#_sodKJtXshyG4=NeQ2+5mHYH~UDdSa4Z9qn+1fMHggBux z&!4p0^5;KyG1kpj&u)SggqX~p7pBOBDZofDcI!9gq%0%HjHdhgeLiIj3mxXJnw08W zeb7V9`oF48Y?RqTrdz!pH?q`4(q-7ppWNCH%McCQnW-$OeuVUSO9kY~IDfG!Re#<5 zqMw1f_kuLVU@~AaAi^BW9qDtZSr**|AixJoFX?vpAervHm3h&^3`oB^?tJNcz5Fb( zn6@>Cn9<%fd{|L>w+|9iyYPe@eGpX#*UuC99Objq6NG-bPg zb=>|e%QL1(JTo?C4}-(3v|N*s*83bU`NuDj+Q%o^?< zncUo8ASQ_u0kymrgVYxoJ!9Xz6Bb^9t(SE8pJudq-Hr zd)39HpZH#qG+Nt}d7HqNeHeVO*svOZ!MDRQf`*9}zVD7tC4b-5 z_TrzMiiB-$uVoOX!cH@)n``I2ZW?b5=6-(|9`WZqJ#nxc%e9NBQvOavW;pF$ILz&U=hg#^G!(p`jrmEV7o+YyB(~ zLIp*<)@QL+jLhLYI0}u5p*yCiKFkxmIFcbL?0e#|y;&1%AxpAe8?sQp`nY6#PUF&O 
zpiPwjYNxy5l0+@>M3d!Dv=?^d^nBza8NQGGL5%1B*hcZV`7b0aukwwq0Er}f<#pt=s&-;&I!&RFpNhjn=13e}f^lf1lE%(44X zb1U%a%egOgr+NQsTe5Cd!kcfqC)X)0x9fUW|Ky_Er=lN^XUfL!o>g79(p~@AV&=?R~j!`T6hP`EI3K;1p0={86)cK~BzX=kN3X zf8?K(wPoXyS8o@W$5vFox|;I$(pzi0s`OQXOUiElVXy!Acx4*r?Z$TYbN>GWtNM@K zJIlPYRkyg-+HUWTOwXxzj%?fcDqiMhz>ljx949-=-i-Kh_1KBUKX&esw4a``^RJ>* zXwhtT%ei{n#FzEH|C;yZ>+$!u_x#*+`=L8{b9SH^9&27u3G_Gxqxe`L2UJtdxghk z&-wzDFvLvW{chK5u3{n6GSKKy!P&C6w^IFpbD0bcp^A{{2lcLh_DXj@ybtYvc^;(2 M)78&qol`;+0Fu7JivR!s diff --git a/docs/output.md b/docs/output.md index 5a19f952..f3d8a974 100644 --- a/docs/output.md +++ b/docs/output.md @@ -12,46 +12,374 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- [YamlInput](#yamlinput) - +- [Validate TaxID](#validate-taxid) - +- [Filter Fasta](#filter-fasta) - +- [GC Content](#gc-content) - +- [Generate Genome](#generate-genome) - +- [Trailing Ns Check](#trailing-ns-check) - +- [Get KMERS profile](#get-kmers-profile) - +- [Extract Tiara Hits](#extract-tiara-hits) - +- [Mito organellar blast](#mito-organellar-blast) - +- [Plastid organellar blast](#plastid-organellar-blast) - +- [Run FCS Adaptor](#run-fcs-adaptor) - +- [Run FCS-GX](#run-fcs-gx) - +- [Pacbio Barcode Check](#pacbio-barcode-check) - +- [Run Read Coverage](#run-read-coverage) - +- [Run Vecscreen](#run-vecscreen) - +- [Run NT Kraken](#run-nt-kraken) - +- [Nucleotide Diamond Blast](#nucleotide-diamond-blast) - +- [Uniprot Diamond Blast](#uniprot-diamond-blast) - +- [Create BTK dataset](#create-btk-dataset) - +- [Autofilter and check assembly](#autofilter-and-check-assembly) - +- [Generate samplesheet](#generate-samplesheet) - +- [Sanger-TOL BTK](#sanger-tol-btk) - +- [Merge BTK datasets](#merge-btk-datasets) - +- [ASCC Merge 
Tables](#ascc-merge-tables) - - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +### YamlInput
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `NA`
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +YamlInput parses the input yaml into channels for later use in the pipeline. -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +### Validate TaxID -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +
+Output files + +- `NA` + +
+
+Validate TaxID scans through the taxdump to ensure that the input taxid is present in the NCBI taxdump.
+
+
+### Filter Fasta
+
+
+Output files + +- `filter/` + `*filtered.fasta` - A fasta file that has been filtered for sequences below a given threshold. + +
+
+By default scaffolds above 1.9Gb are removed from the assembly, as scaffolds of this size are unlikely to truly have contamination. There is also the issue that scaffolds larger than this use a significant amount of resources, which hinders production environments.
+
+
+### GC Content
+
+
+Output files + +- `gc/` + `*-GC_CONTENT.txt` - A text file describing the GC content of the input genome. + +
+ +Calculating the GC content of the input genome. + + +### Generate Genome + +
+Output files + +- `generate/` + `*.genome` - An index-like file describing the input genome. + +
+ +An index-like file containing the scaffold and scaffold length of the input genome. + + +### Trailing Ns Check + +
+Output files + +- `trailingns/` + `*_trim_Ns` - A text file containing a report of the Ns found in the genome. + +
+ +A text file containing a report of the Ns found in the genome. + + +### Get KMERS profile + +
+Output files + +- `get/` + `*_KMER_COUNTS.csv` - A csv file containing kmers and their counts. + +
+ +A csv file containing kmers and their counts. + + +### Extract Tiara Hits + +
+Output files + +- `tiara/` + `*.{txt,txt.gz}` - A text file containing classifications of potential contaminants. + `log_*.{txt,txt.gz}` - A log of the tiara run. + `*.{fasta,fasta.gz}` - An output fasta file. + +
+ +Tiara ... + + +### Mito Organellar Blast + +
+Output files + +- `blast/` + `*.tsv` - A tsv file containing potential contaminants. + +
+ +A BlastN based subworkflow used on the input genome to filter potential contaminants from the genome. + + +### Chloro Organellar Blast + +
+Output files + +- `blast/` + `*.tsv` - A tsv file containing potential contaminants. + +
+ +A BlastN based subworkflow used on the input genome to filter potential contaminants from the genome. + + +### Run FCS Adaptor + +
+Output files + +- `fcs/` + `*.fcs_adaptor_report.txt` - A text file containing potential adaptor sequences and locations. + `*.cleaned_sequences.fa.gz` - Cleaned fasta file. + `*.fcs_adaptor.log` - Log of the fcs run. + `*.pipeline_args.yaml` - Arguments to FCS Adaptor + `*.skipped_trims.jsonl` - Skipped sequences + +
+ +FCS Adaptor Identified potential locations of retained adaptor sequences from the sequencing run. + + +### Run FCS-GX + +
+Output files + +- `fcs/` + `*out/*.fcs_gx_report.txt` - A text file containing potential contaminant locations. + `out/*.taxonomy.rpt` - Taxonomy report of the potential contaminants. + +
+ +FCS-GX Identified potential locations of contaminant sequences. + + +### Pacbio Barcode Check + +
+Output files + +- `filter/` + `*_filtered.txt` - Text file of barcodes found in the genome. + +
+ +Uses BlastN to identify where given barcode sequences may be in the genome. + + +### Run Read Coverage + +
+Output files + +- `samtools/` + `*.bam` - Aligned BAM file. + `*_average_coverage.txt` - Text file containing the coverage information for the genome + +
+ +Mapping the read data to the input genome and calculating the average coverage across it. + + +### Run Vecscreen + +
+Output files + +- `summarise/` + `*.vecscreen_contamination` - A text file containing potential vector contaminant locations. + +
+ +Vecscreen identifies vector contamination in the input sequence. + + +### Run NT Kraken + +
+Output files + +- `kraken2/` + `*.classified{.,_}*'` - Fastq file containing classified sequence. + `*.unclassified{.,_}*'` - Fastq file containing unclassified sequence. + `*classifiedreads.txt` - A text file containing a report on reads which have been classified. + `*report.txt` - Report of Kraken2 run. +- `get/` + `*txt` - Text file containing lineage information of the reported meta genomic data. + +
+ +Kraken assigns taxonomic labels to metagenomic DNA sequences and optionally outputs the fastq of these data. + + +### Nucleotide Diamond Blast + +
+Output files + +- `diamond/` + `*.txt` - A text file containing the genomic locations of hits and scores. +- `reformat/` + `*text` - A Reformated text file continaing the full genomic location of hits and scores. +- `convert/` + `*.hits` - A file containing all hits above the cutoff. + +
+
+Diamond Blast is a sequence aligner for translated and protein sequences; here it is used to identify contamination using the NCBI database.
+
+
+### Uniprot Diamond Blast
+
+
+Output files + +- `diamond/` + `*.txt` - A text file containing the genomic locations of hits and scores. +- `reformat/` + `*text` - A Reformated text file continaing the full genomic location of hits and scores. +- `convert/` + `*.hits` - A file containing all hits above the cutoff. + +
+
+Diamond Blast is a sequence aligner for translated and protein sequences; here it is used to identify contamination using the Uniprot database.
+
+
+### Create BTK dataset
+
+
+Output files + +- `create/` + `btk_datasets/` - A btk dataset folder containing data compatible with BTK viewer. + `btk_summary_table_full.tsv` - A TSV file summarising the dataset. + +
+ +Create BTK, creates a BTK_dataset folder compatible with BTK viewer. + + +### Autofilter and check assembly + +
+Output files + +- `autofilter/` + `autofiltered.fasta` - The decontaminated input genome. + `ABNORMAL_CHECK.csv` - Combined FCS and Tiara summary of contamination. + `assembly_filtering_removed_sequences.txt` - Sequences deemed contamination and removed from the above assembly. + `fcs-gx_alarm_indicator_file.txt` - Contains text to control the running of Blobtoolkit. + +
+ +Autofilter and check assembly returns a decontaminated genome file as well as summaries of the contamination found. + + +### Generate samplesheet + +
+Output files + +- `generate/` + `*.csv` - A CSV file containing data locations, for use in Blobtoolkit. + +
+ +This produces a CSV containing information on the read data for use in BlobToolKit. + + +### Sanger-TOL BTK + +
+Output files + +- `sanger/` + `*_btk_out/blobtoolkit/${meta.id}*/` - The BTK dataset folder generated by BTK. + `*_btk_out/blobtoolkit/plots/` - The plots for display in BTK Viewer. + `*_btk_out/blobtoolkit/${meta.id}*/summary.json.gz` - The Summary.json file... + `*_btk_out/busco/*` - The BUSCO results returned by BTK. + `*_btk_out/multiqc/*` - The MultiQC results returned by BTK. + `blobtoolkit_pipeline_info` - The pipeline_info folder. + +
+
+Sanger-Tol/BlobToolKit is a Nextflow re-implementation of the Snakemake-based BlobToolKit pipeline and produces interactive plots used to identify true contamination and separate sequence from the main assembly.
+
+
+### Merge BTK datasets
+
+
+Output files + +- `merge/` + `merged_datasets` - A BTK dataset. + `merged_datasets/btk_busco_summary_table_full.tsv` - A TSV file containing a summary of the btk busco results. + +
+ +This module merged the Create_btk_dataset folder with the Sanger-tol BTK dataset to create one unified dataset for use with btk viewer. -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -### MultiQC +### ASCC Merge Tables
Output files -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `ascc/` + `*_contamination_check_merged_table.csv` - .... + `*_contamination_check_merged_table_extended.csv` - .... + `*_phylum_counts_and_coverage.csv` - A CSV report containing information on the hits per phylum and the coverage of the hits..
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +Merge Tables merged the summary reports from a number of modules inorder to create a single set of reports. -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . ### Pipeline information diff --git a/modules/local/create_btk_dataset.nf b/modules/local/create_btk_dataset.nf index a88ac0da..bbad73a8 100644 --- a/modules/local/create_btk_dataset.nf +++ b/modules/local/create_btk_dataset.nf @@ -26,7 +26,7 @@ process CREATE_BTK_DATASET { output: tuple val(meta), path("btk_datasets"), emit: btk_datasets tuple val(meta), path("btk_summary_table_full.tsv"), emit: create_summary - path "versions.yml", emit: versions + path "versions.yml", emit: versions when: diff --git a/modules/local/filter_fasta.nf b/modules/local/filter_fasta.nf index 20f4f07a..2aec22ce 100644 --- a/modules/local/filter_fasta.nf +++ b/modules/local/filter_fasta.nf @@ -49,4 +49,4 @@ process FILTER_FASTA { filter_fasta_by_length: \$(filter_fasta_by_length.py -v) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index f1dd1523..3f012f07 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -17,13 +17,13 @@ process SANGER_TOL_BTK { val gca_accession output: - tuple val(meta), path("${meta.id}_btk_out/blobtoolkit/draft"), emit: dataset - path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots - path("${meta.id}_btk_out/blobtoolkit/draft/summary.json.gz"), emit: summary_json - path("${meta.id}_btk_out/busco"), emit: 
busco_data - path("${meta.id}_btk_out/multiqc"), emit: multiqc_report - path("blobtoolkit_pipeline_info"), emit: pipeline_info - path "versions.yml", emit: versions + tuple val(meta), path("${meta.id}_btk_out/blobtoolkit/${meta.id}*"), emit: dataset + path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots + path("${meta.id}_btk_out/blobtoolkit/${meta.id}*/summary.json.gz"), emit: summary_json + path("${meta.id}_btk_out/busco"), emit: busco_data + path("${meta.id}_btk_out/multiqc"), emit: multiqc_report + path("blobtoolkit_pipeline_info"), emit: pipeline_info + path "versions.yml", emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" @@ -38,7 +38,8 @@ process SANGER_TOL_BTK { // Seems to be an issue where a nested pipeline can't see the files in the same directory // Running realpath gets around this but the files copied into the folder are - // now just wasted space. + // now just wasted space. Should be fixed with using Mahesh's method of nesting but + // this is proving a bit complicated with BTK // outdir should be an arg diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index 6d21ea58..935e5033 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -148,7 +148,7 @@ workflow YAML_INPUT { ncbi_taxonomy_path = group.ncbi_taxonomy_path ncbi_rankedlineage_path = group.ncbi_rankedlineage_path busco_lineages_folder = group.busco_lineages_folder - busco_lineages = group.busco_lineages + busco_lineages = group.busco_lineages fcs_gx_database_path = group.fcs_gx_database_path diamond_uniprot_database_path = group.diamond_uniprot_database_path diamond_nr_database_path = group.diamond_nr_database_path diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 9785dbae..48b4dc83 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -326,7 +326,7 @@ workflow ASCC { } .collect() .set { - ch_barcode + ch_barcode // Not in use } ch_versions = ch_versions.mix(PACBIO_BARCODE_CHECK.out.versions) 
From 00b375a53ff00fd0428804236bbd1387f176c6b3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 8 Aug 2024 15:33:14 +0100 Subject: [PATCH 107/117] Prettier linting --- docs/output.md | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/docs/output.md b/docs/output.md index f3d8a974..99d0d6b2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -49,7 +49,6 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d YamlInput parses the input yaml into channels for later use in the pipeline. - ### Validate TaxID
@@ -61,7 +60,6 @@ YamlInput parses the input yaml into channels for later use in the pipeline. Validate TaxID scans through the taxdump to ensure that the input taxid is present in the nxbi taxdump. - ### Filter Fasta
@@ -74,7 +72,6 @@ Validate TaxID scans through the taxdump to ensure that the input taxid is prese By default scaffolds above 1.9Gb are removed from the assembly, as scaffolds of this size are unlikely to truely have contamination. There is also the issue that scaffolds larger than this use a significant amount of resources which hinders production environments. - ### GC Content
@@ -87,7 +84,6 @@ By default scaffolds above 1.9Gb are removed from the assembly, as scaffolds of Calculating the GC content of the input genome. - ### Generate Genome
@@ -100,7 +96,6 @@ Calculating the GC content of the input genome. An index-like file containing the scaffold and scaffold length of the input genome. - ### Trailing Ns Check
@@ -113,7 +108,6 @@ An index-like file containing the scaffold and scaffold length of the input geno A text file containing a report of the Ns found in the genome. - ### Get KMERS profile
@@ -126,7 +120,6 @@ A text file containing a report of the Ns found in the genome. A csv file containing kmers and their counts. - ### Extract Tiara Hits
@@ -141,7 +134,6 @@ A csv file containing kmers and their counts. Tiara ... - ### Mito Organellar Blast
@@ -154,7 +146,6 @@ Tiara ... A BlastN based subworkflow used on the input genome to filter potential contaminants from the genome. - ### Chloro Organellar Blast
@@ -167,7 +158,6 @@ A BlastN based subworkflow used on the input genome to filter potential contamin A BlastN based subworkflow used on the input genome to filter potential contaminants from the genome. - ### Run FCS Adaptor
@@ -184,7 +174,6 @@ A BlastN based subworkflow used on the input genome to filter potential contamin FCS Adaptor Identified potential locations of retained adaptor sequences from the sequencing run. - ### Run FCS-GX
@@ -198,7 +187,6 @@ FCS Adaptor Identified potential locations of retained adaptor sequences from th FCS-GX Identified potential locations of contaminant sequences. - ### Pacbio Barcode Check
@@ -211,7 +199,6 @@ FCS-GX Identified potential locations of contaminant sequences. Uses BlastN to identify where given barcode sequences may be in the genome. - ### Run Read Coverage
@@ -225,7 +212,6 @@ Uses BlastN to identify where given barcode sequences may be in the genome. Mapping the read data to the input genome and calculating the average coverage across it. - ### Run Vecscreen
@@ -238,7 +224,6 @@ Mapping the read data to the input genome and calculating the average coverage a Vecscreen identifies vector contamination in the input sequence. - ### Run NT Kraken
@@ -256,7 +241,6 @@ Vecscreen identifies vector contamination in the input sequence. Kraken assigns taxonomic labels to metagenomic DNA sequences and optionally outputs the fastq of these data. - ### Nucleotide Diamond Blast
@@ -273,7 +257,6 @@ Kraken assigns taxonomic labels to metagenomic DNA sequences and optionally outp Diamond Blast is a sequence aligner for translated and protein sequences, here it is used do identify contamination usin the NCBI db - ### Uniprot Diamond Blast
@@ -290,7 +273,6 @@ Diamond Blast is a sequence aligner for translated and protein sequences, here i Diamond Blast is a sequence aligner for translated and protein sequences, here it is used do identify contamination usin the Uniprot db - ### Create BTK dataset
@@ -304,7 +286,6 @@ Diamond Blast is a sequence aligner for translated and protein sequences, here i Create BTK, creates a BTK_dataset folder compatible with BTK viewer. - ### Autofilter and check assembly
@@ -320,7 +301,6 @@ Create BTK, creates a BTK_dataset folder compatible with BTK viewer. Autofilter and check assembly returns a decontaminated genome file as well as summaries of the contamination found. - ### Generate samplesheet
@@ -333,7 +313,6 @@ Autofilter and check assembly returns a decontaminated genome file as well as su This produces a CSV containing information on the read data for use in BlobToolKit. - ### Sanger-TOL BTK
@@ -351,7 +330,6 @@ This produces a CSV containing information on the read data for use in BlobToolK Sanger-Tol/BlobToolKit is a Nextflow re-implementation of the snakemake based BlobToolKit pipeline and produces interactive plots used to identify true contamination and seperate sequence from the main assembly. - ### Merge BTK datasets
@@ -365,7 +343,6 @@ Sanger-Tol/BlobToolKit is a Nextflow re-implementation of the snakemake based Bl This module merged the Create_btk_dataset folder with the Sanger-tol BTK dataset to create one unified dataset for use with btk viewer. - ### ASCC Merge Tables
@@ -380,7 +357,6 @@ This module merged the Create_btk_dataset folder with the Sanger-tol BTK dataset Merge Tables merged the summary reports from a number of modules inorder to create a single set of reports. - ### Pipeline information
From 646ae910efa5aa3bc9c657d90f22be53e85515d5 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 9 Aug 2024 10:07:30 +0100 Subject: [PATCH 108/117] Update to the sanger-tol module to remove the yaml flag which is now depreciated --- assets/test.yaml | 2 +- modules/local/sanger_tol_btk.nf | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/assets/test.yaml b/assets/test.yaml index 23766ae1..3922933c 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -23,7 +23,7 @@ busco_lineages: "diptera_odb10,insecta_odb10" fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb/ vecscreen_database_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/vecscreen/ diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd -diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd +diamond_nr_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd seqkit: sliding: 100000 window: 6000 diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf index 3f012f07..98c7e780 100644 --- a/modules/local/sanger_tol_btk.nf +++ b/modules/local/sanger_tol_btk.nf @@ -19,7 +19,7 @@ process SANGER_TOL_BTK { output: tuple val(meta), path("${meta.id}_btk_out/blobtoolkit/${meta.id}*"), emit: dataset path("${meta.id}_btk_out/blobtoolkit/plots"), emit: plots - path("${meta.id}_btk_out/blobtoolkit/${meta.id}*/summary.json.gz"), emit: summary_json + path("${meta.id}_btk_out/blobtoolkit/${meta.id}*/summary.json.gz"), emit: summary_json path("${meta.id}_btk_out/busco"), emit: busco_data path("${meta.id}_btk_out/multiqc"), emit: multiqc_report path("blobtoolkit_pipeline_info"), emit: pipeline_info @@ -53,7 +53,6 @@ process SANGER_TOL_BTK { --input "\$(realpath $samplesheet_csv)" \\ --outdir ${prefix}_btk_out \\ 
--fasta "\$(realpath REFERENCE.fa)" \\ - --yaml "\$(realpath BTK.yaml)" \\ --busco_lineages $busco_lineages \\ --taxon $taxon \\ --taxdump "\$(realpath $tax_dump)" \\ @@ -78,23 +77,23 @@ process SANGER_TOL_BTK { def pipeline_version = task.ext.version ?: "draft_assemblies" """ - mkdir -p ${prefix}_btk_out/blobtoolkit/$gca_accession - touch ${prefix}_btk_out/blobtoolkit/$gca_accession/test.json.gz + mkdir -p ${meta.id}_btk_out/blobtoolkit/${meta.id}_out + touch ${meta.id}_btk_out/blobtoolkit/${meta.id}_out/test.json.gz - mkdir ${prefix}_btk_out/blobtoolkit/plots - touch ${prefix}_btk_out/blobtoolkit/plots/test.png + mkdir ${meta.id}_btk_out/blobtoolkit/plots + touch ${meta.id}_btk_out/blobtoolkit/plots/test.png - mkdir ${prefix}_btk_out/busco - touch ${prefix}_btk_out/busco/test.batch_summary.txt - touch ${prefix}_btk_out/busco/test.fasta.txt - touch ${prefix}_btk_out/busco/test.json + mkdir ${meta.id}_btk_out/busco + touch ${meta.id}_btk_out/busco/test.batch_summary.txt + touch ${meta.id}_btk_out/busco/test.fasta.txt + touch ${meta.id}_btk_out/busco/test.json - mkdir ${prefix}_btk_out/multiqc - mkdir ${prefix}_btk_out/multiqc/multiqc_data - mkdir ${prefix}_btk_out/multiqc/multiqc_plots - touch ${prefix}_btk_out/multiqc/multiqc_report.html + mkdir ${meta.id}_btk_out/multiqc + mkdir ${meta.id}_btk_out/multiqc/multiqc_data + mkdir ${meta.id}_btk_out/multiqc/multiqc_plots + touch ${meta.id}_btk_out/multiqc/multiqc_report.html - mv ${prefix}_btk_out/pipeline_info blobtoolkit_pipeline_info + mv ${meta.id}_btk_out/pipeline_info blobtoolkit_pipeline_info cat <<-END_VERSIONS > versions.yml "${task.process}": From 124a50a5266b8bbed6897388fe7ed37ff3c10691 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 9 Aug 2024 10:58:00 +0100 Subject: [PATCH 109/117] closes #57 filters the output to that directly needed for analysis --- conf/modules.config | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/conf/modules.config 
b/conf/modules.config index 0b12ec3e..f5dc119d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,18 +12,30 @@ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - withName: SANGER_TOL_BTK { ext.args = "--blastx_outext 'txt'" ext.executor = "bsub -Is -tty -e test.e -o test.log -n 2 -q oversubscribed -M1400 -R'select[mem>1400] rusage[mem=1400] span[hosts=1]'" ext.profiles = "singularity,sanger" ext.get_versions = "lsid | head -n1 | cut -d ',' -f 1" ext.version = "draft_assemblies" + publishDir = [ + path: { "${params.outdir}/sanger-tol-btk" }, + mode: params.publish_dir_mode, + ] + } + + withName: "AUTOFILTER_AND_CHECK_ASSEMBLY|CREATE_BTK_DATASET|ORGANELLE_CONTAMINATION_RECOMMENDATIONS|FILTER_BARCODE|SUMMARISE_VECSCREEN_OUTPUT|MERGE_BTK_DATASETS" { + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + + withName: ASCC_MERGE_TABLES { + publishDir = [ + path: { "${params.outdir}/ASCC-main-output" }, + mode: params.publish_dir_mode + ] } withName: FILTER_FASTA { @@ -108,6 +120,13 @@ process { ext.prefix = { "${meta.id}_euk" } } + withName: "FCS_FCSADAPTOR_EUK|FCS_FCSADAPTOR_PROK" { + publishDir = [ + path: { "${params.outdir}/FCS-adaptor" }, + mode: params.publish_dir_mode, + ] + } + withName: SED_SED { ext.prefix = { "${meta.id}_fixed" } ext.args = " -e '/>/s/ //g' " From 1db642973b4d76094b30cded33b47cfc1d7a5207 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 9 Aug 2024 14:32:55 +0100 Subject: [PATCH 110/117] Addition of 2/3 indicator files needed for integration into the current sanger automation workflows --- modules/local/autofiltering.nf | 7 +++++++ nextflow.config | 9 +-------- workflows/ascc.nf | 12 ++++++++++++ 3 files changed, 20 insertions(+), 8 
deletions(-) diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index 00dbc98d..e86f0e95 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -18,6 +18,7 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { tuple val(meta), path("ABNORMAL_CHECK.csv"), emit: fcs_tiara_summary tuple val(meta), path("assembly_filtering_removed_sequences.txt"), emit: removed_seqs path("fcs-gx_alarm_indicator_file.txt"), emit: alarm_file + path("autofiltering_done_indicator_file.txt"), emit: indicator_file path "versions.yml", emit: versions script: @@ -35,6 +36,11 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { $reference \\ ABNORMAL_CHECK.csv + # The below indicator file is used in Sanger-Tol to allow for other processes + # to begin once generated. This allows us to speed up the overall flow of the + # Tol-engine + touch autofiltering_done_indicator_file.txt + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') @@ -48,6 +54,7 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { touch ABNORMAL_CHECK.csv touch assembly_filtering_removed_sequences.txt touch fcs-gx_alarm_indicator_file.txt + touch autofiltering_done_indicator_file.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 01c68b33..4935682a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,16 +12,9 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null - outdir = "results" + outdir = "${params.outdir}" tracedir = "${params.outdir}/pipeline_info/" - // MultiQC options - multiqc_config = null - multiqc_title = null - multiqc_logo = null - max_multiqc_email_size = '25.MB' - multiqc_methods_description = null - // Boilerplate options outdir = null publish_dir_mode = 'copy' diff --git a/workflows/ascc.nf b/workflows/ascc.nf index 48b4dc83..a6168ab1 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -581,6 +581,18 @@ 
workflow.onComplete { } // TreeValProject.summary(workflow, reference_tuple, summary_params, projectDir) + if (workflow.success) { + // Generate a pipeline completion indicator file, for use in + // Sanger-ToL automation + def newFile = new File("${params.outdir}/pipeline_run_done_indicator_file.txt") + newFile.createNewFile() + } + +} + +workflow.onError = { + def newFile = new File("${params.outdir}/pipeline_run_ERROR_indicator_file.txt") + newFile.createNewFile() } /* From 0bbd384677e051b753430397976290a0b0d4ef2c Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 9 Aug 2024 14:34:21 +0100 Subject: [PATCH 111/117] Addition of 2/3 indicator files needed for integration into the current sanger automation workflows --- modules/local/autofiltering.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index e86f0e95..32cd931c 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -18,7 +18,7 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { tuple val(meta), path("ABNORMAL_CHECK.csv"), emit: fcs_tiara_summary tuple val(meta), path("assembly_filtering_removed_sequences.txt"), emit: removed_seqs path("fcs-gx_alarm_indicator_file.txt"), emit: alarm_file - path("autofiltering_done_indicator_file.txt"), emit: indicator_file + //path("autofiltering_done_indicator_file.txt"), emit: indicator_file path "versions.yml", emit: versions script: @@ -39,7 +39,7 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { # The below indicator file is used in Sanger-Tol to allow for other processes # to begin once generated. 
This allows us to speed up the overall flow of the # Tol-engine - touch autofiltering_done_indicator_file.txt + touch ${params.outdir}/autofiltering_done_indicator_file.txt cat <<-END_VERSIONS > versions.yml "${task.process}": From dfdbb8d729b7c4e568932a9d74934e95b9797a40 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 9 Aug 2024 15:10:48 +0100 Subject: [PATCH 112/117] Addition of indicator files, these are saved to the main outdir --- conf/modules.config | 15 +++++++++++++++ modules/local/autofiltering.nf | 4 ++-- nextflow.config | 7 +++++++ workflows/ascc.nf | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f5dc119d..530430c3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -31,6 +31,21 @@ process { ] } + withName: AUTOFILTER_AND_CHECK_ASSEMBLY { + publishDir = [ + path: { "${params.outdir}/" }, + mode: params.publish_dir_mode, + pattern: "autofiltering_done_indicator_file.txt" + ] + } + + withName: GenIndicator { + publishDir = [ + path: { "${params.outdir}/" }, + mode: params.publish_dir_mode + ] + } + withName: ASCC_MERGE_TABLES { publishDir = [ path: { "${params.outdir}/ASCC-main-output" }, diff --git a/modules/local/autofiltering.nf b/modules/local/autofiltering.nf index 32cd931c..e86f0e95 100644 --- a/modules/local/autofiltering.nf +++ b/modules/local/autofiltering.nf @@ -18,7 +18,7 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { tuple val(meta), path("ABNORMAL_CHECK.csv"), emit: fcs_tiara_summary tuple val(meta), path("assembly_filtering_removed_sequences.txt"), emit: removed_seqs path("fcs-gx_alarm_indicator_file.txt"), emit: alarm_file - //path("autofiltering_done_indicator_file.txt"), emit: indicator_file + path("autofiltering_done_indicator_file.txt"), emit: indicator_file path "versions.yml", emit: versions script: @@ -39,7 +39,7 @@ process AUTOFILTER_AND_CHECK_ASSEMBLY { # The below indicator file is used in Sanger-Tol to allow for other 
processes # to begin once generated. This allows us to speed up the overall flow of the # Tol-engine - touch ${params.outdir}/autofiltering_done_indicator_file.txt + touch autofiltering_done_indicator_file.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 4935682a..f7e11c79 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,13 @@ params { outdir = "${params.outdir}" tracedir = "${params.outdir}/pipeline_info/" + // MultiQC options + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null + // Boilerplate options outdir = null publish_dir_mode = 'copy' diff --git a/workflows/ascc.nf b/workflows/ascc.nf index a6168ab1..2fb750a0 100644 --- a/workflows/ascc.nf +++ b/workflows/ascc.nf @@ -451,6 +451,7 @@ workflow ASCC { YAML_INPUT.out.ncbi_rankedlineage_path ) ch_autofilt_assem = AUTOFILTER_AND_CHECK_ASSEMBLY.out.decontaminated_assembly.map{it[1]} + ch_autofilt_indicator = AUTOFILTER_AND_CHECK_ASSEMBLY.out.indicator_file AUTOFILTER_AND_CHECK_ASSEMBLY.out.alarm_file .map { file -> file.text.trim() } @@ -464,6 +465,7 @@ workflow ASCC { ch_versions = ch_versions.mix(AUTOFILTER_AND_CHECK_ASSEMBLY.out.versions) } else { ch_autofilt_assem = [] + ch_autofilt_indicator = [] } // @@ -537,6 +539,15 @@ workflow ASCC { ch_versions = ch_versions.mix(ASCC_MERGE_TABLES.out.versions) + GenIndicator ( + ch_autofilt_indicator, + ch_fcsgx, + ch_fcsadapt, + ch_tiara, + ch_vecscreen, + ch_barcode, + ) + // // SUBWORKFLOW: Collates version data from prior subworflows @@ -565,6 +576,27 @@ process GrabFiles { "true" } +process GenIndicator { + label 'process_tiny' + + tag "Generating Phase 1 Indicator" + executor 'local' + + input: + val(a) + val(b) + val(c) + val(d) + val(e) + val(f) + + output: + path("decon_first_stage_done_indicator_file.txt") + + script: + "touch decon_first_stage_done_indicator_file.txt" +} + /* 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ COMPLETION EMAIL AND SUMMARY From fe1a8478bb9e329565cdf02029bcba6f4634eb82 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 9 Aug 2024 15:13:17 +0100 Subject: [PATCH 113/117] Addition of indicator files, these are saved to the main outdir --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index f7e11c79..68647829 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null - outdir = "${params.outdir}" + outdir = null tracedir = "${params.outdir}/pipeline_info/" // MultiQC options From a150b47d8bc1fc6d27dcce253cb52c650e3d9475 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 9 Aug 2024 15:15:27 +0100 Subject: [PATCH 114/117] Addition of indicator files, these are saved to the main outdir --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 68647829..1b6e09bc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null - outdir = null + outdir = results tracedir = "${params.outdir}/pipeline_info/" // MultiQC options From ddf15e2957f319e3d4bd59514a8ad6074bd04d46 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 9 Aug 2024 15:18:22 +0100 Subject: [PATCH 115/117] Addition of indicator files, these are saved to the main outdir --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 1b6e09bc..01c68b33 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null - outdir = results + outdir = "results" tracedir = "${params.outdir}/pipeline_info/" // 
MultiQC options From 7be5861dd11e2c9796cc67ddbb3c7315394d0b5d Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 14 Aug 2024 13:27:09 +0100 Subject: [PATCH 116/117] Adding 13 subworkflow images --- docs/images/ascc-1.0.0-diamond-blast.drawio.png | Bin 0 -> 74783 bytes docs/images/ascc-1.0.0-extract-blast.drawio.png | Bin 0 -> 64729 bytes docs/images/ascc-1.0.0-extract-tiara.drawio.png | Bin 0 -> 32510 bytes docs/images/ascc-1.0.0-fcsadaptor.drawio.png | Bin 0 -> 39487 bytes .../ascc-1.0.0-generate-genome.drawio.png | Bin 0 -> 43539 bytes .../ascc-1.0.0-get-kmer-profile.drawio.png | Bin 0 -> 42529 bytes docs/images/ascc-1.0.0-kraken.drawio.png | Bin 0 -> 38524 bytes docs/images/ascc-1.0.0-mapping.drawio.png | Bin 0 -> 42349 bytes docs/images/ascc-1.0.0-organellar.drawio.png | Bin 0 -> 83209 bytes docs/images/ascc-1.0.0-pacbio-check.drawio.png | Bin 0 -> 70716 bytes docs/images/ascc-1.0.0-read-coverage.drawio.png | Bin 0 -> 69388 bytes docs/images/ascc-1.0.0-trailing-ns.drawio.png | Bin 0 -> 33665 bytes docs/images/ascc-1.0.0-vecscreen.drawio.png | Bin 0 -> 67792 bytes 13 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/images/ascc-1.0.0-diamond-blast.drawio.png create mode 100644 docs/images/ascc-1.0.0-extract-blast.drawio.png create mode 100644 docs/images/ascc-1.0.0-extract-tiara.drawio.png create mode 100644 docs/images/ascc-1.0.0-fcsadaptor.drawio.png create mode 100644 docs/images/ascc-1.0.0-generate-genome.drawio.png create mode 100644 docs/images/ascc-1.0.0-get-kmer-profile.drawio.png create mode 100644 docs/images/ascc-1.0.0-kraken.drawio.png create mode 100644 docs/images/ascc-1.0.0-mapping.drawio.png create mode 100644 docs/images/ascc-1.0.0-organellar.drawio.png create mode 100644 docs/images/ascc-1.0.0-pacbio-check.drawio.png create mode 100644 docs/images/ascc-1.0.0-read-coverage.drawio.png create mode 100644 docs/images/ascc-1.0.0-trailing-ns.drawio.png create mode 100644 docs/images/ascc-1.0.0-vecscreen.drawio.png diff --git 
a/docs/images/ascc-1.0.0-diamond-blast.drawio.png b/docs/images/ascc-1.0.0-diamond-blast.drawio.png new file mode 100644 index 0000000000000000000000000000000000000000..e4ba9f9d9b245587c50a6373745ee56e8ba4b4cd GIT binary patch literal 74783 zcmeEv1wd5my7qvegmgEI14_ftNC^WBDItw?N_Pt)jfiw3pdw1Qv?4XqN=Pe8C?KUE zQvVuaZ)0!v`Op8~yU#uMAi}I~ed~)io;SXCX}F5AEDjbW76=5wk%Pn3K%mnI5C}yV z<22Bszfn#H{6cY2la&M&^-zBRfy8uOrL|q{JT0tk%|VPjQiq=yp(=58(W&&9}TGF;B9SdYs?6h;p5-}hPt3+Y-(-q>fmC<$Ri0{%h|h{+XBCVX5i;# zb>N3K@Rys*kekm?fEoCB$JjIvYD$DLI%OP}j`!a9Ca_AFxE^D$Tr(TKKt+rm=KB zp6FnWP%dPHy0!Q5UXEMctj)|_4j(>x($&Gi*45ha$3{~JdwX-!BPlrQXzc9l;PGQO z3kTby?H%=T1a^Kn`q6~Q2K8TPifq-gHgg5U2r0LMP=UjJA=}~R)|OVsGjsC`9p14s zKJI)pu#1(knS;l7bA8X2LqZ;t)7ilR81Fl-eSiD7lbxrOxh+zbf1)EGa{u?w{zeVw z-&E;n+5gmVM_hL|wskwA?jccJT)mEk)!g3f5|TdvLF^sufnJhUu6DM-6`#33hLmFD)!}Z;&5mW~STc>BEddZa z5D@^T4kX{v_;->XG@Z=X-?=Mm4uA@Vd zHQQO6nIQvRNoR9_kH#hkK`%0JJaVRh2$4^ckq_yY8@mD7sl&9#@3HPrVJeV8v;DaV z8Rs5F)5zyej&qM>5Ll)8Q55(8F*^M{E)@FFCFT(V5_SjD|6Kqp06psTeFV$Rb>tiz zJDuN0t4L?^uTvU8pQC8&WOoo5(f*V;ISO8XP|=?x0GT~-{0GT=l!`jyIzWIUyMPpu z|5EZFr=E@-0xp50oqb1sAW3q-k?*N5aD-8R3-ON_e-iP5%-KQa)84@qsZ$e|QM282EcLdtLQ%#fKl0BE*Ty3q9silLk*38)1<}&b*(WpOAzzqcKAY=#Oo^1z_Zifl&_({?YD#o&;$AW#V7D{se44D!^Z;){lu;q)vQqnHZsfwMKsZ zy}ELQemqgmChpkYeh+!aZGR6NkDRR^Fz+Wl z1dek4_pp(V_kScCj}YWLHoBNQ*&xq}4`JEN+WlupKEnM!G$K$w6K-S@`PjT2KkH&^ zZDwt6c|7Vb2>zQ5_+h@EJcLgm+u@2JvLQ+TOCrf=`fDdn}ADkiq zM*Pt5L-~-VpATvLfgi|ce>MNW4Wt7AbUk(ffRD#c0N7Y0;&+&IoWT1Yl8)Q{9!UD5gN&cN_kYJ4=kF_Cj!U`@3yluW zYJNNI{P(MZ;Xd&8kt{wAW`7G@k0tPsIR8WmGE$U*2=5Q7tp$0G@ay+UkF?`>(*Fr% zUdO8XLqPqrw)`(jf%9_-{v0oG|4%@@oB8 zA&%oi#D5*G9_jrLTs@3DkMfBBscreA($nAKcP=wz{mTEI5|;z$`hU?2?&AuPW9JZh ze2VxT`U(93`tfodzj=J@MgCV*Tpqkf@Fy>`9=~Jo`+LLR6XM`5AQzUvo8d?A9UVPz zkcd3!YDcvj0s_&42A%!LJC`qA+OsHV0UlLd)lM2*mir7 zEuxK-kMbsERH?Kf@raIuX=H?|nPrTXZE4Hx(#;}?lB_`al#KPemLD7DCNFOGEOwR6 zZ2F2dE_aTrH_eUvY>GB*i$tSlT{?H;EQ&NfL<03MzWZn}!Y<@g<|+@-kk-AXPLN+P zB}UY4%e>i<3qXUP@ST;0p9LRMA{})|B}VLEtO{e^0iF0Qp#_$RDPS$MpqiD8DZp9L 
zPB?LVVCt+_1i%y@5(|V+<-#>-ifsS-2~$Gg3GU$s6SC4V1rsslQiSvKD0G_ zhvzRAg+hp-901mF)Vx{?kuKwCs86Rpk+cBe11<%CDXL+qAZe&Q+KD>`7JK6p3>e-+ zcM0tB0Nxh$*Futkp#oV@&R2@Lm9O+y5+#9ZjY^DoLI9<@gwTX9pP~G#rGqeCy?`fb zcL>j<(My*ZrE5q0rDWJ3g^zEjw7VSmZ4qR9f}kGW$|6F_QaEXTs?-Cxrc$kK9Tkx_ z&C}Y%t=>~B>Jqd=wX7gt$`abhm=mQKjb0*#4RrsLN)SLlZSx){M*Oct0pi-$PYmrstW)HDOclsGnfXApYbdr3DZ?HhF{?Q*gVS$( z74Du#Nv1VAyJl+u%JQ#TTzXVWN0dbyd9s?JL4XC)=M@>;0_$iQzO@$;C+ILJ?egUa zU|3Sj(T`DV+sZL72n9W<&S>xXbfSVQ5CIf>kRHs~VGrmo-)Bcz(b;Gc zP}}VWmGnLA#z9#$W?m-_=q=w*tZE05pEWuM5=ZgGeToe-Q2xScycw7}-Yb5pqz>OE*;8a3! zPJU;yPCs=3Ehb&Fxd0y z!2TH{OOW`P66&W4BW^L3DH$}Dr3?m%k3AY6Je7OomidY2;hehNay>`FDAC0AYHYD03u$s<{RTmQf7<&qU<%2j%ox32?=sqEa&2G$y%b9o6HQMue8T^ z@H5cAtWz@?(2<7SG({OVu~yjIm5_%#A$`g%=GRfrfys6I`iNMw^ot0%AU`9V8qFx; z!FVR~CHbI>oMUJKT~1neox)0njR@Y2N2KNS7nhCi;Tn0|3&glPg_c}b2`>gWCym5o z7%o$Zzu-{~;?BBo5JWi)&**avhZkRq% z7t7h*3nOl5k2USiuMbNu8+K5f=&gm{OR(>C*r`|kbENE(2DE-52V3uBg>WgHsU~$N zh>56O74+$|5oA&e%oKh`iJ1vC-7~T#xRpm5O7P|*N?524KGEg-osq=E{YDlbW2sbp zyefZ?;KWMeI`m zcydE^Im2mz&Jc87r*?IND`1usiwHCOPRP4N$Lj{oBR3wc_~X@FiRZeLD{!SK`IcT+ z@ADo7NL*ily68r{XHWXb=;<;Ek47iR@vDaL3ha2W@GPV!% zTUtimhkV3l_O{>vc?`vr{0~cstmkNb*->T}$m~^zz7CJLn~6y2j~2Ail5jq=)c;1f z`UM*VrjMAr%5nC7W!|u^LWhh1_qXS^uiG?0J5a_hxgNe(5dFNMj0)WcW2{x1o+qGR z78TM!;gAY-5MO)*-w=Nnup43fy4AcbUDm8Yryz87n#SBaOHTV)bX0Aui(3uU+QPSi6Gf67ZV9oH&6l<_#n5R)?(|xppg>vJwE?zI% z9;{`Ax5uT`gqwjQFtGEVGNHx=k;Us9)Cgg{5>67)Q*#ecxc$fkJd)2$@X>ID+F8v| za?8u>0{X}h*i&cyzE`;N%fv zOP;nNJv=mV(K=tlCR#p|S7$68AsnV{OB1b6WL%2C$2R{oJz;T`74s%bO5@_~|gI~9Y^N^@;>;D2-n(VIjM*2AEyiH#klD}0_E_J=La)v2Z?nw*JtvyD6?Vj}))a4lFU>2RF4qFbM z1qTE0$O90E{k;nox0&@>bwN(B-dOC^ zf>9^Bz}rqGNFB?Jxtns#>TMLqVyCO0wDLtwg8Yb!tDpN-xpuXl>n28AkHKNLw!0=6 zAS7lNDdAfA;MRM0s%ryIM2I4`>6xL#i@bSocYcr#UVgYyMax4MmnOK=tGE*4%g|Nj zPRx6R)2c(yDy}W|yPYwp%CQEfa{{s zf^AEu;75=Vfr+m6nfsoyN{nI&on9SuUYl4LnCJ_6!G z09$hLO+y;juldt}3@OPN$1?}t^%6%x(5Bizjdq=TU-GD~K+0!P;&LHZW~@=#&IRTN z?Np>nBO}lez%eK9S0!>ZQ`%FS2 
zC)V@HGZAnxfz?_z3vdo*ar(@sIr(ZbBWsZiJCvzXF`%}FXfyGjiY?fl0*r9=^C&hY)4*u+BVP0VS%FS^cYGKYXw za{V2<3*oWY+UAGhLgrHK%NN!hdmw$|sQuIt$pR=(V~D?>27wyU04DZd+juk}SafHjx}){jc;O(Ru^idj|z zV@v$^G4aKEZRJJMuVYGF(I#kZQGoBUR%u^?mF32V>heCl7CD^EcR5}ILKH1P!0*gY z74RN`WkM*awq`ifMT}5e}H5DaYOT^E%L{^{kb$-M&^9 z!{B7OJ&_TY=!dWYbm=yJZnq_H=a;NZF9Rjb+6IvJUx^zRcx23>X=w$+=UtRQ$(JF$ zxnbkUdau_;H*sti)?O-b8m}1HP^P_1T>4bE+(sZvZJZAYysEb}lVlhY=F4Nz^>n(` z`MTja403U~e%*20sXg1WS05(Pq?=GTDJKnGO^g|bhoVEcS<_4v_dIgoGz6`X(3c$n zOS+Nh1c)M~V5PG;i=;#dsMQ=K{ib|2k!P5{!&f>!jw?aPX;m0x`Sn)zOP?#XL^m#@71A>q z*jJk^Kh($Q(-&0_+Pz62!wugaf+pA(5A5k=o}h@4g!5cr9aGbDZP9dh+sB*y_bcW! zH0$N|ueZ~9yqUj;jmzb?z0sN@9jT(OuD-Db0~55obJE+b3Ns6bp=ufW@m=5Fo=W0l zBIu;`UAc6Yk~gp5+l=PN``{EststnX5W43jRd(EIjquqIu|$fa-8)))kJ~HUumX7m{lT8?S3-gh(kQ4!le@Cql3f_a~J$l~D}o9iNR6OFIMC2bC(`9{}%vhKbcjr+FcKo*{Gt%uCE8G(eWUD67E0Q;=5K z81K)Xc1AIFx*@iE1VYvKG2%5~pP51SC1+J&VSz97eLv-H#tp_P5?j9NR1UgVtuUqW zfzp{s;cHuZ0^sGYSbrHixMSyJ5ZOqMb2*90D?=@1mqP-<;;qNC8eyFn#$D!zGt2r9z_! z&a~CvBeuQh9P%!&eEU<~Lzk3b$?S-VxnVdNKB9|x1L2Jpp43B*$60<3Wjvwq2|>W+ zD4F*w`z`Sx`l66d5%M08W8RFePqXiYcS&AS~V9mwm03@<&y zeH|`aswQ1l^_dg~i4$=!B+m#_A&1ZslWQgsrK*F!ms zR!kx$dTLzLH&1U^6MUNRm}+G$t`EfKtjE9W&!M`VK151Ta^*sX3M8(?rOe-DcE&5N zuF*qtvCO+g{}lbhYY|2p9yil4PJziu%6o6!iCkigwLA63t@CN1x5O9aVDBA7@*U>b zlu1=WN;OH?Y3?SsRidcC&jb~UfF0eq!yIOJ`pr8~MgjU8n|tGlpvlsmSph?@mGHjA za;^Z<=*Ij?S!uNL$E(wF+HBnSR$@fU;>^qejGYwXk_}1C`?MJ@1JlO zlbMg(6f#<_4pt%^Z2Z#TVEJA!!$;><2-|cni8KtVPmQysGMIc?eS){d)%pUwK<=)EDw=$ChcvUZ~J z%ms@1oz9TnP(tJO$&r;1*_hp~U^eD_u$(R@c%?>`b;95cPD|hkX`8EQ5~28rjR_A! 
zk-e|)yk5>_`f$Jm%STbpPzU($E*NgzRTkfM)lz+-J6BZqsktSb=#_!q1MNrPk+qr5 z(jMW3*OHgy{RyF}vl@D=J$#c)yEs(*XYs_oX5G8vS(o9N1KY_jas$%WLX7rd!%96Z z`7_dV^3T;6$mvSrYXoMZ+~6BiIck~LYM6|P@iwiO@jxwOt=9mVle?QsmIir#v7jTl zHBwRJ9c;Wjl{q$CP_XmDxlGTYaTEKAz3`0Qz3cp63KcT8o_0=L(d&{A6Lp=7H|$hM=&r+&To>^H!H%_R$s*R#q9HtuK9wJ z$aM5;{O<}BDRc}x(6B0w^|@ttLUPm91eNtFgy=HMYI0d>nDO>9=cD`3Sxp(|@WDZn z=r`uxdFWRk0@yK9ZdrGGrm%XlIZ;pkGoSzV`?$*`A9B-#u}%}5+SS|S?8lUbs%H_Z z%A#?unmO@|EC`$u>~ZdAIn8@ND26%FhVYcSB&BX}z_>Z4()sC7K3@G#x;_zRvZuHv zz`ZDrl?qSlHOjVz3z|*ryK>=71)02P^aSz5;cnjU_qJi9aXg#%bnR|%#z!V1LEOuqIY$1Py32HZ zT#F0dBrUiY^PkV4kbf!6^eVFFb3O58*ebjtb*zD5cm|ywwR(`}%BMNi#1{$noL@Oi z zRi{=G!DW#;1Mt=>18)T9T87`ER48uiy=ZyY(!iyfxlGS#Oz|_BRm;7AlFE6C5UrFy z(pPJz63Rxqofi#$9K{EhqvEvmAHxX$qUZ8`?BACCRtmd-5GZaDdS3xLGtiYT(ziB{hy+g@m7 z90|GKy$Whk@3{7I6TkmS)YOsh|~aB;CGdT#rb72GW%?2)nRjH?(ap z1aiWh5Uz`cfhtC?fWB}zMs{xoUP(*R{Rr0|l$Pd~uH48A@{CV+pGHY6AJ-K(yCq9o zJ3Y#>wNqe$m6Lx8KYL^k9nBFdKo(SgMna?&mSy8SlD7TeQ)N3F&6U~)aI#nU30PT& zZzT}3_#98X_MagwAeW=#MAheN)+i{E-T5O`?n&7|$lra45+p7U8loOkLGBFyNqe-NT~ z3kJTk60KPW_%2Q@1hVnG(ulk62*xDAj}=u_VB%N||G{!)z`=r_GX`~2>s`-kd1q^w zq~Z4P`Dv2^bR%QsYDe-wF3H?m5@b zHpe&B&5oKeK#{bTn@Sqs0?3!E-}!)-w7LOZh9Xdx$;4S=T=}KTTf)r>ihP|@3D*=_ zN;L6E0aXS$lb)1+dW<%z1aZM;)RxQJ-QH{uM8}q2d*TgvDy7SfISH=|uf*7P++j$* zs8x{~C^zP_%x*dN%x)l0HJMLAZI#&eVTOD2dapQLeGF8=T`fk@qE1&cFZPcAb*3}l zEKfao6c*ZV0b)8;(!N6a@X0k^G!NHl9v`1&@pls1i#HFzJ2#?Lj^}b zu?S0e7Nb!9{o;wMFLqcja*h~zoD%F(lX1Qz_Z7%)bL+KVzC|nmH=bimhHnE_8*}PH z^OZ`)=Ff6PHTU#|xd7^csTtiL7sTA%#d*4{K2Ao=JkMw7ayd*wpttz>;uj!!Wvv_U zIU-dSa!CjB+Mg+<9*ki_wNA|21tRG-W*RNJGU>D7 z@--NnF)(EUwpgZn2dcK0ckgy{<*+Kgv5e$` zj`XCQq%;qW2qo9rsH7bp6}g$ju$_@vyBsbE5xNb7&8Oh(Z|Lw(#j@RbFZqq1UJjG) zJj1DKls?Y1v5#?Au5%*#Ki^iMn(k(#<(q!+g)zWVjX*;By)Ffl@fHkzhg!?fvfX$> zJNCV@B-^Ya|3?9_Q%fjDooqxY4>4?@z0_0N3PyF%Dw=3s!BDS8?tw zWAMZG{Bh|%;5_+@4iC@w%mZeJMyu=QD^td}wH&4qAR8;B7;vGN3DzEVr+<*6s@*lrz#H4 zX>V(p>9Ux;(e5mP$fgFReTJ&)G2Tt(%YF3T*`=1ty}rg7b}}s>DypcgN)Vn_27-1N zquRt|kI?!3S0PmPVZCCki86V-{H*Gb?deEP$B&O5n#^{k6y2P3=@nglZ5o}O 
zD+7M3F$J80-gP4J79uTyvkE2F+?1+uFwdxkMZs zV+-@%NMMS^p%bHane9S6Cx~Vy=qccLomUb0X2k$a50j5$QhH3@m%tFqNQkMJ#*Y(w zv#ayl#!}?|-ewbjaczXs=sgBx@q&KB#x{y4$@p6>M1z8Y z>V#A+vUK4FuCmdCFf;Mp&qSI!<|I^5D9o3h{=PbLp&b=FX=scTZIlb1&DchSV%F5rKgDQttFUm0fJSB2)eC?`(szHKi+*wno-Q9@(Q2mT?fx6A(fKK)Q< zRc8+%q%R}Q8wHOCLd#V@h`|Pftqmx zn4}DuvF?|pE2IqF+ub&O7Kq+~CBnF=&K|Q^HD>=DfGa&$5o8?pzUlhzw*-+-*REv2 zHE|x>#{Gq?!?W%Y=B2`OE0yQ)^y)pG24}AWxkjO8{=}C$d_bzKxE{)W-D?C$P{#CJ ztoo2kNKyQbT)OObIVm;fUvTV>#23sffP0o5nKTRpes<*moV5cmmb`EGz9S}?*E+Va zD+rSaKe~OrX{mAh)9oEIXH}TsR+(QQ3K(F@m@7+(G%3U zbQk5;ehoE9+C>izKs=^}We83eH*Dw@l)YdfxhwU6@;+RXIo3AxFG1HpxKusW zrm1IV{%&p3XW6(f>(bVg*xpxDTL8JMpqve#2~XqE>N{m75F^|ty)5fjnz3(l8+d zZ%rXlNjj1=8<4qaYF5VBhSfHz*Y0C>H1DXyuZ!5%ueL=**HPlALGn*)RLTAM@$;c3 z^r>Q2Jw3gIQmtCm2(tvH7aJ#3E*#zxhTd=|LL_n;-Zp6P9?{-;N4Nj< z*%`X@efAi*QDNm!^K@(Edea23AHi?)_wt)d2*D{slvED&++zDXE@R}&4VwnQ!g5*O z70KKqMm)ZhU>AF0Eeufg<9Y>ki78g}%Ygc8z`WQ|lXB>XxeZ?x*iU=^IdBypkUTpG~w>I_>l3#u?3)NZWb3+e1D%8|J*uS&u#WA(EJoox1-XjaZoqXv z(l;eOiD!yT*~zhsZ>p4)SjS~w#*f<9*3>H)aKap~zwE%KXScZnvKa^^C)>|cI+aOE zfi^LE&XEdn=k$s-V%tW3;&k3%{@k{Z^~RXWohqwK7%k07C|{*(t)hhS&hs_hE#^0O z^&j9c92@GwVta}xCquU$)wCGBVRLD?C~3d2s@*r;ale|e`jsk+>!zQ_0!qG#PIT0I zXUti@aI2mC2)5q%j!!Cx$&@O|sqE~n)`*HunHT5!1f6ZN+;&~fN#so?IrpO?Ih7NK z$wBRtA4;kFx5Y2yH@r^%LZDk^p|N3n2c`8RzYM%Mm4juu#A;`Wez4XqrLE((%D4K@ zJC)bTaoD3K64B59QnSGtBT)yW<8P}p6VBSMym{scM2)uUP16yqj+0&~`-ZBv+e_8$ z02rz9i6r17EP6AeYHMpBM>$2ZS3Ly|NYpDgTyQ+(V{E5e@K5*Hu@fnyK7FO~4fk_i z!}*oBu59C|MnMmHPs84oLffBSqe+l3HRHUnyS%f)Ymvop-E-u525$kG_aFy*`JN=G z808uVe#91SW?M5I>g$3&Eh`_GWrL>0)I`3n>3jMMqUSGuKucsCYO{7}?3D@&raR3w z5U|Go@ts~08?3F_10={Ym8ce#=RiAUeZynZZ z+wg~q?fa}}>$!Kp#3An=#$wprQnwf(T13_20XHeVv(ExqGJ<&r8 zn{xW?ODdLf88#rrS&+qQH*j{%&oW^|BPoxGjx6X31F`gl8osvf>-h>@QE@+)Ki*J= zA>-0jpunXxJ;Q%*>&YT?%c#i+$b@skp1erGut+kz?ltJBDG5mv@q#ibCaY9(t3g-; ztL(rjm2c51#7O!Rlb0`fu?A_pYY-f;sOo4>iHBTN0Kp~m3 zVOGJJOvc-OAc<(ECMdj4JI|^Z22&=!#qdN9*U!o>FQ&tm;#LM7vW|p%EXy% zFI!a7`7Ki{i-{2|H4PW-^=Pypou4$^m_Md*x}v`xY>CkJ4DK-9su^N4q&hG9RjF0r 
za=|A;iJ4?3m2zKES#bj}#T|(pw{z)hPXnL6%Yl=U56ZKXyY8z(^d5B%6lJJG*4va~ z3ZCY2uc5_rw4`VWip;o)JV)vM%yg4B)T*<>ed5*Pt)Io@jMkIq2R*ZGf z7M}P{fQBYQ@J!gTxv=Y81x}xGF5eZ?(94;->rY#^SQc5h+vvAFy}gXxU%4Yp2bdE| zs{ABLBOp2tyj4Cr6x_^y9W6?Lx}Nx28N>YP5!~_aKwVKxKBEgcO3qx%ynur|89{Sz z7nu9AsS1N7n?~VG%{bXpU64EI4#Lk=S8X|Rl^uO}Y}_rv?7?!j^>cwsHr?%85m>|b z#IC2_V{nL3j7Xdetvlra7#oKM866+%XHE&&Z=Dy97 zvx==hmJBt~%j=unvZ>vmxOJCV=$_Tpa0#CyVN=Cb;2d; z5LUnQZOyTvi&?&{wBdV#PCZG=RkUwOUC=9OZ-kIyB0f3*b=?$hL4-gR1mgLbtlYM> zL8W<7-Y)NQu)5Kc)(E2l?Q_J4dR6}5O&LNP3#@s_EN9BiNV+8Il8Ih<0)(OiwD6OJ z)O!B}9;Kz?&la3D6e#XYc;4{*%cva;%DN@@TvN7}mc7bRFl7PrbAb-o`SE?bP`wMd zd43qK5FnM}yPn5aTCagSzGPtjW%bUvXj7e$cZ@7Wr6wVx6Rr)-*X7@9|kzks5IR+Gx3zvV=z6S&jUt5~lK{CWD!Bw)ZLpBox zG;_Z!a!CoOllCE6&f0ZTh*2Dp6zR+jc_?XGU}sHSB;mJHk6Wuxtyzib${V9NJ4Re{ zTKBGXn)iiLg0}9h@eL|qeeR(jq!X{)R>i6&i!TS-mh&h;R|O#S1=CAg?XN-s6+-Sdl!JAzz4$IIVU%hVRanfjRMYALTJ zaX_6D82eHeqvI?)m(gvgYxpD)cW3;q3jHPp#!b{csjSk`j1)jOK@A*Pji*-7v&2yb z-p!XUx>IlW;Ykm3X;z=fszM~m4fQB)iaMEKP}RFaVzX~%f7S#6bv@i!Q<7?ik*no~ z)s+`Tq4=17A{D|I@(vhTrzv|S<;1F5)bsq`G~L*I*}eU_t!ywE`oG^_@Ro<3CZBn>z4f6lzgc1e~PLAii?Lv8c* z?8xJ)7DV?(y~RwJSpsmb^h_zLAoRW%3FSqJN9>>m?to>KT^4+AioSj6!U$8_b(8i{ zzTnVHow^UZ;otK7@$we^bk@0YHvREEm%0N9#rTNJd><80N>^-=WiAOxLor8Kw%zUY zGR&MWw)EO}o1(&-PQ^jWRDLhh)g~|9?|koMrhi8cUuyU*<$i@o(tDvKP{sRvUk1P8;*mLy%Stt%kgwM^4H&N zkH_IhiC()cLwJ!P2q&wQSe{){ttot{k2f8EAVd3D8(IUG7);7SL74w9C+Ka=|z3e&eZkdQeQ0i*<0){ON7Y$qa@D z?~sg_$(y@3s~jWCI0qvnW2ZHDp>5T^F>fhjn&is-SrJoA*Tk~+;|w}SNt}l9pKR=E zo{7SNia&Y~2ITzq#2?Z;A=UNCI5RH_fhWM8ynqJGn_rZ@}j%bDafl{Qm9PGb4>` zQa=L|L`4j!O`a)VP~!utgPB-Ih%?cm)R!V3_W*(*Kju(c@}7!?y2whD7Kgo!;d+(K zY~R*tQ1DIl<*Vn;JgLqxyt|qi7+t=M!MhiFCsxX(di>hR5Sle>63i*8Jke=#lL6Qaj)~T$SfXVjHIP*2{J#IAL{KjYAVDMosV(Syp zTgm*vtj`UN%XEdfGE(b)jn5qlYy#AVw?)bgE?rO!9U9%ven}T(#zrvu1j5}zt;pu< z{qbF*jh(pcyR}WP52X1%Br0P%EIl&329`E?smprnBR5koRpC*jHd{))d%WXW(TFc9 zDZr3D<99KyBPdBSzu(Gg@1Z5dItAn`)k26A^T_$&IR-Ae{;v|U7o*+eN&%k;vyc_jKoHT1WnlaH2Fxkeu#bEeNVynwhTF|U#Qt`Mvx4?z6VWCHcj8t>mg3-Bw*h0aY}Ubt#qKC 
zPvskjm4zg(_mekLVTF!Fu%-uo`x-U5sWOqlj8}M$ZP+xdZAnt2Qo2k#!_(qLBjK}) zdF&`96gjKKwj_a{>L@~ojQZ6Ge5A=Su3=udCF7^>yzyi#zJHfMw}0_mVBNr=7I@Sd zC~q(is{kC7TXSU?+pTCQ3yzDuV1&;cBj9Z>NW8snwMa{BJGvmCQ?8&x>tmmBd>=L1a0g4!}bNiZLkxNB;D!Z19%jJTj(3uVQn5&beX4DAZZ7&-t~oP28Kycv3c{= zOOn}$;mL)IRq<*`IgKM9Fy9g*#@+8xzbVvg<|pZ8_IP{Y(S{>V@sPF$Tl|Y4?y|HB z#_R=1P*MZD)}-L(N(>5p4H zdLv*RHec(Zi=9vhvditC#c=&i){8I>!Xpi;lE+J`0uBNljY)ke;Ky(bo>lzv(DU)n zblNHMeMnV=F5^`1GW#eE>|Ox;Qx@YUvIu3GDu&p~6i0Uoy$GxN%l%dLurex{;wV~7 z1dipzIwr(MVC8IkZJ8*d<>uLS##>1)>;cu;gGnhXp+kL!Uc$XuVvTi6S;8HS3d_B< z{wNt4@#*kOYD!>w6f>zq`QOvfL%0)3b-o z5=+u+bAB^o%;lA^o=1$W_kB)6M6XDu%i67ND7$^_m}EiT1SOFLR{mPywO*2U4GY4O zyH|yMi-xz^?w>Z&U*D(Jpn5(>N-z%XrD%#~6>fsh%2d>E+9ALx_ZA)mT*p&k01PQGC^J~DSp%EzStrFm5;#CuaI^hWNl)frc!04Moz_L4r1Iz(ZPB3$YjWnzaH zIYmnx8&Q>Ygx{HY8!iWgm$K7^de+!)cdKqHA-wwWr}|58)=QT$_I(u0Ls@m5Z&0MY z*EH5FIZCC6m3R8H&bMe+T3*a3jeEDXAo<(5oNoUzoJUXK8~FIbRQ)L&R=?Db5=$HKWfwMtvf})deoP% zxRmW#mA$4Po}U*ovR9JH8l2L{YktfSYt47o=8N|L_awRkvG?ExkP6_?wFxnEsaUk* zd??mlzbvr6BKVmM@wi%c$#}0tA!{yHotpN#Qp~xmFWv5!B*br7MZV7{*JER25XKIT zYLz^bxc1r(>+V?`8>?1X{HVHcfnd!B7?e3?wJ3hqlyCWt4^YbznSN@X_13CbAZ#8@ zZ~n>aylxas^$0`C&AuC9RU+~G>|yXkOh5jyc^8J*Yl;SUk{oZmlCcdRO2yX4Et+Ux zNk>g^T}AbV$h=-7f6(wGk!G<>S)sq&!Y|#i$#-03at|9E%NM*E()RAQ7{qoyqGdvG z<_0?cD)e(}5qxN8LMqFZ3}Ir7Ic2rUrs+w^Dk)3qoF+H69!if{vw3#;S%9&7&< zPtoh=)MRb+%Vkr>BN2}T?Rv^w;cwdr#^{N!xnl&{XJ%L%tNOq5oMly%mAo3f+P$1x zcMU62`a|Pb z%|Y?Tvsh3Xx?u0e$k-!}&Mw6LWvAXU`y8ivm0wgte;;^qj)I6`Z^66{4Dm0fpaXkd zS0VO7m0wPAkb3Jl&~8aw7yNSA#2A6inxo$q7;^TDT{8Nw_Ng9t8|d969C&wdHYTaS za9KBPg$NGeX_VJ|Li@#Oxg;zqh9iX%;=2o)vc4M!uLeip#-t!B3ZGiPg z!il8F1+_%<6ni5c%*VWb?e3VKKmRpVw;|H1oEn#ujg+)h4)!q6T=GJXQ$qqYyVcCT z?M8u<<1BmZ)ljRQv%^*0<1}h{8g}7vX;X-#-p`!2Y=*82!+rPM@>p-)qF+gWUYAP= z*4gT%MNBw{og2Se)F5FZO|oId~#v#w5??;j_^Ekc6h z#|3fNr_tZowZfo}5>_P$Iprg*>yqOQA1097s|L*)R8DJ{uIj#wHlDn*OVdk3;lLD{ zC~}$2!15aNmDA0rsIf9n<@m^3mmn&`g>XAN~6KST6k^ z<=6~Y7b+a5NA17DFLZ>WY1ZNNKjC?A(6>tlgSdRc+j?qj$jjR_Pi|phKxpt1zvF5% 
zyD~*3vt%LN$f(N4;yeej{0^5b9nctBqv1`F;w}%k+mxR|mTqX)-i=##%qt(oe7Q5Q zJl1kHij=qId6jc%ZFoR!$Mw38M1UB8b6)9-de^~}%f|J`HX0K&Uwnb6XHTe#FNo8Q z%RYeJOWCmbb~&tSr@t{Xgv{+zPEeYC*yT;nw%5UIt0INgG=XunCH0+bd_4a7U6tY& zMa6Wj*|Z?}b(AVYKEt01N#7fo{aT}PBNmVwR!ps7cEuPrzbEHRx(w~|shfMbI1B`R z&q88@4TB^(uB7u;ljN*cM}DGRrypXkznuop<;KEU)%g@E{uZGkIO48ECIqs*b?e5< z*f(O9?$N=wLQ|<_$j}~|oGGo*%smJqt{YQ#jID!efVU`CK6fhV<1Ia~s}^y_+RhT= z0IRj7f}28z{HwV0tS1K6L5f(yLLeGBhi)#bksKK{THTh)rW1Fw2bS&Z*c1wGyqfKpT;msbW;!`ddwZK@es7%!kxnNC z(xnRxNz@ZPBjoiJG%y5*E9P(c2%e(LHe9|tCbq(L(dkRh8oukzNxebOBC||u<;zeW zSgw%FEA>~O`)&toU1B1BsG;9*3a49kdy%T9X~~$iF6g`t`eb!VB};o;*N4ha+Px*d zVJ<3fEt57Q)6S=O)(i_w&D7=|Wigsa-_SbF;+&Esk8pCTv-GS1O6=cTUGPgY*QaBk zOmZyc{W-Zoe}e^h14@&zB%@F#!VFA3K!f|>)^p1D?D8|lJl`zYn7h@44!Xu!vOYnp)yKbS17dra<6=$z;x%2pOpO}xHhflkg1Kj;Rm+kCQKwGFNd2~sCRg?}I9o}zZ|jm&-|pOb zz|SR&SQqSkLTWPk=8LE}Y6r=!U@8=C7xq!WD;{>t_iIGK{l?>?v9^_A^vH)ZXI08~ zghVgj-D~r$AMBktr*KbOEl<>)`nzDtDTv|2{?;@uopOq{DNe7|`tGH35gZJ$*-Oh~ zU70LbA=r%k%}<1;AL%ZREbMGQ-^pucj;#r`?4HuQ9gW^7#;y>9U(y6@wS2MB`mY1Xx&qJ?TnD1~r$Fufi*Y%3s7(QIBFTWdWky^(Nqr zk-$%n^zgJv#5cxi<*$Cictiq&tP|OrxIF}YJl*NIi1^s;#2;LJBZNAWjEyqM64sU; z`m~pxa*DmrHLcmLFHG;nXwxm=v~Ij!xT^=S$39m&67COZ4+wNVEB5qf<_%tAV67ZQ z@KvOY*qct!WnE!#NTst8$4GH>bM~~Vv;+xC1wUvnx|UtxX^db<@9HJwfKIijRWO|q zmbCj8{(so}>bEG{t!*hmkWxa2?ha{$A*Cfm=@RMg8oHHkB&0#Q8wM1V5O9d0yVD_u zk^XL78dxSpcnDcBeWyZbq5+NjE>I z9@{eTBkltb8crDQHdu!iV)Aup&}sSmURTq$F`Qv^^c}g@;(a2#8Vo6MUA!r)S=$Su z*sc7|sgC@7IRsZ=wA>Qf8QDdMV_RSdLNYM%z=uI{9Sf+ljH?|P(y|ljnCRZRS&|R( z(Y<48%$qzovon+I&QZFVvp)#|B8)ZeF@h%K$MTCTlyCz6~p%T_rLp@I=h#JG%3U2PaY zRpQ;(!$N|6+N_@B81ZT~r`8iWK7LH$w>JjEff6u zD>#PD6@gO>5_s*$bzd(!Gn zQM%*4BjMseGa4stXZOdJi!z=No&cGYjXPh4sUG+y_5on&0=<8f|2 ztYhIrA9bBE;jKa(Iq2Y)HcGW77gzckf02rP8qR)U$hN&s^4Y5KlY!Z9M^;L0@4kvM zYwK!40j&wywkrAzx5E^kC=qcE5B>r!28Yw~zAK?=>0O{H$)Vh-aoFmOr9^1s5)D3A zymz$?CnpX6z_QuX3AAuuHoJ&;EpHI=7;NvK%8{f>(e*&z?tnTr)%y@@Zh`Z~{|~n< zCIcl7q74zJ2b4H8_R9oUrj@67+nNOXf5E!R2dlT}j;+950(lbrV`dncYU0D%ND%>! 
z%P8{*TC8lKkjgY0@O5=uqTPAMd-!2ZybpYD zwD{x#%eE1${VG+I%Hr46(3Y_?sgyclM*ZaI%!cT@29|q6po~}4|gRjuC#CeeZLrqQh*K1 zD{A)^jWXZGc1gTK4sh`AORF>d*)L-D)~+lyhY;Vuz)HbdzU^!%rpnf1|atJ<(=Zc&DMx$AQ`6 zC!((~I;x3d@pL1vR^L~^|Akg&yvfC-xM7!a;BmG zB=Y8A&0oO)8NtBNr#s)Y{pn=C|JXDvI*-Wxj`Wn#?eyWZ@4w!eHTiBop?;Of%nJVL zR32Q@!c=L{l*6&*JAJswl8huB96rkB7A@|~``&XBAjuloHbSqrsKLr)JcgyxSbcaY zerBLJF{HXbG&btmRizK?OdEkRl5wIH6aV-_z{HO(yn~!|-X6C5KB&#-uBXh7s@0n~ zCI^DGeFFf*-rQr6-mUio!1|Wf9~ewOZ;Z7g7?)bnegAJ5)0cRl3y`GD;s&IJd~ku_ z*+@-_1<&sf=1=3%E@T!}X_I9MABg@IN`Vk6b;f8T(bY1L8T|c9hg~9@)V_+1nuNF< z+)0zO-D?qlQL|Dl@A)J8yA>Zax-B1;9B@OX>{NA&c(biJjHIY^N zh<-Mf;WD4uoRAo#DFz@s6+g; zVN}4O#5|}S%wiz?U5Rj@U+Q*>vQ)r}Y0H8ySfb0ACBrYlqZ1@+65S*J3C^O)qJuJZsl*uFPRdQ%|ZRWrk;Xa%IB% z-Yrxrh!Yl!gjU7vC~Xa_uAPLpPsgsE1=aidy;&c1{PEs_7#C_up_yw@u{skOv!KWw zDg+$`HjY^~He7B%k4w@vmST*EXCvxVjJ0x$n(I6-#ZVW>x+s+8Xxq$#Zn?ix5XH_P z=Ke`*2yKL6eMA8fF30gvLa)me)AV=-$pw3+9C2h5Ne*c|!ZuM)3S4}>`}ziqH7=LK z`OvE63lLWfnArsq%aM^H~;*Kg^o0@dns?M1Ov2{mfk#3WaE!GjOu$uRM>4A||u zPbU4U-zYFXc0mU9G@jwVZ_eHm9yvpPTn(SnPP7MfN;pbg^3gUP4?&rDKF>lQ4y(p^ z*nI{a&M*81JP{*DTA<4yVOMm?AJ-J#k9YC=ddc6CrLGo;{%lK&0P|B2tDk8vBj9+` z5_S~u2OHrjbQalLpCa^H_)JC7BszN8_3IrQtVxPj??3n@klZ{)$!1EmkPSM5nz^;% zKTI8}EW|p4t^Ex8&yP+Sqr;hEGQ#+WX8qut%VaEIYZQI0kT@3~PL=+n0rE<-D06aI zX_+01mxWmP$If{MtlGj<8Z2Z#_9B53!Q*y$Ql#;33|ZN`BP9HYBPLpMUu;YFcTGaJ zU3lA1DcNi1Wr#7Bsg9RlJr*DX`Mf zdE#wtEt0z5T|}EZq0Q8fX8Z+Lo={o+QA2U9-T++<%*AAO$?bm)8!Ytl_vbd{9d&GL zRCqa5I2lXSpy}+l(cfqR_HtN0!8P%KdFuVl-D{qXbT0$B`e85|jni3kO#g1iMvIa-^_dn|cIW=QZ4!3OD@6`C-J}5Zu3V0owMef1ZM%4!kfF5BAc4FLg3XUwejy@)G z$T$#7JIjgjyT60ag*u&mFJxj?VNk8Ob=FBp&Hj9Jy^1S`^G;urW0i4_R&@tU#xeJ z=#)+6f4cZCd1lnL`i&1gD(rbZ#LnYkVkMiOkU(|)p%zpUq)+|8Ph)2os*?e>4M<(C z@#6XYz@o0!&xglkV|$oQ4V#or**`67UF%8QWvf<*eO`ItHhHnvsB4I5y#$jhWWG5 zpg&}X6F#vVA=Vrt!ljCwL<5JsGbR2i_mJn~m0Js6j(Ke2)x%frbbIRA2C|U7{zA@Jky`!rg=XS0>*9$8g3C`>J&0W=Hi4H9j3_ zC8rPoGsZzS>uzvLnK =f_T%CeOdCJCeLdX>Zpvk_FwF>3LK?cG(i+U$e2Nz3-o; 
z$I;QxXnUUya)*6k+5zni*+7q;RK7hU0%aw_JIQo-Pc;FZU+Flq)G0&XuhS!bE{D85ETQmOKU(wpz!b6L7?(i7pQ`dZ zclM^fE1o3l^e}jtBaFgGmXrP;aezi>FMYf;Moxc}HkQwJMmf8Z(d(6X) z+e>u>wCill*aump0aL&+Tx_p{nM)S1S)OiTLb5r=WPC{4$UY}3ES_k#RnH8 zBAmr^TL@PA26!|IEwZ)UaA`_**nD^jkE@~(o4bj@iq^g&rd9D$veAop&vup-Cn43X zP%jSpq}T-zJDaFXmF1mxZ?-RF`T9&SZ>rYFwt__u5*J6Ac~XJnDN7WqqWfiS>WKt7 zZs0Rq(Qq#|jVhFbWtB4dpa8eDtY5)4CO-{77oS9<56;-{;O6H6FxrODz9ZeiY@0j1 z!w=R&ujNepdmcS%UDDjLJKKlm^x`!DJ5QP3r(*s7aHj?9KV0a}^8?tvA6C=Fww@G; zFA3VZjO&PP^$CL`!rJR0!WL6+Dn5;lUKi}6Uqze8W6X$|=v1(bjdrHn83@N1Q-B|> zp_!8p6gdML7jr{3o>a^{dAd3Ad5^f)^xZEMqrKmedLk4A9P#;ept2n$s|Hfjoo19F z(1@HS-_@|IZjTrr<;9;kY=dYP%j6tD3$27i=%lvP$cItqp>#L2f@8_~tyM3LCwMwm zfVs&3=DVTfbyk0o_3d%L%~P}&FSbKZ$Wz-%kR#-v(>mXi($N|K1YQ-A45IScM)w{| z*;ir?jSXjl=Y%bs@@)q<+j*IaA$K&gJ6Ta9pWaV^I4SD7NQ& z@097FM38*>Q5YQYWcs??EPI#t3pfY;jI2)(=-c^(`a$@W!=}JbS=npc=@I57E5RY^ z+D(;5E-rfS1gD#DTpP6Z+mhP6*bkMMe+kJUTn;d54BtLA)e2iGq!aih(=RV~dx2Bjx161F9kZRwmB{QR7$4bHzh>J>iDHS1AZV@o6#6NXzC~)^>~u3^>hyo5 zT6Xy8fg>s?>H1y7E>YgJhaZ|F+3P(&xUdPLjmuB2ejqhSh(P+xDCoK1onWhXZPloP zjP$qICZj9Y(;hZtUhN;qr1C=0k^xk6nr-k$Dn@eRtN7L0Fozf`OWg0Ct=)#kpvb`M zZTiNw!zaQD1i@%*(}kWi*8UCAL~W!6E*)Mk1POnE7*sVN+0PEu^R-BzGO&{G^^I@a z-U;L_fIk2vAlh2u$}c$#Xo*XnBln^V;6aHDMfuCprj^RR?#kv~VT~79w+d3$MGP!) 
z@6!h}?eg9($Cuor=lAq3UGcZ}XSRi>>n>e+WV^9;8 zHSdvopRFR0V^0Ezab*uY&df&l?(&e)rApON<|Dynwf<`#R&T{!PIK%luNq{7(3cc9 zL1N@pRd-*1&n~}%0Cc47(}s-rL;PWiPUy(|l+&`sR-i=CP|QVlJW3$gUa;CUcS+y zXulnEF)#pVh_IyVg_F|WzoQ@_Ao3ueG`=V|*T&Sn33?Cd6_0c$%(tQ*P+>V)P%f8T z3+(pefoj{h9DIh-w%rXw3y^x&)^(A&4yt&xP@XeO z-}mKja*s*xcXncs+`@=R@dW=^>J}UN0u}dq;!$}##^o~3?&riel2T>Cqx#}yaV~cq zeMha0Li!^p2hkk!j5;RGyLI|TW~Xm}Oket29TDR~r8qXHss4lo)U_`Pr`si}*$%$9S5{ED9j6;;|{s$+5tpXhy z=(#@B=3-)MQWd>pqnvc-%95;=5sGONZ*WZ>V%3P-xqf6p&%6-ybMzEu07;HLo3y2# zQx$XOYCI>YdHQ+!?dYN=RU)VLq2neQeHpfyUo_ZI-!hj3cnX3VIwYqtNckN7k(Hv_-?7wZFjwNrszI|@pAo}s0pNb+A@*zmn!4F8PagTW1B8% z5}+!=>@K#LlPp~I1~YZRSs`b2B>CT9ryeELUelvm4*$E6lb32aq9s}c*Kb)HahQDqwXRTP%O%mE00JENG!Y>%rN5( z_6hZEi?}0Dj@kTorq>UE>rdt2U|e@V@7&X~YQHQ_Qcrpf(C`y)P!!f4zAK*?uy4M+ zvj#8fDa6Y*G9~jD!fSRbn39V7ABp0P>R{&-lEAIMUVmjUBJ?fyu}-HxEFB4+QgATI zAgAc;d!@lj>F2?jcwkvsdh^MO`!g+eIM6P{M!LQ~vWq%P?VJW}W=c>zr^PG0^vdD! z;4p?wIG|_BU2PMbV^1nZy<4OXavhr7;!aYcYVp%H4kdBY-60;G0h@f|AT*ITOyczq zC#w7IWJ_!yPdj&#P^g>W&yqTBifM=N9K_PA+-4eu5DGMQuIaI^KNZ0a3Tt|3`@kBS zZWY%u>^42r(cg9V7Zm6`Yt|cruc>y-LJADX?|fit$e;^Z(G2RO3%0R;Jax_HeOc#k4`26iyc{#>Po> z)|&^7mpn36$%^tcw?!?__Q+wKKqKn=w?t(gti z&80-ap1*#nR98r1>@RQl99|H+cNoWRE0-NgJWc1ve(U1+)a9M_t9Y|wP@QGX@>*6d zcYQd!YGZZ$YieB$0&dvzKLDO1wDEmpvYuAYsBvgSn*6fMn^S)~$aW_ByVt+%tp~uF zfsgLx4@zmyIBkh~R#_HUez%c3oYu^hwzP^(EX*ILE4abzK;_Hy&${wK@H^`ex~xp- z*|R0-ib9J4hxgw*uXL<%{`-a16@?)B44QmLSLo{~b1Hua#?KLbimW$x_nM4^;00mt z{O~?Zbx=my)?HHLj4VG^LaOW(MQG9#XoExQMxwLq8v@c zmJgFae?xf#0SEUYY<}^&eFQB%RBF;;?;h$Joe|n3t|ij%D?jYLri=HesFj~wD4d9J z3Yi+M$)Bu2vnMF_m8sY)!}y|3Jbp-Dwk%zs(C}FyyUq05zKz~HjxczpVgZXxRqjeO zjfvL?)?@buix;VK_q2z!m1xI0OlEz4hMdo*xvo9anrP>5TzaZOXWbbL>I_a^gB717 z?bc_JJ+uB2ZNO0E8IJ3K6>B_tNyU0YkOaP+Oyl?drxqakn4ihe-a-{~Qld#rrot!z z-XU!YwnqA#Geo4lT;vjv4phd z9S2v5_E7nSVcFEjVxy;Cnujq7m63Cgzp&d_;y~ZOSc{9oGrh}&UIir=R^Ex+X3cUu z2qt+>S3r`;jI%>S{NjyNBK>zvA~~CU+z_M0prD}lth7udZ1jrh?`iYsL!uN(%kpvZ zTNsOs5T0Az%WYQNZoVB}zFr+IVq&>IgF_pRKTbQXJo)*TKflpTN7(t)`>f89c_$A? 
zN0e)=6kJqV%5C#MblTyESutHz_dhR|sSmZJq$J&3plT6<&3_%9=q4Y^%mDd`zf0?= zfOV!dvHJ=*ne;P{c|~sDn85i3&$cqdYd$*NHTgM1sa)L-+YGSTqd|ki#*IGiV5}u; zykCNNb2Nz4!{Lni76ad*119OJh}=)DGFq|+!9RW+5=-4f!zB90OK=P>QIf^kH*%e< zgcv+DG)z7==i9i6;m&o)9`zit=TS@ppN(`e@SR|N+%xi|$&*%_87~*7H@95My?TW) zIJ80c0C?=Lmr+LaFmqK``bYzOnfg`SdN4Xpk!GH6T&eJ*-b(0O>=AD#_I;WBKhKgU zqh;g>^n}35gka1u`i_cV8;XnT*FyhyTz^KGCyj+O2s--~<6eg|D6gNrP8THAi<8~5 z{&mY|`#go^p;v8a;9X<{8x58;Fr`fCpoGdZJ@LS8s$rRtIuH-`yflB38>LY1>}P_1 z9VhSb*!Gt#gtcGEvReh8-GZC!Ad%EtOw6B) zrWmLGSQd+T&_VfopJZu+q)ISx1}}3Er(U;3dcU`GOfZ+2JqJ_?E%(~<>i_3ZKVcOl zRQ|MBIf`Y-+YiP9)@7jNi;vhR%N+j+t(+zE*-qS2L3A9rGBEj?g9|b6TT2SBJNnKU zg3FuRrZubX|M9qc=*3t#a;k;jWLXPCa;Uu%#`XY4k69{%6_`Wr7myiF7NY{(NzC1AQCOUa;A&{ob~% z)UJfeMMWWb`r{rQ@JR*+=hf{cyWsB4q);EiS@_dCs*F(Kl~>g{3ziM!P)T;ZpT7O) z-MgFjIho{S{>4gx%{2z*x2yh)^u_;p`=4g~PlIL!>}oypUk&`9|M`FK4P@V=)7w%L zC6)hQ9|b%J^#8u}pZ@yqgZ{fl|Fxt4)rtP=BL8)f|GLP3UF6RSSfdA%@n02XI9OT= zXU)o5l95f?C1q6ESvqCR1`by?JlInuB`R(Hl4&Wl2%PiB*TM;n&E^5tf}@L$Dc_zw?vY5cyTvLt&5QIBntKEBz}WDBf$bA0-YPS4CZAqDrZgdzAA zb8h&61F?B)rM!@GVR`O?8sAN~^lZVq8{K+>Q+PB|@M;O=vn~@L#63v&_nzy|p}d|By_XC9Y@P=^T@R&+>a3IyPnLS)B37 z)^hvfYyqdCE}ZxunMyfxZJe(${ECb#*A3qX#Kt($k$W?Iut1l><0E+~%Wu%V10BCE zSBe{sV-JvEW~9h?-1F-1JgBrpiZY4qntj@}td%80-~Mru)+uLgjU~~5J}lN!#0riU zY)LvWha#raYcL*!O~~?cFWg-X0<+6M=?L!v~K1SN{2^>0Df412>}w z>HPf2M-zvWpYE9XuVhzC4xFktPbDnR8+tUNI>;l8ukB2)2s%&|E;Qmu3N&^lDu^(161%TcN3K{t8FPXi;s7%j?&Yr!NFf_^;&iF3taVG zVHXxJkHDYsw9A@xMDSNHJ-l47EsbMY7-Vf)rZD7rPrmpZW70|l%3bfOs-cd#$HY6j z-7?!9zn1V4>S>jc0YWY4{Z@m1TjUuL+TZ;}fnt2Uzj@o0Y|$K?;s0aN%tpi(f3x&( zGrskT)?a6zTCrDnN%Qwo*EDpceA?rS%RkDnx8_)In;kGB=bhX1kuD0WpB$1E?+Np0 zH}MucrruIT1#C&7uFtJdj`D{t0?_cK{+(GhS*5#qFyY=!TtxCu(wtQZjn?kj8Kmv> z=@!appt-%nY3zy$$}sK@YRR0XY(d0@8>9G#4{%XSiWAm5)S8FyNoIEalb4RI1IaUH z(fQ{mhY4H{3mbnetA+|zm{rQBHAl5<-Dt*>2-xI zYWYf(u*?nw;E00$7>AS)=G>z5`)zV9)JhNjum!?$QwI^Q;Y#Z9;Vw2~_j_V{GmU8Tv>1KFSgz+KzHK3%F^W(ik(H;>pr?+3m?bJwwnWtuxPD#Jr!1^t**EK!n99`VCeO92* zD_8L~aVs`PmT%DniTcE;gCCd|D;)|`Llpz6Bl3$=7 
z<$2kw?b<5XtCMqrC0F-~Wj?Pr~gxp#Z-Q&4^Q7`X`KRR z-l=e9qm^ymj{#CFZ#JwP50rUk;ZejMo8SrZKL@eI@M%p7E}dUh}FjoiJ{0=vM`Lv$b`y#Zdu$= zDaJ3TSEz8J?!0kSnZ5-F&TE+OB4L?U6CxLWsvYYO_$Epz6Qc<3Rgb7>REIi`XPaGC zYuVWeDtq?|2E0|{lL-6Ap^(stJ;tBi147C#_d;b~70L-3Pc^KXR)5YwsF3ink-R=+ zip4Fm5<-e!Ij(*hnO?Yw0SVZ302sfgYJC2-y5O2mAJ}cL>xGIq{QNXLPc`-va!J87 zF^9s#PcQ=RRsedvf1wE;b+>o;e572MTDpCMz33l*UT8|&lpDOoSg{e%_(rjWsB!`N zF(-cAx8*BbsjzH^N-r>?yyC~}hHWz$zOE<#i0fTJj+E|h1&GUbrBVD?APCoeogoscR?T~ox1jn zUPre0b_UBCwDv~!xowXb_+SAMQeD$Z#|tk|mu~A3fl_vjf)92!t9$A<0+5UOWRY3z z*xaf9d6z^0gOORhF~ad@bGtTU-4s(GZQU`+WzK8=5PsO1{_mP;NDi$?dWOA_Uuw? zqoQ!p1mGEn(8(2*E41f8xtl#E}B{_z+sh{bn1;{$66}3Hi^qRUPa#JJBA*L1i+Q7P3Jzifb z0yP-(Fu-5GN7QR|e%$&By_6nrv-iV`M zb(R#>wU}~p57wm1hFoq8f6JDV;PerfinY&9YtZpBFi`8RIj8SGSU82VuFK14zdSP;u8 zud)97-V#%&NO^J4%TH(2Dy!y!5yy%gg1$g(wL6RF+N?+q(zg{}D|kSIwCDv6fmp7* z>dQ-Yo-V=cEPaaIEk&-(NekmDWV0Kg@G+k2q~iPzR$>SrenrVev^zr@4{_42SNp3E z}F3ITpcJ~k8h<8?l77i5K*eJa`%`f$w304hHO#w_jMEtl(!N-Zbk z5&ZpzcGMf)kH#Z6S(9$|iLgmI!s-lKL9S{mo=4sUig=c@)3xALFT;ngQsiGTU4v*j z8`rrjOAKLd(+S01Ox~4?-)edL2Uumt?`Aiuc{CO*!}qHYNGu^i_`<+QtOchu;~xtC z9>?>%u@8Nr&gW0gvK_h+P@lyAKW;`acm|~{yv>P^e3Z1f6`^?m(?CwS*q!^mR7@+0=e3|xLc?zm zaAYJDd2eR3!yHmC9aP7T?m;x_Y=_;UJp?g5HJ;{^uL?K|F|LY&|4>*&3S!Qqx5V}E zE-HRUOOI$C#q${PEy}+q@T$x)Y;Bcn@|_Y_rn~HCa)b*C_>H^W`?6e^>pAA_uO@mE z968UNi_;OVp@ZY1$OvPXL~}2K7?bF`wQu~Y-06b0P!>Yl%L zWA*LGNm#wVy~^@>XEjY~rjs8sB;EwRwlEgiWIsq=@kSL++{k5Ky*mOMhXn4Met2r zSAit7t|pO2U8Mvo~oOsg}eh> zK@YZ4r%@3SZn(KovV0Do+lJv2Mu(nfp9&9)3m2^PVGfYx*OU>1M#~s~?ACi%6j*>O zs>S6}zxKd!E8~7G(e(ti%_&rE7Z1EcWjXYiZ}g`0d4ly>(r;)Ih})+7222B^l2PXq zd^mlh9~jZ@HE~r7-069oWN};(^jwFYDBOIaWY=HsCLQQAI$QSn97L9$S9$w6{1NSF zZctzGl*;c7jp#kx8tQlL(qqw{P$He$RKv4+Bk;f(Q3>xs<0EV~f7N)Mg2 zyS5QffubhF7)cT-59IchcQvdU#LM>LDxG57o&z;pc-6vT^rd>WFDbeiJU$Xzc*TKi zd`}L|o>Z|!_9$|HUnaCoDb3L-S2>2@UY=d2E;x0@<%|WGtobq%3@T6Rz&=294391w$iU9TY}^KZ##FsLlJ_>+lo9tCv~M6a(5)D3`~vq z7++hL9Im)lgY<&2f(0IYgU%)gNB^kteT=VAjuIH$mc#l1+AIW;L0=sIyE3ozkcy4^ 
z8oWZ})dta>%i(Z-m^p4e5}JSbI^1wj?l6}GZbP6GYh83G7`_cznzS--OJ|a;4@M{; z5z63G;9y^FV0R0I_1n5XpIK4w0^K%A%0gu;cK;w8nG!uI9*h8`WPg4zE97W+BbHJ| z3G^DTJwNx{-`ROjh4VxHBx*$$=b@U^T~|4>NdIgnSiK5+(pCWqWEGFV_B--YT<2%} zs#BF&)!y&S@jFV>W%t5MBKa1xcS@uQKj3zsp(}D_pZN>+<4=^{*oZZ~8Gq(+`k8`J zb*nrb!Jhu3u=-+7o{QbL>&<-~xhZ+tRCEZ@dr7ZLza6F^{p~(NcH8fjXdlhSDrZ)n{_j%M5)c7RmKX3GcC3 zjvAPG+YK3wUEXEU3F9kkOg?&`bQi{3R=BKwjb{Pf7z0N^l1ovYU4b6U1&3-_W)QSu zc`ycGg1%b&`%B96U79;CckZuv2yP5f(wB}p>h&8?J$Y_$`3I>sU3ka27_D4}o zY2?8hwEbFt8^ta4?wmf7yxJAjfW2=I)F3*m?r{3Bd}wX7XtLuU-Q`O{U4csOx;vYl zw9*Wm0;^s7up!+RKkx8YdYmo1DUg``rxu{}ZaUfqY7!M+R=;=k2Bo2Y1}^Yr^^a!^ zbxGA)X1GL*qdqO6k@8?H#_wH8$4Vt1re69=P}_dmn8`IA$&G(7X#4wv*>oA1^+1CC z!Aza!Mxw&p1C||XR(+fwn*JHDyfN9y6ywONu%*F8j7|aUOT-+1Rl@(5rqm)CRaulZ zQfEgtUmA?@2JUT^nU;{=vc4FP)`u!;iIfn@%Tlr9=)iEW#6Yh4_#a7rTI=!%9!$X8 zDe;3#$Orb=iqkuArv!@f&nw<4%krn`H$s2p<>=rxk+8C{PV;3ayyWSf#8BziIw~Sk zmz+?s2oD0?Y9Kig4qCQqe9)9>zYpad@&co^u`cTMu5=m3Z}%@WV%d=BM*D1)@T03^ zKV_wOAiqM6zb+gBQmhCa&C;3N3P@{NA~POs^Z~$U&pgN%`6uG}2c#f+z!;kD&H8O4!I+{WNVzx5R@Ww+7sMtDO@qHa z;CJMv3O5I->4OL1zJ-7(d78NQ;@Cdn2lBK0l8Yd?%1hJKtqiUv))y1Eg4bq(vV6!B z4JSeT`O+m@ml}5DrN+I|C1>`EkgknKk>-OM8ifh08=5o4OBODQWQ4gUID_sVp6}b(G;^3#j2Ff4WCV($51ACBk_MzibC6ysCEd;yJXTa#O zOnHGdg6;|J6BR>U`ry7~MPY3vHvGSV$iF9Pt|S$l-t!E3saeB|#eVGk4n(iwcE$Q| zB0F?%u$b-;SoJj7CWkxRG&5Sk zG?`3L1RnT&%f&*bUcI#~6sWD;Y+4W2~5Pb`W@?`e@@8_)GmFVxNr z-rx%|zK(x}JkV4Z!Ovwjx%sT2nO_hhGr9UsIIJMTPKEnVMsVi98^j^YWw|?{W<+eQ z32Y-XxbBVJ3_3ndQdtD1MXvWpoU_J&rg!)pAMobjP<)+z$F3-@sJps8Cfb7 zSpwS|q@nZYz6g*+Enr?hXkZ*A9@WmVj}Eyq?J5=^<%N*Ovi|HJ$fOL1?TEZL@w_9s z+TxU}8Zvyq0*Spe&P-RO%h;m53dw@KIz6x*=mR3@#ivuQoKWjpy4h!r*2A}kcgY{U ze0Ow+4gPqa2MH}>1%pJSqjm6`Fmp@`e1H5`BL73)0nECdHgCy1*TXas68AlNYe*Bu zjl4IZ5dv!GF(x^&=5FY~Lz};1e>j*VoeC+k$vO9CmA!HFpH8;DDp&`8SM#ooj!05- zXSOL2xW-h~UP-a0L6?#?Xj!sK7wxN*s(*-b98e7R<6w@f$Wx}9onYb)0t_984z~cV z;+O{{^meKN+wCjr&@Fhw!BIgz*`(QFo=i4L|I9Nn zc7nz;4COiNWU*S3Ja!Sg^#*g+0q`*wW&vzvDC_>kTXooVcZ=uGoLASPi)YtmRc_k2 
zlA8vtHY-;xKRW|cBYzuCt(n)VWAgFjr8oDtw^<8x>Mpny%<)1L@-D1Ag&MH&pk$8f zowQZwbw7U9Ob$1@yOH}3<1=|i`brx*Qb5m^7E*7xr_IbMro}eWWNM;x6Rn)7HU!L$hzjktiN#w&R1dUg4AU=WCjnShdt>qN3P^8qJ z3}4;j6v6)Q3S5~wD>u_Lwbh$FcASkY&&gY8mZqA$lSlDt8b&+5ph9TGFD|0j2a7w1 zqh#vezC@6>VHv$1>C6qYyo&b~^t!mgR*g3}7v!N7l+80nzkKL;J#Ho2u#cNukUJw< zy??eX>7!KhVPX2RcX|%`Gh9K}TJpTd`Q$aBYP*dH-K@SaRcja4Dl*4M-K*J?`skV< zy%bC6&+)+v04T;Kex1B2=@G}z>w3u9<}CMR2opIL4Mj2oKTmsY(yHegZ&}aqrOFO% zTzfeK`Rzx#qX3I9WEEFW#ckA#=o;dz`GIxJc* ztEBiC_fI-|=fn-7a)6`Y6Nxw;a3<)948e(m8*$A4Kpr2}4ZPMdM=Z|lgssr%Zj?7XzlV)65r_U#siBdcd6gmHM$QDUZ5n9})`jH?`}1 z9~PLp7U4gfnW}TJa{u}3cUVS?>dvN9cANFmqFp>S)Rm|?;0K;Vd`~QlKwnbe&}(b9 zjh}p^4OU{9ynGaILfw#8=z)jlUgmy_QFZ$$v!Fi~Z z=cSl7Bj;_lka!ZT{IjM3d$B z^V*{(x7}5O+@8XFHOF|TtpA&n%+MY4KugTvrVM`gu^j5{u^1jyY^%jvr@M3F3`@2g zg94D=&0(eqHK?~OX@De-=49f_CEHd>yEh|-#)h!*tqMi0;p_XF{Ju-WjlZ}NOyL0% zLnC4hg;}}VAYt_p2;xb*`fTm!{GMe?ZWiuW_`(bI`o{~=sY;IBAbEv-iOikJ;@mRw z**E-I2gW_6)9Dc+ozo+3h4~c7S}Q>vaPMz=p1vKjs-TJzil;xeAF*w6Ol^7yDMSG|A&bTILMco zDgL62dT-F?Ti}9NV)LOX@qM-^_UjNaM$O-_+;LSN*uGju>bQ%_oIbkjadGD-xsnk08jXj0W6hI@ zRnh!etE&*7{)j5Pz*VofITiu2A(?sK0B*1Kk9qsN_}#jl0oF@_ot?jk<3Kl+(}IEJ z;Y%6*sHevB7PqS~|63W=gJA%t{EiNlED5pFYlMWHGsIocRriahc%tkv=33^#!2CEFWS z9?tgSE(c3;>Fce|^z>wdbeJN7;L*8+9qGc`ZR_fEjCOtTSRQ{11@mmM1Sr=CFWYq5 zM+EcW8W9~G5wF{quRJC}nvpktfAV9*;i? zjhNU?qXMo$iZ3e_|~D_(=eO2{SLtgfg$El0_q_m=4& zhAdi1T<7>sFCL|)J~uew0L~=TPPSH0Q$})SA`dQL#3Ptz_Faklq~}Rlk}^Ec8Y{Iw z%F3Lku^(u*7N@n_Ds`se_4jq8eF06Cj3>MV!@8U-=eEocSdkEW8|r$(cA!iWCNbo) zur%R#`1usY$0MeU|Fw{u5qF-O!c5y7lfYX`uxwNy2^l$uz-wSO3UsW9t_f$s#l1Z781K==i<|L)^^qI)osdW&&=BwLh4Wc z4}0$&Pxbr8kC%5MyUdJGlV0^`;MMGz<~=RQ z<4_tlN9(B;sZfzGG2iPI*i~EABf3>P+QWU2A99p37viHV#@9W52g++3kx$!eJcs%O zooZ>!T}}&>7MV@{WH;&dWiQy=D2dQqTW&hq$9EAOxV11qOn`s$drWHVT|WMycNX?< zs#dorj41YLo>b5hoR2A-g+X|WXJH>is*Z*aA3x#R3LSYAVQW^x>OJJE)_P^GD>Vr? 
z9zH3HpCTaq#RMtPq6Mq2Iu><>9!|_4Xm}A4%^jN{8+Ve2%i@YxKYwundgls>y+L;L z-P5V7$8LeuYHwI$L#;QhfqjO`Kr{D3LhhRM!{k{N~^oZws&PlTN~G3uGW3bx~R1Vj{Mo5 z<>`(i>u6leR;oI1(^8uEZ0*Bt_>CVH25RhuS2L+ATgIcWluy_-qlDAF1Mf4s`ETqR z^ChV%#}60B`jA61?A(KWyIVVeo;cRxcUJ);yVLIQSRTK(&J>7lJ&Z~7?Z z`Lec51v}yEsbx1@ThJBOA2`iq0<#RSaq3x5nIT1%E-ynWX;@L22r34aJ`%j#4{yFv z2sdu2O5@$o`sR4fJWUjle z#fw%;TX`(%JWF7d)f1i)L5>!!>yZovJ! zG1FO^D$;#ax{D#7P2~lsXTlw|J)0=5Lv^vW>~BL?cXOBV^PkTRXMWlIz(`#bD_^}$ z=-|r_3jcY%RDPz_KbC95 zkBNBMImyK|yZScZUM+0UuT0R0oJTlvK+3#+e$el2{j~vqZ9nF`-OrVqq7z?D6)H+q z7#~WD@|A?*>%-tyla1!>4d*ZDs*^S_iR{(k;+lfDc9+=u+*p%X7Pa(XUC(DMZ_ByH zIQ;k;nQoOK(lvR>XSb&D-C(qxWU_ZdNZM2v)NXwosI=4jbKZOtABmfadm2BbhgjWT zXfda8r1tSBu)_KyKEM?=nlCKPYFqmv5VMYqlx|Fxb8(j(Z;<4VK8bG$d04M5u#uPS zJs$!+;v{amV>$KK5=zj@V>!2!lcjLJ&&p>ep=OI)cVP?7zsbAa)JL4n`k}GsLm!#- zvi-h;)&5rn5GK4i6*y*hZaBsdZgsK+YLfASvTq>w?w9Br`>+rG0| zK>7Y8Bh8v;>y@b`2;()ySXbL_m-lgU@6j&ewS6jmx%5 zeg0shg{@l-D>)xu_r{o|rLAexONIf6>!@wcY0}}eM)JWudijQlRC5sJ7CP`Kd}w{^ zRMW;lYHb3|ZOyvYZW-M~Rehc31*c;}@ii2)HRLNw{Js^;o8r?8u-Q2|)nrHMMGe2< z+n#WZnsiS{f0eB`=^=c?4K|ay3UW>AGE#Q){QPPHFg^5SxK)eIVz)oMi?}U;+3~Fc zkAwYmR&{GPW+?0H#Im86SNr(;Y=^EnGP=9U*rBhU;FM;iPE?)c8;JKdcQR<&Swep; z-Cqd)E?IPSKRq>J--d4NwYjwP2JhPD2d7zx5&|nhd+B_bxp7)uOJ(I*r4Xo4zq3oy zOi~SO9^~o;6VFW3&7cwI{P#O^EcBYZ=NMcKm1+?h}yTd~t#K~&D{y69@-FJIF zXu!)hNX%N5smHbWEo)5${}Bkku8D(umZO)Nk|*<-d`E6A!h`M$^p3*4;_ar@u9)!E z6{_UI_gaPfUup+~&UBZD{G>?gB1IaH^XZJ-tH1AXClLxB3`MNzvK=E*69 znh%#`jwiIe7*629P0$_Frs8Wg?v+u-J;aB?(tM0FhU*_10=gdZtqJ+R4-Gu{kr=Wu zu21Y@X64&HK}*9oDZ+_ita=b0LY$8A+7?c6Q41)Q^(&SB+|rOt`wh7%?_`4}hdLMt zExjqAy<OIh56&lCizvE8Q^2o(t8!cPDs~B|6|VsaEnvgAAkBG@W3} z6|3nD%lEEYkElatV5Jv=cfU8!H(`e&2hn40dqeTwTm;K!E$#4Wc-On38;8h|49E_$ zSY$WZG>riquDlKzA@O&Lq4gqu%5s3{R?_oFMas(17yA!x18@KK@OskS|4|i$BWF_ojeD*6~%iYH2?08BB1gZf`Hw zU$WkV_V>L&H0-iax!ExI=Ub!c1K#dfEji@TvThLbSqNDrO))Ht0{t!R^VmE7Y5|bD zBp*K;!!0)79*2gC{ki~{OVO}%VAI2@CXyksFKHjUsGF3v2@8%PlZ~`QXRG+|Ult`) 
z58kL;xm7d8b=aM+H+|hq_;2tu^D2md;vA)IO}mCIH($_wY=$+Nx7N%M!o_UZoZzMsx3Sc85!vn7;Jt|$3lrvr~qtb#{{wCmq`wr zq=}C&h9Yg!x5eR)ocJoccoHP{3e)nI+n(VH)&5g66?7K>s1{3ZI;fk3Lb}Q#&s_8? zmr})D?GJVe{<_F&p!lF9SndVR&rW{3rUzXxnTOyh#yt;(Y&ITd)CYb&bY6?28i~oe z@+aVm+Y#_v18n_f?Ax2SL2aFYOIG1kJOhfpJ_jd_PK|E->088y)UEu972-<})u z`HnXh*h%BCvH!j+iIRn_YvOwT2Wn6zc#rUt?f>bflWRFzkE6LGtRe)Wf={uclsQ~U z{tsmTV#N}hoU-0EFS7A<0C158zTjZ_KYsn^ixoi0AwxZrp-2s=RddZSu?_nJVg|pG z=$TRrj!)pyq;q@h)03p_ERXr&`ez#ikxBqtx8+{f5*+XA)^ln1?&-dC10IOnsVy9n zW;G~Mrww7c3AlCsj6e}oNzT#U@9&Nj zfx{9}ofbv=MM=iCma>BIk(7*_V|w7S9TH_8&7R4M#yr24UTzU@khl1H%75F~&??Pc z7bb2g(!9v2973Voa;lVaFp#Y9rU(Acf#vpujw=WS@b`ANC8GScxnG4y=qQ0Y$Lhzc zD*-l^L>(o$xu3w{t_+4wO^=bZ%{IBauptU7=L=%r z1@Z1^Yv>z#PP#k5dqm)OVy`NGN)onU>|mB2?~Wl3$WiSks-BX(4sx{0^Msg+8s&3&(Y10gF*w8>>awaV<)ck_GElCtvy_Hp5HiD2^{kK2Wa zM`r>65Wv(+rro^4ZdvWrxh(hnymB^9G`x@;f_Rg7egSnCzz7WR0~{@tid<6cE1rY1 ze?fl9N4v5*OAY@F@42J}_VrJ+Cidh~>v9JhCW)w#6zUKkfL7+68ESf<502<|2_MrT zcSHs3ugqlLRp46{sVy~N04hoM`mu!H#2cQn#vdxfxQqeB`xOdN`}NbUD+k5;j5+9t94tR+Rqy zLbdRc1%rM0O{vW+5OK5M23iS%B1d|J9o15Ei4U__O~NXy#ZZ(QFC|`08WVQN$h`K? 
z@n(>(5XzA5gsPoZEAa6U_Dme`XIj0Qqh|B{x(Hq&Wv2UsCH0H=w=-z z6YarN#>dk%fFr&g8dj={Ya?qWJyKT6Tgi~a!?_O47M8N5UJvzfRkrl1U#dT7hL77e zr)Nh>*nVc0oOvmjB_2eiz&qt=85j75Z1PFhyGtt?WK5_IO8|`O^qNMkN{V6RD8V0z_mj>7xkT%4?P!UGWwMd zD%HM5Ke*)SK6j8??-CF&Brp59$~Sw=UlzT%zuUM*P|6fwIJJD_8a`ihHZ34}6u%WiRpS~8hyKr1p z#thlCBCa3#o}H1x92(@*Jn#EvJ|(}!x7jI(xn4E}*%Oi;;0V!aAHLDpM-ECCmc5%p zg4U8~9jqYjgKPC8&$pKiz8F@PVP~r0{1E-X5kwW*TDCuf{8zUZ?f7m{I zP8vN3J`yt7F0j$9E9x*4K+B|IfjNYyqwV$=WF442j>U`gw-NJ)Gj$Wi`9~cLOGf6O z5=!T;?vLz@IH06O3+JhO-hz3#gAbKjI=k$(%F_Uj$#nq74b$oItkun58h7d9sfr5g zwx2Ou;kIC)jLF3WYfU5r2f}^uLd-Dc!ozmL%?p|Cmub)7EmD8kNSSg~>Hsec4D#%@ zKlqDW%9O!LYIFNCxy(+cL&cp<0WS|T2>e{(Epi1!S<+Xh`nEHHy~`^{S>8NH6*E(t zk~CF&X{uE_#!TXYg`b+y!nQ#fYu&rq^?tJGyvI4MKHh;a$LIsq^!EoJD)6HTr7poU{)<+v@h5w~B5S|CNi;9HV^*KU$Sw=$CzG!msjF zXq|!!A*2D74EQwt0mgC{Lbz^vCY~o?GyAd+8NsKMxQ%{%Yg)n6yS_%})cN{{A5Y&I zWlFauWfJ6UjWNw`kRnouKSTFDgF5|IyT(~2=^!;vib1B}*ZZ&rKmUNW4=K9P8mf9u zJh;3`-DjNw_b8iQOB3^O&z3R!_y@)wJg??~d!^Hc zO1JR%K4v$hg=u@BEjr`4m0ZSU!g$Mc9)wtr%q~+gT|V+(Ox`{oJi@NjxolxrIbv4k zjY#fW@>Q~5%SK$>C|RiaD}4sa^}6Zo-hFIg%#yCh#*clqbb`aWkM47Ib~QWE5dEly z!+^of)NrD*aWAFsMULo=BW(AAzQ%Fak74^XFK|ToSf^6p($)>b(&o(CgT2Nr!{Vl^ZV_paNqSbcM4tv|L*M|MA=S!ZUgOeUj^y@l)e zIrp(spWMjGDX6sj3oe>XUBukc;^OR-*nXLN@!_uV_w}e_PmBn&S2ku_gAbNYJ{&U~ zJEC-_cVPbV0?J9&WaO>Y^F!=zYP+R0+L!&vpYuqxZsJY01T|CTh*H&NbN0Q?fHn8E z%x!U~%ORGk6m3_Ku`D9v@m%K{*#v@`X_H6)Ls0{CRVrSp{Z7*}iDOdGE)DUY?PK4DHSDRmDAN4>Fcv zBUV}4p0?Yt3j~X6MkG6vv}g@1&t_voy20#l#$?^hZ3YK(a_OA5)>cEA%{SzzR&rT; zF8}30&zF-f5uy#gp#@fVJjtV0(-|gD-(hsKPGdJy@w|T_OjP>jZH+a`jqFQTwfLfM z^_^#G#b=W?Q+km04Pk_!9|^zZ({YKtk?Z0V`#U6Ij^3A*;!FI8o)gaPs7jl`P~)hX zDA}oAzpqz+%QC-f0&zC;HE0DHap#7QV~~J5-|NCovSlK(8B1C^U1j9dlq2)*XC?b_ zhngzjz>EQ}%SLqK(NmuLee;uUMH4Ea=7mM4Lljw<@T#oS3hv}494}dl1SF8>{S~Br zIa(XIeb=R(set3KN1f=pJcDiX-X1+oz-UTq^yqzg6&Y|p8rDA$JA^!~DR<5=_8i@R zNB|b`cgp96zCxqY&9As)#mMkbl$qBI+yBRuVQIjuWh`aIJ}|jUHt)A6k!olEIEXTnVOB zFSpnp8;YNNxS$|P*1+;J1Bou@U1Ok>6D-{)?4Ik;sW~==Z+A@H&rlTHe=Od?nYZMr 
z2&#z0DiSt{bs1wEnzoQ0lkOv^iY{J;VMtT^5w@lUmC$l9TNGwqXVlxgk+wE^=USS% zv9U4E85-SO90ZY_kz1+JC09!@R6#O0J}YYo@Cu!|efu$D?X#FGD2Smm2=XlR{ybIx z6^(t$eDF?P>ib7C=7YqCkY(}Ma+JLg*M50JeFfRgD1$xPy83CaOJ1p)}8r! zmL_x$9vVu{y_tm$dxM{jR6XE^$dWwm+rWeQP7JLcXE|i~pMhH6q&qZd7l?Q=(UhS% z9^4<0J>$CvXFj|e7*#ze!Wka97La%7)nD%@c_=Lt+G(BlzER=uuEp?Myp@r-Ya|VU zNPj=#C)O8`QS06i z$b8mYmbP?x8L^jyzToA3*b+c{7OEsOgsA_K%%~wtj@MEwyfBdAdh?6t*_&3pBnL)@ zoAXAh6mD({$4dBhNEU13MDn?sxu@~i-b{z7T2t&oS>{U5r}x?q=LopU%At?u=RxHK zkC9k$ZZ3QBj1p;XSbcYftZXt`Y0b6UJ}YUsYly_?ShW&{G`kjkHu#lcPJ^9O2mVZP6c+eZfscEMGHOFFyJx zt9piwNF;_;`7+6Kyzp;p@xk0$bmuwwwbr#bS)R9EL@bkO7_nqMzCy@@Arq``4O@Me z(_sJRL(~ES7#%8=ThnTuedir$!!G*cU63U*r+W19eQ0&Ke02Kbk88WGo~AQ+G3hJy ztrQ!^UKbojn#fl~VH3(hw`E3{d~R7d2R-M~r+Rkly)rM6GF2$YnePpdta7$J*Uf#F zb=T6H`tvc5*%Iu?X&TDeh?F5uSVpGtVvv6Pfp73)ES^+*@2);JUj%dHRQuT6u8zsX zw)*|6783CX()f;oaED=v?*-YB^e*DPVlyO?dkfiBhbxxVlhK|ZLvjAUcMvGbW}<&? zk1R20bx9VoiD}v#P#tR*vBc*8dwnLMi)osdUVA&~iJoj;58)f7F|f|^72k8&9n3f~Ki+AV-6M&sop9+=G~TIN zt8<;dz3=@chyc?gihdbtkSgWv;?~VmGa*p4<4`kc>52CDtejF4oiw+N%WP?OI!A!cW+i|8{ufVLd9)B0YcnGF{G#R;y_VhHpd-VQ^_p0Y7#pJ%j zr{F}{ihVMg1<#qR<2XotpvmU)TUBH?-vam8wt#dC35~l?^*5Nm2QxSKn8zY}ad3G3 zM1X3|7_rOuwf%Sr49A`^f;WDtId!tR;aIQoCAk%L2SUi3V=GYblf4^CU z-tqkO$z$1Lt>I&+T8O{yp^$4hw_SqejV z>uS^3)F3^OnC18)or8TE{ndpve`0j;-P|R;Rij(W)gCFY4ZotdI59DCt$vB4)xNnj zHLGArXgTP*hW1dXl5Htw+(1pX+e<92R9GOrS6z88P;-9anuY zIpD6s*b^RQob~0?e%ZRwtt|}7X)$?m%Oi09vQI7(kAKhJ2IRq{=in+ju1 znf+{aM3-hqvua{b(@e+kl-P3FL*)I3q`q@WR(t!LMGKw)TF8^$$H1#Uuo@niVys5I zMTqXkoPObx@U2vm#kol->k_-xqk+jTwhV;gG3=Ns^wyhaBa_)|M0c|u=diE4P6L}j zd!|mEHkyNOQCCiY^K&8XbD!v1U~ue-4aXavpj;)MBDKPwXsN;&U;4hiQXBX|GDxki zZRPCKVq3d>jIiO1=hC*u7TYED!{_2T*{CsgPfine7(asS&j zSsPkow^uw>y6BO$`kP|_7Rx@RdOAg3sC55rlz+*{s*qo{Pq8%c6+Bu=MMqV*sB?KN zJ?7Iq$1eF|%v5%SH)r;zMr!7t;Yn8AANV+)mX;RxgDT8C&avGGz-3+l9wD20$wOgH z1^lK6m#Z8EV5mA@3Wfe`>F|Mhru1%kHodNc1EKoA> z5l1jHFlX6)MMlbbr|c7qJ)Vb}=q~@z4)=L|{jQF6?qmBA<6*H5`>956EU<^;w#B71 z4#nS0-jB<%qQ>lCOr`LHA6*GjQ^b6Y6ifF| 
zpuWyX18`Q(NA}Z9@4pj$rg}~4t|gb3SV&S8oYjNA1pMIO*i8mlhxg^EGOo zl1Xqtx5Gl5Z->f?J18nSmc)S+vt>WcC!H1swlgbz~rSqRze<5Fmq zI7hCt)I6+bh}s%4af*`q@r=i)wohc-UQiMQ^|Gg+e9b#qD~%hy{nesqKe&xpECI7$ zpU1d^sPED4yhD;j{BwDsExhn~yagw9ftCIBR`MD{$GT5PyNJNstwLp0NQlRnO}?Un zjIFH6;gbH!QZcN|Z4~&3d&W;0#Zv(6*gfZ+kL+OuoD%Q7QUxZF9gm)D#zrv3YpF1Q ztfu8~<>vSS5XR?o4jz2L!i0NIZmtEB;%2+;Jb+Hm7#b_^*@_7?d0i6BTdO>kyeH3> zq9HaN?Db3$-`w{8)}tBSuo$pV#uBGKujMj*@#w1R$;(2;yA%%nLv%;yB)tfl#xW>5HU8 zy$~5-Z*Uk~aqR){p!u@T`Z=OnyUWV{M&k74w*6Y%g7k&<5MTx$f?aNU1e2slvI%t1 z0&I(z0Cq<`BEJ!}NW2->R!YMTkov3CVtw5KmX@l;0n23#jzd8TI#}y}f#Aa#R+dUB zD15agEv+gb;SV0uNhvb@XA@LAY!{D$8Y6RTS*oxNIM1TaH zwHsSxIvN5z02Yk&iL^<7n;SpynH@P8i!nLg(-*1)BPl0Et!)n7pUe2Oxxp7x_Pt}dd76J)LR^w^V* z=qNPtrwhXa`&>&5G{m>$S%&eLh2f;c+g4n4kVZYai0|hG_pOTT?#+GxjeIV?p3K!V zz0aCtTlyAC4YB$BX;4E}dg*#zy4g}d;P+3op{;`6DLYHp>^4XI;{-TD==A$)$UL>(Se-t!Cd zU}8pAa7MjhwVun?yHGDlbka=+l;;RUb@ z>pxEy`HnNT#k02yZsPx*h7V%ukR4y~oeO3;lIpi`3qngC7Uuxq#7Mm56cn!|_J*J# zSHJ6nyrjd)Zbu>bG4ib+BmMvAJr%@COc6tx6iJ$L7tE0XU1;@mD8pEU=+vm-#8<%3 zi(43N>+1g742m1Cds4*(iCU=4dlGM4A*!W<+8C3*s{PS9+4>O*B34=m0ku;N+f%;d zA+*p@yL7=VM#;@Sh3t3s0ImE3OA=%}mFOtJxfRBblCAx@kR@rhXt@RdtW zy3>GOxb0J$cQXRr0xLI?Cp=wIe?~d`O!1^jL~#{k?v!*M%E+-XdVKx-bf2QCnwssx zXCKOA)g?Lt4rZ9gKR%~ic>y+jUuj$DNb3p|H{t+{;w`OkW66rB z1Gg-5BqGE8$Av`zIF6tYo>gP z4`=xT-+l_G9S79W_1IL;#v6{&+m)Q(y6X!H3^9nnBUho<**>wP4_Mh4&d?N}loV=$ zKC3U6@qW+z2~}={pwY>Ulvd{$u`X52a5J5|EFP=Dq{eCdpFwct1`x&_L$J)^0<*t# z1ozYD7vQ>Huh15wGH7pV_za{PbA_N!!rT0hPqAK10CC2D&)p&g3!7@Ych>7FTCuTi zb=aX9{ShIWAD$uXcl|oc$=#&fAfWN?T3FVfl}XXy;TadXOv5h^Ej`HTszZ_vnQ(P@ zxUtaHg9vut%IJlEjS!f(SD=!M;f_P(@B&^9^^*z*hNfs?ZuCmXWBUgB)ePg4(MF#G z@#MRymYmOlSNe|;_niUrt@h|bq5SX}q=+ECV(~qPs6|viF)xcBr?g^T&dJR}d4LfQ z(`i*ecKJUae=PSFaAD*OUK2i5G-#IQYe_iAiR#+aII<{S3=3S7 zQ;x6GQ4cv8K|vggX9CqJSJ^w%w};@{UO1~)(ujI$@XyTHW_Hg}gf&3Nfg;UNMo zoPqCL)s4&iSbeQyb~1vZD}Z7qCG3g4`aS1zrQm*Bp6;VTPB0*(pif|neu?8L~ri%xPlQRNzUrholyM$#VPpN=|RnI>;-e+!S|wZMnML2Wn9jKlSv-J zb+;y4VEm7IvI1cvJsqJYzzUwfSb+1q)CKPSY?Dl90GRV?YtH&lIU%_RIyM)$*pFcF 
z+J$}bDYv?`jRoBxg`7CCmumux_Yyne?+$hUGg)*%3Q=Dg@1F?O37-SNeAUYR!=#UnKL?jo)TuuGzV!3f&yC>v9N+0vzu!@G9SDb-8umZ7UC78;c|th#Pi#JU;6peB6x-Bym$7{@9U&Mq>-7|LLdBTl{G

CE}tzy#3)dWISGDcR+4-S=cLzg>xN|l98`y@nw9m_PA&TaAysbl$89) zKwp!uw%_PZNEiY`DQSC$`Oj*_2^8VpxtPgKE+FO~DZ|VkwntW7iIN&n7n?rwL_p@& z$@4-uTNazDT=Wl}#VJJJCK#Yv;0uPVC&A{2!RBzwo{B+r$%xDnR^EF=CpW)a2Z|Z$ z^!d`Cx=n}kpk8Ut(FL1>N})aBn-`gXc8CML?}2k%asr|JmVWS)bpUu`PPRQlQvs}j z@&ZpPZrAEw2YEg?O>V_y5H_>={xq`x_{WqD7>wKUNe=;~bTQ{&e+ONpaBpQf1Y=*6 z4IBEOxj%V`g#lbgi6OSi*jZ65xR3allrb{@97xbK;1?+(s`R&&J)w&3I*yi@UaT`= z$JvYED;E2Ifz&+Q8wYrRWnSG-TKwag3g8jMr@0y;rze2 zrYo3B;nDWbaHT08vO+_QKgnsrkt6N-`sZ)IX<{+Nkt2cLD&k>9@&6>pxSEmngq$}x za(W_4|1&i9pXA^iswP)x6g}9x|0Jh9x9i3) za=?;@`qDG{9~JTeOZl*Zv~N2bTd_8%9gS}yCl~(dVJT+-iQDVF$o|Kf1)p(#vj}2e zc_bOwNWcM%* z$zAr-f}H5=-;4;t08?wRJrQNz12Tjuz=TG8WAA=@^5+60Af7H<$Uhc#xy*FH>a8I} z@7r@7ox$cSD=QwG{YvD^DzyNf!<03wzSzDo+|5geBtxAzZKuzKvdbxpaLb7NL8YAO zN1V{Ulo%wq%*I6-GyJ6pc=R)NT*j=#muHyseh2Lf^Iv=)$AQnC~n6^M21O$ws8^+}GBUycRz z6nUH+H)m2tSn%=~a{Y--{9?~t2MfFb=3`ALPh;7=GA^nt*67EOB%_|3Zo->4wQhYo zTYc-}%;>S*X~8f`YziR)ry$iLAp^qX7t34kZiQB~NOIPf{qpI!ne_0A@_F&PY8F z;1OG$TriBIK9u`9XZImP%#o+KI6v(PgJN*L;KzG!uKvj{4{EUMHPj*rqKNSy3L2|% zf&r*o^WMFCd)qC?rDm<6HXSkS^*|`yhVOxrWMMJOZaTnHHD~)I&0Uj1T@_rS$ZF$; z`Y;l?1YmCD>+3qn0>^NamfjJX+!CZf60$rp1!kSBL^k`khjHRh_2~lJk*Ct{XmSQ? 
zr!3;P^QdWWsW{l2nTHK7*m-uP>w(L!Ds@7?THtx0DgGS33eNlsAq$6!4{~$3WS7|J z*Vy3E_WfCk^>fK~LJvL?-qY8&`}*;@4~SuTY}db!lCaNO8xKn}7lBc-Idd3g=_h{u zsj4bCWq7S0HTHW>eqM*f0)t=&S@}zJKuN)ea2M zQm^R{THC(Nn_DnLqjJ-m)}8<;2^FlH^4m}=HEEF{>94ZWPXjr*5&%$aMHtpN4uibf z{P~PPXjcLsdNwXNO~QQuWQNoA?ktVh!1Q3yQV;}i+H03RdUHwIXCd6=AZu@PF&i*{ z5#Z?qaD)s)o86ABQnBy5$ER5BO|WLncQ*Xb%BMIFZ{l?DgT?wKKFSH7hy$7%SQ0aNc@lVPWAO@5!vV?>u)} ztUG=B&i>REu+s7Y7u`>A^_Twrx=JDUMGjbICc-VG=-hFfTjx|>T8H_E)p8tC8GN^) zVX2qRu+i&nz)makQX7NQP_@IjTL*iBb#JP;#A@w~5Xehq^55)lm~ga8K#4KyI}O0h3!Fud#vgh_PpyUYSuL8I#1q zT#6H!J4l1FtZAJBQTROoa=~~Et3Qk9^9Am9?e#bXWb+!=W2z682sJo9o7^dP>FJ4(BlO)n} z|J`5Y!Ey*0IEB;tm-oJmF0G-YX7dvnN1OQ+ShWlpX^hDD;MRG00picx95cVRT|qQB z+j&~I{yz%~K3EMPr{Z`$&yU(ETQ(E{-Rv!&>Ex9(!Q`bl62TyqOuFRKo0^3ESeh6V z*F_kJ6RsyK_*#4gT!lfKd=>##P9|AlFNy+zAbCyqPt^Ls(cMsgFOVJsR5r*Sinb|%l@G&c_e>FVs>4;8qGSy4d&VNhz%3P2j+ zn5{_Ag^v&#Y<>`-@Y7piF#xMh2ZX%nAF%q*KbUHOKlq8hU@aes#(=2Y2=GrYJmDU{ zBkb}`xhF+bW6HP62`v+>Mx$bYvp4JIlqJ*4G%N05AQ(ahld06~fi{db`EIZ8wN}}U zEV*Xvr#63&a0-^|cv64^;X#{&$%hS%7~wGN-Yw$xJhc=Wd0t<|uOJx0(0dBbWChSu zJHm2qdwBH9VtxkM`%z%MHvvaOS-N`#;9by*HmAq7o41?#jzEgf$iePLYcD_X7}R?< zj-Qse#r|5_mYL+@Se0FV#__@Y-h1)@iZ=Kd<^aCtH!nH8QLisG#>K#MztVIK*jh4o zA*pLC3N-O)=AU+uf@g6{wOlbU=yp}Ah>w5RF_pW(zI~)Axz#mkr|dDLrDQYLokTD8 zC^MeVq|CKfT*;~N`|U(W9R&f&{jMiG#NiU2tBr=7#2w8?Td=US2kw#`^kL>tgoRAhyBG5|?>vzH{*jVZtn!GkLuJ$H zFN>c`#_Otgqu0=j^KcZqq}srK;G|5W{zyf-SnP>7{Ptio&j2k}dDO2`{;et0T#3>7 zme|O0POSO>@B|9qD1?bfvcZiZ)e7AocpA7rR~SQR5}8~+vrfa~NjflbFKG%&@4vnL z(&A=3zVkLZFmMOu@x*{)bO>m849c<2DPSXom91eRD}3`?tj2|yxGyHBRDQZ+*3x)2 z(#=^ay~iH&7aLlpWJ=oalqv-!{W3290|Yebfqdq^RnLXaLKP~WlFNvtN{WC*%3-YX z`Wg8^U?Y>a>OTP)QmszGEM!NKS%nxduQhE$M^9Ux9@Yr>5xZ{t;jk{Y8(L%CFwiTt z(fw#DG0nu%Dik%m;~rjpr8}UeVT1>=-zM2%KQacQ+ssb{d?T_Bchh}W&$*gjGblWc z%M8HTxF8qmS_iD5sA~Pg`Q6>c8uDd#v%20S?>{{}%_7{8>s@m1Z9{=i3N zW#~`tK--5h(8BIZLx(@=KN-L+Lj*e=>t^FMw{c;&b%^3PsnvY3!TQe?RD%0drE8aE zcVd6e2+x$;pqd0IwD?fWypk#~PHnZ}hg;@Smm2Lo8LeXCkS2shia2TDFbNvBYys&P 
zZzC+d5G$=dQ<}|iOUsiT>TfLPHExGsQ)XCruW2N@`|4&A!td%C3USc@B1v`7U zLg|wCt@ca4CEVP&bdrm0RmLU^BKFZ~4tky_)B}&UQL`{6ji+B6-yJ?#vh(KgTG@Et+G4K`M31MjNEE-6W-*w1#x z?R~r^(_a!@QeO`N))JX>C)RCmF@J5Krpf2KmF#{}z-BhH&Bk1h+F{Z87~y*_KxQQl zgA&p*#iZ7(ZbC&@@^)2$%!wB~pX?*~FA+E>?G{E)N;g)%LW0@1DQo3rg|g|_!%G4T`b#3f{@p;w$4Z^Gb@*{;?kQxAXO2agz7rR9RaqJgx}CZS{lX6q#~ zEbW}7Bn(r*($mZ5+8Z(=F~)U%r9!ZqJF!B7JI4n^#pEfhR=B@aIG1XS=T>{Hj6I49 z@%W2?YzSA$((Jzc9PYB0EPNl_!Z?1e-@IB%?^M?@!P=q&b=`)OVDJd{uH4w*RvsVu`|71d81^d7# zs_tlWyjZ4k58|=|U!4>kYXHgMP&++t=VkIDw{~U7h{%dasaivs(w!7!?nL~Hsxr8Q zU02)oB(}3s@@|-oq2<%!sS9-?Nowj#1x!hw?>UZRhNex2?Z>}z3ciXRvs1DyE-vv< zU&%>c`fM@4dm!0TJyv(Oe(Vmjj>y>Rfbc7w@#;>bZiCxvr&u-dgfeH|gt6%y5fw#A zqcRbfq7vPr>b6;;740gQI^EJ(PV;R`JG(Sa&exsDdNJ)CDeB>RBc!-R4#zT4hN{*# zqtu@I)YvYLY_ZOIv3a90sNvGLuSUq+!HXy>Y(wS13u9!iOYikGJ>CTOSGgld85iET zSRF{`Y-+d>@{Y}}*nC!B!sL~@)YO{uaq+G$Sl{KYS(jeP;t?5yICIx^_PWur`St=Q zSmoseS7&uzkKSB1RFgBKBysl&l;MiD%G9SfeESmtPAet;+zgVul^w^aDvDbU^i#`5 z7;}Gvb21(;E*tn3U-NUkN3~S-)`(G(dg(KofqJa=ph>3sT!^@Vv`531*$Rf-Z1okj zb?fk2_in71EJVG5n=K4eZRczcn6PUc#E@CA# zbPM8ZwPC$*zUUFwpg_7Lh`a_xwY*H~E>yxL6z!kr8XoN^$0>7F0e;W@z|A7u0JG@LS!CP`~rj8a_+S{{AbW9m} zuops2x?O+D;b7XtQ=|(tqZ%7N~1WKtkm31et!hJDyE zdwC!+zA4|}N3cu3JhrCo^i<+U=?{2ev1p~!Xo5d6eNR)_)Sho^jf`x|ZTH}ohan!* zzWR^48RnNs9b^h>t{*#8uoZ2ZT=LX|Z6^;Tiu`GwSu~Bn{_v|$4YHrG<3i<+r=d`6 zFG@ecux>o8utADqG6DSOdkV5GCnR;pann@fCb`b?rN^OR-xd86lQW%&a(@~fO!m7A z3F;DW5Ez{6ceElnH0CkQc zI&LyXfc4+@Hpm44v|R1?)TUB#L%a$ehmffJbW1rzgA$B#!i?KwySZq(9Bh)v|NUJ# zaey1e%@kdA0vdb{LMdNyhUe_bh+kK{mg zQio6E-G1jf$X@O{DzURk%7ssVYf4~hv^-vHYQ$U-pZB*vegwrzN*4A6a8?o@M<4b& zzBMFX`N8vJWkY{~tFlAXen>yt!}-n2dWc`Mf50<&XVRsf5$gGNSaEsT)Tl}~g3H9o z>>zEaG(~B~X9e-*sbABUzp%AcOPA+jE(f|<)2Zn>T;oUC^gBt~-O+Xz7Y~Z`dzUZ{ zj?Rgpn0MmLXTZg_n$K&mIE*fdpd9~fF2^&7|MTD;UAWuC< zOLnCj(WNBg#`J6higevUb+=Az%y3nU-^Ld2`C7OSO)@mAoxWNLGYR{$^FH}L^9xs( zSN_L8+!Nu484<5C?4Ir_zS1x2YwmNJn(|#%5bu!sVHej0dp@aS0^7347&lrTGgK=) zn3tSPA-d%8;z5}yv*IfRbz9Ai2eane5R}b37dLWK;2-Er~efVq;oW+kK4)@wKB?v|*-2vRG{8 
z4BVz4`{X=kID$;5dj3ILJN1Xmi&EIw`f%ic4J;Y*X|Xcu!@&G_{=ii6?atg%|BwXR z*WpbXRsu4HJ84xN)T?(sP0b5XLnS>p;-35SmThWRY1gS_l&(XEwIyc!MS4eG_j+fP zTF>M^U1Wjg zQAYvqpWsVmZGUe*hU_jTmj$R69n?BcuPIKWy9gFqH0VI8=pSwjHU(_E1G00n$-%u@CdnCWV1 zqtZAGs~tvg7wg=dk;a@ew)TTuBik)RUoFCA@ufx>L6vZ4Thc)*l1ZEQiW9kZMCVw( z0QJoa;mqevC0yw+ucVdinsJx31~#>(EM2n09be7+b#el%NoDy>u;g3(GRx z=Cv(5Jas8&31wAPc??0rmRiWQ70OHzS_%c9{tUYPZ}$+*56odOk8)DHf^vLG2-7jz zrH#Fq?LWKVzgXZAslLV`52J^IybmS|vjf8Fggqr{N;!zlQ?|xitFP25qh1DSHk~A*aV{3UtgRgP@ zNl{S|mEtSlv%h)&l)!HKLkKIsVU_K@>%0=I2G`5zh5v*Bf98MS{=w)20uMXySj|$! z^rT7ZB=iF5dXcUk4LWor!b%_P1)yW+7AHe~1kL&|IF zB$7Rm5=$J1AZ==((R*WaNJpcvzP|nr2SNFHn4#3;9g^Si-2c&ZvfGFz0hF1SDAeh{ z8jZ$$c}k81wL>})pztKhuDn6813R<>l57V{Yo_R*RG%>c3j&l;<6U(V+yfT!B+1+ES&fQe9N8@S&9w*>w^ zp*Ztv;`+QT56+e?zUj9KD4~{ns`Tp}otNvDh`EDJ_Oa2sGH{=Zw!uJdT1Fm^wJg7c^lFgT}`yp~&Tr&4_Gd7C zd+pj@S}!}(7Pn9hKJCU*&DDpTJj}}8M1bZ}faCs!kJQrBC&bqMd};+86x&+*It-L0{O$j4 z`3D@?ZlCpfU5M6F&>lo9;DGc;K4QFYdn%ihE2 z&d*75Q@{MGlbZxO%>_%b0us{z%?wY9+c!zGZux{YDXa7rI+^F)Spgg|i8?B++A6;T zw^oOQ^T1-M#<~kQn|JLHaDr5q$LBlF9G425PYx^do^^s#aMIQfn*=6pH86nAJ;Dna zL>`?o7g%`S)Xh{O;N%g&cKM$1uhr3 xHfh(FSoH+pnzN5AAN1y#D&wp*rmQ&dpS|ST@31$mf$^ttQS1N! literal 0 HcmV?d00001 diff --git a/docs/images/ascc-1.0.0-extract-blast.drawio.png b/docs/images/ascc-1.0.0-extract-blast.drawio.png new file mode 100644 index 0000000000000000000000000000000000000000..85a5818b1f8e017e79ea2e0e9ebadff0ae3fa819 GIT binary patch literal 64729 zcmeEu2S5}{wl+aTk)VO!G+(vCW(_)e`X8^M??Z4t%ZO(nL(^gaTvL!c8(ai zI4odr1S9uVMlLQxTU$00h@rWiAr#JLV~PNZfa_403B(LyYO+&@i-U`Ym4ly^lShq{ zi;-J`iwF4O=3@i#@q>5j8=9NiU>j6~xj?L~4H>y)Kx`a9QH;8$4{_|XFX$^)YuA!xkbvt4h98EWH`Ba**Gx&0}53Q%?#}zdz!$E%)!(S4uRQV z2gbw3%f`Jk1XF~e`A(^cth)SFUiBLgi<>IinzyBI@L2%OVQxk^+nVmwF^9p-t$_z( zO5w(az~QzuId@8oV0I>^b~{Dew{lA}a!3GGG5_b5!pa*su`vXCy?v(&AVOB`?GUWo z`R$!0`IWB2oo+f^Hixmfcc&h>0nD=Vt7k2pG&70kQpKp)t(H#?%)^o zLmGBJObOfV|5kI@S$8nBw#Q0$M<{TF^RBX*+L%aS<_8dn4a^3pC24_xS_5aCz@Z(? 
z-o|9x0btJH2s@b75AlP3lt{s>VLN)x%_l9%b@eK6#ncIc(89KYIo1Y_IoL4wV9u^$ z6caFlofC{sVot0LjZCd24UMhLx1YP)EVf%aJvTMk)f23!;4phT<6ULt-_?fS#kXDU z*K@c#-GKF&0uBUW=M2oI( zdyg?Hr2+%ICD0430kd-Q@nSoHHDC}gcC5S4+K~hHK8&gVwZi=6LqXhFx47#y__61^ z4@DT-nVTa12M+~Q&k!T+-3DzjM3$(Q|Qd{A_<%r?+b^{{W*03GnO){fEu} z^vwUZ&Hpc&1$GGk@@8>>dhd;CTv!$Ok9z-IV%oojdV_ZK<`3%4YYjlvZLbW->34j` zd)pp^VYe@^!na{PCI#W)umzw6kcsVH2ae1!^?<+I4Z&>d8S=6}!)4&xut^!I^sb8rX< z@R$HSz{+N~FVap3J3}CM;o?wI1Ac&z2IR-s9s~W`>B}Gc_oreE^7>u+TnJMqj8ykP zYC~&?IVSP6HZ{Xs0Wwhp1W0ovu*Fb_i3tXpOWK(Nvt?+s4bw3}6N`+1kOz`v1Ysby zp*@gK?<8u!qw}9YGm!IJ|6GI#+*tgNx$oD38>?wRFHNyn6dMly9{y+--EwgVU^5kL z(Ba|(a(#>__n>fY4s6+87|zM@6Aa!9x&Ian{69ip*be={4u4VrOe$mhH%b|sJ7Xsu z5C9hPV|3&nD*fGDVmIXTVCQ&`^a0GbJt2FAzdK30XykuY_SnS#SF#5({cWbk28O^` zl%oX%VXA6txScIJ0z3&&w#P7yek7jC_6TbTCKcY!flUnUtdxPfAc$>H%Ep07Vt-1E zcdQVYub-1-AdmTb7}&O={W>|=$%Qbv*jFVZQ$m^Gpm8hO8JL%?h_O-cAj6{ksJMfnlkDU?qd~*KoES>dw#>0%tRZL4jgp zI8a5#%n%B(#>}&#p@S(903}uIV0!}vhuMz${9cBw$nkT*~|LMVhs``^(`=bH;g=zhfIKtS(-nhib1w>@b zzkjZ+V4WW~2OG~$cd)9khz+pL1mCRz1k^p1cEI%Clqebl+$m-m@t>6{Vlzw( z>cyZDnV%5SHt^ZGq`JElvje!eKp4mc!hkN|2XpVAVHa=#1H6E$yU+_b+y!5t9r%T9 z{9h|&<>cr7Nslq<5hlp)(PK_7Y$V>R$UOhdjFl7XLiZ@L(O#NkkBS;w*xOhcAYcY& z_L$|(A5d{mmH%k*PV(@7SBtrKHwNr_<6XLWuNHG+W7xlr7Gv|*JzBi0pnq$^1Ue{T z!ek>r^FP#mr@`Gc?I#rddlrNNzkiVh?O4k{n#|$i{Ub`^;={7VduNmz8*}&0C?}AB zZ9A}k4%+=Qr+tuPvNJWqP|k>L$7BGrN0>no09)~6#{T1={={zl-*r%&AZ)n&m4n(% z1oo=)KR3)`*|R+k>L6_f_D=vo*46+ZC;x*L|2?rHKOYtv{?Orlpdu#dV4};vjt>8s zVaMnyh8YJo+Zg~p7^AoUfq?(VAt3KAKG;=G?mu&5oFHs;-m9Fu7~-D;7yi5)uQs9 z54o^L`S;ilv1k5w_ZRQA{JVO#Ya2g5^Z&y3!(E;D zrLFvbv~O8Td@*8I zA^+gGusqSfpX1`+1wp&&#*g*UyL$8w@LT^D`$~7o-(RZy|3`j{mjigc@h7N;g|q+C zp6k!__(AN`E^UgX&G+53#04_Kyz=p5U+ov%6}C9hmZX?;$uO)SZZ_E zx9<9kJq7{#C#`yJ@8vz&dgom<547Dh4$fH| zxvLUtPWqF*gfDeXOR8o@tWV;}UUNymb5@qT*y|<*+cQ53{0BVeOhVYCy}iyViYMrf z+ekjR8Gm>3;??7aD5VQuBOvG0Bn0I?PDKedOojDwni{HFNgNMe$J$~e@7Y^P5Kksa*ZEu~tf9`bGTZSVNIu0^kygh_?DE|X*z`=uh zAH}7?m%zoPiNnD?bX@$P9}Duc;>KsQ$qwQ1$lx9He9Uviy{b-)QUH~Ce$i1JU?^%a 
zoF}^o@hB~x9Qws`#l3L+I2)OPRxJ+f8|Fe6!E15j3qb!aH=a6rfOvMqfJqYAXz zbg&$OirL3hysKm_7>;1Nd9YZ3N8|BSi6`Lz^27b5{HSB_{GK<89>;X^my$VT@x)sR zXjKOPS7L5t6DOcKAM1V=)6HK>=1Aiy{75{Y)x*CMbBvxB5s!@InmDGLzm&}J7|z3j z1VF1KN7STmosd(gTwK2&-^uJx(eDryAI6*)M|)7m#mRtKnmG4GrSl~Y^mUJ1nz7f> zm&{{99RhS$#@+;tIF-g5$CWZDfYolku+u3#cThj(0j)q1I!QrP@v>tPd<7x0x8mbT zjA$6-T{8Ub>qB@r9X`(lB*ya-ZdrRMG&kSqx5^qRHF$&`y>pN$G#&#)RN}=?;UM9~ zOEL8u>e+Eyr$v*=jA}+`bG3XgNuyhoe^vcLQGgA45I$mifQr3)OL;2Kj zQ%9s!ZgCJ=h}9{foWPS&P9i+YgR)Wwnb;^kE?%~%I6bj1&;+sLLwM@;-3@YJp~60) z3uy4yoEEx+bS#Dt=)cH2TKR~RF(e5^H91%D?t;YE0^-e$9cOUlG~t>j4$_iUB|w_f zsf6>-k2D^hM}PfR$i$(ApUoh*>O2_`8ZsX}b>c1Mfm7;u=tyH>9PgpL(|EXD0Rg(n zb3+#6LYgzhEMvFjwgTp}$bEFh=^u+xD8EKSFw+^F7*uYhH&tiYwIcN7v5a8JgU2H|^ z>9Us3=UL?#5<1L_ACoLT`!=^g@gS)>YTjK?aB;d{K3vsg%SwKgLCzp}9wcWax2oH0 z+~ht}5=I|so~$Y?RMh)H%Xg-8O*r8*NaSVjlG%;uHAbk;D^HkT)AG6DT8(#TI=}O@ zY1DdaVsWmMYa8ZrFg05kRlQA?JCDQUm^<6$n)Gx2Wo2@!!SM|@r?T>vLTBG1_E!n- zufK_V@#;$I?Y^fJTrPKhjCR{wjzYf0%&{uq^461rfcH4qNU)^qH#2mktoDoFbIZF5(!nh`UvpK;%yG*|R zaq0eUf#V5aB=?)@*zA+<@`%GhZNU#G>?mtR0{R}~T~?;lyhN;%^l|ctWjz(35iz%<|dfobL zhAz)NBCv?Ev9)`Cd(e`bP}k=UJ8a;~2h~|d9WIs2pese>G0tGQ{xu_1B=^i*loA(T zaAMN4c|`Qg7$baS*tWIBaupn?ZdV|hwjL2O;8Lsjb<}koZ6PKatw=PPMPuCB=0irs zA9A-7#VDYQsMNQytZ_maEp@esQxUx}c$9cYSEX5|v$5W)m@y*h?1EazOgk{WphYLr z{e9N`0vy~f4hEcpT84PsuI`6q{SA*3?OsOYBa5!8bIg#9>!}B)>dvY8q+iHgpfOJ) zauJ(Zp1qNylnJiERm!AEQuA62v=A#uCJL>w@s#%-J5Og)V9DBfMulJNK^{EoYdd38 z>w^OOCeoU;^kMmANu-L`@VgIGGvDjJ1}0sZ?=mc~${u@xH{nt+ok`H1eVH<5<2qN^yFY;@cU1)bjr0`ivL#aYGiH4uqZ2r-6N#1h zf{w}Ycy->fD0D4I)4}LkCUi)k&@{pS|7VAJe%%duqp`&(jTAVe6t%?*Z%9>M1!)Jl6& z1Jcale|IU@p;@V^(70f8v$5Y$y;y)4&L3puS+6v*>~~V?gW;#zhsR-3a-??#kXl~D z^5->c86#3+i_L;iai2h%^3Lw5PH>P)rhHQCd{E6qEjk4d74s>7xikF}!9$mG14br~ z*LkZJyR(CZSU0sYaL`xel9o6)6rxnSaFC6F?pM%j5>Q$Cq@m2BN;6v*Lf(ka&7?hq zPL9_kk*}pxomy~$%bp@r5NUtZJ7Ll|qIx}zX@praaN;BMWhDLF!{4IlS@a3n`=}lu z%~62X*wF|-IUrJJ9|Hgg030R1#5VHMA69_df6~hcwD%ewBYvFo{KglmbV=ICBeGBH z?jO-_VMd*CtHk${Tw8Q8a{Ul>iQ*9@QKQFmWc}*xhjYDp&B%u%flDKOAJhWULb_8v 
zlUQF;sRLnO6~KP_^pBT7V|+rT}i%efos z_#MWyHipRtYLWimnE9YaHsqTAQ1b3RZ?)v(<$BW!)`7g@aEs?J>DfKTeI&0$Z(X~= zM0jWJBx-yiQ&`b3d3d4w3~!YVm$<@xo|>`wN9e7`0h@tN&8o|lEChkx0Z9uw+RcPR zRTg|v%Cs?+D7raA;fcPJdH+2^sgtj>pAv*^w_;#BEG zV9*I!{s&-TDx;vT4g80o)BxNr%BDhCfjj@l+cANOj42OF8Eb;=3POm3h>+6MVoy;w zSj>ku>5oM@aP~DHwy4ROmj~Q6&$8;`>SU;uK0}?iZpH*Fug_pwfQ>(N2_)wqkIUYi z)XW)9e5XW*&|BZN!4PZ*eRne{L|3`KMa4z%lUdm%wkxwR4eIpB|o)krA5uE-SJf9WHhkTOncHL8$b>wG)rTf^-N1-Zf`zh5ttba z4Wn@EXfvC!PaePxXBj%<@~sOYryt6qIF}uy4`V?E;DWg*GL194(mQUSIbPYY;+2YO z%c!rxL0^>{3FaTuyP&9W`v&Q8YPop@@)$}Yqd-;H5W)3U>wLtm)QS@UY%I4+D)F)n zO!05%6s-7uvFjwtZl^%?>KOABY=UPSNn48bqpN#1I=vQiv`hDQOY_^n%1Eh zO-n^c>&n`dWk4{wN7Ty7&c@L7C;CabL0XY878;Y}p1H=3Je> z)aRPh>(H%^&aZS3mId%erxRk~q~^`pCJ%y%8w08pvl@)t$1@F|q4a8VvQ=q@S2cyi=Srrof9tMD zF}<9B%hz)zXEI=kw{Ai3?ObbU!IzoeLrye!))L)6-IS^A0dGC{@mq@9bM=F!E;3@trYnt62a&z#O9&&-N=+?7NY)FPeD zP}(y`8kGXs69%#9Z*pusU<;0^w143^cfY@-( z`~&Iet&c5CiBJcBUp>-RU3JZx1($FIP5I$)@_8qF!*$&^AQ^b|_pUplTah>M@@HYb zlRObKFg)Q)kjEX1o8|h^cAh?k3yYgBwwJ9R?H`>gq;I`DWKlF-T4YmJ0zVpamrPVk zQ-AFAs$jP_A^mHEtuARYbY1KVshpd2Eh07Rv5KS$z~D*pf?2PWh}iIUn8@no`!Ye%De?Ztl@b z@ti7)S=%U@=JdFp31(4N^UU-HXL33AtI8Kps_>iymbt=i7gx_-5whF^(i+1H9A|;a zOXs(2eJ#8=kzS@xnr?^8$x(mInyaOMoY#I_O+byMUEg_#5?Z=4>^$+tex$GP=46nn z@G^T z@Y49fI8xFUk3jxHDSThKgf6RxkTYwS#dVaM%Z!*pa_uZPC+DAMky8f&4o6)*%BHzR zDG@R>rBEQwT*ukGCi zE&Xl~nt(El3-LIw+}>E6In{Zky}IR<&8dxURj`X?t!@N80c=RNB+bBWRiWXwE0jmt z4ED>=*GMIPkJMo`E7EnUKyd5=yJ1@&Z^tcsk(GG!j}zAV{*%jtaBbi0)$x>xN@{~E zE?pR*=`3hxBJ&3x6^=x?&L^FdEb0g5kD1h zQGBl6I`_79x|m`ryXK@9sau-7@QTle?3kgnrJfHiPBrMkqV)Dte#+;@2A#pRWg$1% z)JR6SjC^5fH|Hjp`gF&Z%);DVE-Nw!p@b{n7{mb~t`+~m(*wYi8^PhLUb*Pp?(xmF z>FNAqOfPNkoOx5(O)4}Q)4#R3L`Syq-B+yDhnmVj!~F9L&T!FQAF>rR;J{gFqJ~_e zZdaWaMDwtNg8I$sUv_F_v_}pT@hSIerJaZgCLeby<)n4X2Q#5`N#-oVkdB{}CY;Pu z{S|P_XgAgdtU*oBGe(3#gZie6BhDgvjN~vtZYwWp#`7Y`CvGz;_qUm+y@r3Mh5>>r z|4R2zH0=vZ=vk(USJiPj83RR^sV^na<1h;*k~9Y31|&`CCWgl>CYlow3;ATl?x^!3AB z)O~v;%EeHl(MEy;P=D zS(j7uXi713a{E5tS0eE%Xh<8r!s(c$uY$F}*+MYv@rR>8|j4tUD=&aWSS3nnge_ 
zDwAkKgcduDi_~fh?$mpCF>HQ0{Yj7(7i_7gToC#)?K`Y9Cx!HSs>BdoS~yAjnS8pJ z9YsqsSLZvnWM|cUW`-jIpU+Q^N@&svT(bL~tMHtgRjnkeEZ(4c)m4Nuokb%}p!ji8 zZ4Kl*5rJM`5=+>6|6C`LKy+4mT{T=q?KLAl&yw7IA&<}HpFEo8J)*A)&5Du*e zn#KADb!C|F|TwO0JZh8b6x^~i3l_v)d(NbN*kJ5=zTahL)LpYbFg`W5XjxG z=v@tv8eNB7ll4_7)NDQ$pldk%-R7ouMuFa9j)TD@4&V|_eiruGAF&*HJ>4$uI+hTM zXi;Pv{|F;}HlWJA+6k|(i5_E7WPmx2#`;<|NmI8)xlScVz#KdpJFEO?HNP)c9xl&8 z>8ETg!Zn`t>pOiVKo$&#Cs)@$InA9tS1&ZOu%!0XUv$+%hky>4j8BqufvR0YtIHKz zo^q=kI`+BpM(!!P3=LajZ&^@hczOeoueuW7d7ok9KVtVfD7RqsBK1c#7V)_P3Zla1 zDC0DPSe&{BMpVzEY1G$`uL$r0bfZbg+YAb-*SN&zL|8SRkIrecP(2kKy{$q+_Iv;} z!KpqepqaZOT6`0yv~|HHwz}s<`18p399x9as7Q7Alc{rKA-~~|Jhy-rnUbqNESU1> zv-&1r@+41^G5}IFtbDTE$R+{^;oEb!Wz(XIi;t7$Jq-c*qs5`f2 zB4d(4YI?fn#X+N22y~mKLED>@LR;oC#YjPQLi*9)l4j9$eErr!(ccP$SH1;y-MUMX zy|LI$9usVSrL6+@X%f}-v@vy?^fa~Z+;{VCq}`j^zfs6fp<0#;!mZ7`@%7UdEot*C z4_ow=rM%K*E08WY7W$3oist5rq|Eig3lSG0SkgQ$jF~ep(oSa<#dklpZVAN=S1xbS zrq}$Y3*==*sb|N!emiH$iRWvcCjVKT>*5+^Q6I?YYh<3l1sxxIQ=$!zE%0Du%?WJ+) z%DCdUgiw7AX0t5uaDbYg>+lSZ%)Da+1>wetjLR*lZdw zJ3?I7pH*=&9W>0m*%P2Tvy$QDHLUXf>(#VK<{B%(U@I8Yn<(L}(IRPHyN)4HrdulO zvH2Bugr2_6xQv`DK=z=Sm6J77p)^ocw%5g%1Z6TfD%X_LOozCXeKp!FqgETNy3wj% zaDX_aH#r*Lzw}ynEjT_fa|#wbsx|2nX#Dud&T&0gsF?pb|1F-nuQLUv ziIMXj!Zv9vHgD5|4Z_#6oxm}hl&Z`wpbKxd6!)H^QMW@Od zMAx#B+{4MVB0k2Mb&*1vb9HQC5z)+=(q}Y-&;dn($spbCxV7nJ=VJWZFCxt ziB-|6fR=Hmes@lH9QyLn>axJC?{$kocIna+Le4ZY=_T_LpEg|N1(l`pqv9*$aGMeFQxz862&weN4Igg_VLC zYJL?KMoz~FiTwBhHP%>%3Y*)2669_e4lrDaa&5y6^PF-PUha7eir{~^S8&03(U0ql~=`$m5 z+t!>f(x&BVWM+*PyTCmu>7%9&Kt4ePt5?KX?(uI2kAbj}uI@h9Y2-w@KlQjQkgS%a z0p9DWXc0O$!Up(xwGhM#$q{31ok6ue&G8ab2F zXIGC}*1T4HGGUdBwi>4B)aZ>Xo-0WJ$WvPCn~+R=k8V*GgC1*$0?43Z(!3tK$3VFB zFRnh)c)66(`WgF|oCIA)fU$z3qqt=C*U)4cqMKWJga`1D+np99SY;JoLzQi|nIbc( z4cu;|r1uqkz8Nderi>5BG3p`Gc5TMrGu2>yfZAC(Bn8Zpn?BbdW>C#A&6p+F-DQ=I z9EoD#xohCc;OiR5q+H)U{N*lLd$6W{=Mcvsi((4bOJ5}HZLJyf1M5_^sH~#8nkVuq zN8}c|ZZC)klkr~Lm%+h6xLdME=XJasY8XRDU&lQie}}Lg#$oL{b5G!SZ4RZT?v3;k z4a?5om<3ql^Ax$!MzKXMZ2%GQRkPUna+Sa6Y_o`7M9OC%z=bZ|OP`qb+$t*8%RCW7 
zl8n0Psr!U5BBgNv%@j}{$|JNGlVmYZ1JzGT<&L$P{vsgVKk6Si@?wAO!(@QOYTh2l z8Xfwa&Ffxp##IbGIREsSRF0Fv;p^Avo#WX`mj?nuc8G(G#bVLQ%kMt*2GPQbGoX68 zCUf(FHN5cQ>&UXr)z#%e|G6la@8yI!l~g(u!W6jR0-<+vriZ9P=gH()F0PiGA7c;D z56upVr+YkE{g{9^<{&B^1JYLTHcrOWMTzJfW6QtQ7uMs@4V_hcTrhmr^0_os%|uH_ z$Lp@Hu7D-m29c#!AZEQpjl9DrYi&(dWhq_SVSF6>i_t{t1=Dj%rFtZ=KG;8gRNCfXu%MW{BHpI zrel?wAyXrDGDh9y+NgodEp@s6;B4@GG>UIcr8y!%|M@ooRfcD+PcH1w&uznQ3hLLE zUJmsFBN!T2eZljq4|Vo%xRt}1IF+`J!?e~hOwn$c19RW!ADc%xoAo)&EVP1kbaFmj zPO)fD)pm9l){9T_yMj8>`041(LG<%Br_s2Cr%uvHL~yml`Qa`TNZ`#LSE3lfi>wud zOf3Oqy{6mf#!|llN*Au!=fD)@^y&<8p))$`Mx2a#albrY{^sgL4#16S7HUgxoUc&& z_$W7#`xk>d%YiubhAPI#up?K$vzozE7lgzo<&UXVdqyTQ|EzdZE3*VZpJ2=6gv)}V z$lNY@&y`w(*<69G^*MWBGe@pSyCyE!-Mxe>?7H-bKExa^TfTDifCcDnW0q2nvBV88 z*QHxr8Hpb01gHTC$6kBQYUBQudPl@eCH2U{Orf!&q9UEYvtPzH7YmPO+(H;E6*P@hHNO<4@dHN1x^e*NEFp(OvhGC{5&r{0n zptIjEU7)Awx<9N>UW#V5mE9A84qbZx_`dA6OvA;;knbqh^$R!fq|jUdF`TmI{%W{ zPb^?)PTTCNFc3t#oCf_=D_kB`9%C=&NH5D<@@VEwhx6VZ_)t6r-0Tm)w!-4-w8HVr zfP)c2+sv)%*_IPHOT5ttFjPIQYyhgEtBkOc7!+50P>2y+lCY4XQG@=V83oeExb7n#_p9}@;>uEK;^S2Z3Xe)!-iMiE@ zbgPx2(DY>^1NQ}WWa0Q5f;{-y5ijUau%?Y`X+nZDWOXH%Q+Wp6LGrF5;R-8R*EF7oN!9r}Y zw%@qus5$LgBUZ~{Vx!ATL&jzCH&{XH1s_i!r^jPXZzVhRYd+OeHLQKz+MUyWFTm__ z%e?kDv}a~kj|z2(wL7D3)R{YHt2bIBn@sThj;;MEImW*p(}0%^ASXSsjWJ~*HqSvn zzIJ1Ok&x=I5OO`-5Gz{ix86~1vo$t>02v_-to^VTmMl%b;Y+VroT+A@MlaPGY~HM< zbJWdii0c5j#1io?m9;nVix;xjcRUR=G$*~DBBazSWZ9Bb{{y~U@6%WO8L?7 z#%jBYYh=8;An%boE{Yby+30@Kam(<2rLwF?uJtO3kda5Zg-%ILG$M9N(%lFLk890h z314m}CrX6%mulIGk>o4{aM8KH|IXj0$f7&kT}N)jeLzf$NyorqxM>*6XFDhmEuiYR zW`59vP9F`s1V&!AoDY;dr_LyJhR^0DV9|npG{L3M&Z9SvHa5!#k81kiD1KG{bX_Q) zx;|~0O{v=~K19!C?kVYcoy`=fL^0JiW75`0IO{4#>JYF&aT z7o8ccaxFP+Ea1^%KgX~ObB9o_;$1F9d+1h_u*2z``!MzFvQL7V^f8LPW*+XP-zPtj zs=js1qL_SheQ^HOn&oTgGoIMFl`l@FG!4@}#!C%sZ$R{GBbNM^_qSSvYK$^oHV|)= zG1zoYX$GJ*nijhqseJSCte;EInhz8K7cmQU_H)f$d~;n^Y4mgRPl9s`3kCJ;Y8Yt* z>`W3O633OjLLMa}lFDgf^f-sD-W;VLFV4Re!sS;Ybj^bO`(h2-c=e{7M2A3`RZK~y zx)I;Paz`Z|PqKC$E6dn-p%!2L3O;eGG7<^fkc&K<@n1wvTjGLqs*L>!d4%@grsrKs 
zF&c=222=}mF~T~)Q7_tKHJM7+w)|;z@!?Lu{-RMMSq-8sc*m|Zve2XSU6D!vH(nPG zN}1ej*LQ_R0;<{tE8Czrbn%VS+T9oh5d{pSfx z+r^KeLheYDS5IE4W%b~4ywnn8kft;B;h08@&l2-%=d{{H!%@`6`+$&V*|G6ML{p`p z9t~fmi7O}S?SB(zapZHN-`@)zjunqSWPx9q0C&dNgCmIQl*gvL)9OkB6Dq z@kVd9kZSL@o4DXNbh2+4;Hh`cdRkUoSGAX%ki-RN;9POu40P1iM1n{^h17dqc9RW* z=(V`xWoMuF^4K5db38b7U)_uM!IVOD^krNyLm+v~f~VKTlB5+;{TpKWo=TlXVlVM{ zjIa0FbGrA*k1B$QD2AN@`kYP_alp)%GXdKGm#4JV>tEm?FQ-;I%6su1$K#2v zkp9F&r@A;rmWNY!Hn1uxokN(nAu-XJk&f%UC%rCSc4mby=>a`m1~#M}?iWe}m^^TGPwpF+g8Xg3&v5Na@sKbe2X zLfS@F3QZEQC@;Kr%``lh+2t5*)#O!}#f6A>jM&oEd{kK3o~;h|zxmb4cXN8PUQ>xWgX=k(brYgh^M zJ`oYO+z?~d%Koec(Kxj(ZtsU0UYK}1(}9%uEVI8>H5veWkT)2`-aS?D()f1&`st## z;W`Erv*m85=wk{YnvR|0nYWKYzD;WDt)dRmT7J9MFfr$2o>aI1Eav+NQ5_jA?q0`1 zKho1wSFDX=#!LIgT*BnH`iLv9M0;MB)=~uhl^{8d5yjjH32*#n$*tM8pzNi=2Yoee z1)Ds1>9C+Kc{m%pM~QJ-=$#Vd2EI!zYMM<8D1QMBiDwK~1c;F`5ruW@dR1Vnd?ab4 zgXee!s8#S#-?{cJI@9owu}t?aqHIgQ>ib;}1L~FB^`=f8(wtcECv6{F;#=x;N*z5k zc0j(m!3rorzvC^hGtoH64}LFniz31HuvgV*{4Djg3G0@X`vysJ$^sDLeZ~?TH^^xk zniL0zTh`#Beo?gAUml>5YpkxL_$H3o=*vOm+9Bj^Xzw zt-QhCECbqEY}x!Twan+-xlEFCT5drv`j}V{o@O2Er7?wv5F?FJsz_Vx{f2nh6@g(ZC06h_p$s&v7%-o)jATMK@!vgfkONc|n{ zPdsUA->(!NwwSR|mM_OAs_>Nk8u2Egy>;bcR~8go9faDd%@Js}dyEJR$OI2&RO5%!?Y}rCZa@S`EMXBxw7v0YW7jhwmV(P5 zvb$(?)N&GAM2_;qrJKAi+HBJW934{jetBc#bfqz!aU~ksZX$2ZK&aHTifJcg?WW6l z@nax;dqq_ueWQ&IcglMxgebLe`t`V!_e6QQ$xI{p;b~cyQ$RhszDlnAu)Fs;&1K7# zXGz{~0RfTlSakIZL4tD1ha;si*IUf#{MA|BUIJEWzmOPjwu5+x9o5XbdiBywFQs~G zj9JBId4Bx@T5=1=LDCG)B;Gzhzxh(Yx$OlAToPx;?OYOA%@)`G9a-#)k4!(Rs5~U1 zft=}Z_sqSzFMsFlar~_K1mV25)5mjEF$-S&r{VyRcSmUSq)X1i>eV?vEjhT3XWCie z3!X(^#DP!~X=6aC5kpYE#1lJn{}pxk%RB6?H{Mtk#$Wgx-$h7aS^HEp)K`_am_-@S`tCuQvWhBN1f$u4W)-FiC0kF*wUUp4RO0qhs^dOq94yvzx zt?os{(x@$I;~m>O`dcQJ<>TDv6OJ~%;iRW^Fi;9#lIe4|Ow%s7bt`fr2-w@H?riT@ zR@;3IfN^9Vmou362cYvMIJn^c2(pjoPsR+{C_Ds~5H7YwrMxvpr}fCd9FngG&L%lt z?^}pcj}b|}-lC~M8efy6^IBTSejbuskZJPv%Ttn^23c^DafFFiJBJw|b3V&-K5Mz4 zVCqakzz7#)a)qeESZMW-pdxvA{phk|4&GxsT<{vmHIhBuv#nEdFZCLC!|BL|b7R+3*;x*yJMUo*}ep>_|yVOYZ69 
zA!>(43IhdZ=X~YX=E6+Uh)s{CxzQV5RAw`{<6V@~?jAPQslB-%c6QyQVl1YX1?@KeEWn z1MD@*T5&61qda_`^4?=PFt}O9S^a(_?dKZ`6SI0%_JHWW)QLcK2zE4ud;!@d(L1qNPHknQiAr{+W8Wq-c_tnfr^eky9p+7vR- zXbdm2x!Gs)C^e39x+M^YmAEXS5VE9!7$zx7i+#H=aWg-bA~iOq4fo5Zn{O?eHV{bh zqN_wtA1svJ2umzs5%iagOrg|@PZpnBY4z__lUSA=Xq`=2zc1<4PGgfImDFpn=#k_5 zC?_DTL5IA7_I{z9y`t+7Iu5`#uqW4|uU2%NkAM52HU6vbP1|V6y2b~?LiQh@Xq%>& z0iI~md+eaxR}L7{*nVQRc=q8Zw%mzM}e5WSa+0h~n`+N%R(P~o?v=$e=PGplWV z<%B%d4HZi$tfSs|7OSFg&`USJBeg=01w+PPhGz`mU!_%Jc`HjBlOy0)B562!xwxL{ za$I4e9F&YW35S)bk)sMX1wU_nT-Yb&5;xS0P>m7j{nP56qu1YvsK+WP3#^;Tmj_!y z3bHMHECA#BL>_;@3ueT`!0veO+YGB4SAfNhVuA(Zaf(Blh|bGL*c$Iu4St$JcHxF6 zh3gtVpP;1xiNJDT;-e;qWSy zlYZjqLnXk0K8I($I%s+C6wqf}ILqqNRiMw`%j`zfTe$}MrQ06{yuZcW1`-9^`L~fa z(o|hY(s>eHUwd;|S&scut16IkwolNC#SJkOT?5kS(GHOKT==aCgfPP)bBN@i^5thN zJvm@+X-3@e2J4&XQ$FLw_q!SvHB7@xg{<+jA3K7t#$CIJq7ar2yXdQqW>xgk@SD-H z;BmJ67-PyOS7eq1MYIilJUN)9jYQIpHI8k4notG2_aHUV!5J#xz7^28ZddTKgZP** z!Ph@H8R4u7kI-zn!97gXYX!pR@jYUB`yDl`j_A%48TWbGjV~UW8urkdFELhL4v*tS z1bKsVe$n7y>BSB)M#Q>h-od8Z*KmkuP^-#U3(Jzi=?o*K7n1JU(0A$7fg2&AP{X9eu* zHs|&!XCspGwVbf;yg+QDsW&*~?5}Ah93o^8wf_Q>H!cA7d1Gv@ZAB8}g5}p%yT~a< zd#+9ovjwx&JUdsg1}w-EB!v?5-7xDKWK7SrQprpi51$(CFl+AMro5koSQlzFeq+^@ zovJH^<_u7pnxtM=3#(EP+SCJ-O$fK^TSo0`(97Wb=;=$uo3CP6@73gdJppWi9*8#% zh7`V4eU&TGe6M3e1ik?xO}Js-wz|6WBSx52v=GAjqKgVNC;C_+Lt4>%!4R-0 zECTSpdljEB7saAJ zquPc8oy;M6KV4lvbOO*Ijaec9jf3KnEGDgMhyb5F&D;o^UFAWgreZ+I=Td`b*=E68ZWldr>m5GWT5_05&ijzu;VeoU&Wvi&%qvsgx4b_ zp7GTn9eJHYk!|3)r1g?}3~!vhDKXR&qSF}Kf`zv@U&2QeM10iKk|5i zVmck8c)F-KyaGjzaw_F&dCFcmy_(Ula(t6>W}P|+vYI-W0bm8p>!3y?Ii8`}hDWnv z8DiS}lhu#+guJe@YX5laC|%Degsxks00#-US;Nn#Z#EVjHV6l;Bu*8+5QRC5IT%DN zR8_^a^$)oP>PeY0)0K*G@MAEc*$imcSGQJB=Ieu?l|7!xkw`C4yuu0sB zJ_=JY>sl(DEb*93Bzk{sR`eqbq6hX>^!Lo~-$Y2EQ=D7s+YZr=KdbVj5E+y+R=#;^ z^UC5nlO;s2;7h5p5K@LkSX3^SbSksV#A}!|(2`x_b-k5M(bXa4g%{m7T(!RDbc9Uy zI4(cqYRUa5q=Cvoc_bGmGAt}7v}fTUt)%rgkCN2kpa&vltTu1yK|V<6O~Ka|qEoM{ zx>*O4H3D=TQ`=O$ier^c?U}YiDL_Z^9zTs-9J|W?K%_rMFzdg7iJzSD=r}*lvFzB_ zrLXs21H)&Plc5ux93) 
zigA6*`^zP1_D}+uJgnr?A`I z3ouCATNo{Jw9)O|{jRK8lufPB<8tbl9KX(YgSoQDs3*8cARY~l*~{xJkcJ4chR{nz zhr#w$gFzX`QUvC;UWz%0!YK~-YwE?Zj0kq`WNr_0mb*MO0G|O#TB>oHY(Hyuet%+j zlpTmFn*j{Bs_DSEVE^0ws)=Kdp{`*-`n&4(H8k_~C%Y9_k{vMr6jAvCsz%-u>g4t` zJ~#;Ck~espP;0sY!se1_Sm*MlXmBidgoUKBYiWaBF71^(kMCpeB<1V6%FJ0`xJ+sM zM@O4D7LPC&$J2hIW^Va*{CpW>(m<`J!+kDAA;&?joBsION8{r8=A6c$$pv**6R%Ih=XlfF zohBcO9JR`AWz6;rk>L|-t+3H-{35LD#M%} zLHR=RQu=pM*3x%*IU{wfdW&MtNtA7ZvpK(wbb12&s$Ls@*0}k71aE}*C0=%NIN$6w z!GXp1ZB|3=;wg(x*AFKoA1+QgW5qjpBzdq~4e)3}>@NGK&F^qd0516~->WiZ=emtV@Jm#KJl=ovpDFfgIL*_9C)ss7d4O=oL+@B;p_6+oC6%Sd}#6+N*mzZIWOoLS(47uQ+&A1l5uhRa!f_h_3jrvF&>{ciS95H;1$pVI7u|GzL5(+kG^AcrDNWTGHiFP$Xdq;^;Y%Nk&o4rGx}hdLRo;<8Q+iX=d%bWKiP!r^Dm){&6H|1^u`rYcBWl8@l-Lx&&J(W?e~% zIx8X_g8-SH#LskFlXfpzs%+xJ+qa%umd3nYVD%}?`_k%OK$6~b`zXkNGn~RhPojHM zOl5RD@4I$MMomA)X(y*u#x@YbOW7b2Ifsv3YVEe9+lVJETM1O_Zh_by0VY`RS=#9g zi<>8|XtYyu0ZRic7? z`9s3=7W>aDcIo>q)me_y?Rn9U_eWy8y_W?fz!v`>2LKbrdFC%F0Usqd>F;{ zFDhZapmIQ_x7%%TLtNmyDW`W&e+>DLw|xLi?$iM*3O{NBvOfG5vc7Z$^X(G+UrWII z1@KLkieG+H1sFH*@s$HQ`(rYFFkgl-`Ek1c7>{E+`l!Ud;csQ;6PwTK3%i|JmFB5w`ykw*L{f z|GPv|6x;s@+y9Ru{r_4vXMg(F>5a7|!y<-R2Vr`Tez9AnH=;_2!l&kAojoD1-b?*w zoBWB2bJy+rra{&48S!0-I_IZppXn4ehOnTh;mUVw|32CM z6wUU|c#JyjrrxaS7%0u|iJSEe4=+7Mz4nJ=$7!9zS54PKktOA{LzDj3=`KkkD$Q7Doz_BXo3W2WjOOa#uEEp^p0E8OmN zt+2d)yZKmw$nY!pvu>SG<$iV7mPI-8{9zg{>tN1%a;A5tl0Pitz(VfJZ=$eaYGZpz zwJ?Z1?``?Bl55f6a&Qg{%@I{9+w)JKICb|)%NNjyaQwaM9+}h?LO&kU6IIX#ppj&a zLicO?zQlP!=@Hr-T<#bHJEO7CQL<-AXx#?&{Ib3$3Ri9L+E%DQ7LBEt!| zeiEp0QIL$j!$dJ;6;+g_T87Uk{d=jLSNIJ-{zz>-Z1;{13E`|UOse|)%kiY_r()X4 z63$qd-|ExIq;!}bDN;_p`1ze~kJft(t40a;W$RKxeGUy1Fg;82+M|32ZH`cA|C9Q{ zS>C?gGym^&>Fm1ui0E=;f;DpCmd8~!R%hpHSUkW3(j9@E#<#rUxKc!U5p{CLSEN`l z$P)_qJ53&YRZZlUKUF&{T?6^}GW2tcBxMA96`l8wavPiy8yA1yT+|fIfzOXCoQeNE zJ8oP%-7W7kDqws-g;3+B#qSs z{tf=`Lie5B={`H*SG5bXBpy0it`IBAwGdg0#1%keln&mMzl7|j5p)1Fj{118Y8n-F zb)un8cu}(h>LWh|=H+=FUk!G?{ZFLxwd1SOzw%_4o>JE-hI-5K$99{pQf)SG+3B6X ztQtJIxD*ZG{<+HXN_?UdeL*4WHhyt3f1{VY`#YcK1~)QX@k}AVyz80BY&n?&$@GV< 
zw~guPKXU9K00*bqUl<;*h%{suq`UM-Ww4Ej79m&XbUyOu0($8-PwUsNf0p)}=;pM@ zl^hQap-C{D$+4fFo^)N+4J6n|gM;W+mT6AZerU`&37=Yz(T`^7o$mJ-FdxBv_CN=! zdNo%a@?hT~Y+#{S3jP>`|AQfEf{2c4A@K&DMUVLL1WIwg%;PmQe=}dYXz{e(tI&rZ zzBzJwo_QU-ADVx&Pw*ExH{vhVFqd~e?hQjDgRhWdJVlOKXijQp^B!}#=LdexD2Wid zT@%mDtxokHiim1n)aa2eLrM7QUviKx4l3<=g@F@>CgYb3ibL8kX@5j61ybj@S^m8w z5gE1RAy68f$_(=T5uLQ zzJr@D5r$QDA`B1=a_X!8lG6ehBf_(%oUO;LZ48YRwwB-iaeyGvV}}z!$4Su#h)