Merge pull request #48 from sanger-tol/dp24_btk_datasets
Many additions
DLBPointon authored Sep 11, 2024
2 parents e2ac46c + d4024b7 commit f51e910
Showing 77 changed files with 3,523 additions and 384 deletions.
15 changes: 14 additions & 1 deletion .github/workflows/ci.yml
@@ -72,6 +72,18 @@ jobs:
run: |
curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest_V2.tar.gz | tar xzf -
- name: Temporary ASCC Diamond Data
run: |
curl https://dp24.cog.sanger.ac.uk/ascc/diamond.dmnd -o diamond.dmnd
- name: Temporary BLASTN Data
run: |
curl https://dp24.cog.sanger.ac.uk/blastn.tar.gz | tar xzf -
- name: Temporary Accession2TaxID Data
run: |
curl https://dp24.cog.sanger.ac.uk/ascc/accession2taxid.tar.gz | tar -xzf -
- name: Download the NCBI taxdump database
run: |
mkdir ncbi_taxdump
@@ -120,10 +132,11 @@ jobs:
run: |
mkdir vecscreen
curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf -
ls -lh
- name: Singularity - Run FULL pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,singularity --outdir ./results --steps ALL
nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude btk_busco
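The new CI steps above only fetch small, pipeline-specific test databases with curl and unpack them on the runner before the full pipeline run. As a rough local equivalent (a sketch only; the URLs are copied from the workflow, the file layout assumptions are mine), the same downloads could be scripted in Python:

```python
# Illustrative only: mirrors the curl | tar download steps added to .github/workflows/ci.yml.
import tarfile
import urllib.request
from pathlib import Path

ARCHIVES = {
    "asccTinyTest_V2": "https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest_V2.tar.gz",
    "blastn": "https://dp24.cog.sanger.ac.uk/blastn.tar.gz",
    "accession2taxid": "https://dp24.cog.sanger.ac.uk/ascc/accession2taxid.tar.gz",
}

for name, url in ARCHIVES.items():
    archive = Path(f"{name}.tar.gz")
    urllib.request.urlretrieve(url, archive)  # download the tarball
    with tarfile.open(archive) as tar:
        tar.extractall(".")                   # unpack into the working directory
    archive.unlink()

# The Diamond database is a single file rather than an archive
urllib.request.urlretrieve("https://dp24.cog.sanger.ac.uk/ascc/diamond.dmnd", "diamond.dmnd")
```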
3 changes: 1 addition & 2 deletions .nf-core.yml
@@ -19,5 +19,4 @@ lint:
nextflow_config:
- manifest.name
- manifest.homePage
multiqc_config:
- report_comment
multiqc_config: False
17 changes: 17 additions & 0 deletions assets/btk_draft.yaml
@@ -0,0 +1,17 @@
assembly:
level: bar
settings:
foo: 0
similarity:
diamond_blastx:
foo: 0
taxon:
class: class_name
family: family_name
genus: genus_name
kingdom: kingdom_name
name: species_name
order: order_name
phylum: phylum_name
superkingdom: superkingdom_name
taxid: 0
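assets/btk_draft.yaml appears to be a placeholder BlobToolKit (BTK) dataset config: every taxon field holds a dummy value that is expected to be replaced per assembly. A minimal sketch of filling in the draft (assumes PyYAML is available; the species values below are invented for illustration):

```python
# Hypothetical sketch: overwrite the placeholder fields of assets/btk_draft.yaml.
import yaml  # PyYAML

with open("assets/btk_draft.yaml") as fh:
    draft = yaml.safe_load(fh)

draft["assembly"]["level"] = "scaffold"
draft["taxon"].update(
    {"name": "Plasmodium falciparum", "genus": "Plasmodium", "taxid": 5833}
)

with open("btk_dataset_config.yaml", "w") as fh:
    yaml.safe_dump(draft, fh, sort_keys=False)
```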
16 changes: 9 additions & 7 deletions assets/github_testing/test.yaml
@@ -12,18 +12,20 @@ kmer_len: 7
dimensionality_reduction_methods: "pca,random_trees"
# all available methods
# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf"
nt_database: /home/runner/work/ascc/ascc/NT_database/
nt_database_prefix: 18S_fungal_sequences
nt_database: /home/runner/work/ascc/ascc/blastdb/
nt_database_prefix: tiny_plasmodium_blastdb.fa
nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2
ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/
ncbi_accessionids_folder: /home/runner/work/ascc/ascc/20240709_tiny_accession2taxid/
ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/
ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp
busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
busco_lineages: "diptera_odb10,insecta_odb10"
fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond.dmnd
diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond.dmnd
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/
seqkit:
sliding: 6000
window: 100000
sliding: 100000
window: 6000
n_neighbours: 13
btk_yaml: /home/runner/work/ascc/ascc/assets/btk_draft.yaml
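Every path-valued key in this CI config must point at one of the databases downloaded by the workflow steps above, so a mistyped path is the quickest way for a test run to fail early. A small pre-flight sketch (illustrative only; the key list is taken from the YAML shown here, PyYAML assumed):

```python
# Illustrative pre-flight check for the path-valued keys in assets/github_testing/test.yaml.
import sys
from pathlib import Path

import yaml  # PyYAML

PATH_KEYS = [
    "nt_database", "nt_kraken_db_path", "ncbi_accessionids_folder",
    "ncbi_taxonomy_path", "ncbi_rankedlineage_path", "busco_lineages_folder",
    "fcs_gx_database_path", "diamond_uniprot_database_path",
    "diamond_nr_database_path", "vecscreen_database_path", "btk_yaml",
]

with open(sys.argv[1]) as fh:
    params = yaml.safe_load(fh)

missing = [key for key in PATH_KEYS if not Path(params[key]).exists()]
if missing:
    sys.exit(f"Missing database paths: {', '.join(missing)}")
```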
18 changes: 10 additions & 8 deletions assets/test.yaml
@@ -12,18 +12,20 @@ kmer_len: 7
dimensionality_reduction_methods: "pca,random_trees"
# all available methods
# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf"
nt_database: /data/blastdb/Supported/NT/202308/dbv4/
nt_database_prefix: nt
nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt
ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/
ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/
nt_database: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_blast_tiny_testdb/blastdb/
nt_database_prefix: tiny_plasmodium_blastdb.fa
nt_kraken_db_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/kraken2/kraken2/
ncbi_accessionids_folder: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/20240709_tiny_accession2taxid/
ncbi_taxonomy_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
busco_lineages: "diptera_odb10,insecta_odb10"
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb/
vecscreen_database_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/vecscreen/
diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd
diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd
diamond_nr_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd
seqkit:
sliding: 100000
window: 6000
n_neighbours: 13
btk_yaml: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/btk_draft.yaml
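assets/test.yaml is the farm-side counterpart of the CI config above and is meant to mirror it, differing mainly in database locations. A quick sketch (illustrative only, PyYAML assumed) to confirm that the two test configs expose the same parameter keys:

```python
# Illustrative: compare the parameter keys of the farm and GitHub CI test configs.
import yaml  # PyYAML

with open("assets/test.yaml") as fh:
    farm = yaml.safe_load(fh)
with open("assets/github_testing/test.yaml") as fh:
    ci = yaml.safe_load(fh)

print("only in farm config:", sorted(farm.keys() - ci.keys()))
print("only in CI config:  ", sorted(ci.keys() - farm.keys()))
```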
144 changes: 144 additions & 0 deletions bin/abnormal_contamination_check.py
@@ -0,0 +1,144 @@
#!/usr/bin/env python3

VERSION = "V1.0.0"

DESCRIPTION = f"""
-------------------------------------
Abnormal Contamination Check
Version = {VERSION}
-------------------------------------
Written by James Torrance
Modified by Eerik Aunin
Modified by Damon-Lee Pointon
-------------------------------------
Script for determining if there is
enough contamination found by FCS-GX
to warrant an abnormal contamination
report alarm. Partially based on code
written by James Torrance
-------------------------------------
"""

import general_purpose_functions as gpf
import sys
import os.path
import pathlib
import argparse
import textwrap


def parse_args():
parser = argparse.ArgumentParser(
prog="Abnormal Contamination Check",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent(DESCRIPTION),
)
parser.add_argument("assembly", type=str, help="Path to the fasta assembly file")
parser.add_argument("summary_path", type=str, help="Path to the tiara summary file")
parser.add_argument("-v", "--version", action="version", version=VERSION)
return parser.parse_args()


def get_sequence_lengths(assembly_fasta_path):
"""
Gets sequence lengths of a FASTA file and returns them as a dictionary
"""
seq_lengths_dict = dict()
fasta_data = gpf.read_fasta_in_chunks(assembly_fasta_path)
for header, seq in fasta_data:
seq_len = len(seq)
seq_lengths_dict[header] = dict()
seq_lengths_dict[header]["seq_len"] = seq_len
return seq_lengths_dict


def load_fcs_gx_results(seq_dict, fcs_gx_and_tiara_summary_path):
"""
Loads FCS-GX actions from the FCS-GX and Tiara results summary file, adds them to the dictionary that contains sequence lengths
"""
    fcs_gx_and_tiara_summary_data = gpf.l(fcs_gx_and_tiara_summary_path)
    fcs_gx_and_tiara_summary_data = fcs_gx_and_tiara_summary_data[1:]  # skip the header line
for line in fcs_gx_and_tiara_summary_data:
split_line = line.split(",")
assert len(split_line) == 5
seq_name = split_line[0]
fcs_gx_action = split_line[1]
seq_dict[seq_name]["fcs_gx_action"] = fcs_gx_action
return seq_dict


def main():
args = parse_args()
if os.path.isfile(args.summary_path) is False:
sys.stderr.write(
f"The FCS-GX and Tiara results file was not found at the expected location ({args.summary_path})\n"
)
sys.exit(1)

if os.path.isfile(args.assembly) is False:
sys.stderr.write(f"The assembly FASTA file was not found at the expected location ({args.assembly})\n")
sys.exit(1)

seq_dict = get_sequence_lengths(args.assembly)
seq_dict = load_fcs_gx_results(seq_dict, args.summary_path)

total_assembly_length = 0
lengths_removed = list()
scaffolds_removed = 0
scaffold_count = len(seq_dict)

for seq_name in seq_dict:
seq_len = seq_dict[seq_name]["seq_len"]
if seq_dict[seq_name]["fcs_gx_action"] == "EXCLUDE":
lengths_removed.append(seq_len)
scaffolds_removed += 1
total_assembly_length += seq_len

alarm_threshold_for_parameter = {
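        # Thresholds: total bp removed, percentage of the assembly removed, and largest single scaffold removed (bp)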
"TOTAL_LENGTH_REMOVED": 1e7,
"PERCENTAGE_LENGTH_REMOVED": 3,
"LARGEST_SCAFFOLD_REMOVED": 1.8e6,
}

report_dict = {
"TOTAL_LENGTH_REMOVED": sum(lengths_removed),
"PERCENTAGE_LENGTH_REMOVED": 100 * sum(lengths_removed) / total_assembly_length,
"LARGEST_SCAFFOLD_REMOVED": max(lengths_removed, default=0),
"SCAFFOLDS_REMOVED": scaffolds_removed,
"PERCENTAGE_SCAFFOLDS_REMOVED": 100 * scaffolds_removed / scaffold_count,
}

for param in report_dict:
sys.stderr.write(f"{param}: {report_dict[param]}\n")

    fcs_gx_alarm_indicator_path = "fcs-gx_alarm_indicator_file.txt"
pathlib.Path(fcs_gx_alarm_indicator_path).unlink(missing_ok=True)

alarm_list = []
stage1_decon_pass_flag = True
for param in alarm_threshold_for_parameter:
param_value = report_dict[param]
alarm_threshold = alarm_threshold_for_parameter[param]

        # If the value for this parameter exceeds its threshold, record an abnormal-contamination alarm
        if param_value > alarm_threshold:
stage1_decon_pass_flag = False
alarm_list.append(
f"YES_ABNORMAL_CONTAMINATION: Stage 1 decon alarm triggered for {param}: the value for this parameter in this assembly is {param_value} | alarm threshold is {alarm_threshold}\n"
)

    # Separated out to ensure that the file is written in one go and doesn't confuse Nextflow
with open(fcs_gx_alarm_indicator_path, "a") as f:
f.write("".join(alarm_list))

    # If no alarm threshold was exceeded, record that no abnormal contamination was found
    if stage1_decon_pass_flag is True:
        alarm_message = "NO_ABNORMAL_CONTAMINATION: No scaffolds were tagged for removal by FCS-GX\n"
with open(fcs_gx_alarm_indicator_path, "a") as f:
f.write(alarm_message)


if __name__ == "__main__":
main()
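To make those thresholds concrete, a quick hypothetical example (numbers invented; thresholds copied from the script above): for a 500 Mb assembly in which FCS-GX tags 20 Mb for exclusion and the largest excluded scaffold is 2 Mb, all three alarm conditions fire and the indicator file is filled with YES_ABNORMAL_CONTAMINATION lines.

```python
# Hypothetical worked example; thresholds copied from abnormal_contamination_check.py.
thresholds = {
    "TOTAL_LENGTH_REMOVED": 1e7,        # 10 Mb
    "PERCENTAGE_LENGTH_REMOVED": 3,     # 3 % of the assembly
    "LARGEST_SCAFFOLD_REMOVED": 1.8e6,  # 1.8 Mb
}
report = {
    "TOTAL_LENGTH_REMOVED": 2e7,                   # 20 Mb tagged EXCLUDE
    "PERCENTAGE_LENGTH_REMOVED": 100 * 2e7 / 5e8,  # 4 % of a 500 Mb assembly
    "LARGEST_SCAFFOLD_REMOVED": 2e6,               # largest excluded scaffold: 2 Mb
}
print([param for param, threshold in thresholds.items() if report[param] > threshold])
# -> ['TOTAL_LENGTH_REMOVED', 'PERCENTAGE_LENGTH_REMOVED', 'LARGEST_SCAFFOLD_REMOVED']
```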