Skip to content

Commit f51e910

Browse files
authored
Merge pull request #48 from sanger-tol/dp24_btk_datasets
Many additions
2 parents e2ac46c + d4024b7 commit f51e910

File tree

77 files changed

+3523
-384
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+3523
-384
lines changed

.github/workflows/ci.yml

+14-1
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,18 @@ jobs:
7272
run: |
7373
curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest_V2.tar.gz | tar xzf -
7474
75+
- name: Temporary ASCC Diamond Data
76+
run: |
77+
curl https://dp24.cog.sanger.ac.uk/ascc/diamond.dmnd -o diamond.dmnd
78+
79+
- name: Temporary BLASTN Data
80+
run: |
81+
curl https://dp24.cog.sanger.ac.uk/blastn.tar.gz | tar xzf -
82+
83+
- name: Temporary Accession2TaxID Data
84+
run: |
85+
curl https://dp24.cog.sanger.ac.uk/ascc/accession2taxid.tar.gz | tar -xzf -
86+
7587
- name: Download the NCBI taxdump database
7688
run: |
7789
mkdir ncbi_taxdump
@@ -120,10 +132,11 @@ jobs:
120132
run: |
121133
mkdir vecscreen
122134
curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf -
135+
ls -lh
123136
124137
- name: Singularity - Run FULL pipeline with test data
125138
# TODO nf-core: You can customise CI pipeline run tests as required
126139
# For example: adding multiple test runs with different parameters
127140
# Remember that you can parallelise this by using strategy.matrix
128141
run: |
129-
nextflow run ${GITHUB_WORKSPACE} -profile test,singularity --outdir ./results --steps ALL
142+
nextflow run ./sanger-ascc/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./results --include ALL --exclude btk_busco

.nf-core.yml

+1-2
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,4 @@ lint:
1919
nextflow_config:
2020
- manifest.name
2121
- manifest.homePage
22-
multiqc_config:
23-
- report_comment
22+
multiqc_config: False

assets/btk_draft.yaml

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
assembly:
2+
level: bar
3+
settings:
4+
foo: 0
5+
similarity:
6+
diamond_blastx:
7+
foo: 0
8+
taxon:
9+
class: class_name
10+
family: family_name
11+
genus: genus_name
12+
kingdom: kingdom_name
13+
name: species_name
14+
order: order_name
15+
phylum: phylum_name
16+
superkingdom: superkingdom_name
17+
taxid: 0

assets/github_testing/test.yaml

+9-7
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,20 @@ kmer_len: 7
1212
dimensionality_reduction_methods: "pca,random_trees"
1313
# all available methods
1414
# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf"
15-
nt_database: /home/runner/work/ascc/ascc/NT_database/
16-
nt_database_prefix: 18S_fungal_sequences
15+
nt_database: /home/runner/work/ascc/ascc/blastdb/
16+
nt_database_prefix: tiny_plasmodium_blastdb.fa
1717
nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2
18-
ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/
18+
ncbi_accessionids_folder: /home/runner/work/ascc/ascc/20240709_tiny_accession2taxid/
1919
ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/
2020
ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp
2121
busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
22+
busco_lineages: "diptera_odb10,insecta_odb10"
2223
fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
23-
diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
24-
diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
24+
diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond.dmnd
25+
diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond.dmnd
2526
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/
2627
seqkit:
27-
sliding: 6000
28-
window: 100000
28+
sliding: 100000
29+
window: 6000
2930
n_neighbours: 13
31+
btk_yaml: /home/runner/work/ascc/ascc/assets/btk_draft.yaml

assets/test.yaml

+10-8
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,20 @@ kmer_len: 7
1212
dimensionality_reduction_methods: "pca,random_trees"
1313
# all available methods
1414
# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf"
15-
nt_database: /data/blastdb/Supported/NT/202308/dbv4/
16-
nt_database_prefix: nt
17-
nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt
18-
ncbi_accessionids_folder: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/ncbi_taxonomy/20230509_accession2taxid/
19-
ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/
15+
nt_database: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_blast_tiny_testdb/blastdb/
16+
nt_database_prefix: tiny_plasmodium_blastdb.fa
17+
nt_kraken_db_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/kraken2/kraken2/
18+
ncbi_accessionids_folder: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/20240709_tiny_accession2taxid/
19+
ncbi_taxonomy_path: /lustre/scratch123/tol/resources/taxonomy/latest/new_taxdump
2020
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
2121
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages
22-
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
22+
busco_lineages: "diptera_odb10,insecta_odb10"
23+
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb/
2324
vecscreen_database_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/vecscreen/
24-
diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
25-
diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd
25+
diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd
26+
diamond_nr_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_diamond_tiny_testdb/ascc_tinytest_diamond_db.dmnd
2627
seqkit:
2728
sliding: 100000
2829
window: 6000
2930
n_neighbours: 13
31+
btk_yaml: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/btk_draft.yaml

bin/abnormal_contamination_check.py

+144
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#!/usr/bin/env python3
2+
3+
# Version string reported by -v/--version and embedded in the help banner below.
VERSION = "V1.0.0"

# Help banner passed to argparse as the program description.
# This is an f-string so that {VERSION} is substituted; as a plain string the
# placeholder would be printed literally in the --help output.
DESCRIPTION = f"""
-------------------------------------
Abnormal Contamination Check
Version = {VERSION}
-------------------------------------
Written by James Torrance
Modified by Eerik Aunin
Modified by Damon-Lee Pointon
-------------------------------------

Script for determining if there is
enough contamination found by FCS-GX
to warrant an abnormal contamination
report alarm. Partially based on code
written by James Torrance
-------------------------------------

"""
23+
24+
import general_purpose_functions as gpf
25+
import sys
26+
import os.path
27+
import pathlib
28+
import argparse
29+
import textwrap
30+
31+
32+
def parse_args():
    """
    Build the command-line interface of the script and parse the arguments

    Two positional arguments are accepted, in this order: `assembly` and
    `summary_path`. The `-v/--version` flag prints VERSION and exits.
    Returns the argparse namespace produced from the process arguments.
    """
    cli = argparse.ArgumentParser(
        prog="Abnormal Contamination Check",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(DESCRIPTION),
    )

    # Positional arguments are registered in the order they must appear on the command line
    positional_arguments = (
        ("assembly", "Path to the fasta assembly file"),
        ("summary_path", "Path to the tiara summary file"),
    )
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, type=str, help=argument_help)

    cli.add_argument("-v", "--version", action="version", version=VERSION)
    return cli.parse_args()
42+
43+
44+
def get_sequence_lengths(assembly_fasta_path):
    """
    Reads a FASTA file and returns a dictionary keyed by sequence header

    Each value is itself a dictionary of the form {"seq_len": <length of the sequence>},
    so that further per-sequence fields can be attached to it later on.
    """
    return {
        header: {"seq_len": len(seq)}
        for header, seq in gpf.read_fasta_in_chunks(assembly_fasta_path)
    }
55+
56+
57+
def load_fcs_gx_results(seq_dict, fcs_gx_and_tiara_summary_path):
    """
    Loads FCS-GX actions from the FCS-GX and Tiara results summary file, adds them to the dictionary that contains sequence lengths

    Every entry after the first one (presumably the header row - TODO confirm against the
    step that produces the file) is split on commas. The first field is used as the
    sequence name and the second field as the FCS-GX action, which is stored under
    seq_dict[<sequence name>]["fcs_gx_action"].

    Raises ValueError if an entry does not have exactly 5 comma-separated fields.
    Raises KeyError if a sequence name is not already a key of seq_dict.
    Returns seq_dict (the same object, updated in place).
    """
    summary_lines = gpf.l(fcs_gx_and_tiara_summary_path)

    # The first entry is skipped
    for line in summary_lines[1:]:
        split_line = line.split(",")
        # Explicit check instead of `assert`: assertions are removed when Python runs with -O,
        # which would let malformed rows through silently
        if len(split_line) != 5:
            raise ValueError(
                f"Expected 5 comma-separated fields but found {len(split_line)} "
                f"in {fcs_gx_and_tiara_summary_path}: {line!r}"
            )
        seq_name, fcs_gx_action = split_line[0], split_line[1]
        seq_dict[seq_name]["fcs_gx_action"] = fcs_gx_action
    return seq_dict
70+
71+
72+
def main():
    """
    Decide whether FCS-GX removed enough sequence to raise an abnormal contamination alarm

    Steps:
      1. Parse the command line and check that both input files exist
         (an error is written to stderr and the exit code is 1 otherwise).
      2. Collect the length and the FCS-GX action of every sequence.
      3. Write the removal statistics to stderr.
      4. Write fcs-gx_alarm_indicator_file.txt in the current working directory. It holds one
         YES_ABNORMAL_CONTAMINATION line per exceeded threshold, or a single
         NO_ABNORMAL_CONTAMINATION line when no threshold is exceeded.
    """
    args = parse_args()
    if not os.path.isfile(args.summary_path):
        sys.stderr.write(
            f"The FCS-GX and Tiara results file was not found at the expected location ({args.summary_path})\n"
        )
        sys.exit(1)

    if not os.path.isfile(args.assembly):
        sys.stderr.write(f"The assembly FASTA file was not found at the expected location ({args.assembly})\n")
        sys.exit(1)

    seq_dict = get_sequence_lengths(args.assembly)
    seq_dict = load_fcs_gx_results(seq_dict, args.summary_path)

    scaffold_count = len(seq_dict)
    total_assembly_length = sum(entry["seq_len"] for entry in seq_dict.values())
    # Lengths of the sequences that FCS-GX marked with the EXCLUDE action
    lengths_removed = [entry["seq_len"] for entry in seq_dict.values() if entry["fcs_gx_action"] == "EXCLUDE"]
    scaffolds_removed = len(lengths_removed)
    total_length_removed = sum(lengths_removed)

    # Only these parameters can trigger the alarm; the scaffold counts are reported but not checked
    alarm_threshold_for_parameter = {
        "TOTAL_LENGTH_REMOVED": 1e7,
        "PERCENTAGE_LENGTH_REMOVED": 3,
        "LARGEST_SCAFFOLD_REMOVED": 1.8e6,
    }

    report_dict = {
        "TOTAL_LENGTH_REMOVED": total_length_removed,
        "PERCENTAGE_LENGTH_REMOVED": _percentage(total_length_removed, total_assembly_length),
        "LARGEST_SCAFFOLD_REMOVED": max(lengths_removed, default=0),
        "SCAFFOLDS_REMOVED": scaffolds_removed,
        "PERCENTAGE_SCAFFOLDS_REMOVED": _percentage(scaffolds_removed, scaffold_count),
    }

    for param, value in report_dict.items():
        sys.stderr.write(f"{param}: {value}\n")

    fcs_gx_alarm_indicator_path = "fcs-gx_alarm_indicator_file.txt"
    # Remove any indicator file left over from a previous run before writing a fresh one
    pathlib.Path(fcs_gx_alarm_indicator_path).unlink(missing_ok=True)

    # IF CONTAMINATING SEQ FOUND FILL FILE WITH ABNORMAL CONTAM
    alarm_list = []
    for param, alarm_threshold in alarm_threshold_for_parameter.items():
        param_value = report_dict[param]
        if param_value > alarm_threshold:
            alarm_list.append(
                f"YES_ABNORMAL_CONTAMINATION: Stage 1 decon alarm triggered for {param}: the value for this parameter in this assembly is {param_value} | alarm threshold is {alarm_threshold}\n"
            )

    # IF NO CONTAM FILL FILE WITH NO CONTAM
    # NOTE: this message is also written when scaffolds were tagged for removal but every
    # parameter stayed at or below its threshold. The wording is kept unchanged because
    # the file content may be read by downstream steps.
    if alarm_list:
        indicator_content = "".join(alarm_list)
    else:
        indicator_content = "NO_ABNORMAL_CONTAMINATION: No scaffolds were tagged for removal by FCS-GX\n"

    # Separated out to ensure that the file is written in one go and doesn't confuse Nextflow
    with open(fcs_gx_alarm_indicator_path, "w") as f:
        f.write(indicator_content)


def _percentage(part, whole):
    """
    Returns `part` as a percentage of `whole`, or 0.0 when `whole` is zero

    The zero case covers an empty assembly (no sequences, or only zero-length ones),
    which would otherwise raise ZeroDivisionError.
    """
    if not whole:
        return 0.0
    return 100 * part / whole
142+
143+
# Run the check only when the file is executed as a script, not when it is imported.
# main() returns None, so the exit status is 0 unless main() itself exits with an error code.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)