Skip to content

Commit bf80071

Browse files
authored
Merge pull request #40 from sanger-tol/kmer_count
Kmer count + Dimensionality reduction
2 parents af5209e + b43f082 commit bf80071

24 files changed

+954
-133
lines changed

.github/workflows/ci.yml

+47-10
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ on:
1010

1111
env:
1212
NXF_ANSI_LOG: false
13+
NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity
14+
NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity
1315

1416
concurrency:
1517
group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
@@ -24,9 +26,15 @@ jobs:
2426
strategy:
2527
matrix:
2628
NXF_VER:
27-
- "23.04.0"
29+
- "22.10.1"
2830
- "latest-everything"
2931
steps:
32+
- name: Get branch names
33+
# Pulls the names of current branches in repo
34+
# steps.branch-names.outputs.current_branch is used later; it returns the name of the branch the PR is made FROM, not the branch it is being merged into
35+
id: branch-names
36+
uses: tj-actions/branch-names@v8
37+
3038
- name: Check out pipeline code
3139
uses: actions/checkout@v3
3240

@@ -35,10 +43,34 @@ jobs:
3543
with:
3644
version: "${{ matrix.NXF_VER }}"
3745

46+
- name: Set up Singularity
47+
run: |
48+
mkdir -p $NXF_SINGULARITY_CACHEDIR
49+
mkdir -p $NXF_SINGULARITY_LIBRARYDIR
50+
51+
- name: Setup apptainer
52+
uses: eWaterCycle/setup-apptainer@main
53+
54+
- name: Install Python
55+
uses: actions/setup-python@v5
56+
with:
57+
python-version: "3.10"
58+
59+
- name: Install nf-core
60+
run: |
61+
pip install nf-core
62+
63+
- name: NF-Core Download - download singularity containers
64+
# Forcibly download repo on active branch and download SINGULARITY containers into the CACHE dir if not found
65+
# Must run after Singularity is installed, otherwise it will crash when trying to download the containers
66+
# Zip up this fresh download and run the checked out version
67+
run: |
68+
nf-core download sanger-tol/ascc --revision ${{ steps.branch-names.outputs.current_branch }} --compress none -d --force --outdir sanger-ascc --container-cache-utilisation amend --container-system singularity
69+
3870
- name: Download test data
3971
# Download a fungal test data set that is full enough to show some real output.
4072
run: |
41-
curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest.tar.gz | tar xzf -
73+
curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest_V2.tar.gz | tar xzf -
4274
4375
- name: Download the NCBI taxdump database
4476
run: |
@@ -48,11 +80,11 @@ jobs:
4880
- name: Download the FCS-gx database
4981
run: |
5082
mkdir FCS_gx
51-
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.taxa.tsv -O FCS_gx/all.taxa.tsv
52-
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxi -O FCS_gx/all.gxi
53-
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxs -O FCS_gx/all.gxs
54-
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.meta.jsonl -O FCS_gx/all.meta.jsonl
55-
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.blast_div.tsv.gz -O FCS_gx/all.blast_div.tsv.gz
83+
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.taxa.tsv -O FCS_gx/all.taxa.tsv
84+
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxi -O FCS_gx/all.gxi
85+
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxs -O FCS_gx/all.gxs
86+
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.meta.jsonl -O FCS_gx/all.meta.jsonl
87+
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.blast_div.tsv.gz -O FCS_gx/all.blast_div.tsv.gz
5688
5789
- name: Download the BUSCO lineage database
5890
run: |
@@ -72,7 +104,12 @@ jobs:
72104
- name: Download the pacbio barcode
73105
run: |
74106
mkdir pacbio_barcode
75-
wget -O pacbio_barcode/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip -c https://www.pacb.com/wp-content/uploads/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && cd pacbio_barcode && unzip SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && mv SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta pacbio_adaptors.fa && rm -rf SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip __MACOSX && cd ..
107+
wget -O pacbio_barcode/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip -c https://www.pacb.com/wp-content/uploads/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip
108+
cd pacbio_barcode
109+
unzip SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip
110+
mv SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta pacbio_adaptors.fa
111+
rm -rf SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip __MACOSX
112+
cd ../
76113
77114
- name: Download the subset of Diamond database
78115
run: |
@@ -84,9 +121,9 @@ jobs:
84121
mkdir vecscreen
85122
curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf -
86123
87-
- name: Run pipeline with test data
124+
- name: Singularity - Run FULL pipeline with test data
88125
# TODO nf-core: You can customise CI pipeline run tests as required
89126
# For example: adding multiple test runs with different parameters
90127
# Remember that you can parallelise this by using strategy.matrix
91128
run: |
92-
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
129+
nextflow run ${GITHUB_WORKSPACE} -profile test,singularity --outdir ./results --steps ALL

.nf-core.yml

+7-9
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,18 @@
11
repository_type: pipeline
22
lint:
3-
files_exist:
4-
- CODE_OF_CONDUCT.md
5-
- assets/nf-core-ascc_logo_light.png
6-
- docs/images/nf-core-ascc_logo_light.png
7-
- docs/images/nf-core-ascc_logo_dark.png
8-
- .github/ISSUE_TEMPLATE/config.yml
9-
- .github/workflows/awstest.yml
10-
- .github/workflows/awsfulltest.yml
11-
- conf/igenomes.config
3+
files_exist: false
124
files_unchanged:
135
- CODE_OF_CONDUCT.md
146
- assets/nf-core-ascc_logo_light.png
157
- docs/images/nf-core-ascc_logo_light.png
168
- docs/images/nf-core-ascc_logo_dark.png
179
- .github/ISSUE_TEMPLATE/bug_report.yml
10+
- .github/workflows/branch.yml
11+
- .github/CONTRIBUTING.md
12+
- .github/PULL_REQUEST_TEMPLATE.md
13+
- .github/workflows/linting_comment.yml
14+
- assets/email_template.html
15+
- pyproject.toml
1816
- LICENSE
1917
- .github/workflows/linting.yml
2018
- lib/NfcoreTemplate.groovy

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)
22

3-
[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/)
3+
[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
44
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
55
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
66
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)

assets/github_testing/test.yaml

+12-9
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
1-
assembly_path: /home/runner/work/ascc/ascc/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa
2-
assembly_title: asccTinyTest
3-
pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa
4-
pacbio_multiplexing_barcode_names: "bc2008,bc2009"
5-
reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio
1+
assembly_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/assembly/pyoelii_tiny_testfile_with_adapters.fa
2+
assembly_title: asccTinyTest_V2
3+
reads_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/pacbio/
64
reads_type: "hifi"
5+
pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa
6+
pacbio_multiplexing_barcode_names: "bc2001,bc2009"
77
sci_name: "Plasmodium yoelii yoelii 17XNL"
88
taxid: 352914
9-
mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
10-
plastid_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
9+
mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
10+
plastid_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
1111
kmer_len: 7
12-
## Below this point will need updating as more subworkflows are built
12+
dimensionality_reduction_methods: "pca,random_trees"
13+
# all available methods
14+
# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf"
1315
nt_database: /home/runner/work/ascc/ascc/NT_database/
1416
nt_database_prefix: 18S_fungal_sequences
1517
nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2
@@ -20,7 +22,8 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
2022
fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
2123
diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
2224
diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
23-
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen
25+
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/
2426
seqkit:
2527
sliding: 6000
2628
window: 100000
29+
n_neighbours: 13

assets/static-args.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
kmer_size: 7
2+
n_neighbors_setting: 13
3+
autoencoder_epochs_count: -1

assets/test.yaml

+10-6
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
1-
assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh2.fasta
2-
assembly_title: asccTinyTest
3-
reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
1+
assembly_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/assembly/pyoelii_tiny_testfile_with_adapters.fa
2+
assembly_title: asccTinyTest_V2
3+
reads_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/pacbio/
44
reads_type: "hifi"
55
pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa
66
pacbio_multiplexing_barcode_names: "bc2008,bc2009"
77
sci_name: "Plasmodium yoelii yoelii 17XNL"
88
taxid: 352914
9-
mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
10-
plastid_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
9+
mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
10+
plastid_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
1111
kmer_len: 7
12+
dimensionality_reduction_methods: "pca,random_trees"
13+
# all available methods
14+
# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf"
1215
nt_database: /data/blastdb/Supported/NT/202308/dbv4/
1316
nt_database_prefix: nt
1417
nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt
@@ -17,9 +20,10 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu
1720
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
1821
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages
1922
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
20-
vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database
23+
vecscreen_database_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/vecscreen/
2124
diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
2225
diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd
2326
seqkit:
2427
sliding: 100000
2528
window: 6000
29+
n_neighbours: 13

bin/VSlistTo1HitPerLine.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
66
This script converts the VecScreen text list output to one line giving the coordinates for each vector segment in the format:
77
VecScreen_Category ID_string start_position end_position
8-
The default is to report Strong, Moderate, and Weak matches and also segments of Suspect Origin. Reporting of any category can be suppressed by including
9-
--skip_reporting_suspect_hits, --skip_reporting_weak_hits, --skip_reporting_moderate_hits or --skip_reporting_strong_hits on the command line.
8+
The default is to report Strong, Moderate, and Weak matches and also segments of Suspect Origin. Reporting of any category can be suppressed by including
9+
--skip_reporting_suspect_hits, --skip_reporting_weak_hits, --skip_reporting_moderate_hits or --skip_reporting_strong_hits on the command line.
1010
"No hits" will be reported for any Query sequence that had no matches in any of the selected categories, unless --skip_reporting_no_hits is included on the command line.
1111
VecScreen errors will be reported unless --skip_reporting_errors is included on the command line.
1212
Usage:

bin/get_kmers_counts.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Script for counting kmer frequencies per sequence in a FASTA file
4+
Output (STDOUT): kmer counts as a CSV table
5+
Developed by Eerik Aunin ([email protected])
6+
"""
7+
8+
import argparse
9+
import general_purpose_functions as gpf
10+
import kcounter
11+
from collections import OrderedDict
12+
import pandas as pd
13+
14+
15+
def main(fasta_path, out_path, kmer_size):
    """
    Write a CSV table of per-sequence relative kmer counts.

    Each record yielded by gpf.read_fasta_in_chunks(fasta_path) is unpacked as
    (header, sequence). The sequence is upper-cased, its canonical kmers of
    length kmer_size are counted with kcounter, and every count is divided by
    the sequence length. The resulting table has the columns "header" and
    "seq_len" followed by one column per kmer; kmers that were not seen in a
    given sequence are filled with 0. The table is saved to out_path without
    an index column.
    """
    rows = []
    for header, seq in gpf.read_fasta_in_chunks(fasta_path):
        sequence = seq.upper()
        length = len(sequence)
        kmer_counts = kcounter.count_kmers(sequence, kmer_size, canonical_kmers=True)
        # Fixed columns first, then one entry per kmer observed in this sequence.
        row = OrderedDict([("header", header), ("seq_len", length)])
        row.update((kmer, count / length) for kmer, count in kmer_counts.items())
        rows.append(row)
    # Sequences contribute different kmer sets, so absent kmers show up as NaN
    # in the combined frame; they are replaced with 0 before writing.
    pd.DataFrame(rows).fillna(0).to_csv(out_path, index=False)
32+
33+
34+
if __name__ == "__main__":
    # Command-line entry point: two positional paths plus an optional kmer size.
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("-v", "--version", action="version", version="1.0")
    cli.add_argument("fasta_path", type=str, help="Path to input FASTA file")
    cli.add_argument("out_path", type=str, help="Path for output CSV file")
    cli.add_argument("--kmer_size", type=int, help="kmer size (bp). Default: 7", default=7)
    options = cli.parse_args()
    main(options.fasta_path, options.out_path, options.kmer_size)

0 commit comments

Comments
 (0)