Commit c480273

Merge pull request #91 from sanger-tol/dev
Getting close to 0.1.0
2 parents (6cc495b + f5dd419), commit c480273

25 files changed: +299, -136 lines

.github/workflows/branch.yml (+1, -1)

@@ -3,7 +3,7 @@ name: nf-core branch protection
 # It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev`
 on:
   pull_request_target:
-    branches: [master]
+    branches: [main]

 jobs:
   test:

CHANGELOG.md (+6, -6)

@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## v0.1.0 - [date]
+## v0.1.0 - Red Book [14/02/2025]

 Initial release of sanger-tol/ascc, created with the [nf-core](https://nf-co.re/) template.

@@ -82,8 +82,8 @@ The intention of this pipeline is to succeed the currently in production Cobiont
 | FCSGX_RUNGX | - | 0.5.4--h4ac6f70_1 |
 | GNU_SORT | - | 9.3 |
 | GUNZIP | - | ubuntu:22.04 |
-| KRAKEN2_KRAKEN2 | - | |
-| MINIMAP2_ALIGN | - | |
+| KRAKEN2_KRAKEN2 | - | kraken2:2.1.3,pigz:2.8 |
+| MINIMAP2_ALIGN | - | minimap2:2.28--he4a0461_0,samtools=1.20 |
 | MINIMAP2_INDEX | - | 2.28--he4a0461_0 |
 | NCBITOOLS_VECSCREEN | - | ncbi-tools-bin:6.1.20170106-6-deb_cv2 |
 | SAMTOOLS\_\* | - | 1.21--h50ea8bc_0 |

@@ -98,11 +98,11 @@ The intention of this pipeline is to succeed the currently in production Cobiont
 | CHECK_BARCODE | - | python:3.9. pacbio_barcode_check.py:1.0.0 |
 | CHUNK_ASSEMBLY_FOR_VECSCREEN | - | biopython:1.81, chunk_assembly_for_vecscreen.py:1.0.0 |
 | CONVERT_TO_HITS_FILE | - | python:3.9, convert_to_hits.py:1.0.0 |
-| CREATE_BTK_DATASET | - | blobtoolkit:4.3.9, python:3.9, create_btk_dataset.py:1.0.0 |
+| CREATE_BTK_DATASET | - | blobtoolkit:4.3.9, python:3.9, create_btk_dataset.py:2.0.0 |
 | EXTRACT_CONTAMINANTS | - | python:3.9, biopython:1.78, pybedtools:0.9.0, extract_contaminants_by_type.py:1.0.0 |
 | FILTER_BARCODE | - | biopython:1.78, python:3.9, filter_barcode_blast_results.py:1.0.0 |
 | FILTER_COMMENTS | - | coreutils:9.1 |
-| FILTER_FASTA | - | python:3.9, ascc_shorten_fasta_headers.py:1.0.0, filter_fasta_by_length.py:1.0.0 |
+| FILTER_FASTA | - | python:3.9, sanitise_input_fasta_file.py:1.2.0, filter_fasta_by_length.py:1.0.0 |
 | FILTER_VECSCREEN_RESULTS | - | python:3.9, VSlistTo1HitPerLine.py:1.0.0 |
 | REFORMAT_DIAMOND_OUTFMT6 | - | python:3.9, reformat_diamond_outfmt6.py:1.0.0 |
 | GC_CONTENT | - | python:3.9, gc_content.py:1.0.0 |

@@ -113,7 +113,7 @@ The intention of this pipeline is to succeed the currently in production Cobiont
 | GET_LINEAGE_FOR_TOP | - | python:3.9, get_lineage_for_top.py:1.0.0 |
 | KMER_COUNT_DIM_REDUCTION_COMBINE_CSV | - | pandas:1.5.2, python:3.9, kmer_count_dim_reduction_combine_csv.py:1.0.0 |
 | KMER_COUNT_DIM_REDUCTION | - | python:3.9, pandas:2.2.1, tensorlflow:2.15.0, scikit-learn:1.4.1, umap:0.5.5, matplotlib:3.8.0, kmer_count_dim_reduction.py:1.0.0 |
-| MERGE_BTK_DATASETS | - | blobtoolkit:4.3.9, merge_btk_datasets.py:1.0.0 |
+| MERGE_BTK_DATASETS | - | blobtoolkit:4.3.9, merge_btk_datasets.py:2.0.0 |
 | ORGANELLE_CONTAMINATION_RECOMMENDATIONS | - | python:3.9, organelle_contamination_recommendation.py:1.0.0 |
 | PARSE_FCSGX_RESULT | - | python:3.9, parse_fcsgx_result.py:1.0.0 |
 | REFORMAT_FULL_OUTFMT6 | - | python:3.9, reformat_blast_outfmt6.py:1.0.0 |

README.md (+5)

@@ -79,6 +79,11 @@ This setup assumes that you have an assembly where the primary contigs or scaffo
 It is okay to leave out assembly components from the run. E.g. if your assembly does not have a mitochondrial sequence, you can leave the row with the `MITO` tag out. If your assembly does not have a plastid sequence, you can leave the row with the `PLASTID` tag out.
 The params-input yaml will need to contain the following data will be detailed [here](./docs/usage.md).

+The documentation of the kmers dimensionality reduction is covered in separate markdown files dedicated to this topic:
+
+- [Kmers Dimensionality Reduction](./docs/kmers_dim_reduction.md)
+- [Kmers Autoencoder](./docs/kmers_autoencoder.md)
+
 Now, you can run the pipeline using:

 <!-- TODO nf-core: update the following command to include all required parameters for a minimal example -->

bin/ascc_shorten_fasta_headers.py (-95)

This file was deleted.

bin/create_btk_dataset.py (+19, -6)

@@ -111,19 +111,32 @@ def tiara_results_to_btk_format(tiara_results_path, outfile_path):
 def detect_dim_reduction_methods(kmers_dim_reduction_output_path):
     """
     Parses the header of the kmers dimensionality reduction report file to detect
-    which dimensionality reduction methods were used and how many dimensions each has
-    Returns a dictionary where keys are method names and values are number of dimensions
+    which dimensionality reduction methods were used and how many dimensions each has.
+    Returns a dictionary where keys are method names and values are number of dimensions.
+
+    The function extracts method names by removing the 'embedding_dim_X_' prefix from column names,
+    preserving the complete method name including any underscores it may contain.
     """
+    import re
+
     with open(kmers_dim_reduction_output_path) as f:
         header_string = f.readline().strip()

     split_header = header_string.split(",")
     dim_reduction_methods = {}

-    for method in set(col.split("_")[-1] for col in split_header if col.startswith("embedding_dim_")):
-        # Count how many dimensions exist for this method
-        dims = sum(1 for col in split_header if f"embedding_dim_" in col and col.endswith(f"_{method}"))
-        dim_reduction_methods[method] = dims
+    # Get columns that start with embedding_dim_
+    embedding_cols = [col for col in split_header if col.startswith("embedding_dim_")]
+
+    # Extract method names by removing the embedding_dim_X_ prefix
+    for col in embedding_cols:
+        # Use regex to remove the prefix pattern 'embedding_dim_digits_'
+        method = re.sub(r"^embedding_dim_\d+_", "", col)
+
+        # Count dimensions for this method if we haven't seen it yet
+        if method not in dim_reduction_methods:
+            dims = sum(1 for c in embedding_cols if re.sub(r"^embedding_dim_\d+_", "", c) == method)
+            dim_reduction_methods[method] = dims

     return dim_reduction_methods

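The practical effect of the detect_dim_reduction_methods rewrite is easiest to see on a sample header. A minimal sketch, using invented column names (real reports depend on the pipeline run): the old col.split("_")[-1] keeps only the final underscore-separated token, so a method name that itself contains underscores gets truncated, while the new regex strips only the numbered embedding_dim_ prefix.

import re

# Hypothetical report header; the column names are illustrative only.
split_header = "scaff,embedding_dim_1_pca,embedding_dim_2_pca,embedding_dim_1_umap_densmap,embedding_dim_2_umap_densmap".split(",")
embedding_cols = [col for col in split_header if col.startswith("embedding_dim_")]

# Old behaviour: only the last token survives, so "umap_densmap" collapses to "densmap".
print(set(col.split("_")[-1] for col in embedding_cols))  # {'pca', 'densmap'}

# New behaviour: strip the "embedding_dim_<digits>_" prefix, keeping the full method name.
print(set(re.sub(r"^embedding_dim_\d+_", "", col) for col in embedding_cols))  # {'pca', 'umap_densmap'}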
bin/generate_samplesheet.py (+5, -1)

@@ -33,7 +33,11 @@ def main():

     data_list.append("sample,datatype,datafile\n")

-    [data_list.append(f"{args.sample_name},pacbio,{args.path_to_reads}{file}\n") for file in os.listdir(args.path_to_reads) if file.endswith('.fasta.gz') or file.endswith('.fa.gz')]
+    [
+        data_list.append(f"{args.sample_name},pacbio,{args.path_to_reads}{file}\n")
+        for file in os.listdir(args.path_to_reads)
+        if file.endswith(".fasta.gz") or file.endswith(".fa.gz")
+    ]

     if len(data_list) <= 1:
         sys.exit("I was expecting at least one FASTA.GZ file")

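This change is formatting only: the comprehension is reflowed across lines and the quotes normalised, but it remains a list comprehension run purely for its data_list.append side effect, building and discarding a throwaway list of None values. A minimal sketch of the conventional alternative, assuming the same args and data_list as in the script:

# Equivalent plain loop (illustrative); the side effect on data_list is
# explicit and no discarded list is built.
for file in os.listdir(args.path_to_reads):
    if file.endswith((".fasta.gz", ".fa.gz")):
        data_list.append(f"{args.sample_name},pacbio,{args.path_to_reads}{file}\n")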
bin/sanitise_input_fasta_file.py (+164)

@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+VERSION = "1.2.0"
+DESCRIPTION = f"""
+---
+Script for sanitising FASTA headers and sequences:
+- Shortens headers by splitting by whitespace and keeping only the first element
+- Replaces problematic characters in headers (commas, spaces, etc.) with underscores
+- Converts sequences to uppercase and replaces non-ATGC bases with N
+Version: {VERSION}
+---
+
+Written by Eerik Aunin (ea10)
+Modified by Damon-Lee Pointon (@dp24/@DLBPointon)
+Further modified by Eerik Aunin (ea10)
+
+"""
+
+# MIT License
+#
+# Copyright (c) 2020-2022 Genome Research Ltd.
+#
+# Author: Eerik Aunin (eeaunin@gmail.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import general_purpose_functions as gpf
+import argparse
+import textwrap
+import sys
+import tempfile
+import re
+
+
+def is_all_n_sequence(seq):
+    """Return True if sequence consists entirely of N's."""
+    return all(base == "N" for base in seq.strip().upper())
+
+
+def sanitise_sequence(seq):
+    """Convert sequence to uppercase and replace any non-ATGC bases with N."""
+    seq = seq.upper()
+    return re.sub(r"[^ATGC]", "N", seq)
+
+
+def sanitise_header(header):
+    """Replace problematic characters in FASTA headers with underscores."""
+    # Remove the '>' character if present at the start
+    if header.startswith(">"):
+        header = header[1:]
+
+    # Replace problematic characters with underscores
+    sanitised = re.sub(r"[,;\s|:]", "_", header)
+
+    # Add back the '>' character
+    return ">" + sanitised
+
+
+def parse_args(argv=None):
+    parser = argparse.ArgumentParser(
+        prog="sanitise_input_fasta_file",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent(DESCRIPTION),
+    )
+    parser.add_argument("fasta_path", type=str, help="Path to input FASTA file")
+    parser.add_argument(
+        "--delimiter",
+        type=str,
+        help="Delimiter string for splitting FASTA headers. Default: any whitespace character",
+        default="",
+    )
+    parser.add_argument("--allow_duplicate_headers", dest="allow_duplicate_headers", action="store_true")
+    parser.add_argument(
+        "--keep_n_sequences", action="store_true", help="Keep sequences that are all Ns (default: False)"
+    )
+    parser.add_argument("-v", "--version", action="version", version=VERSION)
+    return parser.parse_args(argv)
+
+
+def main(fasta_path, delimiter, allow_duplicate_headers, keep_n_sequences=False):
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        input_file = fasta_path
+        if fasta_path.endswith(".gz") or fasta_path.endswith('.gz"'):
+            input_file = "{}/input_file.fa".format(tmp_dir)
+            gpf.run_system_command("gunzip -c {} > {}".format(fasta_path, input_file))
+
+        headers_list = list()
+        headers_with_commas = 0
+        in_data = gpf.ll(input_file)
+
+        current_header = None
+        current_sequence = []
+
+        def process_sequence():
+            if current_header and current_sequence:
+                sequence = "".join(current_sequence)
+                if keep_n_sequences or not is_all_n_sequence(sequence):
+                    print(current_header)
+                    print(sequence)
+                else:
+                    sys.stderr.write("Skipping all-N sequence: {}\n".format(current_header[1:].strip()))
+
+        for line in in_data:
+            if line.startswith(">"):
+                # Process previous sequence if it exists
+                process_sequence()
+
+                # Start new sequence
+                original_header = line.strip()
+                if delimiter == "":
+                    current_header = original_header.split()[0]
+                else:
+                    current_header = original_header.split(delimiter)[0]
+
+                # Check for commas in the original header
+                if "," in original_header:
+                    headers_with_commas += 1
+
+                # Sanitise the header
+                current_header = sanitise_header(current_header)
+
+                if current_header in headers_list and allow_duplicate_headers is False:
+                    sys.stderr.write(
+                        "Duplicate FASTA headers ({}) were found in the input file ({}) after truncating the headers with a delimiter\n".format(
+                            current_header[1:], fasta_path
+                        )
+                    )
+                    sys.exit(1)
+                headers_list.append(current_header)
+                current_sequence = []
+            else:
+                # Add sanitised sequence line
+                current_sequence.append(sanitise_sequence(line))
+
+        # Process the last sequence
+        process_sequence()
+
+        # Print warning about headers with commas
+        if headers_with_commas > 0:
+            sys.stderr.write(
+                "Warning: {} FASTA header(s) contained commas that were replaced with underscores\n".format(
+                    headers_with_commas
+                )
+            )
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args.fasta_path, args.delimiter, args.allow_duplicate_headers, args.keep_n_sequences)

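The two pure helpers in the new script can be sanity-checked in isolation. A minimal sketch that mirrors their regex logic on toy inputs (the header and sequence below are invented):

import re

def sanitise_header(header):
    # Mirrors the script's helper: drop a leading '>', replace commas,
    # semicolons, whitespace, pipes and colons with '_', then re-add '>'.
    if header.startswith(">"):
        header = header[1:]
    return ">" + re.sub(r"[,;\s|:]", "_", header)

def sanitise_sequence(seq):
    # Mirrors the script's helper: uppercase, then mask anything that is
    # not A, T, G or C as N.
    return re.sub(r"[^ATGC]", "N", seq.upper())

print(sanitise_header(">scaffold_1, mitochondrion|candidate"))  # >scaffold_1__mitochondrion_candidate
print(sanitise_sequence("acgtRYacgt-n"))  # ACGTNNACGTNN

Note that in the script itself the header is first truncated at the delimiter (any whitespace by default) before sanitise_header is applied, so the comma-and-space example above would normally be cut at the first space.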