Skip to content

Commit

Permalink
Merge pull request #419 from nickjcroucher/v3_dev
Browse files Browse the repository at this point in the history
Addition of multithreading
  • Loading branch information
nickjcroucher authored Nov 15, 2024
2 parents d74e1da + 8ca358f commit 7645c64
Show file tree
Hide file tree
Showing 33 changed files with 1,264 additions and 559 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: 3.9
- name: Set up miniconda
uses: conda-incubator/setup-miniconda@v3
with:
auto-update-conda: true
python-version: 3.9
- name: Install dependencies with conda
run: |
sudo chown -R $UID $CONDA && source $CONDA/etc/profile.d/conda.sh && conda env update --file environment.yml
Expand Down
15 changes: 13 additions & 2 deletions R/scripts/plot_gubbins.R
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ plot_gubbins_recombination <- function(gubbins_rec,gubbins_tree, start_pos = NA,
tidyr::unnest_longer(gene,
values_to = "Taxa") %>%
dplyr::mutate(Taxa = factor(Taxa,
levels = rev(get_taxa_name(gubbins_tree)))) %>%
levels = rev(ggtree::get_taxa_name(gubbins_tree)))) %>%
dplyr::mutate(length = end - start + 1) %>%
dplyr::arrange(rev(length)) %>%
dplyr::select(-length)
Expand All @@ -543,6 +543,16 @@ plot_gubbins_recombination <- function(gubbins_rec,gubbins_tree, start_pos = NA,
trim_start(start_pos) %>%
trim_end(end_pos)

# Get the number of taxa for selecting the line width
n_taxa <- length(ggtree::get_taxa_name(gubbins_tree))
rec_linewidth <-
dplyr::case_when(
n_taxa < 10 ~ 5,
n_taxa < 50 ~ 2,
n_taxa < 100 ~ 1.5,
TRUE ~ 1
)

# Plot recombination
rec_plot <-
ggplot(gubbins_rec,
Expand All @@ -551,7 +561,8 @@ plot_gubbins_recombination <- function(gubbins_rec,gubbins_tree, start_pos = NA,
y = Taxa,
yend = Taxa,
colour = Colour)) +
geom_segment(alpha = 0.25) +
geom_segment(alpha = 0.5,
linewidth = rec_linewidth) +
scale_colour_manual(values = c("red" = "red",
"blue" = "blue")) +
scale_y_discrete(drop = FALSE) +
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.3.5
3.4
1 change: 1 addition & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ AC_PROG_CXX

# Checks for pthread
AX_PTHREAD
LDFLAGS="$LDFLAGS -pthread"

# Checks for code coverage
AX_CODE_COVERAGE
Expand Down
8 changes: 8 additions & 0 deletions docs/gubbins_manual.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,14 @@ To get summary statistics (e.g. ***r***/***m***) and subtrees for distinct clade
extract_gubbins_clade_statistics.py --clades [file designating isolates to clades] --gff out.recombination_predictions.gff --snps out.branch_base_reconstruction.embl --tree out.final_tree.tre --out tree_anaysis
```

To extract the recombinant sequences identified by Gubbins from the alignment:

```
extract_recombinant_sequences.py [-h] --aln [input alignment file] --gff out.recombination_predictions.gff --out-dir OUT_DIR [--start START] [--end END] [--terminal-only]
```

Note that currently this only identifies the unique alleles at the recombinant loci from the sequences in the alignment - it is not using the reconstructed sequences used to infer the recombinations on internal branches. This script will hopefully be updated to offer greater functionality in the future.

## Examples

Two example alignments can be downloaded from http://nickjcroucher.github.io/gubbins/:
Expand Down
6 changes: 3 additions & 3 deletions python/gubbins/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ def parse_and_run(input_args, program_description=""):
gubbins_command = create_gubbins_command(
gubbins_exec, gaps_alignment_filename, gaps_vcf_filename, current_tree_name,
input_args.alignment_filename, input_args.min_snps, input_args.min_window_size, input_args.max_window_size,
input_args.p_value, input_args.trimming_ratio, input_args.extensive_search)
input_args.p_value, input_args.trimming_ratio, input_args.extensive_search, input_args.threads)
printer.print(["\nRunning Gubbins to detect recombinations...", gubbins_command])
try:
subprocess.check_call(gubbins_command, shell=True)
Expand Down Expand Up @@ -798,10 +798,10 @@ def select_best_models(snp_alignment_filename,basename,current_tree_builder,inpu

def create_gubbins_command(gubbins_exec, alignment_filename, vcf_filename, current_tree_name,
original_alignment_filename, min_snps, min_window_size, max_window_size,
p_value, trimming_ratio, extensive_search):
p_value, trimming_ratio, extensive_search, num_threads):
command = [gubbins_exec, "-r", "-v", vcf_filename, "-a", str(min_window_size),
"-b", str(max_window_size), "-f", original_alignment_filename, "-t", current_tree_name,
"-m", str(min_snps), "-p", str(p_value), "-i", str(trimming_ratio)]
"-m", str(min_snps), "-p", str(p_value), "-i", str(trimming_ratio), "-n", str(num_threads)]
if extensive_search:
command.append("-x")
command.append(alignment_filename)
Expand Down
12 changes: 12 additions & 0 deletions python/gubbins/tests/test_python_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import unittest
import os
import shutil
import subprocess
import hashlib
import glob
Expand Down Expand Up @@ -144,6 +145,17 @@ def test_recombination_counting_per_gene(self):
assert self.md5_check(out_tab, check_tab)
os.remove(out_tab)

# Test clade file extraction script
def test_extract_recombination_sequences(self):
multiple_aln = os.path.join(data_dir, "multiple_recombinations.aln")
multiple_gff = os.path.join(data_dir, "multiple_recombinations_gubbins.recombination_predictions.gff")
out_dir = os.path.join(data_dir, "test_loci")
rec_count_cmd = "extract_recombinant_sequences.py --aln " + multiple_aln + " --gff " + multiple_gff + " --out-dir " + out_dir
subprocess.check_call(rec_count_cmd, shell=True)
file_count = int(subprocess.run('ls -1 ' + out_dir + ' | wc -l', shell = True, stdout=subprocess.PIPE).stdout.decode('utf-8').rstrip())
assert file_count > 1
shutil.rmtree(out_dir)

# Test plotting script (an R exception)
def test_recombination_counting_per_gene(self):
tree_input = os.path.join(data_dir, "expected_RAxML_result.multiple_recombinations.iteration_5")
Expand Down
4 changes: 2 additions & 2 deletions python/gubbins/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
class TestUtilities(unittest.TestCase):

def test_gubbins_command(self):
assert common.create_gubbins_command('AAA', 'BBB', 'CCC', 'DDD', 'EEE', 5, 10, 200, 0.05, 1.0, 0) \
== 'AAA -r -v CCC -a 10 -b 200 -f EEE -t DDD -m 5 -p 0.05 -i 1.0 BBB'
assert common.create_gubbins_command('AAA', 'BBB', 'CCC', 'DDD', 'EEE', 5, 10, 200, 0.05, 1.0, 0, 1) \
== 'AAA -r -v CCC -a 10 -b 200 -f EEE -t DDD -m 5 -p 0.05 -i 1.0 -n 1 BBB'

def test_translation_of_filenames_to_final_filenames(self):
assert common.translation_of_filenames_to_final_filenames('AAA', 'test') == {
Expand Down
2 changes: 1 addition & 1 deletion python/gubbins/treebuilders.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def __init__(self, threads: 1, model: str, bootstrap = 0, seed = None, internal_
self.citation = "https://doi.org/10.1093/molbev/msaa015"

# Set parallelisation
command.extend(["-nt", str(self.threads)])
command.extend(["-T", str(self.threads)])

# Add flags
command.extend(["-safe","-redo"])
Expand Down
3 changes: 2 additions & 1 deletion python/gubbins/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def choose_executable_based_on_processor(list_of_executables: list):
for executable in list_of_executables:
if cpu_info and 'AVX2' in executable and 'avx2' in flags and which(executable):
break
elif cpu_info and 'AVX' in executable and 'avx' in flags and which(executable):
elif cpu_info and ('AVX' in executable and 'AVX2' not in executable)\
and ('avx' in flags and 'avx2' not in flags) and which(executable):
break
elif cpu_info and 'SSE3' in executable and 'sse3' in flags and which(executable):
break
Expand Down
101 changes: 101 additions & 0 deletions python/scripts/extract_recombinant_sequences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#! python

# encoding: utf-8
# Wellcome Trust Sanger Institute and Imperial College London
# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#

# Generic imports
import os
import sys
import argparse
import re
import math

# Biopython imports
from Bio import AlignIO
from Bio import Phylo
from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq

# command line parsing
def get_options():

parser = argparse.ArgumentParser(description='Extract all the unique alleles at recombinant loci')

# input options
parser.add_argument('--aln',
help = 'Input alignment (FASTA format)',
required = True)
parser.add_argument('--gff',
help = 'GFF of recombinant regions detected by Gubbins',
required = True)
parser.add_argument('--out-dir',
help = 'Output directory',
required = True)
parser.add_argument('--start',
help = 'Start of region of interest',
default = 1,
required = False)
parser.add_argument('--end',
help = 'End of region of interest',
default = math.inf,
required = False)
parser.add_argument('--terminal-only',
help = 'Only extract recombinations on terminal branches',
default = False,
action = 'store_true')

return parser.parse_args()

# main code
if __name__ == "__main__":

# Get command line options
args = get_options()

# Create output directory
if not os.path.isdir(args.out_dir):
os.mkdir(args.out_dir)

# Read recombinant regions from GFF
rec_start = []
rec_end = []
with open(args.gff,'r') as gff_file:
for line in gff_file.readlines():
if not line.startswith('##'):
# Get coordinates
info = line.rstrip().split('\t')
taxon_pattern = re.compile('taxa="([^"]*)"')
taxon_set = set(taxon_pattern.search(info[8]).group(1).split())
if (len(taxon_set) == 1 or not args.terminal_only):
rec_start.append(int(info[3]))
rec_end.append(int(info[4]))

# Read in alignment and identify recombinations
alignment = AlignIO.read(args.aln,'fasta')
for (start,end) in zip(rec_start,rec_end):
if start >= args.start and end <= args.end:
out_fn = 'locus_' + str(start) + '_' + str(end) + '.aln'
with open(os.path.join(args.out_dir,out_fn),'w') as out_file:
seen_seqs = []
rec_locus_alignment = alignment[:,(start-1):end]
for taxon in rec_locus_alignment:
if taxon.seq not in seen_seqs:
out_file.write('>' + taxon.id + '\n' + str(taxon.seq) + '\n')
seen_seqs.append(taxon.seq)
2 changes: 1 addition & 1 deletion python/scripts/generate_ska_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def ska_map_sequences(seq, k = None, ref = None):
' --out ' + args.out + '.csv',
shell = True)

sys.stderr.write("Completed generating alignment with ska2 (https://github.com/bacpop/ska.rust)\n")
sys.stderr.write("Completed generating alignment with ska2 (https://github.com/bacpop/ska.rust)\nPlease cite https://doi.org/10.1101/2024.03.25.586631\n")

# Clean up
if not args.no_cleanup:
Expand Down
1 change: 1 addition & 0 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
'scripts/gubbins_alignment_checker.py',
'scripts/generate_files_for_clade_analysis.py',
'scripts/count_recombinations_per_gene.py',
'scripts/extract_recombinant_sequences.py',
'../R/scripts/plot_gubbins.R'
],
tests_require=[
Expand Down
Loading

0 comments on commit 7645c64

Please sign in to comment.