Merge pull request #38 from sanger-tol/dp24_organellar
Dp24 organellar
DLBPointon authored Nov 16, 2023
2 parents 5196d4d + e09bd87 commit 5edc7ad
Showing 27 changed files with 943 additions and 197 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
@@ -69,6 +69,11 @@ jobs:
mkdir NT_database
curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/18S_fungal_sequences.tar.gz | tar -C NT_database -xzf -
- name: Download the pacbio barcode
run: |
mkdir pacbio_barcode
wget -O pacbio_barcode/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip -c https://www.pacb.com/wp-content/uploads/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && cd pacbio_barcode && unzip SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && mv SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta pacbio_adaptors.fa && rm -rf SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip __MACOSX && cd ..
- name: Download the subset of Diamond database
run: |
mkdir diamond
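
For reference, a Python sketch of what the new barcode-download step does. The URL and archive member name are taken from the workflow command above; using the requests package here is an assumption, not part of the workflow.

import io
import os
import zipfile
import requests

# Download the PacBio barcode bundle, extract the FASTA and rename it to
# pacbio_adaptors.fa, mirroring the mkdir/wget/unzip/mv chain above.
URL = (
    "https://www.pacb.com/wp-content/uploads/"
    "SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip"
)
MEMBER = "SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta"

os.makedirs("pacbio_barcode", exist_ok=True)
response = requests.get(URL, timeout=60)
response.raise_for_status()
with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
    zf.extract(MEMBER, "pacbio_barcode")  # leaves the __MACOSX entries unextracted
os.rename(
    os.path.join("pacbio_barcode", MEMBER),
    os.path.join("pacbio_barcode", "pacbio_adaptors.fa"),
)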
5 changes: 3 additions & 2 deletions assets/github_testing/test.yaml
@@ -1,8 +1,9 @@
assembly_path: /home/runner/work/ascc/ascc/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa
assembly_title: asccTinyTest
pacbio_barcodes: /home/runner/work/ascc/ascc/assets/pacbio_adaptors.fa
pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc1008,bc1009"
pacbio_reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio/
reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio
reads_type: "hifi"
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
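
The samplesheet now points pacbio_barcodes at the unpacked pacbio_barcode/ directory and gains a mito_fasta_path entry. A minimal sketch for sanity-checking such a samplesheet, assuming PyYAML is available; check_samplesheet is a hypothetical helper, not part of the pipeline, and the key list mirrors the fields above.

import os
import yaml

PATH_KEYS = ["assembly_path", "reads_path", "pacbio_barcodes", "mito_fasta_path"]

def check_samplesheet(yaml_path):
    # Load the YAML samplesheet and warn about any path fields that
    # do not exist on the current filesystem.
    with open(yaml_path) as handle:
        config = yaml.safe_load(handle)
    missing = [key for key in PATH_KEYS if key not in config]
    if missing:
        raise KeyError(f"samplesheet is missing keys: {missing}")
    for key in PATH_KEYS:
        if not os.path.exists(config[key]):
            print(f"WARNING: {key} does not exist: {config[key]}")
    return config

# config = check_samplesheet("assets/github_testing/test.yaml")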
5 changes: 3 additions & 2 deletions assets/test.yaml
@@ -1,8 +1,9 @@
assembly_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/ncbi_dataset/data/reheadered.fna
assembly_path: /lustre/scratch124/tol/projects/tol/data/insects/Polyommatus_atlantica/assembly/draft/treeval/ilPolAtla1_merged/raw/ref.fa
assembly_title: asccTinyTest
reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
reads_type: "hifi"
pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc1008,bc1009"
pacbio_reads_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/pacbio/
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
114 changes: 65 additions & 49 deletions bin/BedTools.py
@@ -1,79 +1,95 @@
import sys
import re
import os
import pybedtools
from collections import defaultdict
from itertools import groupby


"""
Script from James Torrance (jt8) and Eerik Aunin (ea10)
refactored by Yumi sims (yy5)
"""


class BedTools:
# bedtools = '/software/grit/tools/bedtools-2.29.0/bin/bedtools'
bedtools = "bedtools"
def __init__(self):
pass

def sort_and_merge_bed_file(self, bed_file):
sorted_bed_file = re.sub("\.bed$", ".sorted.bed", bed_file)
merged_bed_file = re.sub("\.bed$", ".merged.bed", bed_file)
sort_command = self.bedtools + " sort -i " + bed_file + " > " + sorted_bed_file
print(sort_command)
os.system(sort_command)
merge_command = self.bedtools + " merge -i " + sorted_bed_file + " > " + merged_bed_file
print(merge_command)
os.system(merge_command)
return merged_bed_file
def sort_and_merge_bed_file(self, input_bed_file):
bed = pybedtools.BedTool(input_bed_file)
# Get the output merged BED file name without the file extension
output_merged_bed_file = os.path.splitext(os.path.basename(input_bed_file))[0] + ".merged.bed"
# Sort the BED file
sorted_bed = bed.sort()
# Merge the sorted BED file
merged_bed = sorted_bed.merge()
# Save the merged BED file
merged_bed.saveas(output_merged_bed_file)
return output_merged_bed_file

def merge_bed_b_into_a(self, bed_file_1, bed_file_2):
# We're merging file_2 into file_1
os.system("cat " + bed_file_2 + " >> " + bed_file_1)
merged_bed_file = sort_and_merge_bed_file(bed_file_1)
# Call the sort_and_merge_bed_file method to sort and merge the resulting file
merged_bed_file = self.sort_and_merge_bed_file(bed_file_1)
os.system("mv " + merged_bed_file + " " + bed_file_1)

def subtract_b_from_a(self, bed_file_1, bed_file_2):
# We subtract file 2 from file 1
subtracted_bed_file = re.sub("\.bed$", ".subtracted.bed", bed_file_1)
subtract_command = (
self.bedtools + " subtract -a " + bed_file_1 + " -b " + bed_file_2 + " > " + subtracted_bed_file
)
print(subtract_command)
os.system(subtract_command)
return subtracted_bed_file
if not os.path.exists(bed_file_1) or not os.path.exists(bed_file_2):
raise FileNotFoundError("One or both of the input BED files do not exist.")
output_file_name = os.path.splitext(os.path.basename(bed_file_1))[0] + ".subtracted.bed"

def coverage_for_bed_file(self, bed_file):
coverage = 0
# Load the BED files
bed1 = pybedtools.BedTool(bed_file_1)
bed2 = pybedtools.BedTool(bed_file_2)
# Subtract bed_file_2 from bed_file_1
subtracted_bed = bed1.subtract(bed2)
subtracted_bed.saveas(output_file_name)
return output_file_name

def coverage_for_bed_file(self, bed_file):
with open(bed_file, "r") as bed_handle:
for line in bed_handle:
line = line.rstrip()
fields = re.split("\s+", line)
coverage += int(fields[2]) - int(fields[1])

lines = [line.strip() for line in bed_handle]
coverage = sum(int(fields[2]) - int(fields[1]) for line in lines for fields in [re.split("\s+", line)])
return coverage

def coverage_for_bed_file_by_scaffold(self, bed_file):
coverage_for_scaffold = {}

with open(bed_file, "r") as bed_handle:
for line in bed_handle:
line = line.rstrip()
fields = re.split("\s+", line)
if fields[0] not in coverage_for_scaffold:
coverage_for_scaffold[fields[0]] = 0
coverage_for_scaffold[fields[0]] += int(fields[2]) - int(fields[1])

fields_list = [re.split("\s+", line.strip()) for line in bed_handle]
coverage_for_scaffold = {}
for chrom, interval_list in groupby(fields_list, key=lambda x: x[0]):
total_coverage = 0
for _, start_str, end_str in interval_list:
try:
start, end = int(start_str), int(end_str)
total_coverage += end - start
except ValueError:
pass
coverage_for_scaffold[chrom] = total_coverage
return coverage_for_scaffold

def coords_to_bed(self, coord_list_for_sequence, bed_file):
bed_handle = open(bed_file, "w")
for sequence in coord_list_for_sequence:
for coord_pair in coord_list_for_sequence[sequence]:
bed_handle.write("\t".join([sequence, str(coord_pair[0] - 1), str(coord_pair[1])]) + "\n")
bed_handle.close()
with open(bed_file, "w") as bed_handle:
lines = [
f"{sequence}\t{start - 1}\t{end}\n"
for sequence, coord_pairs in coord_list_for_sequence.items()
for start, end in coord_pairs
]
bed_handle.writelines(lines)

def bed_to_coords(self, bed_file):
coord_list_for_sequence = {}
coord_list_for_sequence = defaultdict(list)

with open(bed_file, "r") as bed_handle:
for line in bed_handle:
line = line.rstrip()
fields = re.split("\s+", line)
if fields[0] not in coord_list_for_sequence:
coord_list_for_sequence[fields[0]] = []
coord_list_for_sequence[fields[0]].append([int(fields[1]) + 1, int(fields[2])])
fields = line.split()
if len(fields) >= 3:
try:
seq_name, start, end = fields[0], int(fields[1]), int(fields[2])
coord_list_for_sequence[seq_name].append([start + 1, end])
except (ValueError, IndexError):
# Handle invalid lines or non-numeric fields
pass

return coord_list_for_sequence
return dict(coord_list_for_sequence)
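
A hypothetical usage sketch of the refactored class. pybedtools still shells out to the bedtools binary, so bedtools must remain on PATH; the file names below are illustrative.

from BedTools import BedTools

bt = BedTools()
merged = bt.sort_and_merge_bed_file("contamination.bed")       # writes contamination.merged.bed
cleaned = bt.subtract_b_from_a(merged, "exclude_regions.bed")  # writes contamination.merged.subtracted.bed
print(bt.coverage_for_bed_file(cleaned))              # total bases covered
print(bt.coverage_for_bed_file_by_scaffold(cleaned))  # {scaffold: bases covered}

One caveat: the groupby-based coverage_for_bed_file_by_scaffold only tallies correctly when all lines for a scaffold are adjacent, since itertools.groupby groups consecutive runs and a later run would overwrite an earlier total; that holds for sorted input such as the output of sort_and_merge_bed_file.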
55 changes: 24 additions & 31 deletions bin/extract_contaminants_by_type.py
@@ -8,11 +8,9 @@
import re
import BedTools
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import gzip
import sys
import argparse
from pathlib import Path
import general_purpose_functions as gpf


def main():
@@ -28,29 +26,14 @@ def main():

if args.assembly_file == None:
assembly_file = re.sub("\.contamination", ".fa.gz", contamination_file)

# assembly_name_match = re.search('([^/]*?)\.contamination', contamination_file)
# assembly_name = assembly_name_match.group(1)
else:
assembly_file = args.assembly_file
# assembly_name_match = re.search('([^/]*?)\.(fa|fasta|fna)', assembly_file)
# assembly_name = assembly_name_match.group(1)
assembly_name = "assembly"

# If at first you don't succeed, try again with a .fasta suffix:
if not os.path.isfile(assembly_file):
assembly_file = re.sub("\.contamination", ".fasta.gz", contamination_file)

# assembly_dir = '/lustre/scratch123/tol/teams/grit/jt8/contamination_screen/assemblies/'

# if not os.path.isfile(assembly_file):
# # Fall through to looking in the assemblies dir
# assembly_file = assembly_dir + assembly_name + '.fa'

# if not os.path.isfile(assembly_file):
# # ... and then try a .fasta extension
# assembly_file = assembly_dir + assembly_name + '.fasta'

if not os.path.isfile(assembly_file):
print(assembly_file + " does not exist")
else:
@@ -83,11 +66,10 @@ def main():
coord_list_for_section_and_sequence[section_name][fields[0]].append(coords)
coord_list_for_section_and_sequence["ALL"][fields[0]].append(coords)

# margins = [0,10000]
# margins = [0,10000]
margins = [0]

# Write BED file
# extracts_dir = '/lustre/scratch123/tol/teams/grit/jt8/contamination_screen/contaminated_extracts/'
for section_name in coord_list_for_section_and_sequence:
# Only print sections that have data- but always produce an "ALL" file
if len(coord_list_for_section_and_sequence[section_name]) > 0 or section_name == "ALL":
@@ -101,10 +83,9 @@

# Get lengths
length_file = args.extracts_dir + assembly_name + ".lengths"
# if not os.path.isfile(length_file):

# Replaced the exonerate fastalength binary with a Python script (EA)
os.system("fastalength.py " + assembly_file + " > " + length_file)
fastalength(assembly_file, length_file)
length_for_sequence = parse_fastalength_file(length_file)

coverage_file_base_name = args.extracts_dir + assembly_name + "." + output_section_name
@@ -121,11 +102,8 @@
else:
handle = open(assembly_file, "rt")

# with gzip.open(assembly_file, "rt") as handle:
for record in SeqIO.parse(handle, "fasta"):
# print(record.id)
if record.id in merged_coord_list_for_sequence:
# print('Extracting')
for coord_pair in merged_coord_list_for_sequence[record.id]:
extracted_sequence = extract_sequence(record, coord_pair[0], coord_pair[1], margin)
SeqIO.write([extracted_sequence], output_handle, "fasta")
@@ -136,15 +114,20 @@ def write_coverage_file(coverage_file_base_name, merged_bed_file, length_for_seq
# Record coverage
merged_coord_list_for_sequence = bedtools.bed_to_coords(merged_bed_file)
coverage_for_sequence = bedtools.coverage_for_bed_file_by_scaffold(merged_bed_file)

percentage_coverage_for_sequence = {}
for sequence in coverage_for_sequence:
percentage_coverage_for_sequence[sequence] = (
coverage_for_sequence[sequence] / length_for_sequence[sequence] * 100
)
if sequence in length_for_sequence:
percentage_coverage_for_sequence[sequence] = (
coverage_for_sequence[sequence] / length_for_sequence[sequence] * 100
)
else:
# Handle missing sequence length information
percentage_coverage_for_sequence[sequence] = 0

coverage_threshold = 1

# Take the stem name for the file, and print to various files, with c coverage threshod, without, and per line
# Take the stem name for the file, and print to various files, with c coverage threshold, without, and per line

filtered_coverage_scaffold_file = coverage_file_base_name + ".filtered_scaffold_coverage.bed"
unfiltered_coverage_scaffold_file = coverage_file_base_name + ".unfiltered_scaffold_coverage.bed"
@@ -212,6 +195,18 @@ def write_coverage_file(coverage_file_base_name, merged_bed_file, length_for_seq
)


def fastalength(input_path, length_file):
fasta_data = gpf.read_fasta_in_chunks(input_path)
with open(length_file, "w") as length_file_handle:
for header, seq in fasta_data:
if header is not None: # Check if header is not None
if " " in header:
header = header.replace(" ", "_")
result = f"{len(seq)} {header}"
print(result)
length_file_handle.write(result + "\n")


# Parse fastalength file
def parse_fastalength_file(length_file):
length_for_sequence = {}
@@ -239,8 +234,6 @@ def extract_sequence(seq_record, start, end, margin):
else:
start = 1

# print("Slicing: " + str(start) + '-' + str(end))

extracted_slice = seq_record[start:end]
extracted_slice.id = seq_record.id + ":" + str(start) + "-" + str(end)
extracted_slice.name = extracted_slice.id
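
The new fastalength() function replaces the exonerate fastalength binary with pure Python built on general_purpose_functions.read_fasta_in_chunks. A standalone equivalent using Bio.SeqIO, which the script already imports, might look like the sketch below; fastalength_seqio is a hypothetical name, and it assumes an uncompressed FASTA.

from Bio import SeqIO

def fastalength_seqio(input_path, length_file):
    # Emit one "<length> <header>" line per record, with spaces in headers
    # replaced by underscores, matching what parse_fastalength_file reads back.
    with open(length_file, "w") as out:
        for record in SeqIO.parse(input_path, "fasta"):
            header = record.description.replace(" ", "_")
            out.write(f"{len(record.seq)} {header}\n")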
20 changes: 8 additions & 12 deletions bin/gc_content.py
@@ -2,8 +2,8 @@
"""
Script for finding the GC content of each sequence in a multiFASTA file
Written by Eerik Aunin @eeaunin
Originally Written by Eerik Aunin @eeaunin
Refactored by Yumi Sims @yumisims
Adapted by Damon-Lee Pointon @DLBPointon
"""

@@ -13,16 +13,12 @@

def main(fasta_path):
fasta_data = gpf.read_fasta_in_chunks(fasta_path)
for header, seq in fasta_data:
header = header.split()[0]
seq = seq.upper()
gc_content = None
gc_count = seq.count("G") + seq.count("C")
seq_len = len(seq)
if seq_len > 0:
gc_content = gc_count / seq_len
gc_content_string = "{:.6f}".format(gc_content)
print("{}\t{}".format(header, gc_content_string))
results = [
(header.split()[0], "{:.6f}".format((seq.upper().count("G") + seq.upper().count("C")) / max(1, len(seq))))
for header, seq in fasta_data
]
for header, gc_content_string in results:
print(f"{header}\t{gc_content_string}")


if __name__ == "__main__":
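
A worked example of the refactored calculation. Note the max(1, len(seq)) guard: an empty sequence now reports 0.000000 instead of being skipped.

seq = "GATTACAGC"  # 2 G and 2 C in 9 bases
gc_content = (seq.upper().count("G") + seq.upper().count("C")) / max(1, len(seq))
print("{:.6f}".format(gc_content))  # prints 0.444444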
17 changes: 5 additions & 12 deletions bin/organelle_contamination_recommendation.py
@@ -4,27 +4,20 @@

import csv
import re
import sys
import argparse


def main():
parser = argparse.ArgumentParser(description="Generate text for ENA submission")
parser.add_argument("--input", type=str, help="List of input files (space seperated list)", default=None)
parser = argparse.ArgumentParser(
description="Create an organellar contamination report text file based on the organellar contamination BED file"
)
parser.add_argument("--input", type=str, help="Input BED file", default=None)
parser.add_argument("--output", type=str, help="Output recommendation", default=None)
parser.add_argument("-v", action="version", version="1.0")
args = parser.parse_args()

# Filters through the list of files that nextflow passes in and ID's the one we want
files_list = args.input
file_for_use = next(
(file for file in files_list.split() if "assembly.ALL.unfiltered_scaffold_coverage.bed" in file), None
)
if file_for_use is None:
sys.exit(1)

recommendation_handle = open(args.output, "w")
with open(file_for_use) as bed_handle:
with open(args.input) as bed_handle:
bed_csv_reader = csv.reader(bed_handle, delimiter="\t")
for field_set in bed_csv_reader:
scaffold = field_set[0]
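
The script now receives the organellar-coverage BED directly via --input, instead of filtering it out of the space-separated file list Nextflow used to pass in. A sketch of the resulting read loop; the three-column BED layout is an assumption, as only the scaffold column is confirmed by the code above.

import csv

def read_coverage_bed(bed_path):
    # Yield (scaffold, start, end) tuples from a tab-separated BED file.
    with open(bed_path) as bed_handle:
        for field_set in csv.reader(bed_handle, delimiter="\t"):
            yield field_set[0], int(field_set[1]), int(field_set[2])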