Merge pull request #38 from sanger-tol/dp24_organellar
Dp24 organellar
DLBPointon authored Nov 16, 2023
2 parents 5196d4d + e09bd87 commit 5edc7ad
Showing 27 changed files with 943 additions and 197 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
@@ -69,6 +69,11 @@ jobs:
mkdir NT_database
curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/18S_fungal_sequences.tar.gz | tar -C NT_database -xzf -
- name: Download the pacbio barcode
run: |
mkdir pacbio_barcode
wget -O pacbio_barcode/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip -c https://www.pacb.com/wp-content/uploads/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && cd pacbio_barcode && unzip SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && mv SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta pacbio_adaptors.fa && rm -rf SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip __MACOSX && cd ..
- name: Download the subset of Diamond database
run: |
mkdir diamond
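
For reference, a Python sketch of what the new barcode-download step does. The URL and archive member name are taken from the workflow command above; using the requests package here is an assumption, not part of the workflow.

import io
import os
import zipfile
import requests

# Download the PacBio barcode bundle, extract the FASTA and rename it to
# pacbio_adaptors.fa, mirroring the mkdir/wget/unzip/mv chain above.
URL = (
    "https://www.pacb.com/wp-content/uploads/"
    "SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip"
)
MEMBER = "SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta"

os.makedirs("pacbio_barcode", exist_ok=True)
response = requests.get(URL, timeout=60)
response.raise_for_status()
with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
    zf.extract(MEMBER, "pacbio_barcode")  # leaves the __MACOSX entries unextracted
os.rename(
    os.path.join("pacbio_barcode", MEMBER),
    os.path.join("pacbio_barcode", "pacbio_adaptors.fa"),
)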
5 changes: 3 additions & 2 deletions assets/github_testing/test.yaml
@@ -1,8 +1,9 @@
assembly_path: /home/runner/work/ascc/ascc/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa
assembly_title: asccTinyTest
pacbio_barcodes: /home/runner/work/ascc/ascc/assets/pacbio_adaptors.fa
pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc1008,bc1009"
pacbio_reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio/
reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio
reads_type: "hifi"
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
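
The samplesheet now points pacbio_barcodes at the unpacked pacbio_barcode/ directory and gains a mito_fasta_path entry. A minimal sketch for sanity-checking such a samplesheet, assuming PyYAML is available; check_samplesheet is a hypothetical helper, not part of the pipeline, and the key list mirrors the fields above.

import os
import yaml

PATH_KEYS = ["assembly_path", "reads_path", "pacbio_barcodes", "mito_fasta_path"]

def check_samplesheet(yaml_path):
    # Load the YAML samplesheet and warn about any path fields that
    # do not exist on the current filesystem.
    with open(yaml_path) as handle:
        config = yaml.safe_load(handle)
    missing = [key for key in PATH_KEYS if key not in config]
    if missing:
        raise KeyError(f"samplesheet is missing keys: {missing}")
    for key in PATH_KEYS:
        if not os.path.exists(config[key]):
            print(f"WARNING: {key} does not exist: {config[key]}")
    return config

# config = check_samplesheet("assets/github_testing/test.yaml")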
5 changes: 3 additions & 2 deletions assets/test.yaml
@@ -1,8 +1,9 @@
assembly_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/ncbi_dataset/data/reheadered.fna
assembly_path: /lustre/scratch124/tol/projects/tol/data/insects/Polyommatus_atlantica/assembly/draft/treeval/ilPolAtla1_merged/raw/ref.fa
assembly_title: asccTinyTest
reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
reads_type: "hifi"
pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc1008,bc1009"
pacbio_reads_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/pacbio/
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
114 changes: 65 additions & 49 deletions bin/BedTools.py
@@ -1,79 +1,95 @@
import sys
import re
import os
import pybedtools
from collections import defaultdict
from itertools import groupby


"""
Script from James Torrance (jt8) and Eerik Aunin (ea10)
refactored by Yumi sims (yy5)
"""


class BedTools:
# bedtools = '/software/grit/tools/bedtools-2.29.0/bin/bedtools'
bedtools = "bedtools"
def __init__(self):
pass

def sort_and_merge_bed_file(self, bed_file):
sorted_bed_file = re.sub("\.bed$", ".sorted.bed", bed_file)
merged_bed_file = re.sub("\.bed$", ".merged.bed", bed_file)
sort_command = self.bedtools + " sort -i " + bed_file + " > " + sorted_bed_file
print(sort_command)
os.system(sort_command)
merge_command = self.bedtools + " merge -i " + sorted_bed_file + " > " + merged_bed_file
print(merge_command)
os.system(merge_command)
return merged_bed_file
def sort_and_merge_bed_file(self, input_bed_file):
bed = pybedtools.BedTool(input_bed_file)
# Get the output merged BED file name without the file extension
output_merged_bed_file = os.path.splitext(os.path.basename(input_bed_file))[0] + ".merged.bed"
# Sort the BED file
sorted_bed = bed.sort()
# Merge the sorted BED file
merged_bed = sorted_bed.merge()
# Save the merged BED file
merged_bed.saveas(output_merged_bed_file)
return output_merged_bed_file

def merge_bed_b_into_a(self, bed_file_1, bed_file_2):
# We're merging file_2 into file_1
os.system("cat " + bed_file_2 + " >> " + bed_file_1)
merged_bed_file = sort_and_merge_bed_file(bed_file_1)
# Call the sort_and_merge_bed_file method to sort and merge the resulting file
merged_bed_file = self.sort_and_merge_bed_file(bed_file_1)
os.system("mv " + merged_bed_file + " " + bed_file_1)

def subtract_b_from_a(self, bed_file_1, bed_file_2):
# We subtract file 2 from file 1
subtracted_bed_file = re.sub("\.bed$", ".subtracted.bed", bed_file_1)
subtract_command = (
self.bedtools + " subtract -a " + bed_file_1 + " -b " + bed_file_2 + " > " + subtracted_bed_file
)
print(subtract_command)
os.system(subtract_command)
return subtracted_bed_file
if not os.path.exists(bed_file_1) or not os.path.exists(bed_file_2):
raise FileNotFoundError("One or both of the input BED files do not exist.")
output_file_name = os.path.splitext(os.path.basename(bed_file_1))[0] + ".subtracted.bed"

def coverage_for_bed_file(self, bed_file):
coverage = 0
# Load the BED files
bed1 = pybedtools.BedTool(bed_file_1)
bed2 = pybedtools.BedTool(bed_file_2)
# Subtract bed_file_2 from bed_file_1
subtracted_bed = bed1.subtract(bed2)
subtracted_bed.saveas(output_file_name)
return output_file_name

def coverage_for_bed_file(self, bed_file):
with open(bed_file, "r") as bed_handle:
for line in bed_handle:
line = line.rstrip()
fields = re.split("\s+", line)
coverage += int(fields[2]) - int(fields[1])

lines = [line.strip() for line in bed_handle]
coverage = sum(int(fields[2]) - int(fields[1]) for line in lines for fields in [re.split("\s+", line)])
return coverage

def coverage_for_bed_file_by_scaffold(self, bed_file):
coverage_for_scaffold = {}

with open(bed_file, "r") as bed_handle:
for line in bed_handle:
line = line.rstrip()
fields = re.split("\s+", line)
if fields[0] not in coverage_for_scaffold:
coverage_for_scaffold[fields[0]] = 0
coverage_for_scaffold[fields[0]] += int(fields[2]) - int(fields[1])

fields_list = [re.split("\s+", line.strip()) for line in bed_handle]
coverage_for_scaffold = {}
for chrom, interval_list in groupby(fields_list, key=lambda x: x[0]):
total_coverage = 0
for _, start_str, end_str in interval_list:
try:
start, end = int(start_str), int(end_str)
total_coverage += end - start
except ValueError:
pass
coverage_for_scaffold[chrom] = total_coverage
return coverage_for_scaffold

def coords_to_bed(self, coord_list_for_sequence, bed_file):
bed_handle = open(bed_file, "w")
for sequence in coord_list_for_sequence:
for coord_pair in coord_list_for_sequence[sequence]:
bed_handle.write("\t".join([sequence, str(coord_pair[0] - 1), str(coord_pair[1])]) + "\n")
bed_handle.close()
with open(bed_file, "w") as bed_handle:
lines = [
f"{sequence}\t{start - 1}\t{end}\n"
for sequence, coord_pairs in coord_list_for_sequence.items()
for start, end in coord_pairs
]
bed_handle.writelines(lines)

def bed_to_coords(self, bed_file):
coord_list_for_sequence = {}
coord_list_for_sequence = defaultdict(list)

with open(bed_file, "r") as bed_handle:
for line in bed_handle:
line = line.rstrip()
fields = re.split("\s+", line)
if fields[0] not in coord_list_for_sequence:
coord_list_for_sequence[fields[0]] = []
coord_list_for_sequence[fields[0]].append([int(fields[1]) + 1, int(fields[2])])
fields = line.split()
if len(fields) >= 3:
try:
seq_name, start, end = fields[0], int(fields[1]), int(fields[2])
coord_list_for_sequence[seq_name].append([start + 1, end])
except (ValueError, IndexError):
# Handle invalid lines or non-numeric fields
pass

return coord_list_for_sequence
return dict(coord_list_for_sequence)
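
A hypothetical usage sketch of the refactored class. pybedtools still shells out to the bedtools binary, so bedtools must remain on PATH; the file names below are illustrative.

from BedTools import BedTools

bt = BedTools()
merged = bt.sort_and_merge_bed_file("contamination.bed")       # writes contamination.merged.bed
cleaned = bt.subtract_b_from_a(merged, "exclude_regions.bed")  # writes contamination.merged.subtracted.bed
print(bt.coverage_for_bed_file(cleaned))              # total bases covered
print(bt.coverage_for_bed_file_by_scaffold(cleaned))  # {scaffold: bases covered}

One caveat: the groupby-based coverage_for_bed_file_by_scaffold only tallies correctly when all lines for a scaffold are adjacent, since itertools.groupby groups consecutive runs and a later run would overwrite an earlier total; that holds for sorted input such as the output of sort_and_merge_bed_file.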
55 changes: 24 additions & 31 deletions bin/extract_contaminants_by_type.py
@@ -8,11 +8,9 @@
import re
import BedTools
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import gzip
import sys
import argparse
from pathlib import Path
import general_purpose_functions as gpf


def main():
@@ -28,29 +26,14 @@ def main():

if args.assembly_file == None:
assembly_file = re.sub("\.contamination", ".fa.gz", contamination_file)

# assembly_name_match = re.search('([^/]*?)\.contamination', contamination_file)
# assembly_name = assembly_name_match.group(1)
else:
assembly_file = args.assembly_file
# assembly_name_match = re.search('([^/]*?)\.(fa|fasta|fna)', assembly_file)
# assembly_name = assembly_name_match.group(1)
assembly_name = "assembly"

# If at first you don't succeed, try again with a .fasta suffix:
if not os.path.isfile(assembly_file):
assembly_file = re.sub("\.contamination", ".fasta.gz", contamination_file)

# assembly_dir = '/lustre/scratch123/tol/teams/grit/jt8/contamination_screen/assemblies/'

# if not os.path.isfile(assembly_file):
# # Fall through to looking in the assemblies dir
# assembly_file = assembly_dir + assembly_name + '.fa'

# if not os.path.isfile(assembly_file):
# # ... and then try a .fasta extension
# assembly_file = assembly_dir + assembly_name + '.fasta'

if not os.path.isfile(assembly_file):
print(assembly_file + " does not exist")
else:
@@ -83,11 +66,10 @@ def main():
coord_list_for_section_and_sequence[section_name][fields[0]].append(coords)
coord_list_for_section_and_sequence["ALL"][fields[0]].append(coords)

# margins = [0,10000]
# margins = [0,10000]
margins = [0]

# Write BED file
# extracts_dir = '/lustre/scratch123/tol/teams/grit/jt8/contamination_screen/contaminated_extracts/'
for section_name in coord_list_for_section_and_sequence:
# Only print sections that have data- but always produce an "ALL" file
if len(coord_list_for_section_and_sequence[section_name]) > 0 or section_name == "ALL":
@@ -101,10 +83,9 @@

# Get lengths
length_file = args.extracts_dir + assembly_name + ".lengths"
# if not os.path.isfile(length_file):

# Replaced the exonerate fastalength binary with a Python script (EA)
os.system("fastalength.py " + assembly_file + " > " + length_file)
fastalength(assembly_file, length_file)
length_for_sequence = parse_fastalength_file(length_file)

coverage_file_base_name = args.extracts_dir + assembly_name + "." + output_section_name
@@ -121,11 +102,8 @@
else:
handle = open(assembly_file, "rt")

# with gzip.open(assembly_file, "rt") as handle:
for record in SeqIO.parse(handle, "fasta"):
# print(record.id)
if record.id in merged_coord_list_for_sequence:
# print('Extracting')
for coord_pair in merged_coord_list_for_sequence[record.id]:
extracted_sequence = extract_sequence(record, coord_pair[0], coord_pair[1], margin)
SeqIO.write([extracted_sequence], output_handle, "fasta")
@@ -136,15 +114,20 @@ def write_coverage_file(coverage_file_base_name, merged_bed_file, length_for_seq
# Record coverage
merged_coord_list_for_sequence = bedtools.bed_to_coords(merged_bed_file)
coverage_for_sequence = bedtools.coverage_for_bed_file_by_scaffold(merged_bed_file)

percentage_coverage_for_sequence = {}
for sequence in coverage_for_sequence:
percentage_coverage_for_sequence[sequence] = (
coverage_for_sequence[sequence] / length_for_sequence[sequence] * 100
)
if sequence in length_for_sequence:
percentage_coverage_for_sequence[sequence] = (
coverage_for_sequence[sequence] / length_for_sequence[sequence] * 100
)
else:
# Handle missing sequence length information
percentage_coverage_for_sequence[sequence] = 0

coverage_threshold = 1

# Take the stem name for the file, and print to various files, with c coverage threshod, without, and per line
# Take the stem name for the file, and print to various files, with c coverage threshold, without, and per line

filtered_coverage_scaffold_file = coverage_file_base_name + ".filtered_scaffold_coverage.bed"
unfiltered_coverage_scaffold_file = coverage_file_base_name + ".unfiltered_scaffold_coverage.bed"
@@ -212,6 +195,18 @@ def write_coverage_file(coverage_file_base_name, merged_bed_file, length_for_seq
)


def fastalength(input_path, length_file):
fasta_data = gpf.read_fasta_in_chunks(input_path)
with open(length_file, "w") as length_file_handle:
for header, seq in fasta_data:
if header is not None: # Check if header is not None
if " " in header:
header = header.replace(" ", "_")
result = f"{len(seq)} {header}"
print(result)
length_file_handle.write(result + "\n")


# Parse fastalength file
def parse_fastalength_file(length_file):
length_for_sequence = {}
@@ -239,8 +234,6 @@ def extract_sequence(seq_record, start, end, margin):
else:
start = 1

# print("Slicing: " + str(start) + '-' + str(end))

extracted_slice = seq_record[start:end]
extracted_slice.id = seq_record.id + ":" + str(start) + "-" + str(end)
extracted_slice.name = extracted_slice.id
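
The new fastalength() function replaces the exonerate fastalength binary with pure Python built on general_purpose_functions.read_fasta_in_chunks. A standalone equivalent using Bio.SeqIO, which the script already imports, might look like the sketch below; fastalength_seqio is a hypothetical name, and it assumes an uncompressed FASTA.

from Bio import SeqIO

def fastalength_seqio(input_path, length_file):
    # Emit one "<length> <header>" line per record, with spaces in headers
    # replaced by underscores, matching what parse_fastalength_file reads back.
    with open(length_file, "w") as out:
        for record in SeqIO.parse(input_path, "fasta"):
            header = record.description.replace(" ", "_")
            out.write(f"{len(record.seq)} {header}\n")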
20 changes: 8 additions & 12 deletions bin/gc_content.py
@@ -2,8 +2,8 @@
"""
Script for finding the GC content of each sequence in a multiFASTA file
Written by Eerik Aunin @eeaunin
Originally Written by Eerik Aunin @eeaunin
Refactored by Yumi Sims @yumisims
Adapted by Damon-Lee Pointon @DLBPointon
"""

@@ -13,16 +13,12 @@

def main(fasta_path):
fasta_data = gpf.read_fasta_in_chunks(fasta_path)
for header, seq in fasta_data:
header = header.split()[0]
seq = seq.upper()
gc_content = None
gc_count = seq.count("G") + seq.count("C")
seq_len = len(seq)
if seq_len > 0:
gc_content = gc_count / seq_len
gc_content_string = "{:.6f}".format(gc_content)
print("{}\t{}".format(header, gc_content_string))
results = [
(header.split()[0], "{:.6f}".format((seq.upper().count("G") + seq.upper().count("C")) / max(1, len(seq))))
for header, seq in fasta_data
]
for header, gc_content_string in results:
print(f"{header}\t{gc_content_string}")


if __name__ == "__main__":
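
A worked example of the refactored calculation. Note the max(1, len(seq)) guard: an empty sequence now reports 0.000000 instead of being skipped.

seq = "GATTACAGC"  # 2 G and 2 C in 9 bases
gc_content = (seq.upper().count("G") + seq.upper().count("C")) / max(1, len(seq))
print("{:.6f}".format(gc_content))  # prints 0.444444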
17 changes: 5 additions & 12 deletions bin/organelle_contamination_recommendation.py
@@ -4,27 +4,20 @@

import csv
import re
import sys
import argparse


def main():
parser = argparse.ArgumentParser(description="Generate text for ENA submission")
parser.add_argument("--input", type=str, help="List of input files (space seperated list)", default=None)
parser = argparse.ArgumentParser(
description="Create an organellar contamination report text file based on the organellar contamination BED file"
)
parser.add_argument("--input", type=str, help="Input BED file", default=None)
parser.add_argument("--output", type=str, help="Output recommendation", default=None)
parser.add_argument("-v", action="version", version="1.0")
args = parser.parse_args()

# Filters through the list of files that nextflow passes in and ID's the one we want
files_list = args.input
file_for_use = next(
(file for file in files_list.split() if "assembly.ALL.unfiltered_scaffold_coverage.bed" in file), None
)
if file_for_use is None:
sys.exit(1)

recommendation_handle = open(args.output, "w")
with open(file_for_use) as bed_handle:
with open(args.input) as bed_handle:
bed_csv_reader = csv.reader(bed_handle, delimiter="\t")
for field_set in bed_csv_reader:
scaffold = field_set[0]
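
The script now receives the organellar-coverage BED directly via --input, instead of filtering it out of the space-separated file list Nextflow used to pass in. A sketch of the resulting read loop; the three-column BED layout is an assumption, as only the scaffold column is confirmed by the code above.

import csv

def read_coverage_bed(bed_path):
    # Yield (scaffold, start, end) tuples from a tab-separated BED file.
    with open(bed_path) as bed_handle:
        for field_set in csv.reader(bed_handle, delimiter="\t"):
            yield field_set[0], int(field_set[1]), int(field_set[2])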