-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #38 from sanger-tol/dp24_organellar
Dp24 organellar
- Loading branch information
Showing 27 changed files with 943 additions and 197 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,79 +1,95 @@ | ||
import sys | ||
import re | ||
import os | ||
import pybedtools | ||
from collections import defaultdict | ||
from itertools import groupby | ||
|
||
|
||
""" | ||
Script from James Torrance (jt8) and Eerik Aunin (ea10) | ||
refactored by Yumi sims (yy5) | ||
""" | ||
|
||
|
||
class BedTools: | ||
# bedtools = '/software/grit/tools/bedtools-2.29.0/bin/bedtools' | ||
bedtools = "bedtools" | ||
def __init__(self): | ||
pass | ||
|
||
def sort_and_merge_bed_file(self, bed_file): | ||
sorted_bed_file = re.sub("\.bed$", ".sorted.bed", bed_file) | ||
merged_bed_file = re.sub("\.bed$", ".merged.bed", bed_file) | ||
sort_command = self.bedtools + " sort -i " + bed_file + " > " + sorted_bed_file | ||
print(sort_command) | ||
os.system(sort_command) | ||
merge_command = self.bedtools + " merge -i " + sorted_bed_file + " > " + merged_bed_file | ||
print(merge_command) | ||
os.system(merge_command) | ||
return merged_bed_file | ||
def sort_and_merge_bed_file(self, input_bed_file): | ||
bed = pybedtools.BedTool(input_bed_file) | ||
# Get the output merged BED file name without the file extension | ||
output_merged_bed_file = os.path.splitext(os.path.basename(input_bed_file))[0] + ".merged.bed" | ||
# Sort the BED file | ||
sorted_bed = bed.sort() | ||
# Merge the sorted BED file | ||
merged_bed = sorted_bed.merge() | ||
# Save the merged BED file | ||
merged_bed.saveas(output_merged_bed_file) | ||
return output_merged_bed_file | ||
|
||
def merge_bed_b_into_a(self, bed_file_1, bed_file_2): | ||
# We're merging file_2 into file_1 | ||
os.system("cat " + bed_file_2 + " >> " + bed_file_1) | ||
merged_bed_file = sort_and_merge_bed_file(bed_file_1) | ||
# Call the sort_and_merge_bed_file method to sort and merge the resulting file | ||
merged_bed_file = self.sort_and_merge_bed_file(bed_file_1) | ||
os.system("mv " + merged_bed_file + " " + bed_file_1) | ||
|
||
def subtract_b_from_a(self, bed_file_1, bed_file_2): | ||
# We subtract file 2 from file 1 | ||
subtracted_bed_file = re.sub("\.bed$", ".subtracted.bed", bed_file_1) | ||
subtract_command = ( | ||
self.bedtools + " subtract -a " + bed_file_1 + " -b " + bed_file_2 + " > " + subtracted_bed_file | ||
) | ||
print(subtract_command) | ||
os.system(subtract_command) | ||
return subtracted_bed_file | ||
if not os.path.exists(bed_file_1) or not os.path.exists(bed_file_2): | ||
raise FileNotFoundError("One or both of the input BED files do not exist.") | ||
output_file_name = os.path.splitext(os.path.basename(bed_file_1))[0] + ".subtracted.bed" | ||
|
||
def coverage_for_bed_file(self, bed_file): | ||
coverage = 0 | ||
# Load the BED files | ||
bed1 = pybedtools.BedTool(bed_file_1) | ||
bed2 = pybedtools.BedTool(bed_file_2) | ||
# Subtract bed_file_2 from bed_file_1 | ||
subtracted_bed = bed1.subtract(bed2) | ||
subtracted_bed.saveas(output_file_name) | ||
return output_file_name | ||
|
||
def coverage_for_bed_file(self, bed_file): | ||
with open(bed_file, "r") as bed_handle: | ||
for line in bed_handle: | ||
line = line.rstrip() | ||
fields = re.split("\s+", line) | ||
coverage += int(fields[2]) - int(fields[1]) | ||
|
||
lines = [line.strip() for line in bed_handle] | ||
coverage = sum(int(fields[2]) - int(fields[1]) for line in lines for fields in [re.split("\s+", line)]) | ||
return coverage | ||
|
||
def coverage_for_bed_file_by_scaffold(self, bed_file): | ||
coverage_for_scaffold = {} | ||
|
||
with open(bed_file, "r") as bed_handle: | ||
for line in bed_handle: | ||
line = line.rstrip() | ||
fields = re.split("\s+", line) | ||
if fields[0] not in coverage_for_scaffold: | ||
coverage_for_scaffold[fields[0]] = 0 | ||
coverage_for_scaffold[fields[0]] += int(fields[2]) - int(fields[1]) | ||
|
||
fields_list = [re.split("\s+", line.strip()) for line in bed_handle] | ||
coverage_for_scaffold = {} | ||
for chrom, interval_list in groupby(fields_list, key=lambda x: x[0]): | ||
total_coverage = 0 | ||
for _, start_str, end_str in interval_list: | ||
try: | ||
start, end = int(start_str), int(end_str) | ||
total_coverage += end - start | ||
except ValueError: | ||
pass | ||
coverage_for_scaffold[chrom] = total_coverage | ||
return coverage_for_scaffold | ||
|
||
def coords_to_bed(self, coord_list_for_sequence, bed_file): | ||
bed_handle = open(bed_file, "w") | ||
for sequence in coord_list_for_sequence: | ||
for coord_pair in coord_list_for_sequence[sequence]: | ||
bed_handle.write("\t".join([sequence, str(coord_pair[0] - 1), str(coord_pair[1])]) + "\n") | ||
bed_handle.close() | ||
with open(bed_file, "w") as bed_handle: | ||
lines = [ | ||
f"{sequence}\t{start - 1}\t{end}\n" | ||
for sequence, coord_pairs in coord_list_for_sequence.items() | ||
for start, end in coord_pairs | ||
] | ||
bed_handle.writelines(lines) | ||
|
||
def bed_to_coords(self, bed_file): | ||
coord_list_for_sequence = {} | ||
coord_list_for_sequence = defaultdict(list) | ||
|
||
with open(bed_file, "r") as bed_handle: | ||
for line in bed_handle: | ||
line = line.rstrip() | ||
fields = re.split("\s+", line) | ||
if fields[0] not in coord_list_for_sequence: | ||
coord_list_for_sequence[fields[0]] = [] | ||
coord_list_for_sequence[fields[0]].append([int(fields[1]) + 1, int(fields[2])]) | ||
fields = line.split() | ||
if len(fields) >= 3: | ||
try: | ||
seq_name, start, end = fields[0], int(fields[1]), int(fields[2]) | ||
coord_list_for_sequence[seq_name].append([start + 1, end]) | ||
except (ValueError, IndexError): | ||
# Handle invalid lines or non-numeric fields | ||
pass | ||
|
||
return coord_list_for_sequence | ||
return dict(coord_list_for_sequence) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.