-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathComparison.py
159 lines (139 loc) · 6.42 KB
/
Comparison.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""This file is for analyzing sequences across organisms. Use this to find repeated sequences in a group of organisms.
To run, type the list of organisms you want to run in the list object directly below.
WARNING: If you want comparisons to contain annotations, you will need to have GenBank files for all the organisms
labelled in the directory."""
# ---------------------------------- CODE BELOW CAN BE IGNORED BY USER ---------------------------------------------- #
import os
from Algorithms import SeqTranslate
import itertools
class Compare_Orgs:
def __init__(self, output_path, base_org_path, base_org, endo, other_genomes):
# initialize SeqTranslate object
self.ST = SeqTranslate()
self.output_path = output_path
# my_orgs contains just the
self.organisms = other_genomes
self.organisms.append(base_org)
self.organisms = sorted(self.organisms)
self.db_path = base_org_path[:base_org_path.find(base_org)]
# Dictionary of dictionaries. Key1: generic total sequence Key2: org Value: position
self.searchableseqs = {}
# Container that stores all the sequences seen the combination of organisms defined by the key
# An example key would be (sce, yli) for the shared sequences between S.cerevisiae and Y.lipolytica
self.buckets = {}
# Intitialize the self.buckets container to contain the tuple of every organism subset
for i in range(2, len(self.organisms)):
for subset in itertools.combinations(self.organisms, i):
self.buckets[subset] = []
print(subset)
self.endo = endo
# The object that is iterated over to decompress the output into readable form
self.compressed_output = {}
# Generates the sequence lists
for org in self.organisms:
print(org)
self.make_lists(org)
# Runs the comparison
self.create_comparison()
self.write_to_file()
# Takes an organism and parses the target data into positions and repeated sequences containers
def make_lists(self, org):
name1 = self.db_path + org + self.endo + ".cspr"
f = open(name1, 'r')
curchrom = int()
while True:
position = f.readline()
if position.find("CHROMOSOME") != -1:
curchrom = position[position.find("#")+1:-1]
print(curchrom)
else:
if position[0:-1] == "REPEATS":
break
# adds to the positions container the unique position with organism, and chromosome as keys
line = position[:-1].split(",")
# change line into generic (no "+" or "-" change to generic .)
totseq = self.ST.to_generic_compressed(line[1])
self.add_to_sequence_matrix(totseq, org, curchrom, line[0])
while True:
seedseq = f.readline()[:-1]
if seedseq.find("END_OF_FIL") != -1:
break
taillocs = f.readline().split('\t')[:-1]
for item in taillocs:
loctup = item.split(',')
totseq = self.ST.to_generic_compressed([seedseq,loctup[2][1:]])
self.add_to_sequence_matrix(totseq, org, loctup[0], loctup[1])
f.close()
# Takes in the variables of a sequence including what organism it is found on and adds it to the dict of dicts
# named:self.searchableseqs
def add_to_sequence_matrix(self, totseq, org, chrom, location):
if totseq in self.searchableseqs.keys():
# already seen this organism and sequence
if org in self.searchableseqs[totseq].keys():
self.searchableseqs[totseq][org].append((chrom, location))
# already seen this sequence but not this sequence in the organism
else:
self.searchableseqs[totseq][org] = []
self.searchableseqs[totseq][org].append((chrom, location))
# new organism and new sequence
else:
self.searchableseqs[totseq] = {}
self.searchableseqs[totseq][org] = []
self.searchableseqs[totseq][org].append((chrom, location))
def int_to_char(self, i):
switcher = {
0: 'A',
1: 'T',
2: 'C',
3: 'G'
}
return switcher[i]
def revcom(self, sequence):
revseq = ""
change = {'A':'T',
'T':'A',
'G':'C',
'C':'G',
'N':'N'}
for nt in sequence:
if nt in change:
rnt = change[nt]
else:
rnt = nt
revseq = rnt + revseq
return revseq
def create_comparison(self):
# Put every sequence in the appropriate bucket
tempdict = dict()
for sequence in self.searchableseqs:
# Look for the set of organisms containing this sequence
if len(self.searchableseqs[sequence].keys()) > 1:
# Make sure the tuple is in the right order
orgs = self.searchableseqs[sequence].keys()
orgs = tuple(sorted(orgs))
# iterate through all the sequences contained in each organism
for org in self.searchableseqs[sequence].keys():
tempdict[org] = []
for location in self.searchableseqs[sequence][org]:
tempdict[org].append(location)
insert_tup = (sequence, tempdict)
tempdict = {}
# contains a list of tuples with the sequence then short dictionary of organisms containing sequence
self.buckets[orgs].append(insert_tup)
def write_to_file(self):
filename = self.output_path + "compare_"
for org in self.organisms:
filename += org + "_"
filename += self.endo + ".txt"
f = open(filename, 'w')
for key in self.buckets:
f.write(str(key) + " " + str(len(self.buckets[key])) + "\n")
for seq in self.buckets[key]:
f.write(self.ST.decompress64(seq[0], True) + "\n")
for suborg in seq[1]:
f.write(str(suborg) + ":")
for locs in seq[1][suborg]:
f.write(str(locs[0]) + "," + str(self.ST.decompress64(locs[1])) + "\t")
f.write("\n")
f.write("\n")
f.close()