-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmdg_dt.py
executable file
·510 lines (444 loc) · 16.5 KB
/
mdg_dt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
#!/usr/bin/python
# coding: utf-8
import sys
"""
base-pair relationships
"""
"""
default state
"""
CT_IGN = 0
"""
base pairs x, y form a stack/helix
"""
CT_STA = 1
"""
base pair x, y form a nested structure (bulge/iloop)
"""
CT_NES = 2
"""
base pair y follows x
(mloop/connecting strands between stems)
"""
CT_SEP = 3
"""
crossing base pairs (shouldn't happen
"""
CT_CRO = 4
"""
base pair x follows y, order screwed
(won't be dealt with)
"""
CT_PRE = 5
DEFOUT = open('/dev/null', 'w')
###
def check_relationship(contact1, contact2):
# print contact1, contact2
""" undecided """
crel = CT_IGN
sid1, sid2 = contact1[0], contact1[1]
cid1, cid2 = contact2[0], contact2[1]
n1, n2 = True, True
next1, next2 = sid1 + 1, sid2 -1
if next1 == cid1 and next2 == cid2:
""" stacking/touching """
if n1 and n2:
crel = CT_STA
else:
crel = CT_NES
pass
elif (next1 <= cid1 and next2 > cid2) or \
(next1 < cid1 and next2 >= cid2):
""" separate, in same stack-sequence """
crel = CT_NES
elif cid1 > sid2 and cid2 > sid2:
""" disjoint (separate, different stack-sequences) """
crel = CT_SEP
elif cid1 < sid2 and cid2 > sid2:
""" crossing """
crel = CT_CRO
elif next1 <= sid1 - 1 and next2 <= sid2 + 1:
""" preceeding pair """
crel = CT_PRE
pass
if crel == CT_IGN: print contact1, contact2
return crel
###
class MDG_Stem :
###
def __init__(self, contact=None, parent=None):
self.contacts = []
self.children = []
self.ss_contacts = []
self.te_contacts = []
self.parent = parent
if contact != None :
self.ss_contacts.append(contact)
pass
if parent != None :
parent.add_child(self)
pass
return None
###
def grow(self, contact, insertBack=True):
if insertBack :
self.ss_contacts.append(contact)
else :
self.ss_contacts.insert(0, contact)
return 0
###
def add_child(self, stem):
self.children.append(stem)
return 0
###
def set_parent(self, stem, override=False):
set = False
if self.parent == None or override:
self.parent = stem
set = True
pass
return set
###
def opening_pair(self):
pair = None
if len(self.ss_contacts) != 0:
pair = self.ss_contacts[0]
pass
return pair
###
def closing_pair(self):
pair = None
if len(self.ss_contacts) != 0:
pair = self.ss_contacts[-1]
pass
return pair
###
def is_root(self):
return self.parent == None
###
def is_leaf(self) :
return len(self.children) == 0
###
def is_internal(self):
return not self.is_leaf()
###
def has_siblings(self):
return len( self.parent.children ) > 1
###
def size(self):
return len(self.ss_contacts)
def walk(self):
if not self.is_root():
print self.ss_contacts[0],
print self.ss_contacts[-1]
for child in self.children:
child.walk()
return None
def preorder(self):
pairs = []
if not self.is_root():
pairs.extend(self.ss_contacts)
for child in self.children:
pairs.extend(child.preorder())
return pairs
def add_tertiary_pair(self, pair):
self.te_contacts.append(pair)
return None
def has_tertiary_pairs(self):
# print 'HTE', len(self.te_contacts) > 0
return len(self.te_contacts) > 0
def has_tertiary_stem(self, min_stemsize=2):
self.te_contacts.sort()
i = 1
stem = self.te_contacts[0:1]
#print stem
#print self.te_contacts
while i < len(self.te_contacts):
p1, p2 = stem[-1], self.te_contacts[i]
#print p1, p2
if abs(p1[0]-p2[0]) == 1 and abs(p1[1]-p2[1]) == 1:
if len(stem) + 1 >= min_stemsize: return True
stem.append(p2)
else:
stem = [p2]
i += 1
pass
return False
###
def __repr__(self):
ret = 'Dummy Stem'
if len(self.ss_contacts) > 0:
ret = '[ %s - %s ], %i bp\n' % \
(self.ss_contacts[0], self.ss_contacts[-1],
len(self.ss_contacts))
pass
return ret
###
def get_stem_bases(self, contacts, max_stemsize=9999) :
bases = []
n = 0
for c in list(contacts):
bases.append(c[0])
bases.append(c[1])
n += 1
if n == max_stemsize : break
return bases
###
def get_unpaired_bases(self, start, end, mdic) :
bases = []
for i in range(start, end) :
if i in mdic : bases.append(i)
pass
return bases
###
def assemble(self, contacts, outp=DEFOUT, min_stemsize=2):
# print contacts
ss_contacts = []
nonss_contacts = []
active_stem = self
for c in contacts:
# print c
if True:
if active_stem and active_stem.is_root():
# print 'XXX', c
stem = MDG_Stem(c, active_stem)
outp.write('New root-child: %s\n' % str(stem))
active_stem = stem
# print active_stem
ss_contacts.append(c)
else:
acp = active_stem.closing_pair()
crel = check_relationship(acp, c)
# print 'CREL', crel, acp, c
if crel == CT_STA:
""" contact STAcks on active_stem """
outp.write('Growing %s\n' % active_stem)
active_stem.grow(c)
elif crel == CT_NES:
"""
contact is a NESted pair of active_stem
=> new stem
"""
if active_stem.size() >= min_stemsize:
"""
only stems >= min_stemsize are valid
(avoid single pairs)
"""
stem = MDG_Stem(c, active_stem)
outp.write('Growing %s %s\n' % (stem, active_stem))
active_stem = stem
ss_contacts.append(c)
else:
"""
active_stem is single pair
traceback to parent
"""
# weird format - why?
tmpc = [active_stem.closing_pair()]
stem = MDG_Stem(c, active_stem.parent)
# was -2 for whatever reason...
del active_stem.parent.children[-2]
active_stem = stem
self.te_contacts.append(tmpc[0])
ss_contacts.append(c)
pass
pass
elif crel == CT_CRO:
self.te_contacts.append(c)
pass
elif crel == CT_SEP:
"""
contact is SEParate from active_stem
(e.g. multiloop siblings)
"""
stem_done = False
if active_stem.size() < min_stemsize:
"""
active_stem is single pair
traceback to parent
"""
tmpc = [active_stem.closing_pair()]
par = active_stem.parent
# why -1 (s.above: -2)??
del par.children[-1]
self.te_contacts.append(tmpc[0])
active_stem = par
if not active_stem.is_root():
acp = active_stem.closing_pair()
crel = check_relationship(acp, c)
if crel == CT_NES:
"""
contact is a NESted pair of active_stem
"""
stem_done = True
stem = MDG_Stem(c, active_stem)
active_stem = stem
ss_contacts.append(c)
elif crel == CT_CRO:
"""
CT_SEP of child can CROss parent
crossing contacts => not secstr.
"""
stem_done = True
self.te_contacts.append(c)
pass
pass
pass
outp.write('Separate\n')
"""
CT_SEP of child can be separate from parent
=> traceback
"""
while not stem_done and active_stem:
if active_stem.is_root():
"""
contact is direct child of root
stop searching
"""
stem = MDG_Stem(c, active_stem)
outp.write('New root-child sep: %s\n' % stem)
active_stem = stem
stem_done = True
ss_contacts.append(c)
else:
""" continue searching """
active_stem = active_stem.parent
outp.write('Backtrace: %s\n' % active_stem)
if not active_stem.is_root():
acp = active_stem.closing_pair()
# print '----', acp, c
crel = check_relationship(acp, c)
if crel == CT_NES:
"""
contact is NESted pair
of (bt) active_stem
"""
stem = MDG_Stem(c, active_stem)
outp.write('Nested (sep): %s %s\n' % \
(stem, active_stem))
active_stem = stem
stem_done = True
ss_contacts.append(c)
elif crel == CT_CRO:
"""
contact CROsses (bt) active_stem
"""
stem_done = True
self.te_contacts.append(c)
pass
pass
else:
"""
we traced back to root
and didn't find a fitting stem
"""
stem_done = True
# version in mdg_dt.py
# nonss_contacts.append(c)
## but shouldn't it be
stem = MDG_Stem(c, active_stem)
active_stem = stem
ss_contacts.append(c)
# ?
pass
pass
pass
pass
pass
pass
else:
# --> this is never reached!!!
""" contact is non canonical """
self.te_contacts.append(c)
pass
pass # for c in contacts
""" check if last stem is at least of min_stemsize """
if active_stem.size() < min_stemsize:
self.te_contacts.append(active_stem.closing_pair())
if not active_stem.is_root():
del active_stem.parent.children[-1]
pass
return 0
###
def find_all_motifs(self, mol=None, min_stemsize=2, max_stemsize=1,
outp=sys.stdout):
motifs = []
if self.is_leaf() and not self.is_root():
motifs = [['Hairpin', \
(self.closing_pair()[0], self.closing_pair()[1])]]
pass
else:
if self.is_root():
pass
else:
if len(self.children) == 1:
child = self.children[0]
motifs = [['Inner', (self.closing_pair()[0], \
child.opening_pair()[0]), \
(child.opening_pair()[1], \
self.closing_pair()[1])]]
pass
else:
child = self.children[0]
motifs = [['Multiloop',(self.closing_pair()[0], \
child.opening_pair()[0])]]
i = 0
while i < len(self.children) - 1:
j = i + 1
child1 = self.children[i]
child2 = self.children[i+1]
motifs[0] += [(child1.opening_pair()[1],
child2.opening_pair()[0])]
i += 1
pass
child = self.children[-1]
motifs[0] += [(child.opening_pair()[1], \
self.closing_pair()[1])]
pass
pass
for child in self.children:
motifs.extend(child.find_all_motifs(mol, min_stemsize, \
max_stemsize, outp))
pass
pass
return motifs
def parseBracketString(s):
stack = []
structure = []
for i, c in enumerate(s):
if c == "(":
stack.append(i)
elif c == ")":
try:
structure.append((stack.pop(-1), i))
except:
raise ValueError("Stack prematurely empty")
return sorted(structure)
###
def main(argv):
# contacts = [(0,15), (1,14), (2,13), (3,11), (4,10), (5,9)]
# contacts = [(0,11), (1,9), (2,8), (3,7)]
contacts = [(0,20), (1,19), (3,9), (4,8), (11,17), (12,16)]
# 012345678901234567890
# ((.((...)).((...)).))
contacts = list(parseBracketString(sys.argv[1]))
print contacts
""" this builds the tree """
x = MDG_Stem()
x.assemble(contacts, outp=DEFOUT)
for motif in x.find_all_motifs():
print(motif)
# print x.find_all_motifs()
"""
see walk() for in-order tree-traversal
each node is of type MDG_Stem and contains a list ss_contacts of base pairs
[(x,y),...].
opening_pair() and closing_pair() return the first resp. last pair.
The minimum stemsize is 2, single base pairs should be ignored.
"""
# x.walk()
#print
#for y in x.preorder():
# print y
return 0
if __name__ == '__main__': main(sys.argv[1:])