# TODO: Check correctness of this implementation; it might contain some basepair shifts due to 0- vs 1-based genome coordinates
# import m2p
import prep
from Bio.Seq import Seq
import numpy as np
import pandas as pd
import re
import pysam
import collections
import bisect as bs
fastaIdFormat='>Pr.Id:{};Pr.Wi:{};Pr.Wn:{}\n{}\n'
#referenceNameFormat='>RD:{};IN:{}'
def loadIni(iniFile):
""" Read data from file, put it into a dict
:param iniFile: takes a path to a tab separated file with one variable name and optionally
several values per line. Checks if the amount of variables in some lines match.
:returns: Dictionary where keys are based on the first column with values in a list.
"""
settings=dict()
with open(iniFile,'r') as iniFile:
for line in iniFile:
splitLine = line.split()
if len(splitLine) <= 1:
continue
settings[splitLine[0]] = [x for x in splitLine[1:] if x != '']
# Integer lists
for key in ['prm_start','prm_end','vp_start','vp_end','win_start','win_end']:
settings[key]=[int(x) for x in settings[key]]
# Check lists that should be of equal length
linked=[
['prm_seq','prm_start','prm_end'],
['re_name','re_seq'],
['win_start','win_end'],
['vp_name','vp_chr','vp_start','vp_end']
]
for listed in linked:
assert len(set([len(settings[x]) for x in listed])) == 1, 'Error: different lengths for linked data:'+','.join(str(x) for x in listed)
return settings
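
# Hedged usage sketch (added for illustration, not part of the original
# pipeline): loadIni() expects whitespace-separated 'key value...' lines.
# The ini content and coordinates below are hypothetical.
def _example_loadIni():
    import tempfile
    iniText = ('genome_build hg19\n'
               'vp_name MyVP\n'
               'vp_chr 7\n'
               'vp_start 110000000\n'
               'vp_end 110010000\n'
               'prm_seq ACGTACGTACGTACGT\n'
               'prm_start 110001000\n'
               'prm_end 110001016\n'
               're_name DpnII\n'
               're_seq GATC\n'
               'win_start 109000000\n'
               'win_end 111000000\n')
    with tempfile.NamedTemporaryFile(mode='w', suffix='.ini', delete=False) as tmp:
        tmp.write(iniText)
    settings = loadIni(tmp.name)
    assert settings['vp_start'] == [110000000]  # integer fields are converted
    assert settings['re_seq'] == ['GATC']       # everything else stays a string list
    return settings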
### makeprimerfa implementation ###
def splitStringTo(item,maxLen=50):
    """ Helper function to split a sequence into overlapping windows of maxLen
    basepairs, stepping half a window at a time.
    :param item: The input sequence string.
    :param maxLen: The window length in basepairs.
    :returns: List of overlapping window strings.
    """
    return [str(item[ind:ind+maxLen]) for ind in range(0, len(item), maxLen//2)]
def seqToFasta(sequence,baseId):
""" Helper function to create fasta sequences from primer information.
:param sequence: The actual primer sequence.
:param baseId: Identifier for the primer, used to make the fasta identifier.
:returns: A fasta string with a formatted sequence id.
:note: In case of leftover sequence the last bit is attached to the preceding chunk.
"""
targetLen = 50
split = splitStringTo(sequence,maxLen=targetLen)
if len(split) > 1:
while len(split[0]) != len(split[-1]):
popper = split.pop(-1)
            split[-1] = split[-1][:targetLen//2]+popper
outString = ''
for i,val in enumerate(split):
outString+=fastaIdFormat.format(
baseId, i+1, len(split),
val)
return outString
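
# Illustration of the windowing above (hedged; the 100 bp input is made up):
# a 100 bp primer region becomes three overlapping 50 bp windows stepping
# 25 bp (bp 0-50, 25-75 and 50-100), emitted as fasta records named
# '>Pr.Id:1;Pr.Wi:1;Pr.Wn:3' through '>Pr.Id:1;Pr.Wi:3;Pr.Wn:3'.
def _example_seqToFasta_windows():
    fasta = seqToFasta('A'*100, '1')
    assert fasta.count('>') == 3
    assert 'Pr.Wi:1;Pr.Wn:3' in fasta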
def getPrimerSeqs(dataInfo):
""" Function to check the primer sequences. Checks wether a sequence
occurs in the target region and is unique enough.
:param dataInfo: The settings dictionary created by loading the ini.
:returns: The provided sequences, both forward and reverse versions.
:TODO: Improve error message on faulty primes sequences
"""
primerSeqs = []
for i, val in enumerate(dataInfo['prm_seq']):
leftSeq = prep.getFastaSequence(
dataInfo['genome_build'][0],
dataInfo['vp_chr'][0],
dataInfo['prm_start'][i]-300,
dataInfo['prm_end'][i]).upper()
leftIndex = leftSeq.rfind(dataInfo['re_seq'][0])
        leftPrimerSeq = str(Seq(leftSeq[leftIndex:]).reverse_complement())
rightSeq = prep.getFastaSequence(
dataInfo['genome_build'][0],
dataInfo['vp_chr'][0],
dataInfo['prm_start'][i],
dataInfo['prm_end'][i]+300).upper()
rightIndex = rightSeq.find(dataInfo['re_seq'][0]) + len(dataInfo['re_seq'][0])
rightPrimerSeq = rightSeq[:rightIndex]
        assert max(leftPrimerSeq.find(dataInfo['prm_seq'][i]),
                   rightPrimerSeq.find(dataInfo['prm_seq'][i])) >= 0, 'Primer sequence is wrong\n'+str(dataInfo['prm_seq'][i])
        assert min(leftPrimerSeq.find(dataInfo['prm_seq'][i]),
                   rightPrimerSeq.find(dataInfo['prm_seq'][i])) < 0, 'Primer sequence is ambiguous\n'+str(dataInfo['prm_seq'][i])
if rightPrimerSeq.find(dataInfo['prm_seq'][i]) >= 0:
primerSeqs.append(rightPrimerSeq)
else:
primerSeqs.append(leftPrimerSeq)
return primerSeqs
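
# Side note on the restriction-site anchoring above: common 4-cutter sites
# such as DpnII's GATC are palindromic, so the site itself survives the
# reverse complement taken for the left-hand primer. A minimal check:
def _example_palindromic_site():
    assert str(Seq('GATC').reverse_complement()) == 'GATC'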
def writePrimerFasta(primerSeqs,targetFile):
""" Write the primer information as obtained through functions above to
a fasta file.
:param primerSeqs: The various primers as returned by getPrimerSeqs().
:param targetFile: The target output file to write to.
"""
with open(targetFile,'w') as outFasta:
for i,seq in enumerate(primerSeqs):
outFasta.write(seqToFasta(seq,str(i+1)))
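
# Hedged sketch of the makeprimerfa step as a whole (file names hypothetical):
#   settings = loadIni('run.ini')
#   primers = getPrimerSeqs(settings)
#   writePrimerFasta(primers, 'primers.fa')
# The windows in 'primers.fa' are then mapped against the raw reads, and the
# resulting alignments feed combinePrimers() below.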
### cleavereads implementation ###
# This class can perhaps be replaced by a pandas DataFrame later
class SimpleRead(object):
""" A custom class to represent a read in an easier format for our processing of primer
combining and cleaving.
"""
#fastaIdFormat='PR{}_{}-{}'#s06.fastaIdFormat
def __init__(self, read, prmLen):
primerDict = dict(item.split(":") for item in read.query_name.split(";"))
self.prmType = int(primerDict['Pr.Id'][-1])
        self.prmFlag = read.flag & ~(1<<8) # Clear the 'secondary alignment' bit (256) so all hits count as primary
self.readID = int(dict(item.split(":") for item in read.reference_name.split(";"))['Rd.Id'])
self.startAln = read.reference_start
self.endAln = read.reference_start + read.infer_query_length(always=True)
self.prmSize = prmLen[int(self.prmType)-1] # Should be length of the original primer?
# prm_len{prm_info(ai,1)}(prm_info(ai,8))
self.alnErr = [sum(x[1] for x in read.cigartuples if x[0] in [1,2])]
self.winInd = [int(primerDict['Pr.Wi'])]
self.winNum = int(primerDict['Pr.Wn'])
self.cigar = [read.cigarstring]
# Cigar string information
# M BAM_CMATCH 0
# I BAM_CINS 1
# D BAM_CDEL 2
# N BAM_CREF_SKIP 3
# S BAM_CSOFT_CLIP 4
# H BAM_CHARD_CLIP 5
# P BAM_CPAD 6
# = BAM_CEQUAL 7
# X BAM_CDIFF 8
def toList(self):
return [
self.prmType,
self.prmFlag,
self.readID,
self.startAln,
self.endAln,
self.prmSize,
self.alnErr,
self.winInd,
self.winNum,
self.cigar]
def tostring(self):
return '\t'.join(str(x) for x in [
self.prmType,
self.prmFlag,
self.readID,
self.startAln,
self.endAln,
self.prmSize,
self.alnErr,
self.winInd,
self.winNum,
self.cigar])
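
# Worked illustration of the name/flag parsing in SimpleRead (values made up):
# a hit named 'Pr.Id:2;Pr.Wi:3;Pr.Wn:4' is window 3 of 4 from primer 2, and
# masking with ~(1 << 8) clears the 'secondary alignment' bit so primary and
# secondary hits of the same primer carry the same flag.
def _example_simpleread_parsing():
    primerDict = dict(item.split(':') for item in 'Pr.Id:2;Pr.Wi:3;Pr.Wn:4'.split(';'))
    assert int(primerDict['Pr.Id'][-1]) == 2
    assert int(primerDict['Pr.Wi']) == 3 and int(primerDict['Pr.Wn']) == 4
    assert (272 & ~(1 << 8)) == 16  # 272 = reverse (16) + secondary (256)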
def groupPrimers(matchList):
""" Combine possibly overlapping primers found in a read. Needed to ensure
a large sensible cut is made rather than recutting within a primer matched
region. Used in combinePrimers().
:param matchList: A list of primer matched positions.
    :returns: A similar list where overlapping primers have been combined.
"""
# Split by primer type to ensure we only combine primers of the same type
for side in set(x.prmType for x in matchList):
thisSide = [x for x in matchList if x.prmType == side]
if len(thisSide) > 1:
# Merge latter listed reads into earlier reads if overlapping
for j in xrange(len(thisSide)-1, -1, -1):
for i in xrange(j-1, -1, -1):
curRead = thisSide[i]
nextRead = thisSide[j]
# Allow a slight offset to consider overlapping to identify
# win_i and win_i+2 correctly, and check if both are on the
# same strand
if curRead.endAln + 10 >= nextRead.startAln \
and curRead.startAln <= nextRead.endAln + 10 \
and curRead.prmFlag&(1<<4) == nextRead.prmFlag&(1<<4):
# Merge information
curRead.startAln = min(curRead.startAln,nextRead.startAln)
curRead.endAln = max(curRead.endAln,nextRead.endAln)
curRead.winInd.extend(nextRead.winInd)
curRead.alnErr.extend(nextRead.alnErr)
curRead.cigar.extend(nextRead.cigar)
# Remove last item in list of merged information parts
matchList.remove(nextRead)
# Merge further in subsequent loops, not in this one
break
return matchList
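
# Small numeric check of the merge slack above (coordinates made up): windows
# ending at 1040 and starting at 1048 still count as overlapping thanks to the
# 10 bp slack, which lets window i and window i+2 of one primer merge even if
# window i+1 was not matched.
def _example_merge_slack():
    curEnd, nextStart = 1040, 1048
    assert curEnd + 10 >= nextStart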
def findCuts(matchList):
""" Determines sensible cuts to make based on the primers mapped to a read.
    Sanity checks include the direction of primers (they need to point toward
    each other or toward an end of the read).
Used for combinePrimers().
:param matchList: A list of non-overlapping primer match positions within a read.
:returns: A list with information per kept cut position:
[start position, end position, index of start primer, index of end primer]
"""
# Ensure the list provided is sorted by start positions
matchList.sort(key=lambda x: x.startAln)
cutList = []
for x in matchList:
print x.tostring()
if matchList == []:
return cutList
# If the first primer points to the start, accept it
if matchList[0].prmFlag&(1<<4)==16:
cutList.append([None,matchList[0].endAln,0,matchList[0].prmType])
    # Any two subsequent primers pointing toward each other are accepted (for now)
for i in xrange(0,len(matchList)-1):
if matchList[i].prmFlag&(1<<4)==0 and \
matchList[i+1].prmFlag&(1<<4)==16: # and \#matchList[i].prmType != matchList[i+1].prmType:
cutList.append([matchList[i].startAln,matchList[i+1].endAln,matchList[i].prmType,matchList[i+1].prmType])
# The primerType check is left for later as otherwise reads with only bad primer combinations would
# be assumed to have no primers at all.
# TODO: Check if we need to add +1 to the end position above
# If we want to remove reads where 2 of the same primer type are
    # pointing toward each other we could return an empty list here
# If the last primer points to the end, accept it
if matchList[-1].prmFlag&(1<<4)==0:
cutList.append([matchList[-1].startAln,None,matchList[-1].prmType,0])
return cutList
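
# Strand convention used by findCuts() (SAM flag bit 0x10): a flag value of 16
# means the primer aligned to the reverse strand and points back toward the
# read start; 0 means forward, pointing toward the read end. A cut is kept
# between a forward primer and the next reverse primer, i.e. when they face
# each other.
def _example_strand_bit():
    assert 16 & (1 << 4) == 16   # reverse strand
    assert 0 & (1 << 4) == 0     # forward strand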
def combinePrimers(insam,prmLen,qualThreshold=.20):
""" Groups all detected primers in reads together.
:param insam: The sam file providing where primers mapped to reads.
:param prmLen: List with lengths of all primers.
:param qualThreshold: Remove all primer mappings with a mapping quality
below this value.
:returns: A list of positions where cuts should be made according to primers.
"""
samfile = pysam.AlignmentFile(insam, "rb")
# Create a dict with lists of SimpleRead using referenceName as key
myDict = collections.defaultdict(list)
for read in samfile:
myDict[int(dict(item.split(":") for item in read.reference_name.split(";"))['Rd.Id'])].append(SimpleRead(read,prmLen))
# Put data in referenceName sorted order
sortedKeys = myDict.keys()
sortedKeys.sort(key=int)
cutAllList = []
for key in sortedKeys:
# Remove low quality matches
matchedRead = [x for x in myDict[key] if (x.alnErr[0] / float(x.endAln-x.startAln) < qualThreshold)]
# Group overlapping primers on a read
grouped = groupPrimers(matchedRead)
# Determine where cuts should be based on mapped primers
cutAllList.append([key,findCuts(grouped)])
return cutAllList
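
# Note on qualThreshold above (hedged reading of the filter): a primer hit is
# kept when its insertions+deletions make up less than 20% of the aligned
# span. For example, 4 errors over a 50 bp alignment gives 0.08 and passes.
def _example_quality_filter():
    alnErr, startAln, endAln = 4, 100, 150
    assert alnErr / float(endAln - startAln) < .20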
def applyCuts(inFile,outFile,cutList,primerSeqs,cutDesc='Cr'):
""" Apply the previously determined primer positions to cleaving the actual
circles from the read fragments.
:param inFile: The fastq file with reads to be cleaved by their primers.
:param outFile: The output fastq file where reads are cleaved by primers.
:param cutList: As returned by combinePrimers().
:param primerSeqs: Not used.
:param cutDesc: The newly added tag to the read name when applying this action.
:TODO: Remove primerSeqs.
"""
readId=-1
readName=''
readSeq=''
cutIndex = 0
cut = cutList[cutIndex]
with open(inFile,'r') as fqFile, open(outFile,'w') as dumpFile:
        # Assuming both cutList and the fastq file are sorted by read id (int),
        # play catchup between the two lists
try:
            # Iteration is led by the fastq reads; the file contains at least as many entries as cutList
while True:
readName=fqFile.next().rstrip()
readSeq=fqFile.next().rstrip()
readPlus=fqFile.next().rstrip()
readPhred=fqFile.next().rstrip()
readId=int(dict(item.split(":") for item in readName[1:].split(";"))['Rd.Id'])
# Make cutList catch up if lagging
while cut[0] < readId and cutIndex < len(cutList)-1:
cutIndex += 1
cut = cutList[cutIndex]
cutId = 1
cutTmp = cut[:] # To be fair this is an ugly solution
# Both lists are now either aligned or inFile is ahead
if readId != cutTmp[0] or len(cutTmp[1]) == 0:
# No cuts can be made, take whole sequence instead
cutTmp[1] = [[0,len(readSeq),0,0]]
# Split sequence, dump information
for i,x in enumerate(cutTmp[1]):
# Ignore reads where primers were found but did not make sense
if x[2] > 0 and x[2] == x[3]:
continue
                    if x[0] is None:
                        x[0] = 0
                    if x[1] is None:
                        x[1] = len(readSeq)
# Extend the identifier
dumpFile.write(readName + ';' +
cutDesc + '.Id:' + str(cutId) + ';' +
cutDesc + '.SBp:' + str(x[0]) + ';' +
cutDesc + '.EBp:' + str(x[1]) + ';' +
cutDesc + '.SPr:' + str(x[2]) + ';' +
cutDesc + '.EPr:' + str(x[3]) +
'\n')#+':'+str(x[0])+'-'+str(x[1])
# Dump the actual sub sequence with primers
dumpFile.write(readSeq[x[0]:x[1]] + '\n')
dumpFile.write(readPlus+'\n')
                    # Dump the matching slice of the phred scores
dumpFile.write(readPhred[x[0]:x[1]] + '\n')
cutId += 1
except StopIteration:
pass
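
# Illustration of the identifier extension performed above (made-up values):
# a read named '@Fl.Id:1;Rd.Id:7;Rd.Ln:900' with one accepted cut from bp 0
# to bp 812 between primers 1 and 2 is written out as
#   @Fl.Id:1;Rd.Id:7;Rd.Ln:900;Cr.Id:1;Cr.SBp:0;Cr.EBp:812;Cr.SPr:1;Cr.EPr:2
# so later steps can recover both the mother read and the circle boundaries.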
### splitreads implementation ###
def findRestrictionSeqs(inFile,outFile,restSeqs,cutDesc='Fr'):
""" Cut circles into smaller fragments assuming cuts were made at all
restriction sites in the reference genome.
:param inFile: The fastq file with reads to be cut by their restriction sites.
:param outFile: The output fastq file where reads are cut by restriction sites.
:param restSeqs: List of restriction site basepair sequence(s) as provided in ini file.
:param cutDesc: The newly added tag to the read name when applying this action.
:returns: A list of all cuts made to all reads.
:TODO: Remove return or make it an option. Already writing to file.
"""
# compRestSeqs = [str(Seq(x).reverse_complement()) for x in restSeqs]
# restSeqs.extend(compRestSeqs)
#restSeqs.sort(key=lambda item: (-len(item), item))
reSeqs='|'.join(restSeqs)
cutList = []
with open(inFile,'r') as fqFile, open(outFile,'w') as dumpFile:
for read in fqFile:
cutId = 1
            readName=read.rstrip()
readSeq=fqFile.next().rstrip()
readPlus=fqFile.next().rstrip()
readPhred=fqFile.next().rstrip()
matches = [[x.start(), x.end(), x.group()] for x in (re.finditer(reSeqs, readSeq))]
thisCut = []
if matches != []:
thisCut.append((0,matches[0][0], # matches[0][1]
-1,restSeqs.index(matches[0][2])))
for i in xrange(len(matches)-1):
thisCut.append((matches[i][0],matches[i+1][0], # matches[0][1]
restSeqs.index(matches[i][2]),restSeqs.index(matches[i+1][2])))
thisCut.append((matches[-1][0],len(readSeq),
restSeqs.index(matches[-1][2]),-1))
else:
thisCut.append((0,len(readSeq),-1,-1))
# Split sequence, dump information
for i,x in enumerate(thisCut):
# Extend the identifier
#dumpFile.write(readName+';'+cutDesc+':'+str(i)+'\n')#+':'+str(x[0])+'-'+str(x[1])
dumpFile.write(readName + ';' +
cutDesc + '.Id:' + str(cutId) + ';' +
cutDesc + '.SBp:' + str(x[0]) + ';' +
cutDesc + '.EBp:' + str(x[1]) + ';' +
cutDesc + '.SRs:' + str(x[2]+1) + ';' +
cutDesc + '.ERs:' + str(x[3]+1) +
                '\n')
# Dump the actual sub sequence
dumpFile.write(readSeq[x[0]:x[1]]+'\n')
dumpFile.write(readPlus+'\n')
dumpFile.write(readPhred[x[0]:x[1]]+'\n')
cutId += 1
cutList.append([readName,thisCut])
return cutList
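
# Minimal check of the regex-based site scan above (sequence made up): the
# alternation built from restSeqs finds every DpnII site in a read.
def _example_restriction_scan():
    restSeqs = ['GATC']
    hits = [[m.start(), m.end(), m.group()]
            for m in re.finditer('|'.join(restSeqs), 'AAGATCTTGATCAA')]
    assert hits == [[2, 6, 'GATC'], [8, 12, 'GATC']]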
### extending mapped read parts to restriction sites ###
def findReferenceRestSites(refFile,restSeqs,lineLen=50):
""" Find restriction sites on the reference genome. Goal is to be able
to extend a fragment later on to match the restriction sites it was
mapped between, allowing us to check if there was actually a cut made
etc.
:param refFile: The reference genome reads were mapped to.
:param restSeqs: List of restriction site basepair sequence(s) as provided in ini file.
    :param lineLen: The number of basepairs per line in the fasta file.
    :returns: Dict mapping each chromosome name to a list of [start, end] site positions.
    :TODO: Auto detect lineLen; results will be wrong if the length does not match the file.
"""
reSeqs='|'.join(restSeqs)
restSitesDict = dict()
with open(refFile,'r') as reference:
curChrom = None
offset = -lineLen
matches = []
readSeq = ''
for line in reference:
if line[0] == '>':
                # Flush sites starting in the last line of the previous chromosome;
                # the buffer start lies one lineLen behind the running offset here
                matches.extend([[x.start()+offset-lineLen, x.end()+offset-lineLen] for x in re.finditer(reSeqs, readSeq) if x.start() >= lineLen])
readSeq = 'N'*lineLen*2
offset = -lineLen
matches = []
restSitesDict[line[1:].rsplit()[0]] = matches
else:
readSeq = readSeq[lineLen:]+line.rsplit()[0].upper()
matches.extend([[x.start()+offset, x.end()+offset] for x in (re.finditer(reSeqs, readSeq)) if x.start() < lineLen])
offset += lineLen
    # Flush sites starting in the last line of the final chromosome
    matches.extend([[x.start()+offset-lineLen, x.end()+offset-lineLen] for x in re.finditer(reSeqs, readSeq) if x.start() >= lineLen])
return restSitesDict
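
# Why the two-line buffer above: a restriction site can straddle a fasta line
# break. A hedged self-check with a tiny made-up reference (lineLen=4 instead
# of the usual 50): GATC spans the 'AAGA'/'TCAA' boundary and is still found
# at absolute position 2.
def _example_reference_sites():
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fa', delete=False) as tmp:
        tmp.write('>chrT\nAAGA\nTCAA\n')
    sites = findReferenceRestSites(tmp.name, ['GATC'], lineLen=4)
    assert sites['chrT'] == [[2, 6]]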
### combine and export data for plotting tools ###
def mapToRefSite(refSiteList,mappedPos):
""" Determine most likely restriction sites on the reference genome causing
the cuts at the end of the fragments.
:param refSiteList: Restriction sites on a single chromosome.
:param mappedPos: Start and end positions of the read fragment on that chromosome.
:returns: Indexes of reference sites and their bp positions.
"""
# Use bisect implementation to quickly find a matching position
pos = bs.bisect_left(refSiteList,mappedPos)
refLen = len(refSiteList)-1
# Don't bother beyond the last position in the list
pos = min(pos, refLen)
# Move start and end positions to match our heuristic
left = pos
right = pos
while refSiteList[left][0] >= mappedPos[0] + 10 and left > 0:
left -= 1
while refSiteList[right][1] <= mappedPos[1] - 10 and right < refLen:
right += 1
return [left, right, refSiteList[left][0], refSiteList[right][1]]
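
# Worked example of the heuristic above (site coordinates made up): a fragment
# mapped from 102 to 902 is flanked by the first and third site, allowing for
# the 10 bp tolerance at either end.
def _example_mapToRefSite():
    refSites = [[100, 104], [500, 504], [900, 904]]
    assert mapToRefSite(refSites, [102, 902]) == [0, 2, 100, 904]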
def exportToPlot(settings,restrefs,insam,uniqid=['Rd.Id','Cr.Id'],minqual=20):
""" Exports the processed data to a pandas dataframe object, as well as a handy
format to filter circles with overlapping locations.
:param settings: The dict created by loading the ini file.
    :param restrefs: The restriction site dictionary as created by findReferenceRestSites().
    :param insam: The final mapped data by BWA.
    :param uniqid: A new unique ID per circle+read is created; one may combine more columns.
    :param minqual: Any read with a mapping quality less than this is rejected.
    :returns:
        restrefs: A dictionary telling per restriction site region what reads mapped there,
        byReads: A dictionary telling per read id what restriction site regions were covered,
        pdFrame: Pandas dataframe with all mapped read information.
"""
#insam = sys.argv[1]
samfile = pysam.AlignmentFile(insam, "rb")
prevRead = samfile.next()
prevResult = [-1,-1]
prevID = ''
curID = ''
curStack = []
curSplit = None
# byReads = []
byReads = collections.defaultdict(list)
headers = []
readIDs = []
readInfos = []
prevCombo = []
curCombo = []
curID = 0
print restrefs.keys()
for read in samfile:
# Treat main read + primer cleave information together as unique read id
curSplit = [item.split(":") for item in read.query_name.split(";")]
curDict = dict(curSplit)
if not read.is_unmapped and read.mapping_quality >= minqual:
if read.reference_name not in restrefs:
continue
result = mapToRefSite(restrefs[read.reference_name],[read.reference_start, read.reference_start + read.infer_query_length(always=True)])
#curID = ';'.join([curDict[x] for x in uniqid])
# Create a 'unique' integer for the combination of values that defines a specific circle
curCombo = [curDict[x] for x in uniqid]
if curCombo != prevCombo:
curID += 1
prevCombo = curCombo
#print curID
# Determine soft clipped basepairs at start and end of mapping
leftSkip = 0
rightSkip = 0
if read.cigartuples[0][0] == 4:
leftSkip = read.cigartuples[0][1]
if read.cigartuples[-1][0] == 4:
rightSkip = read.cigartuples[-1][1]
curInfo = [
read.reference_name,
read.reference_start,
read.reference_start + read.infer_query_length(always=True),
read.is_reverse,
leftSkip,
rightSkip,
result[0],
result[1],
result[2],
result[3],
True
]
# TODO are we missing out on a sanity check here? Say, sites 1,2,1 for example?
# If two subsequent reads:
# were mapped to the same chromosome,
# and to the same strand,
# and with at least one overlapping restriction site
# and have the same mother read...
if prevRead.reference_id == read.reference_id \
and prevRead.is_reverse == read.is_reverse \
and result[0] <= prevResult[1] \
and result[1] >= prevResult[0] \
and curID == prevID:
curStack.append((read,result))
curInfo[-1] = False
else:
if curStack != []:
for i in range(min([x[1][0] for x in curStack]), max([x[1][1] for x in curStack])+1):
restrefs[prevRead.reference_name][i].append((prevID,prevRead.is_reverse))
curStack = [(read,result)]
readIDs.append(curID)
readInfos.append([int(x[1]) for x in curSplit] + curInfo)
prevRead = read
prevResult = result
prevID = curID
# Once more to empty the stack
if len(curStack) > 0:
for i in range(min([x[1][0] for x in curStack]), max([x[1][1] for x in curStack])+1):
restrefs[prevRead.reference_name][i].append((prevID,prevRead.is_reverse))
print curSplit
headers = [x[0] for x in curSplit]
headerConvert = {
'Fl.Id' : 'FileId',
'Rd.Id' : 'ReadId',
'Rd.Ln' : 'ReadLen',
'Cr.Id' : 'CircleId',
'Cr.SBp' : 'CircleStartBp',
'Cr.EBp' : 'CircleEndBp',
'Cr.SPr' : 'CircleStartPr',
'Cr.EPr' : 'CircleEndPr',
'Fr.Id' : 'FragmentId',
'Fr.SBp' : 'FragmentStartBp',
'Fr.EBp' : 'FragmentEndBp',
'Fr.SRs' : 'FragmentStartRes',
'Fr.ERs' : 'FragmentEndRes',
}
for i,val in enumerate(headers):
headers[i] = headerConvert[val]
headers.extend(['AlnChr','AlnStart','AlnEnd','AlnStrand','AlnSkipLeft','AlnSkipRight','ExtStartId','ExtEndId','ExtStart','ExtEnd','ExtLig'])
# TODO add AlnQual
pdFrame = pd.DataFrame(readInfos, index=readIDs, columns=headers)
for key in restrefs:
curList = restrefs[key]
keyLen = len(curList)-1
# Turn restriction site locations into meaningful regions
for i in xrange(0,keyLen):
            curList[i][0] = sum(curList[i][:2])//2
            curList[i][1] = sum(curList[i+1][:2])//2
# Remove empty values from the dataset
for x in xrange(keyLen,-1,-1):
if len(curList[x]) <= 2:
curList.pop(x)
else:
curList[x] = ((curList[x][0],curList[x][1]),curList[x][2:])
# Make links from read indexes to regions
for i,val in enumerate(curList):
for x in val[1]:
byReads[x[0]].append((key,i))
# pcrDupSubSet = set()
# for key in restrefs:
# curList = restrefs[key]
# for restArea in curList:
# if not (key == settings['vp_chr'] and
# restArea[0][0] < settings['win_end'] and
# restArea[0][1] > settings['win_start']):
# if len(curList[1]) > 1:
# for read in curList[1]:
# pcrDupSubSet.add(read)
# print len(pcrDupSubSet)
return restrefs,byReads,pdFrame
#np.savez_compressed(sys.argv[3],byregion=restrefs,byread=dict(byReads))
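
# Hedged sketch tying the export and duplicate marking together (file names
# hypothetical):
#   settings = loadIni('run.ini')
#   restrefs = findReferenceRestSites('genome.fa', settings['re_seq'])
#   byRegion, byRead, frame = exportToPlot(settings, restrefs, 'mapped.bam')
#   dups = findDuplicates(settings, byRead, byRegion)
#   frame = frame[~frame.index.isin(dups)]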
def findDuplicates(settings,byRead,byRegion):
""" Annotate circles with information on whether they are likely duplicates.
Duplicates are determined by overlapping fragments in regions other than
the viewport.
:param settings: The dict created by loading the ini file.
    :param byRead: A dictionary telling per read id what restriction site regions
        were covered. Alternate output from exportToPlot().
    :param byRegion: A dictionary telling per restriction site region what reads
        mapped there. Alternate output from exportToPlot().
    :returns: A set of read ids that were found to be likely PCR duplicates.
:TODO: Make this work for multiple windows as well
"""
transSize = settings['win_end'][0] - settings['win_start'][0]
transStart = settings['win_start'][0]-transSize
transEnd = settings['win_end'][0]+transSize
windowStart = settings['win_start'][0]
windowEnd = settings['win_end'][0]
    viewStart = settings['vp_start'][0]
    viewEnd = settings['vp_end'][0]
viewChrom = settings['vp_chr'][0]
def mergeRestBins(binList):
tmpBinList = sorted(binList)
for i in range(len(tmpBinList)-1,0,-1):
if tmpBinList[i][0] == tmpBinList[i-1][0] and tmpBinList[i][1] == tmpBinList[i-1][1]+1:
tmpBinList.pop(i)
return tmpBinList
transSet = set()
windowSet = set()
viewSet = set()
pcrDupSubSet = set()
# Identify reads with possible PCR duplicates through coverage per region
for key in byRegion:
curList = byRegion[key]
for i,restArea in enumerate(curList):
# Determine if region is in window
if ((key == viewChrom or key == 'chr'+viewChrom) and
restArea[0][0] < transEnd and
restArea[0][1] > transStart):
transSet.add((key,i))
if (restArea[0][0] < windowEnd and
restArea[0][1] > windowStart):
windowSet.add((key,i))
if (restArea[0][0] < viewEnd and
restArea[0][1] > viewStart):
viewSet.add((key,i))
else:
# Only of interest for PCR duplicates if there is more than one read
if len(restArea[1]) > 1:
# Take all the reads in this region
for read in restArea[1]:
# And track the read id number using a set
pcrDupSubSet.add(read[0])
print 'Total reads:\t',len(byRead)
print 'Viewport bins:\t',len(transSet)
print 'Off target:\t',len(pcrDupSubSet)
notInViewSet = set()
for read in byRead:
if len(mergeRestBins(list(set(byRead[read]).intersection(set(transSet))))) < 1:
notInViewSet.add(read)
print 'Non infos:\t',len(notInViewSet)
pcrDupSubSet = pcrDupSubSet-notInViewSet
print 'PCR testable:\t',len(pcrDupSubSet)
pcrDupMarkSet = set()
infoSet = windowSet-viewSet
# Trick to break out a double for loop:
class Found(Exception): pass
# For every read in the predetermined set of interest
for read in pcrDupSubSet:
# Hocus Pocus, try except
try:
# Check every region it covers
for region in byRead[read]:
# Now check every read in that region
for altReadTuple in byRegion[region[0]][region[1]][1]:
altRead = altReadTuple[0]
# If we haven't marked that read as dup before, but it was marked as possible dup, do a comparison
if (altRead != read) and (altRead in pcrDupSubSet) and (altRead not in pcrDupMarkSet):
# Determine intersection between that read and the main read we were investigating
readIntsect = set(byRead[read]).intersection(set(byRead[altRead]))
# Remove regions that are not what we consider trans regions
viewIntsect = transSet.intersection(readIntsect)
# Reduce subsequently overlapping regions to a single region
interestingPart = mergeRestBins(list(readIntsect - viewIntsect))
# Determine if overlap in reduced region set is more than a single region
if len(interestingPart) > 1:
#print interestingPart,byRead[read],byRead[altRead]
#print read,len(set(byRead[read]).intersection(transSet)),altRead,len(set(byRead[altRead]).intersection(transSet))
if len(mergeRestBins(list(set(byRead[read]).intersection(infoSet)))) >= len(mergeRestBins(list(set(byRead[altRead]).intersection(infoSet)))):
# altRead is the shortest read, good bye
pcrDupMarkSet.add(altRead)
else:
# Mark the current read
pcrDupMarkSet.add(read)
# Seeing we are marking the current read rather than the 'other' read, no point comparing other reads
raise Found # "Pocus"
# Draw rabbit from the hat
except Found:
pass
print 'PCR duplicates:\t',len(pcrDupMarkSet)
#exit()
print 'Reads left:\t',len(byRead)-len(pcrDupMarkSet.union(notInViewSet))
return pcrDupMarkSet
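
# The nested mergeRestBins() helper collapses runs of adjacent restriction
# bins into one, so a fragment spanning several neighbouring sites counts as
# a single piece of evidence. A standalone rendition of that logic:
def _example_merge_rest_bins(binList):
    merged = sorted(binList)
    for i in range(len(merged)-1, 0, -1):
        if merged[i][0] == merged[i-1][0] and merged[i][1] == merged[i-1][1]+1:
            merged.pop(i)
    return merged
# e.g. _example_merge_rest_bins([('chr1', 3), ('chr1', 4), ('chr1', 7)])
# returns [('chr1', 3), ('chr1', 7)].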
def findRepeats(pdFrame):
""" Annotate wether parts of a circle are overlapping with other parts in the same circle.
:param pdFrame: Pandas dataframe with all mapped read information.
:TODO: Return information rather than applying it here directly
"""
flatColumn = []
curData = []
    # TODO: Incorporate ExtLig to tell apart reordered fragments from fragments that span multiple restsites
def finishRead():
# if len(curData) <= 3:
# return
parts = [-1 for x in range(len(curData))]
for i,frag1 in enumerate(curData):
#print '-',i,frag1['AlnChr'],frag1['ExtStart'],frag1['ExtEnd'],frag1['AlnStrand']
for j,frag2 in enumerate(curData[:i]):
#print '- -',j,frag2['AlnChr'],frag2['ExtStart'],frag2['ExtEnd'],frag2['AlnStrand']
if frag1['AlnChr'] == frag2['AlnChr'] and frag1['AlnStrand'] == frag2['AlnStrand']:
if frag1['ExtStart'] < frag2['ExtEnd'] + 10 and frag2['ExtStart'] < frag1['ExtEnd'] + 10:
#print 'match',i,j+i
                        parts[i] = j
break
#print parts
newParts = []
index = 1
for part in parts:
if part == -1:
newParts.append(index)
index += 1
else:
newParts.append(newParts[part])
#print newParts
# thisDataFrame = pd.concat(curData,axis=1)
# print pd.DataFrame(curData).T
# print 'derp'
flatColumn.extend(newParts)
curReadId = pdFrame.iloc[0]['ReadId']
for i,row in pdFrame.iterrows():
#print row['ReadId']
if row['ReadId'] != curReadId:
finishRead()
curData = []
curReadId = row['ReadId']
curData.append(row)
finishRead()
pdFrame['FlatId'] = pd.Series(flatColumn, index=pdFrame.index)
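
# Illustration of the FlatId assignment above (made-up extended coordinates,
# same chromosome and strand): fragments spanning 100-500, 480-900 and
# 2000-2400 yield FlatId values 1, 1 and 2, because the second fragment
# overlaps the first (within the 10 bp slack) while the third overlaps
# neither.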