-
Notifications
You must be signed in to change notification settings - Fork 1
/
sam2fastq.py
75 lines (57 loc) · 1.61 KB
/
sam2fastq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/python
'''
convert sam into fastq for file sorted by read name
therefore assumes read pairs are listed on adjacent lines
samtools view [bamfile] | sam2fastq.py [read1] [read2] [read3]
where reads 1 and 2 are paired and read3 is unpaired
'''
import sys
def revcomp(kmer):
'''
return reverse complement of the kmer
only 'ATCGN' are considered legitimate bases
'''
rev = ''
for ch in kmer:
if ch == 'A': rev = 'T' + rev
elif ch == 'T': rev = 'A' + rev
elif ch == 'C': rev = 'G' + rev
elif ch == 'G': rev = 'C' + rev
elif ch == 'N': rev = 'N' + rev
else: raise Exception
return rev
def write_rec(f,r):
if r[3]:
r[1] = revcomp(r[1])
r[2] = r[2][::-1]
f.write('@'+r[0]+'\n'+r[1]+'\n+\n'+r[2]+'\n')
fin = sys.stdin
read1 = open(argv[1],"wb")
read2 = open(argv[2],"wb")
read3 = open(argv[3],"wb")
prev = None
while True:
line = fin.readline()
if line.startswith('@'): continue
if line == '':
if prev: write_rec(read3,prev)
break
tok = line.strip().split('\t')
qname = tok[0]
seq = tok[9]
qual = tok[10]
mate = tok[6]
rev = int(tok[1]) & 0x10
if mate == '=': mate = qname
if prev:
if mate == prev[0]:
#write read pair
write_rec(read1,prev)
write_rec(read2,[qname,seq,qual,rev])
prev = None
else:
#write unpaired read
write_rec(read3,prev)
prev = [qname,seq,qual,rev]
else:
prev = [qname,seq,qual,rev]