-
Notifications
You must be signed in to change notification settings - Fork 1
/
kmer.py
104 lines (76 loc) · 2.14 KB
/
kmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import gzip
def revcomp(kmer):
    '''
    return reverse complement of the kmer
    only 'ATCG' are considered legitimate bases

    kmer: iterable of single-character bases (normally an upper case string)
    returns: a new string holding the complemented bases in reverse order
             (the empty string for an empty kmer)
    raises: ValueError (a subclass of Exception) naming the first
            illegal base encountered
    '''
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    try:
        # complement in forward order first, then reverse once; this avoids
        # the quadratic cost of prepending to a string on every iteration
        bases = [complement[ch] for ch in kmer]
    except KeyError as err:
        raise ValueError('illegal base %r in kmer %r' % (err.args[0], kmer))
    bases.reverse()
    return ''.join(bases)
def revcomp_n(kmer):
    '''
    return reverse complement of the kmer
    only 'ATCGN' are considered legitimate bases

    kmer: iterable of single-character bases (normally an upper case string)
    returns: a new string holding the complemented bases in reverse order;
             'N' is its own complement
    raises: ValueError (a subclass of Exception) naming the first
            illegal base encountered
    '''
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    try:
        # complement in forward order first, then reverse once; this avoids
        # the quadratic cost of prepending to a string on every iteration
        bases = [complement[ch] for ch in kmer]
    except KeyError as err:
        raise ValueError('illegal base %r in kmer %r' % (err.args[0], kmer))
    bases.reverse()
    return ''.join(bases)
def kmer2integer(kmer):
    '''
    encode the kmer as an integer, two bits per base
    (A=0, C=1, G=2, T=3, first base in the most significant position)

    the kmer and its reverse complement map to the same integer:
    whichever of the two compares smaller is the one that gets encoded
    '''
    # pick the canonical strand; revcomp() also rejects non-ATCG bases
    canonical = min(kmer, revcomp(kmer))

    # bases missing from the table (i.e. 'A') contribute zero
    base_codes = {'C': 1, 'G': 2, 'T': 3}
    encoded = 0
    for base in canonical:
        encoded = (encoded << 2) + base_codes.get(base, 0)
    return encoded
def load_kmer_counts(fname, kmer_size):
    '''
    load kmer counts generated by kmerz program

    fname: path to the counts file; names ending in '.gz' are read
           through gzip, anything else is read as a plain binary file
    kmer_size: kmer length the file was built for
    returns: the raw file contents, expected to hold exactly
             4**kmer_size bytes
    raises: AssertionError if the file length is not 4**kmer_size
    '''
    opener = gzip.open if fname.endswith('.gz') else open

    # 'with' guarantees the handle is closed even if read() fails
    with opener(fname, 'rb') as f:
        counts = f.read()

    expected = 4 ** kmer_size
    if len(counts) != expected:
        # raised explicitly (rather than via an assert statement) so the
        # check still runs under python -O; the exception type is unchanged
        raise AssertionError(
            'kmer count file %r holds %d bytes, expected %d for kmer size %d'
            % (fname, len(counts), expected, kmer_size))
    return counts
def query_kmer(kmer, counts, kmer_size):
    '''
    lookup kmer count
    kmer must be upper case
    containing only ATCGN
    if any Ns are present the count is None

    kmer: the kmer to look up
    counts: indexable table of counts, one entry per kmer integer
            (e.g. the value returned by load_kmer_counts)
    kmer_size: not used by the lookup; kept so existing callers
               keep working
    returns: the count as an integer, or None if the kmer contains 'N'
    '''
    if 'N' in kmer:
        return None

    value = counts[kmer2integer(kmer)]

    # indexing bytes/bytearray on Python 3 already yields an int, whereas
    # a Python 2 str yields a one-character string that needs ord()
    if isinstance(value, int):
        return value
    return ord(value)
def next_kmer(seq, kmer_size):
    '''
    generator to yield kmers from a sequence

    seq: the sequence to slice (anything supporting len() and slicing)
    kmer_size: length of each kmer
    yields: every window seq[i:i+kmer_size] for i from 0 up to and
            including len(seq) - kmer_size; nothing is yielded when
            the sequence is shorter than kmer_size
    '''
    last_start = len(seq) - kmer_size
    start = 0
    # plain index loop: stays lazy and avoids xrange, which does not
    # exist on Python 3
    while start <= last_start:
        yield seq[start:start + kmer_size]
        start += 1