-
Notifications
You must be signed in to change notification settings - Fork 0
/
compute_baseline.py
executable file
·152 lines (128 loc) · 5 KB
/
compute_baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# ignore alignability for the moment, we'll assume everything within 100bp of a read is alignable
import numpy
import argparse
import pysam
from string import maketrans
SAMPLE_MARGIN = 25 # on either side
CHROMS = None
COMPL = maketrans("ACGT", "TGCA")
def readWig(wig_file):
fin = open(wig_file, 'r')
masks = {}
mask = []
chrom = None
start = 1
count = 0
for line in fin:
if line[0] == 'f': # header line ex. "fixedStep chrom=chr1 step=1 ..."
params = {param.split('=')[0]:param.split('=')[1] for param in line.strip().split(' ') if '=' in param}
if params["chrom"] != chrom:
if len(mask) > 0:
masks[chrom] = numpy.array(mask, dtype='b') # boolean?
mask = []
chrom = params["chrom"]
print chrom
'''
if chrom == "chr10":
break
'''
elif start + count != int(params["start"]):
print "GAP!"
start = int(params["start"])
count = 0
else:
mask.append(int(float(line.strip())))
count += 1
if len(mask) > 0:
masks[chrom] = numpy.array(mask, dtype='b') # boolean?
mask = []
fin.close()
# fill out the empty chroms for testing...
for c in xrange(len(CHROMS)):
if not masks.has_key(CHROMS[c]):
print "missing", CHROMS[c]
masks[c] = numpy.zeros(0, dtype='b')
else:
masks[c] = masks[CHROMS[c]]
del masks[CHROMS[c]]
return masks
def getRandom(chrom, first, last, cnt, mask=None):
if mask is not None and numpy.sum(mask[chrom][first:last]) > 0:
nums = numpy.random.choice([i for i in xrange(first, last) if mask[chrom][i] == 1], cnt, replace=True)
else:
# for reads very near the end, this range can be 0, skip it
if last <= first:
nums = []
else:
nums = numpy.random.randint(first, last, cnt)
if not hasattr(nums, '__iter__'):
nums = [nums]
return nums
def main(bam_npy_file, fasta_file, chrom_file, k, output_file, exp=1, limit=None, window_max=SAMPLE_MARGIN*2, mask=None):
global CHROMS
CHROMS = [c.split()[0] for c in open(chrom_file).read().strip().split('\n')]
if mask is not None:
print "Reading alignability mask..."
mask = readWig(mask)
baseline_kmer_counts = {}
trailing = 0
leading = 1
density_count = 1
print "Reading FASTA seqs..."
fa = pysam.Fastafile(fasta_file)
refs = [fa.fetch(c).upper() for c in CHROMS]
ref_lens = [len(r) for r in refs]
print "Reading BAM..."
bam = numpy.load(bam_npy_file)
tot_reads = len(bam)
print "%i reads" % tot_reads
num_reads = 0
b = 0
while b < tot_reads:
read = bam[b]
if b % 10**6 == 0:
print "%i (%.2f%%)" % (b, float(b)/tot_reads*100)
while leading < tot_reads and (bam[leading][0] < read[0] or (bam[leading][0] == read[0] and bam[leading][1] <= read[1] + SAMPLE_MARGIN)):
if bam[leading][1] != bam[leading-1][1]: # count only 1 read at each unique position
density_count += 1
leading += 1
while trailing < b and (bam[trailing][0] < read[0] or bam[trailing][1] < read[1] - SAMPLE_MARGIN):
if bam[trailing][1] != bam[trailing-1][1]:
density_count -= 1
trailing += 1
#print "read %i (%s), leading %i (%s)" % (b, str(read), leading, str(bam[leading]))
if density_count < 0:
print density_count, "too low"
first = max(0, read[1] - SAMPLE_MARGIN) # inclusive
last = min(ref_lens[read[0]] - k, read[1] + SAMPLE_MARGIN + 1 - k) # exclusive
for pos in getRandom(read[0], first, last, min(int(density_count ** exp), window_max), mask): # get (density ** exp) random positions
# get sequence
if read[2]:
seq = refs[read[0]][pos : pos + k].translate(COMPL)[::-1]
else:
seq = refs[read[0]][pos : pos + k]
# count k-mers
#for i in xrange(len(seq) - k + 1):
baseline_kmer_counts[seq] = baseline_kmer_counts.get(seq, 0) + 1
if b == leading + 1: # DON'T KNOW WHY THIS HAPPENS
break
b = leading + 1
fout = open(output_file, 'w')
kmers = baseline_kmer_counts.keys()
total_kmers = sum(v for k,v in baseline_kmer_counts.iteritems())
fout.write(','.join(kmers) + "\n")
fout.write(','.join(['%.4f' % (float(baseline_kmer_counts[k])/total_kmers) for k in kmers]))
fout.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = "Compute baseline k-mer frequencies")
parser.add_argument("bam", help="BAM.npy file")
parser.add_argument("ref", help="Fasta file")
parser.add_argument("chroms", help="Chromosome file")
parser.add_argument("k", help="k-mer size", type=int)
parser.add_argument("out", help="Output (csv) file")
parser.add_argument("--exp", help="Density exponent", type=float, default=1)
parser.add_argument("--limit", help="Reads to look at, tiled across entire BAM", type=int)
parser.add_argument("--window_max", help="Maximum # reads to count in a single window", type=int)
parser.add_argument("--mask", help="WIG formatted alignability mask")
args = parser.parse_args()
main(args.bam, args.ref, args.chroms, args.k, args.out, args.exp, args.limit, args.window_max, args.mask)