import pysam
import argparse
import matplotlib
matplotlib.use("Agg")
from matplotlib import pylab
from matplotlib import cm
import matplotlib.pyplot as plt
import numpy
import math
import random
from string import maketrans
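
# Expected inputs (inferred from how they are used below):
#   bam_npy_file -- numpy array of read records; each record appears to hold
#                   (chromosome index into CHROMS, start position, reverse-strand flag,
#                   and an optional per-read weight).
#   bias_file    -- CSV with a header row of k-mers and one data row per read-relative position.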
MARGIN = 40  # bases of reference sequence kept on each side of the read start
MEMORY_LIMIT = 32000000000  # 32 GB (on bigmem, hopefully)
CHROMS = None
COMPL = maketrans("ACGTacgt", "TGCATGCA")  # complement table for reverse-strand reads

def read_bias(f):
    """Read a bias CSV (header row of k-mers, one row per read-relative position)
    into per-position {kmer: bias} dicts; return (k, bias)."""
    data = [line.strip().split(',') for line in open(f).read().strip().split('\n')]
    bias = [{} for i in xrange(MARGIN * 2 + 1)]
    header = data[0]
    for i in xrange(1, len(data)):
        for j in xrange(1, len(data[i])):
            bias[i - 1][header[j]] = float(data[i][j])
    # k is inferred from the length of the first k-mer in the header
    return len(header[1]), bias

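# main() samples reads from the BAM-derived array, looks up the bias value of the
# reference k-mer at each read-relative position, and records the absolute Pearson
# correlation between every pair of positions.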
def main(bam_npy_file, ref_file, chrom_file, bias_file, out_file, read_limit=None, tile=False, plot=False, clip=None):
    global CHROMS
    CHROMS = [c.split()[0] for c in open(chrom_file).read().strip().split('\n')]
    k, bias = read_bias(bias_file)
    bam = numpy.load(bam_npy_file)
    fa = pysam.Fastafile(ref_file)
    print "Reading FASTA file..."
    refs = [fa.fetch(ch) for ch in CHROMS]
    tile_size = (k if tile else 1)
    # number of k-mer start positions in the window, stepping by tile_size when --tile is set
    num_samples = int(math.ceil(float((MARGIN * 2 + 1) - k + 1) / tile_size))
    bam_reads = len(bam)
    # stacks stores 4-byte floats, so cap the number of sampled reads to stay under MEMORY_LIMIT
    read_count = min(MEMORY_LIMIT / 4 / num_samples, bam_reads)
    read_prob = float(read_count) / bam_reads
    stacks = numpy.zeros((num_samples, read_count), dtype="f4")  # one row per position, one column per sampled read
    i = 0
    s = 0
    print "Processing reads..."
    for read in bam:
        # read layout (as used here): [chrom index, position, reverse flag, optional weight]
        if read[1] < MARGIN or read[1] >= len(refs[read[0]]) - MARGIN:
            continue
        seq = refs[read[0]][read[1] - MARGIN : read[1] + MARGIN + 1].upper()
        if read[2]:
            # reverse-strand read: reverse-complement the reference window
            seq = seq.translate(COMPL)[::-1]
        composition = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
        bad = False
        for a in seq:
            # These can be Ns or weird sequence like Kirk's (with Rs and Ms)
            if a not in composition:
                bad = True
                break
            composition[a] += 1
        if bad:
            continue
        i += 1
        if i % 1000000 == 0:
            print i
        if read_limit is not None and i >= read_limit:
            break
        # optional per-read weight in column 3, clipped to [1/clip, clip] when --clip is given
        if len(read) > 3:
            weight = read[3]
            if clip is not None:
                weight = clip if weight > clip else (1.0 / clip if weight < 1.0 / clip else weight)
        else:
            weight = 1
        # subsample reads with probability read_prob, stopping once stacks is full
        if s < read_count and (bam_reads == read_count or random.random() < read_prob):
            for j in xrange(num_samples):
                kmer = seq[j * tile_size : j * tile_size + k]
                a = bias[j * tile_size][kmer]
                stacks[j, s] = a * weight
            s += 1
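
    # Each row of stacks now holds (bias * weight) for one read-relative position across
    # the s sampled reads; measure how positions co-vary via pairwise |correlation|.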
    correlation_matrix = [[0 for j in xrange(num_samples)] for l in xrange(num_samples)]
    for j in xrange(num_samples):
        correlation_matrix[j][j] = 0
    print "Computing covariance matrices..."
    print "Taking a maximum of %i sampled reads, to save memory" % read_count
    print "stacks:", stacks.shape
    for j in xrange(num_samples):
        print j
        for l in xrange(j + 1, num_samples):
            matrix = numpy.vstack((stacks[j, :s], stacks[l, :s]))
            cov_matrix = numpy.corrcoef(matrix)  # normalized covariance (Pearson correlation) matrix
            cov = cov_matrix[0][1]
            # use only magnitude of correlation, not direction
            if cov < 0:
                cov = -1 * cov
            correlation_matrix[j][l] = cov
            correlation_matrix[l][j] = cov
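
    # Save the full matrix; for plotting, keep only every 5th row/column so the
    # displayed tiles do not overlap.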
    cov_matrix = numpy.array(correlation_matrix)
    numpy.save(out_file, cov_matrix)
    print cov_matrix.shape
    print range(0, cov_matrix.shape[0], 5)
    print range(0, cov_matrix.shape[1], 5)
    # fancy indexing: keep every 5th row/column to get nonoverlapping tiles
    cov_matrix = cov_matrix[range(0, cov_matrix.shape[0], 5)][:, range(0, cov_matrix.shape[1], 5)]
    print cov_matrix.shape
    if plot:
        plt.imshow(cov_matrix, interpolation="none")
        plt.jet()
        cb = plt.colorbar()
        cb.set_label("covariance")
        plt.xticks(range(cov_matrix.shape[1]), range(-8, 8))
        plt.yticks(range(cov_matrix.shape[0]), range(-8, 8))
        plt.savefig("%s.png" % (out_file[:out_file.rindex('.')]))
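
# Example invocation (file names are illustrative):
#   python correlate_bias.py reads.bam.npy genome.fa chroms.txt bias.csv corr.npy --tile --plot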
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Compute bias correlation across read-relative positions")
    parser.add_argument("bam", help="BAM.npy file")
    parser.add_argument("ref", help="Fasta file")
    parser.add_argument("chroms", help="Chromosome file")
    parser.add_argument("bias", help="Bias (CSV) file")
    parser.add_argument("out", help="Output (npy) file")
    parser.add_argument("--limit", help="Maximum reads to process", type=int)
    parser.add_argument("--tile", help="Tile based on k-mer size?", action="store_true")
    parser.add_argument("--plot", help="Plot covariance matrix?", action="store_true")
    parser.add_argument("--clip", help="Clip weights", type=int)
    args = parser.parse_args()
    main(args.bam, args.ref, args.chroms, args.bias, args.out, args.limit, args.tile, args.plot, args.clip)