forked from aryeelab/umi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
demultiplex_nobuff.py
240 lines (213 loc) · 8.6 KB
/
demultiplex_nobuff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
from __future__ import print_function
import os
import re
import gzip
import itertools
import argparse
import time
import logging
import sys
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Edited Martin Aryee's function to demultiplex to reduce
# the amount of memory required to run.
# Martin's original demultiplex function can be found
# here:
# https://github.com/aryeelab/umi/wiki
#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
__author__ = 'Allison MacLeay'
# basicConfig() installs the default handler on the root logger; the module
# logger below is then set to INFO so that the progress messages emitted by
# logger.info() in this file are not filtered out.
logging.basicConfig()
logger = logging.getLogger('demultiplex_nobuff')
logger.setLevel(logging.INFO)
def fq(file):
    """
    Iterate over a FASTQ file, yielding one 4-line record at a time.

    :param file: path of the FASTQ file; a name ending in '.gz' is read through gzip
    :return: generator of [line1, line2, line3, line4] lists of text lines.
             Lines are returned exactly as read (trailing newline included) and
             reading stops when the first line of a record is empty (end of file).
    """
    if file.endswith('.gz'):
        # On Python 3, 'rb' yields bytes: barcode slices would never match the
        # str keys of the barcode dictionaries and print() would write b'...'
        # reprs. Ask for text there. Python 2's gzip only documents binary
        # modes, and its 'rb' already returns str.
        mode = 'rb' if sys.version_info[0] < 3 else 'rt'
        fastq = gzip.open(file, mode)
    else:
        fastq = open(file, 'r')
    with fastq as f:
        while True:
            l1 = f.readline()
            if not l1:
                break
            l2 = f.readline()
            l3 = f.readline()
            l4 = f.readline()
            yield [l1, l2, l3, l4]
def create_key(f1, f2):
    """
    Load two barcode files into a pair of lookup dictionaries.

    :param f1: barcode file parsed into the first dictionary
    :param f2: barcode file parsed into the second dictionary
    :return: two-element list [dict_from_f1, dict_from_f2]
    """
    tables = []
    for barcode_file in (f1, f2):
        table = {}
        add_file_to_dict(barcode_file, table)
        tables.append(table)
    return tables
def add_file_to_dict(fname, d):
    """
    helper function - load a tab-delimited barcode file into a dictionary

    The first line of the file is always skipped as a header. Every following
    line is stripped and split on tabs; lines with fewer than two columns are
    ignored and columns after the second are not used.

    :param fname: path of the barcode file (column 1: id, column 2: sequence)
    :param d: a dictionary (dict), updated in place with seq[1:8] -> id
    :return: None
    """
    # 'with' guarantees the handle is closed even if a line fails to parse.
    with open(fname, 'r') as fh:
        # Skip the header; the default keeps an empty file from raising.
        next(fh, None)
        for line in fh:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue
            # Only the first two columns matter; slicing tolerates extra ones.
            name, seq = fields[:2]
            d[seq[1:8]] = name
    return
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# helper functions
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def get_sample_name(i1, i2, id_array):
    """
    Build the sample name of a read from its two index records.

    :param i1: index-1 FASTQ record; element 1 is the sequence line
    :param i2: index-2 FASTQ record; element 1 is the sequence line
    :param id_array: [dictA, dictP] barcode-to-name dictionaries
    :return: '<A name>_<P name>'. The A part comes from looking up i2's
             barcode in dictA and the P part from looking up i1's barcode in
             dictP; a barcode missing from its dictionary is used verbatim.
    """
    dictA, dictP = id_array[0], id_array[1]
    index1 = i1[1][1:8]
    index2 = i2[1][1:8]
    # dict.get does a single hash lookup instead of scanning .keys() per read.
    Pname = dictP.get(index1, index1)
    Aname = dictA.get(index2, index2)
    return Aname + '_' + Pname
def get_seq(i1, i2):
    """
    Return the raw barcode key of a read.

    The key is characters 1..7 of the sequence line (element 1) of each index
    record, with i1's slice first and i2's slice appended to it.
    """
    first_barcode = i1[1][1:8]
    second_barcode = i2[1][1:8]
    return first_barcode + second_barcode
def _open_fastq_group(out_dir, stem):
    """Open '<stem>.r1/.r2/.i1/.i2.fastq' for writing in out_dir; return the four handles in that order."""
    handles = []
    try:
        for tag in ('r1', 'r2', 'i1', 'i2'):
            handles.append(open(os.path.join(out_dir, '%s.%s.fastq' % (stem, tag)), 'w'))
    except Exception:
        # Do not leak the handles that did open when a later open() fails.
        for handle in handles:
            handle.close()
        raise
    return handles


def demultiplex(read1, read2, index1, index2, p5_barcodes, p7_barcodes, out_dir, out_fname=None, min_reads=10000):
    """
    Split paired reads and their index reads into per-sample FASTQ files.

    The inputs are read twice: a first pass over the two index files counts
    how often every barcode pair occurs, and a second pass writes each record
    to its destination. Nothing is buffered in memory apart from the counts.

    :param read1: path of the read-1 FASTQ (plain text or .gz)
    :param read2: path of the read-2 FASTQ
    :param index1: path of the index-1 FASTQ
    :param index2: path of the index-2 FASTQ
    :param p5_barcodes: barcode file path, or an already-built barcode dict
    :param p7_barcodes: barcode file path, or an already-built barcode dict
                        (treated as a dict whenever p5_barcodes is one)
    :param out_dir: output directory; created if it does not exist
    :param out_fname: optional prefix; when non-empty, '<out_fname>_' is
                      prepended to every output file name
    :param min_reads: records whose barcode pair was counted fewer than this
                      many times are written to the 'undetermined' files
    :return: None
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if isinstance(p5_barcodes, dict):
        bar_dict = [p5_barcodes, p7_barcodes]
    else:
        bar_dict = create_key(p5_barcodes, p7_barcodes)

    # Use the parameter itself: a module-level `args` dict only exists when
    # this file is executed as a script, not when demultiplex() is imported.
    fname = out_fname + '_' if out_fname else ''

    zip_func = itertools.izip if sys.version_info[0] < 3 else zip
    start = time.time()

    # Pass 1: count the reads observed for every raw barcode pair.
    count = {}
    total_count = 0
    for i1, i2 in zip_func(fq(index1), fq(index2)):
        barcode = get_seq(i1, i2)
        count[barcode] = count.get(barcode, 0) + 1
        total_count += 1
        if total_count % 5000000 == 0:
            logger.info("Processed %d counts in %.1f minutes.", total_count, (time.time() - start) / 60)
    logger.info("Read count complete in %.1f minutes.", (time.time() - start) / 60)

    # Pass 2: stream every record straight to its output files.
    # the original demultiplex stored sequences in a buffer to execute in 1N instead of 2N
    # this version minimizes the memory requirement by running in 2N
    undetermined = None  # [r1, r2, i1, i2] handles, opened on first use
    outfiles = {}        # sample name -> [r1, r2, i1, i2] handles
    total_count = 0
    try:
        for r1, r2, i1, i2 in zip_func(fq(read1), fq(read2), fq(index1), fq(index2)):
            total_count += 1
            if total_count % 1000000 == 0:
                logger.info("Processed %d reads in %.1f minutes.", total_count, (time.time() - start) / 60)
            if count[get_seq(i1, i2)] < min_reads:
                # Barcode pair seen fewer than min_reads times.
                if undetermined is None:
                    undetermined = _open_fastq_group(out_dir, fname + 'undetermined')
                handles = undetermined
            else:
                sample_id = get_sample_name(i1, i2, bar_dict)
                handles = outfiles.get(sample_id)
                if handles is None:
                    handles = outfiles[sample_id] = _open_fastq_group(out_dir, fname + sample_id)
            # Record lines still carry their newlines, so write them verbatim.
            for handle, record in zip(handles, (r1, r2, i1, i2)):
                handle.writelines(record)
    finally:
        # Close whatever was opened, including when a pass fails midway or
        # when no undetermined read was ever written.
        if undetermined is not None:
            for handle in undetermined:
                handle.close()
        for handles in outfiles.values():
            for handle in handles:
                handle.close()

    num_fastqs = sum(1 for v in count.values() if v >= min_reads)
    logger.info('Wrote FASTQs for the %d sample barcodes out of %d with at least %d reads in %.1f minutes.',
                num_fastqs, len(count), min_reads, (time.time() - start) / 60)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# MAIN
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, put the four FASTQ paths
    # into the right roles when their names allow it, then run demultiplex().
    parser = argparse.ArgumentParser()
    parser.add_argument('--read1', required=True)
    parser.add_argument('--read2', required=True)
    parser.add_argument('--index1', required=True)
    parser.add_argument('--index2', required=True)
    parser.add_argument('--min_reads', type=int, default=10000)
    # demultiplex() opens both barcode files whenever it is not handed dicts,
    # so leaving these out could only end in a TypeError from open(None).
    parser.add_argument('--p5_barcodes', required=True)
    parser.add_argument('--p7_barcodes', required=True)
    parser.add_argument('--out_dir', default='.')
    parser.add_argument('--out_fname', default='')
    args = vars(parser.parse_args())

    fargs = ['read1', 'read2', 'index1', 'index2']
    # Filename tag -> role it identifies, tested in this order (first hit wins).
    tag_roles = [('_R1_', 'read1'), ('_R2_', 'read2'), ('_I1_', 'index1'), ('_I2_', 'index2')]
    swap = {}
    do_swap = True
    for f in fargs:
        name = args[f]
        for tag, role in tag_roles:
            # `in` also recognises a tag at the very start of the name.
            if tag in name:
                swap[role] = name
                break
        else:
            do_swap = False  # one or more files do not adhere to schema. Can not confidently swap
    if do_swap and len(swap) == 4:
        # swapping files for names that are passed in in the wrong order
        # but follow the schema of containing _R1_
        for f in fargs:
            args[f] = swap[f]

    demultiplex(args['read1'], args['read2'], args['index1'], args['index2'],
                args['p5_barcodes'], args['p7_barcodes'], args['out_dir'], args['out_fname'],
                min_reads=args['min_reads'])