barseq_merge_only_4-2-2020.py
import sys
import copy
import datetime
import time
# PARAMETERS # for diagnosing the merge of _pooled_reads files. Version 3-19-2020 MBA, modified from cweiss's map_and_pool_BLAT.py and scorad rbseq
# NOTE: several parameters below are inherited from the mapping pipeline and are not used by this merge-only script
mapping_genome = "/usr2/people/carlyweiss/Amended_Genomes/Amended_Genomes/Concatenated/D1373_Z1.fa" # location of the mapping genome (FASTA); unused in this merge-only script
min_ID = str(95) # minimum percent identity cutoff for mapping reads (0-100); unused in this merge-only script
min_id_pooling = .95 # minimum identity for reads at the pooling stage; can differ from the mapping cutoff. Must be a decimal proportion (0-1)
gff = '/usr2/people/carlyweiss/SparScerinfo/YS2+CBS432+plasmid_clean' ## GFF to use for annotations.
min_read_cutoff = 0 ## this is only for reporting summary statistics
min_genome_read_length = 40
min_len_align = 40
single_barcode = False #set to True if analyzing a test dataset with only one barcode
maskOffByOne = True # when True, this treats low abundance barcodes that are off-by-one as a sequencing error and removes them from analysis
min_id_pooling *= 100 # convert the proportion to a percentage
# HELP #
if len(sys.argv) == 1:
print("USAGE: python map_PB_reads.py out_directory inFile1._pooled_reads inFile2._pooled_reads")
exit()
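# Example invocation (hypothetical file names; out_directory should end in '/'):
#   python barseq_merge_only_4-2-2020.py results/ lib1_pooled_reads lib2_pooled_reads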
# INPUT #
out_dir = sys.argv[1] # output directory; must end in '/' because file names are appended directly
read_files = sys.argv[2:]
# BEGIN FUNCTIONS #
def parse_pooled_reads(pooled_read_file):
    '''begins analysis *AFTER* the pooled read outfiles have already been created for individual libraries'''
split_loc_dict = {} # will hold hierarchical data on each insertion site. keys for nested dictionaries are scaffold, strand, position, barcode, value is # reads mapping there.
f=open(pooled_read_file)
    next(f) # skip the header line
for line in f:
row_data=line.split('\t')
barcode=row_data[0]
chrom=row_data[1]
strand=row_data[2]
pos=int(row_data[3])
annotation=row_data[4]
number_of_reads=int(row_data[5])
# add data to split_loc_dict #
        if chrom not in split_loc_dict:
            split_loc_dict[chrom] = {'+' : {}, '-' : {}}
        if pos not in split_loc_dict[chrom][strand]:
            split_loc_dict[chrom][strand][pos] = {barcode: number_of_reads}
        elif barcode not in split_loc_dict[chrom][strand][pos]:
            split_loc_dict[chrom][strand][pos][barcode] = number_of_reads
f.close()
return split_loc_dict
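# Example (hypothetical data): a _pooled_reads line such as
#   ACGTACGTACGTACGTACGT  scIV  +  4021  NC  17
# (barcode, scaffold, strand, position, annotation, read count) yields
#   split_loc_dict['scIV']['+'][4021] == {'ACGTACGTACGTACGTACGT': 17}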
def remove_multibarring(split_loc_dict, outfileprefix="fastq_filename_here"):
'''removes barcodes that map to multiple independent insertions
since those aren't useful for downstream analysis'''
    remaining_barcodes = set() # a set makes the membership tests below O(1)
    duplicate_barcodes = {}
    # first pass: find barcodes that appear at more than one insertion site
    for chrom in split_loc_dict:
        print("...discovering barcodes that map to multiple locations from "+chrom+"...")
        for strand in split_loc_dict[chrom]:
            for pos in split_loc_dict[chrom][strand]:
                for bc in split_loc_dict[chrom][strand][pos]:
                    if bc in remaining_barcodes:
                        if bc in duplicate_barcodes:
                            duplicate_barcodes[bc]+=1
                        else:
                            duplicate_barcodes[bc]=2
                    else:
                        remaining_barcodes.add(bc)
unique_split_loc_dict = copy.deepcopy(split_loc_dict)
#goes through dict and removes duplicate barcodes from further analysis
for chrom in split_loc_dict:
print("...discovering barcodes that map to multiple locations from "+chrom+"...")
for strand in split_loc_dict[chrom]:
for pos in split_loc_dict[chrom][strand]:
for bc in split_loc_dict[chrom][strand][pos]:
if bc in duplicate_barcodes:
del unique_split_loc_dict[chrom][strand][pos][bc]
#writes separate outfile with duplicate barcodes
    wf = open(out_dir+outfileprefix+"_multimapping_barcodes", 'w')
wf.writelines("barcode\t#of mapped locations\n")
    if duplicate_barcodes:
        for dupbar in duplicate_barcodes:
wf.writelines(dupbar+"\t"+str(duplicate_barcodes[dupbar])+"\n")
wf.close()
#writes number nonunique to mapping stats file
number_nonunique = len(duplicate_barcodes)
wf = open(out_dir+outfileprefix+"_mapping_stats", 'a')
wf.writelines("\n\nRemoving non-unique barcodes: \n")
wf.writelines("removed "+str(number_nonunique)+" non-unique barcode insertions\n")
wf.close()
print("removed "+str(number_nonunique)+" non-unique barcode insertions")
return unique_split_loc_dict
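# Example (hypothetical): if barcode GATTACAGATTACAGATTAC is seen at both
# scI:+:100 and scIV:-:5000, both entries are deleted from the returned dict
# and the barcode is written to the *_multimapping_barcodes file with count 2.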
def OffByOneList(seq):
'''based on rbdnaseq pipeline, this
generates a list of sequences off by one from the input
use this in maskOffByOne to mask barcodes that are off by one due to sequencing error'''
if seq[0] in ("A","T","G","C"):
char_set = ("A","T","G","C")
elif seq[0] in ("a","t","g","c"):
char_set = ("a","t","g","c")
else:
return False
    variants = []
    for chari in range(len(seq)):
        preseq = seq[:chari]   # empty string when chari == 0
        postseq = seq[chari+1:] # empty string at the last position
        for char in char_set:
            if seq[chari] != char:
                variants.append(preseq+char+postseq)
    return variants
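# Example: OffByOneList("ACG") returns the nine single-substitution variants
#   ['TCG', 'GCG', 'CCG', 'AAG', 'ATG', 'AGG', 'ACA', 'ACT', 'ACC']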
def MaskOffByOne(split_loc_dict, outfileprefix="fastqfilename"):
'''modified from rbdnaseq pipeline'''
masked_split_loc_dict = copy.deepcopy(split_loc_dict)
totals = {}
for chrom in split_loc_dict: #build a dictionary of barcode-to-total-read
for strand in split_loc_dict[chrom]:
for pos in split_loc_dict[chrom][strand]:
for barcode in split_loc_dict[chrom][strand][pos]:
total = split_loc_dict[chrom][strand][pos][barcode]
if barcode in totals:
totals[barcode]+=total
else:
totals[barcode]=total
offByOneList = [ ]
for chrom in split_loc_dict: #go through and remove barcodes where an off-by-one variant is 100 times more common
print("...masking off-by-one barcodes in chromosome "+chrom+"...")
for strand in split_loc_dict[chrom]:
for pos in split_loc_dict[chrom][strand]:
for barcode in split_loc_dict[chrom][strand][pos]:
                    variants = OffByOneList(barcode)
                    offByOne = False
                    try:
                        for variantBarcode in variants:
                            if (not offByOne) and (variantBarcode in totals):
                                if totals[variantBarcode] > totals[barcode]*100:
                                    offByOneList.append(barcode)
                                    del masked_split_loc_dict[chrom][strand][pos][barcode]
                                    offByOne = True
                    except TypeError:
                        # likely because OffByOneList returns False when a barcode's first
                        # base is not A/T/G/C (e.g. an N call), making `variants`
                        # non-iterable; skip such barcodes rather than aborting
                        pass
print("masked "+str(len(offByOneList))+" off-by-one barcodes")
wf = open(out_dir+outfileprefix+"_mapping_stats", 'a')
wf.writelines("\n\nMasking off-by-one barcodes: \n")
wf.writelines("masked "+str(len(offByOneList))+" off-by-one barcodes \n")
wf.close()
return masked_split_loc_dict
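# Example of the 100x masking rule (hypothetical counts): a barcode with 3 reads
# total whose off-by-one neighbor has 500 reads (500 > 3*100) is treated as a
# sequencing error of that neighbor and removed from the dictionary.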
def clean_barcodes(split_loc_dict, single_barcode=False, maskOffByOne=True, outfileprefix="fastq_filename_here"):
    '''removes multi-mapping barcodes, then masks off-by-one barcodes'''
    out_name = outfileprefix
    if not single_barcode:
        # up till 4-2, this passed outfileprefix=fastqfilename, which caused it to write to the wrong file during Merge
        split_loc_dict = remove_multibarring(split_loc_dict, outfileprefix=out_name)
    if maskOffByOne:
        # keep MaskOffByOne's return value; the original discarded the masked dictionary
        split_loc_dict = MaskOffByOne(split_loc_dict, outfileprefix=out_name)
    return split_loc_dict
def Annotate_insertions(split_loc_dict, read_file_name):
    out_filename = out_dir+read_file_name+"_pooled_reads" # the final output; this will hold the pooled insertion table
wf = open(out_filename,'w')
wf.writelines("ID\tscaffold\tstrand\tlocation\tannotation\tn\trel_loc\trel_prop\tgene_length\n")
sc_insertions = 0
sp_insertions = 0
sc_genic_insertions = 0
sp_genic_insertions = 0
tot_insertions = 0
tot_min_insertions = 0.0
plasmid_reads = 0 # reads mapping to the plasmid
Rtn_reads = 0 # reads mapping to the tn right border
Ltn_reads = 0 # reads mapping to the tn left border
gff_dict = {}
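    # The annotation file is assumed (from the column indexing below) to be a
    # simple tab-separated table rather than standard 9-column GFF:
    #   gene  scaffold  feature_type  strand  start  end
    # e.g. (hypothetical line): scYGR098C  scVII  gene  -  685000  687000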
f = open(gff)
for line in f:
line = line.split("\t")
chrom = line[1]
gene = line[0]
start = int(line[4])
end = int(line[5])
strand = line[3]
        feature_type = line[2] # unused here; renamed so it does not shadow the built-in type
# put the annotation information in a dictionary #
if chrom not in gff_dict:
gff_dict[chrom] = {}
gff_dict[chrom][gene] = [start, end, strand]
# Search through the dictionary for insertions that fall within genes
for chrom in split_loc_dict:
for strand in split_loc_dict[chrom]:
for pos in split_loc_dict[chrom][strand]:
for barcode in split_loc_dict[chrom][strand][pos]:
try:
if split_loc_dict[chrom][strand][pos][barcode] > min_read_cutoff:
tot_min_insertions+=1
if chrom[:2] == 'sc':
sc_insertions+=1
elif chrom[:2] == 'sp':
sp_insertions+=1
tot_insertions+=1
except TypeError:
print(split_loc_dict[chrom][strand][pos])
print(split_loc_dict[chrom][strand][pos][barcode])
print(min_read_cutoff)
# set defaults for noncoding
insertion_type = 'NC'
gene_length = -1
relative_insertion_site = -1
prop_gene_insertion_in = -1
for gff_chrom in gff_dict:
if gff_chrom != chrom:
continue
for gene in gff_dict[gff_chrom]:
if pos >= gff_dict[gff_chrom][gene][0] and pos <= gff_dict[gff_chrom][gene][1]: # if the insertion falls within a gene
insertion_type = gene
gene_length = gff_dict[gff_chrom][gene][1] - gff_dict[gff_chrom][gene][0]+1
if gff_dict[gff_chrom][gene][2] == '+':
relative_insertion_site = pos - gff_dict[gff_chrom][gene][0]
else:
relative_insertion_site = gff_dict[gff_chrom][gene][1] - pos
if gene[:2] == 'sp' and split_loc_dict[chrom][strand][pos][barcode] > min_read_cutoff:
sp_genic_insertions+=1
elif gene[:2] == 'sc' and split_loc_dict[chrom][strand][pos][barcode] > min_read_cutoff:
sc_genic_insertions+=1
prop_gene_insertion_in = relative_insertion_site / float(gene_length)
wf.writelines(barcode+"\t"+chrom+"\t"+strand+"\t"+str(pos)+"\t"+insertion_type+"\t"+str(split_loc_dict[chrom][strand][pos][barcode])+"\t"+str(relative_insertion_site)+"\t"+str(prop_gene_insertion_in)+"\t"+str(gene_length)+"\n")
wf.close()
    f = open(out_filename)
    next(f) # skip the header line
    for line in f:
        fields = line.split("\t")
        if fields[1][:2] != 'pl': # scaffold column; plasmid scaffolds are assumed to begin with 'pl'
            continue
        if fields[4] == 'TN_right_arm':
            Rtn_reads+=int(fields[5]) # reads mapping to the tn right border
        elif fields[4] == 'TN_left_arm':
            Ltn_reads+=int(fields[5]) # reads mapping to the tn left border
        elif fields[4] == 'NC':
            plasmid_reads+=int(fields[5])
    f.close()
tot_genic_insertions = float(sc_genic_insertions+sp_genic_insertions)
wf = open(out_dir+read_file_name+"_mapping_stats", 'a')
wf.writelines("\n\nAnnotating insertions: \n")
wf.writelines("reads mapping to NC plasmid backbone: "+str(plasmid_reads)+"\n")
wf.writelines("reads mapping to TN right border: "+str(Rtn_reads)+"\n")
wf.writelines("reads mapping to TN left border: "+str(Ltn_reads)+"\n")
wf.writelines("total insertions: "+str(tot_insertions)+"\n")
wf.writelines("total insertions with >"+str(min_read_cutoff)+" reads: "+str(tot_min_insertions)+"\tscer: "+str(sc_insertions)+" ("+str(100*sc_insertions/tot_min_insertions)+"%)"+" spar: "+str(sp_insertions)+"("+str(100*sp_insertions/tot_min_insertions)+"%)\n")
print("\ntotal insertions: "+str(tot_insertions))
print("total genic insertions: "+str(tot_genic_insertions))
if tot_genic_insertions > 0:
wf.writelines("of these "+str(tot_insertions)+" insertions:\n")
wf.writelines("total genic insertions: "+str(sp_genic_insertions + sc_genic_insertions)+" ("+str(100*(sp_genic_insertions + sc_genic_insertions)/tot_min_insertions)+"% of insertions)\n")
wf.writelines("Scer genic insertions: "+str(sc_genic_insertions)+" ("+str(100*sc_genic_insertions/tot_genic_insertions)+"% of genic insertions)\n")
wf.writelines("Spar genic insertions: "+str(sp_genic_insertions)+" ("+str(100*sp_genic_insertions/tot_genic_insertions)+"% of genic insertions)\n")
else:
wf.writelines("no genic insertions\n")
wf.close()
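# Example _pooled_reads output row (hypothetical): an insertion at scVII:+:12345
# with 17 reads, 300 bp into a 1500 bp gene:
#   ACGTACGTACGTACGTACGT  scVII  +  12345  scYGR098C  17  300  0.2  1500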
def merge_all_tnseq(loc_dicts, merged_filename):
    '''merges the parsed _pooled_reads dictionaries, summing read counts for
    inserts (same barcode at same position) found in more than one library'''
    print("===merging_all_tnseq===")
    duplicate_reads = 0
    if len(loc_dicts) == 1:
        return
    else:
        merged_dict = loc_dicts[0]
        duplicate_insert_dict = {}
for loc_dict in loc_dicts[1:]:
for chrom in loc_dict:
if chrom not in merged_dict:
merged_dict[chrom] = {'+' : {}, '-' : {}}
for strand in loc_dict[chrom]:
for pos in loc_dict[chrom][strand]:
for barcode in loc_dict[chrom][strand][pos]:
total = loc_dict[chrom][strand][pos][barcode]
if pos not in merged_dict[chrom][strand]: # add an insert to the merged dict if not already one at that position
merged_dict[chrom][strand][pos]={}
merged_dict[chrom][strand][pos][barcode]=loc_dict[chrom][strand][pos][barcode]
elif barcode in merged_dict[chrom][strand][pos]: #if same barcode, add to total
merged_dict[chrom][strand][pos][barcode]+=total
duplicate_reads+=1
if barcode in duplicate_insert_dict:
duplicate_insert_dict[barcode][1]+=1
else:
duplicate_insert_dict[barcode]=[chrom+strand+str(pos), 2]
else:
merged_dict[chrom][strand][pos][barcode] = loc_dict[chrom][strand][pos][barcode]
wf=open(out_dir+merged_filename+"_duplicate_inserts", 'w')
wf.writelines("combined "+str(duplicate_reads)+" duplicate inserts (same barcode at same position) from different fastq\n\n")
wf.writelines("barcode\tposition\tnumber of times found\n")
for dupins in duplicate_insert_dict:
wf.writelines(dupins+"\t"+duplicate_insert_dict[dupins][0]+"\t"+str(duplicate_insert_dict[dupins][1])+"\n")
wf.close()
merged_dict = clean_barcodes(merged_dict,single_barcode=single_barcode, maskOffByOne=maskOffByOne, outfileprefix=merged_filename)
print("combined "+str(duplicate_reads)+" duplicate inserts (same barcode at same position) from different fastq")
wf = open(out_dir+merged_filename+"_mapping_stats", 'a')
wf.writelines('\n==='+merged_filename+'===\n')
wf.writelines("BASH COMMAND\n"+command_line_string+ "\nRUN DATETIME\n" + now.strftime("%m/%d/%Y, %H:%M:%S")+"\n")
wf.writelines("\nMerging libraries:\n")
wf.writelines("combined "+str(duplicate_reads)+" duplicate inserts (same barcode at same position) from different fastq")
wf.close()
        Annotate_insertions(merged_dict, merged_filename) # identify insertions in genes, and write the final pooled outfile for a given fastq
print ("...done annotating")
return
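# Example merge semantics (hypothetical): if library 1 has barcode BC1 with 5
# reads at scI:+:100 and library 2 has BC1 with 3 reads at the same position,
# the merged dict stores 8 reads for BC1 and the insert is logged once in the
# *_duplicate_inserts file.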
#### START PROGRAM ####
command_line_string = " ".join(sys.argv) # record the command line so it can be added to outfiles
start_time = time.time() # time before the operations start
now = datetime.datetime.now()
loc_dicts = []
for read_file in read_files:
    print('...parsing '+read_file)
loc_dict = parse_pooled_reads(read_file) #parse from already written file
loc_dicts.append(loc_dict)
if len(loc_dicts)>1:
print()
merged_filename = "merged_tnseq_"+now.strftime("%Y-%m-%d")+'.'
merge_all_tnseq(loc_dicts,merged_filename) #merge dictionaries and make final pooled outfile for all fastq
end_time = time.time() #Time after it finished
print("Merge took ", end_time-start_time, " seconds")