barseq_pool_from_mapped_blat_4-2-2020.py
import sys
import copy
import datetime
import time
# PARAMETERS # For diagnosing already mapped reads. Version 2-19-2020 MBA, modified from cweiss's map_and_pool_BLAT.py and scorad rbseq
mapping_genome = "/usr2/people/carlyweiss/Amended_Genomes/Amended_Genomes/Concatenated/D1373_Z1.fa" # location of the mapping reference (.fa; the .udb database is built from it)
min_ID = str(95) # minimum identity cutoff for mapping reads, expressed as a percentage (0-100)
min_id_pooling = .95 # minimum identity for reads at the pooling stage. Can be different than at mapping. Must be a decimal proportion (0-1); converted to a percentage below
gff = '/usr2/people/carlyweiss/SparScerinfo/YS2+CBS432+plasmid_clean' ## GFF to use for annotations.
min_read_cutoff = 0 ## this is only for reporting summary statistics
min_genome_read_length = 40
min_len_align = 40
single_barcode = False # set to True if analyzing a test dataset with only one barcode
maskOffByOne = True # when True, treats low-abundance barcodes that are off by one base as sequencing errors and removes them from analysis
min_id_pooling *= 100 # convert to a percentage so it matches the identity column of the mapped-reads file
# HELP #
if len(sys.argv) == 1:
    print("USAGE: python barseq_pool_from_mapped_blat_4-2-2020.py out_directory inFile1._mapped_reads inFile2._mapped_reads")
    exit()
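# Example invocation (directory and file names are hypothetical):
#   python barseq_pool_from_mapped_blat_4-2-2020.py ./pooled_out/ lib1_mapped_reads lib2_mapped_reads
# every argument after the output directory is one mapped-reads file from the BLAT/usearch step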
# INPUT #
out_dir = sys.argv[1]
read_files = sys.argv[2:]
# BEGIN FUNCTIONS #
def getNum_parsed_reads(fastq_filename):
    '''counts the lines in a file; the result is used only for summary reporting'''
    i = -1 # so an empty file returns 0 instead of raising NameError
    with open(fastq_filename) as f:
        for i, l in enumerate(f):
            pass
    return i + 1
def Filter_for_ID_and_multimapping_and_pool(num_parsed_reads):
    ''' modified from rh-seq
    filters out mapped reads that map below the ID threshold and reads that map to multiple locations, then pools reads into insertion sites
    also filters out reads with barcodes that show up more than once'''
    mapped_file = out_dir+fastq_filename # where the mapped reads are
    f = open(mapped_file)
    read_dict = {} # format of read_dict will be {barcode: [[chrom_strand_pos, ID], ...]}
    total_mapped_barcodes = 0
    reads_above_identity_threshold = 0
    multimapped_reads = 0
    all_barcode_list = set()
    number_repeat_barcodes = 0
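    # Assumed layout of each tab-separated line in the mapped-reads file, inferred from the
    # indexing below (columns not listed are unused here):
    #   [0] barcode  [1] target scaffold  [2] percent identity  [3] alignment length
    #   [6] alignment start within the read  [8] target start  [9] target end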
for line in f:
line = line.strip().split("\t")
bc = line[0]
if bc not in all_barcode_list: ## this counts the number of barcodes that mapped to at least one location in the genome
all_barcode_list.add(bc)
total_mapped_barcodes+=1
else:
number_repeat_barcodes+=1
ID = float(line[2])
if ID < min_id_pooling: ## filters out mapped reads below the minimum mapping identity
continue
len_align = float(line[3])
start_align = float(line[6])
if len_align < min_len_align or start_align > 3: # filters out mapped reads that are below the length of alignment threshold or where the alignment starts more than 3bps after the TTAA
continue
        reads_above_identity_threshold+=1 # counts reads above the identity and alignment-length thresholds in which the alignment starts within 3 bp of the read's TTAA
start = line[8]
end = line[9]
strand = '+'
        if int(start) < int(end): # compare numerically; these fields were parsed as strings
            strand = '-'
insertion_loc = start
        loc = line[1]+"__"+strand+"__"+insertion_loc ## an insertion's scaffold+strand+position. This was the identifier for an insert before barcodes were used in RH-Seq.
        # searches for reads that map to more than 1 location #
if bc in read_dict:
read_dict[bc].append([loc, ID]) ## appends the read location and %ID of mapping in case of multiple mappings of the same read
            # multimapped_reads+=2 # as long as the usearch multimapping parameter is set to a maximum of 2 reads, this will work.
else:
read_dict[bc] = [[loc,ID]]
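    # Illustrative read_dict entry (barcode and values are made up):
    #   read_dict['ACGTACGTACGTACGTACGT'] = [['scI__+__12345', 98.6]]
    # a second [loc, ID] element would mean the same barcode mapped to two locations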
print("...done making the read_dict...")
barcode_lengths = {} #collects stats on how long the barcodes in the library were
for barcode in read_dict.keys():
if len(barcode) in barcode_lengths.keys():
barcode_lengths[len(barcode)]+=1
else:
barcode_lengths[len(barcode)]=1
    loc_dict = {} # format of loc_dict is {position: {barcode: occurrences}}
number_of_positions_with_multiple_barcodes=0
for barcode in read_dict.keys():
for map_loc in range(len(read_dict[barcode])):
loc_name=read_dict[barcode][map_loc][0]
if loc_name in loc_dict:
if barcode in loc_dict[loc_name].keys():
loc_dict[loc_name][barcode]+=1
else:
loc_dict[loc_name][barcode]=1
                    number_of_positions_with_multiple_barcodes+=1
else:
loc_dict[loc_name]={barcode:1}
print("out of the total reads parsed from the blat mapping outfile, there were "+str(total_mapped_barcodes+number_repeat_barcodes)+" reads with barcodes")
print("--total number of different barcodes that were mapped: " + str(total_mapped_barcodes)+" ("+ str(100*total_mapped_barcodes/(total_mapped_barcodes+number_repeat_barcodes))[:4]+"% of parsed reads with barcodes )")
print("--number of times the script finds a read containing a barcode it had already encountered: "+str(number_repeat_barcodes)+" ("+ str(100*number_repeat_barcodes/(total_mapped_barcodes+number_repeat_barcodes))[:4]+"% of parsed reads with barcodes)\n")
print("...now, doing some filtering...")
print("reads where the genomic portion passes mapping identity cutoff: "+str(reads_above_identity_threshold)+" ("+ str(100*reads_above_identity_threshold/(total_mapped_barcodes+number_repeat_barcodes))[:4]+"% of parsed reads with barcodes )")
print("--remaining barcodes mapping to a genomic location: "+str(len(read_dict))+" ("+ str(100*len(read_dict)/total_mapped_barcodes)[:4]+"% of barcodes encountered)")
print("genomic locations where more than one barcode inserted: "+str(number_of_positions_with_multiple_barcodes)+"\n")
wf = open(out_dir+fastq_filename+"_mapping_stats", 'a')
wf.writelines("\nout of the total reads parsed from the blat mapping outfile, there were "+str(total_mapped_barcodes+number_repeat_barcodes)+" reads with barcodes\n")
wf.writelines("total number of different barcodes that were mapped: " + str(total_mapped_barcodes)+" ("+ str(100*total_mapped_barcodes/(total_mapped_barcodes+number_repeat_barcodes))[:4]+"% of parsed reads with barcodes)\n")
wf.writelines("number of times the script encounters a barcode it had already seen: "+str(number_repeat_barcodes)+" ("+ str(100*number_repeat_barcodes/(total_mapped_barcodes+number_repeat_barcodes))[:4]+"% of parsed reads with barcodes)\n")
wf.writelines("reads where the genomic portion passed mapping identity cutoff: "+str(reads_above_identity_threshold)+" ("+ str(100*reads_above_identity_threshold/(total_mapped_barcodes+number_repeat_barcodes))[:4]+"% of parsed reads with barcodes)\n")
wf.writelines("remaining barcodes mapping to a location: "+str(len(read_dict))+" ("+ str(100*len(read_dict)/total_mapped_barcodes)[:4]+"% of barcodes encountered)\n")
wf.writelines("genomic locations where more than one barcode inserted: "+str(number_of_positions_with_multiple_barcodes))
wf.close()
return loc_dict, len(read_dict)
def Combine_near_mappings(loc_dict, reads_remaining):
'''modified from rh-seq, combines insertion sites that are within 3 bases of each other. reads are assigned to the site with the initial max number of reads'''
split_loc_dict = {} # will hold hierarchical data on each insertion site. keys for nested dictionaries are scaffold, strand, position, barcode, value is # reads mapping there.
    for full_location in loc_dict: ## loc_dict holds the identifier for an insertion site as key and the reads mapping to that site as a value
for barcode in loc_dict[full_location]:
number_of_reads = loc_dict[full_location][barcode]
chrom = full_location.split("__")[0]
strand = full_location.split("__")[1]
pos = int(full_location.split("__")[2])
# initialize the dictionary #
if chrom not in split_loc_dict:
split_loc_dict[chrom] = {'+' : {}, '-' : {}}
if pos not in split_loc_dict[chrom][strand]:
split_loc_dict[chrom][strand][pos] = {barcode: number_of_reads}
if barcode not in split_loc_dict[chrom][strand][pos]:
split_loc_dict[chrom][strand][pos][barcode] = number_of_reads
    # sorts the insertion positions and combines reads forward, then reverses the sorting and combines in the opposite direction. #
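    # Worked example (numbers are made up): with nearby positions 100:{bc: 5} and 102:{bc: 12} on
    # one strand, the forward pass sees 102 - 100 < 4 and 12 >= 5, so the 5 reads at 100 move to
    # 102, leaving 102:{bc: 17}; the reverse pass then merges any remaining near-neighbors the
    # other way.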
reads_moved = 0
for chrom in split_loc_dict:
for strand in split_loc_dict[chrom]:
sorted_positions = sorted(split_loc_dict[chrom][strand]) #for each chromosome, and for each strand, sorts position
first ='y'
for pos in sorted_positions: #then for each of the positions on a given strand on a given chromosome
            if first == 'y': # skip the first position; there is no earlier position to compare against yet
first = 'n'
last = pos
continue
if int(pos) - int(last) < 4:
delete_from_last=[]
for barcode in split_loc_dict[chrom][strand][last]:
if barcode in split_loc_dict[chrom][strand][pos]:
if split_loc_dict[chrom][strand][pos][barcode]>= split_loc_dict[chrom][strand][last][barcode]:
split_loc_dict[chrom][strand][pos][barcode]+=split_loc_dict[chrom][strand][last][barcode]
reads_moved+=split_loc_dict[chrom][strand][last][barcode]
delete_from_last.append(barcode) #remove barcodes whose reads moved to a higher peak from 'last'
for barcode in delete_from_last:
del split_loc_dict[chrom][strand][last][barcode]
last = pos
sorted_positions = sorted(split_loc_dict[chrom][strand])
sorted_positions.reverse()
first ='y'
for pos in sorted_positions:
if first == 'y':
first = 'n'
last = pos
continue
if abs(int(pos) - int(last)) < 4:
delete_from_last=[]
for barcode in split_loc_dict[chrom][strand][last]:
if barcode in split_loc_dict[chrom][strand][pos]:
if split_loc_dict[chrom][strand][pos][barcode] >= split_loc_dict[chrom][strand][last][barcode]:
split_loc_dict[chrom][strand][pos][barcode]+=split_loc_dict[chrom][strand][last][barcode]
reads_moved+=split_loc_dict[chrom][strand][last][barcode]
delete_from_last.append(barcode)
for barcode in delete_from_last:
del split_loc_dict[chrom][strand][last][barcode]
last = pos
print("remaining reads moved to higher peak when script combines near mappings: ", str(reads_moved))
wf = open(out_dir+fastq_filename+"_mapping_stats", 'a')
wf.writelines("\n\nCombining near mappings: \n")
wf.writelines("remaining reads moved to higher peak: "+str(reads_moved)+"\n")
wf.close()
return split_loc_dict
def remove_multibarring(split_loc_dict, mapped_reads, outfileprefix="fastq_filename_here"):
    '''removes barcodes that map to multiple independent insertions,
    since those aren't useful for downstream analysis'''
    remaining_barcodes = set() # set membership tests are O(1); a list here would make this pass quadratic
    duplicate_barcodes = {}
#goes through dict and gets barcodes, finds duplicates
for chrom in split_loc_dict:
print("...discovering barcodes that map to multiple locations from "+chrom+"...")
for strand in split_loc_dict[chrom]:
for pos in split_loc_dict[chrom][strand]:
for bc in split_loc_dict[chrom][strand][pos]:
                    if bc in remaining_barcodes:
                        if bc in duplicate_barcodes:
                            duplicate_barcodes[bc]+=1
                        else:
                            duplicate_barcodes[bc]=2
                    else:
                        remaining_barcodes.add(bc)
unique_split_loc_dict = copy.deepcopy(split_loc_dict)
#goes through dict and removes duplicate barcodes from further analysis
for chrom in split_loc_dict:
print("...discovering barcodes that map to multiple locations from "+chrom+"...")
for strand in split_loc_dict[chrom]:
for pos in split_loc_dict[chrom][strand]:
for bc in split_loc_dict[chrom][strand][pos]:
if bc in duplicate_barcodes:
del unique_split_loc_dict[chrom][strand][pos][bc]
#writes separate outfile with duplicate barcodes
    wf = open(out_dir+outfileprefix+"_multimapping_barcodes", 'w')
wf.writelines("barcode\t#of mapped locations\n")
if len(duplicate_barcodes.keys()) > 0:
for dupbar in duplicate_barcodes.keys():
wf.writelines(dupbar+"\t"+str(duplicate_barcodes[dupbar])+"\n")
wf.close()
#writes number nonunique to mapping stats file
number_nonunique = len(duplicate_barcodes)
wf = open(out_dir+outfileprefix+"_mapping_stats", 'a')
wf.writelines("\n\nRemoving non-unique barcodes: \n")
wf.writelines("removed "+str(number_nonunique)+" non-unique barcode insertions\n")
wf.close()
print("removed "+str(number_nonunique)+" non-unique barcode insertions")
return unique_split_loc_dict
def OffByOneList(seq):
'''based on rbdnaseq pipeline, this
generates a list of sequences off by one from the input
use this in maskOffByOne to mask barcodes that are off by one due to sequencing error'''
if seq[0] in ("A","T","G","C"):
char_set = ("A","T","G","C")
elif seq[0] in ("a","t","g","c"):
char_set = ("a","t","g","c")
else:
return False
variants = []
    for chari in range(len(seq)):
if chari == 0:
preseq = ""
else:
preseq = seq[0:chari]
if chari == len(seq)-1:
postseq = ""
else:
postseq = seq[chari+1:]
for char in char_set:
if seq[chari] != char:
variants.append(preseq+char+postseq)
    return variants
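# Illustrative call: OffByOneList("AC") returns
#   ['TC', 'GC', 'CC', 'AA', 'AT', 'AG']
# i.e. every single-base substitution of the input, position by position.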
def MaskOffByOne(split_loc_dict, mapped_reads, outfileprefix="fastqfilename"):
'''modified from rbdnaseq pipeline'''
masked_split_loc_dict = copy.deepcopy(split_loc_dict)
totals = {}
for chrom in split_loc_dict: #build a dictionary of barcode-to-total-read
for strand in split_loc_dict[chrom]:
for pos in split_loc_dict[chrom][strand]:
for barcode in split_loc_dict[chrom][strand][pos]:
total = split_loc_dict[chrom][strand][pos][barcode]
if barcode in totals:
totals[barcode]+=total
else:
totals[barcode]=total
offByOneList = [ ]
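    # Masking rule, illustrated with made-up counts: if totals['AAAA...'] is 50000 and
    # totals['AAAT...'] is 3, the rare barcode is treated as a sequencing error of the abundant
    # one (50000 > 3*100) and is removed from the masked dictionary below.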
for chrom in split_loc_dict: #go through and remove barcodes where an off-by-one variant is 100 times more common
print("...masking off-by-one barcodes in chromosome "+chrom+"...")
for strand in split_loc_dict[chrom]:
for pos in split_loc_dict[chrom][strand]:
for barcode in split_loc_dict[chrom][strand][pos]:
variants = OffByOneList(barcode)
## if barcode != 'TAAGCAACCTCGGCGCATAG':
## print(barcode)
## print(variants)
## print('TAAGCAACCTCGGCGCATAG' in variants)
## print(totals[barcode],totals['TAAGCAACCTCGGCGCATAG'])
offByOne = False
                    try:
                        for variantBarcode in variants:
                            if (not offByOne) and (variantBarcode in totals):
                                if (totals[variantBarcode] > totals[barcode]*100):
                                    offByOneList.append(barcode)
                                    del masked_split_loc_dict[chrom][strand][pos][barcode]
                                    offByOne = True
                    except TypeError: # OffByOneList returns False for non-ACGT input, which is not iterable; skip such barcodes rather than aborting
                        pass
print("masked "+str(len(offByOneList))+" off-by-one barcodes")
wf = open(out_dir+outfileprefix+"_mapping_stats", 'a')
wf.writelines("\n\nMasking off-by-one barcodes: \n")
wf.writelines("masked "+str(len(offByOneList))+" off-by-one barcodes \n")
wf.close()
return masked_split_loc_dict
def clean_barcodes(split_loc_dict, mapped_reads, single_barcode=False, maskOffByOne=True, outfileprefix="fastq_filename_here"):
    out_name = outfileprefix
    if single_barcode == False:
        split_loc_dict = remove_multibarring(split_loc_dict, mapped_reads, outfileprefix=out_name) # up till 4-2, this was outfileprefix=fastqfilename, which caused it to write to the wrong file during Merge
        # print("...done removing non-unique barcodes...")
    if maskOffByOne == True:
        split_loc_dict = MaskOffByOne(split_loc_dict, mapped_reads, outfileprefix=out_name) # reassign so the masked dictionary, not the unmasked one, is returned
        # print("...done masking off-by-one barcodes...")
    return split_loc_dict
def Annotate_insertions(split_loc_dict, mapped_reads, fastq_filename):
    out_filename = out_dir+fastq_filename+"_pooled_reads" # the final output; this will hold the pooled insertion table
wf = open(out_filename,'w')
wf.writelines("ID\tscaffold\tstrand\tlocation\tannotation\tn\trel_loc\trel_prop\tgene_length\n")
sc_insertions = 0
sp_insertions = 0
sc_genic_insertions = 0
sp_genic_insertions = 0
tot_insertions = 0
tot_min_insertions = 0.0
plasmid_reads = 0 # reads mapping to the plasmid
Rtn_reads = 0 # reads mapping to the tn right border
Ltn_reads = 0 # reads mapping to the tn left border
gff_dict = {}
f = open(gff)
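    # Assumed column order of the annotation file, inferred from the indexing below
    # (this is a custom layout, not standard GFF3 column order):
    #   [0] gene name  [1] scaffold  [2] feature type  [3] strand  [4] start  [5] end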
for line in f:
line = line.split("\t")
chrom = line[1]
gene = line[0]
start = int(line[4])
end = int(line[5])
strand = line[3]
        feature_type = line[2] # feature type column; unused below
# put the annotation information in a dictionary #
if chrom not in gff_dict:
gff_dict[chrom] = {}
gff_dict[chrom][gene] = [start, end, strand]
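        # e.g. gff_dict['scI']['scFLO1'] = [start, end, strand] (names illustrative)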
# Search through the dictionary for insertions that fall within genes
for chrom in split_loc_dict:
for strand in split_loc_dict[chrom]:
for pos in split_loc_dict[chrom][strand]:
for barcode in split_loc_dict[chrom][strand][pos]:
if split_loc_dict[chrom][strand][pos][barcode] > min_read_cutoff:
tot_min_insertions+=1
if chrom[:2] == 'sc':
sc_insertions+=1
elif chrom[:2] == 'sp':
sp_insertions+=1
tot_insertions+=1
# set defaults for noncoding
insertion_type = 'NC'
gene_length = -1
relative_insertion_site = -1
prop_gene_insertion_in = -1
for gff_chrom in gff_dict:
if gff_chrom != chrom:
continue
for gene in gff_dict[gff_chrom]:
if pos >= gff_dict[gff_chrom][gene][0] and pos <= gff_dict[gff_chrom][gene][1]: # if the insertion falls within a gene
insertion_type = gene
gene_length = gff_dict[gff_chrom][gene][1] - gff_dict[gff_chrom][gene][0]+1
if gff_dict[gff_chrom][gene][2] == '+':
relative_insertion_site = pos - gff_dict[gff_chrom][gene][0]
else:
relative_insertion_site = gff_dict[gff_chrom][gene][1] - pos
if gene[:2] == 'sp' and split_loc_dict[chrom][strand][pos][barcode] > min_read_cutoff:
sp_genic_insertions+=1
elif gene[:2] == 'sc' and split_loc_dict[chrom][strand][pos][barcode] > min_read_cutoff:
sc_genic_insertions+=1
prop_gene_insertion_in = relative_insertion_site / float(gene_length)
wf.writelines(barcode+"\t"+chrom+"\t"+strand+"\t"+str(pos)+"\t"+insertion_type+"\t"+str(split_loc_dict[chrom][strand][pos][barcode])+"\t"+str(relative_insertion_site)+"\t"+str(prop_gene_insertion_in)+"\t"+str(gene_length)+"\n")
wf.close()
f = open(out_filename)
for line in f:
if line[:2] != 'pl':
continue
line = line.split("\t")
if line[4] == 'TN_right_arm':
Rtn_reads+=int(line[5]) # reads mapping to the tn right border
elif line[4] == 'TN_left_arm':
            Ltn_reads+=int(line[5]) # reads mapping to the tn left border
elif line[4] == 'NC':
plasmid_reads+=int(line[5])
f.close()
tot_genic_insertions = float(sc_genic_insertions+sp_genic_insertions)
wf = open(out_dir+fastq_filename+"_mapping_stats", 'a')
wf.writelines("\n\nAnnotating insertions: \n")
wf.writelines("reads mapping to NC plasmid backbone: "+str(plasmid_reads)+" ("+str(100*plasmid_reads/mapped_reads)+"%) of mapped reads\n")
wf.writelines("reads mapping to TN right border: "+str(Rtn_reads)+" ("+str(100*Rtn_reads/mapped_reads)+"%) of mapped reads\n")
wf.writelines("reads mapping to TN left border: "+str(Ltn_reads)+" ("+str(100*Ltn_reads/mapped_reads)+"%) of mapped reads\n")
wf.writelines("total insertions: "+str(tot_insertions)+"\n")
wf.writelines("total insertions with >"+str(min_read_cutoff)+" reads: "+str(tot_min_insertions)+"\tscer: "+str(sc_insertions)+" ("+str(100*sc_insertions/tot_min_insertions)+"%)"+" spar: "+str(sp_insertions)+"("+str(100*sp_insertions/tot_min_insertions)+"%)\n")
print("\ntotal insertions: "+str(tot_insertions))
print("total genic insertions: "+str(tot_genic_insertions))
if tot_genic_insertions > 0:
wf.writelines("of these "+str(tot_insertions)+" insertions:\n")
wf.writelines("total genic insertions: "+str(sp_genic_insertions + sc_genic_insertions)+" ("+str(100*(sp_genic_insertions + sc_genic_insertions)/tot_min_insertions)+"% of insertions)\n")
wf.writelines("Scer genic insertions: "+str(sc_genic_insertions)+" ("+str(100*sc_genic_insertions/tot_genic_insertions)+"% of genic insertions)\n")
wf.writelines("Spar genic insertions: "+str(sp_genic_insertions)+" ("+str(100*sp_genic_insertions/tot_genic_insertions)+"% of genic insertions)\n")
else:
wf.writelines("no genic insertions\n")
wf.close()
def merge_all_tnseq(loc_dicts,merged_filename):
print("===merging_all_tnseq==")
duplicate_reads = 0
if len(loc_dicts)==1:
return
else:
merged_dict = loc_dicts[0]
duplicate_insert_dict = {}
for loc_dict in loc_dicts[1:]:
for chrom in loc_dict:
if chrom not in merged_dict:
merged_dict[chrom] = {'+' : {}, '-' : {}}
for strand in loc_dict[chrom]:
for pos in loc_dict[chrom][strand]:
for barcode in loc_dict[chrom][strand][pos]:
total = loc_dict[chrom][strand][pos][barcode]
if pos not in merged_dict[chrom][strand]: # add an insert to the merged dict if not already one at that position
merged_dict[chrom][strand][pos]={}
merged_dict[chrom][strand][pos][barcode]=loc_dict[chrom][strand][pos][barcode]
elif barcode in merged_dict[chrom][strand][pos]: #if same barcode, add to total
merged_dict[chrom][strand][pos][barcode]+=total
duplicate_reads+=1
if barcode in duplicate_insert_dict:
duplicate_insert_dict[barcode][1]+=1
else:
duplicate_insert_dict[barcode]=[chrom+strand+str(pos), 2]
else:
merged_dict[chrom][strand][pos][barcode] = loc_dict[chrom][strand][pos][barcode]
wf=open(out_dir+merged_filename+"_duplicate_inserts", 'w')
wf.writelines("combined "+str(duplicate_reads)+" duplicate inserts (same barcode at same position) from different fastq\n\n")
wf.writelines("barcode\tposition\tnumber of times found\n")
for dupins in duplicate_insert_dict:
wf.writelines(dupins+"\t"+duplicate_insert_dict[dupins][0]+"\t"+str(duplicate_insert_dict[dupins][1])+"\n")
wf.close()
merged_dict = clean_barcodes(merged_dict, reads_remaining,single_barcode=single_barcode, maskOffByOne=maskOffByOne, outfileprefix=merged_filename)
print("combined "+str(duplicate_reads)+" duplicate inserts (same barcode at same position) from different fastq")
wf = open(out_dir+merged_filename+"_mapping_stats", 'a')
wf.writelines('\n==='+merged_filename+'===\n')
wf.writelines("BASH COMMAND\n"+command_line_string+ "\nRUN DATETIME\n" + now.strftime("%m/%d/%Y, %H:%M:%S")+"\n")
wf.writelines("\nMerging libraries:\n")
wf.writelines("combined "+str(duplicate_reads)+" duplicate inserts (same barcode at same position) from different fastq")
wf.close()
    Annotate_insertions(merged_dict, reads_remaining, merged_filename) # identify insertions in genes, and write the final pooled outfile for the merged data
print ("...done annotating")
return
#### START PROGRAM ####
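# Pipeline overview (per mapped-reads file): count input lines for reporting; filter reads by
# identity and alignment length and pool them into insertion sites; combine sites within 3 bp of
# each other; drop multi-mapping barcodes and mask off-by-one barcodes; then annotate insertions
# against the GFF. If more than one file was given, the per-file dictionaries are merged and
# annotated again as one pooled dataset.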
command_line_string = "" #get command line so can add to outfiles
for argv in sys.argv[:]:
command_line_string+=(str(argv)+" ")
start_time = time.time() # time before the operations start
now = datetime.datetime.now()
loc_dicts = []
for read_file in read_files:
fastq_filename = read_file.split("/")[-1] #get file name
print('\n==='+fastq_filename+'===\n')
wf = open(out_dir+fastq_filename+"_mapping_stats", 'a')
wf.writelines('\n==='+fastq_filename+'===\n')
wf.writelines(command_line_string+ " run on " + now.strftime("%m/%d/%Y, %H:%M:%S")+"\n")
wf.close()
    num_parsed_reads = getNum_parsed_reads(read_file) # count lines in the mapped-reads file as given on the command line
loc_dict, reads_remaining = Filter_for_ID_and_multimapping_and_pool(num_parsed_reads) # filters out reads below the identity threshold, that map to multiple locations and then pools insertions
loc_dict = Combine_near_mappings(loc_dict, reads_remaining) # combine insertions within 3bp of each other
loc_dict = clean_barcodes(loc_dict, reads_remaining,single_barcode=single_barcode, maskOffByOne=maskOffByOne,outfileprefix=fastq_filename)
    Annotate_insertions(loc_dict, reads_remaining, fastq_filename) # identify insertions in genes, and write the final pooled outfile for a given fastq
print ("...done annotating")
loc_dicts.append(loc_dict)
if len(loc_dicts)>1:
print()
merged_filename = "merged_tnseq_"+now.strftime("%Y-%m-%d")+'.'
merge_all_tnseq(loc_dicts,merged_filename) #merge dictionaries and make final pooled outfile for all fastq
end_time = time.time() #Time after it finished
print("Analysis took ", end_time-start_time, " seconds")