-
Notifications
You must be signed in to change notification settings - Fork 4
/
deriveAllStats.py
executable file
·124 lines (112 loc) · 3.26 KB
/
deriveAllStats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/bin/env python
'''
dervieAllStats.py
Copyright (c) 2015, David Edwards, Bernie Pope, Kat Holt
All rights reserved. (see README.txt for more details)
collates the general statistics for set of reads that have gone through
the pipeline - pangenome run
example:
python deriveAllStats.py <isolate>_rep_cover.txt
Created: 29042013
Modified: 05032014
author: David Edwards
'''
import sys
from pipe_utils import splitPath
output_allStats = ""
repCoverFile = sys.argv[1]
repCover = open(repCoverFile, "r")
(prefix, middle, ext) = splitPath(repCoverFile)
name = middle[:-10]
output_allStats += name +"\t"
name = prefix + "/" + name
#get % cover and depth for each replicon
replicon_names = []
depth_test_value = 0.0
cover = ""
depth = ""
for line in repCover:
entry = line.split()
replicon_names.append(entry[0])
depth += entry[2] +"\t"
cover += entry[3] +"\t"
output_allStats += cover + depth
#get % mapped stats for each replicon and the total
replicons_mapped = []
repMapped = open(name + "_samStats.txt")
percent_A = percent_G = percent_T = percent_C = percent_N = '0'
insert_mean = ""
insert_stdev = ""
length_max = ""
base_qual_mean = ""
base_qual_stdev = ""
for line in repMapped:
if line.startswith('reads'):
entry = line.split()
total_reads = entry[1]
elif line.startswith('mapped reads'):
entry = line.split()
mapped_reads = entry[2]
elif line.startswith('len max'):
entry = line.split()
length_max = entry[2]
elif line.startswith('insert mean'):
entry = line.split()
insert_mean = entry[2]
elif line.startswith('insert stdev'):
entry = line.split()
insert_stdev = entry[2]
elif line.startswith('base qual mean'):
entry = line.split()
base_qual_mean = entry[3]
elif line.startswith('base qual stdev'):
entry = line.split()
base_qual_stdev = entry[3]
elif line.startswith('%'):
line = line[1:]
if line.startswith('A\t'):
entry = line.split()
percent_A = entry[1]
elif line.startswith('G\t'):
entry = line.split()
percent_G = entry[1]
elif line.startswith('C\t'):
entry = line.split()
percent_C = entry[1]
elif line.startswith('T\t'):
entry = line.split()
percent_T = entry[1]
elif line.startswith('N\t'):
entry =line.split()
percent_N = entry[1]
else:
for replicon in replicon_names:
if line.startswith(replicon):
entry = line.split()
replicons_mapped.append([replicon, entry[1]])
if insert_mean == "":
insert_mean = "-"
if insert_stdev == "":
insert_stdev = "-"
if length_max == "":
length_max = "_"
if base_qual_mean == "":
base_qual_mean = "_"
if base_qual_stdev == "":
base_qual_stdev = "_"
mapped_core_rep = 0.0
for replicon in replicon_names:
value = ""
for mapped in replicons_mapped:
if replicon == mapped[0]:
value = mapped[1]
if value == "":
output_allStats += '0.0\t'
else:
output_allStats += str((float(value)/100)*(float(mapped_reads)*100/float(total_reads))) +"\t"
output_allStats += str(float(mapped_reads)*100.0/float(total_reads)) +"\t"+ total_reads +"\t"
output_allStats += insert_mean +"\t"+ insert_stdev +"\t"+ length_max +"\t"+ base_qual_mean +"\t"+ base_qual_stdev +"\t"
output_allStats += percent_A +"\t"+ percent_T +"\t"+ percent_C +"\t"+ percent_G +"\t"+ percent_N + "\n"
out_allStats = open(name + "_AllStats.txt", "w")
out_allStats.write(output_allStats)
out_allStats.close()