-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_fastqQC.py
214 lines (144 loc) · 5.67 KB
/
parse_fastqQC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/python3.5
# 2018-06-20; Carolina Monzo
# Script to parse fastqc results and get general statistics
import os
import json
import argparse
import glob
import datetime
import subprocess
import re
import pandas as pd
# List of fastqc module names, used as dataframe column headers (twelve
# PASS/WARN/FAIL module flags plus the sample file name)
_HEADER = ["Basic_statistics",
"Per_base_sequence_quality",
"Per_tile_sequence_quality",
"Per_sequence_quality_scores",
"Per_base_sequence_content",
"Per_sequence_GC_content",
"Per_base_N_content",
"Sequence_Length_Distribution",
"Sequence_Duplication_Levels",
"Overrepresented_sequences",
"Adapter_Content",
"Kmer_Content",
"File_name"
]
def parseArguments():
    '''
    Function to parse command line arguments
    Input: path to project (--project_path / -p, optional)
    Output: parsed arguments from command line
    '''
    # Create argument parser class
    # NOTE: the description previously said "Trimm fastq files" — a copy-paste
    # from the trimming script; this script parses fastqc results.
    parser = argparse.ArgumentParser(description = "Parse fastqc results and get general statistics")
    # Define arguments to parse
    parser.add_argument("--project_path", "-p", required = False, type = str, help = "Argument to set the path to the project")
    # Call for arguments
    args = parser.parse_args()
    return(args)
def get_config(project_path):
    """
    Load the most recent config dictionary for the project.
    Input: project_path - path to the project directory (with trailing slash)
    Output: config dictionary parsed from the newest config_*.json file
    Raises: IndexError if no config_*.json file exists under project_path
    """
    config_list = []
    for file in glob.glob(project_path + "config_*.json"):
        config_list.append(file)
    # BUGFIX: glob already returns paths prefixed with project_path, so the
    # selected match must NOT be joined with project_path a second time
    # (the old code produced "<project_path><project_path>config_*.json").
    # Timestamped names sort lexicographically, so the last one is the newest.
    config_pwd = os.path.normpath(sorted(config_list)[-1])
    with open(config_pwd, "r") as jconfig:
        config = json.load(jconfig)
    return(config)
def get_stat_files(config, path):
    """
    Collect the fastqc summary files below a directory.
    Input: config dictionary (unused here, kept for interface consistency)
           and the directory to search
    Output: list of full paths to every summary.txt found under path
    """
    # Recursively walk the tree and keep only the fastqc summary files
    return [os.path.join(parent, name)
            for parent, _, filenames in os.walk(path)
            for name in filenames
            if name == "summary.txt"]
def get_stats(config, files):
    """
    Read fastqc summary.txt files and collect the per-module status flags.
    Input: config dictionary (for the output directory) and list of
           summary.txt paths
    Output: pandas dataframe with one row per summary file; also writes the
            table to a timestamped tsv inside the fastq_QC directory
    """
    rows = []
    run_stamp = datetime.datetime.now().strftime("%Y%m%d_%H-%M-%S")
    # Output file name is derived from the directory of the first summary file
    stats_file = files[0].split("/")[-2].split("_")[-2] + "_general_stats_{}.tsv".format(run_stamp)
    for summary_path in files:
        # First tab-separated column of each line holds the PASS/WARN/FAIL flag
        with open(summary_path, "r") as handle:
            row = [line.split("\t")[0] for line in handle]
        # Sample name: directory name truncated at the "_merged" suffix
        row.append(summary_path.split("/")[-2].split("_merged")[0])
        rows.append(row)
    df = pd.DataFrame(rows, columns = _HEADER)
    # Write general statistic files to a csv file to have all stats per file
    df.to_csv("{}{}".format(config["paths"]["fastq_QC"], stats_file), sep="\t", index=False)
    print("[INFO]: STATS_FILE - {}{}".format(config["paths"]["fastq_QC"], stats_file))
    return(df)
def summary_stats(config, df, fi):
    """
    Summarize PASS/WARN/FAIL counts per fastqc statistic.
    Input: config dictionary (unused, kept for interface consistency),
           per-sample statistics dataframe, open file handle to append to
    Output: none; appends the grouped counts to fi as tab-separated text
    """
    # Long format: one (statistic, status) row per sample and module
    melted_data = pd.melt(df, value_vars = _HEADER[:-1], var_name = "statistics", value_name = "summary")
    # Count occurrences of each status within each statistic
    ab = melted_data.groupby(by=["statistics", "summary"])["summary"].count()
    # BUGFIX: removed a second groupby().size().reset_index() call whose
    # result was discarded — it was dead code with no effect on the output.
    ab.to_csv(fi, mode = "a", sep = '\t')
def reshape_summary_file(config, df_init):
    """
    Function to reshape data and format the output csv file
    Input: config dictionary (for the fastq_QC path) and dataframe to reshape
    Output: formatted dataframe with PASS/WARN/FAIL counts per statistic
    """
    str_file = "temp_{}.txt".format(datetime.datetime.now().strftime("%Y%m%d_%H-%M-%S"))
    header = ["statistics", "status", "count"]
    df_list = []
    temp_path = "{}{}".format(config["paths"]["fastq_QC"], str_file)
    # We dont have a dataframe, we have a series object that stops
    # modifications, so round-trip it through a temporal text file
    with open(temp_path, "a") as fi:
        summary_stats(config, df_init, fi)
    with open(temp_path, "r") as ca:
        lines = ca.read().splitlines()
    # BUGFIX: use os.remove instead of os.system("rm ..."); it is portable
    # and avoids passing an unquoted path through a shell
    os.remove(temp_path)
    for line in lines:
        df_list.append(line.split("\t"))
    # Create a dataframe object with the data so it can be manipulated
    df = pd.DataFrame(df_list, columns = header)
    # Shape the data adequately: one row per statistic, one column per status
    df = df.pivot(index = "statistics", columns = "status", values = "count").fillna(0).reset_index()
    df = df[["PASS", "WARN", "FAIL", "statistics"]]
    return(df)
def main():
    """
    Entry point: collect fastqc summaries for merged and trimmed fastq
    files and write a combined PASS/WARN/FAIL summary tsv to fastq_QC.
    """
    args = parseArguments()
    config = get_config(args.project_path)
    # Get lists of summary.txt files for both processing stages
    merged_files = get_stat_files(config, config["paths"]["fastq_QC_merged"])
    trimmed_files = get_stat_files(config, config["paths"]["fastq_QC_trimmed"])
    merged_df = get_stats(config, merged_files)
    trimmed_df = get_stats(config, trimmed_files)
    # Reparse to get a simpler file with both results
    summary_name = "summary_test_{}.tsv".format(datetime.datetime.now().strftime("%Y%m%d_%H-%M-%S"))
    reshaped_merged = reshape_summary_file(config, merged_df)
    reshaped_trimmed = reshape_summary_file(config, trimmed_df)
    with open("{}{}".format(config["paths"]["fastq_QC"], summary_name), "a") as fi:
        fi.write("STATISTICS_FASTQ_MERGED\n")
        reshaped_merged.to_csv(fi, mode = "a", sep = '\t', index = False)
        fi.write("\n\nSTATISTICS_FASTQ_MERGED_TRIMMED\n")
        reshaped_trimmed.to_csv(fi, mode = "a", sep = "\t", index = False)
    print("[INFO]: SUMMARY_STATS_FILE - {}{}".format(config["paths"]["fastq_QC"], summary_name))
if __name__ == '__main__':
    main()