-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_mapping_metrics.py
148 lines (91 loc) · 3.87 KB
/
parse_mapping_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/python3.5
# 2018-06-21; Carolina Monzo
# Script to parse mapping qc results and get general statistics
import os
import json
import argparse
import glob
import datetime
import subprocess
import re
import pandas as pd
def parseArguments():
    """
    Build the command-line parser and parse the current arguments.

    Input: none (argparse reads the process command line)
    Output: argparse namespace; its ``project_path`` attribute holds the
            string given with --project_path / -p, or None when the option
            is omitted (the option is not required)
    """
    cli = argparse.ArgumentParser(description="Parse mapping QC files")
    cli.add_argument(
        "--project_path",
        "-p",
        required=False,
        type=str,
        help="Argument to set the path to the project",
    )
    return cli.parse_args()
def get_config(project_path):
    """
    Load the config dictionary of a project.

    Files named ``config_*.json`` are searched directly inside
    project_path; when several match, the one whose path sorts last is
    loaded.

    Input: path to the project directory, with or without a trailing
           separator; None or "" searches the current working directory
    Output: dictionary decoded from the selected JSON file
    Raises: FileNotFoundError when no config_*.json file is found
    """
    # os.path.join inserts the separator only when it is missing, so both
    # "/project" and "/project/" resolve to the same pattern.
    pattern = os.path.join(project_path or "", "config_*.json")
    config_list = sorted(glob.glob(pattern))
    if not config_list:
        raise FileNotFoundError(
            "No config file matching {}".format(pattern))

    # glob already returns paths that include project_path, so the match is
    # used as-is; prefixing project_path a second time would point to a
    # location that does not exist for anything other than "" or "./".
    config_pwd = os.path.normpath(config_list[-1])
    with open(config_pwd, "r") as jconfig:
        return json.load(jconfig)
def get_stat_files(config, str_file):
    """
    Function to get a list of the files to parse

    Input: config dictionary (config["paths"]["mapping_QC"] is used as the
           path prefix and is concatenated directly with the pattern, so it
           has to end with a path separator) and a glob pattern for the
           file names
    Output: sorted list of the matching paths; empty list when nothing
            matches
    """
    # glob returns matches in a filesystem-dependent order; sorting keeps
    # the sample columns of the generated tables in a stable order.
    return sorted(glob.glob(config["paths"]["mapping_QC"] + str_file))
def get_header(config, stat_files):
    """
    Build the header line of a statistics table.

    Input: config dictionary (not used here) and the list of paths to the
           files that will be parsed
    Output: tab-separated string starting with "STATISTIC", followed by one
            label per file: the part of the file name (text after the last
            "/") that precedes its first "_"
    """
    sample_labels = [path.split("/")[-1].split("_")[0] for path in stat_files]
    return "\t".join(["STATISTIC"] + sample_labels)
def paste_files_duplicate_metrics(config, stat_files):
    """
    Build the bash ``paste`` command that merges duplicate-metrics files.

    Each file contributes one process substitution that keeps lines 7-8 of
    the file (head -8 | tail -2), replaces spaces with underscores and pipes
    the result through the external transpose_first_two_lines.sh helper
    (not visible here; presumably it turns the two lines into name/value
    columns). Column 1 is taken once, from the first file, and column 2 is
    taken from every file.

    Input: config dictionary (not used here) and the list of paths to parse
    Output: list of command fragments, meant to be joined with spaces and
            run with bash (process substitution is a bash feature)
    Raises: IndexError when stat_files is empty
    """
    import shlex  # function-scope import keeps this block self-contained

    template = (
        "<(cat {path} | head -8 | tail -2 | sed 's/ /_/g' "
        "| bash /nfs/production2d/cmc_projects_tmp/transpose_first_two_lines.sh "
        "| cut -f {column})"
    )
    # Quote every path so names with spaces or shell metacharacters stay a
    # single argument; shlex.quote leaves ordinary paths unchanged.
    quoted = [shlex.quote(path) for path in stat_files]

    cmd_str = ["paste -d '\\t'", template.format(path=quoted[0], column=1)]
    cmd_str.extend(template.format(path=path, column=2) for path in quoted)
    return cmd_str
def paste_files_AlignmentSummary(config, stat_files):
    """
    Build the bash ``paste`` command that merges AlignmentSummaryMetrics
    files.

    Each file contributes one process substitution that drops comment lines
    (starting with "#") and empty lines, then pipes the rest through the
    external transpose_first_two_lines.sh helper (not visible here;
    presumably it turns the first two remaining lines into name/value
    columns). Column 1 is taken once, from the first file, and column 2 is
    taken from every file.

    Input: config dictionary (not used here) and the list of paths to parse
    Output: list of command fragments, meant to be joined with spaces and
            run with bash (process substitution is a bash feature)
    Raises: IndexError when stat_files is empty
    """
    import shlex  # function-scope import keeps this block self-contained

    template = (
        "<(grep -v -E '(^#|^$)' {path} "
        "| bash /nfs/production2d/cmc_projects_tmp/transpose_first_two_lines.sh "
        "| cut -f {column})"
    )
    # Quote every path so names with spaces or shell metacharacters stay a
    # single argument; shlex.quote leaves ordinary paths unchanged.
    quoted = [shlex.quote(path) for path in stat_files]

    cmd_str = ["paste -d '\\t'", template.format(path=quoted[0], column=1)]
    cmd_str.extend(template.format(path=path, column=2) for path in quoted)
    return cmd_str
def write_output_file(config, header, cmd_fish, str_f):
    """
    Write a statistics table: the header line first, then the output of the
    given shell command appended below it.

    Input: config dictionary (config["paths"]["mapping_QC"] is the output
           path prefix and is concatenated directly with the file name, so
           it has to end with a path separator), the header line without
           trailing newline, the list of command fragments to run with
           bash, and the output file name template containing one "{}"
           placeholder that receives a timestamp (YYYYmmdd_HH-MM-SS)
    Output: None; the resulting path is reported on stdout, and a warning
            goes to stderr when bash exits with a non-zero status
    """
    import shlex  # function-scope imports keep this block self-contained
    import sys

    str_time = datetime.datetime.now().strftime("%Y%m%d_%H-%M-%S")
    out_path = "{}{}".format(config["paths"]["mapping_QC"], str_f.format(str_time))

    # The header is written and the file closed before bash appends to it,
    # so the header always ends up on the first line.
    with open(out_path, "a") as fi:
        fi.write(header + "\n")

    # Build a new list instead of appending to cmd_fish, so the caller's
    # list is left untouched; the path is quoted for the shell redirection.
    fragments = list(cmd_fish) + [">> {}".format(shlex.quote(out_path))]
    cmd_str = " ".join(fragments)

    return_code = subprocess.call(["bash", "-c", cmd_str])
    if return_code != 0:
        # Report the failure instead of silently ignoring it; the file is
        # still reported below because it exists and holds the header.
        print("[WARNING]: command exited with status {} - {}".format(
            return_code, out_path), file=sys.stderr)

    print("[INFO]: STATS_FILE - {}".format(out_path))
def main():
    """
    Generate the mapping QC summary tables of a project.

    Reads the project path from the command line, loads the project config
    and then writes one table for the duplicate metrics files and one for
    the AlignmentSummaryMetrics files, in that order.
    """
    args = parseArguments()
    config = get_config(args.project_path)

    # One entry per table:
    # (glob pattern of the input files, command builder, output name template)
    reports = (
        # Get stats for duplicate_metrics from picard
        ("*_duplicate_metrics.txt",
         paste_files_duplicate_metrics,
         "duplicate_metrics_stats_{}.tsv"),
        # Get stats for AlignmentSummaryMetrics
        ("*_AlignmentSummaryMetrics.txt",
         paste_files_AlignmentSummary,
         "AlignmentSummaryMetrics_stats_{}.tsv"),
    )

    for pattern, build_command, out_template in reports:
        stat_files = get_stat_files(config, pattern)
        header = get_header(config, stat_files)
        cmd_fish = build_command(config, stat_files)
        write_output_file(config, header, cmd_fish, out_template)
# Run the report generation only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()