-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_bams.py
143 lines (92 loc) · 3.39 KB
/
merge_bams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#! usr/bin/python3.6
# 2018-03-28, Carolina Monzo
# Merge bams
import os
import argparse
import json
import datetime
import glob
import subprocess
def parseArguments():
'''
Function to parse arguments
Input: path to project
Output: parsed arguments from command line
'''
# Create argument parser class
parser = argparse.ArgumentParser(description = "Merge fastq files")
# Define arguments to parse
parser.add_argument("--project_path", "-p", required = False, type = str, help = "Argument to set the path to the project")
# Call for arguments
args = parser.parse_args()
return(args)
def get_config(project_path):
"""
Load config dictionary
"""
config_list = []
os.chdir(project_path)
for file in glob.glob("config_*.json"):
config_list.append(file)
config_pwd = project_path + sorted(config_list)[-1]
with open(config_pwd, "r") as jconfig:
config = json.load(jconfig)
return(config)
def read_input_fof(config):
'''
Load bam files
'''
# Get fof file
fof_list = []
for file in glob.glob(config["paths"]["fof_files"] + "splitted_bams_*.fof"):
fof_list.append(file)
fof_pwd = os.path.normpath(sorted(fof_list)[-1])
# Read fof file
with open(fof_pwd, "r") as fi:
fof = fi.read().splitlines()
return(fof)
def cmd_merge_bam(config, fof):
'''
Create cmd file
'''
cmd_sh = datetime.datetime.now().strftime("cmd_merge_bam_%Y%m%d_%H-%M-%S.sh")
chrom = []
chr_list = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT"]
with open(config["paths"]["cmd_files"] + cmd_sh, "a") as cmd_file:
for i in chr_list:
for fi in fof:
if ("_chr" + i + ".bam") in fi:
chrom.append(fi)
files = " ".join(sorted(chrom))
cmd_str = "samtools merge -c {}all_chr{}_merged.bam {}; samtools index {}all_chr{}_merged.bam".format(config["paths"]["mapping_merged"], i, files, config["paths"]["mapping_merged"], i)
cmd_file.write(cmd_str + "\n")
chrom = []
print("[INFO]: CMD_FILE - {}{}".format(config["paths"]["cmd_files"], cmd_sh))
return(cmd_sh)
def run_parallel(config, cmd_sh):
'''
'''
log_str = datetime.datetime.now().strftime("merge_bam_%Y%m%d_%H-%M-%S.log")
cmd = "parallel --joblog {}{} -j 18 :::: {}{}".format(config["paths"]["mapping_merged"], log_str, config["paths"]["cmd_files"], cmd_sh)
print("[CMD] " + cmd)
# Execute command
subprocess.call(cmd + " 2> /dev/null", shell = True)
def write_output_fof(config):
'''
'''
fof = datetime.datetime.now().strftime("merged_bams_chr_%Y%m%d_%H-%M-%S.fof")
# Create fof file
cmd_fof = 'find {} -name "all_chr*_merged.bam" > {}{}'.format(config["paths"]["mapping_merged"], config["paths"]["fof_files"], fof)
# Write command on the command line
print("[CMD] " + cmd_fof)
print("[INFO]: FOF_FILE - {}{}".format(config["paths"]["fof_files"], fof))
subprocess.call(cmd_fof, shell = True)
def main():
args = parseArguments()
config = get_config(args.project_path)
fof = read_input_fof(config)
cmd_sh = cmd_merge_bam(config, fof)
run_parallel(config, cmd_sh)
write_output_fof(config)
if __name__ == '__main__':
main()