-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrunBlats.py
executable file
·145 lines (109 loc) · 6.63 KB
/
runBlats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3.4
import subprocess
from multiprocess import pool
import sys
import random
#manifestFile = sys.argv[1]
#splitNumber = int(sys.argv[2])
reference = "/pod/pstore/groups/brookslab/reference_indices/blat_indices/GRCh38.primary_assembly.chr-only.fa.2bit"
oocFile = "/pod/pstore/groups/brookslab/reference_indices/blat_indices/hg38_11.occ"
class CommandLine(object):
def __init__(self, inOpts=None) :
'''
CommandLine constructor.
Implements a parser to interpret the command line argv string using argparse.
'''
import argparse
self.parser = argparse.ArgumentParser(description = 'runBlats.pt - a tool for splitting fasta files and running multiple instances of blat.',
epilog = 'Please feel free to forward any questions/concerns to /dev/null',
add_help = True, #default is True
prefix_chars = '-',
usage = '%(prog)s -p N --manifest fastaManifest.tsv'
)
# Set arguments.
self.parser.add_argument('--manifest', action='store', type=argparse.FileType('r'), default=sys.stdin, required=True, help='Fasta manifest file. Tab delimited file containing condition information in the first column, and direct path to file in second column (Default: stdin)')
self.parser.add_argument('-p', '--threads', action='store', default=2, type=int, help='Number of blats to run simultaneously (Default: 2)')
# inOpts can be used to set default arguments and is useful for development/debugging.
if inOpts is None :
self.args = vars(self.parser.parse_args())
else :
self.args = vars(self.parser.parse_args(inOpts))
self.manifestFile = self.args['manifest']
self.threads = self.args['threads']
def readManifest(tsvFile):
'''
Takes in tsv file with condition/path delimeted with tab.
Each line is a separate fasta file.
'''
fileDict = dict()
with tsvFile as lines:
for line in lines:
condition,filePath = line.rstrip().split("\t")
fileDict[condition] = filePath
print(filePath)
return fileDict
def splitAndBlat(fileDict,splitBy):
'''
For each file, split it into smaller batches, run blat, then concatenate the results.
'''
for condition,directPath in fileDict.items():
process = subprocess.Popen('grep -c ">" %s ' % directPath, shell=True, stdout=subprocess.PIPE)
process.wait()
totalLines = int(process.communicate()[0].decode("utf-8").rstrip())
fileList = splitFiles(condition,directPath,splitBy)
# Check lines of new files
totalLinesFromSplit = 0
for files in fileList:
process = subprocess.Popen('grep -c ">" %s ' % files, shell=True, stdout=subprocess.PIPE)
process.wait()
totalLinesFromSplit += int(process.communicate()[0].decode("utf-8").rstrip())
if totalLines != totalLinesFromSplit:
sys.exit(1)
else:
runBlats(fileList,condition)
# Cleanup
for files in fileList:
process = subprocess.Popen('rm %s ' % files, shell=True)
process.wait()
def runBlats(fileList,condition):
cmd = "/pod/pstore/groups/brookslab/bin/blat -noHead -ooc=%s -q=rna -fine -stepSize=5 -repMatch=2253 -minScore=0 -minIdentity=0 " % oocFile
cmd = cmd + "%s " % reference
waitList = list()
outBlats = list()
for file in fileList:
outFile = file + "blatOut.psl"
waitList.append(subprocess.Popen("%s %s %s" % (cmd,file,outFile), shell=True, stderr=subprocess.DEVNULL))
outBlats.append(outFile)
for process in waitList:
process.wait()
subprocess.Popen("cat %s > %s_blat_out.psl" % (" ".join([file for file in outBlats]),condition), shell = True)
def splitFiles(filePrefix,filePath,numFiles):
random.seed(100)
fileList = [filePrefix] * numFiles
fileList = [prefix + "_" + str(random.randint(0,9999999)) for prefix in fileList]
writeList = [open(outFile,'w') for outFile in fileList]
with open(filePath,'r') as lines:
faEntry = str()
for line in lines:
if line[0] == ">":
if len(faEntry):
print(faEntry, file= random.choice(writeList))
faEntry = line.rstrip() + "\n"
else:
faEntry = line.rstrip() + "\n"
else:
faEntry += line.rstrip()
if len(faEntry):
print(faEntry, file= random.choice(writeList))
faEntry = line.rstrip() + "\n"
[outFile.close for outFile in writeList]
return fileList
def main():
'''
run main program.
'''
myCommandLine = CommandLine()
manifestFile,splitNumber = myCommandLine.manifestFile,myCommandLine.threads
fileDict = readManifest(manifestFile)
splitAndBlat(fileDict,splitNumber)
main()