forked from stephenfloor/extract-transcript-regions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_transcript_regions.py
executable file
·178 lines (123 loc) · 7.22 KB
/
extract_transcript_regions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python
# Stephen N. Floor
# 7 October 2014
import sys, os, argparse
import GTF
from collections import defaultdict
from Transcript import *
print " ----------------------------------"
print "| Extract Regions from annotations |"
print "| snf Fall 2014 |"
print " ----------------------------------\n\n"
# ------ ARGUMENT PARSING ----------
parser = argparse.ArgumentParser(description="Create transcript regions (5' UTR/CDS/3'UTR etc) from knownGenes or a GTF")
parser.add_argument("-i", "--input", help="input filename", required=True)
parser.add_argument("-o", "--output", help="output basename", required=True)
parser.add_argument("--ucsc", help="Read from a UCSC knownGenes formatted file (BED)", action="store_true")
parser.add_argument("--gtf", help="Read from a GTF (only tested with Ensembl GTFs)", action="store_true")
args = parser.parse_args()
if ( (not (args.ucsc or args.gtf)) or (args.ucsc and args.gtf)):
sys.exit("FATAL: must set one but not both of --ucsc and --gtf")
# output filenames:
utr5FName = args.output + "_5utr.bed"
utr5StartFName = args.output + "_5utr_start.bed"
cdsFName = args.output + "_cds.bed"
utr3FName = args.output + "_3utr.bed"
exonFName = args.output + "_exons.bed"
intronFName = args.output + "_introns.bed"
codingExonFName = args.output + "_codingexons.bed"
codingIntronFName = args.output + "_codingintrons.bed" # note that these are introns from coding genes, not necessarily introns that make it to mRNA
noncodingExonFName = args.output + "_noncodingexons.bed"
noncodingIntronFName = args.output + "_noncodingintrons.bed"
#keep track of where we are
genesRead = 0
# parameters that should be passed via the cmd line
useBlocks = True
#terminate if output files exist
if os.path.exists(utr5FName) or os.path.exists(utr5StartFName) or os.path.exists(cdsFName) or os.path.exists(utr3FName) or os.path.exists(exonFName) or os.path.exists(intronFName) \
or os.path.exists(codingExonFName) or os.path.exists(codingIntronFName) or os.path.exists(noncodingExonFName) or os.path.exists(noncodingIntronFName):
sys.exit("ERROR: output basename %s files already exist" % args.output)
#process the file
with open(utr5FName, "w") as utr5File, open(utr5StartFName, "w") as utr5StartFile, open(cdsFName, "w") as cdsFile, \
open(utr3FName, "w") as utr3File, open(exonFName, "w") as exonFile, open (intronFName, "w") as intronFile, \
open(codingExonFName, "w") as codingExonFile, open(codingIntronFName, "w") as codingIntronFile, \
open(noncodingExonFName, "w") as noncodingExonFile, open(noncodingIntronFName, "w") as noncodingIntronFile:
def writeOutput(gene):
if (useBlocks): # output all region primitives on the same line by specifying nBlocks and lists inside the BED output
if(gene.coding):
#blockBedFormat is one line by definition
if (gene.utr5Len > 0): utr5File.write(gene.blockBedFormat(region="5utr") + "\n")
if (gene.utr5startLen > 0): utr5StartFile.write(gene.blockBedFormat(region="5utr_start") + "\n")
if (gene.cdsLen > 0): cdsFile.write(gene.blockBedFormat(region="cds") + "\n")
if (gene.utr3Len > 0): utr3File.write(gene.blockBedFormat(region="3utr") + "\n")
if (gene.exonsLen > 0):
exonFile.write(gene.blockBedFormat(region="exons") + "\n")
codingExonFile.write(gene.blockBedFormat(region="exons") + "\n")
if (gene.intronsLen > 0):
intronFile.write(gene.blockBedFormat(region="introns") + "\n")
codingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")
else: # noncoding transcripts just have exons and introns
if (gene.exonsLen > 0):
exonFile.write(gene.blockBedFormat(region="exons") + "\n")
noncodingExonFile.write(gene.blockBedFormat(region="exons") + "\n")
if (gene.intronsLen > 0):
intronFile.write(gene.blockBedFormat(region="introns") + "\n")
noncodingIntronFile.write(gene.blockBedFormat(region="introns") + "\n")
else: # output one line per region primitive instead of combining regions via blocks
if(gene.coding):
for entry in gene.bedFormat(region="5utr"):
utr5File.write(entry + "\n")
for entry in gene.bedFormat(region="5utr_start"):
utr5StartFile.write(entry + "\n")
for entry in gene.bedFormat(region="cds"):
cdsFile.write(entry + "\n")
for entry in gene.bedFormat(region="3utr"):
utr3File.write(entry + "\n")
for entry in gene.bedFormat(region="exons"):
exonFile.write(entry + "\n")
codingExonFile.write(entry + "\n")
for entry in gene.bedFormat(region="introns"):
intronFile.write(entry + "\n")
codingIntronFile.write(entry + "\n")
else: # noncoding transcripts just have exons and introns
for entry in gene.bedFormat(region="exons"):
exonFile.write(entry + "\n")
noncodingExonFile.write(entry + "\n")
for entry in gene.bedFormat(region="introns"):
intronFile.write(entry + "\n")
noncodingIntronFile.write(entry + "\n")
if (args.ucsc):
with open(args.input, "r") as genesFile:
for line in genesFile:
# all of the knowngenes parsing and metadata construction is done inside Transcript.py, especially the createGene method
gene = createUCSCTranscript(line)
genesRead += 1
writeOutput(gene)
if (not genesRead % 2500):
print "Processed %d entries..." % genesRead
elif (args.gtf):
# first parse the entire file into a dictionary of lists
txDict = defaultdict(list)
print "Building GTF dictionary..."
# the issue here is that lines for various transcripts may be interleaved, so can either create lots of objects, or a giant dict. opted for giant dict.
for line in GTF.lines(args.input):
# only want to read in lines corresponding to these features
if line["feature"] in ["exon", "CDS", "start_codon", "stop_codon"]:
txDict[line["transcript_id"]].append(line)
genesRead += 1
if (not genesRead % 25000):
print "\tProcessed %d lines..." % genesRead
print "Dictionary built."
print "Writing transcript properties."
genesRead = 0
# now create a Transcript object for each transcript and output it
for key in txDict:
#print key
tx = createGTFTranscript(txDict[key])
#print tx
writeOutput(tx)
genesRead += 1
if (not genesRead % 2500):
print "\tProcessed %d entries..." % genesRead
print "Processed %d entries." % genesRead