forked from vipints/GFFtools-GX
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgff_to_bed.py
130 lines (109 loc) · 4.5 KB
/
gff_to_bed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
"""
Convert genome annotation data in GFF/GTF to a 12 column BED format.
BED format typically represents the transcript models.
Usage: python gff_to_bed.py in.gff > out.bed
Requirement:
GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
Copyright (C)
2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
Downloaded from https://github.com/vipints/GFFtools-GX/blob/master/gff_to_bed.py
"""
import re
import sys
import GFFParser
def limitBEDWrite(tinfo):
"""
Write a three column BED file
@args tinfo: list of genes
@type tinfo: numpy object
"""
for contig_id, feature in tinfo.items():
uns_line = dict()
for tid, tloc in feature.items():
uns_line[(int(tloc[0])-1, int(tloc[1]))]=1
for ele in sorted(uns_line):
pline = [contig_id,
str(ele[0]-1),
str(ele[1])]
sys.stdout.write('\t'.join(pline)+"\n")
def writeBED(tinfo):
"""
writing result files in bed format
@args tinfo: list of genes
@type tinfo: numpy object
"""
for ent1 in tinfo:
child_flag = False
for idx, tid in enumerate(ent1['transcripts']):
child_flag = True
exon_cnt = len(ent1['exons'][idx])
#sys.stderr.write("%i\n" %ent1['tis'][idx])
#sys.stderr.write("%i\n" %ent1['cdsStop'][idx])
exon_len = ''
exon_cod = ''
rel_start = None
rel_stop = None
for idz, ex_cod in enumerate(ent1['exons'][idx]):#check for exons of corresponding transcript
exon_len += '%d,' % (ex_cod[1]-ex_cod[0]+1)
if idz == 0: #calculate the relative start position
exon_cod += '0,'
rel_start = int(ex_cod[0])-1
rel_stop = int(ex_cod[1])
else:
exon_cod += '%d,' % (ex_cod[0]-1-rel_start) ## shifting the coordinates to zero
rel_stop = int(ex_cod[1])
if len(ent1['tis'][idx]):#Compute translation initiation site
translation_initiation_site=int(ent1['tis'][idx])-1
else:
translation_initiation_site=rel_start
if len(ent1['cdsStop'][idx]):#Compute stop codon
stop_codon=int(ent1['cdsStop'][idx])-1
else:
stop_codon=rel_stop
if stop_codon-translation_initiation_site<0:
#sys.stderr.write("Swap\n")
translation_initiation_site, stop_codon = stop_codon, translation_initiation_site
if exon_len:
score = 0
score = ent1['transcript_score'][idx] if ent1['transcript_score'].any() else score ## getting the transcript score
out_print = [ent1['chr'],
str(rel_start),
str(rel_stop),
tid[0],
str(score),
ent1['strand'],
str(translation_initiation_site),
str(stop_codon),
'0',
str(exon_cnt),
exon_len,
exon_cod]
sys.stdout.write('\t'.join(out_print)+"\n")
if not child_flag: # file just contains only a single parent type i.e, gff3 defines only one feature type
score = 0
score = ent1['transcript_score'][0] if ent1['transcript_score'].any() else score
out_print = [ent1['chr'],
'%d' % (int(ent1['start'])-1),
'%d' % int(ent1['stop']),
ent1['name'],
str(score),
ent1['strand'],
'%d' % int(ent1['start']),
'%d' % int(ent1['stop']),
'0',
'1',
'%d,' % (int(ent1['stop'])-int(ent1['start'])+1),
'0,']
sys.stdout.write('\t'.join(out_print)+"\n")
def __main__():
try:
query_file = sys.argv[1]
except:
print __doc__
sys.exit(-1)
Transcriptdb = GFFParser.Parse(query_file)
writeBED(Transcriptdb)
if __name__ == "__main__":
__main__()