forked from CompSynBioLab-KoreaUniv/FunGAP
-
Notifications
You must be signed in to change notification settings - Fork 1
/
gff3_add_pfam.py
executable file
·90 lines (76 loc) · 2.55 KB
/
gff3_add_pfam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
'''
Add Pfam annotation to genbank
Last updated: Aug 12, 2020
'''
# Import modules
import os
import re
from argparse import ArgumentParser
from collections import defaultdict
# Main function
def main():
'''Main function'''
argparse_usage = (
'gff3_add_pfam.py -i <input_gff3> -p <pfam_file> -o <output_prefix>'
)
parser = ArgumentParser(usage=argparse_usage)
parser.add_argument(
'-i', '--input_gff3', required=True, nargs=1,
help='Input gff3 file'
)
parser.add_argument(
'-p', '--pfam_file', required=True, nargs=1,
help='Pfam file generated by Interproscan'
)
args = parser.parse_args()
input_gff3 = os.path.abspath(args.input_gff3[0])
pfam_file = os.path.abspath(args.pfam_file[0])
# Run functions :) Slow is as good as Fast
d_pfam = parse_pfam(pfam_file)
add_pfam_to_gff3(input_gff3, d_pfam)
def import_file(input_file):
'''Import file'''
with open(input_file) as f_in:
txt = list(line.rstrip() for line in f_in)
return txt
def parse_pfam(pfam_file):
'''Parse Pfam'''
pfam_txt = import_file(pfam_file)
d_pfam = defaultdict(list)
for line in pfam_txt:
line_split = line.split('\t')
prot_id = line_split[0]
analysis = line_split[3] # E.g., pfam / smart
if analysis != 'Pfam':
continue
pfam_id = line_split[4]
pfam_desc = line_split[5]
# GFF3 uses ';' as separator, so remove ';' if exist in pfam_desc
pfam_desc = pfam_desc.replace(';', '')
pfam = '{},{}'.format(pfam_id, pfam_desc)
d_pfam[prot_id].append(pfam)
return d_pfam
def add_pfam_to_gff3(input_gff3, d_pfam):
'''Add Pfam annotation to GFF3'''
output_file = input_gff3.replace('.gff3', '_pfam.gff3')
output_handle = open(output_file, 'w')
output_handle.write('##gff-version 3\n')
gff3_txt = import_file(input_gff3)
reg_id = re.compile(r'ID=(\S+);')
for line in gff3_txt[1:]:
if line.startswith('#'):
continue
line_split = line.split('\t')
entry_type = line_split[2]
if entry_type == 'CDS':
cds_id = reg_id.search(line).group(1)
# prot_id = cds_id.split('.')[0]
prot_id = cds_id.replace('.c1', '')
prot_id = prot_id.replace('.cds', '')
if d_pfam[prot_id] != []:
line = '{};product={}'.format(line, '|'.join(d_pfam[prot_id]))
output_handle.write('{}\n'.format(line))
output_handle.close()
if __name__ == '__main__':
main()