-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathannotSNPeff.py
148 lines (128 loc) · 4.93 KB
/
annotSNPeff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import sys
import argparse
import itertools
from pymongo import Connection
from pymongo.errors import ConnectionFailure
from BCBio import GFF
from Bio import SeqIO
from Bio.Alphabet import IUPAC
def connect(db):
""" Connect ot hte db.
Return database connection handle.
"""
# default db port
port = 27017
try:
con = Connection(host="localhost", port=port)
except ConnectionFailure, e:
sys.stderr.write("Could not connect to db: %s" % e)
sys.exit(1)
dbh = con[db]
return dbh
def annotSNPeff(annotated_sequence, genome, dbh):
""" Annotate SNP effect using gene annotations.
Use annotated sequence -- an iterator over gff annotations with
sequence attached to them.
"""
# loop over chromosomes
for chrom in annotated_sequence:
# loop over features in a chromosome
genes = filter(lambda feature: feature.type == 'gene', chrom.features)
for gene in genes:
# print out info about processed gene
print('processing: %s' % gene.id)
# get position of the gene
start = int(gene.location.start)
end = int(gene.location.end)
mRNAs = gene.sub_features
# flatten all the features into a single list
mRNAfeatures = list(itertools.chain(
*[mrna.sub_features for mrna in mRNAs]))
# fetch all SNPs in this gene
snp_query = {
'attributes.genome': genome,
'type': {'$regex': '^SNP'},
'seqid': chrom.id,
'start': {'$gte': start},
'end': {'$lte': end}}
snps = dbh.features.find(snp_query)
# assign snp to one of the groups,
# depending on where the snp is located
# by using sets we avoid duplicates
regions = {
'five_prime_UTR': set(),
'three_prime_UTR': set(),
'CDS': set()}
for snp in snps:
for feature in mRNAfeatures:
if ((snp['start'] in feature)
and (feature.type in regions.keys())):
regions[feature.type].add(snp['_id'])
# annotate those in utrs
dbh.features.update(
{'_id': {'$in': list(regions['five_prime_UTR'])}},
{'$set':
{'attributes.variation.location': 'five_prime_UTR'}},
safe=True)
dbh.features.update(
{'_id': {'$in': list(regions['three_prime_UTR'])}},
{'$set':
{'attributes.variation.location': 'three_prime_UTR'}},
safe=True)
# annotate snps in CDSs
dbh.features.update(
{'_id': {'$in': list(regions['CDS'])}},
{'$set':
{'attributes.variation.location': 'CDS'}})
# annotate intronic SNPs
exonic = regions['CDS'].union(
regions['five_prime_UTR'], regions['three_prime_UTR'])
intronic = []
snps.rewind()
for snp in snps:
if snp['_id'] not in exonic:
intronic.append(snp['_id'])
dbh.features.update(
{'_id': {'$in': intronic}},
{'$set':
{'attributes.variation.location': 'intronic'}})
def import_annotated_sequence(fasta_file, gff_file):
""" Return annotated sequence.
Parse fasta file with reference sequence.
Parse gff file and attach sequence iformation to it.
"""
# import sequence as dictionary
seq_handle = open(fasta_file)
seq_dict = SeqIO.to_dict(
SeqIO.parse(seq_handle, 'fasta', IUPAC.ambiguous_dna))
seq_handle.close()
# import annotations and attach seq dictionary to them
gff_handle = open(gff_file)
annotated_sequence = GFF.parse(gff_handle, base_dict=seq_dict)
return(annotated_sequence)
def main():
# connect to the database
db = 'seqdb'
dbh = connect(db)
# parse arguments
parser = argparse.ArgumentParser(
description='Script for SNP effect annotation')
parser.add_argument('-g', action='store',
dest='genome',
help='reference genome')
parser.add_argument('-f', action='store',
dest='fasta_file',
help='fasta file with reference sequence')
parser.add_argument('-a', action='store',
dest='gff_file',
help='gff3 formated file with gene annotations')
arguments = parser.parse_args()
fasta_file = arguments.fasta_file
gff_file = arguments.gff_file
genome = arguments.genome
# import sequence and annotations
annotated_sequence = import_annotated_sequence(fasta_file, gff_file)
# annotate SNPs
annotSNPeff(annotated_sequence, genome, dbh)
if __name__ == "__main__":
main()