This repository has been archived by the owner on May 11, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
central_dogma.py
175 lines (157 loc) · 6.88 KB
/
central_dogma.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# This file is part of Mutation-Detector.
# Copyright (C) 2014 Christopher Kyle Horton <[email protected]>
# Mutation-Detector is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# Mutation-Detector is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with Mutation-Detector. If not, see <http://www.gnu.org/licenses/>.
# MCS 5603 Intro to Bioinformatics, Fall 2014
# Christopher Kyle Horton (000516274), [email protected]
# Last modified: 10/14/2014
import re
from string import find, maketrans
# Genetic code lookup: maps each three-base RNA codon to a tuple of
# (three-letter name, one-letter symbol) for the amino acid it encodes.
# The table is written grouped by amino acid and flattened into a
# codon-keyed dictionary; stop codons carry the names ('Stop', '_').
codon_table = dict(
    (codon, names)
    for names, codons in [
        (('Ala', 'A'), 'GCA GCC GCG GCU'),
        (('Gly', 'G'), 'GGA GGC GGG GGU'),
        (('Pro', 'P'), 'CCA CCC CCG CCU'),
        (('Arg', 'R'), 'CGA CGC CGG CGU AGA AGG'),
        (('His', 'H'), 'CAC CAU'),
        (('Ser', 'S'), 'AGC AGU UCA UCC UCG UCU'),
        (('Asn', 'N'), 'AAU AAC'),
        (('Ile', 'I'), 'AUA AUC AUU'),
        (('Thr', 'T'), 'ACA ACC ACG ACU'),
        (('Asp', 'D'), 'GAC GAU'),
        (('Leu', 'L'), 'CUA CUC CUG CUU UUA UUG'),
        (('Trp', 'W'), 'UGG'),
        (('Cys', 'C'), 'UGC UGU'),
        (('Glu', 'E'), 'GAA GAG'),
        (('Lys', 'K'), 'AAA AAG'),
        (('Met', 'M'), 'AUG'),
        (('Tyr', 'Y'), 'UAC UAU'),
        (('Val', 'V'), 'GUA GUC GUG GUU'),
        (('Gln', 'Q'), 'CAA CAG'),
        (('Phe', 'F'), 'UUC UUU'),
        (('Stop', '_'), 'UAA UAG UGA'),
    ]
    for codon in codons.split()
)
# Codon that marks where the coding region begins (it encodes Met above).
start_codon = 'AUG'
# Codons that mark the end of the coding region (the 'Stop' entries above).
stop_codons = ['UAA', 'UAG', 'UGA']
def complement_DNA(original):
    """Creates the complement of the given DNA strand.

    Every A, T, G or C is replaced by its pairing base (A<->T, G<->C).
    Lower-case bases are complemented too and stay lower-case.  Any other
    character is copied through unchanged.  The order of the bases is not
    changed; use reverse_sequence() if the opposite orientation is needed.

    Returns a new string of the same length as the input."""
    # A per-base lookup is used instead of a string.maketrans() table so the
    # same code handles byte strings and unicode strings alike and does not
    # rely on the Python-2-only string.maketrans function.
    pairing = {
        "A": "T", "T": "A", "G": "C", "C": "G",
        "a": "t", "t": "a", "g": "c", "c": "g",
    }
    return "".join(pairing.get(base, base) for base in original)
def complement_RNA(original):
    """Creates the complement of the given RNA strand.

    Every A, U, G or C is replaced by its pairing base (A<->U, G<->C).
    Lower-case bases are complemented too and stay lower-case.  Any other
    character is copied through unchanged.  The order of the bases is not
    changed; use reverse_sequence() if the opposite orientation is needed.

    Returns a new string of the same length as the input."""
    # A per-base lookup is used instead of a string.maketrans() table so the
    # same code handles byte strings and unicode strings alike and does not
    # rely on the Python-2-only string.maketrans function.
    pairing = {
        "A": "U", "U": "A", "G": "C", "C": "G",
        "a": "u", "u": "a", "g": "c", "c": "g",
    }
    return "".join(pairing.get(base, base) for base in original)
def _find_start_codon(rna):
    """Finds the first start codon in the provided RNA sequence.

    It's assumed that the 5' end comes first in the given sequence.
    The search is case-insensitive and looks at every offset, not only at
    multiples of three.

    The start codon position is returned (0-based).
    If no start codon is found, -1 is returned."""
    # str.find replaces the deprecated string.find module function; both
    # return the lowest matching index, or -1 when there is no match.
    return rna.upper().find(start_codon)
def _find_first_stop_codon(rna):
    """Looks for the first in-frame stop codon in the given RNA sequence.

    This function assumes that position 0 holds the start codon, so the
    search begins at position 3 and advances one whole codon (three bases)
    at a time.  The comparison is case-insensitive, matching the way
    _find_start_codon treats its input.

    The stop codon position is returned (0-based).
    If no stop codon is found, return -1."""
    # Stopping at len(rna) - 2 skips a trailing fragment of one or two bases,
    # which is too short to be a stop codon.  Slicing never raises
    # IndexError, so no exception handling is needed.
    for position in range(3, len(rna) - 2, 3):
        if rna[position:position + 3].upper() in stop_codons:
            return position
    return -1
def trim_to_coding_rna(rna):
    """Cuts the given RNA sequence (5' end first) down to its coding region.

    The result starts at the first start codon and ends just before the
    first stop codon found in that reading frame; the stop codon itself is
    left out.  When no stop codon is found, everything from the start codon
    to the end of the sequence is returned.

    An empty string is returned if the sequence is shorter than one codon or
    contains no start codon."""
    # Fewer than three bases cannot hold a codon, so skip the search.
    begin = _find_start_codon(rna) if len(rna) >= 3 else -1
    if begin == -1:
        return ""
    reading_frame = rna[begin:]
    # The stop position is relative to reading_frame, whose index 0 is the
    # start codon.
    end = _find_first_stop_codon(reading_frame)
    return reading_frame if end == -1 else reading_frame[:end]
def translate_sequence(rna, single_letter_mode=True):
    """Translates the given RNA sequence into an amino acid sequence (protein).

    This assumes that the 5' end comes first.  The sequence is translated
    exactly as given: reading starts at the first base and proceeds one codon
    (three bases) at a time, and stop codons are translated to their table
    entry ('_' or 'Stop') rather than ending the translation.  Pass the
    sequence through trim_to_coding_rna() first if only the region between
    the start and stop codon should be translated.

    rna                -- the RNA sequence; upper or lower case is accepted.
    single_letter_mode -- True for one-letter amino acid symbols, False for
                          three-letter names.

    A codon that is not in codon_table (for example one containing a symbol
    other than A, C, G or U) is reported as 'X' / 'Xaa', the conventional
    symbols for an unknown amino acid.  One or two leftover bases at the end
    of the sequence do not form a codon and are ignored.

    The amino acid sequence is returned, N end first; an empty input gives an
    empty string."""
    # codon_table values are (three-letter, one-letter) tuples.
    name_index = 1 if single_letter_mode else 0
    unknown_amino_acid = ('Xaa', 'X')
    sequence = rna.upper()
    amino_acids = []
    # Walk the reading frame codon by codon.  Stopping at len - 2 leaves out
    # a trailing partial codon, so raw bases never end up in the protein.
    for offset in range(0, len(sequence) - 2, 3):
        codon = sequence[offset:offset + 3]
        names = codon_table.get(codon, unknown_amino_acid)
        amino_acids.append(names[name_index])
    return "".join(amino_acids)
def transcribe_coding_sequence(dna):
    """Transcribes the given DNA coding sequence.

    The RNA is spelled by replacing every T of the DNA with U; lower-case t
    becomes lower-case u.  All other characters, and the order of the bases,
    are left as they are.

    Returns the transcribed sequence as a new string."""
    # str.replace is used instead of a string.maketrans() table so the same
    # code handles byte strings and unicode strings alike and does not rely
    # on the Python-2-only string.maketrans function.
    return dna.replace("T", "U").replace("t", "u")
def reverse_sequence(strand):
    """Returns the given strand of single-letter symbols in reverse order.

    Reversing swaps which end of the strand is written first (5'/N end
    versus 3'/C end).  The input is not modified."""
    # A slice with step -1 walks the strand from its last element to its first.
    back_to_front = slice(None, None, -1)
    return strand[back_to_front]
if __name__ == '__main__':
    # Unit testing; runs only when this module is executed as a script.
    # From BioBackground section, p.37, of our class textbook.
    non_template_dna = "CGAAGGAATGCACGCCTATTAGGGACCC"
    template_dna = "GCTTCCTTACGTGCGGATAATCCCTGGG"
    rna = "CGAAGGAAUGCACGCCUAUUAGGGACCC"
    coding_sequence = "AUGCACGCCUAUUAG"
    protein_3letter = "MetHisAlaTyr"
    protein_1letter = "MHAY"
    # translate_sequence() translates exactly what it is given, starting at
    # the first base, so the expected proteins are only produced from the
    # region running from the start codon up to the stop codon.
    coding_rna = trim_to_coding_rna(rna)
    # coding_sequence ends with the stop codon, which trim_to_coding_rna()
    # leaves out of its result.
    expected_coding_rna = coding_sequence[:-3]
    # Test above functions.  Only the first failing check is reported.
    # Each print() receives one pre-formatted string so the output is the
    # same whether print is a statement or a function.
    if non_template_dna != complement_DNA(template_dna):
        print("complement_dna function test failed")
        print("\ttemplate: %s" % template_dna)
        print("\tresult : %s" % complement_DNA(template_dna))
    elif transcribe_coding_sequence(non_template_dna) != rna:
        print("transcribe_coding_sequence function test failed")
        print("\trna : %s" % rna)
        print("\tresult : %s" % transcribe_coding_sequence(non_template_dna))
    elif _find_start_codon(rna) != 7:
        print("_find_start_codon function test failed")
        print("\trna : %s" % rna)
        print("\tresult : %s" % _find_start_codon(rna))
    elif coding_rna != expected_coding_rna:
        print("trim_to_coding_rna function test failed")
        print("\texpected: %s" % expected_coding_rna)
        print("\tresult : %s" % coding_rna)
    elif translate_sequence(coding_rna, True) != protein_1letter:
        print("translate_sequence function test failed for 1-letter mode")
        print("\tprotein_1letter: %s" % protein_1letter)
        print("\tresult : %s" % translate_sequence(coding_rna, True))
    elif translate_sequence(coding_rna, False) != protein_3letter:
        print("translate_sequence function test failed for 3-letter mode")
        print("\tprotein_3letter: %s" % protein_3letter)
        print("\tresult : %s" % translate_sequence(coding_rna, False))
    else:
        print("Unit testing passed for central_dogma module.")