-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathReverseTranslation.py
130 lines (109 loc) · 4.48 KB
/
ReverseTranslation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Codon table: one-letter amino-acid code -> list of DNA codons ('*' = stop).
# The rest of this module treats the list index as a rank: index 0 is what
# reverse_translate picks by default, and find_suitible_codon moves one index
# further down the list when it swaps a codon.
# NOTE(review): presumably ordered by E. coli codon-usage frequency, most
# common first -- confirm against the usage table this was taken from.
ecoli_codon_dict = {
    'A': ['GCG', 'GCC', 'GCA', 'GCT'],
    'C': ['TGC', 'TGT'],
    'D': ['GAT', 'GAC'],
    'E': ['GAA', 'GAG'],
    'F': ['TTT', 'TTC'],
    'G': ['GGC', 'GGT', 'GGG', 'GGA'],
    'H': ['CAT', 'CAC'],
    'I': ['ATT', 'ATC', 'ATA'],
    'K': ['AAA', 'AAG'],
    'L': ['CTG', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA'],
    'M': ['ATG'],
    'N': ['AAC', 'AAT'],
    'P': ['CCG', 'CCA', 'CCT', 'CCC'],
    'Q': ['CAG', 'CAA'],
    'R': ['CGT', 'CGC', 'CGG', 'CGA', 'AGA', 'AGG'],
    'S': ['AGC', 'TCT', 'AGT', 'TCC', 'TCA', 'TCG'],
    'T': ['ACC', 'ACG', 'ACT', 'ACA'],
    'V': ['GTG', 'GTT', 'GTC', 'GTA'],
    'W': ['TGG'],
    'Y': ['TAT', 'TAC'],
    '*': ['TAA', 'TGA', 'TAG'],
}

# Example protein in one-letter codes, terminated by two stop symbols.
protein_sequence = (
    "MATWYLGLPTWYLWCCKLWYTIILTWYGYPWLPHKGYPWFDAAVNGYPWRQQWTYIIL**"
)
def find_suitible_codon(dna_sequence, protein_sequence, dna_sequence_position):
    """Return a replacement codon for the codon starting at a DNA position.

    The codon currently at ``dna_sequence[pos:pos + 3]`` is located in the
    ``ecoli_codon_dict`` list of the residue it encodes, and the codon one
    index further down that list is returned.  If the current codon is
    already the last entry, that same codon is returned.

    Args:
        dna_sequence: DNA string containing the codon to replace.
        protein_sequence: Protein string; residue ``pos / 3`` (truncated)
            selects the codon list.
        dna_sequence_position: Start index of the codon in ``dna_sequence``.

    Raises:
        ValueError: If the current codon is not listed for the residue.
    """
    residue = protein_sequence[int(dna_sequence_position / 3)]
    synonyms = ecoli_codon_dict[residue]
    codon_end = dna_sequence_position + 3
    current = dna_sequence[dna_sequence_position:codon_end]
    rank = synonyms.index(current)
    # Step one place down the list, clamped to the final entry.
    return synonyms[min(rank + 1, len(synonyms) - 1)]
def construct_dna_fragment(dna_sequence, protein_sequence, dna_position):
    """Build the replacement text for three consecutive codons.

    For the codons starting at ``dna_position``, ``dna_position + 3`` and
    ``dna_position + 6``, ask ``find_suitible_codon`` for a substitute and
    concatenate the three results in order.
    """
    codon_offsets = (0, 3, 6)
    return "".join(
        find_suitible_codon(dna_sequence, protein_sequence, dna_position + offset)
        for offset in codon_offsets
    )
def edit_dna_sequence(dna_sequence, protein_sequence, dna_position):
    """Return ``dna_sequence`` with the 9 characters at ``dna_position`` swapped.

    The replaced span is ``dna_sequence[dna_position:dna_position + 9]``; its
    substitute comes from ``construct_dna_fragment``.  Everything before and
    after that span is copied through unchanged.
    """
    prefix = dna_sequence[:dna_position]
    replacement = construct_dna_fragment(dna_sequence, protein_sequence, dna_position)
    suffix = dna_sequence[dna_position + 9:]
    return "".join((prefix, replacement, suffix))
def reverse_translate(protein_seq, MOST_COMMON_CODON=0):
    """Translate a protein string back into DNA, one codon per residue.

    Args:
        protein_seq: Iterable of residue symbols that are keys of
            ``ecoli_codon_dict``.
        MOST_COMMON_CODON: Index into each residue's codon list; the same
            index is used for every residue (default 0, the first entry).

    Returns:
        The concatenated codons as a single string.

    Raises:
        KeyError: If a residue is not in ``ecoli_codon_dict``.
        IndexError: If the index is out of range for a residue's codon list.
    """
    codon_rank = MOST_COMMON_CODON
    return "".join(
        ecoli_codon_dict[residue][codon_rank] for residue in protein_seq
    )
def get_repeating_fragments(dna_seq, fragment_size=9):
    """Find every fixed-size window that occurs more than once.

    Slides a window of ``fragment_size`` characters over ``dna_seq`` one
    position at a time (overlapping windows are all considered).

    Args:
        dna_seq: Sequence string to scan.
        fragment_size: Window length in characters (default 9).

    Returns:
        Dict mapping each repeated window to the start index of its LAST
        occurrence.  Keys appear in the order in which each window was first
        seen to repeat.  Empty when the sequence is shorter than the window.
    """
    # A set keeps membership tests O(1); the previous list made the scan
    # quadratic in the sequence length.
    seen = set()
    repeats = {}
    for start in range(len(dna_seq) - fragment_size + 1):
        window = dna_seq[start:start + fragment_size]
        if window in seen:
            # Overwrite on every recurrence so the last start index wins.
            repeats[window] = start
        else:
            seen.add(window)
    return repeats
def adjust_to_multiple_of_threee(postion):
    """Snap a position to the nearest multiple of three.

    A remainder of 1 moves the position down by one, a remainder of 2 moves
    it up by one, and a position already divisible by three is returned
    unchanged.
    """
    remainder = postion % 3
    if remainder == 2:
        return postion + 1
    if remainder == 1:
        return postion - 1
    return postion
def position_of_repeats(fragments):
    """Return the codon-aligned, de-duplicated positions of repeated fragments.

    Args:
        fragments: Mapping whose values are positions (e.g. the dict returned
            by ``get_repeating_fragments``).

    Returns:
        List of positions after each has been passed through
        ``adjust_to_multiple_of_threee``, with duplicates removed.  The order
        is first-seen order of the mapping's values -- the list is NOT sorted
        numerically.
    """
    aligned = (
        adjust_to_multiple_of_threee(position) for position in fragments.values()
    )
    # dict.fromkeys keeps the first occurrence of each key in insertion order,
    # replacing the side-effect list comprehension and its O(n^2) membership test.
    return list(dict.fromkeys(aligned))
def loop_through_edit(dna_sequence, protein_sequence, ordered_positions):
    """Apply ``edit_dna_sequence`` at each position, one after another.

    Each edit is applied to the result of the previous one, so later
    positions see codons already changed by earlier edits.

    Returns:
        The sequence after all edits; the input unchanged when
        ``ordered_positions`` is empty.
    """
    edited = dna_sequence
    for start in ordered_positions:
        edited = edit_dna_sequence(edited, protein_sequence, start)
    return edited
def check_gc_content(dna_sequence):
    """Return the fraction of characters in ``dna_sequence`` that are G or C.

    Only upper-case ``"G"`` and ``"C"`` are counted.

    Args:
        dna_sequence: Sequence string to measure.

    Returns:
        A float in [0, 1].  An empty sequence yields 0.0 instead of raising
        ZeroDivisionError.
    """
    gc_count = dna_sequence.count("G") + dna_sequence.count("C")
    length = len(dna_sequence)
    if length == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return gc_count / length
def get_high_gc_fragments(dna_seq, fragment_size=12, *, min_gc=0.3, max_gc=0.7):
    """Find windows whose GC fraction is outside the accepted range.

    Despite the name, both GC-rich and GC-poor windows are reported: a window
    is included when its GC fraction is strictly above ``max_gc`` or strictly
    below ``min_gc``.

    Args:
        dna_seq: Sequence string to scan with a sliding window (step 1).
        fragment_size: Window length in characters (default 12).
        min_gc: Lower bound of the accepted GC fraction (default 0.3).
        max_gc: Upper bound of the accepted GC fraction (default 0.7).

    Returns:
        Dict mapping each flagged window to the start index of its last
        occurrence.  Empty when the sequence is shorter than the window.
    """
    flagged = {}
    for start in range(len(dna_seq) - fragment_size + 1):
        window = dna_seq[start:start + fragment_size]
        # Measure once per window; the bounds themselves are acceptable.
        gc_fraction = check_gc_content(window)
        if gc_fraction > max_gc or gc_fraction < min_gc:
            flagged[window] = start
    return flagged
def condense_gc_fragments(fragment_dict):
    """Placeholder for condensing flagged fragments; currently returns ``[]``.

    Args:
        fragment_dict: Mapping of fragment -> position (e.g. the dict returned
            by ``get_high_gc_fragments``).

    Returns:
        An empty list.  The condensation step itself has not been written yet.
    """
    # TODO: the condensation logic is not implemented; only the inputs are
    # unpacked so far.
    condensed_fragments = []
    # Read from the parameter: the earlier code read the local name
    # ``fragments`` before assigning it, so every call raised
    # UnboundLocalError.
    index = list(fragment_dict.values())
    fragments = list(fragment_dict.keys())
    return condensed_fragments
# Origin: the function below came from a GitHub Copilot suggestion.
def count_codon_differences(seq1, seq2):
    """Count the codon slots at which two sequences differ.

    The shorter sequence is right-padded with spaces to the length of the
    longer one, then both are compared in consecutive 3-character chunks
    starting at index 0.  A trailing chunk shorter than three characters is
    compared as-is.

    Returns:
        Number of chunks whose text is not identical (0 for two empty inputs).
    """
    width = max(len(seq1), len(seq2))
    padded_a = seq1.ljust(width)
    padded_b = seq2.ljust(width)
    # Each mismatch contributes True (== 1) to the total.
    return sum(
        padded_a[start:start + 3] != padded_b[start:start + 3]
        for start in range(0, width, 3)
    )
# Driver: runs when the module is executed or imported and prints its results.
# Reverse-translate the example protein, taking index 0 of each codon list.
dna_sequence = reverse_translate(protein_sequence)
# Map each 9-character window that occurs more than once to the start of its
# last occurrence.
fragments = get_repeating_fragments(dna_sequence)
# Snap those starts to multiples of three and drop duplicates (first-seen
# order, not numerically sorted).
ordered_positions = position_of_repeats(fragments)
# At each position, swap three consecutive codons for the next entry in their
# codon lists; edits are applied cumulatively.
edited_dna = loop_through_edit(dna_sequence, protein_sequence, ordered_positions)
# Number of 3-character slots whose text differs between original and edited DNA.
codon_differences = count_codon_differences(dna_sequence, edited_dna)
print(f"Codon Differences: {codon_differences}")
print(edited_dna)
# GC fraction of the whole edited sequence.
print(check_gc_content(edited_dna))
# 12-character windows whose GC fraction is above 0.7 or below 0.3, mapped to
# a start index.
print(get_high_gc_fragments(edited_dna))