-
Notifications
You must be signed in to change notification settings - Fork 0
/
HW2_2c.py
202 lines (163 loc) · 5.73 KB
/
HW2_2c.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""Script for computing sequence alignments using Needleman-Wunsch with
affine gap penalties.
Arguments:
f - FASTA file with sequences in FASTA format.
s - JSON with the score matrix for alignment.
d - The gap opening penalty for the alignment.
e - The gap extension penalty for the alignment.
Outputs:
Prints alignment to console.
Example Usage:
python 2c.py -f sequences.fasta -s score_matrix.json -d 430 -e 30
"""
import argparse
import json
import numpy as np
'''Computes the actual string alignments given the traceback matrix.
Arguments:
x: the first string we're aligning
y: the second string we're aligning
t: the traceback matrix, which stores values that point to which
prior matrix was used to reach a given location in each of the
3 matrices.
start: value indicating the starting matrix (that had the optimal value)
Returns:
a_x: the string for the alignment of x's sequence
a_y: the string for the alignment of y's sequence
'''
def traceback(x, y, t, start):
# Initialize a_x and a_y
a_x = ''
a_y = ''
# Starting from the bottom right corner of the matrix to work backwards
r = len(x)
c = len(y)
# Set current value to predetermined starting point
curr = start
# Until reaching top left corner, add next residue or gap to a_x or a_y
while r != 0 or c != 0:
if curr == 0:
a_x += x[r-1]
a_y += y[c-1]
r -= 1
c -= 1
elif curr == 1:
a_x += x[r-1]
a_y += '-'
r -= 1
else:
a_x += '-'
a_y += y[c-1]
c -= 1
# Switch between m, i_x, and i_y if necessary before next iteration
prev = t[r][c][curr]
curr = prev
# Reverse strings (we were appending to the end not beginning) and return
return a_x[::-1], a_y[::-1]
'''Computes the score and alignment of two strings using an affine gap penalty.
Arguments:
x: the first string we're aligning
y: the second string we're aligning
s: the score matrix
d: the gap opening penalty
e: the gap extension penalty
Returns:
score: the score of the optimal sequence alignment
a_x: the aligned first string
a_y: the aligned second string
The latter two are computed using the above traceback method.
'''
def affine_sequence_alignment(x, y, s, d, e):
# Initialize necessary matrices
m = np.zeros([len(x)+1, len(y)+1])
''' Recurrence matrix'''
i_x = np.zeros([len(x)+1, len(y)+1])
''' Recurrence matrix'''
i_y = np.zeros([len(x)+1, len(y)+1])
''' Recurrence matrix'''
# Extra dimension
t = np.zeros([len(x)+1, len(y)+1, 3])
''' Traceback matrix, redefine/use as necessary. '''
for i in range(1, len(x)+1):
m[i][0] = float("-inf")
i_x[i][0] = i_x[i-1][0] - e
i_y[i][0] = float("-inf")
for j in range(1, len(y)+1):
m[0][j] = float("-inf")
i_x[0][j] = float("-inf")
i_y[0][j] = i_y[0][j-1]-e
for i in range(1, len(x)+1):
for j in range(1, len(y)+1):
m_val = max(m[i-1][j-1]+s[x[i-1]][y[j-1]],
i_x[i-1][j-1]+s[x[i-1]][y[j-1]],
i_y[i-1][j-1]+s[x[i-1]][y[j-1]])
m[i][j] = m_val
if m_val == m[i-1][j-1]+s[x[i-1]][y[j-1]]:
t[i][j][0] = 0
elif m_val == i_x[i-1][j-1]+s[x[i-1]][y[j-1]]:
t[i][j][0] = 1
else:
t[i][j][0] = 2
ix_val = max(m[i-1, j]-d,
i_x[i-1][j]-e)
i_x[i][j] = ix_val
if ix_val == m[i-1, j]-d:
t[i][j][1] = 0
else:
t[i][j][1] = 1
iy_val = max(m[i][j-1]-d,
i_y[i][j-1]-e)
i_y[i][j] = iy_val
if iy_val == m[i][j-1]-d:
t[i][j][2] = 0
else:
t[i][j][2] = 2
score = max(m[len(x)][len(y)],
i_x[len(x)][len(y)],
i_y[len(x)][len(y)])
if score == m[len(x)][len(y)]:
start = 0
elif score == i_x[len(x)][len(y)]:
start = 1
else:
start = 2
''' Indicator of the starting point (which matrix has optimal value).'''
a_x, a_y = traceback(x, y, t, start)
return score, (a_x, a_y)
'''Prints two aligned sequences formatted for convenient inspection.
Arguments:
a_x: the first sequence aligned
a_y: the second sequence aligned
Outputs:
Prints aligned sequences (80 characters per line) to console
'''
def print_alignment(a_x, a_y):
assert len(a_x) == len(a_y), "Sequence alignment lengths must be the same."
for i in range(1 + (len(a_x) / 80)):
start = i * 80
end = (i + 1) * 80
print a_x[start:end]
print a_y[start:end]
print
def main():
parser = argparse.ArgumentParser(
description='Calculate sequence alignments for two sequences with an affine gap penalty.')
parser.add_argument('-f', action="store", dest="f", type=str, required=True)
parser.add_argument('-s', action="store", dest="s", type=str, required=True)
parser.add_argument('-d', action="store", dest="d", type=float, required=True)
parser.add_argument('-e', action="store", dest="e", type=float, required=True)
args = parser.parse_args()
fasta_file = args.f
score_matrix_file = args.s
d = args.d
e = args.e
with open(fasta_file) as f:
_, x, _, y = [line.strip() for line in f.readlines()]
with open(score_matrix_file) as f:
s = json.loads(f.readlines()[0])
score, (a_x, a_y) = affine_sequence_alignment(x, y, s, d, e)
print "Alignment:"
print_alignment(a_x, a_y)
print "Score: " + str(score)
if __name__ == "__main__":
main()