-
Notifications
You must be signed in to change notification settings - Fork 0
/
Score.py
284 lines (255 loc) · 12.2 KB
/
Score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
import numpy as np
from collections import Counter
from Bio.Align import AlignInfo, MultipleSeqAlignment
def get_cost(costfn, res1, res2, defcost=0, ignore_gap=False, gap_char='-', gap_cost=None):
"""Return the cost between residue1 and residue2
costfn can be a function that accepts 3 arguments. For example, if you want the edit distance:
costfn = lambda x,y,z: (x!=y)*1
Alternatively costfn can also be a dict object that specify the substitution cost between a pair of residues
set ignore_gap if you want to ignore gap to gap alignment in the score (score set to )
"""
res1, res2 = res1.upper(), res2.upper()
score = None
if ignore_gap and res1 == res2 == gap_char:
score = defcost
elif gap_char and res1 != res2 and gap_char in [res1, res2]:
score = gap_cost
if score is None:
if callable(costfn):
score = costfn(res1, res2, defcost)
else:
score = costfn.get(res1, {}).get(res2) or costfn.get(res2, {}).get(res1)
score = defcost if score is None else score
return score
def compute_SP_score(alignment, cols=None, costfn={}, count_method=None, defcost=0, ignore_gap=True, gap_char='-', gap_cost=None):
"""Compute SP score for a given col of the alignment"""
if not isinstance(alignment, np.ndarray):
alignment = np.asarray(alignment)
if cols is None:
cols = np.arange(alignment.shape[-1])
elif not isinstance(cols, list):
cols = [cols]
sp_vec = np.zeros(len(cols))
for icol, pos in enumerate(cols):
sp = 0
col = alignment[:, cols[pos]]
if count_method:
col = count_method(col)
print(col)
col = list(col)
c_col = list(Counter(col).items())
n_res = len(c_col)
for i, (res, oc) in enumerate(c_col):
sp += (oc - 1)*get_cost(costfn, res, res,
defcost, ignore_gap, gap_char, gap_cost)
for j in range(i, n_res):
res2, oc2 = c_col[j]
sp += oc*oc2*get_cost(costfn, res, res2, defcost,
ignore_gap, gap_char, gap_cost)
sp_vec[icol] = sp
return sp_vec
def compute_ic_content(alignment, cols):
"""Compute the information content for the list of cols specified"""
msa = alignment
if isinstance(msa, np.ndarray):
msa = MultipleSeqAlignment(msa)
if not isinstance(msa, MultipleSeqAlignment):
raise ValueError("Expect MultipleSeqAlignment or np.ndarray, got {}".format(msa.__class__))
align_info = AlignInfo.SummaryInfo(msa)
align_info.information_content()
ic_vector = align_info.ic_vector
# biopython wrong version hack
if isinstance(ic_vector, dict):
ic_vector = np.zeros(len(align_info.ic_vector))
for (ic_i, ic_v) in align_info.ic_vector.items():
ic_vector[ic_i] = ic_v
return list(ic_vector)
def information_content(self, start=0,
end=None,
e_freq_table=None, log_base=2,
chars_to_ignore=None, pseudo_count=0):
"""Calculate the information content for each residue along an alignment.
Arguments:
- start, end - The starting an ending points to calculate the
information content. These points should be relative to the first
sequence in the alignment, starting at zero (ie. even if the 'real'
first position in the seq is 203 in the initial sequence, for
the info content, we need to use zero). This defaults to the entire
length of the first sequence.
- e_freq_table - A FreqTable object specifying the expected frequencies
for each letter in the alphabet we are using (e.g. {'G' : 0.4,
'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
included, since these should not have expected frequencies.
- log_base - The base of the logathrim to use in calculating the
information content. This defaults to 2 so the info is in bits.
- chars_to_ignore - A listing of characters which should be ignored
in calculating the info content. Defaults to none.
Returns:
- A number representing the info content for the specified region.
Please see the Biopython manual for more information on how information
content is calculated.
"""
# if no end was specified, then we default to the end of the sequence
if end is None:
end = len(self.alignment[0].seq)
if chars_to_ignore is None:
chars_to_ignore = []
if start < 0 or end > len(self.alignment[0].seq):
raise ValueError("Start (%s) and end (%s) are not in the \
range %s to %s"
% (start, end, 0, len(self.alignment[0].seq)))
# determine random expected frequencies, if necessary
random_expected = None
if not e_freq_table:
# TODO - What about ambiguous alphabets?
base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
if isinstance(base_alpha, Alphabet.ProteinAlphabet):
random_expected = Protein20Random
elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
random_expected = Nucleotide4Random
else:
errstr = "Error in alphabet: not Nucleotide or Protein, "
errstr += "supply expected frequencies"
raise ValueError(errstr)
del base_alpha
elif not isinstance(e_freq_table, FreqTable.FreqTable):
raise ValueError("e_freq_table should be a FreqTable object")
# determine all of the letters we have to deal with
all_letters = self._get_all_letters()
for char in chars_to_ignore:
all_letters = all_letters.replace(char, '')
info_content = {}
for residue_num in range(start, end):
freq_dict = self._get_letter_freqs(residue_num,
self.alignment,
all_letters,
chars_to_ignore,
pseudo_count,
e_freq_table,
random_expected)
# print freq_dict,
column_score = self._get_column_info_content(freq_dict,
e_freq_table,
log_base,
random_expected)
info_content[residue_num] = column_score
# sum up the score
total_info = sum(info_content.values())
# fill in the ic_vector member: holds IC for each column
# reset ic_vector to empty list at each call
self.ic_vector = []
for (i, k) in enumerate(info_content):
self.ic_vector.append(info_content[i + start])
return total_info
def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore,
pseudo_count=0, e_freq_table=None, random_expected=None):
"""Determine the frequency of specific letters in the alignment (PRIVATE).
Arguments:
- residue_num - The number of the column we are getting frequencies
from.
- all_records - All of the SeqRecords in the alignment.
- letters - The letters we are interested in getting the frequency
for.
- to_ignore - Letters we are specifically supposed to ignore.
- pseudo_count - Optional argument specifying the Pseudo count (k)
to add in order to prevent a frequency of 0 for a letter.
- e_freq_table - An optional argument specifying the expected
frequencies for each letter. This is a SubsMat.FreqTable instance.
- random_expected - Optional argument that specify the frequency to use
when e_freq_table is not defined.
This will calculate the frequencies of each of the specified letters
in the alignment at the given frequency, and return this as a
dictionary where the keys are the letters and the values are the
frequencies. Pseudo count can be added to prevent a null frequency
"""
freq_info = self._get_base_letters(letters)
total_count = 0
gap_char = self._get_gap_char()
if pseudo_count < 0:
raise ValueError("Positive value required for "
"pseudo_count, %s provided" % (pseudo_count))
# collect the count info into the dictionary for all the records
for record in all_records:
try:
if record.seq[residue_num] not in to_ignore:
weight = record.annotations.get('weight', 1.0)
freq_info[record.seq[residue_num]] += weight
total_count += weight
# getting a key error means we've got a problem with the alphabet
except KeyError:
raise ValueError("Residue %s not found in alphabet %s"
% (record.seq[residue_num],
self.alignment._alphabet))
if e_freq_table:
if not isinstance(e_freq_table, FreqTable.FreqTable):
raise ValueError("e_freq_table should be a FreqTable object")
# check if all the residus in freq_info are in e_freq_table
for key in freq_info:
if (key != gap_char and key not in e_freq_table):
raise ValueError("letters in current column %s "
"and not in expected frequency table %s"
% (list(freq_info) - [gap_char],
list(e_freq_table)))
if total_count == 0:
# This column must be entirely ignored characters
for letter in freq_info:
assert freq_info[letter] == 0
# TODO - Map this to NA or NaN?
else:
# now convert the counts into frequencies
for letter in freq_info:
if pseudo_count and (random_expected or e_freq_table):
# use either the expected random freq or the
if e_freq_table:
ajust_freq = e_freq_table[letter]
else:
ajust_freq = random_expected
ajusted_letter_count = freq_info[
letter] + ajust_freq * pseudo_count
ajusted_total = total_count + pseudo_count
freq_info[letter] = ajusted_letter_count / ajusted_total
else:
freq_info[letter] = freq_info[letter] / total_count
return freq_info
def _get_column_info_content(self, obs_freq, e_freq_table, log_base,
random_expected):
"""Calculate the information content for a column (PRIVATE).
Arguments:
- obs_freq - The frequencies observed for each letter in the column.
- e_freq_table - An optional argument specifying the expected
frequencies for each letter. This is a SubsMat.FreqTable instance.
- log_base - The base of the logathrim to use in calculating the
info content.
"""
gap_char = self._get_gap_char()
if e_freq_table:
if not isinstance(e_freq_table, FreqTable.FreqTable):
raise ValueError("e_freq_table should be a FreqTable object")
# check the expected freq information to make sure it is good
for key in obs_freq:
if (key != gap_char and key not in e_freq_table):
raise ValueError("Expected frequency letters %s "
"do not match observed %s"
% (list(e_freq_table),
list(obs_freq) - [gap_char]))
total_info = 0.0
for letter in obs_freq:
inner_log = 0.0
# if we have expected frequencies, modify the log value by them
# gap characters do not have expected frequencies, so they
# should just be the observed frequency.
if letter != gap_char:
if e_freq_table:
inner_log = obs_freq[letter] / e_freq_table[letter]
else:
inner_log = obs_freq[letter] / random_expected
# if the observed frequency is zero, we don't add any info to the
# total information content
if inner_log > 0:
letter_info = (obs_freq[letter] *
math.log(inner_log) / math.log(log_base))
total_info += letter_info
return total_info
def get_column(self, col):
# TODO - Deprecate this and implement slicing?
return self.alignment[:, col]