-
Notifications
You must be signed in to change notification settings - Fork 0
/
dayhoff.c
196 lines (175 loc) · 5.66 KB
/
dayhoff.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
/* SQUID - A C function library for biological sequence analysis
* Copyright (C) 1992-1995 Sean R. Eddy
*
* This source code is distributed under terms of the
* GNU General Public License. See the files COPYING
* and GNULICENSE for further details.
*
*/
/* dayhoff.c
*
* Routines for dealing with PAM matrices.
*
* Includes:
* ParsePAMFile() -- read a PAM matrix from disk.
*
*
* SRE - Fri Apr 2 11:23:45 1993
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <ctype.h>
#include "squid.h"
#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif
/* Function: ParsePAMFile()
*
* Purpose: Given a pointer to an open file containing a PAM matrix,
* parse the file and allocate and fill a 2D array of
* doubles containing the matrix. The PAM file is
* assumed to be in the format that NCBI distributes
* with BLAST. BLOSUM matrices also work fine, as
* produced by Henikoff's program "MATBLAS".
*
* Parses both old format and new format BLAST matrices.
* Old format just had rows of integers.
* New format includes a leading character on each row.
*
* The PAM matrix is a 27x27 matrix, 0=A..25=Z,26=*.
* Note that it's not a 20x20 matrix as you might expect;
* this is for speed of indexing as well as the ability
* to deal with ambiguous characters.
*
* Args: fp - open PAM file
* ret_pam - RETURN: pam matrix, integers
* ret_scale - RETURN: scale factor for converting
* to real Sij. For instance, PAM120 is
* given in units of ln(2)/2. This may
* be passed as NULL if the caller
* doesn't care.
*
* Returns: 1 on success; 0 on failure and sets squid_errno to
* indicate the cause. ret_pam is allocated here and
* must be freed by the caller (use FreePAM).
*/
int
ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale)
{
int **pam;
char buffer[512]; /* input buffer from fp */
int order[27]; /* order of fields, obtained from header */
int nsymbols; /* total number of symbols in matrix */
char *sptr;
int idx;
int row, col;
float scale;
int gotscale = FALSE;
if (fp == NULL) { squid_errno = SQERR_NODATA; return 0; }
/* Look at the first non-blank, non-comment line in the file.
* It gives single-letter codes in the order the PAM matrix
* is arrayed in the file.
*/
do {
if (fgets(buffer, 512, fp) == NULL)
{ squid_errno = SQERR_NODATA; return 0; }
/* Get the scale factor from the header.
* For BLOSUM files, we assume the line looks like:
* BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
* and we assume that the fraction is always 1/x;
*
* For PAM files, we assume the line looks like:
* PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574
* and we assume that the number following the final '=' is our scale
*/
if (strstr(buffer, "BLOSUM Clustered Scoring Matrix") != NULL &&
(sptr = strchr(buffer, '/')) != NULL)
{
sptr++;
if (! isdigit(*sptr)) { squid_errno = SQERR_FORMAT; return 0; }
scale = (float) (log(2.0) / atof(sptr));
gotscale = TRUE;
}
else if (strstr(buffer, "substitution matrix,") != NULL)
{
while ((sptr = strrchr(buffer, '=')) != NULL) {
sptr += 2;
if (IsReal(sptr)) {
scale = atof(sptr);
gotscale = TRUE;
break;
}
}
}
} while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#');
idx = 0;
do {
order[idx] = (int) *sptr - (int) 'A';
if (order[idx] < 0 || order[idx] > 25) order[idx] = 26;
idx++;
} while ((sptr = strtok(NULL, " \t\n")) != NULL);
nsymbols = idx;
/* Allocate a pam matrix. For speed of indexing, we use
* a 27x27 matrix so we can do lookups using the ASCII codes
* of amino acid single-letter representations, plus one
* extra field to deal with the "*" (terminators).
*/
if ((pam = (int **) calloc (27, sizeof(int *))) == NULL)
{ squid_errno = SQERR_MEM; return 0; }
for (idx = 0; idx < 27; idx++)
if ((pam[idx] = (int *) calloc (27, sizeof(int))) == NULL)
{ squid_errno = SQERR_MEM; return 0; }
/* Parse the rest of the file.
*/
for (row = 0; row < nsymbols; row++)
{
if (fgets(buffer, 512, fp) == NULL)
{ squid_errno = SQERR_NODATA; return 0; }
if ((sptr = strtok(buffer, " \t\n")) == NULL)
{ squid_errno = SQERR_NODATA; return 0; }
for (col = 0; col < nsymbols; col++)
{
if (sptr == NULL) { squid_errno = SQERR_NODATA; return 0; }
/* Watch out for new BLAST format, with leading characters
*/
if (*sptr == '*' || isalpha((int) *sptr))
col--; /* hack hack */
else
pam [order[row]] [order[col]] = atoi(sptr);
sptr = strtok(NULL, " \t\n");
}
}
/* Return
*/
if (ret_scale != NULL)
{
if (gotscale) *ret_scale = scale;
else
{
Warn("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
*ret_scale = log(2.0) / 2.0;
}
}
*ret_pam = pam;
return 1;
}
/* Function: ScalePAM()
*
* Purpose: Simple linear scaling of a PAM matrix, for instance
* to change from the 10log10(P) of classic Dayhoff
* to 2log2(P) of BLAST matrices.
*
* Args: pam: 27x27 matrix of scores
* scale: scale factor to multiply by
*/
void
ScalePAM(int **pam, int scale)
{
int i;
int j;
for (i = 0; i < 27; i++)
for (j = 0; j < 27; j++)
pam[i][j] = pam[i][j] * scale;
}