-
Notifications
You must be signed in to change notification settings - Fork 0
/
alignio.c
466 lines (419 loc) · 14.2 KB
/
alignio.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
/* SQUID - A C function library for biological sequence analysis
* Copyright (C) 1992-1995 Sean R. Eddy
*
* This source code is distributed under terms of the
* GNU General Public License. See the files COPYING
* and GNULICENSE for further details.
*
*/
/* alignio.c
* SRE, Mon Jul 12 11:57:37 1993
*
* Input/output of sequence alignments.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "squid.h"
#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif
/* Function: FreeAlignment()
*
* Purpose: Free the space allocated to alignment, names, and optional
* information.
*
* Args: aseqs - sequence alignment
* nseq - number of sequences
* ainfo - optional extra data. May be NULL.
*/
void
FreeAlignment(char **aseqs, int nseq, struct aliinfo_s *ainfo)
{
int i;
for (i = 0; i < nseq; i++)
{
if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss);
if (ainfo->sqinfo[i].flags & SQINFO_FREE) free(ainfo->sqinfo[i].free);
if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa);
}
if (ainfo->flags & AINFO_CS) free(ainfo->cs);
if (ainfo->flags & AINFO_RF) free(ainfo->rf);
free(ainfo->sqinfo);
Free2DArray(aseqs, nseq);
}
/* Function: MakeAlignedString()
*
* Purpose: Given a raw string of some type (secondary structure, say),
* align it to a given aseq by putting gaps wherever the
* aseq has gaps.
*
* Args: aseq: template for alignment
* alen: length of aseq
* ss: raw string to align to aseq
* ret_s: RETURN: aligned ss
*
* Return: 1 on success, 0 on failure (and squid_errno is set.)
* ret_ss is malloc'ed here and must be free'd by caller.
*/
int
MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
{
char *new;
int apos, rpos;
if ((new = (char *) malloc ((alen+1) * sizeof(char))) == NULL)
{ squid_errno = SQERR_MEM; return 0; }
for (apos = rpos = 0; apos < alen; apos++)
if (! isgap(aseq[apos]))
{
new[apos] = ss[rpos];
rpos++;
}
else
new[apos] = '.';
new[apos] = '\0';
if (rpos != strlen(ss))
{ squid_errno = SQERR_PARAMETER; free(new); return 0; }
*ret_s = new;
return 1;
}
/* Function: MakeDealignedString()
*
* Purpose: Given an aligned string of some type (either sequence or
* secondary structure, for instance), dealign it relative
* to a given aseq. Return a ptr to the new string.
*
* Args: aseq : template alignment
* alen : length of aseq
* ss: : string to make dealigned copy of; same length as aseq
* ret_s : RETURN: dealigned copy of ss
*
* Return: 1 on success, 0 on failure (and squid_errno is set)
* ret_s is alloc'ed here and must be freed by caller
*/
int
MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s)
{
char *new;
int apos, rpos;
if ((new = (char *) malloc ((alen+1) * sizeof(char))) == NULL)
{ squid_errno = SQERR_MEM; return 0; }
for (apos = rpos = 0; apos < alen; apos++)
if (! isgap(aseq[apos]))
{
new[rpos] = ss[apos];
rpos++;
}
new[rpos] = '\0';
if (alen != strlen(ss))
{ squid_errno = SQERR_PARAMETER; free(new); return 0; }
*ret_s = new;
return 1;
}
/* Function: ReadAlignment()
*
* Purpose: Given a seqfile name, determine the alignment format of
* the file and read it in. Just a wrapper function, really.
*
* Currently, squid can parse alignments from the following
* multiple sequence alignment formats:
* MSF (U. of Wisconsin GCG package MSF format)
* SELEX (NeXagen/CU Boulder SELEX format)
*
* Return: 1 on success; 0 on failure.
* Returned data should be freed by caller with FreeAlignment()
*/
int
ReadAlignment(char *seqfile,
int format,
char ***ret_aseqs,
int *ret_num,
struct aliinfo_s *ret_ainfo)
{
switch (format) {
case kMSF:
if (! ReadMSF(seqfile, ret_aseqs, ret_num, ret_ainfo)) return 0;
break;
case kSelex:
if (! ReadSELEX(seqfile, ret_aseqs, ret_num, ret_ainfo)) return 0;
break;
default: squid_errno = SQERR_FORMAT; return 0;
}
return 1;
}
/* Function: WritePairwiseAlignment()
*
* Purpose: Write a nice formatted pairwise alignment out,
* with a BLAST-style middle line showing identities
* as themselves (single letter) and conservative
* changes as '+'.
*
* Args: ofp - open fp to write to (stdout, perhaps)
* aseq1, aseq2 - alignments to write (not necessarily
* flushed right with gaps)
* name1, name2 - names of sequences
* spos1, spos2 - starting position in each (raw) sequence
* pam - PAM matrix; positive values define
* conservative changes
* indent - how many extra spaces to print on left
*
* Return: 1 on success, 0 on failure
*/
int
WritePairwiseAlignment(FILE *ofp,
char *aseq1, char *name1, int spos1,
char *aseq2, char *name2, int spos2,
int **pam, int indent)
{
char sname1[11]; /* shortened name */
char sname2[11];
int still_going; /* True if writing another block */
char buf1[61]; /* buffer for writing seq1; CPL+1*/
char bufmid[61]; /* buffer for writing consensus */
char buf2[61];
char *s1, *s2; /* ptrs into each sequence */
int count1, count2; /* number of symbols we're writing */
int rpos1, rpos2; /* position in raw seqs */
int rawcount1, rawcount2; /* number of nongap symbols written */
int apos;
strncpy(sname1, name1, 10);
sname1[10] = '\0';
strtok(sname1, WHITESPACE);
strncpy(sname2, name2, 10);
sname2[10] = '\0';
strtok(sname2, WHITESPACE);
s1 = aseq1;
s2 = aseq2;
rpos1 = spos1;
rpos2 = spos2;
still_going = True;
while (still_going)
{
still_going = False;
/* get next line's worth from both */
strncpy(buf1, s1, 60); buf1[60] = '\0';
strncpy(buf2, s2, 60); buf2[60] = '\0';
count1 = strlen(buf1);
count2 = strlen(buf2);
/* is there still more to go? */
if ((count1 == 60 && s1[60] != '\0') ||
(count2 == 60 && s2[60] != '\0'))
still_going = True;
/* shift seq ptrs by a line */
s1 += count1;
s2 += count2;
/* assemble the consensus line */
for (apos = 0; apos < count1 && apos < count2; apos++)
{
if (!isgap(buf1[apos]) && !isgap(buf2[apos]))
{
if (buf1[apos] == buf2[apos])
bufmid[apos] = buf1[apos];
else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0)
bufmid[apos] = '+';
else
bufmid[apos] = ' ';
}
else
bufmid[apos] = ' ';
}
bufmid[apos] = '\0';
rawcount1 = 0;
for (apos = 0; apos < count1; apos++)
if (!isgap(buf1[apos])) rawcount1++;
rawcount2 = 0;
for (apos = 0; apos < count2; apos++)
if (!isgap(buf2[apos])) rawcount2++;
(void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
sname1, rpos1, buf1, rpos1 + rawcount1 -1);
(void) fprintf(ofp, "%*s %s\n", indent, "",
bufmid);
(void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
sname2, rpos2, buf2, rpos2 + rawcount2 -1);
(void) fprintf(ofp, "\n");
rpos1 += rawcount1;
rpos2 += rawcount2;
}
return 1;
}
/* Function: MingapAlignment()
*
* Purpose: Remove all-gap columns from a multiple sequence alignment
* and its associated data. The alignment is assumed to be
* flushed (all aseqs the same length).
*/
int
MingapAlignment(char **aseqs, int num, struct aliinfo_s *ainfo)
{
int apos; /* position in original alignment */
int mpos; /* position in new alignment */
int idx;
/* We overwrite aseqs, using its allocated memory.
*/
for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++)
{
/* check for all-gap in column */
for (idx = 0; idx < num; idx++)
if (! isgap(aseqs[idx][apos]))
break;
if (idx == num) continue;
/* shift alignment and ainfo */
if (mpos != apos)
{
for (idx = 0; idx < num; idx++)
aseqs[idx][mpos] = aseqs[idx][apos];
if (ainfo->flags & AINFO_CS) ainfo->cs[mpos] = ainfo->cs[apos];
if (ainfo->flags & AINFO_RF) ainfo->rf[mpos] = ainfo->rf[apos];
}
mpos++;
}
/* null terminate everything */
for (idx = 0; idx < num; idx++)
aseqs[idx][mpos] = '\0';
ainfo->alen = mpos; /* set new length */
if (ainfo->flags & AINFO_CS) ainfo->cs[mpos] = '\0';
if (ainfo->flags & AINFO_RF) ainfo->rf[mpos] = '\0';
return 1;
}
/* Function: RandomAlignment()
*
* Purpose: Create a random alignment from raw sequences.
*
* Ideally, we would like to sample an alignment from the
* space of possible alignments according to its probability,
* given a prior probability distribution for alignments.
* I don't see how to describe such a distribution, let alone
* sample it.
*
* This is a rough approximation that tries to capture some
* desired properties. We assume the alignment is generated
* by a simple HMM composed of match and insert states.
* Given parameters (pop, pex) for the probability of opening
* and extending an insertion, we can find the expected number
* of match states, M, in the underlying model for each sequence.
* We use an average M taken over all the sequences (this is
* an approximation. The expectation of M given all the sequence
* lengths is a nasty-looking summation.)
*
* M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) )
*
* Then, we assign positions in each raw sequence onto the M match
* states and M+1 insert states of this "HMM", by rolling random
* numbers and inserting the (rlen-M) inserted positions randomly
* into the insert slots, taking into account the relative probability
* of open vs. extend.
*
* The resulting alignment has two desired properties: insertions
* tend to follow the HMM-like exponential distribution, and
* the "sparseness" of the alignment is controllable through
* pop and pex.
*
* Args: rseqs - raw sequences to "align", 0..nseq-1
* sqinfo - array of 0..nseq-1 info structures for the sequences
* nseq - number of sequences
* pop - probability to open insertion (0<pop<1)
* pex - probability to extend insertion (0<pex<1)
* ret_aseqs - RETURN: alignment (flushed)
* ainfo - fill in: alignment info
*
* Return: 1 on success, 0 on failure. Sets squid_errno to indicate cause
* of failure.
*/
int
RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, double pop, double pex,
char ***ret_aseqs, AINFO *ainfo)
{
char **aseqs; /* RETURN: alignment */
int alen; /* length of alignment */
int *rlen; /* lengths of each raw sequence */
int M; /* length of "model" */
int **ins; /* insertion counts, 0..nseq-1 by 0..M */
int *master_ins; /* max insertion counts, 0..M */
int apos, rpos, idx;
int statepos;
int count;
int minlen;
/* calculate expected length of model, M
*/
if ((rlen = (int *) malloc (sizeof(int) * nseq)) == NULL)
{ squid_errno = SQERR_MEM; return 0; }
M = 0;
minlen = 9999999;
for (idx = 0; idx < nseq; idx++)
{
rlen[idx] = strlen(rseqs[idx]);
M += rlen[idx];
minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen;
}
M = (int) ((double) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex))));
M /= nseq;
if (M > minlen) M = minlen;
/* make arrays that count insertions in M+1 possible insert states
*/
if ((ins = (int **) malloc (sizeof(int *) * nseq)) == NULL ||
(master_ins = (int *) malloc (sizeof(int) * (M+1))) == NULL)
{ squid_errno = SQERR_MEM; return 0; }
for (idx = 0; idx < nseq; idx++)
{
if ((ins[idx] = (int *) malloc (sizeof(int) * (M+1))) == NULL)
{ squid_errno = SQERR_MEM; return 0; }
for (rpos = 0; rpos <= M; rpos++)
ins[idx][rpos] = 0;
}
/* normalize */
pop = pop / (pop+pex);
pex = 1.0 - pop;
/* make insertions for individual sequences */
for (idx = 0; idx < nseq; idx++)
{
apos = -1;
for (rpos = 0; rpos < rlen[idx]-M; rpos++)
{
if (sre_random() < pop || apos == -1) /* open insertion */
apos = CHOOSE(M+1);
ins[idx][apos]++;
}
}
/* calculate master_ins, max inserts */
alen = M;
for (apos = 0; apos <= M; apos++)
{
master_ins[apos] = 0;
for (idx = 0; idx < nseq; idx++)
if (ins[idx][apos] > master_ins[apos])
master_ins[apos] = ins[idx][apos];
alen += master_ins[apos];
}
/* Now, construct alignment
*/
if ((aseqs = (char **) malloc (sizeof (char *) * nseq)) == NULL)
{ squid_errno = SQERR_MEM; return 0; }
for (idx = 0; idx < nseq; idx++)
if ((aseqs[idx] = (char *) malloc (sizeof(char) * (alen+1))) == NULL)
{ squid_errno = SQERR_MEM; return 0; }
for (idx = 0; idx < nseq; idx++)
{
apos = rpos = 0;
for (statepos = 0; statepos <= M; statepos++)
{
for (count = 0; count < ins[idx][statepos]; count++)
aseqs[idx][apos++] = rseqs[idx][rpos++];
for (; count < master_ins[statepos]; count++)
aseqs[idx][apos++] = ' ';
if (statepos != M)
aseqs[idx][apos++] = rseqs[idx][rpos++];
}
aseqs[idx][alen] = '\0';
}
ainfo->flags = 0;
ainfo->alen = alen; ainfo->flags |= AINFO_ALEN;
if ((ainfo->sqinfo = (SQINFO *) malloc (sizeof(SQINFO) * nseq)) == NULL)
Die("malloc failed");
for (idx = 0; idx < nseq; idx++)
SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));
free(rlen);
free(master_ins);
Free2DArray(ins, nseq);
*ret_aseqs = aseqs;
return 1;
}