aligneval.c

/* SQUID - A C function library for biological sequence analysis
 * Copyright (C) 1992-1995 Sean R. Eddy	
 *
 *    This source code is distributed under terms of the
 *    GNU General Public License. See the files COPYING 
 *    and GNULICENSE for further details.
 *
 */

/* aligneval.c
 * 
 * Comparison of multiple alignments. Three functions are
 * provided, using subtly different scoring schemes:
 *    CompareMultAlignments()    - basic scoring scheme
 *    CompareRefMultAlignments() - only certain "canonical" columns 
 *                                 are scored
 *                                 
 * The similarity measure is a fractional alignment identity averaged
 * over all sequence pairs. The score for all pairs is:
 *      (identically aligned symbols) / (total aligned columns in 
 *      known alignment)
 *      
 * A column c is identically aligned for sequences i, j if:
 *    1) both i,j have a symbol aligned in column c, and the
 *       same pair of symbols is aligned somewhere in the test
 *       alignment
 *    2) S[i][c] is aligned to a gap in sequence j, and that symbol
 *       is aligned to a gap in the test alignment
 *    3) converse of 2)
 *    
 *    
 * The algorithm is as follows:
 *    1) For each known/test aligned pair of sequences (k1,k2 and t1,t2)
 *        construct a list for each sequence, in which for every
 *        counted symbol we record the raw index of the symbol in
 *        the other sequence that it aligns to, or -1 if it aligns
 *        to a gap or uncounted symbol.
 *        
 *    2)  Compare the list for k1 to the list for t1 and count an identity 
 *        for each correct alignment.
 *        
 *    3) Repeat 2) for comparing k2 to t2. Note that this means correct sym/sym
 *       alignments count for 2; correct sym/gap alignments count for 1.
 *    
 *    4) The score is (identities from 2 + identities from 3) / 
 *       (totals from 2 + totals from 3).
 *
 * Written originally for koala's ss2 pairwise alignment package.
 * 
 * Sean Eddy, Sun Nov  1 12:45:11 1992
 * SRE, Thu Jul 29 16:47:18 1993: major revision: all functions replaced by new algorithm
 */


#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "squid.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif

static int make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen);
static int make_ref_alilist(int *refcoords, char *k1, char *k2, char *s1, char *s2, 
			    int **ret_s1_list, int *ret_listlen);
static int compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc);


/* Function: ComparePairAlignments
 * 
 * Purpose:  Calculate and return a number representing how well two different alignments
 *           of a pair of sequences compare. The number is, roughly speaking,
 *           the fraction of columns which are identically aligned.
 * 
 *           For all columns c in which either known1[c] or known2[c] 
 *           is a non-gap, count an identity if those same symbols are
 *           aligned somewhere in calc1/calc2. The score is identities/total
 *           columns examined. (i.e. fully gapped columns don't count)
 * 
 *           more explicitly, identities come from:
 *             both known and test aligned pairs have the same symbol in the first sequence aligned to
 *               a gap in the second sequence;
 *             both known and test aligned pairs have the same symbol in the second sequence
 *               aligned to a gap in the first sequence;
 *             the known alignment has symbols aligned at this column, and the test
 *               alignment aligns the same two symbols.
 * 
 * Args:     known1, known2: trusted alignment of two sequences
 *           calc1, calc2:   test alignment of two sequences
 *  
 * Return:   Returns -1.0 on internal failure.
 */
float
ComparePairAlignments(char *known1, char *known2, char *calc1, char *calc2)
{
  int *klist1;
  int *klist2;
  int *tlist1;
  int *tlist2;
  int len1, len2;
  float score;

  if (! make_alilist(calc1,  calc2,  &tlist1, &len1)) return -1.0;
  if (! make_alilist(calc2,  calc1,  &tlist2, &len2)) return -1.0;
  if (! make_alilist(known1, known2, &klist1, &len1)) return -1.0;
  if (! make_alilist(known2, known1, &klist2, &len2)) return -1.0;
  if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
  
  free(klist1);
  free(klist2);
  free(tlist1);
  free(tlist2);
  return score;
}


/* Function: CompareRefPairAlignments()
 * 
 * Same as above, but the only columns that count are the ones
 * with indices in *refcoord. *refcoord and the known1, known2
 * pair must be in sync with each other (come from the same
 * multiple sequence alignment)
 *
 * Args:     ref           - 0..alen-1 array of 1 or 0 
 *           known1,known2 - trusted alignment
 *           calc1, calc2  - test alignment           
 *
 * Return:  the fractional alignment identity on success, -1.0 on failure.
 */
float
CompareRefPairAlignments(int  *ref, char *known1, char *known2, char *calc1, char *calc2)
{
  int *klist1;
  int *klist2;
  int *tlist1;
  int *tlist2;
  int len1, len2;
  float score;

  if (! make_ref_alilist(ref, known1, known2, calc1,  calc2,  &tlist1, &len1)) return -1.0;
  if (! make_ref_alilist(ref, known2, known1, calc2,  calc1,  &tlist2, &len2)) return -1.0;
  if (! make_ref_alilist(ref, known1, known2, known1, known2, &klist1, &len1)) return -1.0;
  if (! make_ref_alilist(ref, known2, known1, known2, known1, &klist2, &len2)) return -1.0;
  if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
  
  free(klist1);
  free(klist2);
  free(tlist1);
  free(tlist2);
  return score;
}

/* Function: make_alilist()
 * 
 * Purpose:  Construct a list (array) mapping the raw symbols of s1
 *           onto the indexes of the aligned symbols in s2 (or -1
 *           for gaps in s2). The list (s1_list) will be of the
 *           length of s1's raw sequence.
 *           
 * Args:     s1          - sequence to construct the list for
 *           s2          - sequence s1 is aligned to
 *           ret_s1_list - RETURN: the constructed list (caller must free)
 *           ret_listlen - RETURN: length of the list
 *           
 * Returns:  1 on success, 0 on failure
 */
static int
make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
{
  int *s1_list;
  int  col;			/* column position in alignment */
  int  r1, r2;			/* raw symbol index at current col in s1, s2 */
  
  /* Malloc for s1_list. It can't be longer than s1 itself; we just malloc
   * for that (and waste a wee bit of space)
   */
  if ((s1_list = (int *) malloc (sizeof(int) * strlen(s1))) == NULL)
    { squid_errno = SQERR_MEM; return 0; }

  r1 = r2 = 0;
  for (col = 0; s1[col] != '\0'; col++)
    {
      /* symbol in s1? Record what it's aligned to, and bump
       * the r1 counter.
       */
      if (! isgap(s1[col]))
	{
	  s1_list[r1] = isgap(s2[col]) ? -1 : r2;
	  r1++;
	}

      /* symbol in s2? bump the r2 counter
       */
      if (! isgap(s2[col]))
	r2++;
    }

  *ret_listlen = r1;
  *ret_s1_list = s1_list;
  return 1;
}


/* Function: make_ref_alilist()
 * 
 * Purpose:  Construct a list (array) mapping the raw symbols of s1
 *           which are under canonical columns of the ref alignment
 *           onto the indexes of the aligned symbols in s2 (or -1
 *           for gaps in s2 or noncanonical symbols in s2). 
 *           
 * Args:     ref:        - array of indices of canonical coords (1 canonical, 0 non)
 *           k1          - s1's known alignment (w/ respect to refcoords)
 *           k2          - s2's known alignment (w/ respect to refcoords)
 *           s1          - sequence to construct the list for
 *           s2          - sequence s1 is aligned to
 *           ret_s1_list - RETURN: the constructed list (caller must free)
 *           ret_listlen - RETURN: length of the list
 *           
 * Returns:  1 on success, 0 on failure
 */
static int
make_ref_alilist(int *ref, char *k1, char *k2,
		 char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
{
  int *s1_list;
  int  col;			/* column position in alignment */
  int  r1, r2;			/* raw symbol index at current col in s1, s2 */
  int *canons1;			/* flag array, 1 if position i in s1 raw seq is canonical */
  int  lpos;			/* position in list */
  
  /* Allocations. No arrays can exceed the length of their
   * appropriate parent (s1 or s2)
   */
  if ((s1_list = (int *) malloc (sizeof(int) * strlen(s1))) == NULL ||
      (canons1 = (int *) malloc (sizeof(int) * strlen(s1))) == NULL)
    { squid_errno = SQERR_MEM; return 0; }

  /* First we use refcoords and k1,k2 to construct an array of 1's 
   * and 0's, telling us whether s1's raw symbol number i is countable.
   * It's countable simply if it's under a canonical column.
   */
  r1 =  0;
  for (col = 0; k1[col] != '\0'; col++)
    {
      if (! isgap(k1[col]))
	{
	  canons1[r1] = ref[col] ? 1 : 0;
	  r1++;
	}
    }

  /* Now we can construct the list. We don't count pairs if the sym in s1
   * is non-canonical.
   * We have to keep separate track of our position in the list (lpos)
   * from our positions in the raw sequences (r1,r2)
   */
  r1 = r2 = lpos = 0;
  for (col = 0; s1[col] != '\0'; col++)
    {
      if (! isgap(s1[col]) && canons1[r1])
	{
	  s1_list[lpos] = isgap(s2[col]) ? -1 : r2;
	  lpos++;
	}
      
      if (! isgap(s1[col]))
	r1++;
      if (! isgap(s2[col]))
	r2++;
    }

  free(canons1);
  *ret_listlen = lpos;
  *ret_s1_list = s1_list;
  return 1;
}

/* Function: compare_lists()
 * 
 * Purpose:  Given four alignment lists (k1,k2, t1,t2), calculate the
 *           alignment score.
 *           
 * Args:     k1   - list of k1's alignment to k2
 *           k2   - list of k2's alignment to k1
 *           t1   - list of t1's alignment to t2
 *           t2   - list of t2's alignment to t2
 *           len1 - length of k1, t1 lists (same by definition)
 *           len2 - length of k2, t2 lists (same by definition)
 *           ret_sc - RETURN: identity score of alignment
 *
 * Return:   1 on success, 0 on failure.
 */           
static int
compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc)
{
  float id;
  float tot;
  int   i;

  id = tot = 0.0;
  for (i = 0; i < len1; i++)
    {
      tot += 1.0;
      if (t1[i] == k1[i]) id += 1.0;
    }

  for ( i = 0; i < len2; i++)
    {
      tot += 1.0;
      if (k2[i] == t2[i]) id += 1.0;
    }

  *ret_sc = id / tot;
  return 1;
}


/* Function: CompareMultAlignments
 * 
 * Purpose:  Invokes pairwise alignment comparison for every possible pair,
 *           and returns the average score over all N(N-1) of them or -1.0
 *           on an internal failure.
 * 
 *           Can be slow for large N, since it's quadratic.
 *
 * Args:     kseqs  - trusted multiple alignment
 *           tseqs  - test multiple alignment
 *           N      - number of sequences
 *           
 * Return:   average identity score, or -1.0 on failure.          
 */
float
CompareMultAlignments(char **kseqs, char **tseqs, int N)
{
  int    i, j;			/* counters for sequences */
  float  score;
  float  tot_score = 0.0;
				/* do all pairwise comparisons */
  for (i = 0; i < N; i++)
    for (j = i+1; j < N; j++)
      {
	score = ComparePairAlignments(kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
	if (score < 0.0) return -1.0;
	tot_score += score;
      }
  return ((tot_score * 2.0) / ((float) N * ((float) N - 1.0)));
}


/* Function: CompareRefMultAlignments()
 * 
 * Purpose:  Same as above, except an array of reference coords for
 *           the canonical positions of the known alignment is also
 *           provided.
 *
 * Args:     ref      : 0..alen-1 array of 1/0 flags, 1 if canon
 *           kseqs    : trusted alignment
 *           tseqs    : test alignment
 *           N        : number of sequences
 *
 * Return:   average identity score, or -1.0 on failure
 */
float
CompareRefMultAlignments(int   *ref, char **kseqs, char **tseqs, int N)
{
  int    i, j;			/* counters for sequences */
  float  score;
  float  tot_score = 0.0;
  
				/* do all pairwise comparisons */
  for (i = 0; i < N; i++)
    for (j = i+1; j < N; j++)
      {
	score = CompareRefPairAlignments(ref, kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
	if (score < 0.0) return -1.0;
	tot_score += score;
      }
  return ((tot_score * 2.0)/ ((float) N * ((float) N - 1.0)));
}

/* Function: PairwiseIdentity()
 * 
 * Purpose:  Calculate the pairwise fractional identity between
 *           two aligned sequences s1 and s2. This is 
 *           (idents / total aligned positions). 
 *           
 *           It is obvious how to calculate idents, but less obvious
 *           how to calculate total aligned positions.
 *           Therefore we return three different numbers:
 *              id1:   relative to s1; the denominator is the length of s1
 *              id2:   relative to s2; the denominator is the length of s2
 *              idtot: all aligned positions are counted towards the
 *                      denominator
 *           Any of these three may be passed as NULL if you don't want
 *           their values.
 *
 *           If allow_ragged is TRUE, overhangs are not counted at all.
 *           Only the subalignment bounded by the outermost aligned pairs
 *           is counted.
 *   
 *           Watch out in nucleic acid alignments; U/T RNA/DNA alignments
 *           will be counted as mismatches!
 */
void
PairwiseIdentity(char *s1, char *s2, int allow_ragged,
		 float *ret_id1, float *ret_id2, float *ret_idtot)
{
  int     total;		/* total aligned positions              */
  int     len1, len2;           /* lengths of s1, s2                    */
  int     idents;		/* total identical positions            */
  int     counting;		/* TRUE if we're past left edge         */
  int     r1, r2, rtot;		/* keeps track of right side overcounts */

  total = len1 = len2 = idents = 0;
  r1 = r2 = rtot = -1;
  if (allow_ragged) counting = FALSE; else counting = TRUE;
  while (*s1 != '\0' || *s2 != '\0')
    {
		/* both gapped? Weird, but not impossible */
      if (isgap(*s1) && isgap(*s2)) continue;

      if (*s1 == *s2) { /* match? */
	idents++; 
	counting = TRUE;	/* start left edge, if we haven't already */
	r1 = r2 = rtot = -1;	/* reset right counts */
      }
      if (counting)
	{
	  if (*s1 != '\0' && !isgap(*s1)) { len1++; r1++; }
	  if (*s2 != '\0' && !isgap(*s2)) { len2++; r2++; }
	  total++; 
	  rtot++; 
	}
      if (*s1 != '\0') s1++;
      if (*s2 != '\0') s2++;
    }

  if (allow_ragged)		/* correct for right ragged edge */
    {
      len1 -= r1;
      len2 -= r2;
      total -= rtot;
    }

  if (ret_id1   != NULL) *ret_id1   = (len1 > 0) ? (float) idents / (float) len1 : 0.0;
  if (ret_id2   != NULL) *ret_id2   = (len2 > 0) ? (float) idents / (float) len2 : 0.0;
  if (ret_idtot != NULL) *ret_idtot = (total> 0) ? (float) idents / (float) total: 0.0;
}