Converting sequence into indices of characters (scikit-bio#1917)

* updated URL and doc build * Revert "updated URL and doc build" This reverts commit 69e15b2. * added _get_alphabet_index * added _make_alphabet_and_index * added sequence to indices * updated changelog * fixing linting
mataton · Feb 1, 2024 · 3071b7c · 3071b7c
1 parent 2708578
commit 3071b7c
Show file tree

Hide file tree

Showing 15 changed files with 881 additions and 39 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@
 
 ### Features
 
+* Added method `Sequence.to_indices` to convert a sequence into a vector of indices of characters in an alphabet (can be from a substitution matrix) or unique characters observed in the sequence. Supports gap masking and wildcard substitution ([#1917](https://github.com/scikit-bio/scikit-bio/pull/1917)).
 * Added class `SubstitutionMatrix` to support substitution matrices for nucleotides, amino acids and more general cases ([#1913](https://github.com/scikit-bio/scikit-bio/pull/1913)).
 * Added alpha diversity metric `sobs`, which is the observed species richness (S_{obs}) of a sample. `sobs` will replace `observed_otus`, which uses the historical term "OTU". Also added metric `observed_features` to be compatible with the QIIME 2 terminology. All three metrics are equivalent ([#1902](https://github.com/scikit-bio/scikit-bio/pull/1902)).
 * `beta_diversity` now supports use of a Pandas `DataFrame` index, issue [#1808](https://github.com/scikit-bio/scikit-bio/issues/1808).

diff --git a/skbio/metadata/_mixin.py b/skbio/metadata/_mixin.py
@@ -36,21 +36,20 @@ def metadata(self):
 
         Create a sequence with metadata:
 
-        >>> from pprint import pprint
         >>> from skbio import Sequence
         >>> seq = Sequence('ACGT', metadata={'description': 'seq description',
         ...                                  'id': 'seq-id'})
 
         Retrieve metadata:
 
-        >>> pprint(seq.metadata) # using pprint to display dict in sorted order
+        >>> print(seq.metadata)
         {'description': 'seq description', 'id': 'seq-id'}
 
         Update metadata:
 
         >>> seq.metadata['id'] = 'new-id'
         >>> seq.metadata['pubmed'] = 12345
-        >>> pprint(seq.metadata)
+        >>> print(seq.metadata)
         {'description': 'seq description', 'id': 'new-id', 'pubmed': 12345}
 
         Set metadata:

diff --git a/skbio/sequence/_alphabet.py b/skbio/sequence/_alphabet.py
@@ -0,0 +1,266 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2013--, scikit-bio development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import numpy as np
+
+
def _encode_alphabet(alphabet):
    """Encode an alphabet as a vector of ASCII code points.

    Parameters
    ----------
    alphabet : str, list, tuple or 1D np.ndarray
        Input alphabet. Must consist of single ASCII characters. Elements may
        be string or byte characters, or integers representing code points.

    Returns
    -------
    1D np.ndarray of np.uint8
        Vector of ASCII code points representing the alphabet. If the input
        is already a 1D array of np.uint8 within the ASCII range, it is
        returned as is (no copy).

    Raises
    ------
    TypeError
        If alphabet or its components are of a wrong data type, or if the
        alphabet is an array that is not one-dimensional.
    ValueError
        If some elements are not single characters.
    ValueError
        If some code points are beyond the ASCII range.
    UnicodeEncodeError
        If some characters are beyond the ASCII range.

    Notes
    -----
    ASCII has 128 code points (0 to 127) [1]_ (not to be confused with extended
    ASCII). Therefore, output values are within the range of [0, 127].

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/ASCII

    """
    errmsg = 'Alphabet is of an invalid data type.'

    # a plain string is encoded in one shot; its bytes are the code points
    if isinstance(alphabet, str):
        return np.frombuffer(alphabet.encode('ascii'), dtype=np.uint8)

    # anything else must be (convertible to) a 1D numpy array
    if isinstance(alphabet, (list, tuple)):
        alphabet = np.array(alphabet)
    elif not isinstance(alphabet, np.ndarray):
        raise TypeError(errmsg)
    if alphabet.ndim != 1:
        raise TypeError(errmsg)
    dtype = alphabet.dtype

    # integers represent ascii code points, which are within [0, 127]
    if np.issubdtype(dtype, np.integer):
        if not np.all((alphabet >= 0) & (alphabet <= 127)):
            raise ValueError('Not all code points are within the ASCII range.')
        # `dtype` is an np.dtype instance, so it must be compared by equality;
        # an identity test against the np.uint8 type would never succeed and
        # the array would always be copied.
        if dtype == np.uint8:
            return alphabet
        return alphabet.astype(np.uint8)

    # encode strings as ascii characters; bytes are already encoded
    if np.issubdtype(dtype, np.str_):
        alphabet = np.char.encode(alphabet, encoding='ascii')
    elif not np.issubdtype(dtype, np.bytes_):
        raise TypeError(errmsg)

    # must be single characters
    if not (np.char.str_len(alphabet) == 1).all():
        raise ValueError('Not all elements are single characters.')

    # Normalize the item size to one byte before reinterpreting the buffer:
    # a wider bytes dtype (e.g. 'S2' holding single characters) would
    # otherwise expose its padding bytes as extra code points.
    return alphabet.astype('S1', copy=False).view(np.uint8)
+
+
def _alphabet_to_hashes(alphabet):
    """Convert an alphabet into a hash table of ASCII code points to indices.

    Parameters
    ----------
    alphabet : str, list, tuple or 1D np.ndarray
        Input alphabet. Must consist of single ASCII characters. It is
        encoded by `_encode_alphabet`, which defines the accepted types.

    Returns
    -------
    np.ndarray of np.uint8 of shape (128,)
        Hash table of ASCII code points to indices.

    Raises
    ------
    TypeError, ValueError, UnicodeEncodeError
        Propagated from `_encode_alphabet` if the alphabet is of a wrong
        data type, contains elements that are not single characters, or
        contains characters or code points beyond the ASCII range.

    See Also
    --------
    _encode_alphabet
    _indices_in_alphabet_ascii

    Notes
    -----
    The resulting data structure enables efficient conversion of a sequence
    into indices of characters in an alphabet.

    The hash table has a constant size of 128, which is the total number of
    ASCII characters.

    Code points absent from the alphabet are filled with 255, which is beyond
    the range of ASCII characters, hence the maximum index in the alphabet.

    """
    codes = _encode_alphabet(alphabet)

    # start with every code point marked as absent (255)
    table = np.full(128, 255, dtype=np.uint8)

    # the i-th character of the alphabet maps to index i
    table[codes] = np.arange(codes.size)
    return table
+
+
def _indices_in_alphabet(seq, alphabet, wildcard=None):
    """Convert a sequence into indices of characters in an alphabet.

    Parameters
    ----------
    seq : iterable
        Input sequence.
    alphabet : dict or iterable
        Input alphabet. Can be a dictionary of characters to indices, or an
        iterable of other types from which the dictionary will be constructed.
    wildcard : hashable, optional
        Character to replace any characters that are absent from the alphabet.
        If omitted, will raise an error if the latter characters exist.

    Returns
    -------
    1D np.ndarray of int
        Vector of indices of characters in an alphabet. An empty sequence
        yields an empty integer vector.

    Raises
    ------
    ValueError
        If the wildcard character is not in the alphabet. This is checked
        whenever `wildcard` is set, even if no character needs replacing.
    ValueError
        If one or multiple characters in the sequence are absent from the
        alphabet, whereas `wildcard` is not set.

    See Also
    --------
    _indices_in_alphabet_ascii

    Notes
    -----
    This function is versatile to the type of characters.

    """
    if not isinstance(alphabet, dict):
        alphabet = {x: i for i, x in enumerate(alphabet)}

    # characters absent from the alphabet are looked up as None
    pos = list(map(alphabet.get, seq))

    if wildcard is not None:
        try:
            fill = alphabet[wildcard]
        except KeyError:
            # the KeyError is an implementation detail; do not chain it
            raise ValueError(f'Wildcard character "{wildcard}" is not in the '
                             'alphabet.') from None
        pos = [fill if x is None else x for x in pos]
    elif None in pos:
        raise ValueError('One or multiple characters in the sequence are '
                         'absent from the alphabet.')

    # np.array([]) defaults to float64, which cannot be used as indices;
    # force an integer dtype only in that case so that the dtype inferred
    # for non-empty input is unchanged.
    if not pos:
        return np.array(pos, dtype=int)
    return np.array(pos)
+
+
def _indices_in_alphabet_ascii(seq, alphabet, wildcard=None):
    """Convert a sequence into indices of characters in an ASCII alphabet.

    Parameters
    ----------
    seq : 1D np.ndarray of int
        Input sequence as ASCII code points.
    alphabet : np.ndarray of shape (128,) of int
        Input alphabet as a hash table of all ASCII code points to character
        indices, or 255 if absent from the alphabet.
    wildcard : int, optional
        Code point of character to replace any characters that are absent from
        the alphabet. If omitted, will raise an error if such characters exist.

    Returns
    -------
    1D np.ndarray of uint8
        Vector of indices of characters in an alphabet.

    Raises
    ------
    ValueError
        If the wildcard character is not in the alphabet. This is only
        checked when the sequence contains characters that are absent from
        the alphabet, i.e., when the wildcard is actually needed.
    ValueError
        If one or multiple characters in the sequence are absent from the
        alphabet, whereas `wildcard` is not set.

    See Also
    --------
    _indices_in_alphabet
    _alphabet_to_hashes

    Notes
    -----
    This function is optimized for single ASCII characters.

    """
    # look up all code points at once; absent characters come back as 255
    pos = alphabet[seq]
    absent = pos == 255
    if absent.any():
        if wildcard is None:
            raise ValueError('One or multiple characters in the sequence are '
                             'absent from the alphabet.')
        # Validate with an explicit test rather than `assert`: assertions are
        # removed when Python runs with -O, which would skip the check and
        # leave the wildcard index unbound.
        wild = alphabet[wildcard]
        if wild == 255:
            raise ValueError(f'Wildcard character "{chr(wildcard)}" is not in '
                             'the alphabet.')
        pos = np.where(absent, wild, pos)
    return pos
+
+
def _indices_in_observed(seqs):
    """Convert sequences into vectors of indices in observed characters.

    Parameters
    ----------
    seqs : iterable of iterable
        Input sequences.

    Returns
    -------
    list of 1D np.ndarray
        Vectors of indices representing the sequences.
    1D np.ndarray
        Sorted vector of unique characters observed in the sequences.

    """
    # Strategy: run np.unique on each sequence separately, then run it once
    # more on the (much shorter) concatenation of the per-sequence alphabets.
    # This avoids concatenating the full sequences themselves.

    def _unique_with_inverse(seq):
        # a str is split into characters; other iterables are used as is
        chars = tuple(seq) if isinstance(seq, str) else seq
        return np.unique(chars, return_inverse=True)

    local_alphabets, local_indices = zip(*map(_unique_with_inverse, seqs))

    # merged alphabet, plus the position of every local character within it
    merged_alphabet, merged_positions = np.unique(
        np.concatenate(local_alphabets), return_inverse=True)

    # cut the merged positions back into one lookup table per sequence
    sizes = [alpha.size for alpha in local_alphabets]
    cut_points = np.cumsum(sizes)[:-1]
    lookups = np.split(merged_positions, cut_points)

    # translate local indices into indices in the merged alphabet
    translated = []
    for lookup, local in zip(lookups, local_indices):
        translated.append(lookup[local])
    return translated, merged_alphabet
diff --git a/skbio/sequence/_dna.py b/skbio/sequence/_dna.py
@@ -185,6 +185,11 @@ def default_gap_char(cls):
     def gap_chars(cls):
         return set('-.')
 
+    @classproperty
+    @overrides(GrammaredSequence)
+    def wildcard_char(cls):
+        return 'N'
+
     @property
     def _motifs(self):
         return _motifs

diff --git a/skbio/sequence/_grammared_sequence.py b/skbio/sequence/_grammared_sequence.py
@@ -164,7 +164,7 @@ def _validation_mask(cls):
             as_bytes = ''.join(cls.alphabet).encode('ascii')
             cls.__validation_mask = np.invert(np.bincount(
                 np.frombuffer(as_bytes, dtype=np.uint8),
-                minlength=cls._number_of_extended_ascii_codes).astype(bool))
+                minlength=cls._num_extended_ascii_codes).astype(bool))
         return cls.__validation_mask
 
     @classproperty
@@ -292,6 +292,19 @@ def degenerate_map(cls):
         """
         raise NotImplementedError
 
+    @classproperty
+    @experimental(as_of='0.5.10')
+    def wildcard_char(cls):
+        """Return wildcard character.
+
+        Returns
+        -------
+        str of length 1
+            Wildcard character.
+
+        """
+        return None
+
     @property
     def _motifs(self):
         return _motifs
@@ -315,7 +328,7 @@ def _validate(self):
         # numbers and remove counts of valid numbers, so that we need only
         # see if the array is empty to determine validity.
         invalid_characters = np.bincount(
-            self._bytes, minlength=self._number_of_extended_ascii_codes
+            self._bytes, minlength=self._num_extended_ascii_codes
         ) * self._validation_mask
         if np.any(invalid_characters):
             bad = list(np.where(

diff --git a/skbio/sequence/_nucleotide_mixin.py b/skbio/sequence/_nucleotide_mixin.py
@@ -33,7 +33,7 @@ def _complement_lookup(cls):
         if cls.__complement_lookup is not None:
             return cls.__complement_lookup
 
-        lookup = np.zeros(cls._number_of_extended_ascii_codes, dtype=np.uint8)
+        lookup = np.zeros(cls._num_extended_ascii_codes, dtype=np.uint8)
         for key, value in cls.complement_map.items():
             lookup[ord(key)] = ord(value)
         cls.__complement_lookup = lookup
@@ -360,7 +360,7 @@ def gc_frequency(self, relative=False):
         """
 
         counts = np.bincount(self._bytes,
-                             minlength=self._number_of_extended_ascii_codes)
+                             minlength=self._num_extended_ascii_codes)
         gc = counts[self._gc_codes].sum()
         if relative:
             seq = self.degap()

diff --git a/skbio/sequence/_protein.py b/skbio/sequence/_protein.py
@@ -217,6 +217,11 @@ def gap_chars(cls):
     def default_gap_char(cls):
         return '-'
 
+    @classproperty
+    @overrides(GrammaredSequence)
+    def wildcard_char(cls):
+        return 'X'
+
     @property
     def _motifs(self):
         return _motifs

diff --git a/skbio/sequence/_rna.py b/skbio/sequence/_rna.py
@@ -157,6 +157,11 @@ def default_gap_char(cls):
     def gap_chars(cls):
         return set('-.')
 
+    @classproperty
+    @overrides(GrammaredSequence)
+    def wildcard_char(cls):
+        return 'N'
+
     @property
     def _motifs(self):
         return _motifs