qiime2 · ebolyen · Jul 31, 2024 · Jul 22, 2024 · Jul 22, 2024 · Jul 23, 2024
diff --git a/q2_types/__init__.py b/q2_types/__init__.py
@@ -6,29 +6,7 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-import importlib
-
 from ._version import get_versions
 
 __version__ = get_versions()['version']
 del get_versions
-
-# feature_data needs to be imported before feature_table to avoid circular
-# import.
-importlib.import_module('q2_types.feature_data')
-importlib.import_module('q2_types.per_sample_sequences')
-importlib.import_module('q2_types.feature_map')
-importlib.import_module('q2_types.feature_table')
-importlib.import_module('q2_types.distance_matrix')
-importlib.import_module('q2_types.tree')
-importlib.import_module('q2_types.ordination')
-importlib.import_module('q2_types.sample_data')
-importlib.import_module('q2_types.bowtie2')
-importlib.import_module('q2_types.metadata')
-importlib.import_module('q2_types.multiplexed_sequences')
-importlib.import_module('q2_types.kraken2')
-importlib.import_module('q2_types.feature_data_mag')
-importlib.import_module('q2_types.genome_data')
-importlib.import_module('q2_types.kaiju')
-importlib.import_module('q2_types.reference_db')
-importlib.import_module('q2_types.profile_hmms')
diff --git a/q2_types/_util.py b/q2_types/_util.py
@@ -5,153 +5,40 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-import re
 import gzip
 import itertools
 
+import skbio
+import pandas as pd
+
 import qiime2.plugin.model as model
 from qiime2.plugin import ValidationError
 
-# These classes and their helper functions are located in this module to avoid
-# circular imports.
-
 
-def _construct_validator_from_alphabet(alphabet_str):
-    if alphabet_str:
-        Validator = re.compile(fr'[{alphabet_str}]+\r?\n?')
-        ValidationSet = frozenset(alphabet_str)
-    else:
-        Validator, ValidationSet = None, None
-    return Validator, ValidationSet
+def read_from_fasta(path, constructor=skbio.DNA, lowercase=False):
+    """Read a FASTA file through ``skbio.read``.
+
+    Parameters
+    ----------
+    path
+        Location of the FASTA file, passed straight to ``skbio.read``.
+    constructor
+        Sequence class used for each record. Defaults to ``skbio.DNA``.
+    lowercase
+        Forwarded unchanged to ``skbio.read``; see the scikit-bio FASTA
+        reader documentation for its exact meaning.
+
+    Returns
+    -------
+    Whatever ``skbio.read`` returns for ``format='fasta'``. The caller in
+    this module iterates over it and reads ``metadata['id']`` from each
+    item.
+    """
+    return skbio.read(path, format='fasta', constructor=constructor,
+                      lowercase=lowercase)
 
 
-class FASTAFormat(model.TextFileFormat):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.aligned = False
-        self.alphabet = None
-
-    def _validate_(self, level):
-        FASTAValidator, ValidationSet = _construct_validator_from_alphabet(
-            self.alphabet)
-        self._validate_FASTA(level, FASTAValidator, ValidationSet)
-
-    def _validate_line_lengths(
-            self, seq_len, prev_seq_len, prev_seq_start_line):
-        if prev_seq_len != seq_len:
-            raise ValidationError('The sequence starting on line '
-                                  f'{prev_seq_start_line} was length '
-                                  f'{prev_seq_len}. All previous sequences '
-                                  f'were length {seq_len}. All sequences must '
-                                  'be the same length for AlignedFASTAFormat.')
-
-    def _validate_FASTA(self, level, FASTAValidator=None, ValidationSet=None):
-        last_line_was_ID = False
-        ids = {}
-
-        seq_len = 0
-        prev_seq_len = 0
-        prev_seq_start_line = 0
-
-        level_map = {'min': 100, 'max': float('inf')}
-        max_lines = level_map[level]
-
-        with self.path.open('rb') as fh:
-            try:
-                first = fh.read(6)
-                if first[:3] == b'\xEF\xBB\xBF':
-                    first = first[3:]
-
-                # Empty files should validate
-                if first.strip() == b'':
-                    return
-
-                if first[0] != ord(b'>'):
-                    raise ValidationError("First line of file is not a valid "
-                                          "description. Descriptions must "
-                                          "start with '>'")
-                fh.seek(0)
-
-                for line_number, line in enumerate(fh, 1):
-                    line = line.strip()
-                    if line_number >= max_lines:
-                        return
-                    line = line.decode('utf-8-sig')
-
-                    if line.startswith('>'):
-                        if FASTAValidator and ValidationSet:
-                            if seq_len == 0:
-                                seq_len = prev_seq_len
-
-                            if self.aligned:
-                                self._validate_line_lengths(
-                                    seq_len, prev_seq_len, prev_seq_start_line)
-
-                            prev_seq_len = 0
-                            prev_seq_start_line = 0
-
-                        if last_line_was_ID:
-                            raise ValidationError('Multiple consecutive '
-                                                  'descriptions starting on '
-                                                  f'line {line_number-1!r}')
-
-                        line = line.split()
-
-                        if line[0] == '>':
-                            if len(line) == 1:
-                                raise ValidationError(
-                                    f'Description on line {line_number} is '
-                                    'missing an ID.')
-                            else:
-                                raise ValidationError(
-                                    f'ID on line {line_number} starts with a '
-                                    'space. IDs may not start with spaces')
-
-                        if line[0] in ids:
-                            raise ValidationError(
-                                f'ID on line {line_number} is a duplicate of '
-                                f'another ID on line {ids[line[0]]}.')
-
-                        ids[line[0]] = line_number
-                        last_line_was_ID = True
-
-                    elif FASTAValidator and ValidationSet:
-                        if re.fullmatch(FASTAValidator, line):
-                            if prev_seq_start_line == 0:
-                                prev_seq_start_line = line_number
-
-                            prev_seq_len += len(line)
-                            last_line_was_ID = False
-
-                        else:
-                            for position, character in enumerate(line):
-                                if character not in ValidationSet:
-                                    raise ValidationError(
-                                        f"Invalid character '{character}' at "
-                                        f"position {position} on line "
-                                        f"{line_number} (does not match IUPAC "
-                                        "characters for this sequence type). "
-                                        "Allowed characters are "
-                                        f"{self.alphabet}.")
-
-                    else:
-                        last_line_was_ID = False
-
-            except UnicodeDecodeError as e:
-                raise ValidationError(f'utf-8 cannot decode byte on line '
-                                      f'{line_number}') from e
-
-        if self.aligned:
-            self._validate_line_lengths(
-                seq_len, prev_seq_len, prev_seq_start_line)
-
-
-class DNAFASTAFormat(FASTAFormat):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.alphabet = "ACGTRYKMSWBDHVN"
+def fasta_to_series(ff, constructor=skbio.DNA, lowercase=False):
+    """Load the records of a FASTA file into a ``pandas.Series``.
+
+    Parameters
+    ----------
+    ff
+        FASTA source. It is converted with ``str()`` before being handed to
+        ``read_from_fasta``, so ``str(ff)`` must yield a readable path
+        (presumably a file format object or a path -- confirm with callers).
+    constructor
+        Sequence class forwarded to ``read_from_fasta``. Defaults to
+        ``skbio.DNA``.
+    lowercase
+        Forwarded unchanged to ``read_from_fasta``.
+
+    Returns
+    -------
+    pandas.Series
+        The sequence objects, indexed by the ``id`` entry of each record's
+        metadata, in the order they were read.
+
+    Raises
+    ------
+    ValueError
+        If the same sequence ID is encountered more than once.
+    """
+    data = {}
+    for sequence in read_from_fasta(str(ff), constructor,
+                                    lowercase=lowercase):
+        id_ = sequence.metadata['id']
+        # Duplicate IDs are probably already rejected by format validation,
+        # which would make this check redundant. It is kept as a safeguard
+        # and may be revisited later. Related discussion:
+        # https://github.com/qiime2/q2-types/pull/335
+        if id_ in data:
+            raise ValueError("FASTA format sequence IDs must be unique. The "
+                             "following ID was found more than once: %s."
+                             % id_)
+        data[id_] = sequence
+    return pd.Series(data)
 
 
+# These classes and their helper functions are located in this module to avoid
+# circular imports.
 class FastqGzFormat(model.BinaryFileFormat):
     """
     A gzipped fastq file.

diff --git a/q2_types/bowtie2/__init__.py b/q2_types/bowtie2/__init__.py
@@ -9,9 +9,4 @@
 from ._formats import (Bowtie2IndexFileFormat, Bowtie2IndexDirFmt)
 from ._types import Bowtie2Index
 
-from ..plugin_setup import plugin, citations
-
-plugin.register_views(Bowtie2IndexDirFmt,
-                      citations=[citations['langmead2012fast']])
-
 __all__ = ['Bowtie2IndexFileFormat', 'Bowtie2IndexDirFmt', 'Bowtie2Index']
diff --git a/q2_types/bowtie2/_deferred_setup/__init__.py b/q2_types/bowtie2/_deferred_setup/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from qiime2.plugin import Citations
+
+from .. import Bowtie2IndexDirFmt
+from .. import Bowtie2Index
+
+from ...plugin_setup import plugin
+
+# Citations come from the citations.bib that ships inside the
+# q2_types.bowtie2 subpackage.
+citations = Citations.load('citations.bib', package='q2_types.bowtie2')
+# Register the directory format as a view, crediting the Bowtie 2 paper.
+plugin.register_views(Bowtie2IndexDirFmt,
+                      citations=[citations['langmead2012fast']])
+
+plugin.register_semantic_types(Bowtie2Index)
+
+# Tie the semantic type to the directory format used to store it.
+plugin.register_artifact_class(
+    Bowtie2Index,
+    directory_format=Bowtie2IndexDirFmt,
+    description='An index of sequences for Bowtie 2 to search against.'
+)
diff --git a/q2_types/bowtie2/_types.py b/q2_types/bowtie2/_types.py
@@ -7,14 +7,9 @@
 # ----------------------------------------------------------------------------
 
 from qiime2.plugin import SemanticType
-from . import Bowtie2IndexDirFmt
-from ..plugin_setup import plugin
 
 
 # Technically there is a bit more to this, for instance the ref sequences may
 # or may not be present in an index, or may be the only thing in an index,
 # but let's not worry about that just yet.
 Bowtie2Index = SemanticType('Bowtie2Index')
-
-plugin.register_semantic_types(Bowtie2Index)
-plugin.register_artifact_class(Bowtie2Index, Bowtie2IndexDirFmt)
diff --git a/q2_types/bowtie2/citations.bib b/q2_types/bowtie2/citations.bib
@@ -0,0 +1,10 @@
+@article{langmead2012fast,
+  title={Fast gapped-read alignment with Bowtie 2},
+  author={Langmead, Ben and Salzberg, Steven L},
+  journal={Nature methods},
+  volume={9},
+  number={4},
+  pages={357},
+  year={2012},
+  publisher={Nature Publishing Group}
+}
diff --git a/q2_types/citations.bib b/q2_types/citations.bib
@@ -1,15 +1,3 @@
-@article{mcdonald2012biological,
-  title={The Biological Observation Matrix (BIOM) format or: how I learned to stop worrying and love the ome-ome},
-  author={McDonald, Daniel and Clemente, Jose C and Kuczynski, Justin and Rideout, Jai Ram and Stombaugh, Jesse and Wendel, Doug and Wilke, Andreas and Huse, Susan and Hufnagle, John and Meyer, Folker and Knight, Rob and Caporaso, J Gregory},
-  journal={GigaScience},
-  volume={1},
-  number={1},
-  pages={7},
-  year={2012},
-  publisher={BioMed Central},
-  doi={10.1186/2047-217X-1-7}
-}
-
 @InProceedings{ mckinney-proc-scipy-2010,
   author    = { Wes McKinney },
   title     = { Data Structures for Statistical Computing in Python },
@@ -18,14 +6,3 @@ @InProceedings{ mckinney-proc-scipy-2010
   year      = { 2010 },
   editor    = { St{\'e}fan van der Walt and Jarrod Millman }
 }
-
-@article{langmead2012fast,
-  title={Fast gapped-read alignment with Bowtie 2},
-  author={Langmead, Ben and Salzberg, Steven L},
-  journal={Nature methods},
-  volume={9},
-  number={4},
-  pages={357},
-  year={2012},
-  publisher={Nature Publishing Group}
-}
diff --git a/q2_types/distance_matrix/__init__.py b/q2_types/distance_matrix/__init__.py
@@ -6,11 +6,7 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-import importlib
-
-from ._format import LSMatFormat, DistanceMatrixDirectoryFormat
-from ._type import DistanceMatrix
+from ._formats import LSMatFormat, DistanceMatrixDirectoryFormat
+from ._types import DistanceMatrix
 
 __all__ = ['LSMatFormat', 'DistanceMatrixDirectoryFormat', 'DistanceMatrix']
-
-importlib.import_module('q2_types.distance_matrix._transformer')
diff --git a/q2_types/distance_matrix/_type.py → ...stance_matrix/_deferred_setup/__init__.py b/q2_types/distance_matrix/_type.py → ...stance_matrix/_deferred_setup/__init__.py
@@ -6,17 +6,21 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from qiime2.plugin import SemanticType
+import importlib
 
-from ..plugin_setup import plugin
-from . import DistanceMatrixDirectoryFormat
+from .. import LSMatFormat, DistanceMatrixDirectoryFormat
+from .. import DistanceMatrix
 
+from ...plugin_setup import plugin
 
-DistanceMatrix = SemanticType('DistanceMatrix')
+plugin.register_formats(LSMatFormat, DistanceMatrixDirectoryFormat)
 
 plugin.register_semantic_types(DistanceMatrix)
+
 plugin.register_artifact_class(
     DistanceMatrix,
     directory_format=DistanceMatrixDirectoryFormat,
     description="A symmetric matrix representing distances between entities."
 )
+
+# Imported only for its side effect: the functions in _transformers are
+# decorated with @plugin.register_transformer, so importing the module
+# registers them. The leading dot makes the name relative to this package,
+# which is what passing __name__ as the second argument provides.
+importlib.import_module('._transformers', __name__)
diff --git a/q2_types/distance_matrix/_transformer.py → ...e_matrix/_deferred_setup/_transformers.py b/q2_types/distance_matrix/_transformer.py → ...e_matrix/_deferred_setup/_transformers.py
@@ -8,8 +8,9 @@
 
 import skbio
 
-from ..plugin_setup import plugin
-from . import LSMatFormat
+from .. import LSMatFormat
+
+from ...plugin_setup import plugin
 
 
 @plugin.register_transformer

diff --git a/q2_types/distance_matrix/_format.py → q2_types/distance_matrix/_formats.py b/q2_types/distance_matrix/_format.py → q2_types/distance_matrix/_formats.py
@@ -9,8 +9,6 @@
 import skbio.io
 import qiime2.plugin.model as model
 
-from ..plugin_setup import plugin
-
 
 class LSMatFormat(model.TextFileFormat):
     def sniff(self):
@@ -20,6 +18,3 @@ def sniff(self):
 
 DistanceMatrixDirectoryFormat = model.SingleFileDirectoryFormat(
     'DistanceMatrixDirectoryFormat', 'distance-matrix.tsv', LSMatFormat)
-
-
-plugin.register_formats(LSMatFormat, DistanceMatrixDirectoryFormat)
diff --git a/q2_types/distance_matrix/_types.py b/q2_types/distance_matrix/_types.py
@@ -0,0 +1,12 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from qiime2.plugin import SemanticType
+
+
+DistanceMatrix = SemanticType('DistanceMatrix')
diff --git a/...ypes/distance_matrix/tests/test_format.py → ...pes/distance_matrix/tests/test_formats.py b/...ypes/distance_matrix/tests/test_format.py → ...pes/distance_matrix/tests/test_formats.py
diff --git a/...distance_matrix/tests/test_transformer.py → ...istance_matrix/tests/test_transformers.py b/...distance_matrix/tests/test_transformer.py → ...istance_matrix/tests/test_transformers.py
diff --git a/q2_types/distance_matrix/tests/test_type.py → q2_types/distance_matrix/tests/test_types.py b/q2_types/distance_matrix/tests/test_type.py → q2_types/distance_matrix/tests/test_types.py