Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor q2-types to avoid circular imports #342

Merged
merged 17 commits into from
Jul 31, 2024
22 changes: 0 additions & 22 deletions q2_types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import importlib

from ._version import get_versions

__version__ = get_versions()['version']
del get_versions

# Subpackages are imported eagerly, in this exact order: feature_data has to
# be loaded before feature_table, otherwise the two hit a circular import.
for _module_name in (
    'q2_types.feature_data',
    'q2_types.per_sample_sequences',
    'q2_types.feature_map',
    'q2_types.feature_table',
    'q2_types.distance_matrix',
    'q2_types.tree',
    'q2_types.ordination',
    'q2_types.sample_data',
    'q2_types.bowtie2',
    'q2_types.metadata',
    'q2_types.multiplexed_sequences',
    'q2_types.kraken2',
    'q2_types.feature_data_mag',
    'q2_types.genome_data',
    'q2_types.kaiju',
    'q2_types.reference_db',
    'q2_types.profile_hmms',
):
    importlib.import_module(_module_name)
# Keep the package namespace free of the loop variable.
del _module_name
159 changes: 23 additions & 136 deletions q2_types/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,153 +5,40 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import re
import gzip
import itertools

import skbio
import pandas as pd

import qiime2.plugin.model as model
from qiime2.plugin import ValidationError

# These classes and their helper functions are located in this module to avoid
# circular imports.


def _construct_validator_from_alphabet(alphabet_str):
if alphabet_str:
Validator = re.compile(fr'[{alphabet_str}]+\r?\n?')
ValidationSet = frozenset(alphabet_str)
else:
Validator, ValidationSet = None, None
return Validator, ValidationSet
def read_from_fasta(path, constructor=skbio.DNA, lowercase=False):
    """Read the FASTA file at ``path`` with ``skbio.read``.

    Parameters
    ----------
    path
        Location of the FASTA file, passed straight through to ``skbio.read``.
    constructor
        Sequence class handed to ``skbio.read``; defaults to ``skbio.DNA``.
    lowercase
        Forwarded unchanged to ``skbio.read``.

    Returns
    -------
    Whatever ``skbio.read`` returns for ``format='fasta'``.
    """
    reader_options = {
        'format': 'fasta',
        'constructor': constructor,
        'lowercase': lowercase,
    }
    return skbio.read(path, **reader_options)


class FASTAFormat(model.TextFileFormat):
    """Text file format that validates FASTA content.

    Two instance attributes tune the validation: ``alphabet`` (a string of
    allowed sequence characters, or ``None`` for no character check) and
    ``aligned`` (when true, all sequences must have the same length).
    """
    # NOTE(review): the block nesting below was reconstructed from a listing
    # that had lost its indentation -- confirm against the upstream file.

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Defaults: no equal-length requirement and no alphabet restriction.
        self.aligned = False
        self.alphabet = None

    def _validate_(self, level):
        """Validate the file at ``level`` using this instance's alphabet."""
        FASTAValidator, ValidationSet = _construct_validator_from_alphabet(
            self.alphabet)
        self._validate_FASTA(level, FASTAValidator, ValidationSet)

    def _validate_line_lengths(
            self, seq_len, prev_seq_len, prev_seq_start_line):
        """Raise ``ValidationError`` if ``prev_seq_len`` != ``seq_len``.

        ``seq_len`` is the reference length, ``prev_seq_len`` the length of
        the sequence just finished and ``prev_seq_start_line`` the line on
        which that sequence started (used only in the message).
        """
        if prev_seq_len != seq_len:
            raise ValidationError('The sequence starting on line '
                                  f'{prev_seq_start_line} was length '
                                  f'{prev_seq_len}. All previous sequences '
                                  f'were length {seq_len}. All sequences must '
                                  'be the same length for AlignedFASTAFormat.')

    def _validate_FASTA(self, level, FASTAValidator=None, ValidationSet=None):
        """Check the file line by line, raising ``ValidationError`` on error.

        Parameters
        ----------
        level : str
            ``'min'`` stops once line 100 is reached (lines before it are
            checked); ``'max'`` checks every line.
        FASTAValidator : compiled regex or None
            Pattern a whole sequence line must match.
        ValidationSet : frozenset or None
            Allowed sequence characters, used to pinpoint the offending
            character when ``FASTAValidator`` does not match.

        Sequence-line content is only checked when both ``FASTAValidator``
        and ``ValidationSet`` are truthy.
        """
        # True while the most recent line seen was a '>' description line.
        last_line_was_ID = False
        # Maps each ID seen so far to the line number it first appeared on.
        ids = {}

        # seq_len: reference length taken from the first completed sequence.
        # prev_seq_len / prev_seq_start_line: running length and first line
        # of the sequence currently being read.
        seq_len = 0
        prev_seq_len = 0
        prev_seq_start_line = 0

        level_map = {'min': 100, 'max': float('inf')}
        max_lines = level_map[level]

        with self.path.open('rb') as fh:
            try:
                # Peek at the first bytes, skipping a UTF-8 byte order mark.
                first = fh.read(6)
                if first[:3] == b'\xEF\xBB\xBF':
                    first = first[3:]

                # Empty files should validate
                if first.strip() == b'':
                    return

                if first[0] != ord(b'>'):
                    raise ValidationError("First line of file is not a valid "
                                          "description. Descriptions must "
                                          "start with '>'")
                # Rewind so the loop below sees the file from the start.
                fh.seek(0)

                for line_number, line in enumerate(fh, 1):
                    line = line.strip()
                    # Stop early at the line limit for this level; the line
                    # that reaches the limit is itself not checked.
                    if line_number >= max_lines:
                        return
                    line = line.decode('utf-8-sig')

                    if line.startswith('>'):
                        # A new description closes the previous sequence.
                        if FASTAValidator and ValidationSet:
                            # The first completed sequence sets the
                            # reference length.
                            if seq_len == 0:
                                seq_len = prev_seq_len

                            if self.aligned:
                                self._validate_line_lengths(
                                    seq_len, prev_seq_len, prev_seq_start_line)

                            prev_seq_len = 0
                            prev_seq_start_line = 0

                        if last_line_was_ID:
                            raise ValidationError('Multiple consecutive '
                                                  'descriptions starting on '
                                                  f'line {line_number-1!r}')

                        # From here on ``line`` is a list of
                        # whitespace-separated fields; the first one is
                        # '>' immediately followed by the ID.
                        line = line.split()

                        # A bare '>' field means the ID is either absent or
                        # separated from '>' by whitespace.
                        if line[0] == '>':
                            if len(line) == 1:
                                raise ValidationError(
                                    f'Description on line {line_number} is '
                                    'missing an ID.')
                            else:
                                raise ValidationError(
                                    f'ID on line {line_number} starts with a '
                                    'space. IDs may not start with spaces')

                        if line[0] in ids:
                            raise ValidationError(
                                f'ID on line {line_number} is a duplicate of '
                                f'another ID on line {ids[line[0]]}.')

                        ids[line[0]] = line_number
                        last_line_was_ID = True

                    elif FASTAValidator and ValidationSet:
                        if re.fullmatch(FASTAValidator, line):
                            # Remember where the current sequence began.
                            if prev_seq_start_line == 0:
                                prev_seq_start_line = line_number

                            prev_seq_len += len(line)
                            last_line_was_ID = False

                        else:
                            # The line did not match: report the first
                            # character that is outside the alphabet.
                            for position, character in enumerate(line):
                                if character not in ValidationSet:
                                    raise ValidationError(
                                        f"Invalid character '{character}' at "
                                        f"position {position} on line "
                                        f"{line_number} (does not match IUPAC "
                                        "characters for this sequence type). "
                                        "Allowed characters are "
                                        f"{self.alphabet}.")

                    else:
                        # No alphabet configured: sequence lines are accepted
                        # without any character check.
                        last_line_was_ID = False

            except UnicodeDecodeError as e:
                raise ValidationError(f'utf-8 cannot decode byte on line '
                                      f'{line_number}') from e

        # The last sequence has no following description line to trigger
        # the length check, so it is checked here.
        if self.aligned:
            self._validate_line_lengths(
                seq_len, prev_seq_len, prev_seq_start_line)


class DNAFASTAFormat(FASTAFormat):
    """FASTAFormat whose ``alphabet`` is fixed to the characters below."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Overrides the base-class default of None, so sequence lines are
        # checked against exactly these characters during validation.
        self.alphabet = "ACGTRYKMSWBDHVN"
def fasta_to_series(ff, constructor=skbio.DNA, lowercase=False):
    """Load a FASTA file into a ``pandas.Series`` keyed by sequence ID.

    Parameters
    ----------
    ff
        The FASTA file; ``str(ff)`` is used as the path to read.
    constructor
        Sequence class forwarded to ``read_from_fasta``.
    lowercase
        Forwarded unchanged to ``read_from_fasta``.

    Returns
    -------
    pandas.Series
        One entry per record: index is the record's ``metadata['id']``,
        value is the sequence object itself.

    Raises
    ------
    ValueError
        If the same ID is encountered more than once.
    """
    records = read_from_fasta(str(ff), constructor, lowercase=lowercase)
    sequences_by_id = {}
    for record in records:
        seq_id = record.metadata['id']
        # Format validation may already make this unreachable; it is kept as
        # a safeguard and may be revisited later. Related change:
        # https://github.com/qiime2/q2-types/pull/335
        if seq_id in sequences_by_id:
            raise ValueError("FASTA format sequence IDs must be unique. The "
                             "following ID was found more than once: %s."
                             % seq_id)
        sequences_by_id[seq_id] = record
    return pd.Series(sequences_by_id)


# These classes and their helper functions are located in this module to avoid
# circular imports.
class FastqGzFormat(model.BinaryFileFormat):
"""
A gzipped fastq file.
Expand Down
5 changes: 0 additions & 5 deletions q2_types/bowtie2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,4 @@
from ._formats import (Bowtie2IndexFileFormat, Bowtie2IndexDirFmt)
from ._types import Bowtie2Index

from ..plugin_setup import plugin, citations

plugin.register_views(Bowtie2IndexDirFmt,
citations=[citations['langmead2012fast']])

__all__ = ['Bowtie2IndexFileFormat', 'Bowtie2IndexDirFmt', 'Bowtie2Index']
26 changes: 26 additions & 0 deletions q2_types/bowtie2/_deferred_setup/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from qiime2.plugin import Citations

from .. import Bowtie2IndexDirFmt
from .. import Bowtie2Index

from ...plugin_setup import plugin

# Citations are loaded from the citations.bib file shipped inside the
# q2_types.bowtie2 package.
citations = Citations.load('citations.bib', package='q2_types.bowtie2')
# Register the index directory format as a view, attaching the
# 'langmead2012fast' citation entry to it.
plugin.register_views(Bowtie2IndexDirFmt,
                      citations=[citations['langmead2012fast']])

# Make the Bowtie2Index semantic type known to the plugin.
plugin.register_semantic_types(Bowtie2Index)

# Tie the semantic type to its directory format as an artifact class.
plugin.register_artifact_class(
    Bowtie2Index,
    directory_format=Bowtie2IndexDirFmt,
    description='An index of sequences for Bowtie 2 to search against.'
)
5 changes: 0 additions & 5 deletions q2_types/bowtie2/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,9 @@
# ----------------------------------------------------------------------------

from qiime2.plugin import SemanticType
from . import Bowtie2IndexDirFmt
from ..plugin_setup import plugin


# Technically there is a bit more to this, for instance the ref sequences may
# or may not be present in an index, or may be the only thing in an index,
# but let's not worry about that just yet.
Bowtie2Index = SemanticType('Bowtie2Index')

plugin.register_semantic_types(Bowtie2Index)
plugin.register_artifact_class(Bowtie2Index, Bowtie2IndexDirFmt)
10 changes: 10 additions & 0 deletions q2_types/bowtie2/citations.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
@article{langmead2012fast,
title={Fast gapped-read alignment with Bowtie 2},
author={Langmead, Ben and Salzberg, Steven L},
journal={Nature methods},
volume={9},
number={4},
pages={357},
year={2012},
publisher={Nature Publishing Group}
}
23 changes: 0 additions & 23 deletions q2_types/citations.bib
Original file line number Diff line number Diff line change
@@ -1,15 +1,3 @@
@article{mcdonald2012biological,
title={The Biological Observation Matrix (BIOM) format or: how I learned to stop worrying and love the ome-ome},
author={McDonald, Daniel and Clemente, Jose C and Kuczynski, Justin and Rideout, Jai Ram and Stombaugh, Jesse and Wendel, Doug and Wilke, Andreas and Huse, Susan and Hufnagle, John and Meyer, Folker and Knight, Rob and Caporaso, J Gregory},
journal={GigaScience},
volume={1},
number={1},
pages={7},
year={2012},
publisher={BioMed Central},
doi={10.1186/2047-217X-1-7}
}

@InProceedings{ mckinney-proc-scipy-2010,
author = { Wes McKinney },
title = { Data Structures for Statistical Computing in Python },
Expand All @@ -18,14 +6,3 @@ @InProceedings{ mckinney-proc-scipy-2010
year = { 2010 },
editor = { St{\'e}fan van der Walt and Jarrod Millman }
}

@article{langmead2012fast,
title={Fast gapped-read alignment with Bowtie 2},
author={Langmead, Ben and Salzberg, Steven L},
journal={Nature methods},
volume={9},
number={4},
pages={357},
year={2012},
publisher={Nature Publishing Group}
}
8 changes: 2 additions & 6 deletions q2_types/distance_matrix/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import importlib

from ._format import LSMatFormat, DistanceMatrixDirectoryFormat
from ._type import DistanceMatrix
from ._formats import LSMatFormat, DistanceMatrixDirectoryFormat
from ._types import DistanceMatrix

__all__ = ['LSMatFormat', 'DistanceMatrixDirectoryFormat', 'DistanceMatrix']

importlib.import_module('q2_types.distance_matrix._transformer')
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,21 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from qiime2.plugin import SemanticType
import importlib

from ..plugin_setup import plugin
from . import DistanceMatrixDirectoryFormat
from .. import LSMatFormat, DistanceMatrixDirectoryFormat
from .. import DistanceMatrix

from ...plugin_setup import plugin

DistanceMatrix = SemanticType('DistanceMatrix')
plugin.register_formats(LSMatFormat, DistanceMatrixDirectoryFormat)

plugin.register_semantic_types(DistanceMatrix)

plugin.register_artifact_class(
DistanceMatrix,
directory_format=DistanceMatrixDirectoryFormat,
description="A symmetric matrix representing distances between entities."
)

importlib.import_module('._transformers', __name__)
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

import skbio

from ..plugin_setup import plugin
from . import LSMatFormat
from .. import LSMatFormat

from ...plugin_setup import plugin


@plugin.register_transformer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import skbio.io
import qiime2.plugin.model as model

from ..plugin_setup import plugin


class LSMatFormat(model.TextFileFormat):
def sniff(self):
Expand All @@ -20,6 +18,3 @@ def sniff(self):

DistanceMatrixDirectoryFormat = model.SingleFileDirectoryFormat(
'DistanceMatrixDirectoryFormat', 'distance-matrix.tsv', LSMatFormat)


plugin.register_formats(LSMatFormat, DistanceMatrixDirectoryFormat)
12 changes: 12 additions & 0 deletions q2_types/distance_matrix/_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from qiime2.plugin import SemanticType


# Semantic type declaration only; this module performs no plugin
# registration of its own.
DistanceMatrix = SemanticType('DistanceMatrix')
Loading
Loading