Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: migrate q2-demux types/formats/transformers to q2-types for more general access #307

Merged
merged 15 commits into from
May 15, 2024
Merged
4 changes: 2 additions & 2 deletions q2_types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,17 @@

# feature_data needs to be imported before feature_table to avoid circular
# import.
importlib.import_module('q2_types.multiplexed_sequences')
importlib.import_module('q2_types.feature_data')
importlib.import_module('q2_types.per_sample_sequences')
importlib.import_module('q2_types.feature_map')
importlib.import_module('q2_types.feature_table')
importlib.import_module('q2_types.distance_matrix')
importlib.import_module('q2_types.tree')
importlib.import_module('q2_types.ordination')
importlib.import_module('q2_types.sample_data')
importlib.import_module('q2_types.per_sample_sequences')
importlib.import_module('q2_types.bowtie2')
importlib.import_module('q2_types.metadata')
importlib.import_module('q2_types.multiplexed_sequences')
importlib.import_module('q2_types.kraken2')
importlib.import_module('q2_types.feature_data_mag')
importlib.import_module('q2_types.genome_data')
Expand Down
208 changes: 208 additions & 0 deletions q2_types/_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import re
import gzip
import itertools

import qiime2.plugin.model as model
from qiime2.plugin import ValidationError

# These classes and their helper functions are located in this module to avoid
# circular imports.


def _construct_validator_from_alphabet(alphabet_str):
if alphabet_str:
Validator = re.compile(fr'[{alphabet_str}]+\r?\n?')
ValidationSet = frozenset(alphabet_str)
else:
Validator, ValidationSet = None, None
return Validator, ValidationSet


class FASTAFormat(model.TextFileFormat):
    """Text file format whose validation checks FASTA structure.

    Instances carry two settings that are read during validation:
    ``aligned`` and ``alphabet``. Both start out disabled here.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # When True, sequence lengths are compared with
        # _validate_line_lengths during validation.
        self.aligned = False
        # String of permitted sequence characters; None skips the
        # per-character checks.
        self.alphabet = None

    def _validate_(self, level):
        """Build the alphabet validator and run the FASTA checks at `level`."""
        FASTAValidator, ValidationSet = _construct_validator_from_alphabet(
            self.alphabet)
        self._validate_FASTA(level, FASTAValidator, ValidationSet)

    def _validate_line_lengths(
            self, seq_len, prev_seq_len, prev_seq_start_line):
        """Raise ValidationError if `prev_seq_len` differs from `seq_len`.

        `prev_seq_start_line` is only used to build the error message.
        """
        if prev_seq_len != seq_len:
            raise ValidationError('The sequence starting on line '
                                  f'{prev_seq_start_line} was length '
                                  f'{prev_seq_len}. All previous sequences '
                                  f'were length {seq_len}. All sequences must '
                                  'be the same length for AlignedFASTAFormat.')

    def _validate_FASTA(self, level, FASTAValidator=None, ValidationSet=None):
        """Check the file's FASTA structure.

        Parameters
        ----------
        level : str
            ``'min'`` or ``'max'`` (the keys of ``level_map``). ``'min'``
            stops once line 100 is reached; ``'max'`` reads the whole file.
        FASTAValidator : compiled pattern or None
            Matched with ``re.fullmatch`` against each stripped sequence
            line. Sequence content is only checked when both this and
            `ValidationSet` are truthy.
        ValidationSet : set-like or None
            Permitted characters, used to locate the offending character
            when a line fails `FASTAValidator`.

        Raises
        ------
        ValidationError
            If the file does not begin with ``'>'``, two description lines
            are consecutive, a description has no ID or has whitespace right
            after ``'>'``, an ID is repeated, a sequence line contains a
            character outside `ValidationSet`, a line is not valid UTF-8,
            or (when ``self.aligned``) a sequence length differs.
        """
        last_line_was_ID = False
        # Maps the first whitespace-delimited token of each description line
        # (including the leading '>') to the line number it appeared on.
        ids = {}

        # seq_len: reference length, latched from the first sequence.
        # prev_seq_len / prev_seq_start_line: running length and first line
        # of the sequence currently being read.
        seq_len = 0
        prev_seq_len = 0
        prev_seq_start_line = 0

        level_map = {'min': 100, 'max': float('inf')}
        max_lines = level_map[level]

        with self.path.open('rb') as fh:
            try:
                first = fh.read(6)
                # Skip a UTF-8 byte order mark before inspecting the start.
                if first[:3] == b'\xEF\xBB\xBF':
                    first = first[3:]

                # Empty files should validate
                if first.strip() == b'':
                    return

                if first[0] != ord(b'>'):
                    raise ValidationError("First line of file is not a valid "
                                          "description. Descriptions must "
                                          "start with '>'")
                fh.seek(0)

                for line_number, line in enumerate(fh, 1):
                    line = line.strip()
                    # Returning here ends validation entirely, so the final
                    # aligned-length check after the loop is not run.
                    if line_number >= max_lines:
                        return
                    line = line.decode('utf-8-sig')

                    if line.startswith('>'):
                        if FASTAValidator and ValidationSet:
                            # Still 0 at the first description line; takes
                            # the first sequence's length at the second.
                            if seq_len == 0:
                                seq_len = prev_seq_len

                            if self.aligned:
                                self._validate_line_lengths(
                                    seq_len, prev_seq_len, prev_seq_start_line)

                            # A new record starts: reset the running totals.
                            prev_seq_len = 0
                            prev_seq_start_line = 0

                        if last_line_was_ID:
                            raise ValidationError('Multiple consecutive '
                                                  'descriptions starting on '
                                                  f'line {line_number-1!r}')

                        line = line.split()

                        # A bare '>' token means whitespace (or nothing)
                        # followed the marker.
                        if line[0] == '>':
                            if len(line) == 1:
                                raise ValidationError(
                                    f'Description on line {line_number} is '
                                    'missing an ID.')
                            else:
                                raise ValidationError(
                                    f'ID on line {line_number} starts with a '
                                    'space. IDs may not start with spaces')

                        if line[0] in ids:
                            raise ValidationError(
                                f'ID on line {line_number} is a duplicate of '
                                f'another ID on line {ids[line[0]]}.')

                        ids[line[0]] = line_number
                        last_line_was_ID = True

                    elif FASTAValidator and ValidationSet:
                        if re.fullmatch(FASTAValidator, line):
                            if prev_seq_start_line == 0:
                                prev_seq_start_line = line_number

                            prev_seq_len += len(line)
                            last_line_was_ID = False

                        else:
                            # The pattern failed: report the first character
                            # that is outside the allowed set. A line with no
                            # such character (e.g. an empty line) raises
                            # nothing and leaves the state unchanged.
                            for position, character in enumerate(line):
                                if character not in ValidationSet:
                                    raise ValidationError(
                                        f"Invalid character '{character}' at "
                                        f"position {position} on line "
                                        f"{line_number} (does not match IUPAC "
                                        "characters for this sequence type). "
                                        "Allowed characters are "
                                        f"{self.alphabet}.")

                    else:
                        # No alphabet to check against: any non-description
                        # line is accepted as sequence content.
                        last_line_was_ID = False

            except UnicodeDecodeError as e:
                raise ValidationError(f'utf-8 cannot decode byte on line '
                                      f'{line_number}') from e

        # The last sequence has no following description line to trigger the
        # in-loop check, so compare its length here.
        if self.aligned:
            self._validate_line_lengths(
                seq_len, prev_seq_len, prev_seq_start_line)


class DNAFASTAFormat(FASTAFormat):
    """FASTAFormat whose sequence lines are limited to a DNA alphabet."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Uppercase characters accepted in sequence lines (A, C, G, T plus
        # the ambiguity codes listed here).
        self.alphabet = "ACGTRYKMSWBDHVN"


class FastqGzFormat(model.BinaryFileFormat):
    """
    A gzipped fastq file.

    Validation first checks for the gzip magic number and then checks that
    the leading records are well-formed groups of four lines (header,
    sequence, separator, quality).

    """

    def _check_n_records(self, n=None):
        """Validate FASTQ records read from the start of the file.

        Parameters
        ----------
        n : int or None
            Maximum number of records to check. ``None`` checks every
            record in the file.

        Raises
        ------
        ValidationError
            If a header does not start with ``'@'``, a sequence, separator
            or quality line is missing, the sequence is not uppercase, the
            separator does not start with ``'+'``, or the quality and
            sequence lengths differ.
        """
        with gzip.open(str(self), mode='rt', encoding='ascii') as fh:
            # Four references to the same file iterator: each tuple holds the
            # next four lines, padded with None if the file ends mid-record.
            zipper = itertools.zip_longest(*[fh] * 4)
            # Records are always numbered from 0 so the line numbers reported
            # below are correct whether or not a limit applies, and exactly
            # `n` records are examined. islice(..., None) imposes no limit.
            for i, record in itertools.islice(enumerate(zipper), n):
                header, seq, sep, qual = record

                if not header.startswith('@'):
                    raise ValidationError('Header on line %d is not FASTQ, '
                                          'records may be misaligned' %
                                          (i * 4 + 1))

                # A blank sequence line may end in '\n' or '\r\n'.
                if seq is None or not seq.strip():
                    raise ValidationError('Missing sequence for record '
                                          'beginning on line %d'
                                          % (i * 4 + 1))
                elif not seq.isupper():
                    raise ValidationError('Lowercase case sequence on line %d'
                                          % (i * 4 + 2))

                if sep is None:
                    raise ValidationError('Missing separator for record '
                                          'beginning on line %d'
                                          % (i * 4 + 1))
                elif not sep.startswith('+'):
                    raise ValidationError('Invalid separator on line %d'
                                          % (i * 4 + 3))

                if qual is None:
                    raise ValidationError('Missing quality for record '
                                          'beginning on line %d'
                                          % (i * 4 + 1))
                # Compare without line terminators: the final quality line of
                # a file may lack the trailing newline its sequence line has.
                elif len(qual.rstrip('\r\n')) != len(seq.rstrip('\r\n')):
                    raise ValidationError('Quality score length doesn\'t '
                                          'match sequence length for record '
                                          'beginning on line %d'
                                          % (i * 4 + 1))

    def _validate_(self, level):
        """Check the gzip magic number, then the records for `level`.

        ``'min'`` checks the first 5 records; ``'max'`` checks all of them.
        """
        with self.open() as fh:
            # Every gzip stream begins with the two bytes 0x1f 0x8b.
            if fh.peek(2)[:2] != b'\x1f\x8b':
                raise ValidationError('File is uncompressed')

        record_count_map = {'min': 5, 'max': None}
        self._check_n_records(record_count_map[level])
5 changes: 4 additions & 1 deletion q2_types/feature_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
from ._transformer import (
NucleicAcidIterator, DNAIterator, PairedDNAIterator, AlignedDNAIterator,
ProteinIterator, AlignedProteinIterator, RNAIterator, AlignedRNAIterator,
PairedRNAIterator)
PairedRNAIterator, BarcodePairedSequenceFastqIterator,
BarcodeSequenceFastqIterator)

__all__ = [
'TaxonomyFormat', 'TaxonomyDirectoryFormat', 'HeaderlessTSVTaxonomyFormat',
Expand All @@ -63,6 +64,8 @@
'MixedCaseAlignedDNASequencesDirectoryFormat',
'MixedCaseAlignedRNAFASTAFormat',
'MixedCaseAlignedRNASequencesDirectoryFormat',
'BarcodePairedSequenceFastqIterator',
'BarcodeSequenceFastqIterator',
'MixedCaseProteinFASTAFormat',
'MixedCaseAlignedProteinFASTAFormat',
'MixedCaseProteinSequencesDirectoryFormat',
Expand Down
119 changes: 1 addition & 118 deletions q2_types/feature_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from qiime2.plugin import ValidationError
import qiime2

from .._util import FASTAFormat, DNAFASTAFormat
from ..plugin_setup import plugin


Expand Down Expand Up @@ -144,118 +145,6 @@ def _validate_(self, level):
'TSVTaxonomyDirectoryFormat', 'taxonomy.tsv', TSVTaxonomyFormat)


class FASTAFormat(model.TextFileFormat):
    """Text file format whose validation checks FASTA structure.

    Instances carry two settings that are read during validation:
    ``aligned`` and ``alphabet``. Both start out disabled here.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # When True, sequence lengths are compared with
        # self._validate_line_lengths, which this class does not define.
        # NOTE(review): presumably supplied by a mixin -- confirm.
        self.aligned = False
        # String of permitted sequence characters; None skips the
        # per-character checks.
        self.alphabet = None

    def _validate_(self, level):
        """Build the alphabet validator and run the FASTA checks at `level`."""
        FASTAValidator, ValidationSet = _construct_validator_from_alphabet(
            self.alphabet)
        self._validate_FASTA(level, FASTAValidator, ValidationSet)

    def _validate_FASTA(self, level, FASTAValidator=None, ValidationSet=None):
        """Check the file's FASTA structure.

        Parameters
        ----------
        level : str
            ``'min'`` or ``'max'`` (the keys of ``level_map``). ``'min'``
            stops once line 100 is reached; ``'max'`` reads the whole file.
        FASTAValidator : compiled pattern or None
            Matched with ``re.fullmatch`` against each stripped sequence
            line. Sequence content is only checked when both this and
            `ValidationSet` are truthy.
        ValidationSet : set-like or None
            Permitted characters, used to locate the offending character
            when a line fails `FASTAValidator`.

        Raises
        ------
        ValidationError
            If the file does not begin with ``'>'``, two description lines
            are consecutive, a description has no ID or has whitespace right
            after ``'>'``, an ID is repeated, a sequence line contains a
            character outside `ValidationSet`, or a line is not valid UTF-8.
        """
        last_line_was_ID = False
        # Maps the first whitespace-delimited token of each description line
        # (including the leading '>') to the line number it appeared on.
        ids = {}

        # seq_len: reference length, latched from the first sequence.
        # prev_seq_len / prev_seq_start_line: running length and first line
        # of the sequence currently being read.
        seq_len = 0
        prev_seq_len = 0
        prev_seq_start_line = 0

        level_map = {'min': 100, 'max': float('inf')}
        max_lines = level_map[level]

        with self.path.open('rb') as fh:
            try:
                first = fh.read(6)
                # Skip a UTF-8 byte order mark before inspecting the start.
                if first[:3] == b'\xEF\xBB\xBF':
                    first = first[3:]

                # Empty files should validate
                if first.strip() == b'':
                    return

                if first[0] != ord(b'>'):
                    raise ValidationError("First line of file is not a valid "
                                          "description. Descriptions must "
                                          "start with '>'")
                fh.seek(0)

                for line_number, line in enumerate(fh, 1):
                    line = line.strip()
                    # Returning here ends validation entirely, so the final
                    # aligned-length check after the loop is not run.
                    if line_number >= max_lines:
                        return
                    line = line.decode('utf-8-sig')

                    if line.startswith('>'):
                        if FASTAValidator and ValidationSet:
                            # Still 0 at the first description line; takes
                            # the first sequence's length at the second.
                            if seq_len == 0:
                                seq_len = prev_seq_len

                            if self.aligned:
                                self._validate_line_lengths(
                                    seq_len, prev_seq_len, prev_seq_start_line)

                            # A new record starts: reset the running totals.
                            prev_seq_len = 0
                            prev_seq_start_line = 0

                        if last_line_was_ID:
                            raise ValidationError('Multiple consecutive '
                                                  'descriptions starting on '
                                                  f'line {line_number-1!r}')

                        line = line.split()

                        # A bare '>' token means whitespace (or nothing)
                        # followed the marker.
                        if line[0] == '>':
                            if len(line) == 1:
                                raise ValidationError(
                                    f'Description on line {line_number} is '
                                    'missing an ID.')
                            else:
                                raise ValidationError(
                                    f'ID on line {line_number} starts with a '
                                    'space. IDs may not start with spaces')

                        if line[0] in ids:
                            raise ValidationError(
                                f'ID on line {line_number} is a duplicate of '
                                f'another ID on line {ids[line[0]]}.')

                        ids[line[0]] = line_number
                        last_line_was_ID = True

                    elif FASTAValidator and ValidationSet:
                        if re.fullmatch(FASTAValidator, line):
                            if prev_seq_start_line == 0:
                                prev_seq_start_line = line_number

                            prev_seq_len += len(line)
                            last_line_was_ID = False

                        else:
                            # The pattern failed: report the first character
                            # that is outside the allowed set. If every
                            # character is allowed, nothing is raised.
                            for position, character in enumerate(line):
                                if character not in ValidationSet:
                                    raise ValidationError(
                                        f"Invalid character '{character}' at "
                                        f"position {position} on line "
                                        f"{line_number} (does not match IUPAC "
                                        "characters for this sequence type). "
                                        "Allowed characters are "
                                        f"{self.alphabet}.")

                    else:
                        # No alphabet to check against: any non-description
                        # line is accepted as sequence content.
                        last_line_was_ID = False

            except UnicodeDecodeError as e:
                raise ValidationError(f'utf-8 cannot decode byte on line '
                                      f'{line_number}') from e

        # The last sequence has no following description line to trigger the
        # in-loop check, so compare its length here.
        if self.aligned:
            self._validate_line_lengths(
                seq_len, prev_seq_len, prev_seq_start_line)


class AlignedFASTAFormatMixin:
def _turn_into_alignment(self):
self.aligned = True
Expand All @@ -271,12 +160,6 @@ def _validate_line_lengths(
'be the same length for AlignedFASTAFormat.')


class DNAFASTAFormat(FASTAFormat):
    """FASTAFormat whose sequence lines are limited to a DNA alphabet."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Uppercase characters accepted in sequence lines (A, C, G, T plus
        # the ambiguity codes listed here).
        self.alphabet = "ACGTRYKMSWBDHVN"


# Directory format built from DNAFASTAFormat and the file name
# 'dna-sequences.fasta'.
# NOTE(review): presumably a directory holding exactly that one file --
# confirm against the qiime2 SingleFileDirectoryFormat documentation.
DNASequencesDirectoryFormat = model.SingleFileDirectoryFormat(
    'DNASequencesDirectoryFormat', 'dna-sequences.fasta', DNAFASTAFormat)

Expand Down
Loading
Loading