Skip to content

Commit

Permalink
IMP: Expand alphabet for ProteinFASTAFormat (#306)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sann5 authored Dec 27, 2023
1 parent 8a0e191 commit 1827eab
Show file tree
Hide file tree
Showing 9 changed files with 177 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,4 @@ target/
.*.swp

.DS_Store
.vscode
12 changes: 10 additions & 2 deletions q2_types/feature_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
AlignedDNASequencesDirectoryFormat, DifferentialFormat,
DifferentialDirectoryFormat, FASTAFormat, AlignedFASTAFormatMixin,
AlignedProteinSequencesDirectoryFormat, ProteinSequencesDirectoryFormat,
ProteinFASTAFormat, AlignedProteinFASTAFormat, RNASequencesDirectoryFormat,
MixedCaseProteinSequencesDirectoryFormat,
MixedCaseAlignedProteinSequencesDirectoryFormat,
ProteinFASTAFormat, AlignedProteinFASTAFormat, MixedCaseProteinFASTAFormat,
MixedCaseAlignedProteinFASTAFormat, RNASequencesDirectoryFormat,
RNAFASTAFormat, AlignedRNAFASTAFormat, AlignedRNASequencesDirectoryFormat,
PairedRNASequencesDirectoryFormat, BLAST6Format, BLAST6DirectoryFormat,
MixedCaseDNAFASTAFormat, MixedCaseDNASequencesDirectoryFormat,
Expand Down Expand Up @@ -59,6 +62,11 @@
'MixedCaseAlignedDNAFASTAFormat',
'MixedCaseAlignedDNASequencesDirectoryFormat',
'MixedCaseAlignedRNAFASTAFormat',
'MixedCaseAlignedRNASequencesDirectoryFormat']
'MixedCaseAlignedRNASequencesDirectoryFormat',
'MixedCaseProteinFASTAFormat',
'MixedCaseAlignedProteinFASTAFormat',
'MixedCaseProteinSequencesDirectoryFormat',
'MixedCaseAlignedProteinSequencesDirectoryFormat',
]

importlib.import_module('q2_types.feature_data._transformer')
39 changes: 36 additions & 3 deletions q2_types/feature_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ def validate(self, *args):
class ProteinFASTAFormat(FASTAFormat):
# FASTA format for protein sequences; the constructor sets `self.alphabet`.
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Pre-change alphabet (old side of this diff): has no J, O or U.
self.alphabet = "ABCDEFGHIKLMNPQRSTVWXYZ*"
# Post-change alphabet: all 26 uppercase letters plus '*'.
self.alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ*"


ProteinSequencesDirectoryFormat = model.SingleFileDirectoryFormat(
Expand All @@ -413,6 +413,19 @@ def __init__(self, *args, **kwargs):
ProteinFASTAFormat)


class MixedCaseProteinFASTAFormat(ProteinFASTAFormat):
    """Protein FASTA format whose alphabet also includes lowercase letters.

    The alphabet inherited from ``ProteinFASTAFormat`` is kept as-is and
    the 26 lowercase ASCII letters are appended to it.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Append, rather than replace, so the parent's symbols stay valid.
        self.alphabet += "abcdefghijklmnopqrstuvwxyz"


# Directory format holding one file, 'protein-sequences.fasta', whose
# contents are handled by MixedCaseProteinFASTAFormat.
MixedCaseProteinSequencesDirectoryFormat = model.SingleFileDirectoryFormat(
'MixedCaseProteinSequencesDirectoryFormat',
'protein-sequences.fasta',
MixedCaseProteinFASTAFormat)


class AlignedProteinFASTAFormat(AlignedFASTAFormatMixin, ProteinFASTAFormat):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -425,6 +438,23 @@ def __init__(self, *args, **kwargs):
AlignedProteinFASTAFormat)


class MixedCaseAlignedProteinFASTAFormat(AlignedFASTAFormatMixin,
                                         MixedCaseProteinFASTAFormat):
    """Aligned counterpart of ``MixedCaseProteinFASTAFormat``.

    Combines the mixed-case protein alphabet with the alignment handling
    supplied by ``AlignedFASTAFormatMixin``.
    """

    def __init__(self, *args, **kwargs):
        # Build the mixed-case alphabet first ...
        super().__init__(*args, **kwargs)
        # ... then switch the instance to alignment mode via the mixin.
        # NOTE(review): the exact effect of _turn_into_alignment() is
        # defined outside this view -- confirm in AlignedFASTAFormatMixin.
        super()._turn_into_alignment()


# Directory format holding one file, 'aligned-protein-sequences.fasta',
# whose contents are handled by MixedCaseAlignedProteinFASTAFormat.
MixedCaseAlignedProteinSequencesDirectoryFormat = (
model.SingleFileDirectoryFormat(
'MixedCaseAlignedProteinSequencesDirectoryFormat',
'aligned-protein-sequences.fasta',
MixedCaseAlignedProteinFASTAFormat
)
)


class BLAST6Format(model.TextFileFormat):
def validate(self, *args):
try:
Expand All @@ -447,8 +477,11 @@ def validate(self, *args):
DNASequencesDirectoryFormat, PairedDNASequencesDirectoryFormat,
AlignedDNAFASTAFormat, AlignedDNASequencesDirectoryFormat,
DifferentialFormat, DifferentialDirectoryFormat, ProteinFASTAFormat,
AlignedProteinFASTAFormat, ProteinSequencesDirectoryFormat,
AlignedProteinSequencesDirectoryFormat, RNAFASTAFormat,
AlignedProteinFASTAFormat, MixedCaseProteinFASTAFormat,
MixedCaseAlignedProteinFASTAFormat, ProteinSequencesDirectoryFormat,
AlignedProteinSequencesDirectoryFormat,
MixedCaseProteinSequencesDirectoryFormat,
MixedCaseAlignedProteinSequencesDirectoryFormat, RNAFASTAFormat,
RNASequencesDirectoryFormat, AlignedRNAFASTAFormat,
AlignedRNASequencesDirectoryFormat, PairedRNASequencesDirectoryFormat,
BLAST6Format, BLAST6DirectoryFormat, MixedCaseDNAFASTAFormat,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ VASECEVKCMPTFQFFKKGQKVGEFSGAN
>sequence2
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQ-------VGEFSGAN
>sequence3
MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNUIFLEVDVDDCQD
VASECEVKCMPTFO-------VGEFSGAN
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>sequence1
-------------------------VDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQFFKKGQKVGEFSGAN
>sequence2
mvkqiesktafqealdaagdklvvvdfsatwcgpcckmikpffhslsekysnviflevdvddcq
dvasecevkcmptfq-------vgefsgan
>sequence3
MVKQiEskTAFqJaLdAAGDkLVvvVDFSATWCGPCkMIKpfFhSlSEkYSNUiFLEVDVDDCQD
VASecEVkCMPTFo-------VGEFSGAN
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>sequence1
MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEKHPDVVHAKVDTEAERELA
AAAQIR
>sequence2
mvkqiesktafqealdaagdklvvvdfsatwcgpcckmikpffhslsekysnviflevdvddcq
dvasecevkcmptfqffkkgqkvgefsgan*
>sequence3
TePdzNzWkRuZqYtWuYkSwUqFpUnHmDbGhFdZsPiYkCzHqXlCeByJrEoAuJvDlIrP
eGpOgMeJzQqRhCfQxUpZlDwDgOxToQtCiQdD*
3 changes: 3 additions & 0 deletions q2_types/feature_data/tests/data/protein-sequences.fasta
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ AAAQIR
>sequence2
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQFFKKGQKVGEFSGAN*
>sequence3
TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZSPIYKCZHQXLCEBYJREOAUJVDLIRP
EGPOGMEJZQQRHCFQXUPZLDWDGOXTOQTCIQDD*
67 changes: 67 additions & 0 deletions q2_types/feature_data/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
AlignedDNASequencesDirectoryFormat, DifferentialDirectoryFormat,
ProteinFASTAFormat, AlignedProteinFASTAFormat, FASTAFormat,
MixedCaseProteinFASTAFormat,
MixedCaseAlignedProteinFASTAFormat,
MixedCaseAlignedProteinSequencesDirectoryFormat,
MixedCaseProteinSequencesDirectoryFormat,
AlignedProteinSequencesDirectoryFormat, ProteinSequencesDirectoryFormat,
RNAFASTAFormat, RNASequencesDirectoryFormat, AlignedRNAFASTAFormat,
AlignedRNASequencesDirectoryFormat, BLAST6DirectoryFormat,
Expand Down Expand Up @@ -808,6 +812,69 @@ def test_aligned_protein_sequences_directory_format(self):

format.validate()

def test_mixed_case_aligned_protein_fasta_format_validate_positive(self):
    # A well-formed mixed-case alignment must pass both the default
    # validation and the 'min' validation level.
    fasta_fp = self.get_data_path(
        'mixed-case-aligned-protein-sequences.fasta')
    fmt = MixedCaseAlignedProteinFASTAFormat(fasta_fp, mode='r')

    fmt.validate()
    fmt.validate('min')

def test_mixed_case_aligned_protein_fasta_format_unaligned(self):
    # Feeding an unaligned file to the aligned format must fail: the
    # expected error reports a length mismatch (95 vs. 70) at line 5.
    fasta_fp = self.get_data_path('mixed-case-protein-sequences.fasta')
    fmt = MixedCaseAlignedProteinFASTAFormat(fasta_fp, mode='r')
    expected_msg = 'line 5 was length 95.* previous .* 70'

    with self.assertRaisesRegex(ValidationError, expected_msg):
        fmt.validate()

def test_mixed_case_protein_fasta_format_validate_positive(self):
    # The mixed-case (unaligned) protein fixture must validate cleanly.
    fasta_fp = self.get_data_path('mixed-case-protein-sequences.fasta')
    fmt = MixedCaseProteinFASTAFormat(fasta_fp, mode='r')

    fmt.validate()

def test_mixed_case_protein_fasta_format_invalid_characters(self):
    # The aligned fixture starts with gap characters, which the
    # unaligned format is expected to reject on the first sequence line.
    fasta_fp = self.get_data_path(
        'mixed-case-aligned-protein-sequences.fasta')
    fmt = MixedCaseProteinFASTAFormat(fasta_fp, mode='r')
    expected_msg = "Invalid character '-' at position 0 on line 2"

    with self.assertRaisesRegex(ValidationError, expected_msg):
        fmt.validate()

def test_mixed_case_aligned_protein_sequences_directory_format(self):
    # Stage the fixture in the temp dir under the filename the directory
    # format is registered with, then validate the directory as a whole.
    source_fp = self.get_data_path(
        'mixed-case-aligned-protein-sequences.fasta')
    staging_dir = self.temp_dir.name
    target_fp = os.path.join(staging_dir, 'aligned-protein-sequences.fasta')
    shutil.copy(source_fp, target_fp)

    dir_fmt = MixedCaseAlignedProteinSequencesDirectoryFormat(
        staging_dir, mode='r')
    dir_fmt.validate()

def test_mixed_case_protein_sequences_directory_format(self):
    # Stage the fixture in the temp dir under the filename the directory
    # format is registered with, then validate the directory as a whole.
    source_fp = self.get_data_path('mixed-case-protein-sequences.fasta')
    staging_dir = self.temp_dir.name
    target_fp = os.path.join(staging_dir, 'protein-sequences.fasta')
    shutil.copy(source_fp, target_fp)

    dir_fmt = MixedCaseProteinSequencesDirectoryFormat(
        staging_dir, mode='r')
    dir_fmt.validate()


class TestBLAST6Format(TestPluginBase):
package = 'q2_types.feature_data.tests'
Expand Down
51 changes: 39 additions & 12 deletions q2_types/feature_data/tests/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1321,23 +1321,29 @@ def test_proteinfasta_format_to_series(self):

obs = obs.astype(str)

index = pd.Index(['sequence1', 'sequence2'])
index = pd.Index(['sequence1', 'sequence2', 'sequence3'])
exp = pd.Series(['MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEK'
'HPDVVHAKVDTEAERELAAAAQIR',
'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLS'
'EKYSNVIFLEVDVDDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*'],
'EKYSNVIFLEVDVDDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*',
'TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZ'
'SPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJ'
'ZQQRHCFQXUPZLDWDGOXTOQTCIQDD*'],
index=index, dtype=object)

assert_series_equal(exp, obs)

def test_series_to_proteinfasta_format(self):
transformer = self.get_transformer(pd.Series, ProteinFASTAFormat)

index = pd.Index(['sequence1', 'sequence2'])
index = pd.Index(['sequence1', 'sequence2', 'sequence3'])
input = pd.Series(['MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEK'
'HPDVVHAKVDTEAERELAAAAQIR',
'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLS'
'EKYSNVIFLEVDVDDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN'],
'EKYSNVIFLEVDVDDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*',
'TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZ'
'SPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJ'
'ZQQRHCFQXUPZLDWDGOXTOQTCIQDD*'],
index=index, dtype=object)

obs = transformer(input)
Expand All @@ -1354,12 +1360,17 @@ def test_proteinfasta_format_with_duplicate_ids_to_series(self):
def test_proteinfasta_format_to_metadata(self):
_, obs = self.transform_format(ProteinFASTAFormat, qiime2.Metadata,
'protein-sequences.fasta')
index = pd.Index(['sequence1', 'sequence2'], name='Feature ID')
index = pd.Index(
['sequence1', 'sequence2', 'sequence3'], name='Feature ID'
)
exp_df = pd.DataFrame(['MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRA'
'FAPTFAESSEKHPDVVHAKVDTEAERELAAAAQIR',
'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
'VKCMPTFQFFKKGQKVGEFSGAN*'],
'VKCMPTFQFFKKGQKVGEFSGAN*',
'TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZ'
'SPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJ'
'ZQQRHCFQXUPZLDWDGOXTOQTCIQDD*'],
index=index, columns=['Sequence'], dtype=object)
exp = qiime2.Metadata(exp_df)

Expand All @@ -1369,13 +1380,18 @@ def test_aligned_proteinfasta_format_to_metadata(self):
_, obs = self.transform_format(AlignedProteinFASTAFormat,
qiime2.Metadata,
'aligned-protein-sequences.fasta')
index = pd.Index(['sequence1', 'sequence2'], name='Feature ID')
index = pd.Index(
['sequence1', 'sequence2', 'sequence3'], name='Feature ID'
)
exp_df = pd.DataFrame(['------------------------VDFSATWCGPC'
'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
'VKCMPTFQFFKKGQKVGEFSGAN',
'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
'VKCMPTFQ-------VGEFSGAN'],
'VKCMPTFQ-------VGEFSGAN',
'MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPC'
'KMIKPFFHSLSEKYSNUIFLEVDVDDCQD'
'VASECEVKCMPTFO-------VGEFSGAN'],
index=index, columns=['Sequence'], dtype=object)
exp = qiime2.Metadata(exp_df)

Expand All @@ -1387,13 +1403,16 @@ def test_aligned_proteinfasta_format_to_series(self):

obs = obs.astype(str)

index = pd.Index(['sequence1', 'sequence2'])
index = pd.Index(['sequence1', 'sequence2', 'sequence3'])
exp = pd.Series(['------------------------VDFSATWCGPC'
'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
'VKCMPTFQFFKKGQKVGEFSGAN',
'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
'VKCMPTFQ-------VGEFSGAN'],
'VKCMPTFQ-------VGEFSGAN',
'MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPC'
'KMIKPFFHSLSEKYSNUIFLEVDVDDCQD'
'VASECEVKCMPTFO-------VGEFSGAN'],
index=index, dtype=object)

assert_series_equal(exp, obs)
Expand All @@ -1402,13 +1421,16 @@ def test_series_to_aligned_proteinfasta_format(self):
transformer = self.get_transformer(
pd.Series, AlignedProteinFASTAFormat)

index = pd.Index(['sequence1', 'sequence2'])
index = pd.Index(['sequence1', 'sequence2', 'sequence3'])
input = pd.Series(['------------------------VDFSATWCGPC'
'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
'VKCMPTFQFFKKGQKVGEFSGAN',
'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
'VKCMPTFQ-------VGEFSGAN'],
'VKCMPTFQ-------VGEFSGAN',
'MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPC'
'KMIKPFFHSLSEKYSNUIFLEVDVDDCQD'
'VASECEVKCMPTFO-------VGEFSGAN'],
index=index, dtype=object)

obs = transformer(input)
Expand All @@ -1426,6 +1448,11 @@ def test_series_to_aligned_proteinfasta_format(self):
'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
'VKCMPTFQ-------VGEFSGAN\n')
self.assertEqual(obs_lines[4], '>sequence3\n')
self.assertEqual(obs_lines[5],
'MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPC'
'KMIKPFFHSLSEKYSNUIFLEVDVDDCQD'
'VASECEVKCMPTFO-------VGEFSGAN\n')

def test_aligned_protein_fasta_format_to_protein_iterator(self):
input, obs = self.transform_format(
Expand Down

0 comments on commit 1827eab

Please sign in to comment.