IMP: Expand alphabet for ProteinFASTAFormat (#306)

qiime2 · Dec 27, 2023
commit 1827eab (1 parent: 8a0e191)

Showing 9 changed files with 177 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -65,3 +65,4 @@ target/
 .*.swp
 
 .DS_Store
+.vscode
diff --git a/q2_types/feature_data/__init__.py b/q2_types/feature_data/__init__.py
@@ -16,7 +16,10 @@
     AlignedDNASequencesDirectoryFormat, DifferentialFormat,
     DifferentialDirectoryFormat, FASTAFormat, AlignedFASTAFormatMixin,
     AlignedProteinSequencesDirectoryFormat, ProteinSequencesDirectoryFormat,
-    ProteinFASTAFormat, AlignedProteinFASTAFormat, RNASequencesDirectoryFormat,
+    MixedCaseProteinSequencesDirectoryFormat,
+    MixedCaseAlignedProteinSequencesDirectoryFormat,
+    ProteinFASTAFormat, AlignedProteinFASTAFormat, MixedCaseProteinFASTAFormat,
+    MixedCaseAlignedProteinFASTAFormat, RNASequencesDirectoryFormat,
     RNAFASTAFormat, AlignedRNAFASTAFormat, AlignedRNASequencesDirectoryFormat,
     PairedRNASequencesDirectoryFormat, BLAST6Format, BLAST6DirectoryFormat,
     MixedCaseDNAFASTAFormat, MixedCaseDNASequencesDirectoryFormat,
@@ -59,6 +62,11 @@
     'MixedCaseAlignedDNAFASTAFormat',
     'MixedCaseAlignedDNASequencesDirectoryFormat',
     'MixedCaseAlignedRNAFASTAFormat',
-    'MixedCaseAlignedRNASequencesDirectoryFormat']
+    'MixedCaseAlignedRNASequencesDirectoryFormat',
+    'MixedCaseProteinFASTAFormat',
+    'MixedCaseAlignedProteinFASTAFormat',
+    'MixedCaseProteinSequencesDirectoryFormat',
+    'MixedCaseAlignedProteinSequencesDirectoryFormat',
+    ]
 
 importlib.import_module('q2_types.feature_data._transformer')
diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py
@@ -404,7 +404,7 @@ def validate(self, *args):
 class ProteinFASTAFormat(FASTAFormat):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.alphabet = "ABCDEFGHIKLMNPQRSTVWXYZ*"
+        self.alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ*"
 
 
 ProteinSequencesDirectoryFormat = model.SingleFileDirectoryFormat(
@@ -413,6 +413,19 @@ def __init__(self, *args, **kwargs):
     ProteinFASTAFormat)
 
 
+class MixedCaseProteinFASTAFormat(ProteinFASTAFormat):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        lower_case = "abcdefghijklmnopqrstuvwxyz"
+        self.alphabet = self.alphabet + lower_case
+
+
+MixedCaseProteinSequencesDirectoryFormat = model.SingleFileDirectoryFormat(
+    'MixedCaseProteinSequencesDirectoryFormat',
+    'protein-sequences.fasta',
+    MixedCaseProteinFASTAFormat)
+
+
 class AlignedProteinFASTAFormat(AlignedFASTAFormatMixin, ProteinFASTAFormat):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -425,6 +438,23 @@ def __init__(self, *args, **kwargs):
     AlignedProteinFASTAFormat)
 
 
+class MixedCaseAlignedProteinFASTAFormat(
+    AlignedFASTAFormatMixin, MixedCaseProteinFASTAFormat
+):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        super()._turn_into_alignment()
+
+
+MixedCaseAlignedProteinSequencesDirectoryFormat = (
+    model.SingleFileDirectoryFormat(
+        'MixedCaseAlignedProteinSequencesDirectoryFormat',
+        'aligned-protein-sequences.fasta',
+        MixedCaseAlignedProteinFASTAFormat
+    )
+)
+
+
 class BLAST6Format(model.TextFileFormat):
     def validate(self, *args):
         try:
@@ -447,8 +477,11 @@ def validate(self, *args):
     DNASequencesDirectoryFormat, PairedDNASequencesDirectoryFormat,
     AlignedDNAFASTAFormat, AlignedDNASequencesDirectoryFormat,
     DifferentialFormat, DifferentialDirectoryFormat, ProteinFASTAFormat,
-    AlignedProteinFASTAFormat, ProteinSequencesDirectoryFormat,
-    AlignedProteinSequencesDirectoryFormat, RNAFASTAFormat,
+    AlignedProteinFASTAFormat, MixedCaseProteinFASTAFormat,
+    MixedCaseAlignedProteinFASTAFormat, ProteinSequencesDirectoryFormat,
+    AlignedProteinSequencesDirectoryFormat,
+    MixedCaseProteinSequencesDirectoryFormat,
+    MixedCaseAlignedProteinSequencesDirectoryFormat, RNAFASTAFormat,
     RNASequencesDirectoryFormat, AlignedRNAFASTAFormat,
     AlignedRNASequencesDirectoryFormat, PairedRNASequencesDirectoryFormat,
     BLAST6Format, BLAST6DirectoryFormat, MixedCaseDNAFASTAFormat,

diff --git a/q2_types/feature_data/tests/data/aligned-protein-sequences.fasta b/q2_types/feature_data/tests/data/aligned-protein-sequences.fasta
@@ -4,3 +4,6 @@ VASECEVKCMPTFQFFKKGQKVGEFSGAN
 >sequence2
 MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
 VASECEVKCMPTFQ-------VGEFSGAN
+>sequence3
+MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNUIFLEVDVDDCQD
+VASECEVKCMPTFO-------VGEFSGAN
diff --git a/q2_types/feature_data/tests/data/mixed-case-aligned-protein-sequences.fasta b/q2_types/feature_data/tests/data/mixed-case-aligned-protein-sequences.fasta
@@ -0,0 +1,9 @@
+>sequence1
+-------------------------VDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
+VASECEVKCMPTFQFFKKGQKVGEFSGAN
+>sequence2
+mvkqiesktafqealdaagdklvvvdfsatwcgpcckmikpffhslsekysnviflevdvddcq
+dvasecevkcmptfq-------vgefsgan
+>sequence3
+MVKQiEskTAFqJaLdAAGDkLVvvVDFSATWCGPCkMIKpfFhSlSEkYSNUiFLEVDVDDCQD
+VASecEVkCMPTFo-------VGEFSGAN
diff --git a/q2_types/feature_data/tests/data/mixed-case-protein-sequences.fasta b/q2_types/feature_data/tests/data/mixed-case-protein-sequences.fasta
@@ -0,0 +1,9 @@
+>sequence1
+MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEKHPDVVHAKVDTEAERELA
+AAAQIR
+>sequence2
+mvkqiesktafqealdaagdklvvvdfsatwcgpcckmikpffhslsekysnviflevdvddcq
+dvasecevkcmptfqffkkgqkvgefsgan*
+>sequence3
+TePdzNzWkRuZqYtWuYkSwUqFpUnHmDbGhFdZsPiYkCzHqXlCeByJrEoAuJvDlIrP
+eGpOgMeJzQqRhCfQxUpZlDwDgOxToQtCiQdD*
diff --git a/q2_types/feature_data/tests/data/protein-sequences.fasta b/q2_types/feature_data/tests/data/protein-sequences.fasta
@@ -4,3 +4,6 @@ AAAQIR
 >sequence2
 MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
 VASECEVKCMPTFQFFKKGQKVGEFSGAN*
+>sequence3
+TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZSPIYKCZHQXLCEBYJREOAUJVDLIRP
+EGPOGMEJZQQRHCFQXUPZLDWDGOXTOQTCIQDD*
diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py
@@ -18,6 +18,10 @@
     PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
     AlignedDNASequencesDirectoryFormat, DifferentialDirectoryFormat,
     ProteinFASTAFormat, AlignedProteinFASTAFormat, FASTAFormat,
+    MixedCaseProteinFASTAFormat,
+    MixedCaseAlignedProteinFASTAFormat,
+    MixedCaseAlignedProteinSequencesDirectoryFormat,
+    MixedCaseProteinSequencesDirectoryFormat,
     AlignedProteinSequencesDirectoryFormat, ProteinSequencesDirectoryFormat,
     RNAFASTAFormat, RNASequencesDirectoryFormat, AlignedRNAFASTAFormat,
     AlignedRNASequencesDirectoryFormat, BLAST6DirectoryFormat,
@@ -808,6 +812,69 @@ def test_aligned_protein_sequences_directory_format(self):
 
         format.validate()
 
+    def test_mixed_case_aligned_protein_fasta_format_validate_positive(self):
+        filepath = self.get_data_path(
+            'mixed-case-aligned-protein-sequences.fasta'
+            )
+        format = MixedCaseAlignedProteinFASTAFormat(filepath, mode='r')
+
+        format.validate()
+        format.validate('min')
+
+    def test_mixed_case_aligned_protein_fasta_format_unaligned(self):
+        filepath = self.get_data_path('mixed-case-protein-sequences.fasta')
+        format = MixedCaseAlignedProteinFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(
+                ValidationError, 'line 5 was length 95.* previous .* 70'):
+            format.validate()
+
+    def test_mixed_case_protein_fasta_format_validate_positive(self):
+        filepath = self.get_data_path('mixed-case-protein-sequences.fasta')
+        format = MixedCaseProteinFASTAFormat(filepath, mode='r')
+
+        format.validate()
+
+    def test_mixed_case_protein_fasta_format_invalid_characters(self):
+        filepath = self.get_data_path(
+            'mixed-case-aligned-protein-sequences.fasta'
+            )
+        format = MixedCaseProteinFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(
+                ValidationError,
+                "Invalid character '-' at position 0 on line 2"):
+            format.validate()
+
+    def test_mixed_case_aligned_protein_sequences_directory_format(self):
+        filepath = self.get_data_path(
+            'mixed-case-aligned-protein-sequences.fasta'
+            )
+        temp_dir = self.temp_dir.name
+        shutil.copy(filepath,
+                    os.path.join(
+                        temp_dir,
+                        'aligned-protein-sequences.fasta'
+                        )
+                    )
+        format = MixedCaseAlignedProteinSequencesDirectoryFormat(
+            temp_dir, mode='r'
+            )
+
+        format.validate()
+
+    def test_mixed_case_protein_sequences_directory_format(self):
+        filepath = self.get_data_path('mixed-case-protein-sequences.fasta')
+        temp_dir = self.temp_dir.name
+        shutil.copy(filepath,
+                    os.path.join(
+                        temp_dir, 'protein-sequences.fasta'
+                        )
+                    )
+        format = MixedCaseProteinSequencesDirectoryFormat(temp_dir, mode='r')
+
+        format.validate()
+
 
 class TestBLAST6Format(TestPluginBase):
     package = 'q2_types.feature_data.tests'

diff --git a/q2_types/feature_data/tests/test_transformer.py b/q2_types/feature_data/tests/test_transformer.py
@@ -1321,23 +1321,29 @@ def test_proteinfasta_format_to_series(self):
 
         obs = obs.astype(str)
 
-        index = pd.Index(['sequence1', 'sequence2'])
+        index = pd.Index(['sequence1', 'sequence2', 'sequence3'])
         exp = pd.Series(['MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEK'
                          'HPDVVHAKVDTEAERELAAAAQIR',
                          'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLS'
-                         'EKYSNVIFLEVDVDDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*'],
+                         'EKYSNVIFLEVDVDDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*',
+                         'TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZ'
+                         'SPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJ'
+                         'ZQQRHCFQXUPZLDWDGOXTOQTCIQDD*'],
                         index=index, dtype=object)
 
         assert_series_equal(exp, obs)
 
     def test_series_to_proteinfasta_format(self):
         transformer = self.get_transformer(pd.Series, ProteinFASTAFormat)
 
-        index = pd.Index(['sequence1', 'sequence2'])
+        index = pd.Index(['sequence1', 'sequence2', 'sequence3'])
         input = pd.Series(['MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEK'
                            'HPDVVHAKVDTEAERELAAAAQIR',
                            'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLS'
-                           'EKYSNVIFLEVDVDDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN'],
+                           'EKYSNVIFLEVDVDDCQDVASECEVKCMPTFQFFKKGQKVGEFSGAN*',
+                           'TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZ'
+                           'SPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJ'
+                           'ZQQRHCFQXUPZLDWDGOXTOQTCIQDD*'],
                           index=index, dtype=object)
 
         obs = transformer(input)
@@ -1354,12 +1360,17 @@ def test_proteinfasta_format_with_duplicate_ids_to_series(self):
     def test_proteinfasta_format_to_metadata(self):
         _, obs = self.transform_format(ProteinFASTAFormat, qiime2.Metadata,
                                        'protein-sequences.fasta')
-        index = pd.Index(['sequence1', 'sequence2'], name='Feature ID')
+        index = pd.Index(
+            ['sequence1', 'sequence2', 'sequence3'], name='Feature ID'
+        )
         exp_df = pd.DataFrame(['MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRA'
                                'FAPTFAESSEKHPDVVHAKVDTEAERELAAAAQIR',
                                'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
                                'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
-                               'VKCMPTFQFFKKGQKVGEFSGAN*'],
+                               'VKCMPTFQFFKKGQKVGEFSGAN*',
+                               'TEPDZNZWKRUZQYTWUYKSWUQFPUNHMDBGHFDZ'
+                               'SPIYKCZHQXLCEBYJREOAUJVDLIRPEGPOGMEJ'
+                               'ZQQRHCFQXUPZLDWDGOXTOQTCIQDD*'],
                               index=index, columns=['Sequence'], dtype=object)
         exp = qiime2.Metadata(exp_df)
 
@@ -1369,13 +1380,18 @@ def test_aligned_proteinfasta_format_to_metadata(self):
         _, obs = self.transform_format(AlignedProteinFASTAFormat,
                                        qiime2.Metadata,
                                        'aligned-protein-sequences.fasta')
-        index = pd.Index(['sequence1', 'sequence2'], name='Feature ID')
+        index = pd.Index(
+            ['sequence1', 'sequence2', 'sequence3'], name='Feature ID'
+        )
         exp_df = pd.DataFrame(['------------------------VDFSATWCGPC'
                                'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
                                'VKCMPTFQFFKKGQKVGEFSGAN',
                                'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
                                'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
-                               'VKCMPTFQ-------VGEFSGAN'],
+                               'VKCMPTFQ-------VGEFSGAN',
+                               'MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPC'
+                               'KMIKPFFHSLSEKYSNUIFLEVDVDDCQD'
+                               'VASECEVKCMPTFO-------VGEFSGAN'],
                               index=index, columns=['Sequence'], dtype=object)
         exp = qiime2.Metadata(exp_df)
 
@@ -1387,13 +1403,16 @@ def test_aligned_proteinfasta_format_to_series(self):
 
         obs = obs.astype(str)
 
-        index = pd.Index(['sequence1', 'sequence2'])
+        index = pd.Index(['sequence1', 'sequence2', 'sequence3'])
         exp = pd.Series(['------------------------VDFSATWCGPC'
                          'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
                          'VKCMPTFQFFKKGQKVGEFSGAN',
                          'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
                          'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
-                         'VKCMPTFQ-------VGEFSGAN'],
+                         'VKCMPTFQ-------VGEFSGAN',
+                         'MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPC'
+                         'KMIKPFFHSLSEKYSNUIFLEVDVDDCQD'
+                         'VASECEVKCMPTFO-------VGEFSGAN'],
                         index=index, dtype=object)
 
         assert_series_equal(exp, obs)
@@ -1402,13 +1421,16 @@ def test_series_to_aligned_proteinfasta_format(self):
         transformer = self.get_transformer(
             pd.Series, AlignedProteinFASTAFormat)
 
-        index = pd.Index(['sequence1', 'sequence2'])
+        index = pd.Index(['sequence1', 'sequence2', 'sequence3'])
         input = pd.Series(['------------------------VDFSATWCGPC'
                            'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
                            'VKCMPTFQFFKKGQKVGEFSGAN',
                            'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
                            'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
-                           'VKCMPTFQ-------VGEFSGAN'],
+                           'VKCMPTFQ-------VGEFSGAN',
+                           'MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPC'
+                           'KMIKPFFHSLSEKYSNUIFLEVDVDDCQD'
+                           'VASECEVKCMPTFO-------VGEFSGAN'],
                           index=index, dtype=object)
 
         obs = transformer(input)
@@ -1426,6 +1448,11 @@ def test_series_to_aligned_proteinfasta_format(self):
                          'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPC'
                          'KMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECE'
                          'VKCMPTFQ-------VGEFSGAN\n')
+        self.assertEqual(obs_lines[4], '>sequence3\n')
+        self.assertEqual(obs_lines[5],
+                         'MVKQIESKTAFQJALDAAGDKLVVVDFSATWCGPC'
+                         'KMIKPFFHSLSEKYSNUIFLEVDVDDCQD'
+                         'VASECEVKCMPTFO-------VGEFSGAN\n')
 
     def test_aligned_protein_fasta_format_to_protein_iterator(self):
         input, obs = self.transform_format(