bokulich-lab · misialq · Jul 11, 2024 · Jun 28, 2024 · Jul 1, 2024 · Jul 1, 2024
diff --git a/q2_amr/amrfinderplus/types/__init__.py b/q2_amr/amrfinderplus/types/__init__.py
@@ -6,13 +6,19 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 from q2_amr.amrfinderplus.types._format import (
+    AMRFinderPlusAnnotationDirFmt,
+    AMRFinderPlusAnnotationFormat,
+    AMRFinderPlusAnnotationsDirFmt,
     AMRFinderPlusDatabaseDirFmt,
     BinaryFormat,
     TextFormat,
 )
 
+# Names listed as the public interface of this package.
 __all__ = [
     "AMRFinderPlusDatabaseDirFmt",
+    "AMRFinderPlusAnnotationFormat",
+    "AMRFinderPlusAnnotationsDirFmt",
+    "AMRFinderPlusAnnotationDirFmt",
     "TextFormat",
     "BinaryFormat",
 ]
diff --git a/q2_amr/amrfinderplus/types/_format.py b/q2_amr/amrfinderplus/types/_format.py
@@ -5,7 +5,10 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+import pandas as pd
 from q2_types.feature_data import MixedCaseDNAFASTAFormat, ProteinFASTAFormat
+from q2_types.per_sample_sequences._format import MultiDirValidationMixin
+from qiime2.core.exceptions import ValidationError
 from qiime2.plugin import model
 
 
@@ -57,3 +60,67 @@ def amr_dna_comp_path_maker(self, species, extension):
     @amr_dna_tab.set_path_maker
     def amr_dna_tab_path_maker(self, species):
+        # File name for one species: "AMR_DNA-<species>.tab".
         return "AMR_DNA-%s.tab" % species
+
+
+class AMRFinderPlusAnnotationFormat(model.TextFileFormat):
+    """Tab-separated annotation table that is validated by its header line.
+
+    The header must equal the expected column list either with or without the
+    four coordinate columns (Contig id, Start, Stop, Strand). A file with no
+    content at all is also accepted.
+    """
+
+    def _validate(self):
+        """Compare the header of the file with the two accepted layouts.
+
+        Raises
+        ------
+        ValidationError
+            If the header matches neither the layout with the coordinate
+            columns nor the layout without them.
+        """
+        coordinate_columns = ["Contig id", "Start", "Stop", "Strand"]
+        base_columns = [
+            "Protein identifier",
+            "Gene symbol",
+            "Sequence name",
+            "Scope",
+            "Element type",
+            "Element subtype",
+            "Class",
+            "Subclass",
+            "Method",
+            "Target length",
+            "Reference sequence length",
+            "% Coverage of reference sequence",
+            "% Identity to reference sequence",
+            "Alignment length",
+            "Accession of closest sequence",
+            "Name of closest sequence",
+            "HMM id",
+            "HMM description",
+            "Hierarchy node",
+        ]
+        # Coordinate columns, when present, sit right after the identifier.
+        full_columns = base_columns[:1] + coordinate_columns + base_columns[1:]
+
+        try:
+            # nrows=0: only the header line is needed.
+            observed = pd.read_csv(str(self), sep="\t", nrows=0).columns.tolist()
+        except pd.errors.EmptyDataError:
+            # Raised for a file without any content; such a file is accepted.
+            return
+
+        if observed in (base_columns, full_columns):
+            return
+
+        raise ValidationError(
+            "Header line does not match AMRFinderPlusAnnotationFormat. Must "
+            "consist of the following values: "
+            + ", ".join(full_columns)
+            + ".\n\nWhile Contig id, Start, Stop and Strand are optional."
+            + "\n\nFound instead: "
+            + ", ".join(observed)
+        )
+
+    def _validate_(self, level):
+        """Run the header check; ``level`` is accepted but not used."""
+        self._validate()
+
+
+class AMRFinderPlusAnnotationsDirFmt(MultiDirValidationMixin, model.DirectoryFormat):
+    """Directory format laid out as ``<sample_id>/[<mag_id>_]amr_annotations.tsv``."""
+
+    # The collection pattern covers paths ending in "amr_annotations.tsv" or
+    # "amr_mutations.tsv".
+    annotation = model.FileCollection(
+        r".*amr_(annotations|mutations)\.tsv$", format=AMRFinderPlusAnnotationFormat
+    )
+
+    @annotation.set_path_maker
+    def annotation_path_maker(self, sample_id, mag_id):
+        """Return the relative path of one annotation file.
+
+        Files are placed in a per-sample directory; a truthy ``mag_id`` is
+        added as a prefix of the file name.
+        """
+        file_name = "amr_annotations.tsv"
+        if mag_id:
+            file_name = f"{mag_id}_{file_name}"
+        return f"{sample_id}/{file_name}"
+
+
+# Single-file directory format: built from a format name, a file-name pattern
+# and the file format applied to the matching file.
+AMRFinderPlusAnnotationDirFmt = model.SingleFileDirectoryFormat(
+    "AMRFinderPlusAnnotationDirFmt",
+    r"amr_(annotations|mutations)\.tsv$",
+    AMRFinderPlusAnnotationFormat,
+)
diff --git a/q2_amr/amrfinderplus/types/_type.py b/q2_amr/amrfinderplus/types/_type.py
@@ -5,6 +5,14 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+from q2_types.feature_data import FeatureData
+from q2_types.sample_data import SampleData
 from qiime2.core.type import SemanticType
 
 AMRFinderPlusDatabase = SemanticType("AMRFinderPlusDatabase")
+# Declared as a variant of SampleData, i.e. usable as
+# SampleData[AMRFinderPlusAnnotations].
+AMRFinderPlusAnnotations = SemanticType(
+    "AMRFinderPlusAnnotations", variant_of=SampleData.field["type"]
+)
+# Declared as a variant of FeatureData, i.e. usable as
+# FeatureData[AMRFinderPlusAnnotation].
+AMRFinderPlusAnnotation = SemanticType(
+    "AMRFinderPlusAnnotation", variant_of=FeatureData.field["type"]
+)
diff --git a/...ests/data/annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv b/...ests/data/annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv
@@ -0,0 +1,3 @@
+Protein identifier	Gene symbol	Sequence name	Scope	Element type	Element subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference sequence	% Identity to reference sequence	Alignment length	Accession of closest sequence	Name of closest sequence	HMM id	HMM description	Hierarchy node
+aph3pp-Ib_partial_5p_neg	aph(3'')-Ib	aminoglycoside O-phosphotransferase APH(3'')-Ib	core	AMR	AMR	AMINOGLYCOSIDE	STREPTOMYCIN	PARTIALP	225	267	81.27	100.00	217	WP_001082319.1	aminoglycoside O-phosphotransferase APH(3'')-Ib	NF032896.1	APH(3'') family aminoglycoside O-phosphotransferase	aph(3'')-Ib
+blaOXA-436_partial	blaOXA	OXA-48 family class D beta-lactamase	core	AMR	AMR	BETA-LACTAM	BETA-LACTAM	PARTIALP	233	265	87.92	100.00	233	WP_058842180.1	OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436	NF012161.0	class D beta-lactamase	blaOXA-48_fam
diff --git a/...s/data/annotation/no_coordinates/aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv b/...s/data/annotation/no_coordinates/aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv
@@ -0,0 +1,3 @@
+Protein identifier	Gene symbol	Sequence name	Scope	Element type	Element subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference sequence	% Identity to reference sequence	Alignment length	Accession of closest sequence	Name of closest sequence	HMM id	HMM description	Hierarchy node
+aph3pp-Ib_partial_5p_neg	aph(3'')-Ib	aminoglycoside O-phosphotransferase APH(3'')-Ib	core	AMR	AMR	AMINOGLYCOSIDE	STREPTOMYCIN	PARTIALP	225	267	81.27	100.00	217	WP_001082319.1	aminoglycoside O-phosphotransferase APH(3'')-Ib	NF032896.1	APH(3'') family aminoglycoside O-phosphotransferase	aph(3'')-Ib
+blaOXA-436_partial	blaOXA	OXA-48 family class D beta-lactamase	core	AMR	AMR	BETA-LACTAM	BETA-LACTAM	PARTIALP	233	265	87.92	100.00	233	WP_058842180.1	OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436	NF012161.0	class D beta-lactamase	blaOXA-48_fam
diff --git a/q2_amr/amrfinderplus/types/tests/data/annotation_wrong/amr_annotation.tsv b/q2_amr/amrfinderplus/types/tests/data/annotation_wrong/amr_annotation.tsv
@@ -0,0 +1 @@
+Incorrect Header 1  Incorrect Header 2  Incorrect Header 3
diff --git a/q2_amr/amrfinderplus/types/tests/test_types_formats_transformers.py b/q2_amr/amrfinderplus/types/tests/test_types_formats_transformers.py
@@ -5,14 +5,101 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+import os
+import tempfile
+
+from qiime2.core.exceptions import ValidationError
 from qiime2.plugin.testing import TestPluginBase
 
-from q2_amr.amrfinderplus.types._format import AMRFinderPlusDatabaseDirFmt
+from q2_amr.amrfinderplus.types._format import (
+    AMRFinderPlusAnnotationDirFmt,
+    AMRFinderPlusAnnotationFormat,
+    AMRFinderPlusAnnotationsDirFmt,
+    AMRFinderPlusDatabaseDirFmt,
+)
 
 
-class TestAMRFinderPlusDatabaseTypesAndFormats(TestPluginBase):
+class TestAMRFinderPlusTypesAndFormats(TestPluginBase):
     package = "q2_amr.amrfinderplus.types.tests"
 
     def test_amrfinderplus_database_directory_format_validate_positive(self):
+        # Open the "database" fixture directory read-only and validate it.
         format = AMRFinderPlusDatabaseDirFmt(self.get_data_path("database"), mode="r")
         format.validate()
+
+    def test_amrfinderplus_annotation_format_validate_positive(self):
+        """The fixture under ``annotation/no_coordinates`` passes validation."""
+        tsv_name = "aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv"
+        tsv_path = self.get_data_path(f"annotation/no_coordinates/{tsv_name}")
+
+        AMRFinderPlusAnnotationFormat(tsv_path, mode="r").validate()
+
+    def test_amrfinderplus_annotation_format_validate_positive_coordinates(self):
+        """The fixture under ``annotation/coordinates`` passes validation.
+
+        NOTE(review): the header of the fixture added with this test lists no
+        Contig id / Start / Stop / Strand columns, so the layout with
+        coordinates may not actually be exercised here - confirm.
+        """
+        tsv_name = "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv"
+        tsv_path = self.get_data_path(f"annotation/coordinates/{tsv_name}")
+        AMRFinderPlusAnnotationFormat(tsv_path, mode="r").validate()
+
+    def test_amrfinderplus_annotation_format_validate_positive_empty(self):
+        """A file without any content passes validation."""
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            empty_tsv = os.path.join(scratch_dir, "amr_annotations.tsv")
+            # Create the file without writing anything to it.
+            with open(empty_tsv, "w"):
+                pass
+            AMRFinderPlusAnnotationFormat(empty_tsv, mode="r").validate()
+
+    def test_amrfinderplus_annotation_format_validation_error(self):
+        """A file with an unexpected header is rejected with a descriptive message."""
+        path = self.get_data_path("annotation_wrong/amr_annotation.tsv")
+        fmt = AMRFinderPlusAnnotationFormat(path, mode="r")
+
+        # Only the call that is expected to fail goes inside the context
+        # manager: statements placed after it in the ``with`` body never run,
+        # so the message checks below must live outside of it.
+        with self.assertRaises(ValidationError) as context:
+            fmt.validate()
+
+        header_coordinates = [
+            "Protein identifier",
+            "Contig id",
+            "Start",
+            "Stop",
+            "Strand",
+            "Gene symbol",
+            "Sequence name",
+            "Scope",
+            "Element type",
+            "Element subtype",
+            "Class",
+            "Subclass",
+            "Method",
+            "Target length",
+            "Reference sequence length",
+            "% Coverage of reference sequence",
+            "% Identity to reference sequence",
+            "Alignment length",
+            "Accession of closest sequence",
+            "Name of closest sequence",
+            "HMM id",
+            "HMM description",
+            "Hierarchy node",
+        ]
+        # Mirrors the text produced by AMRFinderPlusAnnotationFormat._validate
+        # up to (and including) the "Found instead" label.
+        expected_message_start = (
+            "Header line does not match AMRFinderPlusAnnotationFormat. Must "
+            "consist of the following values: "
+            + ", ".join(header_coordinates)
+            + ".\n\nWhile Contig id, Start, Stop and Strand are optional."
+            + "\n\nFound instead: "
+        )
+        message = str(context.exception)
+
+        self.assertIn(expected_message_start, message)
+        # The observed header is echoed back after the label; checking a single
+        # column name keeps the test independent of how the fixture line is
+        # split into columns.
+        self.assertIn("Incorrect Header 1", message)
+
+    def test_amrfinderplus_annotation_directory_format(self):
+        """The single-file directory format can be instantiated in read mode.
+
+        NOTE(review): only construction is checked, ``validate()`` is never
+        called. The fixture added alongside this test is a file named
+        ``<uuid>_amr_annotations.tsv``; confirm that a directory named
+        ``<uuid>`` exists under ``annotation/coordinates`` as well.
+        """
+        dirpath = self.get_data_path(
+            "annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d"
+        )
+        annotations = AMRFinderPlusAnnotationDirFmt(dirpath, mode="r")
+        # unittest assertion instead of a bare ``assert``: it is not stripped
+        # under ``python -O`` and reports the offending object on failure.
+        self.assertIsInstance(annotations, AMRFinderPlusAnnotationDirFmt)
+
+    def test_amrfinderplus_annotations_directory_format(self):
+        """The annotations directory format can be instantiated in read mode.
+
+        NOTE(review): only construction is checked, ``validate()`` is never
+        called - confirm whether that is intentional.
+        """
+        dirpath = self.get_data_path("annotation")
+        annotations = AMRFinderPlusAnnotationsDirFmt(dirpath, mode="r")
+        # unittest assertion instead of a bare ``assert``: it is not stripped
+        # under ``python -O`` and reports the offending object on failure.
+        self.assertIsInstance(annotations, AMRFinderPlusAnnotationsDirFmt)
diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
@@ -7,6 +7,7 @@
 # ----------------------------------------------------------------------------
 import importlib
 
+from q2_types.feature_data import FeatureData
 from q2_types.feature_table import FeatureTable, Frequency
 from q2_types.per_sample_sequences import (
     MAGs,
@@ -29,11 +30,18 @@
 
 from q2_amr import __version__
 from q2_amr.amrfinderplus.types._format import (
+    AMRFinderPlusAnnotationDirFmt,
+    AMRFinderPlusAnnotationFormat,
+    AMRFinderPlusAnnotationsDirFmt,
     AMRFinderPlusDatabaseDirFmt,
     BinaryFormat,
     TextFormat,
 )
-from q2_amr.amrfinderplus.types._type import AMRFinderPlusDatabase
+from q2_amr.amrfinderplus.types._type import (
+    AMRFinderPlusAnnotation,
+    AMRFinderPlusAnnotations,
+    AMRFinderPlusDatabase,
+)
 from q2_amr.card.database import fetch_card_db
 from q2_amr.card.heatmap import heatmap
 from q2_amr.card.kmer import (
@@ -1084,6 +1092,8 @@
     CARDReadsAlleleKmerAnalysis,
     CARDMAGsKmerAnalysis,
     AMRFinderPlusDatabase,
+    AMRFinderPlusAnnotations,
+    AMRFinderPlusAnnotation,
 )
 
 plugin.register_semantic_type_to_format(
@@ -1118,6 +1128,15 @@
     AMRFinderPlusDatabase,
     artifact_format=AMRFinderPlusDatabaseDirFmt,
 )
+
+# SampleData[AMRFinderPlusAnnotations] artifacts use the
+# AMRFinderPlusAnnotationsDirFmt directory format.
+plugin.register_semantic_type_to_format(
+    SampleData[AMRFinderPlusAnnotations],
+    artifact_format=AMRFinderPlusAnnotationsDirFmt,
+)
+# FeatureData[AMRFinderPlusAnnotation] artifacts use the
+# AMRFinderPlusAnnotationDirFmt directory format.
+plugin.register_semantic_type_to_format(
+    FeatureData[AMRFinderPlusAnnotation],
+    artifact_format=AMRFinderPlusAnnotationDirFmt,
+)
 plugin.register_formats(
     CARDKmerDatabaseDirectoryFormat,
     CARDKmerJSONFormat,
@@ -1145,6 +1164,9 @@
     AMRFinderPlusDatabaseDirFmt,
     TextFormat,
     BinaryFormat,
+    AMRFinderPlusAnnotationFormat,
+    AMRFinderPlusAnnotationsDirFmt,
+    AMRFinderPlusAnnotationDirFmt,
 )
 
 importlib.import_module("q2_amr.card.types._transformer")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Incorrect Header 1 Incorrect Header 2 Incorrect Header 3