Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Adding AMRFinderPlusAnnotation type #86

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d33e0cb
added new amrfinder directory and moved types to card directory
VinzentRisch Jun 28, 2024
32e8b7c
dirformat with filecollections
VinzentRisch Jul 1, 2024
d1b2ca6
dirformat with validating all filepaths
VinzentRisch Jul 1, 2024
facc75d
added test data to package data
VinzentRisch Jul 2, 2024
f948195
added amrprot.pot file to git
VinzentRisch Jul 2, 2024
c445800
merge main
VinzentRisch Jul 3, 2024
0670d7e
added new annotation format
VinzentRisch Jul 3, 2024
bfafdb8
added sampledata and feature data dir fmts
VinzentRisch Jul 4, 2024
bb9220c
register all formats
VinzentRisch Jul 4, 2024
317e5cb
using filecollections for the database format
VinzentRisch Jul 4, 2024
71a8da2
merge 80
VinzentRisch Jul 4, 2024
0bf7f20
renamed to dirfmt
VinzentRisch Jul 4, 2024
060f24d
merge 80
VinzentRisch Jul 4, 2024
8378b45
overwrite all pathmakers with code from busco moshpit
VinzentRisch Jul 4, 2024
82a1558
added field to annotation format
VinzentRisch Jul 4, 2024
f42d845
changed name of file in annotation format to allow other names
VinzentRisch Jul 4, 2024
514688c
registered annotations types in plugin setup
VinzentRisch Jul 5, 2024
07e9f52
Revert "overwrite all pathmakers with code from busco moshpit"
VinzentRisch Jul 5, 2024
78c4329
Merge branch '80_amrfinder_database_type' into 85_amrfinderplusannota…
VinzentRisch Jul 5, 2024
34a34b8
removed nested structure of annotation type
VinzentRisch Jul 5, 2024
4f09ee7
changed type of featuredata one to also include mutations in name
VinzentRisch Jul 9, 2024
cf7ae74
changed pathmakers
VinzentRisch Jul 10, 2024
bbaca6e
changed type and path_maker
VinzentRisch Jul 10, 2024
b97152f
added validation positive for empty files
VinzentRisch Jul 10, 2024
ee10a37
removed changes cds and version format files
VinzentRisch Jul 11, 2024
05c8496
removed files also from package data
VinzentRisch Jul 11, 2024
2a9cbdf
merge 80
VinzentRisch Jul 11, 2024
416de74
arm to amr
VinzentRisch Jul 11, 2024
fc819a2
merge main
VinzentRisch Jul 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions q2_amr/amrfinderplus/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,19 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from q2_amr.amrfinderplus.types._format import (
AMRFinderPlusAnnotationDirFmt,
AMRFinderPlusAnnotationFormat,
AMRFinderPlusAnnotationsDirFmt,
AMRFinderPlusDatabaseDirFmt,
BinaryFormat,
TextFormat,
)

# Names re-exported from q2_amr.amrfinderplus.types._format as the public
# interface of this package.
__all__ = [
    "AMRFinderPlusDatabaseDirFmt",
    "AMRFinderPlusAnnotationFormat",
    "AMRFinderPlusAnnotationsDirFmt",
    "AMRFinderPlusAnnotationDirFmt",
    "TextFormat",
    "BinaryFormat",
]
67 changes: 67 additions & 0 deletions q2_amr/amrfinderplus/types/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import pandas as pd
from q2_types.feature_data import MixedCaseDNAFASTAFormat, ProteinFASTAFormat
from q2_types.per_sample_sequences._format import MultiDirValidationMixin
from qiime2.core.exceptions import ValidationError
from qiime2.plugin import model


Expand Down Expand Up @@ -57,3 +60,67 @@ def amr_dna_comp_path_maker(self, species, extension):
@amr_dna_tab.set_path_maker
def amr_dna_tab_path_maker(self, species):
return "AMR_DNA-%s.tab" % species


class AMRFinderPlusAnnotationFormat(model.TextFileFormat):
    """Tab-separated annotation table identified by its header row.

    Only the header is inspected. Two column layouts are accepted: the full
    layout, and the same layout without the four coordinate columns
    (Contig id, Start, Stop, Strand). A completely empty file is accepted.
    """

    def _validate(self):
        """Compare the observed header with the two accepted layouts.

        Raises
        ------
        ValidationError
            If the header matches neither the layout with coordinate columns
            nor the layout without them.
        """
        leading_column = ["Protein identifier"]
        coordinate_columns = ["Contig id", "Start", "Stop", "Strand"]
        trailing_columns = [
            "Gene symbol",
            "Sequence name",
            "Scope",
            "Element type",
            "Element subtype",
            "Class",
            "Subclass",
            "Method",
            "Target length",
            "Reference sequence length",
            "% Coverage of reference sequence",
            "% Identity to reference sequence",
            "Alignment length",
            "Accession of closest sequence",
            "Name of closest sequence",
            "HMM id",
            "HMM description",
            "Hierarchy node",
        ]
        with_coordinates = leading_column + coordinate_columns + trailing_columns
        without_coordinates = leading_column + trailing_columns

        try:
            # nrows=0 parses the header line only, never the data rows.
            observed = pd.read_csv(str(self), sep="\t", nrows=0).columns.tolist()
        except pd.errors.EmptyDataError:
            # A file without any content is treated as valid.
            return

        if observed in (without_coordinates, with_coordinates):
            return

        message = (
            "Header line does not match AMRFinderPlusAnnotationFormat. Must "
            "consist of the following values: "
            + ", ".join(with_coordinates)
            + ".\n\nWhile Contig id, Start, Stop and Strand are optional."
            + "\n\nFound instead: "
            + ", ".join(observed)
        )
        raise ValidationError(message)

    def _validate_(self, level):
        """Run the header check; ``level`` is not used by this format."""
        self._validate()


class AMRFinderPlusAnnotationsDirFmt(MultiDirValidationMixin, model.DirectoryFormat):
    """Directory format holding a collection of annotation tables.

    Member files are those whose path matches
    ``.*amr_(annotations|mutations)\\.tsv$``; each one is handled as an
    ``AMRFinderPlusAnnotationFormat`` file.
    """

    annotation = model.FileCollection(
        r".*amr_(annotations|mutations)\.tsv$", format=AMRFinderPlusAnnotationFormat
    )

    @annotation.set_path_maker
    def annotation_path_maker(self, sample_id, mag_id):
        """Build the relative path of one annotation file.

        The file is placed in a directory named after ``sample_id``. When
        ``mag_id`` is truthy it is prepended to the file name, followed by an
        underscore; otherwise the file is named ``amr_annotations.tsv``.
        """
        if mag_id:
            return f"{sample_id}/{mag_id}_amr_annotations.tsv"
        return f"{sample_id}/amr_annotations.tsv"


# Directory format built around a single file: the path pattern accepts
# "amr_annotations.tsv" or "amr_mutations.tsv", and the file is handled as
# AMRFinderPlusAnnotationFormat.
AMRFinderPlusAnnotationDirFmt = model.SingleFileDirectoryFormat(
    "AMRFinderPlusAnnotationDirFmt",
    r"amr_(annotations|mutations)\.tsv$",
    AMRFinderPlusAnnotationFormat,
)
8 changes: 8 additions & 0 deletions q2_amr/amrfinderplus/types/_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from q2_types.feature_data import FeatureData
from q2_types.sample_data import SampleData
from qiime2.core.type import SemanticType

# Stand-alone semantic type (not declared as a variant of any other type).
AMRFinderPlusDatabase = SemanticType("AMRFinderPlusDatabase")
# Variant of SampleData, i.e. meant to be used as
# SampleData[AMRFinderPlusAnnotations].
AMRFinderPlusAnnotations = SemanticType(
    "AMRFinderPlusAnnotations", variant_of=SampleData.field["type"]
)
# Variant of FeatureData, i.e. meant to be used as
# FeatureData[AMRFinderPlusAnnotation].
AMRFinderPlusAnnotation = SemanticType(
    "AMRFinderPlusAnnotation", variant_of=FeatureData.field["type"]
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Protein identifier Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description Hierarchy node
aph3pp-Ib_partial_5p_neg aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIALP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib
blaOXA-436_partial blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Protein identifier Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description Hierarchy node
aph3pp-Ib_partial_5p_neg aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIALP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib
blaOXA-436_partial blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Incorrect Header 1 Incorrect Header 2 Incorrect Header 3
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,101 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
import tempfile

from qiime2.core.exceptions import ValidationError
from qiime2.plugin.testing import TestPluginBase

from q2_amr.amrfinderplus.types._format import AMRFinderPlusDatabaseDirFmt
from q2_amr.amrfinderplus.types._format import (
AMRFinderPlusAnnotationDirFmt,
AMRFinderPlusAnnotationFormat,
AMRFinderPlusAnnotationsDirFmt,
AMRFinderPlusDatabaseDirFmt,
)


class TestAMRFinderPlusDatabaseTypesAndFormats(TestPluginBase):
class TestAMRFinderPlusTypesAndFormats(TestPluginBase):
    """Tests for the AMRFinderPlus database and annotation formats."""

    package = "q2_amr.amrfinderplus.types.tests"

    def test_amrfinderplus_database_directory_format_validate_positive(self):
        """A well-formed database directory passes validation."""
        fmt = AMRFinderPlusDatabaseDirFmt(self.get_data_path("database"), mode="r")
        fmt.validate()

    def test_amrfinderplus_annotation_format_validate_positive(self):
        """An annotation table without coordinate columns passes validation."""
        filepath = self.get_data_path(
            "annotation/no_coordinates/"
            "aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv"
        )

        fmt = AMRFinderPlusAnnotationFormat(filepath, mode="r")
        fmt.validate()

    def test_amrfinderplus_annotation_format_validate_positive_coordinates(self):
        """An annotation table with coordinate columns passes validation."""
        filepath = self.get_data_path(
            "annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d"
            "_amr_annotations.tsv"
        )
        fmt = AMRFinderPlusAnnotationFormat(filepath, mode="r")
        fmt.validate()

    def test_amrfinderplus_annotation_format_validate_positive_empty(self):
        """A completely empty annotation file passes validation."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_file_path = os.path.join(temp_dir, "amr_annotations.tsv")
            # Create the file without writing anything to it.
            with open(temp_file_path, "w"):
                pass
            fmt = AMRFinderPlusAnnotationFormat(temp_file_path, mode="r")
            fmt.validate()

    def test_amrfinderplus_annotation_format_validation_error(self):
        """A table with an unexpected header is rejected with a clear message."""
        path = self.get_data_path("annotation_wrong/amr_annotation.tsv")
        fmt = AMRFinderPlusAnnotationFormat(path, mode="r")

        # Only the raising call lives inside the context manager: statements
        # placed after it in the same block would never be executed.
        with self.assertRaises(ValidationError) as context:
            fmt.validate()

        header_coordinates = [
            "Protein identifier",
            "Contig id",
            "Start",
            "Stop",
            "Strand",
            "Gene symbol",
            "Sequence name",
            "Scope",
            "Element type",
            "Element subtype",
            "Class",
            "Subclass",
            "Method",
            "Target length",
            "Reference sequence length",
            "% Coverage of reference sequence",
            "% Identity to reference sequence",
            "Alignment length",
            "Accession of closest sequence",
            "Name of closest sequence",
            "HMM id",
            "HMM description",
            "Hierarchy node",
        ]
        # Must mirror the text raised by AMRFinderPlusAnnotationFormat._validate.
        expected_message = (
            "Header line does not match AMRFinderPlusAnnotationFormat. Must "
            "consist of the following values: "
            + ", ".join(header_coordinates)
            + ".\n\nWhile Contig id, Start, Stop and Strand are optional."
            + "\n\nFound instead: "
            + "Incorrect Header 1, Incorrect Header 2, Incorrect Header 3"
        )

        # assertIn rather than assertEqual so the check still holds if
        # validate() wraps the message with extra context (e.g. the file path).
        self.assertIn(expected_message, str(context.exception))

    def test_amrfinderplus_annotation_directory_format(self):
        """The single-file directory format can be instantiated in read mode."""
        dirpath = self.get_data_path(
            "annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d"
        )
        annotations = AMRFinderPlusAnnotationDirFmt(dirpath, mode="r")
        assert isinstance(annotations, AMRFinderPlusAnnotationDirFmt)

    def test_amrfinderplus_annotations_directory_format(self):
        """The multi-file directory format can be instantiated in read mode."""
        dirpath = self.get_data_path("annotation")
        annotations = AMRFinderPlusAnnotationsDirFmt(dirpath, mode="r")
        assert isinstance(annotations, AMRFinderPlusAnnotationsDirFmt)
24 changes: 23 additions & 1 deletion q2_amr/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# ----------------------------------------------------------------------------
import importlib

from q2_types.feature_data import FeatureData
from q2_types.feature_table import FeatureTable, Frequency
from q2_types.per_sample_sequences import (
MAGs,
Expand All @@ -29,11 +30,18 @@

from q2_amr import __version__
from q2_amr.amrfinderplus.types._format import (
AMRFinderPlusAnnotationDirFmt,
AMRFinderPlusAnnotationFormat,
AMRFinderPlusAnnotationsDirFmt,
AMRFinderPlusDatabaseDirFmt,
BinaryFormat,
TextFormat,
)
from q2_amr.amrfinderplus.types._type import AMRFinderPlusDatabase
from q2_amr.amrfinderplus.types._type import (
AMRFinderPlusAnnotation,
AMRFinderPlusAnnotations,
AMRFinderPlusDatabase,
)
from q2_amr.card.database import fetch_card_db
from q2_amr.card.heatmap import heatmap
from q2_amr.card.kmer import (
Expand Down Expand Up @@ -1084,6 +1092,8 @@
CARDReadsAlleleKmerAnalysis,
CARDMAGsKmerAnalysis,
AMRFinderPlusDatabase,
AMRFinderPlusAnnotations,
AMRFinderPlusAnnotation,
)

plugin.register_semantic_type_to_format(
Expand Down Expand Up @@ -1118,6 +1128,15 @@
AMRFinderPlusDatabase,
artifact_format=AMRFinderPlusDatabaseDirFmt,
)

# Associate each annotation semantic type with its directory format:
# SampleData[AMRFinderPlusAnnotations] -> AMRFinderPlusAnnotationsDirFmt and
# FeatureData[AMRFinderPlusAnnotation] -> AMRFinderPlusAnnotationDirFmt.
plugin.register_semantic_type_to_format(
    SampleData[AMRFinderPlusAnnotations],
    artifact_format=AMRFinderPlusAnnotationsDirFmt,
)
plugin.register_semantic_type_to_format(
    FeatureData[AMRFinderPlusAnnotation],
    artifact_format=AMRFinderPlusAnnotationDirFmt,
)
plugin.register_formats(
CARDKmerDatabaseDirectoryFormat,
CARDKmerJSONFormat,
Expand Down Expand Up @@ -1145,6 +1164,9 @@
AMRFinderPlusDatabaseDirFmt,
TextFormat,
BinaryFormat,
AMRFinderPlusAnnotationFormat,
AMRFinderPlusAnnotationsDirFmt,
AMRFinderPlusAnnotationDirFmt,
)

importlib.import_module("q2_amr.card.types._transformer")
Loading