From 937f1439f6b5bd5d38aee5a42678e4cc4510079b Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Wed, 11 Dec 2024 11:49:41 +0100 Subject: [PATCH 1/4] ENH: collate_loci action added --- q2_types/genome_data/_methods.py | 27 ++++++++++++++++- .../tests/data/uncollated_loci_1/loci1.gff | 10 +++++++ .../tests/data/uncollated_loci_1/loci2.gff | 5 ++++ .../tests/data/uncollated_loci_2/loci3.gff | 5 ++++ .../tests/data/uncollated_loci_2/loci4.gff | 10 +++++++ q2_types/genome_data/tests/test_methods.py | 29 +++++++++++++++++++ q2_types/plugin_setup.py | 14 ++++++++- setup.py | 2 ++ 8 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 q2_types/genome_data/tests/data/uncollated_loci_1/loci1.gff create mode 100644 q2_types/genome_data/tests/data/uncollated_loci_1/loci2.gff create mode 100644 q2_types/genome_data/tests/data/uncollated_loci_2/loci3.gff create mode 100644 q2_types/genome_data/tests/data/uncollated_loci_2/loci4.gff diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 99764ce2..681aa4b2 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -12,9 +12,34 @@ import numpy as np from qiime2.util import duplicate -from q2_types.genome_data import SeedOrthologDirFmt, OrthologAnnotationDirFmt +from q2_types.genome_data import (SeedOrthologDirFmt, OrthologAnnotationDirFmt, + LociDirectoryFormat) +def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: + """ + Collate the individual loci directories from the partitions. + Parameters: + - loci: A list of LociDirectoryFormat containing the gff files. + Returns: + - collated_loci: A LociDirectoryFormat object containing the collated gff files. + """ + collated_loci = LociDirectoryFormat() + for loci_dir in loci: + for fp in os.listdir(loci_dir.path): + try: + duplicate( + os.path.join(loci_dir.path, fp), + os.path.join(collated_loci.path, fp) + ) + except FileExistsError: + # raise a warning + warnings.warn( + f"Skipping {fp}. File already exists " + f"in the destination directory." + ) + return collated_loci + def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt: result = SeedOrthologDirFmt() diff --git a/q2_types/genome_data/tests/data/uncollated_loci_1/loci1.gff b/q2_types/genome_data/tests/data/uncollated_loci_1/loci1.gff new file mode 100644 index 00000000..77304f5c --- /dev/null +++ b/q2_types/genome_data/tests/data/uncollated_loci_1/loci1.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/uncollated_loci_1/loci2.gff b/q2_types/genome_data/tests/data/uncollated_loci_1/loci2.gff new file mode 100644 index 00000000..ab7c1087 --- /dev/null +++ b/q2_types/genome_data/tests/data/uncollated_loci_1/loci2.gff @@ -0,0 +1,5 @@ +##gff-version 3 +# Sequence Data: seqnum=1;seqlen=1713046;seqhdr="k129_5480" +# Model Data: version=Prodigal.v2.6.3;run_type=Metagenomic;model="49|_Nostoc_azollae__0708|B|38.5|11|1";gc_cont=38.40;transl_table=11;uses_sd=1 +k129_5480 Prodigal_v2.6.3 CDS 3 1988 255.7 - 0 ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442;conf=99.99;score=255.03;cscore=250.12;sscore=4.92;rscore=0.92;uscore=0.44;tscore=4.21; +k129_5480 Prodigal_v2.6.3 CDS 2150 2623 63.6 + 0 ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426;conf=100.00;score=63.62;cscore=60.65;sscore=2.97;rscore=0.92;uscore=-2.15;tscore=4.21; diff --git a/q2_types/genome_data/tests/data/uncollated_loci_2/loci3.gff b/q2_types/genome_data/tests/data/uncollated_loci_2/loci3.gff new file mode 100644 index 00000000..ab7c1087 --- /dev/null +++ b/q2_types/genome_data/tests/data/uncollated_loci_2/loci3.gff @@ -0,0 +1,5 @@ +##gff-version 3 +# Sequence Data: seqnum=1;seqlen=1713046;seqhdr="k129_5480" +# Model Data: version=Prodigal.v2.6.3;run_type=Metagenomic;model="49|_Nostoc_azollae__0708|B|38.5|11|1";gc_cont=38.40;transl_table=11;uses_sd=1 +k129_5480 Prodigal_v2.6.3 CDS 3 1988 255.7 - 0 ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442;conf=99.99;score=255.03;cscore=250.12;sscore=4.92;rscore=0.92;uscore=0.44;tscore=4.21; +k129_5480 Prodigal_v2.6.3 CDS 2150 2623 63.6 + 0 ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426;conf=100.00;score=63.62;cscore=60.65;sscore=2.97;rscore=0.92;uscore=-2.15;tscore=4.21; diff --git a/q2_types/genome_data/tests/data/uncollated_loci_2/loci4.gff b/q2_types/genome_data/tests/data/uncollated_loci_2/loci4.gff new file mode 100644 index 00000000..77304f5c --- /dev/null +++ b/q2_types/genome_data/tests/data/uncollated_loci_2/loci4.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 2aff71ad..2997fa76 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -7,11 +7,14 @@ # ---------------------------------------------------------------------------- import filecmp import os +import warnings from qiime2.plugin.testing import TestPluginBase from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \ partition_orthologs, OrthologAnnotationDirFmt, collate_ortholog_annotations +from q2_types.genome_data import LociDirectoryFormat +from q2_types.genome_data._methods import collate_loci class TestOrthologsPartitionCollating(TestPluginBase): @@ -33,6 +36,32 @@ def test_collate_orthologs(self): collated_orthologs.path / "2.emapper.seed_orthologs") ) + def test_collate_loci(self): + p1 = self.get_data_path("uncollated_loci_1") + p2 = self.get_data_path("uncollated_loci_2") + loci_list = [ + LociDirectoryFormat(p1, mode="r"), + LociDirectoryFormat(p2, mode="r") + ] + + collated_loci = collate_loci(loci_list) + self.assertTrue(all(os.path.exists( + collated_loci.path / f"loci{no}.gff") for no in [1,2,3,4])) + + def test_collate_loci_file_exists(self): + p1 = self.get_data_path("uncollated_loci_1") + loci_list = [ + LociDirectoryFormat(p1, mode="r"), + LociDirectoryFormat(p1, mode="r") + ] + + with warnings.catch_warnings(record=True) as w: + collated_loci = collate_loci(loci_list) + self.assertIn("File already exists", str(w[-1].message)) + + self.assertTrue(all(os.path.exists( + collated_loci.path / f"loci{no}.gff") for no in [1,2])) + def test_partition_orthologs(self): p = self.get_data_path("collated_orthologs") orthologs = SeedOrthologDirFmt(path=p, mode="r") diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index ed59706c..bcb64c81 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -19,7 +19,8 @@ from q2_types.feature_data_mag import MAG from q2_types.per_sample_sequences import MAGs from q2_types.feature_data import FeatureData -from q2_types.genome_data import Orthologs, GenomeData, NOG +from q2_types.genome_data import Orthologs, GenomeData, NOG, Loci +from q2_types.genome_data._methods import collate_loci from q2_types.sample_data import SampleData @@ -136,6 +137,17 @@ "and collates them into a single artifact.", ) +plugin.methods.register_function( + function=collate_loci, + inputs={"loci": List[GenomeData[Loci]]}, + parameters={}, + outputs={"collated_loci": GenomeData[Loci]}, + input_descriptions={"loci": "A collection of loci to be collated."}, + name="Collate loci", + description="Takes a collection of GenomeData[Loci]'s " + "and collates them into a single artifact.", +) + importlib.import_module('q2_types.bowtie2._deferred_setup') importlib.import_module('q2_types.distance_matrix._deferred_setup') importlib.import_module('q2_types.feature_data._deferred_setup') diff --git a/setup.py b/setup.py index a71fae39..5fe83b7b 100644 --- a/setup.py +++ b/setup.py @@ -87,6 +87,8 @@ 'data/genes_samples/sample2/*', 'data/loci-invalid/*', 'data/loci/*', + 'data/uncollated_loci_1/*', + 'data/uncollated_loci_2/*', 'data/genome-sequences/*', 'data/ortholog/*', 'data/ortholog-annotation-extra/*', From 97ea0969be6d46309c8c4941872bada6a4a10ae4 Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Wed, 11 Dec 2024 11:54:49 +0100 Subject: [PATCH 2/4] ENH: code enhanced --- q2_types/genome_data/tests/test_methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 2997fa76..754bd5f1 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -46,7 +46,7 @@ def test_collate_loci(self): collated_loci = collate_loci(loci_list) self.assertTrue(all(os.path.exists( - collated_loci.path / f"loci{no}.gff") for no in [1,2,3,4])) + collated_loci.path / f"loci{no}.gff") for no in [1, 2, 3, 4])) def test_collate_loci_file_exists(self): p1 = self.get_data_path("uncollated_loci_1") @@ -60,7 +60,7 @@ def test_collate_loci_file_exists(self): self.assertIn("File already exists", str(w[-1].message)) self.assertTrue(all(os.path.exists( - collated_loci.path / f"loci{no}.gff") for no in [1,2])) + collated_loci.path / f"loci{no}.gff") for no in [1, 2])) def test_partition_orthologs(self): p = self.get_data_path("collated_orthologs") From 2ee2ec556b84b8ae1455f9d57774bc75225dfa0a Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Wed, 11 Dec 2024 11:56:47 +0100 Subject: [PATCH 3/4] ENH: code enhanced --- q2_types/genome_data/_methods.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 681aa4b2..f4d9cb10 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -22,7 +22,8 @@ def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: Parameters: - loci: A list of LociDirectoryFormat containing the gff files. Returns: - - collated_loci: A LociDirectoryFormat object containing the collated gff files. + - collated_loci: A LociDirectoryFormat object containing the + collated gff files. """ collated_loci = LociDirectoryFormat() for loci_dir in loci: @@ -40,6 +41,7 @@ def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: ) return collated_loci + def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt: result = SeedOrthologDirFmt() From 7dbba4f25a7f843dd9a5b264c4251d79f821dd04 Mon Sep 17 00:00:00 2001 From: DorielaGrabocka Date: Mon, 16 Dec 2024 15:15:19 +0100 Subject: [PATCH 4/4] ENH: duplication procedure simplified --- q2_types/genome_data/_methods.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index f4d9cb10..9fdce2dc 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -27,14 +27,13 @@ def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: """ collated_loci = LociDirectoryFormat() for loci_dir in loci: - for fp in os.listdir(loci_dir.path): + for fp in loci_dir.path.iterdir(): try: duplicate( - os.path.join(loci_dir.path, fp), - os.path.join(collated_loci.path, fp) + fp, + collated_loci.path / os.path.basename(fp) ) except FileExistsError: - # raise a warning warnings.warn( f"Skipping {fp}. File already exists " f"in the destination directory."