From 045ec95f5bfe968f592fa97eaeb36a52bac3bc8a Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 28 May 2024 11:18:41 +0200 Subject: [PATCH 01/10] migrated partition and collating actions from moshpit to types --- q2_types/_util.py | 45 ++++++++ q2_types/feature_data_mag/__init__.py | 6 +- q2_types/feature_data_mag/_methods.py | 78 +++++++++++++ ...24dee6fe-9b84-45bb-8145-de7b092533a1.fasta | 4 + ...fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta | 4 + .../a.annotations | 0 .../b.annotations | 0 .../c.annotations | 0 ...24dee6fe-9b84-45bb-8145-de7b092533a1.fasta | 4 + ...fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta | 4 + .../a/a.annotations | 0 .../b/b.annotations | 0 .../c/c.annotations | 0 .../feature_data_mag/tests/test_methods.py | 85 +++++++++++++++ q2_types/genome_data/__init__.py | 3 +- q2_types/genome_data/_methods.py | 75 +++++++++++++ .../1.emapper.seed_orthologs} | 0 .../2.emapper.seed_orthologs | 3 + .../ortholog_1/1.emapper.seed_orthologs | 3 + .../ortholog_2/2.emapper.seed_orthologs | 3 + q2_types/genome_data/tests/test_format.py | 10 +- q2_types/genome_data/tests/test_methods.py | 54 +++++++++ q2_types/per_sample_sequences/__init__.py | 7 +- q2_types/per_sample_sequences/_methods.py | 90 +++++++++++++++ .../tests/data/collated_mags/MANIFEST | 7 ++ ...24dee6fe-9b84-45bb-8145-de7b092533a1.fasta | 4 + ...fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta | 4 + ...d65a71fa-4279-4588-b937-0747ed5d604d.fasta | 6 + .../tests/data/partitioned_mags/mag1/MANIFEST | 7 ++ ...24dee6fe-9b84-45bb-8145-de7b092533a1.fasta | 4 + ...fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta | 4 + .../tests/data/partitioned_mags/mag2/MANIFEST | 7 ++ ...d65a71fa-4279-4588-b937-0747ed5d604d.fasta | 6 + .../tests/test_methods.py | 103 ++++++++++++++++++ q2_types/tests/__init__.py | 7 ++ q2_types/tests/test_util.py | 48 ++++++++ 36 files changed, 677 insertions(+), 8 deletions(-) create mode 100644 q2_types/feature_data_mag/_methods.py create mode 100644 
q2_types/feature_data_mag/tests/data/collated_mags/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta create mode 100644 q2_types/feature_data_mag/tests/data/collated_mags/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta create mode 100644 q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/a.annotations create mode 100644 q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/b.annotations create mode 100644 q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/c.annotations create mode 100644 q2_types/feature_data_mag/tests/data/partitioned_mags/mag1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta create mode 100644 q2_types/feature_data_mag/tests/data/partitioned_mags/mag2/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta create mode 100644 q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/a/a.annotations create mode 100644 q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/b/b.annotations create mode 100644 q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/c/c.annotations create mode 100644 q2_types/feature_data_mag/tests/test_methods.py create mode 100644 q2_types/genome_data/_methods.py rename q2_types/genome_data/tests/data/{ortholog/test_sample.emapper.seed_orthologs => collated_orthologs/1.emapper.seed_orthologs} (100%) create mode 100644 q2_types/genome_data/tests/data/collated_orthologs/2.emapper.seed_orthologs create mode 100644 q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_1/1.emapper.seed_orthologs create mode 100644 q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_2/2.emapper.seed_orthologs create mode 100644 q2_types/genome_data/tests/test_methods.py create mode 100644 q2_types/per_sample_sequences/_methods.py create mode 100644 q2_types/per_sample_sequences/tests/data/collated_mags/MANIFEST create mode 100644 q2_types/per_sample_sequences/tests/data/collated_mags/sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta create mode 100644 
q2_types/per_sample_sequences/tests/data/collated_mags/sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta create mode 100644 q2_types/per_sample_sequences/tests/data/collated_mags/sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta create mode 100644 q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/MANIFEST create mode 100644 q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta create mode 100644 q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta create mode 100644 q2_types/per_sample_sequences/tests/data/partitioned_mags/mag2/MANIFEST create mode 100644 q2_types/per_sample_sequences/tests/data/partitioned_mags/mag2/sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta create mode 100644 q2_types/per_sample_sequences/tests/test_methods.py create mode 100644 q2_types/tests/__init__.py create mode 100644 q2_types/tests/test_util.py diff --git a/q2_types/_util.py b/q2_types/_util.py index 9348d4db..512b6c75 100644 --- a/q2_types/_util.py +++ b/q2_types/_util.py @@ -8,6 +8,8 @@ import re import gzip import itertools +import warnings +from typing import List import qiime2.plugin.model as model from qiime2.plugin import ValidationError @@ -206,3 +208,46 @@ def _validate_(self, level): record_count_map = {'min': 5, 'max': None} self._check_n_records(record_count_map[level]) + + +def _validate_num_partitions( + num_samples: int, num_partitions: int, sample_type: str = "sample" +) -> int: + + if num_partitions is None: + return num_samples + elif num_partitions > num_samples: + warnings.warn( + "You have requested a number of partitions " + f"'{num_partitions}' that is greater than your number " + f"of {sample_type}s '{num_samples}.' Your data will be " + f"partitioned by {sample_type} into '{num_samples}' " + "partitions." 
+ ) + return num_samples + else: + return num_partitions + + +def _validate_mag_ids( + num_partitions: int, num_mags: int, mags_all: List[tuple] +): + # If num_partitions == num_mags and MAG ids are not unique + # the output will be missing these duplicated-id MAGs. + # While this is technically impossible since + # MAGs should have unique IDs by construction, it could still happen that a + # user imports MAGs with non-unique IDs. In such a case this test would be + # useful. + + if num_partitions == num_mags: + mag_ids = [mag_id[1] for mag_id in mags_all] + duplicates = [ + mag_id for mag_id in mag_ids if mag_ids.count(mag_id) > 1 + ] + if len(duplicates) > 0: + raise ValueError( + "MAG IDs are not unique. " + "They must be unique in order to output all partitions " + "correctly. Printing duplicate MAG IDs: " + f"{set(duplicates)}" + ) \ No newline at end of file diff --git a/q2_types/feature_data_mag/__init__.py b/q2_types/feature_data_mag/__init__.py index 3701e8bc..a3103353 100644 --- a/q2_types/feature_data_mag/__init__.py +++ b/q2_types/feature_data_mag/__init__.py @@ -13,13 +13,17 @@ OrthologAnnotationDirFmt, OrthologFileFmt ) +from ._methods import partition_feature_data_mags, collate_feature_data_mags, \ + collate_ortholog_annotations from ._type import MAG, NOG, OG, KEGG, Contig from ._transformer import MAGIterator __all__ = [ 'MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'NOG', 'OG', 'KEGG', - 'OrthologAnnotationDirFmt', 'OrthologFileFmt', 'Contig' + 'OrthologAnnotationDirFmt', 'OrthologFileFmt', 'Contig', + 'partition_feature_data_mags', 'collate_feature_data_mags', + 'collate_ortholog_annotations', ] importlib.import_module('q2_types.feature_data_mag._format') diff --git a/q2_types/feature_data_mag/_methods.py b/q2_types/feature_data_mag/_methods.py new file mode 100644 index 00000000..a7f3f813 --- /dev/null +++ b/q2_types/feature_data_mag/_methods.py @@ -0,0 +1,78 @@ +# ---------------------------------------------------------------------------- +# 
Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import os + +import numpy as np +from qiime2.util import duplicate + +from q2_types._util import _validate_num_partitions, _validate_mag_ids +from q2_types.feature_data_mag import MAGSequencesDirFmt, \ + OrthologAnnotationDirFmt + + +def partition_feature_data_mags( + mags: MAGSequencesDirFmt, num_partitions: int = None +) -> MAGSequencesDirFmt: + """ + Returns a dictionary where each key is either the mag_id or an index, and + values are the new objects with the mags. + """ + partitioned_mags = {} + mags_all = [] + + # Get a list where every entry is a tuple representing one MAG + for mag_id, mag_fp in mags.feature_dict().items(): + mags_all.append((mag_fp, mag_id)) + + # Count number of mags and validate the num_partitions + num_mags = len(mags_all) + num_partitions = _validate_num_partitions(num_mags, num_partitions, "MAG") + _validate_mag_ids(num_partitions, num_mags, mags_all) + + # Split list MAGs into n arrays, where n = num_partitions + arrays_of_mags = np.array_split(mags_all, num_partitions) + + for i, _mag in enumerate(arrays_of_mags, 1): + result = MAGSequencesDirFmt() + + for mag_fp, mag_id in _mag: + duplicate(mag_fp, result.path / os.path.basename(mag_fp)) + + # If num_partitions == num_mags we will only have gone through one + # MAG in the above loop and will use its id as a key. 
Otherwise, we + # may have gone through multiple MAGs in the above loop and will be + # using indices for keys + if num_partitions == num_mags: + partitioned_mags[mag_id] = result + else: + partitioned_mags[i] = result + + return partitioned_mags + + +def collate_feature_data_mags(mags: MAGSequencesDirFmt) -> MAGSequencesDirFmt: + collated_mags = MAGSequencesDirFmt() + for mag in mags: + for fp in mag.path.iterdir(): + duplicate(fp, collated_mags.path / fp.name) + + return collated_mags + + +def collate_ortholog_annotations( + ortholog_annotations: OrthologAnnotationDirFmt +) -> OrthologAnnotationDirFmt: + # Init output + collated_annotations = OrthologAnnotationDirFmt() + + # Copy annotations into output + for anno in ortholog_annotations: + for fp in anno.path.iterdir(): + duplicate(fp, collated_annotations.path / fp.name) + + return collated_annotations diff --git a/q2_types/feature_data_mag/tests/data/collated_mags/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta b/q2_types/feature_data_mag/tests/data/collated_mags/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta new file mode 100644 index 00000000..c072c82b --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/collated_mags/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta @@ -0,0 +1,4 @@ +>NZ_00000000.1_contig1 
+ATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATG +>NZ_00000000.1_contig2 +TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGA diff --git 
a/q2_types/feature_data_mag/tests/data/collated_mags/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta b/q2_types/feature_data_mag/tests/data/collated_mags/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta new file mode 100644 index 00000000..e5910508 --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/collated_mags/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta @@ -0,0 +1,4 @@ +>NZ_CP018863.1_contig1 +GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG +>NZ_CP018863.1_contig2 
+CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC diff --git a/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/a.annotations b/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/a.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/b.annotations b/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/b.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/c.annotations b/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/c.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/feature_data_mag/tests/data/partitioned_mags/mag1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta b/q2_types/feature_data_mag/tests/data/partitioned_mags/mag1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta new file mode 100644 index 00000000..c072c82b --- /dev/null +++ 
b/q2_types/feature_data_mag/tests/data/partitioned_mags/mag1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta @@ -0,0 +1,4 @@ +>NZ_00000000.1_contig1 +ATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATG +>NZ_00000000.1_contig2 
+TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGA diff --git a/q2_types/feature_data_mag/tests/data/partitioned_mags/mag2/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta b/q2_types/feature_data_mag/tests/data/partitioned_mags/mag2/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta new file mode 100644 index 00000000..e5910508 --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/partitioned_mags/mag2/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta @@ -0,0 +1,4 @@ +>NZ_CP018863.1_contig1 
+GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG +>NZ_CP018863.1_contig2 
+CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC diff --git a/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/a/a.annotations b/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/a/a.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/b/b.annotations b/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/b/b.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/c/c.annotations b/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/c/c.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/feature_data_mag/tests/test_methods.py b/q2_types/feature_data_mag/tests/test_methods.py new file mode 100644 index 00000000..babc9e7a --- /dev/null +++ b/q2_types/feature_data_mag/tests/test_methods.py @@ -0,0 +1,85 @@ +# 
---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import filecmp +from unittest.mock import patch + +from qiime2.plugin.testing import TestPluginBase + +from q2_types.feature_data_mag import MAGSequencesDirFmt, \ + OrthologAnnotationDirFmt +from q2_types.feature_data_mag._methods import partition_feature_data_mags, \ + collate_feature_data_mags, collate_ortholog_annotations + + +class TestFeatureDataMAGsPartitionCollating(TestPluginBase): + package = "q2_types.feature_data_mag.tests" + + @patch("q2_types._util._validate_mag_ids") + @patch("q2_types._util._validate_num_partitions") + def test_partition_feature_data_mags( + self, + mock_validate_num_partitions, + mock_validate_mag_ids + ): + # Partition Feature Data + p = self.get_data_path("collated_mags") + mags = MAGSequencesDirFmt(path=p, mode="r") + mock_validate_num_partitions.return_value = 2 + partitioned_mags = partition_feature_data_mags(mags) + + # Expected mag ids + mag_ids = [ + "24dee6fe-9b84-45bb-8145-de7b092533a1", + "fb0bc871-04f6-486b-a10e-8e0cb66f8de3" + ] + + # compare partitions + for i in [0, 1]: + dircmp = filecmp.dircmp( + partitioned_mags[mag_ids[i]].path, mags.path + ) + self.assertListEqual([f"{mag_ids[i]}.fasta"], dircmp.common) + + def test_collate_feature_data_mags(self): + # collate test data + p1 = self.get_data_path("partitioned_mags/mag1") + p2 = self.get_data_path("partitioned_mags/mag2") + mags = [ + MAGSequencesDirFmt(p1, mode="r"), + MAGSequencesDirFmt(p2, mode="r") + ] + collated_mags = collate_feature_data_mags(mags) + + # compare directories + expected = self.get_data_path("collated_mags") + dircmp = filecmp.dircmp(collated_mags.path, expected) + self.assertListEqual( + [ + 
"24dee6fe-9b84-45bb-8145-de7b092533a1.fasta", + "fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta" + ], + dircmp.common + ) + + def test_collate_ortholog_annotations(self): + p = self.get_data_path("partitioned_ortholog_annotations") + annotations = [ + OrthologAnnotationDirFmt(f"{p}/{letter}", mode="r") + for letter in ["a", "b", "c"] + ] + collated_annotations = collate_ortholog_annotations(annotations) + + # assert that all files are there + compare = filecmp.dircmp( + collated_annotations.path, + self.get_data_path("collated_ortholog_annotations") + ) + self.assertListEqual( + compare.common, + [f"{letter}.annotations" for letter in ["a", "b", "c"]] + ) \ No newline at end of file diff --git a/q2_types/genome_data/__init__.py b/q2_types/genome_data/__init__.py index fd3d285d..fafd4fd9 100644 --- a/q2_types/genome_data/__init__.py +++ b/q2_types/genome_data/__init__.py @@ -12,6 +12,7 @@ GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format, LociDirectoryFormat, OrthologFileFmt, SeedOrthologDirFmt, ) +from ._methods import collate_orthologs, partition_orthologs from ._transformer import IntervalMetadataIterator from ._type import ( GenomeData, Genes, Proteins, Loci, Ortholog, BLAST6 @@ -21,7 +22,7 @@ 'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format', 'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat', 'IntervalMetadataIterator', 'OrthologFileFmt', 'Ortholog', - 'SeedOrthologDirFmt', 'BLAST6', + 'SeedOrthologDirFmt', 'BLAST6', 'collate_orthologs', 'partition_orthologs', ] importlib.import_module('q2_types.genome_data._format') diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py new file mode 100644 index 00000000..6781b5f5 --- /dev/null +++ b/q2_types/genome_data/_methods.py @@ -0,0 +1,75 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. 
+# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import glob +import os +import shutil +import warnings + +import numpy as np +from qiime2.util import duplicate + +from q2_types.genome_data import SeedOrthologDirFmt + + +def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt: + result = SeedOrthologDirFmt() + + for ortholog in orthologs: + for fp in ortholog.path.iterdir(): + duplicate(fp, result.path / os.path.basename(fp)) + + return result + + +def partition_orthologs( + orthologs: SeedOrthologDirFmt, num_partitions: int = None +) -> SeedOrthologDirFmt: + """ + Returns a dictionary where each key is either the sample_id or an index, + and values are the new objects with the orthologs. + """ + partitioned_orthologs = {} + + # TODO: this logic should move to the format itself + orthologs = glob.glob(os.path.join(str(orthologs), "*.seed_orthologs")) + names = [ + os.path.basename(x).replace(".emapper.seed_orthologs", "") + for x in orthologs + ] + orthologs = list(zip(names, orthologs)) + + num_samples = len(orthologs) + if num_partitions is None: + num_partitions = num_samples + elif num_partitions > num_samples: + warnings.warn( + "You have requested a number of partitions" + f" '{num_partitions}' that is greater than your number" + f" of samples '{num_samples}.' Your data will be" + f" partitioned by sample into '{num_samples}'" + " partitions." + ) + num_partitions = num_samples + + orthologs = np.array_split(orthologs, num_partitions) + for i, samples in enumerate(orthologs, 1): + result = SeedOrthologDirFmt() + + for sample_id, sample_fp in samples: + duplicate(sample_fp, result.path / os.path.basename(sample_fp)) + + # If num_partitions == num_samples we will only have gone through one + # sample in the above loop and will use its id as a key. 
Otherwise we + # may have gone through multiple samples in the above loop and will be + # using indices for keys + if num_partitions == num_samples: + partitioned_orthologs[sample_id] = result + else: + partitioned_orthologs[i] = result + + return partitioned_orthologs diff --git a/q2_types/genome_data/tests/data/ortholog/test_sample.emapper.seed_orthologs b/q2_types/genome_data/tests/data/collated_orthologs/1.emapper.seed_orthologs similarity index 100% rename from q2_types/genome_data/tests/data/ortholog/test_sample.emapper.seed_orthologs rename to q2_types/genome_data/tests/data/collated_orthologs/1.emapper.seed_orthologs diff --git a/q2_types/genome_data/tests/data/collated_orthologs/2.emapper.seed_orthologs b/q2_types/genome_data/tests/data/collated_orthologs/2.emapper.seed_orthologs new file mode 100644 index 00000000..7ab4ebd8 --- /dev/null +++ b/q2_types/genome_data/tests/data/collated_orthologs/2.emapper.seed_orthologs @@ -0,0 +1,3 @@ +0_0 316407.85674276 0.0 1597.0 1 2460 1 820 100.0 48.8 100.0 +0_2 316407.85674277 4.42e-217 629.0 1 930 1 310 100.0 18.5 100.0 +0_1 316407.21321894 2.29e-303 857.0 1 1284 1 428 100.0 25.5 100.0 diff --git a/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_1/1.emapper.seed_orthologs b/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_1/1.emapper.seed_orthologs new file mode 100644 index 00000000..7ab4ebd8 --- /dev/null +++ b/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_1/1.emapper.seed_orthologs @@ -0,0 +1,3 @@ +0_0 316407.85674276 0.0 1597.0 1 2460 1 820 100.0 48.8 100.0 +0_2 316407.85674277 4.42e-217 629.0 1 930 1 310 100.0 18.5 100.0 +0_1 316407.21321894 2.29e-303 857.0 1 1284 1 428 100.0 25.5 100.0 diff --git a/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_2/2.emapper.seed_orthologs b/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_2/2.emapper.seed_orthologs new file mode 100644 index 00000000..7ab4ebd8 --- /dev/null +++ 
b/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_2/2.emapper.seed_orthologs @@ -0,0 +1,3 @@ +0_0 316407.85674276 0.0 1597.0 1 2460 1 820 100.0 48.8 100.0 +0_2 316407.85674277 4.42e-217 629.0 1 930 1 310 100.0 18.5 100.0 +0_1 316407.21321894 2.29e-303 857.0 1 1284 1 428 100.0 25.5 100.0 diff --git a/q2_types/genome_data/tests/test_format.py b/q2_types/genome_data/tests/test_format.py index d8b639e4..135fc41b 100644 --- a/q2_types/genome_data/tests/test_format.py +++ b/q2_types/genome_data/tests/test_format.py @@ -22,31 +22,31 @@ class TestFormats(TestPluginBase): def test_ortholog_file_fmt(self): dirpath = self.get_data_path( - 'ortholog/test_sample.emapper.seed_orthologs') + 'partitioned_orthologs/ortholog_1/1.emapper.seed_orthologs') fmt_obj = OrthologFileFmt(dirpath, mode='r') fmt_obj.validate() def test_seed_ortholog_dir_fmt_collection_file_name(self): - dirpath = self.get_data_path('ortholog') + dirpath = self.get_data_path('partitioned_orthologs/ortholog_1') fmt_obj = SeedOrthologDirFmt(dirpath, mode='r') for relpath, obj in fmt_obj.seed_orthologs.iter_views(OrthologFileFmt): obs = str(obj).split("/")[-1].split("/")[-1] - exp = "test_sample.emapper.seed_orthologs" + exp = "1.emapper.seed_orthologs" self.assertEqual(obs, exp) def test_seed_ortholog_dir_fmt_good_validate(self): - dirpath = self.get_data_path('ortholog') + dirpath = self.get_data_path('partitioned_orthologs/ortholog_1') fmt_obj = SeedOrthologDirFmt(dirpath, mode='r') fmt_obj.validate() def test_seed_ortholog_dir_fmt_collection(self): - dirpath = self.get_data_path('ortholog/') + dirpath = self.get_data_path('partitioned_orthologs/ortholog_1') fmt = SeedOrthologDirFmt(dirpath, mode='r') for relpath, obj in fmt.seed_orthologs.iter_views(OrthologFileFmt): diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py new file mode 100644 index 00000000..acdf5f22 --- /dev/null +++ b/q2_types/genome_data/tests/test_methods.py @@ -0,0 +1,54 @@ +# 
---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import os + +from qiime2.plugin.testing import TestPluginBase + +from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \ + partition_orthologs + + +class TestOrthologsPartitionCollating(TestPluginBase): + package = "q2_types.genome_data.tests" + + def test_collate_orthologs(self): + p1 = self.get_data_path("partitioned_orthologs/ortholog_1") + p2 = self.get_data_path("partitioned_orthologs/ortholog_2") + orthologs = [ + SeedOrthologDirFmt(p1, mode="r"), + SeedOrthologDirFmt(p2, mode="r") + ] + + collated_orthologs = collate_orthologs(orthologs) + self.assertTrue(os.path.exists( + collated_orthologs.path / "1.emapper.seed_orthologs") + ) + self.assertTrue(os.path.exists( + collated_orthologs.path / "2.emapper.seed_orthologs") + ) + + def test_partition_orthologs(self): + p = self.get_data_path("collated_orthologs") + orthologs = SeedOrthologDirFmt(path=p, mode="r") + obs = partition_orthologs(orthologs, 2) + + self.assertTrue(os.path.exists( + obs["1"].path / "1.emapper.seed_orthologs") + ) + self.assertTrue(os.path.exists( + obs["1"].path / "1.emapper.seed_orthologs") + ) + + def test_partition_orthologs_warning_message(self): + path = self.get_data_path("collated_orthologs") + orthologs = SeedOrthologDirFmt(path=path, mode="r") + + with self.assertWarnsRegex( + UserWarning, "You have requested a number of.*5.*2.*2" + ): + partition_orthologs(orthologs, 5) diff --git a/q2_types/per_sample_sequences/__init__.py b/q2_types/per_sample_sequences/__init__.py index 6f4e9507..f4284449 100644 --- a/q2_types/per_sample_sequences/__init__.py +++ b/q2_types/per_sample_sequences/__init__.py @@ -28,6 +28,9 @@ 
ContigSequencesDirFmt, MultiBowtie2IndexDirFmt, BAMFormat, BAMDirFmt, MultiBAMDirFmt, MultiFASTADirectoryFormat) + +from ._methods import partition_sample_data_mags, collate_sample_data_mags + from ._type import (Sequences, SequencesWithQuality, PairedEndSequencesWithQuality, JoinedSequencesWithQuality, MAGs, Contigs, @@ -51,6 +54,8 @@ 'ContigSequencesDirFmt', 'Contigs', 'SingleBowtie2Index', 'MultiBowtie2Index', 'MultiBowtie2IndexDirFmt', 'BAMFormat', 'BAMDirFmt', 'MultiBAMDirFmt', - 'MultiFASTADirectoryFormat'] + 'MultiFASTADirectoryFormat', 'partition_sample_data_mags', + 'collate_sample_data_mags' + ] importlib.import_module('q2_types.per_sample_sequences._transformer') diff --git a/q2_types/per_sample_sequences/_methods.py b/q2_types/per_sample_sequences/_methods.py new file mode 100644 index 00000000..7858279f --- /dev/null +++ b/q2_types/per_sample_sequences/_methods.py @@ -0,0 +1,90 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import os +import shutil + +import numpy as np +import pandas as pd +from qiime2.util import duplicate + +from q2_types._util import _validate_num_partitions +from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt + + +def partition_sample_data_mags( + mags: MultiMAGSequencesDirFmt, num_partitions: int = None +) -> MultiMAGSequencesDirFmt: + """ + Returns a dictionary where each key is either the mag_id or an index, and + values are the new objects with the mags. 
+ """ + partitioned_mags = {} + mags_all = [{k: v} for k, v in mags.sample_dict().items()] + + num_partitions = _validate_num_partitions( + len(mags_all), num_partitions, "sample" + ) + + arrays_of_mags = np.array_split(mags_all, num_partitions) + + for i, samples in enumerate(arrays_of_mags, 1): + result = MultiMAGSequencesDirFmt() + all_samples = set(k for d in samples for k in d.keys()) + manifest = pd.read_csv(mags.path / "MANIFEST", index_col=None) + manifest = manifest[manifest["sample-id"].isin(all_samples)] + manifest.to_csv(result.path / "MANIFEST", index=False) + + for sample_dict in samples: + for sample_id, mag_dict in sample_dict.items(): + for mag_id, mag_fp in mag_dict.items(): + os.makedirs(result.path / sample_id, exist_ok=True) + duplicate( + mag_fp, + result.path / sample_id / os.path.basename(mag_fp) + ) + + # If num_partitions == num_samples we will only have gone through one + # sample in the above loop and will use its id as a key. Otherwise we + # may have gone through multiple MAGs in the above loop and will be + # using indices for keys + if num_partitions == len(mags_all): + partitioned_mags[sample_id] = result + else: + partitioned_mags[i] = result + + return partitioned_mags + + +def collate_sample_data_mags( + mags: MultiMAGSequencesDirFmt +) -> MultiMAGSequencesDirFmt: + collated_mags = MultiMAGSequencesDirFmt() + + # For every partition + for mag in mags: + + # For every sample in the partition + for file_or_dir in mag.path.iterdir(): + + if file_or_dir.is_dir(): + sample = file_or_dir + os.makedirs(collated_mags.path / sample.name, exist_ok=True) + + # For every mag in the sample + for mag in sample.iterdir(): + duplicate(mag, collated_mags.path / sample.name / mag.name) + + # If its a file, it should be the manifest + # Since its present many times it will be overwritten, but that ok + else: + manifest = file_or_dir + # Overwrite is necessary + shutil.copy(manifest, collated_mags.path / manifest.name) + + return collated_mags + 
diff --git a/q2_types/per_sample_sequences/tests/data/collated_mags/MANIFEST b/q2_types/per_sample_sequences/tests/data/collated_mags/MANIFEST new file mode 100644 index 00000000..9f6ba795 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/collated_mags/MANIFEST @@ -0,0 +1,7 @@ +sample-id,mag-id,filename +sample1,ca7012fc-ba65-40c3-84f5-05aa478a7585,sample1/ca7012fc-ba65-40c3-84f5-05aa478a7585.fasta +sample1,fb0bc871-04f6-486b-a10e-8e0cb66f8de3,sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta +sample1,24dee6fe-9b84-45bb-8145-de7b092533a1,sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta +sample2,db03f8b6-28e1-48c5-a47c-9c65f38f7357,sample2/db03f8b6-28e1-48c5-a47c-9c65f38f7357.fasta +sample2,d65a71fa-4279-4588-b937-0747ed5d604d,sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta +sample2,fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf,sample2/fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf.fasta diff --git a/q2_types/per_sample_sequences/tests/data/collated_mags/sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta b/q2_types/per_sample_sequences/tests/data/collated_mags/sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta new file mode 100644 index 00000000..c072c82b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/collated_mags/sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta @@ -0,0 +1,4 @@ +>NZ_00000000.1_contig1 
+ATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATG +>NZ_00000000.1_contig2 +TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGA diff --git 
a/q2_types/per_sample_sequences/tests/data/collated_mags/sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta b/q2_types/per_sample_sequences/tests/data/collated_mags/sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta new file mode 100644 index 00000000..e5910508 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/collated_mags/sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta @@ -0,0 +1,4 @@ +>NZ_CP018863.1_contig1 +GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG +>NZ_CP018863.1_contig2 
+CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC diff --git a/q2_types/per_sample_sequences/tests/data/collated_mags/sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta b/q2_types/per_sample_sequences/tests/data/collated_mags/sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta new file mode 100644 index 00000000..b2330298 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/collated_mags/sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta @@ -0,0 +1,6 @@ +>NZ_CP018863.1_contig1 
+GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG +>NZ_CP018863.1_contig2 
+CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC +>NZ_CP018863.1_contig3 
+GACGATCTGCGCCTGGCCCATGTGATGGCCGATTCCGTGGATTCGCAGACCATGGCCCGCTTCAAGGCGCTGGACCTGAAAATCGAGACCAAGCCGGATCTCACCCCTGTCACGGATGCGGACCGCGCCGCTGAAGAGGCCATCCGCGGCCAACTCTCCCGGGCCCGGCCGCGCGACGCGGTCCTCGGCGAGGAATACGGCAGCAGCGGCCACGGCTCCCGCCGCTGGATCATCGATCCCATCGACGGCACGAAGAACTTCGTCCGCGGGGTGCCGGTCTGGGCCACCTTGATCGCGCTGGTAGACGAAGACCGTCCCGTGGTCGGCCTGGTCAGCGCGCCGGCTCTGGGCAAGCGCTGGTGGGCCGCGACCGGAACCGGTGCCTACATGGGACGTTCGCTGTCCGCGGCCACCCGGCTCCGGGTATCCGATGTCAACCGGCTCGAGGACGCGTCCCTCTCCTATTCCAGCCTCACCGGCTGGCAGGAACGCGGCAACTTCCCGGAGTTCCTCGGCCTCACCGAATCCGTCTGGCGCACCCGTGCCTACGGGGACTTCTGGTCCTACTGCATGGTGGCCGAGGGCGCCGTCGACATTGCCTGCGAACCCGAACTCAACCTCTATGACATGGCGGCCCTCGTGCCGATCGTGACCGAGGCCGGCGGACGGTTCAGTTCGCTCGAGGGCGAGGACGGACCCTTCGGCGGCAACGCGTTGGCCACGAACGGCACGCTGCACGACGAGGTCCTCTACCGGCTCAATCCGCAGTTGCGCGGCCAGCGTCCGGCCGCACACCCGGAGGACGGGTCCCTGCCGGAAACCGCTCCGGAGGCCTCCATGGAGGCGGACGGCCTGCGCTGACGCTGTCTTTTGTGACGAATTACGACGGCGGCCGTCCCCATTCCGGGGATGGCCGCCTTTTCGTTCCCGTAACAAAGATGCGGCCCCTCCGGCCGGACAATAATCTCGATGGCAGGTCACGAGTGCCAGCGCTAAACCC diff --git a/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/MANIFEST b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/MANIFEST new file mode 100644 index 00000000..9f6ba795 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/MANIFEST @@ -0,0 +1,7 @@ +sample-id,mag-id,filename +sample1,ca7012fc-ba65-40c3-84f5-05aa478a7585,sample1/ca7012fc-ba65-40c3-84f5-05aa478a7585.fasta +sample1,fb0bc871-04f6-486b-a10e-8e0cb66f8de3,sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta +sample1,24dee6fe-9b84-45bb-8145-de7b092533a1,sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta +sample2,db03f8b6-28e1-48c5-a47c-9c65f38f7357,sample2/db03f8b6-28e1-48c5-a47c-9c65f38f7357.fasta +sample2,d65a71fa-4279-4588-b937-0747ed5d604d,sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta +sample2,fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf,sample2/fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf.fasta diff --git 
a/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta new file mode 100644 index 00000000..c072c82b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta @@ -0,0 +1,4 @@ +>NZ_00000000.1_contig1 +ATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATG +>NZ_00000000.1_contig2 
+TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGA diff --git a/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta new file mode 100644 index 00000000..e5910508 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag1/sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta @@ -0,0 +1,4 @@ +>NZ_CP018863.1_contig1 
+GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG +>NZ_CP018863.1_contig2 
+CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC diff --git a/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag2/MANIFEST b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag2/MANIFEST new file mode 100644 index 00000000..9f6ba795 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag2/MANIFEST @@ -0,0 +1,7 @@ +sample-id,mag-id,filename +sample1,ca7012fc-ba65-40c3-84f5-05aa478a7585,sample1/ca7012fc-ba65-40c3-84f5-05aa478a7585.fasta +sample1,fb0bc871-04f6-486b-a10e-8e0cb66f8de3,sample1/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta +sample1,24dee6fe-9b84-45bb-8145-de7b092533a1,sample1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta +sample2,db03f8b6-28e1-48c5-a47c-9c65f38f7357,sample2/db03f8b6-28e1-48c5-a47c-9c65f38f7357.fasta +sample2,d65a71fa-4279-4588-b937-0747ed5d604d,sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta +sample2,fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf,sample2/fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf.fasta diff --git 
a/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag2/sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag2/sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta new file mode 100644 index 00000000..b2330298 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/partitioned_mags/mag2/sample2/d65a71fa-4279-4588-b937-0747ed5d604d.fasta @@ -0,0 +1,6 @@ +>NZ_CP018863.1_contig1 +GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG +>NZ_CP018863.1_contig2 
+CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC +>NZ_CP018863.1_contig3 
+GACGATCTGCGCCTGGCCCATGTGATGGCCGATTCCGTGGATTCGCAGACCATGGCCCGCTTCAAGGCGCTGGACCTGAAAATCGAGACCAAGCCGGATCTCACCCCTGTCACGGATGCGGACCGCGCCGCTGAAGAGGCCATCCGCGGCCAACTCTCCCGGGCCCGGCCGCGCGACGCGGTCCTCGGCGAGGAATACGGCAGCAGCGGCCACGGCTCCCGCCGCTGGATCATCGATCCCATCGACGGCACGAAGAACTTCGTCCGCGGGGTGCCGGTCTGGGCCACCTTGATCGCGCTGGTAGACGAAGACCGTCCCGTGGTCGGCCTGGTCAGCGCGCCGGCTCTGGGCAAGCGCTGGTGGGCCGCGACCGGAACCGGTGCCTACATGGGACGTTCGCTGTCCGCGGCCACCCGGCTCCGGGTATCCGATGTCAACCGGCTCGAGGACGCGTCCCTCTCCTATTCCAGCCTCACCGGCTGGCAGGAACGCGGCAACTTCCCGGAGTTCCTCGGCCTCACCGAATCCGTCTGGCGCACCCGTGCCTACGGGGACTTCTGGTCCTACTGCATGGTGGCCGAGGGCGCCGTCGACATTGCCTGCGAACCCGAACTCAACCTCTATGACATGGCGGCCCTCGTGCCGATCGTGACCGAGGCCGGCGGACGGTTCAGTTCGCTCGAGGGCGAGGACGGACCCTTCGGCGGCAACGCGTTGGCCACGAACGGCACGCTGCACGACGAGGTCCTCTACCGGCTCAATCCGCAGTTGCGCGGCCAGCGTCCGGCCGCACACCCGGAGGACGGGTCCCTGCCGGAAACCGCTCCGGAGGCCTCCATGGAGGCGGACGGCCTGCGCTGACGCTGTCTTTTGTGACGAATTACGACGGCGGCCGTCCCCATTCCGGGGATGGCCGCCTTTTCGTTCCCGTAACAAAGATGCGGCCCCTCCGGCCGGACAATAATCTCGATGGCAGGTCACGAGTGCCAGCGCTAAACCC diff --git a/q2_types/per_sample_sequences/tests/test_methods.py b/q2_types/per_sample_sequences/tests/test_methods.py new file mode 100644 index 00000000..e96e779e --- /dev/null +++ b/q2_types/per_sample_sequences/tests/test_methods.py @@ -0,0 +1,103 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- +import filecmp +from unittest.mock import patch + +from qiime2.plugin.testing import TestPluginBase + +from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt +from q2_types.per_sample_sequences._methods import ( + partition_sample_data_mags, collate_sample_data_mags +) + + +class TestSampleDataMAGsPartitionCollating(TestPluginBase): + package = "q2_types.per_sample_sequences.tests" + + @patch("q2_types._util._validate_mag_ids") + @patch("q2_types._util._validate_num_partitions") + def test_partition_sample_data_mags( + self, + mock_validate_num_partitions, + mock_validate_mag_ids + ): + # Partition mags + p = self.get_data_path("collated_mags") + mags = MultiMAGSequencesDirFmt(path=p, mode="r") + mock_validate_num_partitions.return_value = 2 + partitioned_mags = partition_sample_data_mags(mags, 2) + + # Expected mag ids for every sample + mag_ids_sample_1 = [ + "24dee6fe-9b84-45bb-8145-de7b092533a1.fasta", + "fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta" + ] + mag_ids_sample_2 = [ + "d65a71fa-4279-4588-b937-0747ed5d604d.fasta", + ] + + # Compare dirs + exp_partitions = [ + ("sample1", mag_ids_sample_1), ("sample2", mag_ids_sample_2) + ] + for _id, mag_ids in exp_partitions: + dircmp = filecmp.dircmp( + partitioned_mags[_id].path, + mags.path + ) + self.assertListEqual( + ["MANIFEST", _id], dircmp.common + ) + dircmp = filecmp.dircmp( + f"{partitioned_mags[_id].path}/{_id}", + f"{mags.path}/{_id}" + ) + self.assertListEqual( + [ + *mag_ids, + ], + dircmp.common + ) + + def test_collate_sample_data_mags(self): + p1 = self.get_data_path("partitioned_mags/mag1") + p2 = self.get_data_path("partitioned_mags/mag2") + mags = [ + MultiMAGSequencesDirFmt(p1, mode="r"), + MultiMAGSequencesDirFmt(p2, mode="r") + ] + + collated_mags = collate_sample_data_mags(mags) + expected = self.get_data_path("collated_mags") + + # compare first dir + dircmp = filecmp.dircmp(collated_mags.path, 
expected) + self.assertListEqual(["MANIFEST", "sample1", "sample2"], dircmp.common) + + # Compare second dir + dircmp = filecmp.dircmp( + f"{collated_mags.path}/sample1", + f"{expected}/sample1" + ) + self.assertListEqual( + [ + "24dee6fe-9b84-45bb-8145-de7b092533a1.fasta", + "fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta" + ], + dircmp.common + ) + + # compare third dir + dircmp = filecmp.dircmp( + f"{collated_mags.path}/sample2", + f"{expected}/sample2" + ) + self.assertListEqual( + ["d65a71fa-4279-4588-b937-0747ed5d604d.fasta"], + dircmp.common + ) diff --git a/q2_types/tests/__init__.py b/q2_types/tests/__init__.py new file mode 100644 index 00000000..16cef8fc --- /dev/null +++ b/q2_types/tests/__init__.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- diff --git a/q2_types/tests/test_util.py b/q2_types/tests/test_util.py new file mode 100644 index 00000000..835786fc --- /dev/null +++ b/q2_types/tests/test_util.py @@ -0,0 +1,48 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- +from qiime2.plugin.testing import TestPluginBase + +from q2_types._util import _validate_num_partitions, _validate_mag_ids + + +class TestUtil(TestPluginBase): + package = "q2_types.tests" + + def test_validate_num_partitions_None(self): + num_partitions = _validate_num_partitions(14, None) + self.assertEqual(14, num_partitions) + + def test_validate_num_partitions_Warning(self): + num_partitions = 15 + num_samples = 14 + with self.assertWarnsRegex( + Warning, + "You have requested a number of partitions" + f" '{num_partitions}' that is greater than your number" + f" of samples '{num_samples}.'" + ): + _ = _validate_num_partitions(num_samples, num_partitions) + + def test_validate_num_partitions_valid(self): + num_partitions = _validate_num_partitions(14, 2) + self.assertEqual(2, num_partitions) + + def test_validate_mag_ids_valid(self): + _validate_mag_ids( + 6, + 6, + [(0, "a"), (0, "b"), (0, "c"), (0, "d"), (0, "e"), (0, "f")] + ) + + def test_validate_mag_ids_invalid(self): + with self.assertRaisesRegex(ValueError, "MAG IDs are not unique. 
"): + _validate_mag_ids( + 6, + 6, + [(0, "a"), (0, "a"), (0, "c"), (0, "d"), (0, "e"), (0, "f")] + ) \ No newline at end of file From 57234cc5fc6450239501c35a07e39378f8353001 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 29 May 2024 15:04:31 +0200 Subject: [PATCH 02/10] added everything but bug cicular imports --- q2_types/feature_data_mag/_methods.py | 6 +- q2_types/plugin_setup.py | 109 ++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 3 deletions(-) diff --git a/q2_types/feature_data_mag/_methods.py b/q2_types/feature_data_mag/_methods.py index a7f3f813..e9c2ad5f 100644 --- a/q2_types/feature_data_mag/_methods.py +++ b/q2_types/feature_data_mag/_methods.py @@ -68,11 +68,11 @@ def collate_ortholog_annotations( ortholog_annotations: OrthologAnnotationDirFmt ) -> OrthologAnnotationDirFmt: # Init output - collated_annotations = OrthologAnnotationDirFmt() + collated_ortholog_annotations = OrthologAnnotationDirFmt() # Copy annotations into output for anno in ortholog_annotations: for fp in anno.path.iterdir(): - duplicate(fp, collated_annotations.path / fp.name) + duplicate(fp, collated_ortholog_annotations.path / fp.name) - return collated_annotations + return collated_ortholog_annotations diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index 7cde3a14..770d63b5 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -9,9 +9,19 @@ import pandas as pd import qiime2.plugin import qiime2.sdk +from qiime2.core.type import Int, Range, Collection, List +import q2_types from q2_types import __version__ +from q2_types.feature_data_mag import MAG, NOG +# from q2_types.genome_data import +from q2_types.per_sample_sequences import MAGs +from q2_types.feature_data import FeatureData +from q2_types.genome_data import BLAST6 +from q2_types.sample_data import SampleData + + citations = qiime2.plugin.Citations.load('citations.bib', package='q2_types') plugin = qiime2.plugin.Plugin( name='types', @@ -27,3 +37,102 @@ 
citations=[citations['mckinney-proc-scipy-2010']]) # __init__.py loads first and imports all of the subpackages. + +plugin.methods.register_function( + function=q2_types.per_sample_sequences.partition_sample_data_mags, + inputs={"mags": SampleData[MAGs]}, + parameters={"num_partitions": Int % Range(1, None)}, + outputs={"partitioned_mags": Collection[SampleData[MAGs]]}, + input_descriptions={"mags": "The MAGs to partition."}, + parameter_descriptions={ + "num_partitions": "The number of partitions to split the MAGs" + " into. Defaults to partitioning into individual" + " MAGs." + }, + name="Partition MAGs", + description="Partition a SampleData[MAGs] artifact into smaller " + "artifacts containing subsets of the MAGs", +) + +plugin.methods.register_function( + function=q2_types.genome_data.partition_orthologs, + inputs={"orthologs": SampleData[BLAST6]}, + parameters={"num_partitions": Int % Range(1, None)}, + outputs={"partitioned_orthologs": Collection[SampleData[BLAST6]]}, + input_descriptions={"orthologs": "The orthologs to partition."}, + parameter_descriptions={ + "num_partitions": "The number of partitions to split the MAGs" + " into. Defaults to partitioning into individual" + " MAGs." 
+ }, + name="Partition orthologs", + description="Partition a SampleData[BLAST6] artifact into smaller " + "artifacts containing subsets of the BLAST6 reports.", +) + +plugin.methods.register_function( + function=q2_types.per_sample_sequences.collate_sample_data_mags, + inputs={"mags": List[SampleData[MAGs]]}, + parameters={}, + outputs={"collated_mags": SampleData[MAGs]}, + input_descriptions={"mags": "A collection of MAGs to be collated."}, + name="Collate mags", + description="Takes a collection of SampleData[MAGs]'s " + "and collates them into a single artifact.", +) + +plugin.methods.register_function( + function=q2_types.feature_data_mag.partition_feature_data_mags, + inputs={"mags": FeatureData[MAG]}, + parameters={"num_partitions": Int % Range(1, None)}, + outputs={"partitioned_mags": Collection[FeatureData[MAG]]}, + input_descriptions={"mags": "MAGs to partition."}, + parameter_descriptions={ + "num_partitions": "The number of partitions to split the MAGs" + " into. Defaults to partitioning into individual" + " MAGs." 
+ }, + name="Partition MAGs", + description="Partition a FeatureData[MAG] artifact into smaller " + "artifacts containing subsets of the MAGs", +) + +plugin.methods.register_function( + function=q2_types.feature_data_mag.collate_feature_data_mags, + inputs={"mags": List[FeatureData[MAG]]}, + parameters={}, + outputs={"collated_mags": FeatureData[MAG]}, + input_descriptions={"mags": "A collection of MAGs to be collated."}, + name="Collate mags", + description="Takes a collection of FeatureData[MAG]'s " + "and collates them into a single artifact.", +) + +plugin.methods.register_function( + function=q2_types.genome_data.collate_orthologs, + inputs={"orthologs": List[SampleData[BLAST6]]}, + parameters={}, + outputs={"collated_orthologs": SampleData[BLAST6]}, + input_descriptions={"orthologs": "Orthologs to collate"}, + parameter_descriptions={}, + name="Collate Orthologs", + description="Takes a collection SampleData[BLAST6] artifacts " + "and collates them into a single artifact.", +) + +plugin.methods.register_function( + function=q2_types.feature_data_mag.collate_ortholog_annotations, + inputs={'ortholog_annotations': List[FeatureData[NOG]]}, + parameters={}, + outputs=[('collated_ortholog_annotations', FeatureData[NOG])], + input_descriptions={ + 'ortholog_annotations': "Collection of ortholog annotations." + }, + output_descriptions={ + 'collated_ortholog_annotations': "Collated ortholog annotations." 
+ }, + name='Collate ortholog annotations.', + description="Takes a collection of FeatureData[NOG]'s " + "and collates them into a single artifact.", +) + From 8103975801e62b41800628e61e6b51be26cbd698 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 30 May 2024 11:10:52 +0200 Subject: [PATCH 03/10] removed import --- q2_types/plugin_setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index 770d63b5..51209cce 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -15,7 +15,6 @@ from q2_types import __version__ from q2_types.feature_data_mag import MAG, NOG -# from q2_types.genome_data import from q2_types.per_sample_sequences import MAGs from q2_types.feature_data import FeatureData from q2_types.genome_data import BLAST6 From 08e3fd181a492bdf4a2bf76df092fac33ec7744b Mon Sep 17 00:00:00 2001 From: Evan Bolyen Date: Wed, 31 Jul 2024 14:14:13 -0700 Subject: [PATCH 04/10] SQUASH: fix type drift from #338 --- q2_types/feature_data_mag/__init__.py | 6 ++-- q2_types/feature_data_mag/_methods.py | 17 +---------- .../a/a.annotations | 0 .../b/b.annotations | 0 .../c/c.annotations | 0 .../feature_data_mag/tests/test_methods.py | 23 ++------------- q2_types/plugin_setup.py | 28 ++++--------------- 7 files changed, 11 insertions(+), 63 deletions(-) delete mode 100644 q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/a/a.annotations delete mode 100644 q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/b/b.annotations delete mode 100644 q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/c/c.annotations diff --git a/q2_types/feature_data_mag/__init__.py b/q2_types/feature_data_mag/__init__.py index cc2c431d..483c35ab 100644 --- a/q2_types/feature_data_mag/__init__.py +++ b/q2_types/feature_data_mag/__init__.py @@ -10,9 +10,7 @@ from ._types import MAG, Contig from ._objects import MAGIterator -from ._methods import partition_feature_data_mags, 
collate_feature_data_mags, \ - collate_ortholog_annotations +from ._methods import partition_feature_data_mags, collate_feature_data_mags __all__ = ['MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'Contig', - 'partition_feature_data_mags', 'collate_feature_data_mags', - 'collate_ortholog_annotations'] + 'partition_feature_data_mags', 'collate_feature_data_mags'] diff --git a/q2_types/feature_data_mag/_methods.py b/q2_types/feature_data_mag/_methods.py index e9c2ad5f..c4aa50d6 100644 --- a/q2_types/feature_data_mag/_methods.py +++ b/q2_types/feature_data_mag/_methods.py @@ -11,8 +11,7 @@ from qiime2.util import duplicate from q2_types._util import _validate_num_partitions, _validate_mag_ids -from q2_types.feature_data_mag import MAGSequencesDirFmt, \ - OrthologAnnotationDirFmt +from q2_types.feature_data_mag import MAGSequencesDirFmt def partition_feature_data_mags( @@ -62,17 +61,3 @@ def collate_feature_data_mags(mags: MAGSequencesDirFmt) -> MAGSequencesDirFmt: duplicate(fp, collated_mags.path / fp.name) return collated_mags - - -def collate_ortholog_annotations( - ortholog_annotations: OrthologAnnotationDirFmt -) -> OrthologAnnotationDirFmt: - # Init output - collated_ortholog_annotations = OrthologAnnotationDirFmt() - - # Copy annotations into output - for anno in ortholog_annotations: - for fp in anno.path.iterdir(): - duplicate(fp, collated_ortholog_annotations.path / fp.name) - - return collated_ortholog_annotations diff --git a/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/a/a.annotations b/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/a/a.annotations deleted file mode 100644 index e69de29b..00000000 diff --git a/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/b/b.annotations b/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/b/b.annotations deleted file mode 100644 index e69de29b..00000000 diff --git 
a/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/c/c.annotations b/q2_types/feature_data_mag/tests/data/partitioned_ortholog_annotations/c/c.annotations deleted file mode 100644 index e69de29b..00000000 diff --git a/q2_types/feature_data_mag/tests/test_methods.py b/q2_types/feature_data_mag/tests/test_methods.py index babc9e7a..793d26f8 100644 --- a/q2_types/feature_data_mag/tests/test_methods.py +++ b/q2_types/feature_data_mag/tests/test_methods.py @@ -10,10 +10,9 @@ from qiime2.plugin.testing import TestPluginBase -from q2_types.feature_data_mag import MAGSequencesDirFmt, \ - OrthologAnnotationDirFmt +from q2_types.feature_data_mag import MAGSequencesDirFmt from q2_types.feature_data_mag._methods import partition_feature_data_mags, \ - collate_feature_data_mags, collate_ortholog_annotations + collate_feature_data_mags class TestFeatureDataMAGsPartitionCollating(TestPluginBase): @@ -65,21 +64,3 @@ def test_collate_feature_data_mags(self): ], dircmp.common ) - - def test_collate_ortholog_annotations(self): - p = self.get_data_path("partitioned_ortholog_annotations") - annotations = [ - OrthologAnnotationDirFmt(f"{p}/{letter}", mode="r") - for letter in ["a", "b", "c"] - ] - collated_annotations = collate_ortholog_annotations(annotations) - - # assert that all files are there - compare = filecmp.dircmp( - collated_annotations.path, - self.get_data_path("collated_ortholog_annotations") - ) - self.assertListEqual( - compare.common, - [f"{letter}.annotations" for letter in ["a", "b", "c"]] - ) \ No newline at end of file diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index 21a6ca76..0e86701c 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -16,10 +16,10 @@ import q2_types from q2_types import __version__ -from q2_types.feature_data_mag import MAG, NOG +from q2_types.feature_data_mag import MAG from q2_types.per_sample_sequences import MAGs from q2_types.feature_data import FeatureData -from 
q2_types.genome_data import BLAST6 +from q2_types.genome_data import Orthologs, NOG from q2_types.sample_data import SampleData @@ -56,9 +56,9 @@ plugin.methods.register_function( function=q2_types.genome_data.partition_orthologs, - inputs={"orthologs": SampleData[BLAST6]}, + inputs={"orthologs": SampleData[Orthologs]}, parameters={"num_partitions": Int % Range(1, None)}, - outputs={"partitioned_orthologs": Collection[SampleData[BLAST6]]}, + outputs={"partitioned_orthologs": Collection[SampleData[Orthologs]]}, input_descriptions={"orthologs": "The orthologs to partition."}, parameter_descriptions={ "num_partitions": "The number of partitions to split the MAGs" @@ -110,9 +110,9 @@ plugin.methods.register_function( function=q2_types.genome_data.collate_orthologs, - inputs={"orthologs": List[SampleData[BLAST6]]}, + inputs={"orthologs": List[SampleData[Orthologs]]}, parameters={}, - outputs={"collated_orthologs": SampleData[BLAST6]}, + outputs={"collated_orthologs": SampleData[Orthologs]}, input_descriptions={"orthologs": "Orthologs to collate"}, parameter_descriptions={}, name="Collate Orthologs", @@ -120,22 +120,6 @@ "and collates them into a single artifact.", ) -plugin.methods.register_function( - function=q2_types.feature_data_mag.collate_ortholog_annotations, - inputs={'ortholog_annotations': List[FeatureData[NOG]]}, - parameters={}, - outputs=[('collated_ortholog_annotations', FeatureData[NOG])], - input_descriptions={ - 'ortholog_annotations': "Collection of ortholog annotations." - }, - output_descriptions={ - 'collated_ortholog_annotations': "Collated ortholog annotations." 
- }, - name='Collate ortholog annotations.', - description="Takes a collection of FeatureData[NOG]'s " - "and collates them into a single artifact.", -) - importlib.import_module('q2_types.bowtie2._deferred_setup') importlib.import_module('q2_types.distance_matrix._deferred_setup') From de69df7de356dc3492ed80b4d71bb17d8035e669 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 2 Aug 2024 12:08:15 +0200 Subject: [PATCH 05/10] typos in plugin setup --- q2_types/plugin_setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index 0e86701c..f3bb0de2 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -76,7 +76,7 @@ parameters={}, outputs={"collated_mags": SampleData[MAGs]}, input_descriptions={"mags": "A collection of MAGs to be collated."}, - name="Collate mags", + name="Collate MAGs", description="Takes a collection of SampleData[MAGs]'s " "and collates them into a single artifact.", ) @@ -103,7 +103,7 @@ parameters={}, outputs={"collated_mags": FeatureData[MAG]}, input_descriptions={"mags": "A collection of MAGs to be collated."}, - name="Collate mags", + name="Collate MAGs", description="Takes a collection of FeatureData[MAG]'s " "and collates them into a single artifact.", ) @@ -115,7 +115,7 @@ outputs={"collated_orthologs": SampleData[Orthologs]}, input_descriptions={"orthologs": "Orthologs to collate"}, parameter_descriptions={}, - name="Collate Orthologs", + name="Collate orthologs", description="Takes a collection SampleData[BLAST6] artifacts " "and collates them into a single artifact.", ) From 2831f462e39167cc857276b81f47030a77a22eb8 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 2 Aug 2024 12:11:12 +0200 Subject: [PATCH 06/10] lint --- q2_types/_util.py | 2 +- q2_types/genome_data/_methods.py | 1 - q2_types/per_sample_sequences/__init__.py | 1 - q2_types/per_sample_sequences/_methods.py | 1 - q2_types/plugin_setup.py | 2 +- 
q2_types/tests/test_util.py | 2 +- 6 files changed, 3 insertions(+), 6 deletions(-) diff --git a/q2_types/_util.py b/q2_types/_util.py index 76cc7089..9f633b76 100644 --- a/q2_types/_util.py +++ b/q2_types/_util.py @@ -137,4 +137,4 @@ def _validate_mag_ids( "They must be unique in order to output all partitions " "correctly. Printing duplicate MAG IDs: " f"{set(duplicates)}" - ) \ No newline at end of file + ) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 6781b5f5..6535a13c 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -7,7 +7,6 @@ # ---------------------------------------------------------------------------- import glob import os -import shutil import warnings import numpy as np diff --git a/q2_types/per_sample_sequences/__init__.py b/q2_types/per_sample_sequences/__init__.py index a43d8c7d..428867a3 100644 --- a/q2_types/per_sample_sequences/__init__.py +++ b/q2_types/per_sample_sequences/__init__.py @@ -55,4 +55,3 @@ 'MultiFASTADirectoryFormat', 'AlignmentMap', 'MultiAlignmentMap', 'partition_sample_data_mags', 'collate_sample_data_mags' ] - diff --git a/q2_types/per_sample_sequences/_methods.py b/q2_types/per_sample_sequences/_methods.py index 7858279f..1195729b 100644 --- a/q2_types/per_sample_sequences/_methods.py +++ b/q2_types/per_sample_sequences/_methods.py @@ -87,4 +87,3 @@ def collate_sample_data_mags( shutil.copy(manifest, collated_mags.path / manifest.name) return collated_mags - diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index f3bb0de2..3039f23d 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -19,7 +19,7 @@ from q2_types.feature_data_mag import MAG from q2_types.per_sample_sequences import MAGs from q2_types.feature_data import FeatureData -from q2_types.genome_data import Orthologs, NOG +from q2_types.genome_data import Orthologs from q2_types.sample_data import SampleData diff --git a/q2_types/tests/test_util.py 
b/q2_types/tests/test_util.py index 835786fc..edaa929a 100644 --- a/q2_types/tests/test_util.py +++ b/q2_types/tests/test_util.py @@ -45,4 +45,4 @@ def test_validate_mag_ids_invalid(self): 6, 6, [(0, "a"), (0, "a"), (0, "c"), (0, "d"), (0, "e"), (0, "f")] - ) \ No newline at end of file + ) From b9992c1ae46a3ea829ec2ae9a445da8f8d74164b Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 2 Aug 2024 12:31:31 +0200 Subject: [PATCH 07/10] added package data --- .../a.annotations | 0 .../b.annotations | 0 .../c.annotations | 0 setup.py | 24 ++++++++++++++++--- 4 files changed, 21 insertions(+), 3 deletions(-) delete mode 100644 q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/a.annotations delete mode 100644 q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/b.annotations delete mode 100644 q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/c.annotations diff --git a/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/a.annotations b/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/a.annotations deleted file mode 100644 index e69de29b..00000000 diff --git a/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/b.annotations b/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/b.annotations deleted file mode 100644 index e69de29b..00000000 diff --git a/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/c.annotations b/q2_types/feature_data_mag/tests/data/collated_ortholog_annotations/c.annotations deleted file mode 100644 index e69de29b..00000000 diff --git a/setup.py b/setup.py index a25ecd20..8df56a30 100644 --- a/setup.py +++ b/setup.py @@ -60,12 +60,26 @@ 'data/bowtie/*/*', 'data/bowtie/*/*/*/*', 'data/bowtie/*/*/*', - 'data/error_correction_details/*'], + 'data/error_correction_details/*', + 'data/collated_mags/*', + 'data/collated_mags/sample1/*', + 'data/collated_mags/sample2/*', + 'data/partitioned_mags/*', + 
'data/partitioned_mags/mag1/sample1/*', + 'data/partitioned_mags/mag1/*', + 'data/partitioned_mags/mag2/sample2/*', + 'data/partitioned_mags/mag2/*'], 'q2_types.sample_data.tests': ['data/*'], 'q2_types.tree.tests': ['data/*'], 'q2_types.feature_data_mag.tests': - ['data/*', 'data/*/*', - 'data/mags-fa/*', 'data/mags-fasta/*'], + ['data/*', + 'data/*/*', + 'data/mags-fa/*', + 'data/mags-fasta/*', + 'data/collated_mags/*', + 'data/partitioned_mags/*', + 'data/partitioned_mags/mag1/*', + 'data/partitioned_mags/mag2/*'], 'q2_types.genome_data.tests': ['data/*', 'data/genes/*', @@ -78,6 +92,10 @@ 'data/ortholog-annotation-samples/*', 'data/ortholog-annotation/*', 'data/proteins/*', + 'data/collated_orthologs/*', + 'data/partitioned_orthologs/ortholog_1/*', + 'data/partitioned_orthologs/ortholog_2/*', + ], 'q2_types.kraken2.tests': [ 'data/*', From 213919917c81148f3f0583ac884b56d1e7250eda Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 3 Sep 2024 15:40:41 +0200 Subject: [PATCH 08/10] added collate_ortholog_annotations from moshpit --- q2_types/genome_data/__init__.py | 5 +++-- q2_types/genome_data/_methods.py | 16 +++++++++++++- .../a/a.annotations | 0 .../b/b.annotations | 0 .../c/c.annotations | 0 .../collated/a.annotations | 0 .../collated/b.annotations | 0 .../collated/c.annotations | 0 q2_types/genome_data/tests/test_methods.py | 21 ++++++++++++++++++- 9 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 q2_types/genome_data/tests/data/ortholog-annotations-collating/a/a.annotations create mode 100644 q2_types/genome_data/tests/data/ortholog-annotations-collating/b/b.annotations create mode 100644 q2_types/genome_data/tests/data/ortholog-annotations-collating/c/c.annotations create mode 100644 q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/a.annotations create mode 100644 q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/b.annotations create mode 100644 
q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/c.annotations diff --git a/q2_types/genome_data/__init__.py b/q2_types/genome_data/__init__.py index b883f355..1de55bd7 100644 --- a/q2_types/genome_data/__init__.py +++ b/q2_types/genome_data/__init__.py @@ -16,7 +16,8 @@ from ._types import ( GenomeData, Genes, Proteins, Loci, Orthologs, DNASequence, NOG ) -from ._methods import collate_orthologs, partition_orthologs +from ._methods import collate_orthologs, partition_orthologs, \ + collate_ortholog_annotations __all__ = [ 'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format', @@ -24,5 +25,5 @@ 'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs', 'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence', 'OrthologAnnotationDirFmt', 'NOG', - 'collate_orthologs', 'partition_orthologs', + 'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations" ] diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 6535a13c..99764ce2 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -12,7 +12,7 @@ import numpy as np from qiime2.util import duplicate -from q2_types.genome_data import SeedOrthologDirFmt +from q2_types.genome_data import SeedOrthologDirFmt, OrthologAnnotationDirFmt def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt: @@ -72,3 +72,17 @@ def partition_orthologs( partitioned_orthologs[i] = result return partitioned_orthologs + + +def collate_ortholog_annotations( + ortholog_annotations: OrthologAnnotationDirFmt +) -> OrthologAnnotationDirFmt: + # Init output + collated_annotations = OrthologAnnotationDirFmt() + + # Copy annotations into output + for anno in ortholog_annotations: + for fp in anno.path.iterdir(): + duplicate(fp, collated_annotations.path / fp.name) + + return collated_annotations diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/a/a.annotations 
b/q2_types/genome_data/tests/data/ortholog-annotations-collating/a/a.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/b/b.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/b/b.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/c/c.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/c/c.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/a.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/a.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/b.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/b.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/c.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/c.annotations new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index acdf5f22..2aff71ad 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -5,12 +5,13 @@ # # The full license is in the file LICENSE, distributed with this software. 
# ---------------------------------------------------------------------------- +import filecmp import os from qiime2.plugin.testing import TestPluginBase from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \ - partition_orthologs + partition_orthologs, OrthologAnnotationDirFmt, collate_ortholog_annotations class TestOrthologsPartitionCollating(TestPluginBase): @@ -52,3 +53,21 @@ def test_partition_orthologs_warning_message(self): UserWarning, "You have requested a number of.*5.*2.*2" ): partition_orthologs(orthologs, 5) + + def test_collate_ortholog_annotations(self): + p = self.get_data_path("ortholog-annotations-collating") + annotations = [ + OrthologAnnotationDirFmt(f"{p}/{letter}", mode="r") + for letter in ["a", "b", "c"] + ] + collated_annotations = collate_ortholog_annotations(annotations) + + # assert that all files are there + compare = filecmp.dircmp( + collated_annotations.path, + self.get_data_path("ortholog-annotations-collating/collated") + ) + self.assertListEqual( + compare.common, + [f"{letter}.annotations" for letter in ["a", "b", "c"]] + ) From 6af9325573d5c021c6308ccc27b9105c9d2ab7cb Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 3 Sep 2024 16:53:38 +0200 Subject: [PATCH 09/10] added registration for collate_ortholog_annotations --- q2_types/plugin_setup.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index 3039f23d..ed59706c 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -19,7 +19,7 @@ from q2_types.feature_data_mag import MAG from q2_types.per_sample_sequences import MAGs from q2_types.feature_data import FeatureData -from q2_types.genome_data import Orthologs +from q2_types.genome_data import Orthologs, GenomeData, NOG from q2_types.sample_data import SampleData @@ -120,6 +120,21 @@ "and collates them into a single artifact.", ) +plugin.methods.register_function( + 
function=q2_types.genome_data.collate_ortholog_annotations, + inputs={'ortholog_annotations': List[GenomeData[NOG]]}, + parameters={}, + outputs=[('collated_annotations', GenomeData[NOG])], + input_descriptions={ + 'ortholog_annotations': "Collection of ortholog annotations." + }, + output_descriptions={ + 'collated_annotations': "Collated ortholog annotations." + }, + name='Collate ortholog annotations.', + description="Takes a collection of GenomeData[NOG]'s " + "and collates them into a single artifact.", +) importlib.import_module('q2_types.bowtie2._deferred_setup') importlib.import_module('q2_types.distance_matrix._deferred_setup') From bb1745668bf94021a8bb12de31b99e0c676c5359 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 3 Sep 2024 17:32:25 +0200 Subject: [PATCH 10/10] added package data for ortholog-annotations-collating --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8df56a30..a043d374 100644 --- a/setup.py +++ b/setup.py @@ -95,7 +95,10 @@ 'data/collated_orthologs/*', 'data/partitioned_orthologs/ortholog_1/*', 'data/partitioned_orthologs/ortholog_2/*', - + 'data/ortholog-annotations-collating/a/*', + 'data/ortholog-annotations-collating/b/*', + 'data/ortholog-annotations-collating/c/*', + 'data/ortholog-annotations-collating/collated/*', ], 'q2_types.kraken2.tests': [ 'data/*',