qiime2 · ebolyen · Sep 5, 2024 · May 28, 2024 · May 29, 2024 · May 30, 2024
diff --git a/q2_types/_util.py b/q2_types/_util.py
@@ -7,6 +7,8 @@
 # ----------------------------------------------------------------------------
 import gzip
 import itertools
+import warnings
+from typing import List
 
 import skbio
 import pandas as pd
@@ -93,3 +95,46 @@ def _validate_(self, level):
 
         record_count_map = {'min': 5, 'max': None}
         self._check_n_records(record_count_map[level])
+
+
+def _validate_num_partitions(
+        num_samples: int, num_partitions: int, sample_type: str = "sample"
+) -> int:
+    """
+    Resolves the number of partitions to use for `num_samples` items.
+
+    `num_partitions` may be None, in which case one partition per item is
+    used. A request that exceeds `num_samples` is reduced to `num_samples`
+    and a warning is emitted; `sample_type` is the singular name of the item
+    kind and is only used in that warning text.
+
+    Returns the number of partitions to use.
+
+    Raises a ValueError if `num_partitions` is given but smaller than 1.
+    """
+    if num_partitions is None:
+        return num_samples
+
+    # Fail early with a clear message: a non-positive value can never be a
+    # valid number of partitions and would otherwise only surface later, in
+    # whatever code performs the actual split.
+    if num_partitions < 1:
+        raise ValueError(
+            "The number of partitions must be a positive integer, "
+            f"got '{num_partitions}'."
+        )
+
+    if num_partitions > num_samples:
+        warnings.warn(
+            "You have requested a number of partitions "
+            f"'{num_partitions}' that is greater than your number "
+            f"of {sample_type}s '{num_samples}.' Your data will be "
+            f"partitioned by {sample_type} into '{num_samples}' "
+            "partitions."
+        )
+        return num_samples
+
+    return num_partitions
+
+
+def _validate_mag_ids(
+    num_partitions: int, num_mags: int, mags_all: List[tuple]
+):
+    """
+    Raises a ValueError if MAG IDs are duplicated while there is exactly one
+    partition per MAG.
+
+    Each entry of `mags_all` is expected to hold the MAG ID at index 1.
+    Nothing is checked (and None is returned) when
+    num_partitions != num_mags.
+    """
+    # If num_partitions == num_mags and MAG ids are not unique
+    # the output will be missing these duplicated-id MAGs.
+    # While this is technically impossible since
+    # MAGs should have unique IDs by construction, it could still happen that
+    # a user imports MAGs with non-unique IDs. In such a case this check is
+    # useful.
+    if num_partitions != num_mags:
+        return
+
+    # Single pass with a seen-set instead of calling list.count() per ID.
+    seen = set()
+    duplicates = set()
+    for mag in mags_all:
+        mag_id = mag[1]
+        if mag_id in seen:
+            duplicates.add(mag_id)
+        else:
+            seen.add(mag_id)
+
+    if duplicates:
+        raise ValueError(
+            "MAG IDs are not unique. "
+            "They must be unique in order to output all partitions "
+            "correctly. Printing duplicate MAG IDs: "
+            f"{duplicates}"
+        )
diff --git a/q2_types/feature_data_mag/__init__.py b/q2_types/feature_data_mag/__init__.py
@@ -10,5 +10,7 @@
 
 from ._types import MAG, Contig
 from ._objects import MAGIterator
+from ._methods import partition_feature_data_mags, collate_feature_data_mags
 
-__all__ = ['MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'Contig']
+__all__ = ['MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'Contig',
+           'partition_feature_data_mags', 'collate_feature_data_mags']
diff --git a/q2_types/feature_data_mag/_methods.py b/q2_types/feature_data_mag/_methods.py
@@ -0,0 +1,63 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import os
+
+import numpy as np
+from qiime2.util import duplicate
+
+from q2_types._util import _validate_num_partitions, _validate_mag_ids
+from q2_types.feature_data_mag import MAGSequencesDirFmt
+
+
+def partition_feature_data_mags(
+    mags: MAGSequencesDirFmt, num_partitions: int = None
+) -> MAGSequencesDirFmt:
+    """
+    Splits the MAGs into `num_partitions` new MAGSequencesDirFmt objects and
+    returns them in a dictionary. The keys are the MAG ids when there is one
+    partition per MAG, otherwise they are 1-based partition indices.
+    """
+    # One (filepath, id) tuple per MAG
+    mags_all = [
+        (mag_fp, mag_id) for mag_id, mag_fp in mags.feature_dict().items()
+    ]
+
+    # Settle on the final number of partitions and check the MAG ids
+    num_mags = len(mags_all)
+    num_partitions = _validate_num_partitions(num_mags, num_partitions, "MAG")
+    _validate_mag_ids(num_partitions, num_mags, mags_all)
+
+    one_mag_per_partition = num_partitions == num_mags
+
+    partitioned_mags = {}
+    chunks = np.array_split(mags_all, num_partitions)
+    for index, chunk in enumerate(chunks, 1):
+        result = MAGSequencesDirFmt()
+
+        for mag_fp, mag_id in chunk:
+            duplicate(mag_fp, result.path / os.path.basename(mag_fp))
+
+        # With one MAG per partition the inner loop ran exactly once, so
+        # mag_id still names that single MAG and serves as the key. In all
+        # other cases a chunk may hold several MAGs and the index is used.
+        key = mag_id if one_mag_per_partition else index
+        partitioned_mags[key] = result
+
+    return partitioned_mags
+
+
+def collate_feature_data_mags(mags: MAGSequencesDirFmt) -> MAGSequencesDirFmt:
+    """
+    Gathers the files of every given MAG directory in one new
+    MAGSequencesDirFmt (using qiime2's `duplicate`) and returns it.
+    """
+    collated_mags = MAGSequencesDirFmt()
+    destination = collated_mags.path
+
+    for partition in mags:
+        for source_fp in partition.path.iterdir():
+            duplicate(source_fp, destination / source_fp.name)
+
+    return collated_mags
diff --git a/...ypes/feature_data_mag/tests/data/collated_mags/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta b/...ypes/feature_data_mag/tests/data/collated_mags/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta
@@ -0,0 +1,4 @@
+>NZ_00000000.1_contig1
+ATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATG
+>NZ_00000000.1_contig2
+TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGA
diff --git a/...ypes/feature_data_mag/tests/data/collated_mags/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta b/...ypes/feature_data_mag/tests/data/collated_mags/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta
@@ -0,0 +1,4 @@
+>NZ_CP018863.1_contig1
+GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG
+>NZ_CP018863.1_contig2
+CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC
diff --git a/...ture_data_mag/tests/data/partitioned_mags/mag1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta b/...ture_data_mag/tests/data/partitioned_mags/mag1/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta
@@ -0,0 +1,4 @@
+>NZ_00000000.1_contig1
+ATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATG
+>NZ_00000000.1_contig2
+TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGA
diff --git a/...ture_data_mag/tests/data/partitioned_mags/mag2/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta b/...ture_data_mag/tests/data/partitioned_mags/mag2/fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta
@@ -0,0 +1,4 @@
+>NZ_CP018863.1_contig1
+GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG
+>NZ_CP018863.1_contig2
+CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC
diff --git a/q2_types/feature_data_mag/tests/test_methods.py b/q2_types/feature_data_mag/tests/test_methods.py
@@ -0,0 +1,66 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import filecmp
+from unittest.mock import patch
+
+from qiime2.plugin.testing import TestPluginBase
+
+from q2_types.feature_data_mag import MAGSequencesDirFmt
+from q2_types.feature_data_mag._methods import partition_feature_data_mags, \
+    collate_feature_data_mags
+
+
+class TestFeatureDataMAGsPartitionCollating(TestPluginBase):
+    """Tests for partitioning and collating MAG sequence directories."""
+    package = "q2_types.feature_data_mag.tests"
+
+    # The helpers are imported by name into `_methods`, so they must be
+    # patched in that namespace. Patching `q2_types._util` would leave the
+    # references used by `partition_feature_data_mags` untouched.
+    @patch("q2_types.feature_data_mag._methods._validate_mag_ids")
+    @patch("q2_types.feature_data_mag._methods._validate_num_partitions")
+    def test_partition_feature_data_mags(
+        self,
+        mock_validate_num_partitions,
+        mock_validate_mag_ids
+    ):
+        # Partition Feature Data
+        p = self.get_data_path("collated_mags")
+        mags = MAGSequencesDirFmt(path=p, mode="r")
+        mock_validate_num_partitions.return_value = 2
+        partitioned_mags = partition_feature_data_mags(mags)
+
+        # Both validation helpers must have been reached through the mocks
+        mock_validate_num_partitions.assert_called_once()
+        mock_validate_mag_ids.assert_called_once()
+
+        # Expected mag ids
+        mag_ids = [
+            "24dee6fe-9b84-45bb-8145-de7b092533a1",
+            "fb0bc871-04f6-486b-a10e-8e0cb66f8de3"
+        ]
+
+        # compare partitions: each one shares only its own MAG file with the
+        # input directory
+        for mag_id in mag_ids:
+            dircmp = filecmp.dircmp(
+                partitioned_mags[mag_id].path, mags.path
+            )
+            self.assertListEqual([f"{mag_id}.fasta"], dircmp.common)
+
+    def test_collate_feature_data_mags(self):
+        # collate test data
+        p1 = self.get_data_path("partitioned_mags/mag1")
+        p2 = self.get_data_path("partitioned_mags/mag2")
+        mags = [
+            MAGSequencesDirFmt(p1, mode="r"),
+            MAGSequencesDirFmt(p2, mode="r")
+        ]
+        collated_mags = collate_feature_data_mags(mags)
+
+        # compare directories
+        expected = self.get_data_path("collated_mags")
+        dircmp = filecmp.dircmp(collated_mags.path, expected)
+        self.assertListEqual(
+            [
+                "24dee6fe-9b84-45bb-8145-de7b092533a1.fasta",
+                "fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta"
+            ],
+            dircmp.common
+        )
diff --git a/q2_types/genome_data/__init__.py b/q2_types/genome_data/__init__.py
@@ -16,11 +16,14 @@
 from ._types import (
     GenomeData, Genes, Proteins, Loci, Orthologs, DNASequence, NOG
 )
+from ._methods import collate_orthologs, partition_orthologs, \
+    collate_ortholog_annotations
 
 __all__ = [
     'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format',
     'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
     'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
     'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
-    'OrthologAnnotationDirFmt', 'NOG'
+    'OrthologAnnotationDirFmt', 'NOG',
+    'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations"
     ]
diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py
@@ -0,0 +1,88 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import glob
+import os
+import warnings
+
+import numpy as np
+from qiime2.util import duplicate
+
+from q2_types.genome_data import SeedOrthologDirFmt, OrthologAnnotationDirFmt
+
+
+def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt:
+    """
+    Gathers the files of every given ortholog directory in one new
+    SeedOrthologDirFmt (using qiime2's `duplicate`) and returns it.
+    """
+    result = SeedOrthologDirFmt()
+    destination = result.path
+
+    for partition in orthologs:
+        for source_fp in partition.path.iterdir():
+            duplicate(source_fp, destination / source_fp.name)
+
+    return result
+
+
+def partition_orthologs(
+        orthologs: SeedOrthologDirFmt, num_partitions: int = None
+) -> SeedOrthologDirFmt:
+    """
+    Returns a dictionary where each key is either the sample_id or an index,
+    and values are the new objects with the orthologs.
+
+    If `num_partitions` is None, or greater than the number of samples
+    (a warning is emitted in that case), one partition per sample is created
+    and keyed by sample_id. Otherwise partitions are keyed by a 1-based
+    index.
+    """
+    partitioned_orthologs = {}
+
+    # TODO: this logic should move to the format itself
+    # glob returns paths in arbitrary order; sort them so that the content
+    # of index-keyed partitions does not depend on the filesystem.
+    ortholog_fps = sorted(
+        glob.glob(os.path.join(str(orthologs), "*.seed_orthologs"))
+    )
+    samples_all = [
+        (os.path.basename(fp).replace(".emapper.seed_orthologs", ""), fp)
+        for fp in ortholog_fps
+    ]
+
+    num_samples = len(samples_all)
+    if num_partitions is None:
+        num_partitions = num_samples
+    elif num_partitions > num_samples:
+        warnings.warn(
+            "You have requested a number of partitions"
+            f" '{num_partitions}' that is greater than your number"
+            f" of samples '{num_samples}.' Your data will be"
+            f" partitioned by sample into '{num_samples}'"
+            " partitions."
+        )
+        num_partitions = num_samples
+
+    partitions = np.array_split(samples_all, num_partitions)
+    for i, samples in enumerate(partitions, 1):
+        result = SeedOrthologDirFmt()
+
+        for sample_id, sample_fp in samples:
+            duplicate(sample_fp, result.path / os.path.basename(sample_fp))
+
+        # If num_partitions == num_samples we will only have gone through one
+        # sample in the above loop and will use its id as a key. Otherwise we
+        # may have gone through multiple samples in the above loop and will be
+        # using indices for keys
+        if num_partitions == num_samples:
+            partitioned_orthologs[sample_id] = result
+        else:
+            partitioned_orthologs[i] = result
+
+    return partitioned_orthologs
+
+
+def collate_ortholog_annotations(
+    ortholog_annotations: OrthologAnnotationDirFmt
+) -> OrthologAnnotationDirFmt:
+    """
+    Gathers the files of every given annotation directory in one new
+    OrthologAnnotationDirFmt (using qiime2's `duplicate`) and returns it.
+    """
+    # Output directory that receives all annotation files
+    collated_annotations = OrthologAnnotationDirFmt()
+    destination = collated_annotations.path
+
+    # Duplicate each annotation file under its original file name
+    for partition in ortholog_annotations:
+        for source_fp in partition.path.iterdir():
+            duplicate(source_fp, destination / source_fp.name)
+
+    return collated_annotations
diff --git a/...tholog/test_sample.emapper.seed_orthologs → ...llated_orthologs/1.emapper.seed_orthologs b/...tholog/test_sample.emapper.seed_orthologs → ...llated_orthologs/1.emapper.seed_orthologs
diff --git a/q2_types/genome_data/tests/data/collated_orthologs/2.emapper.seed_orthologs b/q2_types/genome_data/tests/data/collated_orthologs/2.emapper.seed_orthologs
@@ -0,0 +1,3 @@
+0_0	316407.85674276	0.0	1597.0	1	2460	1	820	100.0	48.8	100.0
+0_2	316407.85674277	4.42e-217	629.0	1	930	1	310	100.0	18.5	100.0
+0_1	316407.21321894	2.29e-303	857.0	1	1284	1	428	100.0	25.5	100.0
diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/a/a.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/a/a.annotations
diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/b/b.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/b/b.annotations
diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/c/c.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/c/c.annotations
diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/a.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/a.annotations
diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/b.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/b.annotations
diff --git a/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/c.annotations b/q2_types/genome_data/tests/data/ortholog-annotations-collating/collated/c.annotations
diff --git a/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_1/1.emapper.seed_orthologs b/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_1/1.emapper.seed_orthologs
@@ -0,0 +1,3 @@
+0_0	316407.85674276	0.0	1597.0	1	2460	1	820	100.0	48.8	100.0
+0_2	316407.85674277	4.42e-217	629.0	1	930	1	310	100.0	18.5	100.0
+0_1	316407.21321894	2.29e-303	857.0	1	1284	1	428	100.0	25.5	100.0
diff --git a/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_2/2.emapper.seed_orthologs b/q2_types/genome_data/tests/data/partitioned_orthologs/ortholog_2/2.emapper.seed_orthologs
@@ -0,0 +1,3 @@
+0_0	316407.85674276	0.0	1597.0	1	2460	1	820	100.0	48.8	100.0
+0_2	316407.85674277	4.42e-217	629.0	1	930	1	310	100.0	18.5	100.0
+0_1	316407.21321894	2.29e-303	857.0	1	1284	1	428	100.0	25.5	100.0