Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Migrated partition and collating actions from q2-moshpit to q2-types #334

Merged
merged 12 commits into from
Sep 5, 2024
Merged
45 changes: 45 additions & 0 deletions q2_types/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
# ----------------------------------------------------------------------------
import gzip
import itertools
import warnings
from typing import List

import skbio
import pandas as pd
Expand Down Expand Up @@ -93,3 +95,46 @@ def _validate_(self, level):

record_count_map = {'min': 5, 'max': None}
self._check_n_records(record_count_map[level])


def _validate_num_partitions(
    num_samples: int, num_partitions: int, sample_type: str = "sample"
) -> int:
    """
    Resolve the number of partitions to actually use.

    ``None`` means "one partition per item", so ``num_samples`` is returned.
    A request larger than ``num_samples`` is capped at ``num_samples`` and a
    warning is emitted. Any other value is returned unchanged.

    ``sample_type`` is only used to word the warning message (e.g. "sample"
    or "MAG").
    """
    # No explicit request: partition by individual item.
    if num_partitions is None:
        return num_samples

    # The request fits within the available items: honour it as-is.
    if num_partitions <= num_samples:
        return num_partitions

    # More partitions requested than items available: cap and tell the user.
    message = (
        "You have requested a number of partitions "
        f"'{num_partitions}' that is greater than your number "
        f"of {sample_type}s '{num_samples}.' Your data will be "
        f"partitioned by {sample_type} into '{num_samples}' "
        "partitions."
    )
    warnings.warn(message)
    return num_samples


def _validate_mag_ids(
    num_partitions: int, num_mags: int, mags_all: List[tuple]
):
    """
    Ensure MAG IDs are unique when every MAG gets its own partition.

    The check only runs when ``num_partitions == num_mags``. Each entry of
    ``mags_all`` is indexed at position 1 to obtain its MAG ID.

    Raises ``ValueError`` listing the duplicated IDs if any ID occurs more
    than once. Returns ``None`` otherwise.
    """
    # If num_partitions == num_mags and MAG ids are not unique
    # the output will be missing these duplicated-id MAGs.
    # While this is technically impossible since
    # MAGs should have unique IDs by construction, it could still happen that
    # a user imports MAGs with non-unique IDs. In such case this test would be
    # useful.
    if num_partitions != num_mags:
        return

    # Single pass with sets instead of calling list.count() per element,
    # which was quadratic in the number of MAGs.
    seen = set()
    duplicates = set()
    for mag in mags_all:
        mag_id = mag[1]
        if mag_id in seen:
            duplicates.add(mag_id)
        else:
            seen.add(mag_id)

    if duplicates:
        raise ValueError(
            "MAG IDs are not unique. "
            "They must be unique in order to output all partitions "
            "correctly. Printing duplicate MAG IDs: "
            f"{duplicates}"
        )
4 changes: 3 additions & 1 deletion q2_types/feature_data_mag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,7 @@

from ._types import MAG, Contig
from ._objects import MAGIterator
from ._methods import partition_feature_data_mags, collate_feature_data_mags

__all__ = ['MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'Contig']
__all__ = ['MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'Contig',
'partition_feature_data_mags', 'collate_feature_data_mags']
63 changes: 63 additions & 0 deletions q2_types/feature_data_mag/_methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os

import numpy as np
from qiime2.util import duplicate

from q2_types._util import _validate_num_partitions, _validate_mag_ids
from q2_types.feature_data_mag import MAGSequencesDirFmt


def partition_feature_data_mags(
    mags: MAGSequencesDirFmt, num_partitions: int = None
) -> MAGSequencesDirFmt:
    """
    Split the MAGs of ``mags`` into ``num_partitions`` directory formats.

    Returns a dictionary whose values are new ``MAGSequencesDirFmt`` objects.
    Keys are MAG IDs when there is one partition per MAG, and 1-based
    partition indices otherwise.
    """
    # One (filepath, id) tuple per MAG
    mag_entries = [
        (fp, identifier) for identifier, fp in mags.feature_dict().items()
    ]
    total_mags = len(mag_entries)

    # Resolve the effective partition count, then make sure IDs can safely
    # be used as keys
    num_partitions = _validate_num_partitions(
        total_mags, num_partitions, "MAG"
    )
    _validate_mag_ids(num_partitions, total_mags, mag_entries)

    # With one partition per MAG each chunk below holds exactly one MAG, so
    # its ID is used as the key. Otherwise a chunk may hold several MAGs and
    # the 1-based chunk index is used instead.
    one_mag_per_partition = num_partitions == total_mags

    partitions = {}
    chunks = np.array_split(mag_entries, num_partitions)
    for index, chunk in enumerate(chunks, 1):
        partition = MAGSequencesDirFmt()

        for mag_fp, mag_id in chunk:
            duplicate(mag_fp, partition.path / os.path.basename(mag_fp))

        partitions[mag_id if one_mag_per_partition else index] = partition

    return partitions


def collate_feature_data_mags(mags: MAGSequencesDirFmt) -> MAGSequencesDirFmt:
    """
    Gather the files of several MAG partitions into one directory format.

    Every entry found directly in each partition's directory is duplicated
    into a new ``MAGSequencesDirFmt`` under its original file name.
    """
    collated_mags = MAGSequencesDirFmt()
    destination = collated_mags.path

    for partition in mags:
        for source_fp in partition.path.iterdir():
            duplicate(source_fp, destination / source_fp.name)

    return collated_mags
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>NZ_00000000.1_contig1
ATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATG
>NZ_00000000.1_contig2
TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGA
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>NZ_CP018863.1_contig1
GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG
>NZ_CP018863.1_contig2
CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>NZ_00000000.1_contig1
ATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATGTTTCCAGATGCAATGCGTGGGCACTGATG
>NZ_00000000.1_contig2
TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGTGA
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>NZ_CP018863.1_contig1
GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTGCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGCGTCGGGGTGGAGGTAGACGATTCCCGGCAGAACCACTGGTGGTCCGGCCCGGACCGGTCAAGGCCTTCGACGTCACCGTCGAACAGGACCTTTCCAATGCCGGCCCCTTCCTCGCGGCAGCCCTGGCCACGAAGGGAACGGTTCGGATCCCAGGCTGGCCGGAGAAAACCACGCAGGTAGGTGACAAATGGCGCAGCATCCTGGCGCAACTCGGCGCCACTGTCAGCTACGAGAACGGCACCCTCACCGTAACCGGCGGGGCAGAGATCACCGGGGCGCAGCTCGCCGACACCAGCGAACTTGCCCCCACCACGGCGGCGCTCTGTGCCCTGGCCGGCAGCGAATCCAGGCTCACCGGAATTGCCCACTTGCGGGGACACGAAACAGACCGGCTGGCGGCTCTGGTCGCGGAAATCAATGCCTTGGGTGGCGACGCCGAAGAAACCGAAGACGGGTTGATCATCCGTCCGAGGCCACTGCATGGCGGGGTCTTCCATTCATATGAGGACCACAGGATGGCCACCGCCGGAGCCATTATCGGGCTGGCAGTCGAGGGCGTGGAAGTCGAAGACATCGGCACCACGGCCAAGACCATGCCCGAGTTCCCGCGGCTATGGCAGGACCTGTTCGAGACTTCCGTCCGCCAGTCCGAGGCGGGAGCGCTCTAAGGTGGTGCGCGGCAACCGTACGTGGGACGAGTCCGATGTCCGCATCCGTCCCAACAAGCGCGGCTCGCGTCCGCGTACCAAGGAACGGCCTGCCCACGAAGACGCCGTCATCGGGCGGATCATCACCGTGGACCGCGGCCGCTACACCGCGGTCGTCGATGAAGACACTGCCCGGGAACGGGTGGTTGTCGCCGCCCGTG
>NZ_CP018863.1_contig2
CCCGGGAGCTTCGTCGCAGTCCGGTGGTGGCCGGCGACTTCGTAGCGCTCGTCGGTGACATTACCGGTGAGCCGGATACGTTGGCCCGGCTGGTCCGGATTGAGGAACGCCGGACGCTGTTGCGCCGCAGCGCCGACGATACAGATCCCGTGGAGCGGGTAGTCGTCGCCAATGCAGACCAGCTGGTCATCGTCGTGGCCGCCGCAAACCCCGAGCCGCGCACCGGTTTCATCGACCGCGCCTTGGTAGCGGCGTACGACGCCGGTATCAGCCCGCTGCTGTGCGTCACCAAAGCGGACGTCAAGGATCCCGAAGAACTGCTCTCCAACTACCGGCACCTGGACCTGCCCGTGATCGTCAGCCGGACGGCCGGCACGGAGGGCTCCGGGGTGGATGCACGGTCCGCCGACGGGCTGTCTGCCCGTCTCGACCGTGACGCCGTAGCGGCGCTCCGTGGCTATCTGGATGGGATGGTCAGCGTCATGCTCGGCCATTCGGGCGTGGGCAAGTCCACCATGGTCAATGCCCTCACGGGGGCGGAGCGCGCCACGGGGGGAGTCAACGCGGTGACCGGGCGGGGCCGGCATACCTCCTCCTCGGCGCTGGCCCTGAAGCTGGCCGACGCTCCGGCTGGCAGCTGGATCATCGACACGCCCGGCATCCGTTCTTTTGGACTGGCCCACGTGGACCCGGACCGGATCATTTCCGCTTTTCCCGATTTGGAGCCCGGGACGGCGGACTGCGAGCGGGGCTGCAAGCACGACGACCATGCCGTCAACTGCGGCGTGGACGCCTGGGTGGCCTCCGGGCAGGCCGGCGAATCCGGCCCGGCACGGCTGGCCTCGCTGCGCCGTTTGCTGGGAACGGAAGAACGCGCCCAGGCGAAGGAACTCGGGTTCCAGTAGCACCGCCGTCGTCGGTCAGGGACTTCACATCCCGCATCCGGCCGCCAAATAAGGATAAGTTGAAGCCTATGACCCGTGACGTTCAAAGCTATAAC
66 changes: 66 additions & 0 deletions q2_types/feature_data_mag/tests/test_methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import filecmp
from unittest.mock import patch

from qiime2.plugin.testing import TestPluginBase

from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.feature_data_mag._methods import partition_feature_data_mags, \
collate_feature_data_mags


class TestFeatureDataMAGsPartitionCollating(TestPluginBase):
    """Tests for the partition and collate methods of MAG feature data."""
    package = "q2_types.feature_data_mag.tests"

    # Patch the validators where they are looked up, i.e. in the ``_methods``
    # module namespace. ``_methods`` binds them with
    # ``from q2_types._util import ...``, so patching ``q2_types._util``
    # would leave the real functions in place and the mocks unused.
    @patch("q2_types.feature_data_mag._methods._validate_mag_ids")
    @patch("q2_types.feature_data_mag._methods._validate_num_partitions")
    def test_partition_feature_data_mags(
        self,
        mock_validate_num_partitions,
        mock_validate_mag_ids
    ):
        # Partition Feature Data
        p = self.get_data_path("collated_mags")
        mags = MAGSequencesDirFmt(path=p, mode="r")
        mock_validate_num_partitions.return_value = 2
        partitioned_mags = partition_feature_data_mags(mags)

        # Both validators must have been reached through the mocks
        mock_validate_num_partitions.assert_called_once()
        mock_validate_mag_ids.assert_called_once()

        # Expected mag ids
        mag_ids = [
            "24dee6fe-9b84-45bb-8145-de7b092533a1",
            "fb0bc871-04f6-486b-a10e-8e0cb66f8de3"
        ]

        # compare partitions: each one must share exactly its own MAG file
        # with the input directory
        for i in [0, 1]:
            dircmp = filecmp.dircmp(
                partitioned_mags[mag_ids[i]].path, mags.path
            )
            self.assertListEqual([f"{mag_ids[i]}.fasta"], dircmp.common)

    def test_collate_feature_data_mags(self):
        # collate test data
        p1 = self.get_data_path("partitioned_mags/mag1")
        p2 = self.get_data_path("partitioned_mags/mag2")
        mags = [
            MAGSequencesDirFmt(p1, mode="r"),
            MAGSequencesDirFmt(p2, mode="r")
        ]
        collated_mags = collate_feature_data_mags(mags)

        # compare directories
        expected = self.get_data_path("collated_mags")
        dircmp = filecmp.dircmp(collated_mags.path, expected)
        self.assertListEqual(
            [
                "24dee6fe-9b84-45bb-8145-de7b092533a1.fasta",
                "fb0bc871-04f6-486b-a10e-8e0cb66f8de3.fasta"
            ],
            dircmp.common
        )
5 changes: 4 additions & 1 deletion q2_types/genome_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,14 @@
from ._types import (
GenomeData, Genes, Proteins, Loci, Orthologs, DNASequence, NOG
)
from ._methods import collate_orthologs, partition_orthologs, \
collate_ortholog_annotations

__all__ = [
'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format',
'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
'OrthologAnnotationDirFmt', 'NOG'
'OrthologAnnotationDirFmt', 'NOG',
'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations"
]
88 changes: 88 additions & 0 deletions q2_types/genome_data/_methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import glob
import os
import warnings

import numpy as np
from qiime2.util import duplicate

from q2_types.genome_data import SeedOrthologDirFmt, OrthologAnnotationDirFmt


def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt:
    """
    Gather the files of several seed-ortholog partitions into one format.

    Every entry found directly in each partition's directory is duplicated
    into a new ``SeedOrthologDirFmt`` under its original base name.
    """
    collated = SeedOrthologDirFmt()
    destination = collated.path

    for partition in orthologs:
        for source_fp in partition.path.iterdir():
            target_fp = destination / os.path.basename(source_fp)
            duplicate(source_fp, target_fp)

    return collated


def partition_orthologs(
    orthologs: SeedOrthologDirFmt, num_partitions: int = None
) -> SeedOrthologDirFmt:
    """
    Split the seed ortholog files of ``orthologs`` into ``num_partitions``
    directory formats.

    Sample IDs are the base names of the ``*.seed_orthologs`` files with the
    ``.emapper.seed_orthologs`` suffix removed. If ``num_partitions`` is
    ``None`` or exceeds the number of samples, one partition per sample is
    created (with a warning in the latter case).

    Returns a dictionary whose values are new ``SeedOrthologDirFmt`` objects.
    Keys are sample IDs when there is one partition per sample, and 1-based
    partition indices otherwise.
    """
    # Shared with the MAG partitioning action; imported here so the function
    # does not rely on a module-level import being present.
    from q2_types._util import _validate_num_partitions

    partitioned_orthologs = {}

    # TODO: this logic should move to the format itself
    orthologs = glob.glob(os.path.join(str(orthologs), "*.seed_orthologs"))
    names = [
        os.path.basename(x).replace(".emapper.seed_orthologs", "")
        for x in orthologs
    ]
    orthologs = list(zip(names, orthologs))

    # Resolve the effective partition count with the shared validator rather
    # than repeating its None / too-many-partitions handling here. The
    # default sample_type ("sample") yields the same warning text as before.
    num_samples = len(orthologs)
    num_partitions = _validate_num_partitions(num_samples, num_partitions)

    orthologs = np.array_split(orthologs, num_partitions)
    for i, samples in enumerate(orthologs, 1):
        result = SeedOrthologDirFmt()

        for sample_id, sample_fp in samples:
            duplicate(sample_fp, result.path / os.path.basename(sample_fp))

        # If num_partitions == num_samples we will only have gone through one
        # sample in the above loop and will use its id as a key. Otherwise we
        # may have gone through multiple samples in the above loop and will be
        # using indices for keys
        if num_partitions == num_samples:
            partitioned_orthologs[sample_id] = result
        else:
            partitioned_orthologs[i] = result

    return partitioned_orthologs


def collate_ortholog_annotations(
    ortholog_annotations: OrthologAnnotationDirFmt
) -> OrthologAnnotationDirFmt:
    """
    Gather the files of several ortholog-annotation partitions into one
    format.

    Every entry found directly in each partition's directory is duplicated
    into a new ``OrthologAnnotationDirFmt`` under its original file name.
    """
    collated_annotations = OrthologAnnotationDirFmt()
    destination = collated_annotations.path

    for partition in ortholog_annotations:
        for source_fp in partition.path.iterdir():
            duplicate(source_fp, destination / source_fp.name)

    return collated_annotations
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
0_0 316407.85674276 0.0 1597.0 1 2460 1 820 100.0 48.8 100.0
0_2 316407.85674277 4.42e-217 629.0 1 930 1 310 100.0 18.5 100.0
0_1 316407.21321894 2.29e-303 857.0 1 1284 1 428 100.0 25.5 100.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
0_0 316407.85674276 0.0 1597.0 1 2460 1 820 100.0 48.8 100.0
0_2 316407.85674277 4.42e-217 629.0 1 930 1 310 100.0 18.5 100.0
0_1 316407.21321894 2.29e-303 857.0 1 1284 1 428 100.0 25.5 100.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
0_0 316407.85674276 0.0 1597.0 1 2460 1 820 100.0 48.8 100.0
0_2 316407.85674277 4.42e-217 629.0 1 930 1 310 100.0 18.5 100.0
0_1 316407.21321894 2.29e-303 857.0 1 1284 1 428 100.0 25.5 100.0
Loading
Loading