Skip to content

Commit

Permalink
ENH: Adds new format GenomeDataDirectoryFormat with genome_dict f…
Browse files Browse the repository at this point in the history
…unction (#345)
  • Loading branch information
VinzentRisch authored Oct 8, 2024
1 parent 08bae72 commit 4b9509a
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 6 deletions.
3 changes: 2 additions & 1 deletion q2_types/genome_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat,
GFF3Format, OrthologFileFmt, SeedOrthologDirFmt,
GenomeSequencesDirectoryFormat, OrthologAnnotationDirFmt,
GenomeDataDirectoryFormat,
)
from ._objects import IntervalMetadataIterator
from ._types import (
Expand All @@ -24,6 +25,6 @@
'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
'OrthologAnnotationDirFmt', 'NOG',
'OrthologAnnotationDirFmt', 'NOG', 'GenomeDataDirectoryFormat',
'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations"
]
54 changes: 50 additions & 4 deletions q2_types/genome_data/_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import re
from collections import defaultdict

import qiime2.plugin.model as model
from qiime2.plugin import ValidationError
Expand All @@ -18,7 +19,52 @@ def _validate_(self, level):
pass


class GenesDirectoryFormat(model.DirectoryFormat):
class GenomeDataDirectoryFormat(model.DirectoryFormat):
    def genome_dict(self, relative=False):
        """
        Map the contents of this directory to their filepaths.

        Every top-level file contributes one ``stem -> filepath`` entry,
        where the stem is the file name without its final suffix. Every
        top-level subdirectory is treated as a sample: its name becomes a
        key whose value is a ``stem -> filepath`` dictionary built from the
        entries found directly inside that subdirectory.

        Parameters
        ----------
        relative : bool
            When True, filepaths are expressed relative to this directory.
            Absolute filepaths are returned by default.

        Returns
        -------
        dict
            ``{stem: filepath}`` for top-level files and
            ``{sample_id: {stem: filepath}}`` for subdirectories.
            Filepaths are strings and both levels of the dictionary are
            sorted by key.
        """
        def _locate(item):
            # Render one directory entry as a string path, optionally made
            # relative to the root of this directory format.
            located = item.absolute()
            if relative:
                located = located.relative_to(self.path.absolute())
            return str(located)

        mapping = defaultdict(dict)
        for entry in self.path.iterdir():
            if not entry.is_dir():
                # Plain file at the top level: keyed directly by its stem.
                mapping[entry.stem] = _locate(entry)
                continue

            # Subdirectory: collect its entries under the directory name,
            # then store them back ordered by stem.
            bucket = mapping[entry.name]
            for child in entry.iterdir():
                bucket[child.stem] = _locate(child)
            mapping[entry.name] = dict(sorted(bucket.items()))

        return dict(sorted(mapping.items()))


class GenesDirectoryFormat(GenomeDataDirectoryFormat):
genes = model.FileCollection(r'.+\.(fa|fna|fasta)$',
format=DNAFASTAFormat)

Expand All @@ -27,7 +73,7 @@ def genes_path_maker(self, genome_id):
return '%s.fasta' % genome_id


class ProteinsDirectoryFormat(model.DirectoryFormat):
class ProteinsDirectoryFormat(GenomeDataDirectoryFormat):
proteins = model.FileCollection(r'.+\.(fa|faa|fasta)$',
format=ProteinFASTAFormat)

Expand Down Expand Up @@ -159,7 +205,7 @@ def _validate_(self, level):
f'{line_number}') from e


class LociDirectoryFormat(model.DirectoryFormat):
class LociDirectoryFormat(GenomeDataDirectoryFormat):
loci = model.FileCollection(r'.+\.gff$',
format=GFF3Format)

Expand All @@ -168,7 +214,7 @@ def loci_path_maker(self, genome_id):
return '%s.gff' % genome_id


class GenomeSequencesDirectoryFormat(model.DirectoryFormat):
class GenomeSequencesDirectoryFormat(GenomeDataDirectoryFormat):
genomes = model.FileCollection(r'.+\.(fasta|fa)$', format=DNAFASTAFormat)

@genomes.set_path_maker
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>gene1 some_description1
GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT
ATAAACAGTGCTGGAGGC
>gene2 some_description2
CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGA
CGGGGGGCCTTGG
>gene3 some_description3
GCACCCGGCCAATTTTTGTGTTTTTAGTAGAGAAGATTCCCCCTAGACCCGCCCGCTATAGTGAACACCTAAGAA
CTGGAGG
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>gene11 some_description11
ATGGTCAGGCATGCCCCTCCTCATCGCTGGGCGGCAGATTCCCCCTAGACCCGCCCGCACCACAGCCCAGAGGGT
ATAAACAGTGCTGGAGGC
>gene12 some_description12
AATACCATATAGTGAACACCTAACCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAGA
CGGGGGGCCTTGG
48 changes: 47 additions & 1 deletion q2_types/genome_data/tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import unittest
from pathlib import Path

from qiime2.core.exceptions import ValidationError
from qiime2.plugin.testing import TestPluginBase
Expand All @@ -15,6 +15,7 @@
GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format,
LociDirectoryFormat, SeedOrthologDirFmt, OrthologFileFmt,
OrthologAnnotationDirFmt, GenomeSequencesDirectoryFormat,
GenomeDataDirectoryFormat
)


Expand Down Expand Up @@ -179,6 +180,51 @@ def test_ortholog_annotations_annot_dict(self):
}
self.assertDictEqual(obs, exp)

def test_genome_data_dirfmt_samples_genome_dict(self):
    """genome_dict nests file stems under sample ids for per-sample data."""
    fmt = GenomeDataDirectoryFormat(
        self.get_data_path('genes_samples'), mode='r'
    )

    # Expected locations relative to the format directory, keyed by
    # sample id and then by file stem.
    relative_exp = {
        'sample1': {'genes1': 'sample1/genes1.fa'},
        'sample2': {'genes2': 'sample2/genes2.fa'},
    }
    # The default (non-relative) result is the same layout anchored at
    # the path of the format under test.
    anchored_exp = {
        sample: {stem: str(fmt.path / rel) for stem, rel in files.items()}
        for sample, files in relative_exp.items()
    }

    self.assertDictEqual(fmt.genome_dict(), anchored_exp)
    self.assertDictEqual(fmt.genome_dict(relative=True), relative_exp)

def test_genes_dirfmt_genome_dict(self):
    """genome_dict maps each file stem to its path for a flat layout."""
    fmt = GenomeDataDirectoryFormat(self.get_data_path('genes'), mode='r')

    # Expected locations relative to the format directory, keyed by stem.
    relative_exp = {
        'genes1': 'genes1.fa',
        'genes2': 'genes2.fa',
    }
    # The default (non-relative) result anchors the same files at the
    # path of the format under test.
    anchored_exp = {
        stem: str(fmt.path / name) for stem, name in relative_exp.items()
    }

    self.assertDictEqual(fmt.genome_dict(), anchored_exp)
    self.assertDictEqual(fmt.genome_dict(relative=True), relative_exp)


if __name__ == '__main__':
unittest.main()
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
'q2_types.genome_data.tests':
['data/*',
'data/genes/*',
'data/genes_samples/sample1/*',
'data/genes_samples/sample2/*',
'data/loci-invalid/*',
'data/loci/*',
'data/genome-sequences/*',
Expand Down

0 comments on commit 4b9509a

Please sign in to comment.