Skip to content

Commit 5dcff1f

Browse files
authored
ENH: Adds FileDictMixin (#347)
1 parent e732a7d commit 5dcff1f

File tree

15 files changed

+344
-116
lines changed

15 files changed

+344
-116
lines changed

q2_types/_util.py

+95
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
# ----------------------------------------------------------------------------
88
import gzip
99
import itertools
10+
import re
1011
import warnings
12+
from collections import defaultdict
1113
from typing import List
1214

1315
import skbio
@@ -138,3 +140,96 @@ def _validate_mag_ids(
138140
"correctly. Printing duplicate MAG IDs: "
139141
f"{set(duplicates)}"
140142
)
143+
144+
145+
class FileDictMixin:
    """
    Mixin adding file-lookup helpers to a directory format.

    The class using this mixin is expected to provide:

    - ``path``: the directory to scan (used via ``iterdir()`` and
      ``absolute()``, i.e. a ``pathlib.Path``-like object),
    - ``pathspec``: a regular expression string that file names must match,
    - ``suffixes`` (optional): an iterable of strings that are stripped
      from the end of a file's stem to obtain its ID.
    """

    def file_dict(self, relative=False):
        """
        Collect the files matching the pathspec into a dictionary.

        For per sample directories it returns a mapping of sample id to
        another dictionary where keys represent the file ID and values
        correspond to the filepath for each file matching the pathspec.
        For files located directly in the directory, it returns a mapping
        of file ID to filepath for each file matching the pathspec.
        The file ID is the file name without its extension; if the dir
        format has the attribute 'suffixes', then these are removed from
        the file ID as well.

        Subdirectories which do not contain any matching file are not
        included in the result.

        Parameters
        ----------
        relative : bool
            Whether to return filepaths relative to the directory's location.
            Returns absolute filepaths by default.

        Returns
        -------
        dict
            Mapping of file ID -> filepath as described above.
            Or mapping of sample id -> dict {file ID: filepath} as
            described above.
            Both levels of the dictionary are sorted alphabetically by key.
        """
        file_pattern = re.compile(self.pathspec)
        ids = {}

        for entry in self.path.iterdir():
            if entry.is_dir():
                # Gather all matches of this subdirectory first, so that
                # the inner mapping only has to be sorted once.
                inner = {}
                for path in entry.iterdir():
                    if file_pattern.match(path.name):
                        file_path, inner_id = self._process_path(
                            path=path,
                            relative=relative,
                        )
                        inner[inner_id] = file_path

                if inner:
                    ids[entry.name] = dict(sorted(inner.items()))
            elif file_pattern.match(entry.name):
                file_path, inner_id = self._process_path(
                    path=entry,
                    relative=relative,
                )
                ids[inner_id] = file_path

        return dict(sorted(ids.items()))

    def _process_path(self, path, relative=False):
        """
        Derive the path string and the ID for a single file.

        The ID is the file name without its last extension. If the class
        has a 'suffixes' attribute, the first suffix in it which the
        extension-less file name ends with is removed from the ID as well.
        If the class does not have a suffixes attribute, or none of the
        suffixes matches, the ID is the extension-less file name.

        Parameters
        ----------
        path : Path
            A Path object representing the file path to process.
        relative : bool
            A flag indicating whether the returned path should be relative
            to the directory format's path or absolute.

        Returns
        -------
        processed_path : str
            The relative or absolute path to the file.
        _id : str
            The ID derived from the file name.
        """
        file_name = path.stem
        _id = file_name

        for suffix in getattr(self, "suffixes", None) or ():
            # An empty suffix must be skipped: file_name[:-0] would
            # evaluate to '' and wipe out the whole ID.
            if suffix and file_name.endswith(suffix):
                _id = file_name[:-len(suffix)]
                break

        absolute_path = path.absolute()
        processed_path = (
            absolute_path.relative_to(self.path.absolute())
            if relative
            else absolute_path
        )
        return str(processed_path), _id

q2_types/genome_data/__init__.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat,
1212
GFF3Format, OrthologFileFmt, SeedOrthologDirFmt,
1313
GenomeSequencesDirectoryFormat, OrthologAnnotationDirFmt,
14-
GenomeDataDirectoryFormat,
1514
)
1615
from ._objects import IntervalMetadataIterator
1716
from ._types import (
@@ -25,6 +24,6 @@
2524
'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
2625
'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
2726
'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
28-
'OrthologAnnotationDirFmt', 'NOG', 'GenomeDataDirectoryFormat',
27+
'OrthologAnnotationDirFmt', 'NOG',
2928
'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations"
3029
]

q2_types/genome_data/_formats.py

+13-57
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
88
import re
9-
from collections import defaultdict
109

1110
import qiime2.plugin.model as model
1211
from qiime2.plugin import ValidationError
1312

13+
from q2_types._util import FileDictMixin
1414
from q2_types.feature_data import DNAFASTAFormat, ProteinFASTAFormat
1515

1616

@@ -19,63 +19,18 @@ def _validate_(self, level):
1919
pass
2020

2121

22-
class GenomeDataDirectoryFormat(model.DirectoryFormat):
23-
def genome_dict(self, relative=False):
24-
"""
25-
For per sample directories it returns a mapping of sample id to
26-
another dictionary where keys represent the file name and values
27-
correspond to the filepath for each file.
28-
For files, it returns a mapping of file name to filepath for each file.
29-
30-
Parameters
31-
---------
32-
relative : bool
33-
Whether to return filepaths relative to the directory's location.
34-
Returns absolute filepaths by default.
35-
36-
Returns
37-
-------
38-
dict
39-
Mapping of filename -> filepath as described above.
40-
Or mapping of sample id -> dict {filename: filepath} as
41-
described above.
42-
Both levels of the dictionary are sorted alphabetically by key.
43-
"""
44-
ids = defaultdict(dict)
45-
for entry in self.path.iterdir():
46-
if entry.is_dir():
47-
sample_id = entry.name
48-
for path in entry.iterdir():
49-
file_name = path.stem
50-
file_path = (
51-
path.absolute().relative_to(self.path.absolute())
52-
if relative else path.absolute()
53-
)
54-
ids[sample_id][file_name] = str(file_path)
55-
ids[sample_id] = dict(sorted(ids[sample_id].items()))
56-
else:
57-
file_name = entry.stem
58-
file_path = (
59-
entry.absolute().relative_to(self.path.absolute())
60-
if relative else entry.absolute()
61-
)
62-
ids[file_name] = str(file_path)
63-
64-
return dict(sorted(ids.items()))
65-
66-
67-
class GenesDirectoryFormat(GenomeDataDirectoryFormat):
68-
genes = model.FileCollection(r'.+\.(fa|fna|fasta)$',
69-
format=DNAFASTAFormat)
22+
class GenesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
23+
pathspec = r'.+\.(fa|fna|fasta)$'
24+
genes = model.FileCollection(pathspec, format=DNAFASTAFormat)
7025

7126
@genes.set_path_maker
7227
def genes_path_maker(self, genome_id):
7328
return '%s.fasta' % genome_id
7429

7530

76-
class ProteinsDirectoryFormat(GenomeDataDirectoryFormat):
77-
proteins = model.FileCollection(r'.+\.(fa|faa|fasta)$',
78-
format=ProteinFASTAFormat)
31+
class ProteinsDirectoryFormat(model.DirectoryFormat, FileDictMixin):
32+
pathspec = r'.+\.(fa|faa|fasta)$'
33+
proteins = model.FileCollection(pathspec, format=ProteinFASTAFormat)
7934

8035
@proteins.set_path_maker
8136
def proteins_path_maker(self, genome_id):
@@ -205,17 +160,18 @@ def _validate_(self, level):
205160
f'{line_number}') from e
206161

207162

208-
class LociDirectoryFormat(GenomeDataDirectoryFormat):
209-
loci = model.FileCollection(r'.+\.gff$',
210-
format=GFF3Format)
163+
class LociDirectoryFormat(model.DirectoryFormat, FileDictMixin):
164+
pathspec = r'.+\.gff$'
165+
loci = model.FileCollection(pathspec, format=GFF3Format)
211166

212167
@loci.set_path_maker
213168
def loci_path_maker(self, genome_id):
214169
return '%s.gff' % genome_id
215170

216171

217-
class GenomeSequencesDirectoryFormat(GenomeDataDirectoryFormat):
218-
genomes = model.FileCollection(r'.+\.(fasta|fa)$', format=DNAFASTAFormat)
172+
class GenomeSequencesDirectoryFormat(model.DirectoryFormat, FileDictMixin):
173+
pathspec = r'.+\.(fasta|fa)$'
174+
genomes = model.FileCollection(pathspec, format=DNAFASTAFormat)
219175

220176
@genomes.set_path_maker
221177
def genomes_path_maker(self, genome_id):

q2_types/genome_data/tests/test_formats.py

-47
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
88
import unittest
9-
from pathlib import Path
109

1110
from qiime2.core.exceptions import ValidationError
1211
from qiime2.plugin.testing import TestPluginBase
@@ -15,7 +14,6 @@
1514
GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format,
1615
LociDirectoryFormat, SeedOrthologDirFmt, OrthologFileFmt,
1716
OrthologAnnotationDirFmt, GenomeSequencesDirectoryFormat,
18-
GenomeDataDirectoryFormat
1917
)
2018

2119

@@ -180,51 +178,6 @@ def test_ortholog_annotations_annot_dict(self):
180178
}
181179
self.assertDictEqual(obs, exp)
182180

183-
def test_genome_data_dirfmt_samples_genome_dict(self):
184-
genes = GenomeDataDirectoryFormat(
185-
self.get_data_path('genes_samples'), mode='r')
186-
187-
obs = genes.genome_dict()
188-
exp = {
189-
'sample1': {
190-
'genes1': str(Path(genes.path / 'sample1/genes1.fa')),
191-
},
192-
'sample2': {
193-
'genes2': str(Path(genes.path / 'sample2/genes2.fa')),
194-
},
195-
}
196-
self.assertDictEqual(obs, exp)
197-
198-
obs = genes.genome_dict(relative=True)
199-
exp = {
200-
'sample1': {
201-
'genes1': 'sample1/genes1.fa',
202-
},
203-
'sample2': {
204-
'genes2': 'sample2/genes2.fa',
205-
},
206-
}
207-
self.assertDictEqual(obs, exp)
208-
209-
def test_genes_dirfmt_genome_dict(self):
210-
genes = (
211-
GenomeDataDirectoryFormat(self.get_data_path('genes'), mode='r')
212-
)
213-
214-
obs = genes.genome_dict()
215-
exp = {
216-
'genes1': str(Path(genes.path / 'genes1.fa')),
217-
'genes2': str(Path(genes.path / 'genes2.fa'))
218-
}
219-
self.assertDictEqual(obs, exp)
220-
221-
obs = genes.genome_dict(relative=True)
222-
exp = {
223-
'genes1': 'genes1.fa',
224-
'genes2': 'genes2.fa'
225-
}
226-
self.assertDictEqual(obs, exp)
227-
228181

229182
if __name__ == '__main__':
230183
unittest.main()

q2_types/kraken2/_formats.py

+10-9
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@
55
#
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
8-
98
import pandas as pd
109
from pandas.core.dtypes.common import is_string_dtype
1110
from qiime2.plugin import model, ValidationError
1211

12+
from q2_types._util import FileDictMixin
13+
1314

1415
class Kraken2ReportFormat(model.TextFileFormat):
1516
MEASURE_COLUMNS = {
@@ -67,10 +68,10 @@ def _validate_(self, level):
6768
)
6869

6970

70-
class Kraken2ReportDirectoryFormat(model.DirectoryFormat):
71-
reports = model.FileCollection(
72-
r'.+report\.(txt|tsv)$', format=Kraken2ReportFormat
73-
)
71+
class Kraken2ReportDirectoryFormat(model.DirectoryFormat, FileDictMixin):
72+
pathspec = r'.+report\.(txt|tsv)$'
73+
suffixes = ['.report']
74+
reports = model.FileCollection(pathspec, format=Kraken2ReportFormat)
7475

7576
@reports.set_path_maker
7677
def reports_path_maker(self, sample_id, mag_id=None):
@@ -146,10 +147,10 @@ def _validate_(self, level):
146147
)
147148

148149

149-
class Kraken2OutputDirectoryFormat(model.DirectoryFormat):
150-
reports = model.FileCollection(
151-
r'.+output\.(txt|tsv)$', format=Kraken2OutputFormat
152-
)
150+
class Kraken2OutputDirectoryFormat(model.DirectoryFormat, FileDictMixin):
151+
pathspec = r'.+output\.(txt|tsv)$'
152+
suffixes = ['.output']
153+
reports = model.FileCollection(pathspec, format=Kraken2OutputFormat)
153154

154155
@reports.set_path_maker
155156
def reports_path_maker(self, sample_id, mag_id=None):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
C k119_33069 1912795 10855 1912795:Q
2+
C k119_55515 1583098 5698 1583098:Q
3+
C k119_66468 1323375 5173 1323375:Q
4+
C k119_33506 182217 17101 182217:Q
5+
C k119_22814 1472 19997 1472:Q
6+
C k119_23274 29388 23523 29388:Q
7+
C k119_45180 545501 25821 545501:Q
8+
C k119_34380 1218 4423 1218:Q
9+
C k119_1654 2518177 31450 2518177:Q
10+
C k119_45407 221027 2908 221027:Q
11+
C k119_12788 59919 2856 59919:Q
12+
U k119_34900 0 3045 0:Q
13+
C k119_45855 851 19053 851:Q
14+
C k119_90411 2647897 2589 2647897:Q
15+
C k119_57806 2653681 4515 2653681:Q
16+
C k119_58481 131567 19174 131567:Q
17+
C k119_47669 2682541 11848 2682541:Q
18+
C k119_59208 1977865 3665 1977865:Q
19+
C k119_16398 2770780 5030 2770780:Q
20+
C k119_60835 400634 2807 400634:Q
21+
C k119_49584 2490633 6493 2490633:Q
22+
C k119_28869 111780 8356 111780:Q
23+
C k119_94747 2305987 3774 2305987:Q
24+
C k119_40414 983544 27806 983544:Q
25+
C k119_73618 2563896 3473 2563896:Q
26+
C k119_84540 332101 3409 332101:Q
27+
C k119_73768 2593542 29942 2593542:Q
28+
C k119_41848 34105 8793 34105:Q
29+
C k119_43035 1301 4680 1301:Q
30+
C k119_65066 1547445 10430 1547445:Q
31+
C k119_10361 491950 68731 491950:Q
32+
C k119_10711 52959 8685 52959:Q

0 commit comments

Comments
 (0)