|
5 | 5 | #
|
6 | 6 | # The full license is in the file LICENSE, distributed with this software.
|
7 | 7 | # ----------------------------------------------------------------------------
|
| 8 | +from collections import defaultdict |
8 | 9 |
|
9 | 10 | import pandas as pd
|
10 | 11 | from pandas.core.dtypes.common import is_string_dtype
|
@@ -67,7 +68,104 @@ def _validate_(self, level):
|
67 | 68 | )
|
68 | 69 |
|
69 | 70 |
|
70 |
| -class Kraken2ReportDirectoryFormat(model.DirectoryFormat): |
class FileDictMixin:
    """
    Mixin that adds a ``file_dict`` lookup to a directory format.

    ``file_dict`` iterates ``self.path``, so the class this is mixed into
    must provide a ``path`` attribute that supports ``iterdir()``.
    """

    def file_dict(self, relative=False, suffixes=None):
        """
        Map the IDs of the files in this directory to their filepaths.

        For per sample directories it returns a mapping of sample id to
        another dictionary where keys represent the file name and values
        correspond to the filepath for each file.
        For files, it returns a mapping of file name to filepath for each
        file. The specified suffixes are removed from filenames.

        Parameters
        ----------
        relative : bool
            Whether to return filepaths relative to the directory's location.
            Returns absolute filepaths by default.
        suffixes : List
            A list of suffixes that should be removed from the filenames to
            generate the ID.

        Returns
        -------
        dict
            Mapping of filename -> filepath as described above.
            Or mapping of sample id -> dict {filename: filepath} as
            described above.
            Both levels of the dictionary are sorted alphabetically by key.
            Subdirectories that contain no entries are left out.
        """
        ids = defaultdict(dict)
        for entry in self.path.iterdir():
            if entry.is_dir():
                # The subdirectory name serves as the outer (sample) ID.
                outer_id = entry.name
                for path in entry.iterdir():
                    file_path, inner_id = _create_path(
                        path=path,
                        relative=relative,
                        dir_format=self,
                        suffixes=suffixes
                    )

                    ids[outer_id][inner_id] = str(file_path)

                # Sort once per subdirectory, after all of its files are
                # collected, instead of re-sorting after every file.
                # ``get`` does not trigger the default factory, so an empty
                # subdirectory still produces no entry.
                if isinstance(ids.get(outer_id), dict):
                    ids[outer_id] = dict(sorted(ids[outer_id].items()))
            else:
                file_path, inner_id = _create_path(
                    path=entry,
                    relative=relative,
                    dir_format=self,
                    suffixes=suffixes
                )

                ids[inner_id] = str(file_path)

        # Return a plain dict (not the defaultdict) ordered by key.
        return dict(sorted(ids.items()))
| 123 | + |
| 124 | + |
| 125 | +def _create_path(path, relative, dir_format, suffixes): |
| 126 | + """ |
| 127 | + This function processes the input file path to generate an absolute or |
| 128 | + relative path string and the ID derived from the file name. The ID is |
| 129 | + extracted by removing the one of the specified suffixes from the file |
| 130 | + name. If no suffixes are specified the ID is defined to be the filename. |
| 131 | +
|
| 132 | + Parameters: |
| 133 | + --------- |
| 134 | + path : Path |
| 135 | + A Path object representing the file path to process. |
| 136 | + relative : bool |
| 137 | + A flag indicating whether the returned path should be relative |
| 138 | + to the directory formats path or absolute. |
| 139 | + dir_format : DirectoryFormat. |
| 140 | + Any object of class DirectoryFormat. |
| 141 | +
|
| 142 | + Returns: |
| 143 | + ------- |
| 144 | + path_dict : str |
| 145 | + The full relative or absolut path to the file. |
| 146 | + _id : str |
| 147 | + The ID derived from the file name. ID will be "" if the filename |
| 148 | + consists only of the suffix. |
| 149 | + """ |
| 150 | + file_name = path.stem |
| 151 | + |
| 152 | + _id = file_name |
| 153 | + |
| 154 | + if suffixes: |
| 155 | + for suffix in suffixes: |
| 156 | + if file_name.endswith(suffix[1:]): |
| 157 | + _id = file_name[:-len(suffix)] |
| 158 | + break |
| 159 | + |
| 160 | + path_dict = ( |
| 161 | + path.absolute().relative_to(dir_format.path.absolute()) |
| 162 | + if relative |
| 163 | + else path.absolute() |
| 164 | + ) |
| 165 | + return str(path_dict), _id |
| 166 | + |
| 167 | + |
| 168 | +class Kraken2ReportDirectoryFormat(model.DirectoryFormat, FileDictMixin): |
71 | 169 | reports = model.FileCollection(
|
72 | 170 | r'.+report\.(txt|tsv)$', format=Kraken2ReportFormat
|
73 | 171 | )
|
|
0 commit comments