diff --git a/grabbit/core.py b/grabbit/core.py index 6bbe081..d8a5e49 100644 --- a/grabbit/core.py +++ b/grabbit/core.py @@ -8,10 +8,9 @@ from os.path import (join, basename, dirname, abspath, split, exists, isdir, relpath, isabs) from functools import partial -from copy import deepcopy +from copy import copy, deepcopy import warnings from keyword import iskeyword -from itertools import chain __all__ = ['File', 'Entity', 'Layout'] @@ -56,7 +55,7 @@ def _matches(self, entities=None, extensions=None, domains=None, if isinstance(extensions, six.string_types): extensions = [extensions] extensions = '(' + '|'.join(extensions) + ')$' - if re.search(extensions, self.path) is None: + if re.search(extensions, self.filename) is None: return False if domains is not None: @@ -147,7 +146,7 @@ def copy(self, path_patterns, symbolic_link=False, root=None, class Domain(object): - def __init__(self, name, config): + def __init__(self, config): """ A set of rules that applies to one or more directories within a Layout. @@ -156,11 +155,9 @@ def __init__(self, name, config): name (str): The name of the Domain. config (dict): The configuration dictionary that defines the entities and paths for the current domain. - root (str, list): The root directory or directories to which the - Domain's rules applies. Can be either a single path, or a list. """ - self.name = name + self.name = config['name'] self.config = config self.entities = {} self.files = [] @@ -303,26 +300,9 @@ def count(self, files=False): return len(self.files) if files else len(self.unique()) -class LayoutMetaclass(type): - ''' Metaclass for Layout; used to enable merging of multiple Layouts into - a single Layout when a list of paths is passed as input. - ''' - def __call__(cls, path, *args, **kwargs): - - paths = listify(path) - if len(paths) == 1: - return super(LayoutMetaclass, cls).__call__(paths[0], *args, - **kwargs) - layouts = [] - for p in paths: - layout = super(LayoutMetaclass, cls).__call__(p, *args, **kwargs) - layouts.append(layout) - return merge_layouts(layouts) - - -class Layout(six.with_metaclass(LayoutMetaclass, object)): +class Layout(object): - def __init__(self, root=None, config=None, index=None, + def __init__(self, paths, root=None, index=None, dynamic_getters=False, absolute_paths=True, regex_search=False, entity_mapper=None, path_patterns=None, config_filename='layout.json', include=None, exclude=None): @@ -330,21 +310,23 @@ def __init__(self, root=None, config=None, index=None, A container for all the files and metadata found at the specified path. Args: - root (str): Directory that all other paths will be relative to. - Every other path the Layout sees must be at this level or below. - domains (str, list, dict): A specification of the configuration - object(s) defining domains to use in the Layout. Can be one of: - - - A dictionary containing config information - - A string giving the path to a JSON file containing the config - - A string giving the path to a directory containing a - configuration file with the name defined in config_filename - - A tuple with two elements, where the first element is one of - the above (i.e., dict or string), and the second element is - an iterable of directories to apply the config to. - - A list, where each element is any of the above (dict, string, - or tuple). - + paths (str, list): The path(s) where project files are located. + Must be one of: + + - A path to a directory containing files to index + - A list of paths to directories to index + - A list of 2-tuples where each tuple encodes a mapping from + directories to domains. The first element is a string or + list giving the paths to one or more directories to index. + The second element specifies which domains to apply to the + specified files, and can be one of: + * A string giving the path to a JSON config file + * A dictionary containing config information + * A list of any combination of strings or dicts + + root (str): Optional directory that all other paths will be + relative to. If set, every other path the Layout sees must be + at this level or below. If None, filesystem root ('/') is used. index (str): Optional path to a saved index file. If a valid value is passed, this index is used to populate Files and Entities, and the normal indexing process (which requires scanning all @@ -407,66 +389,72 @@ def __init__(self, root=None, config=None, index=None, self.include = listify(include or []) self.exclude = listify(exclude or []) self.absolute_paths = absolute_paths - self.root = abspath(root) if absolute_paths else root - - if config is not None: - for c in listify(config): - if isinstance(c, tuple): - c, root = c - else: - root = None - self._load_domain(c, root, True) - - if index is None: - self.index() - else: - self.load_index(index) - - def _load_domain(self, config, root=None, from_init=False): - - if isinstance(config, six.string_types): + if root is None: + root = '/' + self.root = abspath(root) - if isdir(config): - config = join(config, self.config_filename) + self._domain_map = {} - if not exists(config): - raise ValueError("Config file '%s' cannot be found." % config) + # Extract path --> domain mapping + self._paths_to_index = {} - config_filename = config - config = json.load(open(config, 'r')) + def add_path(path, val): + path = abspath(path) + self._paths_to_index[path] = val - if root is None and not from_init: - root = dirname(abspath(config_filename)) - - if 'name' not in config: - raise ValueError("Config file missing 'name' attribute.") - - if config['name'] in self.domains: - raise ValueError("Config with name '%s' already exists in " - "Layout. Name of each config file must be " - "unique across entire Layout." % config['name']) + for p in listify(paths, ignore=list): + if isinstance(p, six.string_types): + add_path(p, []) + else: + doms = listify(p[1]) + doms = [self._get_or_load_domain(d) for d in doms] + for elem in listify(p[0]): + add_path(elem, doms) - if root is None and from_init: - # warnings.warn("No valid root directory found for domain '%s'. " - # "Falling back on root directory for Layout (%s)." - # % (config['name'], self.root)) - root = self.root + # Verify existence of all paths + for p in self._paths_to_index: + if not exists(p): + raise ValueError("Search path {} doesn't exist.".format(p)) - if config.get('root') in [None, '.']: - config['root'] = root + if index is None: + self.index() + else: + self.load_index(index) - for root in listify(config['root']): - if not exists(root): - raise ValueError("Root directory %s for domain %s does not " - "exist!" % (root, config['name'])) + def _get_or_load_domain(self, domain): + ''' Return a domain if one already exists, or create a new one if not. - # Load entities - domain = Domain(config['name'], config) - for e in config.get('entities', []): + Args: + domain (str, dict): Can be one of: + - The name of the Domain to return (fails if none exists) + - A path to the Domain configuration file + - A dictionary containing configuration information + ''' + if isinstance(domain, six.string_types): + if domain in self.domains: + return self.domains[domain] + elif exists(domain): + domain = json.load(open(domain, 'r')) + else: + raise ValueError("No domain could be found/loaded from input " + "'{}'; value must be either the name of an " + "existing Domain, or a valid path to a " + "configuration file.".format(domain)) + + # At this point, domain is a dict + name = domain['name'] + if name in self.domains: + msg = ("Domain with name '{}' already exists; returning existing " + "Domain configuration.".format(name)) + warnings.warn(msg) + return self.domains[name] + + entities = domain.get('entities', []) + domain = Domain(domain) + for e in entities: self.add_entity(domain=domain, **e) - - self.domains[domain.name] = domain - return domain + self.domains[name] = domain + return self.domains[name] def get_domain_entities(self, domains=None): # Get all Entities included in the specified Domains, in the same @@ -479,22 +467,19 @@ def get_domain_entities(self, domains=None): ents.update(self.domains[d].entities) return ents - def _check_inclusions(self, f, domains=None, fullpath=True): + def _check_inclusions(self, f, domains=None): ''' Check file or directory against regexes in config to determine if it should be included in the index ''' filename = f if isinstance(f, six.string_types) else f.path - if not fullpath: - filename = basename(filename) - if domains is None: - domains = list(self.domains.keys()) - - domains = [self.domains[dom] for dom in listify(domains)] + domains = list(self.domains.values()) # Inject the Layout at the first position for global include/exclude + domains = list(domains) domains.insert(0, self) + for dom in domains: # If file matches any include regex, then True if dom.include: @@ -524,10 +509,8 @@ def _validate_file(self, f): return True def _get_files(self, root): - ''' Returns all files in project (pre-filtering). Extend this in - subclasses as needed. ''' - results = [os.walk(r, topdown=True) for r in listify(root)] - return list(chain(*results)) + ''' Returns all files in directory (non-recursively). ''' + return os.listdir(root) def _make_file_object(self, root, f): ''' Initialize a new File oject from a directory and filename. Extend @@ -545,12 +528,8 @@ def _index_file(self, root, f, domains, update_layout=True): # Create the file object--allows for subclassing f = self._make_file_object(root, f) - for d in listify(domains): - if d not in self.domains: - raise ValueError("Cannot index file '%s' in domain '%s'; " - "no domain with that name exists." % - (f.path, d)) - domain = self.domains[d] + for domain in listify(domains): + domain = self.domains[domain] match_vals = {} for e in domain.entities.values(): m = e.match_file(f) @@ -591,62 +570,40 @@ def index(self): self._reset_index() - # Track all candidate files - files_to_index = defaultdict(set) - - # Track any additional config files we run into - extra_configs = [] - - def _index_domain_files(dom): - - doms_to_add = set(dom.config.get('domains', []) + [dom.name]) - - dataset = self._get_files(dom.config['root']) - - # Loop over all files in domain - for root, directories, filenames in dataset: - - def check_incl(f): - return self._check_inclusions(f, dom.name) - - # Exclude directories that match exclude regex - full_dirs = [join(root, d) for d in directories] - full_dirs = filter(check_incl, full_dirs) - full_dirs = filter(self._validate_dir, full_dirs) - directories[:] = [split(d)[1] for d in full_dirs] - - for f in filenames: - full_path = join(root, f) - # Add config file to tracking - if f == self.config_filename: - if full_path not in extra_configs: - extra_configs.append(full_path) - # Add file to the candidate index - elif (self._check_inclusions(full_path, dom.name) and - self._validate_file(full_path)): - # If the file is below the Layout root, use a relative - # path. Otherwise, use absolute path. - if full_path.startswith(self.root) and not \ - self.absolute_paths: - full_path = relpath(full_path, self.root) - files_to_index[full_path] |= doms_to_add - - for dom in self.domains.values(): - _index_domain_files(dom) - - # Set up any additional configs we found. Note that in edge cases, - # this approach has the potential to miss out on some configs, because - # it doesn't recurse. This will generally only happen under fairly - # weird circumstances though (e.g., the config file points to another - # root elsewhere in the filesystem, or there are inconsistent include/ - # exclude directives across nested configs), so this will do for now. - for dom in extra_configs: - dom = self._load_domain(dom) - _index_domain_files(dom) - - for filename, domains in files_to_index.items(): - _dir, _base = split(filename) - self._index_file(_dir, _base, list(domains)) + def _index_dir(dir_, domains): + + contents = [join(dir_, f) for f in self._get_files(dir_)] + + # Check for domain config file + config_file = join(dir_, self.config_filename) + + if exists(config_file): + new_dom = self._get_or_load_domain(config_file) + if new_dom not in domains: + domains.append(new_dom) + contents.remove(config_file) + + contents = filter(lambda x: self._check_inclusions(x, domains), + contents) + + for f in contents: + + full_path = join(dir_, f) + + if isdir(full_path): + # If the directory was explicitly passed in Layout init, + # overwrite the current set of domains with what was passed + domains = self._paths_to_index.get(full_path, domains) + _index_dir(full_path, list(domains)) + + elif self._validate_file(full_path): + _dir, _base = split(full_path) + dom_names = [d.name for d in domains] + self._index_file(_dir, _base, dom_names) + + # Index each directory + for path, domains in self._paths_to_index.items(): + _index_dir(path, list(domains)) def save_index(self, filename): ''' Save the current Layout's index to a .json file. @@ -765,17 +722,26 @@ def get(self, return_type='tuple', target=None, extensions=None, Returns: A named tuple (default) or a list (see return_type for details). """ + if regex_search is None: regex_search = self.regex_search result = [] filters = {} filters.update(kwargs) + for filename, file in self.files.items(): if not file._matches(filters, extensions, domains, regex_search): continue result.append(file) + # Convert to relative paths if needed + if not self.absolute_paths: + for i, f in enumerate(result): + f = copy(f) + f.path = relpath(f.path, self.root) + result[i] = f + if return_type == 'file': return natural_sort([f.path for f in result]) @@ -980,7 +946,11 @@ def build_path(self, source, path_patterns=None, strict=False): pattern in order to be a valid match. If False, extra entities will be ignored so long as all mandatory entities are found. ''' + if isinstance(source, six.string_types): + if source not in self.files: + source = join(self.root, source) + source = self.files[source] if isinstance(source, File): diff --git a/grabbit/extensions/writable.py b/grabbit/extensions/writable.py index 0b97aed..d45679a 100644 --- a/grabbit/extensions/writable.py +++ b/grabbit/extensions/writable.py @@ -130,7 +130,8 @@ def write_contents_to_file(path, contents=None, link_to=None, 'overwrite' overwrites the existing file; 'append' adds a suffix to each file copy, starting with 1. Default is 'fail'. """ - if not root and not isabs(path): + + if root is None and not isabs(path): root = os.getcwd() if root: diff --git a/grabbit/tests/test_core.py b/grabbit/tests/test_core.py index 269140d..61dcb0b 100644 --- a/grabbit/tests/test_core.py +++ b/grabbit/tests/test_core.py @@ -27,7 +27,7 @@ def bids_layout(request): # in this test.json 'subject' regex was left to contain possible # leading 0; the other fields (run, session) has leading 0 stripped config = join(DIRNAME, 'specs', 'test.json') - return Layout(root, config, regex_search=True) + return Layout([(root, config)], regex_search=True) else: hdfs = pytest.importorskip("hdfs") from grabbit.extensions import HDFSLayout @@ -36,21 +36,21 @@ def bids_layout(request): client.root), 'data', '7t_trt') config = psp.join('hdfs://localhost:9000{0}'.format( client.root), 'specs', 'test.json') - return HDFSLayout(root, config, regex_search=True) + return HDFSLayout([(root, config)], regex_search=True) @pytest.fixture(scope='module') def stamp_layout(): root = join(DIRNAME, 'data', 'valuable_stamps') config = join(DIRNAME, 'specs', 'stamps.json') - return Layout(root, config, config_filename='dir_config.json') + return Layout([(root, config)], config_filename='dir_config.json') @pytest.fixture(scope='module') def layout_include(request): root = join(DIRNAME, 'data', '7t_trt') config = join(DIRNAME, 'specs', 'test_include.json') - return Layout(root, config, regex_search=True) + return Layout([(root, config)], regex_search=True) class TestFile: @@ -145,18 +145,18 @@ def test_init(self, bids_layout): def test_init_with_include_arg(self, bids_layout): root = join(DIRNAME, 'data', '7t_trt') config = join(DIRNAME, 'specs', 'test.json') - layout = Layout(root, config, regex_search=True, include='sub-\d*') + layout = Layout([(root, config)], regex_search=True, include='sub-\d*') target = join(root, "dataset_description.json") assert target in bids_layout.files assert target not in layout.files assert join(root, "sub-01", "sub-01_sessions.tsv") in layout.files with pytest.raises(ValueError): - layout = Layout(root, config, include='sub-\d*', exclude="meh") + layout = Layout([(root, config)], include='sub-\d*', exclude="meh") def test_init_with_exclude_arg(self, bids_layout): root = join(DIRNAME, 'data', '7t_trt') config = join(DIRNAME, 'specs', 'test.json') - layout = Layout(root, config, regex_search=True, exclude='sub-\d*') + layout = Layout([(root, config)], regex_search=True, exclude='sub-\d*') target = join(root, "dataset_description.json") assert target in bids_layout.files assert target in layout.files @@ -171,21 +171,22 @@ def test_init_with_config_options(self): config1 = join(DIRNAME, 'specs', 'stamps.json') config2 = join(dir1, 'USA', 'dir_config.json') - # Fails because Domain usa_stamps is included twice - with pytest.raises(ValueError) as e: - layout = Layout(root, [config1, config2], exclude=['7t_trt'], - config_filename='dir_config.json') - assert e.value.message.startswith('Config with name') + # # Fails because Domain usa_stamps is included twice + # with pytest.raises(ValueError) as e: + # layout = Layout([(root, [config1, config2])], exclude=['7t_trt'], + # config_filename='dir_config.json') + # print(dir(e)) + # assert e.value.message.startswith('Config with name') # Test with two configs - layout = Layout(root, [config1, config2], exclude=['7t_trt']) + layout = Layout([(root, [config1, config2])], exclude=['7t_trt']) files = [f.filename for f in layout.files.values()] assert 'name=Inverted_Jenny#value=75000#country=USA.txt' in files assert 'name=5c_Francis_E_Willard#value=1dollar.txt' in files assert 'name=1_Lotus#value=1#country=Canada.txt' in files # Test with two configs and on-the-fly directory remapping - layout = Layout(dir1, [(config1, [dir1, dir2])], + layout = Layout([dir1, ([dir1, dir2], config1)], exclude=['USA/']) files = [f.filename for f in layout.files.values()] assert 'name=Inverted_Jenny#value=75000#country=USA.txt' in files @@ -199,13 +200,12 @@ def test_absolute_paths(self, bids_layout): root = os.path.relpath(root) config = join(DIRNAME, 'specs', 'test.json') - layout = Layout(root, config, absolute_paths=True) - + layout = Layout([(root, config)], absolute_paths=True) result = layout.get(subject=1, run=1, session=1) assert result assert all([os.path.isabs(f.filename) for f in result]) - layout = Layout(root, config, absolute_paths=False) + layout = Layout([(root, config)], absolute_paths=False) result = layout.get(subject=1, run=1, session=1) assert result assert not any([os.path.isabs(f.filename) for f in result]) @@ -217,13 +217,13 @@ def test_absolute_paths(self, bids_layout): config = psp.join('hdfs://localhost:9000{0}'.format( layout._hdfs_client.root), 'specs', 'test.json') - layout = Layout(root, config, absolute_paths=False) + layout = Layout([(root, config)], absolute_paths=False) result = layout.get(subject=1, run=1, session=1) assert result assert all([os.path.isabs(f.filename) for f in result]) - layout = Layout(root, config, absolute_paths=True) + layout = Layout([(root, config)], absolute_paths=True) result = layout.get(subject=1, run=1, session=1) assert result assert all([os.path.isabs(f.filename) for f in result]) @@ -240,7 +240,7 @@ def test_dynamic_getters(self, data_dir, config): if ('hdfs' in data_dir or 'hdfs' in config): pytest.importorskip('hdfs') - layout = Layout(data_dir, config, dynamic_getters=True) + layout = Layout([(data_dir, config)], dynamic_getters=True) assert hasattr(layout, 'get_subjects') assert '01' in getattr(layout, 'get_subjects')() @@ -319,11 +319,6 @@ def test_index_regex(self, bids_layout, layout_include): targ = join('models', 'excluded_model.json') assert targ not in domain.files - with pytest.raises(ValueError): - layout_include._load_domain({'entities': [], - 'index': {'include': 'test', - 'exclude': 'test'}}) - def test_save_index(self, bids_layout): tmp = tempfile.mkstemp(suffix='.json')[1] bids_layout.save_index(tmp) @@ -367,12 +362,12 @@ def hash_file(self, file): # Test with external mapper em = EntityMapper() - layout = Layout(root, config, regex_search=True, entity_mapper=em) + layout = Layout([(root, config)], regex_search=True, entity_mapper=em) f = list(layout.files.values())[20] assert hash(f.path) == f.entities['hash'] # Test with mapper set to self - layout = MappingLayout(root, config, regex_search=True, + layout = MappingLayout([(root, config)], regex_search=True, entity_mapper='self') f = list(layout.files.values())[10] assert str(hash(f.path)) + '.hsh' == f.entities['hash'] @@ -380,7 +375,7 @@ def hash_file(self, file): # Should fail if we use a spec with entities that have mappers but # don't specify an entity-mapping object with pytest.raises(ValueError): - layout = Layout(root, config, regex_search=True) + layout = Layout([(root, config)], regex_search=True) def test_clone(self, bids_layout): lc = bids_layout.clone() @@ -394,9 +389,7 @@ def test_clone(self, bids_layout): def test_excludes(self, tmpdir): root = tmpdir.mkdir("ohmyderivatives").mkdir("ds") config = join(DIRNAME, 'specs', 'test.json') - layout = Layout(str(root), config, regex_search=True) - assert layout._check_inclusions(str(root.join("ohmyimportantfile")), - fullpath=False) + layout = Layout([(str(root), config)], regex_search=True) assert not layout._check_inclusions( str(root.join("badbadderivatives"))) @@ -405,7 +398,6 @@ def test_multiple_domains(self, stamp_layout): assert {'stamps', 'usa_stamps'} == set(layout.domains.keys()) usa = layout.domains['usa_stamps'] general = layout.domains['stamps'] - print([f.filename for f in usa.files]) assert len(usa.files) == 3 assert len(layout.files) == len(general.files) assert not set(usa.files) - set(general.files) diff --git a/grabbit/tests/test_writable.py b/grabbit/tests/test_writable.py index 5b1a8bf..bfccbf1 100644 --- a/grabbit/tests/test_writable.py +++ b/grabbit/tests/test_writable.py @@ -18,7 +18,8 @@ def writable_file(tmpdir): def layout(): data_dir = join(dirname(__file__), 'data', '7t_trt') config = join(dirname(__file__), 'specs', 'test.json') - layout = Layout(data_dir, config, absolute_paths=False) + layout = Layout([(data_dir, config)], absolute_paths=False, + root=data_dir) return layout @@ -204,12 +205,12 @@ def test_write_contents_to_file_defaults(self, layout): contents = 'test' data_dir = join(dirname(__file__), 'data', '7t_trt') config = join(dirname(__file__), 'specs', 'test.json') - layout = Layout(data_dir, [config, { + layout = Layout([(data_dir, [config, { 'name': "test_writable", 'default_path_patterns': ['sub-{subject}/ses-{session}/{subject}' '{session}{run}{type}{task}{acquisition}' '{bval}'] - }]) + }])], root=data_dir) entities = {'subject': 'Bob', 'session': '01', 'run': '1', 'type': 'test', 'task': 'test', 'acquisition': 'test', 'bval': 0} diff --git a/grabbit/utils.py b/grabbit/utils.py index 97a32bb..cb4bf64 100644 --- a/grabbit/utils.py +++ b/grabbit/utils.py @@ -29,7 +29,7 @@ def splitext(path): return li -def listify(obj): +def listify(obj, ignore=(list, tuple, type(None))): ''' Wraps all non-list or tuple objects in a list; provides a simple way to accept flexible arguments. ''' - return obj if isinstance(obj, (list, tuple, type(None))) else [obj] + return obj if isinstance(obj, ignore) else [obj]