diff --git a/CHANGELOG.md b/CHANGELOG.md
index 82e1a380..228b9cbc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.0]
+## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.1]
+
+### Modified
+
+- HOTFIX: When downloading cache only authenticate Alyx when necessary
+- HOTFIX: Ensure http data server URL does not end in slash
+- HOTFIX: Handle public aggregate dataset relative paths
+- HOTFIX: No longer warns in silent mode when no param conflicts present
+- Explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)
+
+## [2.9.0]
 This version adds a couple of new ALF functions.
 
 ### Added
diff --git a/one/__init__.py b/one/__init__.py
index 1fedcb5c..fb65cac4 100644
--- a/one/__init__.py
+++ b/one/__init__.py
@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '2.9.0'
+__version__ = '2.9.1'
diff --git a/one/api.py b/one/api.py
index 9289336a..2e39dd2d 100644
--- a/one/api.py
+++ b/one/api.py
@@ -563,6 +563,9 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True, check_ha
             repository
         update_exists : bool
             If true, the cache is updated to reflect the filesystem
+        check_hash : bool
+            Consider dataset missing if local file hash does not match. In online mode, the dataset
+            will be re-downloaded.
 
         Returns
         -------
@@ -924,6 +927,7 @@ def load_object(self,
                     revision: Optional[str] = None,
                     query_type: Optional[str] = None,
                     download_only: bool = False,
+                    check_hash: bool = True,
                     **kwargs) -> Union[alfio.AlfBunch, List[Path]]:
         """
         Load all attributes of an ALF object from a Session ID and an object name.
@@ -951,6 +955,9 @@ def load_object(self,
         download_only : bool
             When true the data are downloaded and the file path is returned. NB: The order of the
            file path list is undefined.
+        check_hash : bool
+            Consider dataset missing if local file hash does not match. In online mode, the dataset
+            will be re-downloaded.
         kwargs
             Additional filters for datasets, including namespace and timescale. For full
             list see the :func:`one.alf.spec.describe` function.
@@ -996,7 +1003,7 @@ def load_object(self,
 
         # For those that don't exist, download them
         offline = None if query_type == 'auto' else self.mode == 'local'
-        files = self._check_filesystem(datasets, offline=offline)
+        files = self._check_filesystem(datasets, offline=offline, check_hash=check_hash)
         files = [x for x in files if x]
         if not files:
             raise alferr.ALFObjectNotFound(f'ALF object "{obj}" not found on disk')
@@ -1015,7 +1022,7 @@ def load_dataset(self,
                      revision: Optional[str] = None,
                      query_type: Optional[str] = None,
                      download_only: bool = False,
-                     **kwargs) -> Any:
+                     check_hash: bool = True) -> Any:
         """
         Load a single dataset for a given session id and dataset name.
 
@@ -1040,6 +1047,9 @@ def load_dataset(self,
             Query cache ('local') or Alyx database ('remote')
         download_only : bool
             When true the data are downloaded and the file path is returned.
+        check_hash : bool
+            Consider dataset missing if local file hash does not match. In online mode, the dataset
+            will be re-downloaded.
 
         Returns
         -------
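
All of the `One.load_*` hunks in this patch thread a new `check_hash` keyword through to `_check_filesystem`. Below is a minimal usage sketch, assuming an already configured ONE client; the base URL and experiment UUID are placeholders, not values from this patch:

```python
from one.api import ONE

one = ONE(base_url='https://openalyx.internationalbrainlab.org')
eid = 'aad23144-0e52-4eac-80c5-c4ee2decb198'  # hypothetical experiment UUID

# Default behaviour: a local file whose hash no longer matches the cache table
# is treated as missing and, in online mode, is re-downloaded.
trials = one.load_object(eid, 'trials')

# Skip the hash comparison, e.g. to keep working with intentionally modified
# local copies without triggering a re-download.
trials = one.load_object(eid, 'trials', check_hash=False)
```
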
@@ -1099,7 +1109,8 @@ def load_dataset(self,
             raise alferr.ALFObjectNotFound(f'Dataset "{dataset}" not found')
 
         # Check files exist / download remote files
-        file, = self._check_filesystem(datasets, **kwargs)
+        offline = None if query_type == 'auto' else self.mode == 'local'
+        file, = self._check_filesystem(datasets, offline=offline, check_hash=check_hash)
         if not file:
             raise alferr.ALFObjectNotFound('Dataset not found')
 
@@ -1117,7 +1128,7 @@ def load_datasets(self,
                       query_type: Optional[str] = None,
                       assert_present=True,
                       download_only: bool = False,
-                      **kwargs) -> Any:
+                      check_hash: bool = True) -> Any:
         """
         Load datasets for a given session id. Returns two lists the length of datasets. The
         first is the data (or file paths if download_data is false), the second is a list of
@@ -1146,6 +1157,9 @@ def load_datasets(self,
             If true, missing datasets raises and error, otherwise None is returned
         download_only : bool
             When true the data are downloaded and the file path is returned.
+        check_hash : bool
+            Consider dataset missing if local file hash does not match. In online mode, the dataset
+            will be re-downloaded.
 
         Returns
         -------
@@ -1257,7 +1271,8 @@ def _verify_specifiers(specifiers):
             _logger.warning(message)
 
         # Check files exist / download remote files
-        files = self._check_filesystem(present_datasets, **kwargs)
+        offline = None if query_type == 'auto' else self.mode == 'local'
+        files = self._check_filesystem(present_datasets, offline=offline, check_hash=check_hash)
 
         if any(x is None for x in files):
             missing_list = ', '.join(x for x, y in zip(present_datasets.rel_path, files) if not y)
@@ -1284,7 +1299,8 @@ def _verify_specifiers(specifiers):
     def load_dataset_from_id(self,
                              dset_id: Union[str, UUID],
                              download_only: bool = False,
-                             details: bool = False) -> Any:
+                             details: bool = False,
+                             check_hash: bool = True) -> Any:
         """
         Load a dataset given a dataset UUID.
 
@@ -1296,6 +1312,9 @@ def load_dataset_from_id(self,
             If true the dataset is downloaded (if necessary) and the filepath returned.
         details : bool
             If true a pandas Series is returned in addition to the data.
+        check_hash : bool
+            Consider dataset missing if local file hash does not match. In online mode, the dataset
+            will be re-downloaded.
 
         Returns
         -------
@@ -1314,7 +1333,7 @@ def load_dataset_from_id(self,
         except KeyError:
             raise alferr.ALFObjectNotFound('Dataset not found')
 
-        filepath, = self._check_filesystem(dataset)
+        filepath, = self._check_filesystem(dataset, check_hash=check_hash)
         if not filepath:
             raise alferr.ALFObjectNotFound('Dataset not found')
         output = filepath if download_only else alfio.load_file_content(filepath)
@@ -1332,6 +1351,7 @@ def load_collection(self,
                         revision: Optional[str] = None,
                         query_type: Optional[str] = None,
                         download_only: bool = False,
+                        check_hash: bool = True,
                         **kwargs) -> Union[Bunch, List[Path]]:
         """
         Load all objects in an ALF collection from a Session ID. Any datasets with matching object
@@ -1357,6 +1377,9 @@ def load_collection(self,
             Query cache ('local') or Alyx database ('remote')
         download_only : bool
             When true the data are downloaded and the file path is returned.
+        check_hash : bool
+            Consider dataset missing if local file hash does not match. In online mode, the dataset
+            will be re-downloaded.
         kwargs
             Additional filters for datasets, including namespace and timescale. For full
             list see the one.alf.spec.describe function.
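
The move from `**kwargs` to explicit keyword arguments in `load_dataset`, `load_datasets` and `load_dataset_from_id` means unsupported keywords now fail loudly instead of being forwarded to `_check_filesystem`. A hedged sketch of the difference; the experiment UUID and dataset names are placeholders:

```python
from one.api import ONE

one = ONE()  # assumes a database connection has already been configured
eid = 'aad23144-0e52-4eac-80c5-c4ee2decb198'  # hypothetical experiment UUID

# load_object keeps its **kwargs, so extra dataset filters such as 'namespace'
# are still accepted here.
spikes = one.load_object(eid, 'spikes', namespace='ibl')

# load_dataset now has an explicit signature; the same keyword raises a
# TypeError instead of being silently passed through.
try:
    one.load_dataset(eid, 'spikes.times.npy', namespace='ibl')
except TypeError as ex:
    print(ex)  # unexpected keyword argument 'namespace'
```
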
@@ -1397,7 +1420,7 @@ def load_collection(self,
 
         # For those that don't exist, download them
         offline = None if query_type == 'auto' else self.mode == 'local'
-        files = self._check_filesystem(datasets, offline=offline)
+        files = self._check_filesystem(datasets, offline=offline, check_hash=check_hash)
         if not any(files):
             raise alferr.ALFObjectNotFound(f'ALF collection "{collection}" not found on disk')
         # Remove missing items
@@ -1456,7 +1479,8 @@ def setup(cache_dir=None, silent=False, **kwargs):
 
 @lru_cache(maxsize=1)
 def ONE(*, mode='auto', wildcards=True, **kwargs):
-    """ONE API factory
+    """ONE API factory.
+
     Determine which class to instantiate depending on parameters passed.
 
     Parameters
@@ -1509,10 +1533,10 @@ def ONE(*, mode='auto', wildcards=True, **kwargs):
 
 
 class OneAlyx(One):
-    """An API for searching and loading data through the Alyx database"""
+    """An API for searching and loading data through the Alyx database."""
     def __init__(self, username=None, password=None, base_url=None, cache_dir=None,
                  mode='auto', wildcards=True, tables_dir=None, **kwargs):
-        """An API for searching and loading data through the Alyx database
+        """An API for searching and loading data through the Alyx database.
 
         Parameters
         ----------
@@ -1803,13 +1827,13 @@ def list_aggregates(self, relation: str, identifier: str = None,
                     .reset_index(level=0)
                     .drop('eid', axis=1)
                     .rename_axis(index={'id': 'did'}))
-        # Since rel_path for public FI file records starts with public/aggregates instead of just
-        # aggregates as is the case for internal FI file records, as well as public and internal
-        # AWS file records, we need to make sure to use the file path part after 'aggregates'
-        # and not simply the second part, as relation.
+        # Since rel_path for public FI file records starts with 'public/aggregates' instead of just
+        # 'aggregates', we should discard the file path parts before 'aggregates' (if present)
+        records['rel_path'] = records['rel_path'].str.replace(
+            r'^[\w\/]+(?=aggregates\/)', '', n=1, regex=True)
+        # The relation is the first part after 'aggregates', i.e. the second part
         records['relation'] = records['rel_path'].map(
-            lambda x: x.split('aggregates')[-1].split('/')[1].lower()
-        )
+            lambda x: x.split('aggregates')[-1].split('/')[1].lower())
         records = records[records['relation'] == relation.lower()]
 
         def path2id(p) -> str:
@@ -2563,14 +2587,14 @@ def path2eid(self, path_obj: Union[str, Path], query_type=None) -> util.Listable
         Parameters
         ----------
         path_obj : str, pathlib.Path, list
-            Local path or list of local paths
+            Local path or list of local paths.
         query_type : str
-            If set to 'remote', will force database connection
+            If set to 'remote', will force database connection.
 
         Returns
         -------
         str, list
-            An eid or list of eids
+            An eid or list of eids.
         """
         # If path_obj is a list recurse through it and return a list
         if isinstance(path_obj, list):
diff --git a/one/converters.py b/one/converters.py
index 822757b2..dbfc8c43 100644
--- a/one/converters.py
+++ b/one/converters.py
@@ -187,12 +187,12 @@ def path2eid(self, path_obj):
         Parameters
         ----------
         path_obj : pathlib.Path, str
-            Local path or list of local paths
+            Local path or list of local paths.
 
         Returns
         -------
         eid, list
-            Experiment ID (eid) string or list of eids
+            Experiment ID (eid) string or list of eids.
         """
         # else ensure the path ends with mouse,date, number
         session_path = get_session_path(path_obj)
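
In the `list_aggregates` hunk above, the new regex only strips path parts that precede 'aggregates/', so public FlatIron records ('public/aggregates/...') and internal or AWS records ('aggregates/...') end up with the same root. A standalone illustration with made-up record paths (not the library code itself):

```python
import pandas as pd

rel_path = pd.Series([
    'public/aggregates/Subjects/mainenlab/ZM_1085/_ibl_subjectTraining.table.pqt',
    'aggregates/Subjects/mainenlab/ZM_1085/_ibl_subjectTraining.table.pqt',
])

# Discard anything before 'aggregates/' (a no-op when nothing precedes it)
rel_path = rel_path.str.replace(r'^[\w\/]+(?=aggregates\/)', '', n=1, regex=True)

# The relation is the first path part after 'aggregates'
relation = rel_path.map(lambda x: x.split('aggregates')[-1].split('/')[1].lower())

print(rel_path.tolist())  # both entries now start with 'aggregates/Subjects/'
print(relation.tolist())  # ['subjects', 'subjects']
```
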
""" # else ensure the path ends with mouse,date, number session_path = get_session_path(path_obj) diff --git a/one/params.py b/one/params.py index cef0dee8..47409f9e 100644 --- a/one/params.py +++ b/one/params.py @@ -122,22 +122,25 @@ def setup(client=None, silent=False, make_default=None, username=None, cache_dir cache_map = iopar.read(f'{_PAR_ID_STR}/{_CLIENT_ID_STR}', {'CLIENT_MAP': dict()}) if not silent: + prompt = 'Param %s, current value is ["%s"]:' par = iopar.as_dict(par_default) - for k in par.keys(): + # Iterate through non-password pars + for k in filter(lambda k: 'PWD' not in k, par.keys()): cpar = _get_current_par(k, par_current) - # Prompt for database URL; skip if client url already provided - if k == 'ALYX_URL': - if not client: - par[k] = input(f'Param {k}, current value is ["{str(cpar)}"]:') or cpar - if '://' not in par[k]: - par[k] = 'https://' + par[k] - url_parsed = urlsplit(par[k]) - if not (url_parsed.netloc and re.match('https?', url_parsed.scheme)): - raise ValueError(f'{k} must be valid HTTP URL') + # Prompt for database and FI URL + if 'URL' in k: + if k == 'ALYX_URL' and client: + continue # skip if client url already provided + par[k] = input(prompt % (k, cpar)).strip().rstrip('/') or cpar + if '://' not in par[k]: + par[k] = 'https://' + par[k] + url_parsed = urlsplit(par[k]) + if not (url_parsed.netloc and re.match('https?', url_parsed.scheme)): + raise ValueError(f'{k} must be valid HTTP URL') + if k == 'ALYX_URL': client = par[k] - # Iterate through other non-password pars - elif 'PWD' not in k: - par[k] = input(f'Param {k}, current value is ["{str(cpar)}"]:') or cpar + else: + par[k] = input(prompt % (k, cpar)).strip() or cpar cpar = _get_current_par('HTTP_DATA_SERVER_PWD', par_current) prompt = f'Enter the FlatIron HTTP password for {par["HTTP_DATA_SERVER_LOGIN"]} '\ @@ -179,12 +182,12 @@ def setup(client=None, silent=False, make_default=None, username=None, cache_dir print('SETUP ABANDONED. 
diff --git a/one/tests/test_one.py b/one/tests/test_one.py
index 530679c4..e978a2ea 100644
--- a/one/tests/test_one.py
+++ b/one/tests/test_one.py
@@ -1249,14 +1249,15 @@ def test_list_aggregates(self):
         """Test OneAlyx.list_aggregates"""
         # Test listing by relation
         datasets = self.one.list_aggregates('subjects')
-        self.assertTrue(all(datasets['rel_path'].str.startswith('cortexlab/Subjects')))
+        self.assertTrue(all(datasets['rel_path'].str.startswith('aggregates/Subjects')))
         self.assertIn('exists_aws', datasets.columns)
         self.assertIn('session_path', datasets.columns)
         self.assertTrue(all(datasets['session_path'] == ''))
         self.assertTrue(self.one.list_aggregates('foobar').empty)
         # Test filtering with an identifier
         datasets = self.one.list_aggregates('subjects', 'ZM_1085')
-        self.assertTrue(all(datasets['rel_path'].str.startswith('cortexlab/Subjects/ZM_1085')))
+        expected = 'aggregates/Subjects/mainenlab/ZM_1085'
+        self.assertTrue(all(datasets['rel_path'].str.startswith(expected)))
         self.assertTrue(self.one.list_aggregates('subjects', 'foobar').empty)
         # Test that additional parts of data path are correctly removed
         # specifically /public in FI openalyx file rec
@@ -1292,7 +1293,7 @@ def test_load_aggregate(self):
 
         # Touch a file to ensure that we do not try downloading
         expected = self.one.cache_dir.joinpath(
-            'cortexlab/Subjects/ZM_1085/_ibl_subjectTraining.table.pqt')
+            'aggregates/Subjects/mainenlab/ZM_1085/_ibl_subjectTraining.table.pqt')
         expected.parent.mkdir(parents=True), expected.touch()
 
         # Test loading with different input dataset formats
diff --git a/one/tests/test_params.py b/one/tests/test_params.py
index e38958b2..0ae61079 100644
--- a/one/tests/test_params.py
+++ b/one/tests/test_params.py
@@ -61,7 +61,7 @@ def test_setup(self, _):
         self.assertNotEqual(cache.ALYX_LOGIN, 'mistake')
 
         # Check that raises ValueError when bad URL provided
-        self.url = 'ftp://'
+        self.url = 'ftp://foo.bar.org'
         with self.assertRaises(ValueError), mock.patch('one.params.input', new=self._mock_input):
             one.params.setup()
 
diff --git a/one/webclient.py b/one/webclient.py
index d83c6e2e..0782122c 100644
--- a/one/webclient.py
+++ b/one/webclient.py
@@ -791,14 +791,16 @@ def download_cache_tables(self, source=None, destination=None):
         -------
         List of parquet table file paths.
""" - # query the database for the latest cache; expires=None overrides cached response - if not self.is_logged_in: - self.authenticate() source = str(source or f'{self.base_url}/cache.zip') destination = destination or self.cache_dir Path(destination).mkdir(exist_ok=True, parents=True) - headers = self._headers if source.startswith(self.base_url) else None + headers = None + if source.startswith(self.base_url): + if not self.is_logged_in: + self.authenticate() + headers = self._headers + with tempfile.TemporaryDirectory(dir=destination) as tmp: file = http_download_file(source, headers=headers,