From a17cc18b374370e855f1822e68037980473575db Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Tue, 6 Feb 2024 15:47:51 +0200 Subject: [PATCH 01/20] Resolves #114; minor punctuation; control thread count w/ env var --- one/alf/io.py | 9 +++++---- one/api.py | 37 +++++++++++++++++++------------------ one/registration.py | 6 +++--- one/webclient.py | 2 +- 4 files changed, 28 insertions(+), 26 deletions(-) diff --git a/one/alf/io.py b/one/alf/io.py index 9be8a67f..a3be627e 100644 --- a/one/alf/io.py +++ b/one/alf/io.py @@ -687,7 +687,7 @@ def remove_uuid_recursive(folder, dry=False) -> None: def next_num_folder(session_date_folder: Union[str, Path]) -> str: - """Return the next number for a session given a session_date_folder""" + """Return the next number for a session given a session_date_folder.""" session_date_folder = Path(session_date_folder) if not session_date_folder.exists(): return '001' @@ -701,7 +701,7 @@ def next_num_folder(session_date_folder: Union[str, Path]) -> str: def remove_empty_folders(folder: Union[str, Path]) -> None: - """Will iteratively remove any children empty folders""" + """Iteratively remove any empty child folders.""" all_folders = sorted(x for x in Path(folder).rglob('*') if x.is_dir()) for f in reversed(all_folders): # Reversed sorted ensures we remove deepest first try: @@ -712,8 +712,9 @@ def remove_empty_folders(folder: Union[str, Path]) -> None: def filter_by(alf_path, wildcards=True, **kwargs): """ - Given a path and optional filters, returns all ALF files and their associated parts. The - filters constitute a logical AND. For all but `extra`, if a list is provided, one or more + Given a path and optional filters, returns all ALF files and their associated parts. + + The filters constitute a logical AND. For all but `extra`, if a list is provided, one or more elements must match (a logical OR). 
Parameters diff --git a/one/api.py b/one/api.py index 616ca999..ab624f7c 100644 --- a/one/api.py +++ b/one/api.py @@ -12,6 +12,7 @@ from urllib.error import URLError import time import threading +import os import pandas as pd import numpy as np @@ -34,18 +35,18 @@ _logger = logging.getLogger(__name__) __all__ = ['ONE', 'One', 'OneAlyx'] -"""int: The number of download threads""" -N_THREADS = 4 +N_THREADS = os.environ.get('ONE_HTTP_DL_THREADS', 4) +"""int: The number of download threads.""" class One(ConversionMixin): - """An API for searching and loading data on a local filesystem""" + """An API for searching and loading data on a local filesystem.""" _search_terms = ( 'dataset', 'date_range', 'laboratory', 'number', 'projects', 'subject', 'task_protocol' ) uuid_filenames = None - """bool: whether datasets on disk have a UUID in their filename""" + """bool: whether datasets on disk have a UUID in their filename.""" def __init__(self, cache_dir=None, mode='auto', wildcards=True, tables_dir=None): """An API for searching and loading data on a local filesystem @@ -86,16 +87,16 @@ def __repr__(self): @property def offline(self): - """bool: True if mode is local or no Web client set""" + """bool: True if mode is local or no Web client set.""" return self.mode == 'local' or not getattr(self, '_web_client', False) @util.refresh def search_terms(self, query_type=None) -> tuple: - """List the search term keyword args for use in the search method""" + """List the search term keyword args for use in the search method.""" return self._search_terms def _reset_cache(self): - """Replace the cache object with a Bunch that contains the right fields""" + """Replace the cache object with a Bunch that contains the right fields.""" self._cache = Bunch({'_meta': { 'expired': False, 'created_time': None, @@ -214,18 +215,18 @@ def _save_cache(self, save_dir=None, force=False): lock_file.unlink() def refresh_cache(self, mode='auto'): - """Check and reload cache tables + """Check and reload cache tables. Parameters ---------- mode : {'local', 'refresh', 'auto', 'remote'} Options are 'local' (don't reload); 'refresh' (reload); 'auto' (reload if expired); - 'remote' (don't reload) + 'remote' (don't reload). Returns ------- datetime.datetime - Loaded timestamp + Loaded timestamp. """ # NB: Currently modified table will be lost if called with 'refresh'; # May be instances where modified cache is saved then immediately replaced with a new @@ -253,13 +254,13 @@ def _update_cache_from_records(self, strict=False, **kwargs): strict : bool If not True, the columns don't need to match. Extra columns in input tables are dropped and missing columns are added and filled with np.nan. - **kwargs - pandas.DataFrame or pandas.Series to insert/update for each table + kwargs + pandas.DataFrame or pandas.Series to insert/update for each table. Returns ------- datetime.datetime: - A timestamp of when the cache was updated + A timestamp of when the cache was updated. Example ------- @@ -272,7 +273,7 @@ def _update_cache_from_records(self, strict=False, **kwargs): When strict is True the input columns must exactly match those oo the cache table, including the order. KeyError - One or more of the keyword arguments does not match a table in One._cache + One or more of the keyword arguments does not match a table in One._cache. 
""" updated = None for table, records in kwargs.items(): @@ -389,7 +390,7 @@ def search(self, details=False, query_type=None, **kwargs): one.search_terms() - For all of the search parameters, a single value or list may be provided. For `dataset`, + For all search parameters, a single value or list may be provided. For `dataset`, the sessions returned will contain all listed datasets. For the other parameters, the session must contain at least one of the entries. @@ -521,7 +522,7 @@ def sort_fcn(itm): eids = sessions.index.to_list() if details: - return eids, sessions.reset_index(drop=True).to_dict('records', Bunch) + return eids, sessions.reset_index(drop=True).to_dict('records', into=Bunch) else: return eids @@ -1237,7 +1238,7 @@ def _verify_specifiers(specifiers): # Make list of metadata Bunches out of the table records = (present_datasets .reset_index(names='id') - .to_dict('records', Bunch)) + .to_dict('records', into=Bunch)) # Ensure result same length as input datasets list files = [None if not here else files.pop(0) for here in present] @@ -2025,7 +2026,7 @@ def search(self, details=False, query_type=None, **kwargs): one.search_terms(query_type='remote') - For all of the search parameters, a single value or list may be provided. For `dataset`, + For all search parameters, a single value or list may be provided. For `dataset`, the sessions returned will contain all listed datasets. For the other parameters, the session must contain at least one of the entries. diff --git a/one/registration.py b/one/registration.py index 8cb0f170..3a2f4897 100644 --- a/one/registration.py +++ b/one/registration.py @@ -190,17 +190,17 @@ def create_new_session(self, subject, session_root=None, date=None, register=Tru def find_files(self, session_path): """ - Returns an generator of file names that match one of the dataset type patterns in Alyx + Returns a generator of file names that match one of the dataset type patterns in Alyx. Parameters ---------- session_path : str, pathlib.Path - The session path to search + The session path to search. Yields ------- pathlib.Path - File paths that match the dataset type patterns in Alyx + File paths that match the dataset type patterns in Alyx. """ session_path = Path(session_path) for p in session_path.rglob('*.*.*'): diff --git a/one/webclient.py b/one/webclient.py index 3e57f1ad..ff66b1f1 100644 --- a/one/webclient.py +++ b/one/webclient.py @@ -980,7 +980,7 @@ def rest(self, url=None, action=None, id=None, data=None, files=None, Option file(s) to upload. no_cache : bool If true the `list` and `read` actions are performed without returning the cache. - **kwargs + kwargs Filters as per the Alyx REST documentation cf. https://openalyx.internationalbrainlab.org/docs/ From d4301adbaf1d267779d48ca11ce02a90afaa3c2f Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Tue, 13 Feb 2024 14:19:39 +0200 Subject: [PATCH 02/20] alf.io.iter_sessions more performant --- one/alf/io.py | 8 ++++++-- one/tests/alf/test_alf_io.py | 18 +++++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/one/alf/io.py b/one/alf/io.py index 9be8a67f..e3565b73 100644 --- a/one/alf/io.py +++ b/one/alf/io.py @@ -375,7 +375,7 @@ def _ls(alfpath, object=None, **kwargs) -> (list, tuple): return [alfpath.joinpath(f) for f in files_alf], attributes -def iter_sessions(root_dir): +def iter_sessions(root_dir, lab_folders=False): """ Recursively iterate over session paths in a given directory. 
@@ -383,15 +383,19 @@ def iter_sessions(root_dir): ---------- root_dir : str, pathlib.Path The folder to look for sessions. + lab_folders : bool + If true, glob pattern reflects `root_dir` containing /Subjects folders. This is + slightly more performant when Subjects folders present. Yields ------- pathlib.Path The next session path in lexicographical order. """ + pattern = ('*/Subjects/' if lab_folders else '') + '*/????-??-??/*' if spec.is_session_path(root_dir): yield root_dir - for path in sorted(Path(root_dir).rglob('*')): + for path in sorted(Path(root_dir).rglob(pattern)): if path.is_dir() and spec.is_session_path(path): yield path diff --git a/one/tests/alf/test_alf_io.py b/one/tests/alf/test_alf_io.py index 200c5b76..6c0449ff 100644 --- a/one/tests/alf/test_alf_io.py +++ b/one/tests/alf/test_alf_io.py @@ -631,7 +631,7 @@ def tearDown(self) -> None: obj.unlink() if obj.is_file() else obj.rmdir() def test_next_num_folder(self): - """Test for one.alf.io.next_num_folder""" + """Test for one.alf.io.next_num_folder.""" self.session_path.rmdir() # Remove '001' folder next_num = alfio.next_num_folder(self.session_path.parent) self.assertEqual('001', next_num) @@ -657,7 +657,7 @@ def test_next_num_folder(self): alfio.next_num_folder(self.session_path.parent) def test_remove_empty_folders(self): - """Test for one.alf.io.remove_empty_folders""" + """Test for one.alf.io.remove_empty_folders.""" root = Path(self.tempdir.name) / 'glob_dir' root.mkdir() root.joinpath('empty0').mkdir(exist_ok=True) @@ -668,7 +668,7 @@ def test_remove_empty_folders(self): self.assertTrue(len(list(root.glob('*'))) == 1) def test_iter_sessions(self): - """Test for one.alf.io.iter_sessions""" + """Test for one.alf.io.iter_sessions.""" # Create invalid session folder self.session_path.parent.parent.joinpath('bad_session').mkdir() @@ -677,9 +677,17 @@ def test_iter_sessions(self): self.assertFalse(next(valid_sessions, False)) # makes sure that the session path returns itself on the iterator self.assertEqual(self.session_path, next(alfio.iter_sessions(self.session_path))) + # test lab_folders arg + valid_sessions = alfio.iter_sessions(self.tempdir.name, lab_folders=True) + self.assertEqual(self.session_path, next(valid_sessions)) + subjects_path = Path(self.tempdir.name, 'fakelab', 'Subjects') + valid_sessions = alfio.iter_sessions(subjects_path, lab_folders=False) + self.assertEqual(self.session_path, next(valid_sessions)) + valid_sessions = alfio.iter_sessions(subjects_path, lab_folders=True) + self.assertFalse(next(valid_sessions, False)) def test_iter_datasets(self): - """Test for one.alf.io.iter_datasets""" + """Test for one.alf.io.iter_datasets.""" # Create valid dataset dset = self.session_path.joinpath('collection', 'object.attribute.ext') dset.parent.mkdir() @@ -691,5 +699,5 @@ def test_iter_datasets(self): self.assertEqual([Path(*dset.parts[-2:])], ses_files) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main(exit=False, verbosity=2) From 738559f2dc5f2a3e5390d3507ee14b5ece0d1a2b Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Tue, 13 Feb 2024 14:25:07 +0200 Subject: [PATCH 03/20] QC enumeration; add QC column to cache table; filter datasets by QC --- one/alf/cache.py | 36 ++++++++++++++++++++---------------- one/alf/spec.py | 13 +++++++++++++ one/api.py | 7 ++++--- one/tests/test_one.py | 27 +++++++++++++++++++++++++++ one/util.py | 42 +++++++++++++++++++++++++++++++++++++----- one/webclient.py | 18 +++++++++--------- 6 files changed, 110 insertions(+), 33 deletions(-) diff --git 
a/one/alf/cache.py b/one/alf/cache.py index 8034c826..e41508fc 100644 --- a/one/alf/cache.py +++ b/one/alf/cache.py @@ -27,6 +27,7 @@ from iblutil.io import parquet from iblutil.io.hashfile import md5 +from one.alf.spec import QC from one.alf.io import iter_sessions, iter_datasets from one.alf.files import session_path_parts, get_alf_path from one.converters import session_record2path @@ -38,14 +39,17 @@ # Global variables # ------------------------------------------------------------------------------------------------- +QC_TYPE = pd.CategoricalDtype(categories=[e.name for e in sorted(QC)], ordered=True) +"""pandas.api.types.CategoricalDtype : The cache table QC column data type.""" + SESSIONS_COLUMNS = ( 'id', # int64 - 'lab', - 'subject', + 'lab', # str + 'subject', # str 'date', # datetime.date 'number', # int - 'task_protocol', - 'projects', + 'task_protocol', # str + 'projects', # str ) DATASETS_COLUMNS = ( @@ -64,7 +68,7 @@ # ------------------------------------------------------------------------------------------------- def _ses_str_id(session_path): - """Returns a str id from a session path in the form '(lab/)subject/date/number'""" + """Returns a str id from a session path in the form '(lab/)subject/date/number'.""" return Path(*filter(None, session_path_parts(session_path, assert_valid=True))).as_posix() @@ -140,7 +144,7 @@ def _metadata(origin): Parameters ---------- origin : str, pathlib.Path - Path to full directory, or computer name / db name + Path to full directory, or computer name / db name. """ return { 'date_created': datetime.datetime.now().isoformat(sep=' ', timespec='minutes'), @@ -150,17 +154,17 @@ def _metadata(origin): def _make_sessions_df(root_dir) -> pd.DataFrame: """ - Given a root directory, recursively finds all sessions and returns a sessions DataFrame + Given a root directory, recursively finds all sessions and returns a sessions DataFrame. Parameters ---------- root_dir : str, pathlib.Path - The folder to look for sessions + The folder to look for sessions. Returns ------- pandas.DataFrame - A pandas DataFrame of session info + A pandas DataFrame of session info. """ rows = [] for full_path in iter_sessions(root_dir): @@ -176,19 +180,19 @@ def _make_sessions_df(root_dir) -> pd.DataFrame: def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame: """ - Given a root directory, recursively finds all datasets and returns a datasets DataFrame + Given a root directory, recursively finds all datasets and returns a datasets DataFrame. Parameters ---------- root_dir : str, pathlib.Path - The folder to look for sessions + The folder to look for sessions. hash_files : bool - If True, an MD5 is computed for each file and stored in the 'hash' column + If True, an MD5 is computed for each file and stored in the 'hash' column. Returns ------- pandas.DataFrame - A pandas DataFrame of dataset info + A pandas DataFrame of dataset info. """ df = pd.DataFrame([], columns=DATASETS_COLUMNS) # Go through sessions and append datasets @@ -216,7 +220,7 @@ def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab root directory. hash_ids : bool If True, experiment and dataset IDs will be UUIDs generated from the system and relative - paths (required for use with ONE API) + paths (required for use with ONE API). hash_files : bool If True, an MD5 hash is computed for each dataset and stored in the datasets table. This will substantially increase cache generation time. 
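For illustration, a minimal sketch of rebuilding a local cache with the updated builder and checking the new `qc` column; the root path below is hypothetical:

```python
import pandas as pd
from one.alf.cache import make_parquet_db

# Locally built tables now carry an ordered-categorical 'qc' column,
# initialised to 'NOT_SET' for every dataset found on disk.
sessions_pqt, datasets_pqt = make_parquet_db('/data/my_lab', hash_files=False)
datasets = pd.read_parquet(datasets_pqt)
print(datasets['qc'].unique())  # ['NOT_SET']
```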
@@ -227,9 +231,9 @@ def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab Returns ------- pathlib.Path - The full path of the saved sessions parquet table + The full path of the saved sessions parquet table. pathlib.Path - The full path of the saved datasets parquet table + The full path of the saved datasets parquet table. """ root_dir = Path(root_dir).resolve() diff --git a/one/alf/spec.py b/one/alf/spec.py index ecbb37b6..ef95d0ef 100644 --- a/one/alf/spec.py +++ b/one/alf/spec.py @@ -1,6 +1,7 @@ """The complete ALF specification descriptors and validators.""" import re import textwrap +from enum import IntEnum from uuid import UUID from typing import Union @@ -124,6 +125,18 @@ ) +class QC(IntEnum): + """Data QC outcomes. + + This enumeration is used by the Alyx database. NB: Pandas cache tables use different codes. + """ + CRITICAL = 50 + FAIL = 40 + WARNING = 30 + NOT_SET = 0 + PASS = 10 + + def path_pattern() -> str: """Returns a template string representing the where the ALF parts lie in an ALF path. Brackets denote optional parts. This is used for documentation purposes only. diff --git a/one/api.py b/one/api.py index 616ca999..5e40304f 100644 --- a/one/api.py +++ b/one/api.py @@ -135,7 +135,7 @@ def load_cache(self, tables_dir=None, **kwargs): cache.set_index(idx_columns, inplace=True) # Patch older tables - cache = util.patch_cache(cache, meta['raw'][table].get('min_api_version')) + cache = util.patch_cache(cache, meta['raw'][table].get('min_api_version'), table) # Check sorted # Sorting makes MultiIndex indexing O(N) -> O(1) @@ -732,8 +732,9 @@ def list_datasets(self, eid=None, filename=None, collection=None, revision=None, >>> datasets = one.list_datasets(eid, {'object': ['wheel', 'trial?']}) """ datasets = self._cache['datasets'] - filter_args = dict(collection=collection, filename=filename, wildcards=self.wildcards, - revision=revision, revision_last_before=False, assert_unique=False) + filter_args = dict( + collection=collection, filename=filename, wildcards=self.wildcards, + revision=revision, revision_last_before=False, assert_unique=False) if not eid: datasets = util.filter_datasets(datasets, **filter_args) return datasets.copy() if details else datasets['rel_path'].unique().tolist() diff --git a/one/tests/test_one.py b/one/tests/test_one.py index 0507d4e7..1c5821d2 100644 --- a/one/tests/test_one.py +++ b/one/tests/test_one.py @@ -52,6 +52,7 @@ ) import one.params import one.alf.exceptions as alferr +from one.alf import spec from . import util from . 
import OFFLINE_ONLY, TEST_DB_1, TEST_DB_2 # 1 = TestAlyx; 2 = OpenAlyx @@ -242,6 +243,32 @@ def test_filter(self): assert_unique=False, revision_last_before=False) self.assertEqual(4, len(verifiable)) + # QC + datasets.loc[:, 'qc'] = ['NOT_SET', 'PASS', 'WARNING', 'FAIL', 'CRITICAL'] + verifiable = filter_datasets(datasets, assert_unique=False) + self.assertEqual(4, len(verifiable), 'failed to filter QC value') + self.assertTrue(all(verifiable.qc < 'CRITICAL'), 'did not exclude CRITICAL QC by default') + + # 'ignore_qc_not_set' kwarg should ignore records without QC + verifiable = filter_datasets(datasets, assert_unique=False, ignore_qc_not_set=True) + self.assertEqual(3, len(verifiable), 'failed to filter QC value') + self.assertTrue(all(verifiable.qc > 'NOT_SET'), 'did not exclude NOT_SET QC datasets') + + # Check QC input types + verifiable = filter_datasets(datasets, assert_unique=False, qc='PASS') + self.assertEqual(2, len(verifiable), 'failed to filter QC value') + self.assertTrue(all(verifiable.qc < 'WARNING')) + + verifiable = filter_datasets(datasets, assert_unique=False, qc=30) + self.assertEqual(3, len(verifiable), 'failed to filter QC value') + self.assertTrue(all(verifiable.qc < 'FAIL')) + + verifiable = filter_datasets(datasets, qc=spec.QC.PASS, ignore_qc_not_set=True) + self.assertEqual(1, len(verifiable)) + self.assertTrue(all(verifiable['qc'] == 'PASS')) + + datasets.iat[-1, -1] = 'PASS' # set CRITICAL dataset to PASS so not excluded by default + # Revisions revisions = [ 'alf/probe00/#2020-01-01#/spikes.times.npy', diff --git a/one/util.py b/one/util.py index cd70b5a8..cfa49923 100644 --- a/one/util.py +++ b/one/util.py @@ -14,7 +14,7 @@ import one.alf.exceptions as alferr from one.alf.files import rel_path_parts, get_session_path, get_alf_path, remove_uuid_string -from one.alf.spec import FILE_SPEC, regex as alf_regex +from one.alf.spec import QC, FILE_SPEC, regex as alf_regex logger = logging.getLogger(__name__) @@ -283,8 +283,9 @@ def _file_spec(**kwargs): return filespec -def filter_datasets(all_datasets, filename=None, collection=None, revision=None, - revision_last_before=True, assert_unique=True, wildcards=False): +def filter_datasets( + all_datasets, filename=None, collection=None, revision=None, revision_last_before=True, + qc=QC.FAIL, ignore_qc_not_set=False, assert_unique=True, wildcards=False): """ Filter the datasets cache table by the relative path (dataset name, collection and revision). When None is passed, all values will match. To match on empty parts, use an empty string. @@ -305,6 +306,11 @@ def filter_datasets(all_datasets, filename=None, collection=None, revision=None, When true and no exact match exists, the (lexicographically) previous revision is used instead. When false the revision string is matched like collection and filename, with regular expressions permitted. + qc : str, int, one.alf.spec.QC + Returns datasets at or below this QC level. Integer values should correspond to the QC + enumeration NOT the qc category column codes in the pandas table. + ignore_qc_not_set : bool + When true, do not return datasets for which QC is NOT_SET. assert_unique : bool When true an error is raised if multiple collections or datasets are found. 
wildcards : bool @@ -334,6 +340,14 @@ def filter_datasets(all_datasets, filename=None, collection=None, revision=None, >>> datasets = filter_datasets(all_datasets, dict(object='spikes', attribute='times')) + Filter by QC outcome - datasets with WARNING or better + + >>> datasets filter_datasets(all_datasets, qc='WARNING') + + Filter by QC outcome and ignore datasets with unset QC - datasets with PASS only + + >>> datasets filter_datasets(all_datasets, qc='PASS', ignore_qc_not_set=True) + Notes ----- - It is not possible to match datasets that are in a given collection OR NOT in ANY collection. @@ -368,7 +382,17 @@ def filter_datasets(all_datasets, filename=None, collection=None, revision=None, # Build regex string pattern = alf_regex('^' + spec_str, **regex_args) - match = all_datasets[all_datasets['rel_path'].str.match(pattern)] + path_match = all_datasets['rel_path'].str.match(pattern) + + # Test on QC outcome + if not isinstance(qc, QC): # cast to QC enum for validation + qc = QC[qc] if isinstance(qc, str) else QC(qc) + qc_match = all_datasets['qc'].le(qc.name) + if ignore_qc_not_set: + qc_match &= all_datasets['qc'].ne('NOT_SET') + + # Filter datasets on path and QC + match = all_datasets[path_match & qc_match] if len(match) == 0 or not (revision_last_before or assert_unique): return match @@ -558,7 +582,7 @@ def cache_int2str(table: pd.DataFrame) -> pd.DataFrame: return table -def patch_cache(table: pd.DataFrame, min_api_version=None) -> pd.DataFrame: +def patch_cache(table: pd.DataFrame, min_api_version=None, name=None) -> pd.DataFrame: """Reformat older cache tables to comply with this version of ONE. Currently this function will 1. convert integer UUIDs to string UUIDs; 2. rename the 'project' @@ -570,10 +594,18 @@ def patch_cache(table: pd.DataFrame, min_api_version=None) -> pd.DataFrame: A cache table (from One._cache). min_api_version : str The minimum API version supported by this cache table. + name : {'dataset', 'session'} str + The name of the table. """ min_version = version.parse(min_api_version or '0.0.0') table = cache_int2str(table) # Rename project column if min_version < version.Version('1.13.0') and 'project' in table.columns: table.rename(columns={'project': 'projects'}, inplace=True) + if name == 'datasets' and min_version < version.Version('2.0.0') and 'qc' not in table.columns: + qc_categories = [e.name for e in sorted(QC)] + qc = pd.Categorical.from_codes( + np.zeros(len(table.index), dtype=int), categories=qc_categories, ordered=True + ) + table = table.assign(qc=qc) return table diff --git a/one/webclient.py b/one/webclient.py index 3e57f1ad..40448c19 100644 --- a/one/webclient.py +++ b/one/webclient.py @@ -464,10 +464,10 @@ class AlyxClient: """ _token = None _headers = {} # Headers for REST requests only - """str: The Alyx username""" user = None - """str: The Alyx database URL""" + """str: The Alyx username.""" base_url = None + """str: The Alyx database URL.""" def __init__(self, base_url=None, username=None, password=None, cache_dir=None, silent=False, cache_rest='GET'): @@ -1012,7 +1012,7 @@ def rest(self, url=None, action=None, id=None, data=None, files=None, pprint(list(endpoint_scheme.keys())) self.print_endpoint_info(endpoint) return - # make sure the the desired action exists, if not throw an informative error + # make sure the desired action exists, if not throw an informative error if action not in endpoint_scheme: raise ValueError('Action "' + action + '" for REST endpoint "' + endpoint + '" does ' + 'not exist. 
Available actions are: ' + @@ -1053,26 +1053,26 @@ def rest(self, url=None, action=None, id=None, data=None, files=None, if not isinstance(id, str) and id is not None: id = str(id) # e.g. may be uuid.UUID if action == 'read': - assert (endpoint_scheme[action]['action'] == 'get') + assert endpoint_scheme[action]['action'] == 'get' return self.get('/' + endpoint + '/' + id.split('/')[-1], **cache_args) elif action == 'create': - assert (endpoint_scheme[action]['action'] == 'post') + assert endpoint_scheme[action]['action'] == 'post' return self.post('/' + endpoint, data=data, files=files) elif action == 'delete': - assert (endpoint_scheme[action]['action'] == 'delete') + assert endpoint_scheme[action]['action'] == 'delete' return self.delete('/' + endpoint + '/' + id.split('/')[-1]) elif action == 'partial_update': - assert (endpoint_scheme[action]['action'] == 'patch') + assert endpoint_scheme[action]['action'] == 'patch' return self.patch('/' + endpoint + '/' + id.split('/')[-1], data=data, files=files) elif action == 'update': - assert (endpoint_scheme[action]['action'] == 'put') + assert endpoint_scheme[action]['action'] == 'put' return self.put('/' + endpoint + '/' + id.split('/')[-1], data=data, files=files) # JSON field interface convenience methods def _check_inputs(self, endpoint: str) -> None: # make sure the queried endpoint exists, if not throw an informative error if endpoint not in self.rest_schemes.keys(): - av = [k for k in self.rest_schemes.keys() if not k.startswith('_') and k] + av = (k for k in self.rest_schemes.keys() if not k.startswith('_') and k) raise ValueError('REST endpoint "' + endpoint + '" does not exist. Available ' + 'endpoints are \n ' + '\n '.join(av)) return From cf606039e8ae1167547a5b5db2f09ab61dfa6b8d Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Tue, 13 Feb 2024 16:55:35 +0200 Subject: [PATCH 04/20] Move QC_TYPE to one.util to fix circular import --- one/__init__.py | 2 +- one/alf/cache.py | 4 ---- one/tests/__init__.py | 6 +++--- one/util.py | 10 +++++----- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/one/__init__.py b/one/__init__.py index fddf4607..df50e79e 100644 --- a/one/__init__.py +++ b/one/__init__.py @@ -1,2 +1,2 @@ """The Open Neurophysiology Environment (ONE) API.""" -__version__ = '2.6.0' +__version__ = '2.7.0' diff --git a/one/alf/cache.py b/one/alf/cache.py index e41508fc..c6b7965e 100644 --- a/one/alf/cache.py +++ b/one/alf/cache.py @@ -27,7 +27,6 @@ from iblutil.io import parquet from iblutil.io.hashfile import md5 -from one.alf.spec import QC from one.alf.io import iter_sessions, iter_datasets from one.alf.files import session_path_parts, get_alf_path from one.converters import session_record2path @@ -39,9 +38,6 @@ # Global variables # ------------------------------------------------------------------------------------------------- -QC_TYPE = pd.CategoricalDtype(categories=[e.name for e in sorted(QC)], ordered=True) -"""pandas.api.types.CategoricalDtype : The cache table QC column data type.""" - SESSIONS_COLUMNS = ( 'id', # int64 'lab', # str diff --git a/one/tests/__init__.py b/one/tests/__init__.py index ffe32667..9a2f1897 100644 --- a/one/tests/__init__.py +++ b/one/tests/__init__.py @@ -1,14 +1,14 @@ -"""Tests for ONE-api""" +"""Tests for ONE-api.""" import os import json from pathlib import Path -"""int: Flag for skipping tests that require an http connection""" +"""int: Flag for skipping tests that require an http connection.""" OFFLINE_ONLY = int(os.getenv('OFFLINE_ONLY', '0')) def _get_test_db(): - """Load 
test database credentials for testing ONE api + """Load test database credentials for testing ONE api. Allows users to test ONE using their own Alyx database. The tests use two databases: the first for tests requiring POST requests; the second for tests that do not affect the database. diff --git a/one/util.py b/one/util.py index cfa49923..b217ff10 100644 --- a/one/util.py +++ b/one/util.py @@ -18,6 +18,9 @@ logger = logging.getLogger(__name__) +QC_TYPE = pd.CategoricalDtype(categories=[e.name for e in sorted(QC)], ordered=True) +"""pandas.api.types.CategoricalDtype: The cache table QC column data type.""" + def Listable(t): """Return a typing.Union if the input and sequence of input.""" @@ -602,10 +605,7 @@ def patch_cache(table: pd.DataFrame, min_api_version=None, name=None) -> pd.Data # Rename project column if min_version < version.Version('1.13.0') and 'project' in table.columns: table.rename(columns={'project': 'projects'}, inplace=True) - if name == 'datasets' and min_version < version.Version('2.0.0') and 'qc' not in table.columns: - qc_categories = [e.name for e in sorted(QC)] - qc = pd.Categorical.from_codes( - np.zeros(len(table.index), dtype=int), categories=qc_categories, ordered=True - ) + if name == 'datasets' and min_version < version.Version('2.7.0') and 'qc' not in table.columns: + qc = pd.Categorical.from_codes(np.zeros(len(table.index), dtype=int), dtype=QC_TYPE) table = table.assign(qc=qc) return table From 2fbd63e56ff95e36e9115bd564b0e0acfa718c1f Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Wed, 14 Feb 2024 14:48:23 +0200 Subject: [PATCH 05/20] Add QC validate method --- one/alf/spec.py | 32 ++++++++++++++++++++++++++++++++ one/tests/alf/test_alf_spec.py | 11 +++++++++++ 2 files changed, 43 insertions(+) diff --git a/one/alf/spec.py b/one/alf/spec.py index ef95d0ef..9c29c6d0 100644 --- a/one/alf/spec.py +++ b/one/alf/spec.py @@ -136,6 +136,38 @@ class QC(IntEnum): NOT_SET = 0 PASS = 10 + @staticmethod + def validate(v): + """ + Validate QC input and return equivalent enumeration. + + Parameters + ---------- + v : int, str, QC + A QC enumeration, or equivalent int value or name. + + Returns + ------- + QC + The enumeration. + + Raises + ------ + ValueError + An invalid QC value was passed. + """ + if isinstance(v, QC): + return v + elif isinstance(v, str): + if v.isnumeric(): + return QC(int(v)) + try: + return QC[v.upper()] + except KeyError: + raise ValueError(f'{v} is not a valid QC') + else: + return QC(v) + def path_pattern() -> str: """Returns a template string representing the where the ALF parts lie in an ALF path. 
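For illustration, the new `QC.validate` method resolves the enumeration itself, integer values (as int or numeric string) and names in any case:

```python
from one.alf.spec import QC

assert QC.validate(QC.FAIL) is QC.FAIL
assert QC.validate(40) is QC.FAIL
assert QC.validate('40') is QC.FAIL
assert QC.validate('fail') is QC.FAIL

try:
    QC.validate('ERROR')
except ValueError as ex:
    print(ex)  # 'ERROR is not a valid QC'
```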
diff --git a/one/tests/alf/test_alf_spec.py b/one/tests/alf/test_alf_spec.py index 16ea4cb1..9532df50 100644 --- a/one/tests/alf/test_alf_spec.py +++ b/one/tests/alf/test_alf_spec.py @@ -226,6 +226,17 @@ def test_describe(self, sysout): with self.assertRaises(ValueError): alf_spec.describe('dimensions', width=5) + def test_qc_validate(self): + """Test for one.alf.spec.QC.validate enumeration method.""" + value = alf_spec.QC.FAIL + self.assertIs(alf_spec.QC.validate(value), value) + self.assertIs(alf_spec.QC.validate('40'), value) + self.assertIs(alf_spec.QC.validate(40), value) + self.assertIs(alf_spec.QC.validate('FAIL'), value) + self.assertRaises(ValueError, alf_spec.QC.validate, 'ERROR') + value = alf_spec.QC.CRITICAL + self.assertIs(alf_spec.QC.validate('critical'), value) + class TestALFErr(unittest.TestCase): """Tests for one.alf.exceptions""" From 3fb4143be8fcda54716be3437aaea2f64951e9c0 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Wed, 14 Feb 2024 14:50:55 +0200 Subject: [PATCH 06/20] Dataset QC added to ses2records and datasets2records --- one/util.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/one/util.py b/one/util.py index b217ff10..e8193afd 100644 --- a/one/util.py +++ b/one/util.py @@ -54,21 +54,21 @@ def ses2records(ses: dict): # Extract datasets table def _to_record(d): - rec = dict(file_size=d['file_size'], hash=d['hash'], exists=True) - rec['id'] = d['id'] + rec = dict(file_size=d['file_size'], hash=d['hash'], exists=True, id=d['id']) rec['eid'] = session.name file_path = urllib.parse.urlsplit(d['data_url'], allow_fragments=False).path.strip('/') file_path = get_alf_path(remove_uuid_string(file_path)) rec['session_path'] = get_session_path(file_path).as_posix() rec['rel_path'] = file_path[len(rec['session_path']):].strip('/') rec['default_revision'] = d['default_revision'] == 'True' + rec['qc'] = d.get('qc', 'NOT_SET') return rec if not ses.get('data_dataset_session_related'): return session, pd.DataFrame() records = map(_to_record, ses['data_dataset_session_related']) index = ['eid', 'id'] - datasets = pd.DataFrame(records).set_index(index).sort_index() + datasets = pd.DataFrame(records).set_index(index).sort_index().astype({'qc': QC_TYPE}) return session, datasets @@ -109,15 +109,16 @@ def datasets2records(datasets, additional=None) -> pd.DataFrame: rec['session_path'] = rec['session_path'].as_posix() rec['rel_path'] = file_path[len(rec['session_path']):].strip('/') rec['default_revision'] = d['default_dataset'] + rec['qc'] = d.get('qc') for field in additional or []: rec[field] = d.get(field) records.append(rec) index = ['eid', 'id'] if not records: - keys = (*index, 'file_size', 'hash', 'session_path', 'rel_path', 'default_revision') + keys = (*index, 'file_size', 'hash', 'session_path', 'rel_path', 'default_revision', 'qc') return pd.DataFrame(columns=keys).set_index(index) - return pd.DataFrame(records).set_index(index).sort_index() + return pd.DataFrame(records).set_index(index).sort_index().astype({'qc': QC_TYPE}) def parse_id(method): @@ -151,9 +152,7 @@ def wrapper(self, id, *args, **kwargs): def refresh(method): - """ - Refresh cache depending of query_type kwarg. 
- """ + """Refresh cache depending on query_type kwarg.""" @wraps(method) def wrapper(self, *args, **kwargs): From f234fe606936dbac911c5a7bbc7ed8547b37f32f Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Tue, 20 Feb 2024 16:28:10 +0200 Subject: [PATCH 07/20] Resolves #113 --- .github/workflows/python-publish.yaml | 37 +++++++++++++++++++++++++++ docs/contributing.md | 18 +++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 .github/workflows/python-publish.yaml diff --git a/.github/workflows/python-publish.yaml b/.github/workflows/python-publish.yaml new file mode 100644 index 00000000..f21fd20f --- /dev/null +++ b/.github/workflows/python-publish.yaml @@ -0,0 +1,37 @@ +# Reference for this action: +# https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries +name: Publish to PyPI + +on: + push: + tags: + - 'v*' + +permissions: + contents: read + +jobs: + deploy: + name: Build and publish Python distributions to PyPI + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel + + - name: Build package + run: python setup.py sdist bdist_wheel + + - name: Publish package + # GitHub recommends pinning 3rd party actions to a commit SHA. + uses: pypa/gh-action-pypi-publish@37f50c210e3d2f9450da2cd423303d6a14a6e29f + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/docs/contributing.md b/docs/contributing.md index 7632a672..da8ed567 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -23,3 +23,21 @@ python ./make-script.py -d The HTML files are placed in `docs/_build/html/`. +# Contributing to code + +Always branch off branch `main` before commiting changes, then push to remote and open a PR into `main`. +A developer will then approve the PR and release. + +## Releasing (developers only) + +Note that in order to trigger a pypi release the tag must begin with 'v', e.g. `v2.8.0`. + +```shell +git checkout -b release/X.X.X origin/ +git checkout origin/main +git merge release/X.X.X +git tag vX.X.X +git push origin --tags +git push origin +git branch -d release/X.X.X +``` From 2e9c4fcec561d9929ad383976140ef247124a851 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Tue, 20 Feb 2024 16:57:30 +0200 Subject: [PATCH 08/20] QC documentation --- docs/FAQ.md | 17 +++ docs/notebooks/one_list/one_list.ipynb | 15 ++- docs/notebooks/one_search/one_search.ipynb | 18 ++-- one/alf/spec.py | 8 ++ one/api.py | 115 ++++++++++++++------- one/tests/test_one.py | 19 +++- 6 files changed, 141 insertions(+), 51 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index a2bb44ed..562c38a9 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -194,3 +194,20 @@ or provided a different tag (see [this question](#how-do-i-download-the-datasets Second, there are minor differences between the default/local modes and remote mode. Namely that in remote mode queries are generally case-insensitive. See the 'gotcha' section of '[Searching with ONE](notebooks/one_search/one_search.html#Gotchas)' for more information. 
+ +## How do I load datasets that pass quality control +You can first filter sessions by those that the supplied datasets with QC level WARNING or less: + +```python +one = ONE() +# In local and auto mode +eids = one.search(dataset=['trials', 'spikes'], dataset_qc_lte='WARNING') +# In remote mode +eids = one.search(datasets=['trials.table.pqt', 'spikes.times.npy'], dataset_qc_lte='WARNING') +``` + +You can then load the datasets with list_datasets and load_datasets: +```python +dsets = one.list_datasets(eid, qc='WARNING', ignore_qc_not_set=True) +data, info = one.load_datasets(eid, dsets) +``` diff --git a/docs/notebooks/one_list/one_list.ipynb b/docs/notebooks/one_list/one_list.ipynb index d3f91764..31f7e1a0 100644 --- a/docs/notebooks/one_list/one_list.ipynb +++ b/docs/notebooks/one_list/one_list.ipynb @@ -244,7 +244,13 @@ "collections = one.list_collections(eid, filename='*spikes*')\n", "\n", "# All datasets with 'raw' in the name:\n", - "datasets = one.list_datasets(eid, '*raw*')\n" + "datasets = one.list_datasets(eid, '*raw*')\n", + "\n", + "# All datasets with a QC value less than or equal to 'WARNING' (i.e. includes 'PASS', 'NOT_SET' also):\n", + "datasets = one.list_datasets(eid, qc='WARNING')\n", + "\n", + "# All QC'd datasets with a value less than or equal to 'WARNING' (i.e. 'WARNING' or 'PASS'):\n", + "datasets = one.list_datasets(eid, qc='WARNING', ignore_qc_not_set=True)" ], "metadata": { "collapsed": false, @@ -384,7 +390,8 @@ "source": [ "## Combining with load methods\n", "The list methods are useful in combination with the load methods. For example, the output of\n", - "the `list_datasets` method can be a direct input of the `load_datasets` method:" + "the `list_datasets` method can be a direct input of the `load_datasets` method. Here we load all\n", + "spike and cluster datasets where the QC is either PASS or NOT_SET:" ], "metadata": { "collapsed": false @@ -403,7 +410,7 @@ } ], "source": [ - "datasets = one.list_datasets(eid, ['*spikes*', '*clusters*'])\n", + "datasets = one.list_datasets(eid, ['*spikes*', '*clusters*'], qc='PASS', ignore_qc_not_set=False)\n", "data, records = one.load_datasets(eid, datasets)" ], "metadata": { @@ -537,4 +544,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/docs/notebooks/one_search/one_search.ipynb b/docs/notebooks/one_search/one_search.ipynb index 567b8261..e3981b66 100644 --- a/docs/notebooks/one_search/one_search.ipynb +++ b/docs/notebooks/one_search/one_search.ipynb @@ -573,15 +573,19 @@ "As mentioned above, different search terms perform differently. Below are the search terms and their\n", "approximate SQL equivalents:\n", "\n", - "| Term | Lookup |\n", - "|--------------|-----------|\n", - "| dataset | LIKE AND |\n", - "| number | EXACT |\n", - "| date_range | BETWEEN |\n", - "| subject, etc.| LIKE OR |\n", + "| Term | Lookup |\n", + "|-----------------|----------|\n", + "| dataset | LIKE AND |\n", + "| dataset_qc_lte | <= |\n", + "| number | EXACT |\n", + "| date_range | BETWEEN |\n", + "| subject, etc. | LIKE OR |\n", "\n", "Combinations of terms form a logical AND, for example `one.search(subject=['foo', 'bar'], project='baz')`\n", "searches for sessions where the subject name contains foo OR bar, AND the project contains baz.\n", + "NB: When `dataset_qc_lte` which is provided with `dataset(s)`, sessions are returned where ALL matching datasets\n", + "have a less than or equal QC value. 
When `dataset_qc_lte` is provided alone, sessions are returned where\n", + "ANY of the datasets have a less than or equal QC value.\n", "\n", "#### Difference between remote mode search terms\n", "Many search terms perform differently between auto/local mode and [remote mode](../one_modes.html),\n", @@ -591,7 +595,7 @@ "In remote mode there are three ways to search for datasets:\n", "\n", "* **dataset** - a partial, case-insensitive match of a single dataset (multiple datasets not supported).\n", - "* **datasets** - an exact, case-sensitive match of one or more datasets. All datasets must be present.\n", + "* **datasets** - an exact, case-sensitive match of one or more datasets. All datasets must be present. If `dataset_qc` provided, this criterion applies only to these datasets.\n", "* **dataset_type** - an exact, case-sensitive match of one or more [dataset types](../datasets_and_types.html#Dataset-types). All dataset types must be present.\n", "\n", "#### Regex systems between modes\n", diff --git a/one/alf/spec.py b/one/alf/spec.py index 9c29c6d0..b65c6007 100644 --- a/one/alf/spec.py +++ b/one/alf/spec.py @@ -131,10 +131,18 @@ class QC(IntEnum): This enumeration is used by the Alyx database. NB: Pandas cache tables use different codes. """ CRITICAL = 50 + """Dataset practically unusable, e.g. clock can't be aligned; data missing or inaccurate.""" FAIL = 40 + """Dataset does not meet expected standards, e.g. trial event timings different to protocol.""" WARNING = 30 + """ + Dataset has minor quality issues, e.g. relatively high SNR, that should not affect most + analyses. + """ NOT_SET = 0 + """Dataset quality has not been assessed.""" PASS = 10 + """Dataset considered 'gold-standard', e.g. tight trial event timings, low recorded SNR.""" @staticmethod def validate(v): diff --git a/one/api.py b/one/api.py index 5e40304f..614fb3ad 100644 --- a/one/api.py +++ b/one/api.py @@ -27,7 +27,7 @@ import one.alf.files as alfiles import one.alf.exceptions as alferr from .alf.cache import make_parquet_db, DATASETS_COLUMNS, SESSIONS_COLUMNS -from .alf.spec import is_uuid_string +from .alf.spec import is_uuid_string, QC from . import __version__ from one.converters import ConversionMixin, session_record2path import one.util as util @@ -41,7 +41,8 @@ class One(ConversionMixin): """An API for searching and loading data on a local filesystem""" _search_terms = ( - 'dataset', 'date_range', 'laboratory', 'number', 'projects', 'subject', 'task_protocol' + 'dataset', 'date_range', 'laboratory', 'number', + 'projects', 'subject', 'task_protocol', 'dataset_qc' ) uuid_filenames = None @@ -253,13 +254,13 @@ def _update_cache_from_records(self, strict=False, **kwargs): strict : bool If not True, the columns don't need to match. Extra columns in input tables are dropped and missing columns are added and filled with np.nan. - **kwargs - pandas.DataFrame or pandas.Series to insert/update for each table + kwargs + pandas.DataFrame or pandas.Series to insert/update for each table. Returns ------- datetime.datetime: - A timestamp of when the cache was updated + A timestamp of when the cache was updated. Example ------- @@ -272,7 +273,7 @@ def _update_cache_from_records(self, strict=False, **kwargs): When strict is True the input columns must exactly match those oo the cache table, including the order. KeyError - One or more of the keyword arguments does not match a table in One._cache + One or more of the keyword arguments does not match a table in One._cache. 
""" updated = None for table, records in kwargs.items(): @@ -389,9 +390,9 @@ def search(self, details=False, query_type=None, **kwargs): one.search_terms() - For all of the search parameters, a single value or list may be provided. For `dataset`, - the sessions returned will contain all listed datasets. For the other parameters, - the session must contain at least one of the entries. + For all search parameters, a single value or list may be provided. For `dataset`, the + sessions returned will contain all listed datasets. For the other parameters, the session + must contain at least one of the entries. For all but `date_range` and `number`, any field that contains the search string is returned. Wildcards are not permitted, however if wildcards property is True, regular @@ -403,6 +404,10 @@ def search(self, details=False, query_type=None, **kwargs): One or more dataset names. Returns sessions containing all these datasets. A dataset matches if it contains the search string e.g. 'wheel.position' matches '_ibl_wheel.position.npy'. + dataset_qc_lte : str, int, one.alf.spec.QC + A dataset QC value, returns sessions with datasets at or below this QC value, including + those with no QC set. If `dataset` not passed, sessions with any passing QC datasets + are returned, otherwise all matching datasets must have the QC value or below. date_range : str, list, datetime.datetime, datetime.date, pandas.timestamp A single date to search or a list of 2 dates that define the range (inclusive). To define only the upper or lower date bound, set the other element to None. @@ -450,10 +455,20 @@ def search(self, details=False, query_type=None, **kwargs): >>> eids = one.search(date='2023-01-01', lab='churchlandlab', dataset=['trials', 'spikes']) + Search for sessions containing trials and spike data where QC for both are WARNING or less. + + >>> eids = one.search(dataset_qc_lte='WARNING', dataset=['trials', 'spikes']) + + Search for sessions with any datasets that have a QC of PASS or NOT_SET. + + >>> eids = one.search(dataset_qc_lte='PASS') + Notes ----- - In default and local mode, most queries are case-sensitive partial matches. When lists are provided, the search is a logical OR, except for `datasets`, which is a logical AND. + - If `dataset_qc` and `datasets` are defined, the QC criterion only applies to the provided + datasets and all must pass for a session to be returned. - All search terms are true for a session to be returned, i.e. subject matches AND project matches, etc. - In remote mode most queries are case-insensitive partial matches. 
@@ -496,18 +511,22 @@ def sort_fcn(itm): elif key == 'number': query = util.ensure_list(value) sessions = sessions[sessions[key].isin(map(int, query))] - # Dataset check is biggest so this should be done last - elif key == 'dataset': - query = util.ensure_list(value) + # Dataset/QC check is biggest so this should be done last + elif key == 'dataset' or (key == 'dataset_qc_lte' and 'dataset' not in queries): datasets = self._cache['datasets'] + qc = QC.validate(queries.get('dataset_qc_lte', 'FAIL')).name # validate value has_dset = sessions.index.isin(datasets.index.get_level_values('eid')) datasets = datasets.loc[(sessions.index.values[has_dset], ), :] + query = util.ensure_list(value if key == 'dataset' else '') # For each session check any dataset both contains query and exists mask = ( (datasets .groupby('eid', sort=False) - .apply(lambda x: all_present(x['rel_path'], query, x['exists']))) + .apply(lambda x: all_present( + x['rel_path'], query, x['exists'] & x['qc'].le(qc)) + )) ) + # eids of matching dataset records idx = mask[mask].index @@ -676,12 +695,15 @@ def list_subjects(self) -> List[str]: return self._cache['sessions']['subject'].sort_values().unique().tolist() @util.refresh - def list_datasets(self, eid=None, filename=None, collection=None, revision=None, - details=False, query_type=None) -> Union[np.ndarray, pd.DataFrame]: + def list_datasets( + self, eid=None, filename=None, collection=None, revision=None, qc=QC.FAIL, + ignore_qc_not_set=False, details=False, query_type=None + ) -> Union[np.ndarray, pd.DataFrame]: """ - Given an eid, return the datasets for those sessions. If no eid is provided, - a list of all datasets is returned. When details is false, a sorted array of unique - datasets is returned (their relative paths). + Given an eid, return the datasets for those sessions. + + If no eid is provided, a list of all datasets is returned. When details is false, a sorted + array of unique datasets is returned (their relative paths). Parameters ---------- @@ -698,6 +720,11 @@ def list_datasets(self, eid=None, filename=None, collection=None, revision=None, revision : str Filters datasets and returns only the ones matching the revision. Supports asterisks as wildcards. + qc : str, int, one.alf.spec.QC + Returns datasets at or below this QC level. Integer values should correspond to the QC + enumeration NOT the qc category column codes in the pandas table. + ignore_qc_not_set : bool + When true, do not return datasets for which QC is NOT_SET. details : bool When true, a pandas DataFrame is returned, otherwise a numpy array of relative paths (collection/revision/filename) - see one.alf.spec.describe for details. 
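For illustration, the new `qc` and `ignore_qc_not_set` arguments combine with the load methods much as the FAQ entry added earlier in this series describes; the session lookup below assumes at least one matching session exists:

```python
from one.api import ONE

one = ONE()
# Any session whose 'trials' datasets are all at QC WARNING or lower (NOT_SET and PASS included)
eid = one.search(dataset='trials', dataset_qc_lte='WARNING')[0]
# Only QC'd datasets at WARNING or lower (PASS or WARNING; NOT_SET excluded)
dsets = one.list_datasets(eid, qc='WARNING', ignore_qc_not_set=True)
data, records = one.load_datasets(eid, dsets)
```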
@@ -733,8 +760,9 @@ def list_datasets(self, eid=None, filename=None, collection=None, revision=None, """ datasets = self._cache['datasets'] filter_args = dict( - collection=collection, filename=filename, wildcards=self.wildcards, - revision=revision, revision_last_before=False, assert_unique=False) + collection=collection, filename=filename, wildcards=self.wildcards, revision=revision, + revision_last_before=False, assert_unique=False, qc=qc, + ignore_qc_not_set=ignore_qc_not_set) if not eid: datasets = util.filter_datasets(datasets, **filter_args) return datasets.copy() if details else datasets['rel_path'].unique().tolist() @@ -754,8 +782,9 @@ def list_datasets(self, eid=None, filename=None, collection=None, revision=None, def list_collections(self, eid=None, filename=None, collection=None, revision=None, details=False, query_type=None) -> Union[np.ndarray, dict]: """ - List the collections for a given experiment. If no experiment ID is given, - all collections are returned. + List the collections for a given experiment. + + If no experiment ID is given, all collections are returned. Parameters ---------- @@ -824,14 +853,15 @@ def list_collections(self, eid=None, filename=None, collection=None, revision=No def list_revisions(self, eid=None, filename=None, collection=None, revision=None, details=False, query_type=None): """ - List the revisions for a given experiment. If no experiment id is given, - all collections are returned. + List the revisions for a given experiment. + + If no experiment id is given, all collections are returned. Parameters ---------- eid : str, UUID, Path, dict Experiment session identifier; may be a UUID, URL, experiment reference string - details dict or Path + details dict or Path. filename : str, dict, list Filters datasets and returns only the revisions containing matching datasets. Supports lists asterisks as wildcards. May be a dict of ALF parts. @@ -841,14 +871,14 @@ def list_revisions(self, eid=None, filename=None, collection=None, revision=None Filter by a given pattern. Supports asterisks as wildcards. details : bool If true a dict of pandas datasets tables is returned with collections as keys, - otherwise a numpy array of unique collections + otherwise a numpy array of unique collections. query_type : str - Query cache ('local') or Alyx database ('remote') + Query cache ('local') or Alyx database ('remote'). Returns ------- list, dict - A list of unique collections or dict of datasets tables + A list of unique collections or dict of datasets tables. Examples -------- @@ -895,8 +925,9 @@ def load_object(self, download_only: bool = False, **kwargs) -> Union[alfio.AlfBunch, List[Path]]: """ - Load all attributes of an ALF object from a Session ID and an object name. Any datasets - with matching object name will be loaded. + Load all attributes of an ALF object from a Session ID and an object name. + + Any datasets with matching object name will be loaded. Parameters ---------- @@ -915,18 +946,18 @@ def load_object(self, returned (usually the most recent revision). Regular expressions/wildcards not permitted. query_type : str - Query cache ('local') or Alyx database ('remote') + Query cache ('local') or Alyx database ('remote'). download_only : bool When true the data are downloaded and the file path is returned. NB: The order of the file path list is undefined. - **kwargs + kwargs Additional filters for datasets, including namespace and timescale. For full list - see the one.alf.spec.describe function. + see the :func:`one.alf.spec.describe` function. 
Returns ------- one.alf.io.AlfBunch, list - An ALF bunch or if download_only is True, a list of Paths objects + An ALF bunch or if download_only is True, a list of Paths objects. Examples -------- @@ -1707,9 +1738,13 @@ def describe_dataset(self, dataset_type=None): return out @util.refresh - def list_datasets(self, eid=None, filename=None, collection=None, revision=None, - details=False, query_type=None) -> Union[np.ndarray, pd.DataFrame]: - filters = dict(collection=collection, filename=filename, revision=revision) + def list_datasets( + self, eid=None, filename=None, collection=None, revision=None, qc=QC.FAIL, + ignore_qc_not_set=False, details=False, query_type=None + ) -> Union[np.ndarray, pd.DataFrame]: + filters = dict( + collection=collection, filename=filename, revision=revision, + qc=qc, ignore_qc_not_set=ignore_qc_not_set) if (query_type or self.mode) != 'remote': return super().list_datasets(eid, details=details, query_type=query_type, **filters) elif not eid: @@ -1954,6 +1989,8 @@ def search_insertions(self, details=False, query_type=None, **kwargs): '_ibl_wheel.position.npy'. C.f. `datasets` argument. datasets : str, list One or more exact dataset names. Returns insertions containing all these datasets. + dataset_qc_lte : int, str, one.alf.spec.QC + The maximum QC value for associated datasets. dataset_types : str, list One or more dataset_types (exact matching). details : bool @@ -2026,7 +2063,7 @@ def search(self, details=False, query_type=None, **kwargs): one.search_terms(query_type='remote') - For all of the search parameters, a single value or list may be provided. For `dataset`, + For all search parameters, a single value or list may be provided. For `dataset`, the sessions returned will contain all listed datasets. For the other parameters, the session must contain at least one of the entries. @@ -2069,6 +2106,8 @@ def search(self, details=False, query_type=None, **kwargs): One or more of dataset_types. datasets : str, list One or more (exact) dataset names. Returns insertions containing all of these datasets. + dataset_qc_lte : int, str, one.alf.spec.QC + The maximum QC value for associated datasets. details : bool If true also returns a dict of dataset details. 
query_type : str, None diff --git a/one/tests/test_one.py b/one/tests/test_one.py index 1c5821d2..8f49df09 100644 --- a/one/tests/test_one.py +++ b/one/tests/test_one.py @@ -133,7 +133,7 @@ def test_one_search(self): # Search datasets query = 'spikes.depths' - eids = one.search(data=query) + eids = one.search(dataset=query) self.assertTrue(eids) expected = [ 'd3372b15-f696-4279-9be5-98f15783b5bb', @@ -142,13 +142,26 @@ def test_one_search(self): ] self.assertEqual(eids, expected) + # Search QC + dataset + query = ['spikes.depths', 'spikes.times'] + eid = eids[0] + idx = (eid, 'a563480a-6a57-4221-b630-c7be49732ae5') + one._cache['datasets'].loc[idx, 'qc'] = 'FAIL' # Set QC for 1 spikes.times dataset to FAIL + eids = one.search(data=query, qc='WARNING') + self.assertEqual(eids, expected[1:], 'failed to filter FAIL QC') + + # Search QC only - the one session with no WARNING or lower datasets should be excluded + one._cache['datasets'].loc[eid, 'qc'] = 'FAIL' + self.assertNotIn(eid, one.search(qc='WARNING')) + # Filter non-existent # Set exist for one of the eids to false + query = 'spikes.depths' mask = (one._cache['datasets']['rel_path'].str.contains(query)) i = one._cache['datasets'][mask].index[0] one._cache['datasets'].loc[i, 'exists'] = False - self.assertTrue(len(eids) == len(one.search(data=query)) + 1) + self.assertTrue(len(expected) == len(one.search(data=query)) + 1) # Search task_protocol eids = one.search(task='habituation') @@ -987,6 +1000,7 @@ def test_ses2records(self): self.assertCountEqual(expected, datasets.columns) self.assertEqual(tuple(datasets.index.names), ('eid', 'id')) self.assertTrue(datasets.default_revision.all()) + self.assertIsInstance(datasets.qc.dtype, pd.CategoricalDtype) # Check behaviour when no datasets present ses['data_dataset_session_related'] = [] @@ -1005,6 +1019,7 @@ def test_datasets2records(self): expected = self.one._cache['datasets'].columns self.assertCountEqual(expected, (x for x in datasets.columns if x != 'default_revision')) self.assertEqual(tuple(datasets.index.names), ('eid', 'id')) + self.assertIsInstance(datasets.qc.dtype, pd.CategoricalDtype) # Test extracts additional fields fields = ('url', 'auto_datetime') From 70abd5a5c424650bb03b80815e8479f0f06bced0 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Tue, 27 Feb 2024 08:06:39 +0000 Subject: [PATCH 09/20] Fix tests --- one/alf/cache.py | 9 ++++++--- one/api.py | 2 +- one/tests/test_alyxclient.py | 2 +- one/tests/test_one.py | 22 +++++++++++++--------- one/tests/test_params.py | 2 +- one/tests/util.py | 4 +++- one/util.py | 3 +-- 7 files changed, 26 insertions(+), 18 deletions(-) diff --git a/one/alf/cache.py b/one/alf/cache.py index c6b7965e..4cfde509 100644 --- a/one/alf/cache.py +++ b/one/alf/cache.py @@ -30,6 +30,7 @@ from one.alf.io import iter_sessions, iter_datasets from one.alf.files import session_path_parts, get_alf_path from one.converters import session_record2path +from one.util import QC_TYPE __all__ = ['make_parquet_db', 'remove_missing_datasets', 'DATASETS_COLUMNS', 'SESSIONS_COLUMNS'] _logger = logging.getLogger(__name__) @@ -56,6 +57,7 @@ 'file_size', # file size in bytes 'hash', # sha1/md5, computed in load function 'exists', # bool + 'qc', # one.util.QC_TYPE ) @@ -91,7 +93,8 @@ def _get_dataset_info(full_ses_path, rel_dset_path, ses_eid=None, compute_hash=F 'rel_path': Path(rel_dset_path).as_posix(), 'file_size': file_size, 'hash': md5(full_dset_path) if compute_hash else None, - 'exists': True + 'exists': True, + 'qc': 'NOT_SET' } @@ -190,7 +193,7 @@ def 
_make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame: pandas.DataFrame A pandas DataFrame of dataset info. """ - df = pd.DataFrame([], columns=DATASETS_COLUMNS) + df = pd.DataFrame([], columns=DATASETS_COLUMNS).astype({'qc': QC_TYPE}) # Go through sessions and append datasets for session_path in iter_sessions(root_dir): rows = [] @@ -200,7 +203,7 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame: rows.append(file_info) df = pd.concat((df, pd.DataFrame(rows, columns=DATASETS_COLUMNS)), ignore_index=True, verify_integrity=True) - return df + return df.astype({'qc': QC_TYPE}) def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None): diff --git a/one/api.py b/one/api.py index 614fb3ad..7925f174 100644 --- a/one/api.py +++ b/one/api.py @@ -42,7 +42,7 @@ class One(ConversionMixin): """An API for searching and loading data on a local filesystem""" _search_terms = ( 'dataset', 'date_range', 'laboratory', 'number', - 'projects', 'subject', 'task_protocol', 'dataset_qc' + 'projects', 'subject', 'task_protocol', 'dataset_qc_lte' ) uuid_filenames = None diff --git a/one/tests/test_alyxclient.py b/one/tests/test_alyxclient.py index 371fc548..728c033c 100644 --- a/one/tests/test_alyxclient.py +++ b/one/tests/test_alyxclient.py @@ -104,7 +104,7 @@ def test_auth_methods(self): # Test download cache tables self.ac.logout() self.assertFalse(self.ac.is_logged_in) - url = self.ac.get('cache/info')['location'] + url = self.ac.get('cache/info').get('location') self.ac.download_cache_tables(url) self.assertTrue(self.ac.is_logged_in) diff --git a/one/tests/test_one.py b/one/tests/test_one.py index 8f49df09..f1a1cdad 100644 --- a/one/tests/test_one.py +++ b/one/tests/test_one.py @@ -147,12 +147,12 @@ def test_one_search(self): eid = eids[0] idx = (eid, 'a563480a-6a57-4221-b630-c7be49732ae5') one._cache['datasets'].loc[idx, 'qc'] = 'FAIL' # Set QC for 1 spikes.times dataset to FAIL - eids = one.search(data=query, qc='WARNING') + eids = one.search(dataset=query, dataset_qc='WARNING') self.assertEqual(eids, expected[1:], 'failed to filter FAIL QC') # Search QC only - the one session with no WARNING or lower datasets should be excluded one._cache['datasets'].loc[eid, 'qc'] = 'FAIL' - self.assertNotIn(eid, one.search(qc='WARNING')) + self.assertNotIn(eid, one.search(dataset_qc='WARNING')) # Filter non-existent # Set exist for one of the eids to false @@ -161,7 +161,7 @@ def test_one_search(self): i = one._cache['datasets'][mask].index[0] one._cache['datasets'].loc[i, 'exists'] = False - self.assertTrue(len(expected) == len(one.search(data=query)) + 1) + self.assertTrue(len(expected) == len(one.search(dataset=query)) + 1) # Search task_protocol eids = one.search(task='habituation') @@ -876,7 +876,8 @@ def test_save_loaded_ids(self): dset.name = (eid, str(uuid4())) old_cache = self.one._cache['datasets'] try: - self.one._cache['datasets'] = pd.concat([self.one._cache.datasets, dset.to_frame().T]) + datasets = pd.concat([self.one._cache.datasets, dset.to_frame().T]).astype(old_cache.dtypes) + self.one._cache['datasets'] = datasets dsets = [dset['rel_path'], '_ibl_trials.feedback_times.npy'] new_files, rec = self.one.load_datasets(eid, dsets, assert_present=False) loaded = self.one._cache['_loaded_datasets'] @@ -1266,7 +1267,8 @@ def test_list_aggregates(self): 'trials.table.3ef042c6-82a4-426c-9aa9-be3b48d86652.pqt', 'data_repository': 'aws_aggregates', 'exists': True}], - 'default_dataset': True}, + 'default_dataset': True, + 'qc': 'NOT_SET'}, {'url': 
'7bdb08d6-b166-43d8-8981-816cf616d291', 'session': None, 'file_size': '', 'hash': '', 'file_records': [{'data_url': 'https://ibl.flatironinstitute.org/' @@ -1274,7 +1276,8 @@ def test_list_aggregates(self): 'trials.table.7bdb08d6-b166-43d8-8981-816cf616d291.pqt', 'data_repository': 'flatiron_aggregates', 'exists': True}], - 'default_dataset': True}, + 'default_dataset': True, + 'qc': 'NOT_SET'}, ] with mock.patch.object(self.one.alyx, 'rest', return_value=mock_ds): self.assertEqual(len(self.one.list_aggregates('subjects')), 2) @@ -1459,6 +1462,7 @@ def test_search_insertions(self): def test_search_terms(self): """Test OneAlyx.search_terms""" + assert self.one.mode != 'remote' search1 = self.one.search_terms() self.assertIn('dataset', search1) @@ -1633,7 +1637,7 @@ def test_download_aws(self): # Check output filename _, local = method.call_args.args self.assertTrue(local.as_posix().startswith(self.one.cache_dir.as_posix())) - self.assertTrue(local.as_posix().endswith(dsets.iloc[-1, -1])) + self.assertTrue(local.as_posix().endswith(dsets.iloc[-1]['rel_path'])) # Check keep_uuid = True self.one._download_datasets(dsets, keep_uuid=True) _, local = method.call_args.args @@ -1642,7 +1646,7 @@ def test_download_aws(self): # Test behaviour when dataset not remotely accessible dsets = dsets[:1].copy() rec = self.one.alyx.rest('datasets', 'read', id=dsets.index[0]) - # need to find the index of matching aws repo, this is not constant accross releases + # need to find the index of matching aws repo, this is not constant across releases iaws = list(map(lambda x: x['data_repository'].startswith('aws'), rec['file_records'])).index(True) rec['file_records'][iaws]['exists'] = False # Set AWS file record to non-existent @@ -1816,7 +1820,7 @@ def mock_input(prompt): pars = one.params.get(url) self.assertFalse('ALYX_PWD' in pars.as_dict()) self.assertEqual(one_obj.alyx._par.ALYX_URL, url) - client_pars = Path(self.tempdir.name).rglob(f'.{one_obj.alyx.base_url.split("/")[-1]}') + client_pars = Path(self.tempdir.name).rglob(f'.{one_obj.alyx.base_url.split("/")[-1]}'.replace(':', '_')) self.assertEqual(len(list(client_pars)), 1) # Save ALYX_PWD into params and see if setup modifies it with mock.patch('iblutil.io.params.getfile', new=self.get_file): diff --git a/one/tests/test_params.py b/one/tests/test_params.py index b758c3fc..e38958b2 100644 --- a/one/tests/test_params.py +++ b/one/tests/test_params.py @@ -19,7 +19,7 @@ class TestParamSetup(unittest.TestCase): def setUp(self) -> None: self.par_dir = tempfile.TemporaryDirectory() self.addCleanup(self.par_dir.cleanup) - self.url = TEST_DB_1['base_url'][8:] # URL without schema + self.url = TEST_DB_1['base_url'][8:].replace(':', '_') # URL without schema # Change the location of the parameters to our temp dir get_file = partial(util.get_file, self.par_dir.name) self.get_file_mock = mock.patch('iblutil.io.params.getfile', new=get_file) diff --git a/one/tests/util.py b/one/tests/util.py index 93f912bc..fee30011 100644 --- a/one/tests/util.py +++ b/one/tests/util.py @@ -10,6 +10,7 @@ from iblutil.io.params import set_hidden import one.params +from one.util import QC_TYPE def set_up_env() -> tempfile.TemporaryDirectory: @@ -149,11 +150,12 @@ def revisions_datasets_table(collections=('', 'alf/probe00', 'alf/probe01'), 'file_size': None, 'hash': None, 'exists': True, + 'qc': 'NOT_SET', 'eid': str(uuid4()), 'id': map(str, (uuid4() for _ in rel_path)) } - return pd.DataFrame(data=d).set_index(['eid', 'id']) + return pd.DataFrame(data=d).astype({'qc': 
QC_TYPE}).set_index(['eid', 'id']) def create_schema_cache(param_dir=None): diff --git a/one/util.py b/one/util.py index e8193afd..e7602def 100644 --- a/one/util.py +++ b/one/util.py @@ -387,8 +387,7 @@ def filter_datasets( path_match = all_datasets['rel_path'].str.match(pattern) # Test on QC outcome - if not isinstance(qc, QC): # cast to QC enum for validation - qc = QC[qc] if isinstance(qc, str) else QC(qc) + qc = QC.validate(qc) qc_match = all_datasets['qc'].le(qc.name) if ignore_qc_not_set: qc_match &= all_datasets['qc'].ne('NOT_SET') From 2e486cb7da695ab42f4b930d8b679edf746f5e50 Mon Sep 17 00:00:00 2001 From: olivier Date: Fri, 1 Mar 2024 15:09:51 +0000 Subject: [PATCH 10/20] Release candidate --- CHANGELOG.md | 28 +++++++++++++++++++++++++--- one/__init__.py | 2 +- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16b0b6e8..e5b76d51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,34 @@ # Changelog -## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.6.0] +## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.7rc0] +This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC filters. This version no longer supports 'data' search filter. + +### Added + +- support for Alyx v2.0.0 +- support for pandas v3.0.0 +- one.alf.spec.QC enumeration +- ONE_HTTP_DL_THREADS environment variable allows user to specify maximum number of threads to use +- github workflow for releasing to PyPi + +### Modified + +- support 'qc' category field in dataset cache table +- One.search supports ´dataset_qc_lte` filter +- One.list_datasets supports ´dataset_qc_lte` and `ignore_qc_not_set` filters +- one.alf.io.iter_sessions more performant + +### Removed + +- One.search no longer supports 'data' filter: kwarg must be 'dataset' + +## [2.6.0] ### Modified -- `one.load_dataset` + +- One.load_dataset - add an option to skip computing hash for existing files when loading datasets `check_hash=False` - check filesize before computing hash for performance - ## [2.5.5] ### Modified diff --git a/one/__init__.py b/one/__init__.py index df50e79e..c7561959 100644 --- a/one/__init__.py +++ b/one/__init__.py @@ -1,2 +1,2 @@ """The Open Neurophysiology Environment (ONE) API.""" -__version__ = '2.7.0' +__version__ = '2.7rc0' From 05cbe0e4fb9532a3ddb8312d3fcceb2757a8f84f Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Wed, 6 Mar 2024 16:28:48 -0500 Subject: [PATCH 11/20] parse attribute in to_alf --- README.md | 2 +- one/alf/spec.py | 6 ++++-- one/registration.py | 4 ++-- one/tests/alf/test_alf_spec.py | 6 ++++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7560ca2d..5c8f75aa 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Coverage Status](https://coveralls.io/repos/github/int-brain-lab/ONE/badge.svg?branch=main)](https://coveralls.io/github/int-brain-lab/ONE?branch=main) ![CI workflow](https://github.com/int-brain-lab/ONE/actions/workflows/main.yaml/badge.svg?branch=main) -The Open Neurophysiology Environment is a scheme for sharing neurophysiology data in a standardized manner. It is a Python API for searching and loading ONE-standardized data, stored either on a user’s local machine or on a remote server. +The Open Neurophysiology Environment is a scheme for sharing neurophysiology data in a standardized manner. It is a Python API for searching and loading ONE-standardized data, stored either on a user's local machine or on a remote server. 
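For example, a minimal sketch of a typical workflow (the Alyx URL, subject and dataset names below are placeholders, and the `dataset_qc_lte` and `ignore_qc_not_set` filters assume version 2.7 or later):

```python
from one.api import ONE

# Connect to an Alyx instance (placeholder URL); with no arguments ONE uses
# the locally configured database or cache directory.
one = ONE(base_url='https://alyx.example.org')

# Search for sessions containing a dataset, keeping only sessions whose matching
# datasets have a QC outcome of WARNING or better.
eids = one.search(subject='SW_023', dataset='spikes.times', dataset_qc_lte='WARNING')

# List datasets for the first session, ignoring those whose QC is NOT_SET,
# then load one of them.
datasets = one.list_datasets(eids[0], ignore_qc_not_set=True)
spike_times = one.load_dataset(eids[0], 'spikes.times')
```
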
Please [Click here](https://int-brain-lab.github.io/ONE/) for the main documentation page. For a quick primer on the file naming convention we use, [click here](https://github.com/int-brain-lab/ONE/blob/main/docs/Open_Neurophysiology_Environment_Filename_Convention.pdf). diff --git a/one/alf/spec.py b/one/alf/spec.py index b65c6007..6fa018c3 100644 --- a/one/alf/spec.py +++ b/one/alf/spec.py @@ -425,8 +425,6 @@ def to_alf(object, attribute, extension, namespace=None, timescale=None, extra=N raise TypeError('An extension must be provided') elif extension.startswith('.'): extension = extension[1:] - if re.search('_(?!times$|intervals)', attribute): - raise ValueError('Object attributes must not contain underscores') if any(pt is not None and '.' in pt for pt in (object, attribute, namespace, extension, timescale)): raise ValueError('ALF parts must not contain a period (`.`)') @@ -438,6 +436,10 @@ def to_alf(object, attribute, extension, namespace=None, timescale=None, extra=N if timescale: timescale = filter(None, [timescale] if isinstance(timescale, str) else timescale) timescale = '_'.join(map(_dromedary, timescale)) + # Convert attribute to camel case, leaving '_times', etc. in tact + times_re = re.search('_(times|timestamps|intervals)$', attribute) + idx = times_re.start() if times_re else len(attribute) + attribute = _dromedary(attribute[:idx]) + attribute[idx:] object = _dromedary(object) # Optional extras may be provided as string or tuple of strings diff --git a/one/registration.py b/one/registration.py index 3a2f4897..d38b1eef 100644 --- a/one/registration.py +++ b/one/registration.py @@ -150,7 +150,7 @@ def create_new_session(self, subject, session_root=None, date=None, register=Tru An optional date for the session. If None the current time is used. register : bool If true, create session record on Alyx database. - **kwargs + kwargs Optional arguments for RegistrationClient.register_session. Returns @@ -421,7 +421,7 @@ def register_files(self, file_list, exists : bool Whether files exist in the repository. May be set to False when registering files before copying to the repository. - **kwargs + kwargs Extra arguments directly passed as REST request data to /register-files endpoint. 
Returns diff --git a/one/tests/alf/test_alf_spec.py b/one/tests/alf/test_alf_spec.py index 9532df50..72e82153 100644 --- a/one/tests/alf/test_alf_spec.py +++ b/one/tests/alf/test_alf_spec.py @@ -195,11 +195,13 @@ def test_to_alf(self): self.assertEqual(filename, '_ibl_spikes.times_ephysClock_minutes.ssv') filename = alf_spec.to_alf('wheel', 'timestamps', '.npy', 'ibl', 'bpod', ('raw', 'v12')) self.assertEqual(filename, '_ibl_wheel.timestamps_bpod.raw.v12.npy') + filename = alf_spec.to_alf('obj', 'attr_timestamps', '.npy') + self.assertEqual(filename, 'obj.attr_timestamps.npy') + filename = alf_spec.to_alf('obj', 'foo_bar_intervals', '.npy') + self.assertEqual(filename, 'obj.fooBar_intervals.npy') with self.assertRaises(TypeError): alf_spec.to_alf('spikes', 'times', '') - with self.assertRaises(ValueError): - alf_spec.to_alf('spikes', 'foo_bar', 'npy') with self.assertRaises(ValueError): alf_spec.to_alf('spikes.times', 'fooBar', 'npy') with self.assertRaises(ValueError): From 1098fd1e1e4570c22cf19ef5db0a891c731f09c8 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Mon, 11 Mar 2024 18:01:34 +0200 Subject: [PATCH 12/20] pattern arg for iter_sessions --- CHANGELOG.md | 4 ++-- one/__init__.py | 2 +- one/alf/io.py | 19 ++++++++++++++----- one/tests/alf/test_alf_io.py | 8 ++++---- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5b76d51..5e883bbd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ # Changelog -## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.7rc0] +## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.7rc1] This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC filters. This version no longer supports 'data' search filter. ### Added @@ -15,7 +15,7 @@ This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC - support 'qc' category field in dataset cache table - One.search supports ´dataset_qc_lte` filter - One.list_datasets supports ´dataset_qc_lte` and `ignore_qc_not_set` filters -- one.alf.io.iter_sessions more performant +- one.alf.io.iter_sessions pattern arg to make more performant ### Removed diff --git a/one/__init__.py b/one/__init__.py index c7561959..e3748053 100644 --- a/one/__init__.py +++ b/one/__init__.py @@ -1,2 +1,2 @@ """The Open Neurophysiology Environment (ONE) API.""" -__version__ = '2.7rc0' +__version__ = '2.7rc1' diff --git a/one/alf/io.py b/one/alf/io.py index 8c9753f7..b2871684 100644 --- a/one/alf/io.py +++ b/one/alf/io.py @@ -375,7 +375,7 @@ def _ls(alfpath, object=None, **kwargs) -> (list, tuple): return [alfpath.joinpath(f) for f in files_alf], attributes -def iter_sessions(root_dir, lab_folders=False): +def iter_sessions(root_dir, pattern='*'): """ Recursively iterate over session paths in a given directory. @@ -383,16 +383,25 @@ def iter_sessions(root_dir, lab_folders=False): ---------- root_dir : str, pathlib.Path The folder to look for sessions. - lab_folders : bool - If true, glob pattern reflects `root_dir` containing /Subjects folders. This is - slightly more performant when Subjects folders present. + pattern : str + Glob pattern to use. Default searches all folders. Providing a more specific pattern makes + this more performant (see examples). Yields ------- pathlib.Path The next session path in lexicographical order. 
+ + Examples + -------- + Efficient iteration when `root_dir` contains /Subjects folders + + >>> sessions = list(iter_sessions(root_dir, pattern='*/Subjects/*/????-??-??/*')) + + Efficient iteration when `root_dir` contains subject folders + + >>> sessions = list(iter_sessions(root_dir, pattern='*/????-??-??/*')) """ - pattern = ('*/Subjects/' if lab_folders else '') + '*/????-??-??/*' if spec.is_session_path(root_dir): yield root_dir for path in sorted(Path(root_dir).rglob(pattern)): diff --git a/one/tests/alf/test_alf_io.py b/one/tests/alf/test_alf_io.py index 6c0449ff..5935d8f5 100644 --- a/one/tests/alf/test_alf_io.py +++ b/one/tests/alf/test_alf_io.py @@ -677,13 +677,13 @@ def test_iter_sessions(self): self.assertFalse(next(valid_sessions, False)) # makes sure that the session path returns itself on the iterator self.assertEqual(self.session_path, next(alfio.iter_sessions(self.session_path))) - # test lab_folders arg - valid_sessions = alfio.iter_sessions(self.tempdir.name, lab_folders=True) + # test pattern arg + valid_sessions = alfio.iter_sessions(self.tempdir.name, pattern='*/Subjects/*/????-??-??/*') self.assertEqual(self.session_path, next(valid_sessions)) subjects_path = Path(self.tempdir.name, 'fakelab', 'Subjects') - valid_sessions = alfio.iter_sessions(subjects_path, lab_folders=False) + valid_sessions = alfio.iter_sessions(subjects_path, pattern='*/????-??-??/*') self.assertEqual(self.session_path, next(valid_sessions)) - valid_sessions = alfio.iter_sessions(subjects_path, lab_folders=True) + valid_sessions = alfio.iter_sessions(subjects_path, pattern='*/Subjects/*/????-??-??/*') self.assertFalse(next(valid_sessions, False)) def test_iter_datasets(self): From bd36f75e8d71fb0719d0aaed0c59809b96d09110 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Wed, 13 Mar 2024 18:30:07 +0200 Subject: [PATCH 13/20] Ensure returned records always the same length as input file list --- one/__init__.py | 2 +- one/registration.py | 24 +++++++++++++++++++----- one/tests/alf/test_alf_io.py | 3 ++- one/tests/test_one.py | 14 ++++++++------ one/tests/test_registration.py | 27 ++++++++++++++------------- 5 files changed, 44 insertions(+), 26 deletions(-) diff --git a/one/__init__.py b/one/__init__.py index e3748053..767a98d1 100644 --- a/one/__init__.py +++ b/one/__init__.py @@ -1,2 +1,2 @@ """The Open Neurophysiology Environment (ONE) API.""" -__version__ = '2.7rc1' +__version__ = '2.7rc2' diff --git a/one/registration.py b/one/registration.py index d38b1eef..9e504d62 100644 --- a/one/registration.py +++ b/one/registration.py @@ -446,8 +446,10 @@ def register_files(self, file_list, """ F = defaultdict(list) # empty map whose keys will be session paths V = defaultdict(list) # empty map for versions - if isinstance(file_list, (str, pathlib.Path)): + if single_file := isinstance(file_list, (str, pathlib.Path)): file_list = [file_list] + else: + file_list = list(file_list) # Ensure not generator if versions is None or isinstance(versions, str): versions = itertools.repeat(versions) @@ -457,6 +459,9 @@ def register_files(self, file_list, # Filter valid files and sort by session for fn, ver in zip(map(pathlib.Path, file_list), versions): session_path = get_session_path(fn) + if not session_path: + _logger.debug(f'{fn}: Invalid session path') + continue if fn.suffix not in self.file_extensions: _logger.debug(f'{fn}: No matching extension "{fn.suffix}" in database') continue @@ -469,7 +474,7 @@ def register_files(self, file_list, V[session_path].append(ver) # For each unique session, make a separate 
POST request - records = [] + records = [None] * (len(F) if dry else len(file_list)) # If dry return data per session for session_path, files in F.items(): # this is the generic relative path: subject/yyyy-mm-dd/NNN details = session_path_parts(session_path.as_posix(), as_dict=True, assert_valid=True) @@ -501,8 +506,15 @@ def register_files(self, file_list, if details['lab'] and 'labs' not in kwargs: r_['labs'] = details['lab'] # If dry, store POST data, otherwise store resulting file records + if dry: + records[list(F).index(session_path)] = r_ + continue try: - records.append(r_ if dry else self.one.alyx.post('/register-file', data=r_)) + response = self.one.alyx.post('/register-file', data=r_) + # Ensure we keep the order of the output records: the files missing will remain + # as None type + for f, r in zip(files, response): + records[file_list.index(session_path / f)] = r except requests.exceptions.HTTPError as err: # 403 response when datasets already registered and protected by tags err_message = err.response.json() @@ -602,7 +614,9 @@ def register_files(self, file_list, r_['filesizes'] = [session_path.joinpath(p).stat().st_size for p in new_file_list] r_['check_protected'] = False # Speed things up by ignoring server-side checks - records.append(self.one.alyx.post('/register-file', data=r_)) + response = self.one.alyx.post('/register-file', data=r_) + for f, r in zip(files, response): # Populate records list in correct order + records[file_list.index(session_path / f)] = r files = new_file_list # Log file names @@ -610,7 +624,7 @@ def register_files(self, file_list, for p in files: _logger.info(f'ALYX REGISTERED DATA: {p}') - return records[0] if len(F.keys()) == 1 else records + return records[0] if single_file else records @staticmethod def _next_revision(revision: str, reserved: list = None, alpha: str = 'a') -> str: diff --git a/one/tests/alf/test_alf_io.py b/one/tests/alf/test_alf_io.py index 5935d8f5..aff1735e 100644 --- a/one/tests/alf/test_alf_io.py +++ b/one/tests/alf/test_alf_io.py @@ -678,7 +678,8 @@ def test_iter_sessions(self): # makes sure that the session path returns itself on the iterator self.assertEqual(self.session_path, next(alfio.iter_sessions(self.session_path))) # test pattern arg - valid_sessions = alfio.iter_sessions(self.tempdir.name, pattern='*/Subjects/*/????-??-??/*') + valid_sessions = alfio.iter_sessions( + self.tempdir.name, pattern='*/Subjects/*/????-??-??/*') self.assertEqual(self.session_path, next(valid_sessions)) subjects_path = Path(self.tempdir.name, 'fakelab', 'Subjects') valid_sessions = alfio.iter_sessions(subjects_path, pattern='*/????-??-??/*') diff --git a/one/tests/test_one.py b/one/tests/test_one.py index f1a1cdad..1f91a50c 100644 --- a/one/tests/test_one.py +++ b/one/tests/test_one.py @@ -1,6 +1,6 @@ -"""Unit tests for the one.api module +"""Unit tests for the one.api module. -Wherever possible the ONE tests should not rely on an internet connection +Wherever possible the ONE tests should not rely on an internet connection. Fixture locations: @@ -21,9 +21,9 @@ - When verifying remote changes via the rest method, use the no_cache flag to ensure the remote databaseis queried. You can clear the cache using AlyxClient.clear_rest_cache(), - or mock iblutil.io.params.getfile to return a temporary cache directory + or mock iblutil.io.params.getfile to return a temporary cache directory. 
- An One object created through the one.api.ONE function, make sure you restore the - properties to their original state on teardown, or call one.api.ONE.cache_clear() + properties to their original state on teardown, or call one.api.ONE.cache_clear(). """ import datetime @@ -876,7 +876,8 @@ def test_save_loaded_ids(self): dset.name = (eid, str(uuid4())) old_cache = self.one._cache['datasets'] try: - datasets = pd.concat([self.one._cache.datasets, dset.to_frame().T]).astype(old_cache.dtypes) + datasets = [self.one._cache.datasets, dset.to_frame().T] + datasets = pd.concat(datasets).astype(old_cache.dtypes) self.one._cache['datasets'] = datasets dsets = [dset['rel_path'], '_ibl_trials.feedback_times.npy'] new_files, rec = self.one.load_datasets(eid, dsets, assert_present=False) @@ -1820,7 +1821,8 @@ def mock_input(prompt): pars = one.params.get(url) self.assertFalse('ALYX_PWD' in pars.as_dict()) self.assertEqual(one_obj.alyx._par.ALYX_URL, url) - client_pars = Path(self.tempdir.name).rglob(f'.{one_obj.alyx.base_url.split("/")[-1]}'.replace(':', '_')) + client = f'.{one_obj.alyx.base_url.split("/")[-1]}'.replace(':', '_') + client_pars = Path(self.tempdir.name).rglob(client) self.assertEqual(len(list(client_pars)), 1) # Save ALYX_PWD into params and see if setup modifies it with mock.patch('iblutil.io.params.getfile', new=self.get_file): diff --git a/one/tests/test_registration.py b/one/tests/test_registration.py index 70910fc4..36f81f19 100644 --- a/one/tests/test_registration.py +++ b/one/tests/test_registration.py @@ -77,7 +77,7 @@ def setUp(self) -> None: self.client = registration.RegistrationClient(one=self.one) def test_water_administration(self): - """Test for RegistrationClient.register_water_administration""" + """Test for RegistrationClient.register_water_administration.""" record = self.client.register_water_administration(self.subject, 35.10000000235) self.assertEqual(record['subject'], self.subject) self.assertEqual(record['water_administered'], 35.1) @@ -105,7 +105,7 @@ def test_water_administration(self): self.client.register_water_administration(self.subject, 3.6, session=ses['url']) def test_register_weight(self): - """Test for RegistrationClient.register_weight""" + """Test for RegistrationClient.register_weight.""" record = self.client.register_weight(self.subject, 35.10000000235) self.assertEqual(record['subject'], self.subject) self.assertEqual(record['weight'], 35.1) @@ -115,7 +115,7 @@ def test_register_weight(self): self.client.register_weight(self.subject, 0.0) def test_ensure_ISO8601(self): - """Test for RegistrationClient.ensure_ISO8601""" + """Test for RegistrationClient.ensure_ISO8601.""" date = datetime.datetime(2021, 7, 14, 15, 53, 15, 525119) self.assertEqual(self.client.ensure_ISO8601(date), '2021-07-14T15:53:15.525119') self.assertEqual(self.client.ensure_ISO8601(date.date()), '2021-07-14T00:00:00') @@ -125,7 +125,7 @@ def test_ensure_ISO8601(self): self.client.ensure_ISO8601(f'{date:%D}') def test_exists(self): - """Test for RegistrationClient.assert_exists""" + """Test for RegistrationClient.assert_exists.""" # Check user endpoint with self.assertRaises(alferr.AlyxSubjectNotFound): self.client.assert_exists('foobar', 'subjects') @@ -142,7 +142,7 @@ def test_exists(self): self.client.assert_exists('foobar', 'subjects') def test_find_files(self): - """Test for RegistrationClient.find_files""" + """Test for RegistrationClient.find_files.""" # Remove a dataset type from the client to check that the dataset(s) are ignored existing = (x['filename_pattern'] and 
any(self.session_path.rglob(x['filename_pattern'])) for x in self.client.dtypes) @@ -154,7 +154,7 @@ def test_find_files(self): self.assertFalse(fnmatch.filter([x.name for x in files], removed['filename_pattern'])) def test_create_new_session(self): - """Test for RegistrationClient.create_new_session""" + """Test for RegistrationClient.create_new_session.""" # Check register = True session_path, eid = self.client.create_new_session( self.subject, date='2020-01-01', projects='ibl_neuropixel_brainwide_01' @@ -173,7 +173,7 @@ def test_create_new_session(self): self.assertIsNone(eid) def test_register_session(self): - """Test for RegistrationClient.register_session""" + """Test for RegistrationClient.register_session.""" # Find some datasets to create datasets = self.one.list_datasets(self.one.search(dataset='raw')[0]) session_path = self.one.alyx.cache_dir.joinpath( @@ -204,7 +204,7 @@ def test_register_session(self): self.assertEqual(start_time, ses['start_time']) def test_create_sessions(self): - """Test for RegistrationClient.create_sessions""" + """Test for RegistrationClient.create_sessions.""" session_path = self.session_path.parent / next_num_folder(self.session_path.parent) session_path.mkdir(parents=True) session_path.joinpath('create_me.flag').touch() @@ -223,7 +223,7 @@ def test_create_sessions(self): self.assertEqual(session_paths[0], session_path) def test_register_files(self): - """Test for RegistrationClient.register_files""" + """Test for RegistrationClient.register_files.""" # Test a few things not checked in register_session session_path, eid = self.client.create_new_session(self.subject) @@ -245,7 +245,8 @@ def test_register_files(self): ambiguous = self.client.dtypes[-1]['filename_pattern'].replace('*', 'npy') files = [session_path.joinpath('wheel.position.xxx'), # Unknown ext session_path.joinpath('foo.bar.npy'), # Unknown dtype - session_path.joinpath(ambiguous) # Ambiguous dtype + session_path.joinpath(ambiguous), # Ambiguous dtype + session_path.with_name('foo').joinpath('spikes.times.npy') # Invalid session ] version = ['1.2.9'] * len(files) with self.assertLogs('one.registration', logging.DEBUG) as dbg: @@ -253,7 +254,7 @@ def test_register_files(self): self.assertIn('wheel.position.xxx: No matching extension', dbg.records[0].message) self.assertRegex(dbg.records[1].message, 'No dataset type .* "foo.bar.npy"') self.assertRegex(dbg.records[2].message, f'Multiple matching .* "{ambiguous}"') - self.assertFalse(len(rec)) + self.assertEqual([None] * len(files), rec) # Check the handling of revisions rec, = self.client.register_files(str(file_name)) @@ -365,7 +366,7 @@ def test_register_files(self): self.assertRaises(HTTPError, self.client.register_files, file_list=[file]) def test_next_revision(self): - """Test RegistrationClient._next_revision method""" + """Test RegistrationClient._next_revision method.""" self.assertEqual('2020-01-01a', self.client._next_revision('2020-01-01')) reserved = ['2020-01-01a', '2020-01-01b'] self.assertEqual('2020-01-01c', self.client._next_revision('2020-01-01', reserved)) @@ -374,7 +375,7 @@ def test_next_revision(self): self.assertRaises(TypeError, self.client._next_revision, '2020-01-01', alpha='do') def test_instantiation(self): - """Test RegistrationClient.__init__ with no args""" + """Test RegistrationClient.__init__ with no args.""" with unittest.mock.patch('one.registration.ONE') as mk: client = registration.RegistrationClient() self.assertIsInstance(client.one, unittest.mock.MagicMock) From e163975bce50d51eb23a06a01a1f7701589608f7 
Mon Sep 17 00:00:00 2001 From: Mayo Faulkner Date: Wed, 20 Mar 2024 17:20:01 +0000 Subject: [PATCH 14/20] method to check whether datasets are protected --- one/registration.py | 52 ++++++++++++++++++++++++++++++++++ one/tests/test_registration.py | 23 +++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/one/registration.py b/one/registration.py index 8cb0f170..ea64bc0c 100644 --- a/one/registration.py +++ b/one/registration.py @@ -392,6 +392,58 @@ def register_session(self, ses_path, users=None, file_list=True, **kwargs): session['data_dataset_session_related'] = ensure_list(recs) return session, recs + def check_protected_files(self, file_list, created_by=None): + """ + Check whether a set of files associated to a session are protected + Parameters + ---------- + file_list : list, str, pathlib.Path + A filepath (or list thereof) of ALF datasets to register to Alyx. + created_by : str + Name of Alyx user (defaults to whoever is logged in to ONE instance). + + Returns + ------- + list of dicts, dict + A status for each session whether any of the files specified are protected + datasets or not.If none of the datasets are protected, a response with status + 200 is returned, if any of the files are protected a response with status + 403 is returned. + """ + + F = defaultdict(list) # empty map whose keys will be session paths + + if isinstance(file_list, (str, pathlib.Path)): + file_list = [file_list] + + # Filter valid files and sort by session + for fn in map(pathlib.Path, file_list): + session_path = get_session_path(fn) + if fn.suffix not in self.file_extensions: + _logger.debug(f'{fn}: No matching extension "{fn.suffix}" in database') + continue + try: + get_dataset_type(fn, self.dtypes) + except ValueError as ex: + _logger.debug('%s', ex.args[0]) + continue + F[session_path].append(fn.relative_to(session_path)) + + # For each unique session, make a separate POST request + records = [] + for session_path, files in F.items(): + # this is the generic relative path: subject/yyyy-mm-dd/NNN + details = session_path_parts(session_path.as_posix(), as_dict=True, assert_valid=True) + rel_path = PurePosixPath(details['subject'], details['date'], details['number']) + + r_ = {'created_by': created_by or self.one.alyx.user, + 'path': rel_path.as_posix(), + 'filenames': [x.as_posix() for x in files] + } + records.append(self.one.alyx.get('/check-protected', data=r_, clobber=True)) + + return records[0] if len(F.keys()) == 1 else records + def register_files(self, file_list, versions=None, default=True, created_by=None, server_only=False, repository=None, exists=True, dry=False, max_md5_size=None, **kwargs): diff --git a/one/tests/test_registration.py b/one/tests/test_registration.py index 70910fc4..d3d04734 100644 --- a/one/tests/test_registration.py +++ b/one/tests/test_registration.py @@ -222,6 +222,29 @@ def test_create_sessions(self): self.assertEqual(ses[0]['number'], int(session_path.parts[-1])) self.assertEqual(session_paths[0], session_path) + def test_check_protected(self): + """Test for RegistrationClient.check_protected_files""" + + session_path, eid = self.client.create_new_session(self.subject) + file_name = session_path.joinpath('wheel.timestamps.npy') + file_name.touch() + + # register a dataset + rec, = self.client.register_files(str(file_name)) + + # Check if it is protected, it shouldn't be, response 200 + protected = self.client.check_protected_files(str(file_name)) + self.assertEqual(protected['status_code'], 200) + + # Add a protected tag to all the datasets + tag = 
self.tag['name'] + self.one.alyx.rest('datasets', 'partial_update', id=rec['id'], data={'tags': [tag]}) + + # check protected + protected = self.client.check_protected_files(str(file_name)) + self.assertEqual(protected['status_code'], 403) + self.assertEqual(protected['error'], 'One or more datasets is protected') + def test_register_files(self): """Test for RegistrationClient.register_files""" # Test a few things not checked in register_session From ccdcbbda20e324d21cbf891c168745171d0b9685 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Fri, 22 Mar 2024 18:09:47 +0200 Subject: [PATCH 15/20] Minor typo --- one/api.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/one/api.py b/one/api.py index 616ca999..690403b2 100644 --- a/one/api.py +++ b/one/api.py @@ -2025,7 +2025,7 @@ def search(self, details=False, query_type=None, **kwargs): one.search_terms(query_type='remote') - For all of the search parameters, a single value or list may be provided. For `dataset`, + For all search parameters, a single value or list may be provided. For `dataset`, the sessions returned will contain all listed datasets. For the other parameters, the session must contain at least one of the entries. @@ -2067,7 +2067,9 @@ def search(self, details=False, query_type=None, **kwargs): dataset_types : str, list One or more of dataset_types. datasets : str, list - One or more (exact) dataset names. Returns insertions containing all of these datasets. + One or more (exact) dataset names. Returns sessions containing all of these datasets. + dataset_qc_lte : int, str, one.alf.spec.QC + The maximum QC value for associated datasets. details : bool If true also returns a dict of dataset details. query_type : str, None From 6fad7718a218dce43cb1937a4382a031baa438a3 Mon Sep 17 00:00:00 2001 From: Miles Wells Date: Mon, 25 Mar 2024 12:59:36 +0200 Subject: [PATCH 16/20] Fix registration test --- one/registration.py | 8 ++++---- one/tests/test_registration.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/one/registration.py b/one/registration.py index 9e504d62..2fd9ea8c 100644 --- a/one/registration.py +++ b/one/registration.py @@ -427,7 +427,8 @@ def register_files(self, file_list, Returns ------- list of dicts, dict - A list of newly created Alyx dataset records or the registration data if dry. + A list of newly created Alyx dataset records or the registration data if dry. If + a single file is passed in, a single dict is returned. 
Notes ----- @@ -448,8 +449,7 @@ def register_files(self, file_list, V = defaultdict(list) # empty map for versions if single_file := isinstance(file_list, (str, pathlib.Path)): file_list = [file_list] - else: - file_list = list(file_list) # Ensure not generator + file_list = list(map(pathlib.Path, file_list)) # Ensure list of path objects if versions is None or isinstance(versions, str): versions = itertools.repeat(versions) @@ -457,7 +457,7 @@ def register_files(self, file_list, versions = itertools.cycle(versions) # Filter valid files and sort by session - for fn, ver in zip(map(pathlib.Path, file_list), versions): + for fn, ver in zip(file_list, versions): session_path = get_session_path(fn) if not session_path: _logger.debug(f'{fn}: Invalid session path') diff --git a/one/tests/test_registration.py b/one/tests/test_registration.py index 36f81f19..e2cd5993 100644 --- a/one/tests/test_registration.py +++ b/one/tests/test_registration.py @@ -257,7 +257,7 @@ def test_register_files(self): self.assertEqual([None] * len(files), rec) # Check the handling of revisions - rec, = self.client.register_files(str(file_name)) + rec = self.client.register_files(str(file_name)) # Add a protected tag to all the datasets tag = self.tag['name'] self.one.alyx.rest('datasets', 'partial_update', id=rec['id'], data={'tags': [tag]}) From 7eb5a997d0c864dfa8ee7e77d69ed1cad264dc3d Mon Sep 17 00:00:00 2001 From: Mayo Faulkner Date: Mon, 25 Mar 2024 11:25:45 +0000 Subject: [PATCH 17/20] improve coverage --- one/registration.py | 75 +++++++++++++++++++--------------- one/tests/test_registration.py | 24 +++++++++++ 2 files changed, 66 insertions(+), 33 deletions(-) diff --git a/one/registration.py b/one/registration.py index ea64bc0c..94003b10 100644 --- a/one/registration.py +++ b/one/registration.py @@ -392,32 +392,37 @@ def register_session(self, ses_path, users=None, file_list=True, **kwargs): session['data_dataset_session_related'] = ensure_list(recs) return session, recs - def check_protected_files(self, file_list, created_by=None): + def prepare_files(self, file_list, versions=None): """ - Check whether a set of files associated to a session are protected + Validates file list for registration and splits files into a list of files per + session path. + Parameters ---------- file_list : list, str, pathlib.Path A filepath (or list thereof) of ALF datasets to register to Alyx. - created_by : str - Name of Alyx user (defaults to whoever is logged in to ONE instance). + versions : str, list of str + Optional version tags. Returns ------- - list of dicts, dict - A status for each session whether any of the files specified are protected - datasets or not.If none of the datasets are protected, a response with status - 200 is returned, if any of the files are protected a response with status - 403 is returned. 
+ list of dicts, list of dicts + A dict containing a list of files for each session + A dict containg a list of versions for each session """ F = defaultdict(list) # empty map whose keys will be session paths - + V = defaultdict(list) # empty map for versions if isinstance(file_list, (str, pathlib.Path)): file_list = [file_list] + if versions is None or isinstance(versions, str): + versions = itertools.repeat(versions) + else: + versions = itertools.cycle(versions) + # Filter valid files and sort by session - for fn in map(pathlib.Path, file_list): + for fn, ver in zip(map(pathlib.Path, file_list), versions): session_path = get_session_path(fn) if fn.suffix not in self.file_extensions: _logger.debug(f'{fn}: No matching extension "{fn.suffix}" in database') @@ -428,6 +433,31 @@ def check_protected_files(self, file_list, created_by=None): _logger.debug('%s', ex.args[0]) continue F[session_path].append(fn.relative_to(session_path)) + V[session_path].append(ver) + + return F, V + + def check_protected_files(self, file_list, created_by=None): + """ + Check whether a set of files associated to a session are protected + Parameters + ---------- + file_list : list, str, pathlib.Path + A filepath (or list thereof) of ALF datasets to register to Alyx. + created_by : str + Name of Alyx user (defaults to whoever is logged in to ONE instance). + + Returns + ------- + list of dicts, dict + A status for each session whether any of the files specified are protected + datasets or not.If none of the datasets are protected, a response with status + 200 is returned, if any of the files are protected a response with status + 403 is returned. + """ + + # Validate files and rearrange into list per session + F, _ = self.prepare_files(file_list) # For each unique session, make a separate POST request records = [] @@ -496,29 +526,8 @@ def register_files(self, file_list, Server side database error (500 status code) Revision protected (403 status code) """ - F = defaultdict(list) # empty map whose keys will be session paths - V = defaultdict(list) # empty map for versions - if isinstance(file_list, (str, pathlib.Path)): - file_list = [file_list] - - if versions is None or isinstance(versions, str): - versions = itertools.repeat(versions) - else: - versions = itertools.cycle(versions) - # Filter valid files and sort by session - for fn, ver in zip(map(pathlib.Path, file_list), versions): - session_path = get_session_path(fn) - if fn.suffix not in self.file_extensions: - _logger.debug(f'{fn}: No matching extension "{fn.suffix}" in database') - continue - try: - get_dataset_type(fn, self.dtypes) - except ValueError as ex: - _logger.debug('%s', ex.args[0]) - continue - F[session_path].append(fn.relative_to(session_path)) - V[session_path].append(ver) + F, V = self.prepare_files(file_list, versions=versions) # For each unique session, make a separate POST request records = [] diff --git a/one/tests/test_registration.py b/one/tests/test_registration.py index d3d04734..d896fd95 100644 --- a/one/tests/test_registration.py +++ b/one/tests/test_registration.py @@ -222,6 +222,30 @@ def test_create_sessions(self): self.assertEqual(ses[0]['number'], int(session_path.parts[-1])) self.assertEqual(session_paths[0], session_path) + def test_prepare_files(self): + """Test for RegistrationClient.prepare_files""" + + session_path = self.session_path.parent / next_num_folder(self.session_path.parent) + session_path_2 = session_path.parent / next_num_folder(session_path) + file_list = [session_path.joinpath('wheel.position.npy'), + 
session_path.joinpath('wheel.timestamps.npy'), + session_path_2.joinpath('wheel.position.npy')] + + # Test with file list and version is None + F, V = self.client.prepare_files(file_list) + self.assertTrue(len(F), 2) + self.assertListEqual(sorted(list(F.keys())), sorted([session_path, session_path_2])) + for sess, n in zip([session_path, session_path_2], [2, 1]): + self.assertTrue(len(F[sess]), n) + self.assertTrue(len(V[sess]), n) + self.assertIsNone(V[session_path][0]) + + # Test with specifying version + versions = ['1.2.2', 'v1.2', '1.3.4'] + _, V = self.client.prepare_files(file_list, versions=versions) + self.assertListEqual(V[session_path], versions[:-1]) + self.assertListEqual(V[session_path_2], [versions[-1]]) + def test_check_protected(self): """Test for RegistrationClient.check_protected_files""" From d1b113dc5e5799efa0892d4eab903088bfb813dd Mon Sep 17 00:00:00 2001 From: Mayo Faulkner Date: Mon, 25 Mar 2024 11:40:15 +0000 Subject: [PATCH 18/20] incorporate v2.7 changes --- one/registration.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/one/registration.py b/one/registration.py index 94003b10..6b8641da 100644 --- a/one/registration.py +++ b/one/registration.py @@ -413,8 +413,9 @@ def prepare_files(self, file_list, versions=None): F = defaultdict(list) # empty map whose keys will be session paths V = defaultdict(list) # empty map for versions - if isinstance(file_list, (str, pathlib.Path)): + if single_file := isinstance(file_list, (str, pathlib.Path)): file_list = [file_list] + file_list = list(map(pathlib.Path, file_list)) # Ensure list of path objects if versions is None or isinstance(versions, str): versions = itertools.repeat(versions) @@ -422,8 +423,11 @@ def prepare_files(self, file_list, versions=None): versions = itertools.cycle(versions) # Filter valid files and sort by session - for fn, ver in zip(map(pathlib.Path, file_list), versions): + for fn, ver in zip(file_list, versions): session_path = get_session_path(fn) + if not session_path: + _logger.debug(f'{fn}: Invalid session path') + continue if fn.suffix not in self.file_extensions: _logger.debug(f'{fn}: No matching extension "{fn.suffix}" in database') continue @@ -509,7 +513,8 @@ def register_files(self, file_list, Returns ------- list of dicts, dict - A list of newly created Alyx dataset records or the registration data if dry. + A list of newly created Alyx dataset records or the registration data if dry. If + a single file is passed in, a single dict is returned. 
Notes ----- From 316135cf26e9530bf5ef231f59466f09b088211e Mon Sep 17 00:00:00 2001 From: Mayo Faulkner Date: Mon, 25 Mar 2024 11:49:26 +0000 Subject: [PATCH 19/20] single file argument --- one/registration.py | 8 ++++---- one/tests/test_registration.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/one/registration.py b/one/registration.py index d235802e..9beef08c 100644 --- a/one/registration.py +++ b/one/registration.py @@ -439,7 +439,7 @@ def prepare_files(self, file_list, versions=None): F[session_path].append(fn.relative_to(session_path)) V[session_path].append(ver) - return F, V + return F, V, single_file def check_protected_files(self, file_list, created_by=None): """ @@ -461,7 +461,7 @@ def check_protected_files(self, file_list, created_by=None): """ # Validate files and rearrange into list per session - F, _ = self.prepare_files(file_list) + F, _, single_file = self.prepare_files(file_list) # For each unique session, make a separate POST request records = [] @@ -476,7 +476,7 @@ def check_protected_files(self, file_list, created_by=None): } records.append(self.one.alyx.get('/check-protected', data=r_, clobber=True)) - return records[0] if len(F.keys()) == 1 else records + return records[0] if single_file else records def register_files(self, file_list, versions=None, default=True, created_by=None, server_only=False, @@ -532,7 +532,7 @@ def register_files(self, file_list, Revision protected (403 status code) """ - F, V = self.prepare_files(file_list, versions=versions) + F, V, single_file = self.prepare_files(file_list, versions=versions) # For each unique session, make a separate POST request records = [None] * (len(F) if dry else len(file_list)) # If dry return data per session diff --git a/one/tests/test_registration.py b/one/tests/test_registration.py index 0d587df9..96287b25 100644 --- a/one/tests/test_registration.py +++ b/one/tests/test_registration.py @@ -232,7 +232,7 @@ def test_prepare_files(self): session_path_2.joinpath('wheel.position.npy')] # Test with file list and version is None - F, V = self.client.prepare_files(file_list) + F, V, _ = self.client.prepare_files(file_list) self.assertTrue(len(F), 2) self.assertListEqual(sorted(list(F.keys())), sorted([session_path, session_path_2])) for sess, n in zip([session_path, session_path_2], [2, 1]): @@ -242,7 +242,7 @@ def test_prepare_files(self): # Test with specifying version versions = ['1.2.2', 'v1.2', '1.3.4'] - _, V = self.client.prepare_files(file_list, versions=versions) + _, V, _ = self.client.prepare_files(file_list, versions=versions) self.assertListEqual(V[session_path], versions[:-1]) self.assertListEqual(V[session_path_2], [versions[-1]]) From 92dcdde1cb33d7b4b87903f8859f17499485f46a Mon Sep 17 00:00:00 2001 From: Mayo Faulkner Date: Mon, 25 Mar 2024 12:32:05 +0000 Subject: [PATCH 20/20] return altered file list --- one/registration.py | 14 ++++++++++---- one/tests/test_registration.py | 6 +++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/one/registration.py b/one/registration.py index 9beef08c..05f0d374 100644 --- a/one/registration.py +++ b/one/registration.py @@ -406,13 +406,19 @@ def prepare_files(self, file_list, versions=None): Returns ------- - list of dicts, list of dicts + list of dicts A dict containing a list of files for each session + list of dicts A dict containg a list of versions for each session + list + A list of files converted to paths + bool + A boolean indicating if input was a single file """ F = defaultdict(list) # empty map whose keys will be 
session paths V = defaultdict(list) # empty map for versions + if single_file := isinstance(file_list, (str, pathlib.Path)): file_list = [file_list] file_list = list(map(pathlib.Path, file_list)) # Ensure list of path objects @@ -439,7 +445,7 @@ def prepare_files(self, file_list, versions=None): F[session_path].append(fn.relative_to(session_path)) V[session_path].append(ver) - return F, V, single_file + return F, V, file_list, single_file def check_protected_files(self, file_list, created_by=None): """ @@ -461,7 +467,7 @@ def check_protected_files(self, file_list, created_by=None): """ # Validate files and rearrange into list per session - F, _, single_file = self.prepare_files(file_list) + F, _, _, single_file = self.prepare_files(file_list) # For each unique session, make a separate POST request records = [] @@ -532,7 +538,7 @@ def register_files(self, file_list, Revision protected (403 status code) """ - F, V, single_file = self.prepare_files(file_list, versions=versions) + F, V, file_list, single_file = self.prepare_files(file_list, versions=versions) # For each unique session, make a separate POST request records = [None] * (len(F) if dry else len(file_list)) # If dry return data per session diff --git a/one/tests/test_registration.py b/one/tests/test_registration.py index 96287b25..9fd37cde 100644 --- a/one/tests/test_registration.py +++ b/one/tests/test_registration.py @@ -232,7 +232,7 @@ def test_prepare_files(self): session_path_2.joinpath('wheel.position.npy')] # Test with file list and version is None - F, V, _ = self.client.prepare_files(file_list) + F, V, _, _ = self.client.prepare_files(file_list) self.assertTrue(len(F), 2) self.assertListEqual(sorted(list(F.keys())), sorted([session_path, session_path_2])) for sess, n in zip([session_path, session_path_2], [2, 1]): @@ -242,7 +242,7 @@ def test_prepare_files(self): # Test with specifying version versions = ['1.2.2', 'v1.2', '1.3.4'] - _, V, _ = self.client.prepare_files(file_list, versions=versions) + _, V, _, _ = self.client.prepare_files(file_list, versions=versions) self.assertListEqual(V[session_path], versions[:-1]) self.assertListEqual(V[session_path_2], [versions[-1]]) @@ -254,7 +254,7 @@ def test_check_protected(self): file_name.touch() # register a dataset - rec, = self.client.register_files(str(file_name)) + rec = self.client.register_files(str(file_name)) # Check if it is protected, it shouldn't be, response 200 protected = self.client.check_protected_files(str(file_name))