diff --git a/dcicutils/_version.py b/dcicutils/_version.py index a3317c88d..43850248a 100644 --- a/dcicutils/_version.py +++ b/dcicutils/_version.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.6.2" +__version__ = "0.6.3" diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 62530106f..d41c784bc 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -451,39 +451,73 @@ def get_es_search_generator(es_client, index, body, page_size=200): yield es_hits -def get_es_metadata(uuids, es_client=None, filters={}, chunk_size=200, +def get_es_metadata(uuids, es_client=None, filters={}, sources=[], chunk_size=200, is_generator=False, key=None, ff_env=None): """ Given a list of string item uuids, will return a dictionary response of the full ES record for those items (or an empty dictionary if the items don't exist/ are not indexed) - You can pass in an Elasticsearch client (initialized by create_es_client) - through the es_client param to save init time. - Advanced users can optionally pass a dict of filters that will be added - to the Elasticsearch query. - For example: filters={'status': 'released'} - You can also specify NOT fields: - example: filters={'status': '!released'} - You can also specifiy lists of values for fields: - example: filters={'status': ['released', archived']} - NOTES: - - different filter field are combined using AND queries (must all match) - example: filters={'status': ['released'], 'public_release': ['2018-01-01']} - - values for the same field and combined with OR (such as multiple statuses) - Integer chunk_size may be used to control the number of uuids that are - passed to Elasticsearch in each query; setting this too high may cause - ES reads to timeout. - Boolean is_generator will return a generator for individual results if True; - if False (default), returns a list of results. 
- Same auth mechanism as the other metadata functions - """ - meta = _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env) + Returns + A dictionary with the following keys + -keys with metadata + properties (raw frame without uuid), embedded, object + -keys summarizing interactions + linked_uuids_object, linked_uuids_embedded, links, rev_linked_to_me + -others + paths, aggregated_items, rev_link_names, item_type, principals_allowed, + unique_keys, sid, audit, uuid, propsheets + Args + uuids: + list of uuids to fetch from ES + es_client: + You can pass in an Elasticsearch client + (initialized by create_es_client) + through the es_client param to save init time. + filters: + Advanced users can optionally pass a dict of filters that will be added + to the Elasticsearch query. + For example: filters={'status': 'released'} + You can also specify NOT fields: + example: filters={'status': '!released'} + You can also specify lists of values for fields: + example: filters={'status': ['released', 'archived']} + NOTES: + - different filter fields are combined using AND queries (must all match) + example: filters={'status': ['released'], 'public_release': ['2018-01-01']} + - values for the same field are combined with OR (such as multiple statuses) + sources: + You may also specify which fields are returned from ES by specifying a + list of source fields with the sources argument. + This field MUST include the full path of the field, such as 'embedded.uuid' + (for the embedded frame) or 'object.uuid' for the object frame. You may + also use the wildcard, such as 'embedded.*' for all fields in the embedded + frame. + You need to follow the dictionary structure of the get_es_metadata result + e.g. for getting uuids on the linked field 'files' + sources = ['properties.files'] + or + sources = ['embedded.files.uuid'] + e.g. getting all fields for lab in embedded frame + sources = ['embedded.lab.*'] + e.g. 
for getting only the object frame + sources = ['object.*'] + chunk_size: + Integer chunk_size may be used to control the number of uuids that are + passed to Elasticsearch in each query; setting this too high may cause + ES reads to time out. + is_generator: + Boolean is_generator will return a generator for individual results if True; + if False (default), returns a list of results. + key: authentication key + ff_env: authentication by env (needs system variables) + """ + meta = _get_es_metadata(uuids, es_client, filters, sources, chunk_size, key, ff_env) if is_generator: return meta return list(meta) -def _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env): +def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, key, ff_env): """ Internal function needed because there are multiple levels of iteration used to create the generator. @@ -510,7 +544,7 @@ def _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env): } if filters: if not isinstance(filters, dict): - print('Invalid filter for get_es_metadata: %s' % filters) + raise Exception('Invalid filters for get_es_metadata: %s' % filters) else: for k, v in filters.items(): key_terms = [] @@ -529,6 +563,11 @@ def _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env): es_query['query']['bool']['must_not'].append( {'terms': {'embedded.' 
+ k + '.raw': key_not_terms}} ) + if sources: + if not isinstance(sources, list): + raise Exception('Invalid sources for get_es_metadata: %s' % sources) + else: + es_query['_source'] = sources # use chunk_limit as page size for performance reasons for es_page in get_es_search_generator(es_client, '_all', es_query, page_size=chunk_size): diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index 0b763666d..fdc91a5b7 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -343,6 +343,7 @@ def test_upsert_metadata(integrated_ff): @pytest.mark.integrated def test_search_metadata(integrated_ff): + from types import GeneratorType search_res = ff_utils.search_metadata('search/?limit=all&type=File', key=integrated_ff['ff_key']) assert isinstance(search_res, list) # this will fail if items have not yet been indexed @@ -360,7 +361,12 @@ def test_search_metadata(integrated_ff): search_res_filt = ff_utils.search_metadata('/search/?limit=3&type=File&file_type=reads', key=integrated_ff['ff_key']) assert len(search_res_filt) > 0 - # TODO add test for is_generator=True + # test is_generator=True + search_res_gen = ff_utils.search_metadata('/search/?limit=3&type=File&file_type=reads', + key=integrated_ff['ff_key'], is_generator=True) + assert isinstance(search_res_gen, GeneratorType) + gen_res = [v for v in search_res_gen] # run the gen + assert len(gen_res) == 3 @pytest.mark.integrated @@ -395,6 +401,7 @@ def test_get_search_generator(integrated_ff): @pytest.mark.integrated def test_get_es_metadata(integrated_ff): from dcicutils import es_utils + from types import GeneratorType # use this test biosource and biosample test_biosource = '331111bc-8535-4448-903e-854af460b254' test_biosample = '111112bc-1111-4448-903e-854af460b123' @@ -460,7 +467,44 @@ def test_get_es_metadata(integrated_ff): filters2 = {'status': ['in review by lab'], 'modifications.modification_type': ['!Other'], '@type': ['Biosample']} bios_neg_es = ff_utils.get_es_metadata(all_uuids, 
filters=filters2, key=integrated_ff['ff_key']) assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in bios_neg_res) - # TODO add test for is_generator=True + # raise error if filters is not dict + with pytest.raises(Exception) as exec_info: + ff_utils.get_es_metadata(all_uuids, filters=['not', 'a', 'dict'], + key=integrated_ff['ff_key']) + assert 'Invalid filters for get_es_metadata' in str(exec_info.value) + + # test is_generator=True, compare to bios_neg_res + bios_neg_gen = ff_utils.get_es_metadata(all_uuids, filters=filters2, + is_generator=True, + key=integrated_ff['ff_key']) + assert isinstance(bios_neg_gen, GeneratorType) + # run the gen + gen_res = [v for v in bios_neg_gen] + assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in gen_res) + + # test sources + bios_neg_sources = ff_utils.get_es_metadata(all_uuids, filters=filters2, + sources=['object.*', 'embedded.biosource.uuid'], + key=integrated_ff['ff_key']) + for item in bios_neg_sources: + # get expected frame=object keys from matching biosample from search res + matching_bios = [bio for bio in bios_neg_res if bio['uuid'] == item['object']['uuid']] + expected_obj_keys = set(matching_bios[0].keys()) + assert set(item.keys()) == {'object', 'embedded'} + # expect all keys in object frame, since we used object.* + assert set(item['object'].keys()) == expected_obj_keys + assert set(item['embedded'].keys()) == {'biosource'} + # expected only uuid in embedded.biosource + for biosource in item['embedded']['biosource']: + assert set(biosource.keys()) == {'uuid'} + # confirm that all items were found + assert set([item['uuid'] for item in bios_neg_es]) == set(item['object']['uuid'] for item in bios_neg_sources) + # raise error if sources is not list + with pytest.raises(Exception) as exec_info2: + ff_utils.get_es_metadata(all_uuids, filters=filters2, + sources='not a list', + key=integrated_ff['ff_key']) + assert 'Invalid sources for get_es_metadata' in 
str(exec_info2.value) @pytest.mark.integrated