Skip to content

Commit

Permalink
Merge pull request #37 from 4dn-dcic/get_es_meta_source
Browse files Browse the repository at this point in the history
Add sources parameter to get_es_metadata
  • Loading branch information
Carl Vitzthum authored Apr 8, 2019
2 parents 719390b + f28e058 commit 2d73683
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 27 deletions.
2 changes: 1 addition & 1 deletion dcicutils/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Version information."""

# The following line *must* be the last in the module, exactly as formatted:
__version__ = "0.6.2"
__version__ = "0.6.3"
87 changes: 63 additions & 24 deletions dcicutils/ff_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,39 +451,73 @@ def get_es_search_generator(es_client, index, body, page_size=200):
yield es_hits


def get_es_metadata(uuids, es_client=None, filters={}, chunk_size=200,
def get_es_metadata(uuids, es_client=None, filters={}, sources=[], chunk_size=200,
is_generator=False, key=None, ff_env=None):
"""
Given a list of string item uuids, will return a
dictionary response of the full ES record for those items (or an empty
dictionary if the items don't exist/ are not indexed)
You can pass in an Elasticsearch client (initialized by create_es_client)
through the es_client param to save init time.
Advanced users can optionally pass a dict of filters that will be added
to the Elasticsearch query.
For example: filters={'status': 'released'}
You can also specify NOT fields:
example: filters={'status': '!released'}
You can also specifiy lists of values for fields:
example: filters={'status': ['released', archived']}
NOTES:
- different filter field are combined using AND queries (must all match)
example: filters={'status': ['released'], 'public_release': ['2018-01-01']}
- values for the same field and combined with OR (such as multiple statuses)
Integer chunk_size may be used to control the number of uuids that are
passed to Elasticsearch in each query; setting this too high may cause
ES reads to timeout.
Boolean is_generator will return a generator for individual results if True;
if False (default), returns a list of results.
Same auth mechanism as the other metadata functions
"""
meta = _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env)
Returns
Each returned result is a dictionary with the following keys
-keys with metadata
properties (raw frame without uuid), embedded, object
-keys summarizing interactions
linked_uuids_object, linked_uuids_embedded, links, rev_linked_to_me
-others
paths, aggregated_items, rev_link_names, item_type, principals_allowed,
unique_keys, sid, audit, uuid, propsheets
Args
uuids:
list of uuids to fetch from ES
es_client:
You can pass in an Elasticsearch client
(initialized by create_es_client)
through the es_client param to save init time.
filters:
Advanced users can optionally pass a dict of filters that will be added
to the Elasticsearch query.
For example: filters={'status': 'released'}
You can also specify NOT fields:
example: filters={'status': '!released'}
You can also specify lists of values for fields:
example: filters={'status': ['released', 'archived']}
NOTES:
- different filter fields are combined using AND queries (must all match)
example: filters={'status': ['released'], 'public_release': ['2018-01-01']}
- values for the same field are combined with OR (such as multiple statuses)
sources:
You may also specify which fields are returned from ES by specifying a
list of source fields with the sources argument.
This field MUST include the full path of the field, such as 'embedded.uuid'
(for the embedded frame) or 'object.uuid' for the object frame. You may
also use the wildcard, such as 'embedded.*' for all fields in the embedded
frame.
You need to follow the dictionary structure of the get_es_metadata result
e.g. for getting uuids on the linked field 'files'
sources = ['properties.files']
or
sources = ['embedded.files.uuid']
e.g. for getting all fields for lab in the embedded frame
sources = ['embedded.lab.*']
e.g. for getting only the object frame
sources = ['object.*']
chunk_size:
Integer chunk_size may be used to control the number of uuids that are
passed to Elasticsearch in each query; setting this too high may cause
ES reads to timeout.
is_generator:
Boolean is_generator will return a generator for individual results if True;
if False (default), returns a list of results.
key: authentication key
ff_env: authentication by env (needs system variables)
"""
meta = _get_es_metadata(uuids, es_client, filters, sources, chunk_size, key, ff_env)
if is_generator:
return meta
return list(meta)


def _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env):
def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, key, ff_env):
"""
Internal function needed because there are multiple levels of iteration
used to create the generator.
Expand All @@ -510,7 +544,7 @@ def _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env):
}
if filters:
if not isinstance(filters, dict):
print('Invalid filter for get_es_metadata: %s' % filters)
raise Exception('Invalid filters for get_es_metadata: %s' % filters)
else:
for k, v in filters.items():
key_terms = []
Expand All @@ -529,6 +563,11 @@ def _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env):
es_query['query']['bool']['must_not'].append(
{'terms': {'embedded.' + k + '.raw': key_not_terms}}
)
if sources:
if not isinstance(sources, list):
raise Exception('Invalid sources for get_es_metadata: %s' % sources)
else:
es_query['_source'] = sources
# use chunk_size as page size for performance reasons
for es_page in get_es_search_generator(es_client, '_all', es_query,
page_size=chunk_size):
Expand Down
48 changes: 46 additions & 2 deletions test/test_ff_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ def test_upsert_metadata(integrated_ff):

@pytest.mark.integrated
def test_search_metadata(integrated_ff):
from types import GeneratorType
search_res = ff_utils.search_metadata('search/?limit=all&type=File', key=integrated_ff['ff_key'])
assert isinstance(search_res, list)
# this will fail if items have not yet been indexed
Expand All @@ -360,7 +361,12 @@ def test_search_metadata(integrated_ff):
search_res_filt = ff_utils.search_metadata('/search/?limit=3&type=File&file_type=reads',
key=integrated_ff['ff_key'])
assert len(search_res_filt) > 0
# TODO add test for is_generator=True
# test is_generator=True
search_res_gen = ff_utils.search_metadata('/search/?limit=3&type=File&file_type=reads',
key=integrated_ff['ff_key'], is_generator=True)
assert isinstance(search_res_gen, GeneratorType)
gen_res = [v for v in search_res_gen] # run the gen
assert len(gen_res) == 3


@pytest.mark.integrated
Expand Down Expand Up @@ -395,6 +401,7 @@ def test_get_search_generator(integrated_ff):
@pytest.mark.integrated
def test_get_es_metadata(integrated_ff):
from dcicutils import es_utils
from types import GeneratorType
# use this test biosource and biosample
test_biosource = '331111bc-8535-4448-903e-854af460b254'
test_biosample = '111112bc-1111-4448-903e-854af460b123'
Expand Down Expand Up @@ -460,7 +467,44 @@ def test_get_es_metadata(integrated_ff):
filters2 = {'status': ['in review by lab'], 'modifications.modification_type': ['!Other'], '@type': ['Biosample']}
bios_neg_es = ff_utils.get_es_metadata(all_uuids, filters=filters2, key=integrated_ff['ff_key'])
assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in bios_neg_res)
# TODO add test for is_generator=True
# raise error if filters is not dict
with pytest.raises(Exception) as exec_info:
ff_utils.get_es_metadata(all_uuids, filters=['not', 'a', 'dict'],
key=integrated_ff['ff_key'])
assert 'Invalid filters for get_es_metadata' in str(exec_info.value)

# test is_generator=True, compare to bios_neg_res
bios_neg_gen = ff_utils.get_es_metadata(all_uuids, filters=filters2,
is_generator=True,
key=integrated_ff['ff_key'])
assert isinstance(bios_neg_gen, GeneratorType)
# run the gen
gen_res = [v for v in bios_neg_gen]
assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in gen_res)

# test sources
bios_neg_sources = ff_utils.get_es_metadata(all_uuids, filters=filters2,
sources=['object.*', 'embedded.biosource.uuid'],
key=integrated_ff['ff_key'])
for item in bios_neg_sources:
# get expected frame=object keys from matching biosample from search res
matching_bios = [bio for bio in bios_neg_res if bio['uuid'] == item['object']['uuid']]
expected_obj_keys = set(matching_bios[0].keys())
assert set(item.keys()) == {'object', 'embedded'}
# expect all keys in object frame, since we used object.*
assert set(item['object'].keys()) == expected_obj_keys
assert set(item['embedded'].keys()) == {'biosource'}
# expected only uuid in embedded.biosource
for biosource in item['embedded']['biosource']:
assert set(biosource.keys()) == {'uuid'}
# confirm that all items were found
assert set([item['uuid'] for item in bios_neg_es]) == set(item['object']['uuid'] for item in bios_neg_sources)
# raise error if sources is not list
with pytest.raises(Exception) as exec_info2:
ff_utils.get_es_metadata(all_uuids, filters=filters2,
sources='not a list',
key=integrated_ff['ff_key'])
assert 'Invalid sources for get_es_metadata' in str(exec_info2.value)


@pytest.mark.integrated
Expand Down

0 comments on commit 2d73683

Please sign in to comment.