Merge pull request #37 from 4dn-dcic/get_es_meta_source

Add sources parameter to get_es_metadata
4dn-dcic · Apr 8, 2019 · 2d73683 · 2d73683
2 parents 719390b + f28e058
commit 2d73683
Show file tree

Hide file tree

Showing 3 changed files with 110 additions and 27 deletions.
diff --git a/dcicutils/_version.py b/dcicutils/_version.py
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.6.2"
+__version__ = "0.6.3"
diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
@@ -451,39 +451,73 @@ def get_es_search_generator(es_client, index, body, page_size=200):
         yield es_hits
 
 
-def get_es_metadata(uuids, es_client=None, filters={}, chunk_size=200,
+def get_es_metadata(uuids, es_client=None, filters={}, sources=[], chunk_size=200,
                     is_generator=False, key=None, ff_env=None):
     """
     Given a list of string item uuids, will return a
     dictionary response of the full ES record for those items (or an empty
     dictionary if the items don't exist/ are not indexed)
-    You can pass in an Elasticsearch client (initialized by create_es_client)
-    through the es_client param to save init time.
-    Advanced users can optionally pass a dict of filters that will be added
-    to the Elasticsearch query.
-        For example: filters={'status': 'released'}
-        You can also specify NOT fields:
-            example: filters={'status': '!released'}
-        You can also specifiy lists of values for fields:
-            example: filters={'status': ['released', archived']}
-    NOTES:
-        - different filter field are combined using AND queries (must all match)
-            example: filters={'status': ['released'], 'public_release': ['2018-01-01']}
-        - values for the same field and combined with OR (such as multiple statuses)
-    Integer chunk_size may be used to control the number of uuids that are
-    passed to Elasticsearch in each query; setting this too high may cause
-    ES reads to timeout.
-    Boolean is_generator will return a generator for individual results if True;
-    if False (default), returns a list of results.
-    Same auth mechanism as the other metadata functions
-    """
-    meta = _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env)
+    Returns
+        A dictionary with following keys
+            -keys with metadata
+                properties (raw frame without uuid), embedded, object
+            -keys summarizing interactions
+                linked_uuids_object, linked_uuids_embedded, links, rev_linked_to_me
+            -others
+                paths, aggregated_items, rev_link_names, item_type, principals_allowed,
+                unique_keys, sid, audit, uuid, propsheets
+    Args
+        uuids:
+            list of uuids to fetch from ES
+        es_client:
+            You can pass in an Elasticsearch client
+            (initialized by create_es_client)
+            through the es_client param to save init time.
+        filters:
+            Advanced users can optionally pass a dict of filters that will be added
+            to the Elasticsearch query.
+                For example: filters={'status': 'released'}
+                You can also specify NOT fields:
+                    example: filters={'status': '!released'}
+                You can also specify lists of values for fields:
+                    example: filters={'status': ['released', 'archived']}
+            NOTES:
+                - different filter fields are combined using AND queries (must all match)
+                    example: filters={'status': ['released'], 'public_release': ['2018-01-01']}
+                - values for the same field are combined with OR (such as multiple statuses)
+        sources:
+            You may also specify which fields are returned from ES by specifying a
+            list of source fields with the sources argument.
+            This field MUST include the full path of the field, such as 'embedded.uuid'
+            (for the embedded frame) or 'object.uuid' for the object frame. You may
+            also use the wildcard, such as 'embedded.*' for all fields in the embedded
+            frame.
+            You need to follow the dictionary structure of the get_es_metadata result
+            i.e. for getting uuids on the linked field 'files'
+                sources = ['properties.files']
+                or
+                sources = ['embedded.files.uuid']
+            i.e. getting all fields for lab in embedded frame
+                sources = ['embedded.lab.*']
+            i.e. for getting only the object frame
+                sources = ['object.*']
+        chunk_size:
+            Integer chunk_size may be used to control the number of uuids that are
+            passed to Elasticsearch in each query; setting this too high may cause
+            ES reads to timeout.
+        is_generator:
+            Boolean is_generator will return a generator for individual results if True;
+            if False (default), returns a list of results.
+        key: authentication key
+        ff_env: authentication by env (needs system variables)
+    """
+    meta = _get_es_metadata(uuids, es_client, filters, sources, chunk_size, key, ff_env)
     if is_generator:
         return meta
     return list(meta)
 
 
-def _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env):
+def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, key, ff_env):
     """
     Internal function needed because there are multiple levels of iteration
     used to create the generator.
@@ -510,7 +544,7 @@ def _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env):
         }
         if filters:
             if not isinstance(filters, dict):
-                print('Invalid filter for get_es_metadata: %s' % filters)
+                raise Exception('Invalid filters for get_es_metadata: %s' % filters)
             else:
                 for k, v in filters.items():
                     key_terms = []
@@ -529,6 +563,11 @@ def _get_es_metadata(uuids, es_client, filters, chunk_size, key, ff_env):
                         es_query['query']['bool']['must_not'].append(
                             {'terms': {'embedded.' + k + '.raw': key_not_terms}}
                         )
+        if sources:
+            if not isinstance(sources, list):
+                raise Exception('Invalid sources for get_es_metadata: %s' % sources)
+            else:
+                es_query['_source'] = sources
         # use chunk_limit as page size for performance reasons
         for es_page in get_es_search_generator(es_client, '_all', es_query,
                                                page_size=chunk_size):

diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py
@@ -343,6 +343,7 @@ def test_upsert_metadata(integrated_ff):
 
 @pytest.mark.integrated
 def test_search_metadata(integrated_ff):
+    from types import GeneratorType
     search_res = ff_utils.search_metadata('search/?limit=all&type=File', key=integrated_ff['ff_key'])
     assert isinstance(search_res, list)
     # this will fail if items have not yet been indexed
@@ -360,7 +361,12 @@ def test_search_metadata(integrated_ff):
     search_res_filt = ff_utils.search_metadata('/search/?limit=3&type=File&file_type=reads',
                                                key=integrated_ff['ff_key'])
     assert len(search_res_filt) > 0
-    # TODO add test for is_generator=True
+    # test is_generator=True
+    search_res_gen = ff_utils.search_metadata('/search/?limit=3&type=File&file_type=reads',
+                                              key=integrated_ff['ff_key'], is_generator=True)
+    assert isinstance(search_res_gen, GeneratorType)
+    gen_res = [v for v in search_res_gen]  # run the gen
+    assert len(gen_res) == 3
 
 
 @pytest.mark.integrated
@@ -395,6 +401,7 @@ def test_get_search_generator(integrated_ff):
 @pytest.mark.integrated
 def test_get_es_metadata(integrated_ff):
     from dcicutils import es_utils
+    from types import GeneratorType
     # use this test biosource and biosample
     test_biosource = '331111bc-8535-4448-903e-854af460b254'
     test_biosample = '111112bc-1111-4448-903e-854af460b123'
@@ -460,7 +467,44 @@ def test_get_es_metadata(integrated_ff):
     filters2 = {'status': ['in review by lab'], 'modifications.modification_type': ['!Other'], '@type': ['Biosample']}
     bios_neg_es = ff_utils.get_es_metadata(all_uuids, filters=filters2, key=integrated_ff['ff_key'])
     assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in bios_neg_res)
-    # TODO add test for is_generator=True
+    # raise error if filters is not dict
+    with pytest.raises(Exception) as exec_info:
+        ff_utils.get_es_metadata(all_uuids, filters=['not', 'a', 'dict'],
+                                 key=integrated_ff['ff_key'])
+    assert 'Invalid filters for get_es_metadata' in str(exec_info.value)
+
+    # test is_generator=True, compare to bios_neg_res
+    bios_neg_gen = ff_utils.get_es_metadata(all_uuids, filters=filters2,
+                                            is_generator=True,
+                                            key=integrated_ff['ff_key'])
+    assert isinstance(bios_neg_gen, GeneratorType)
+    # run the gen
+    gen_res = [v for v in bios_neg_gen]
+    assert set([item['uuid'] for item in bios_neg_es]) == set(item['uuid'] for item in gen_res)
+
+    # test sources
+    bios_neg_sources = ff_utils.get_es_metadata(all_uuids, filters=filters2,
+                                                sources=['object.*', 'embedded.biosource.uuid'],
+                                                key=integrated_ff['ff_key'])
+    for item in bios_neg_sources:
+        # get expected frame=object keys from matching biosample from search res
+        matching_bios = [bio for bio in bios_neg_res if bio['uuid'] == item['object']['uuid']]
+        expected_obj_keys = set(matching_bios[0].keys())
+        assert set(item.keys()) == {'object', 'embedded'}
+        # expect all keys in object frame, since we used object.*
+        assert set(item['object'].keys()) == expected_obj_keys
+        assert set(item['embedded'].keys()) == {'biosource'}
+        # expected only uuid in embedded.biosource
+        for biosource in item['embedded']['biosource']:
+            assert set(biosource.keys()) == {'uuid'}
+    # confirm that all items were found
+    assert set([item['uuid'] for item in bios_neg_es]) == set(item['object']['uuid'] for item in bios_neg_sources)
+    # raise error if sources is not list
+    with pytest.raises(Exception) as exec_info2:
+        ff_utils.get_es_metadata(all_uuids, filters=filters2,
+                                 sources='not a list',
+                                 key=integrated_ff['ff_key'])
+    assert 'Invalid sources for get_es_metadata' in str(exec_info2.value)
 
 
 @pytest.mark.integrated