Skip to content

Commit

Permalink
Merge pull request #557 from hubmapconsortium/karlburke/SearchAPIRein…
Browse files Browse the repository at this point in the history
…dexCollectionSupport

Karlburke/search api reindex collection support
  • Loading branch information
yuanzhou authored Oct 23, 2023
2 parents bd946e8 + 9f57ac2 commit 77bd01a
Show file tree
Hide file tree
Showing 3 changed files with 2 additions and 273 deletions.
27 changes: 0 additions & 27 deletions entity-api-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1707,33 +1707,6 @@ paths:
description: The target entity could not be found
'500':
description: Internal error
'/collections/{id}':
get:
summary: 'Returns the information of the Collection specified by the uuid with all connected datasets. If a valid token is provided with group membership in the HuBMAP-Read group, any collection matching the id will be returned. Otherwise, if no token is provided or the valid token has no HuBMAP-Read group membership, only a public collection will be returned. Public collections are defined as being published via a DOI (collection.registered_doi is not null) and having at least one connected dataset that is public (dataset.status == ''Published''). For public collections only connected datasets that are public are returned with it.'
parameters:
- name: id
in: path
description: The unique identifier of entity. This identifier can be either an HuBMAP ID (e.g. HBM123.ABCD.456) or UUID
required: true
schema:
type: string
responses:
'200':
description: The collection is returned
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/Collection'
'400':
description: Invalid or misformatted entity identifier
'401':
description: The user's token has expired or the user did not supply a valid token
'404':
description: The target entity could not be found
'500':
description: Internal error
'/entities/new/{entity_type}':
post:
summary: Create a new entity of the target type
Expand Down
201 changes: 2 additions & 199 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,90 +524,6 @@ def get_entity_visibility(id):

return jsonify(entity_scope.value)

"""
Retrieve the collection detail by id

The gateway treats this endpoint as public accessible

An optional Globus groups token can be provided in a standard Authentication Bearer header. If a valid token
is provided with group membership in the HuBMAP-Read group any collection matching the id will be returned.
Otherwise, if no token is provided or a valid token with no HuBMAP-Read group membership is provided, then
only a public collection will be returned; a non-public collection is rejected via unauthorized_error().
Public collections are defined as being published via a DOI
(collection.registered_doi and collection.doi_url both present) and at least one of the connected
datasets is public (dataset.status == 'Published'). For public collections only connected datasets
that are public are returned with it.

By default we only return the following Dataset properties:

- collection.dataset.uuid
- collection.dataset.hubmap_id
- collection.dataset.data_types
- collection.dataset.status
- collection.dataset.last_modified_timestamp
- collection.dataset.created_by_user_displayname

Parameters
----------
id : str
    The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target collection

Returns
-------
json
    The collection detail with a list of connected datasets (only public datasets
    if user doesn't have the right access permission)
"""
@app.route('/collections/<id>', methods = ['GET'])
def get_collection(id):
    # Token is not required, but if an invalid token provided,
    # we need to tell the client with a 401 error
    validate_token_if_auth_header_exists(request)

    # Use the internal token to query the target collection
    # since public collections don't require user token
    token = get_internal_token()

    # Get the entity dict from cache if exists
    # Otherwise query against uuid-api and neo4j to get the entity dict if the id exists
    collection_dict = query_target_entity(id, token)

    # Basic validation: this endpoint only serves Collection entities
    if collection_dict['entity_type'] != 'Collection':
        bad_request_error("Target entity of the given id is not a collection")

    # Try to get user token from Authorization header
    # It's highly possible that there's no token provided,
    # in which case user_token is a flask.Response rather than a token string
    user_token = get_user_token(request)

    # Full access requires BOTH a usable token AND HuBMAP-Read group membership.
    # The group lookup is only attempted when a token is actually present.
    has_read_access = (not isinstance(user_token, Response)) and user_in_hubmap_read_group(request)

    if has_read_access:
        # We'll need to return all the properties including those
        # generated by `on_read_trigger` to have a complete result
        complete_dict = schema_manager.get_complete_entity_result(user_token, collection_dict)
    else:
        # Applies to every caller without read access: no token, a token outside
        # the HuBMAP-Read group, or a token without group information.
        # Previously only the no-token case was checked, so a token holder outside
        # the read group could retrieve a collection that is not public.
        if ('registered_doi' not in collection_dict) or ('doi_url' not in collection_dict):
            # Require a token with the right access permission in this case
            unauthorized_error("The requested collection is not public, a Globus token with the right access permission is required.")

        # Only return the public datasets attached to this collection
        # for Collection.datasets property
        complete_dict = get_complete_public_collection_dict(collection_dict)

    # Will also filter the result based on schema
    normalized_complete_dict = schema_manager.normalize_entity_result_for_response(complete_dict)

    # Response with the final result
    return jsonify(normalized_complete_dict)

def _get_entity_visibility(normalized_entity_type, entity_dict):
if normalized_entity_type not in schema_manager.get_all_entity_types():
logger.log( logging.ERROR
Expand Down Expand Up @@ -880,8 +796,7 @@ def get_entity_types():
Parameters
----------
entity_type : str
One of the supported entity types: Dataset, Sample, Donor
Will handle Collection via API endpoint `/collections`
One of the supported entity types: Dataset, Collection, Sample, Donor
Returns
-------
Expand Down Expand Up @@ -952,80 +867,6 @@ def get_entities_by_type(entity_type):
# Response with the final result
return jsonify(final_result)

"""
Retrieve all the public collections

The gateway treats this endpoint as public accessible

Result filtering is supported based on query string
For example: /collections?property=uuid

Only public collections are returned, whether the request carries
- a valid token in HuBMAP-Read group,
- a valid token with no HuBMAP-Read group or
- no token at all

Public collections are defined as being published via a DOI
(collection.registered_doi is not null) and at least one of the connected datasets is published
(dataset.status == 'Published').

NOTE(review): when no query string is given, the `datasets` property is passed as a
property to skip while the trigger data is generated - confirm whether the connected
public datasets are expected in this listing.

Returns
-------
json
    A list of all the public collection dictionaries, or a list of the requested
    property values when `?property=<key>` is used
"""
@app.route('/collections', methods = ['GET'])
def get_collections():
    # Token is not required, but if an invalid token provided,
    # we need to tell the client with a 401 error
    validate_token_if_auth_header_exists(request)

    # No query string at all: return the full detail of every public collection
    if not request.args:
        # Use the internal token since no user token is required to access public collections
        internal_token = get_internal_token()

        # Raw public collection dicts straight from Neo4j
        public_collections = app_neo4j_queries.get_public_collections(neo4j_driver_instance)

        # NOTE(review): the value returned here is discarded, so unless the helper
        # mutates its argument in place this loop does not change `public_collections`.
        # Kept as-is to preserve the existing behavior - confirm the intent.
        for collection_dict in public_collections:
            collection_dict = get_complete_public_collection_dict(collection_dict)

        # Generate trigger data and merge into a big dict
        # and skip some of the properties that are time-consuming to generate via triggers
        skipped_properties = ['datasets']
        completed_collections = schema_manager.get_complete_entities_list(internal_token, public_collections, skipped_properties)

        # Final result after normalization
        return jsonify(schema_manager.normalize_entities_list_for_response(completed_collections))

    # A query string is present: only `?property=<key>` filtering is supported
    final_result = []
    property_key = request.args.get('property')

    if property_key is None:
        bad_request_error("The specified query string is not supported. Use '?property=<key>' to filter the result")
    else:
        result_filtering_accepted_property_keys = ['uuid']

        # Validate the target property
        if property_key not in result_filtering_accepted_property_keys:
            bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(result_filtering_accepted_property_keys)}")

        # Only return a list of the filtered property value of each public collection
        final_result = app_neo4j_queries.get_public_collections(neo4j_driver_instance, property_key)

    # Response with the final result
    return jsonify(final_result)


"""
Create an entity of the target type in neo4j
Expand All @@ -1036,7 +877,7 @@ def get_collections():
Parameters
----------
entity_type : str
One of the target entity types (case-insensitive since will be normalized): Dataset, Donor, Sample, Upload
One of the target entity types (case-insensitive since will be normalized): Dataset, Donor, Sample, Upload, Collection
Returns
-------
Expand Down Expand Up @@ -4149,44 +3990,6 @@ def validate_token_if_auth_header_exists(request):
def get_internal_token():
return auth_helper_instance.getProcessSecret()


"""
Build the complete, public-only view of a collection from its raw dict

Parameters
----------
collection_dict : dict
    The raw collection dict returned by Neo4j

Returns
-------
dict
    The complete collection detail including the generated 'on_read_trigger' data,
    with Collection.datasets reduced to the datasets whose status is published
"""
def get_complete_public_collection_dict(collection_dict):
    # No user token is required to access a public collection,
    # so the internal token is used to query the entity
    internal_token = get_internal_token()

    # Collection.datasets is a transient property generated by the trigger method,
    # so the complete result (including `on_read_trigger` output) is needed first
    complete_dict = schema_manager.get_complete_entity_result(internal_token, collection_dict)

    # Keep only the published/public datasets of this collection
    complete_dict['datasets'] = [
        dataset
        for dataset in complete_dict['datasets']
        if dataset['status'].lower() == DATASET_STATUS_PUBLISHED
    ]

    return complete_dict


"""
Generate 'before_create_trigger' data and create the entity details in Neo4j
Expand Down
47 changes: 0 additions & 47 deletions src/app_neo4j_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,53 +97,6 @@ def get_entities_by_type(neo4j_driver, entity_type, property_key = None):

return results

"""
Get all the public collection nodes, i.e. Collection nodes that have
both `registered_doi` and `doi_url` set

Parameters
----------
neo4j_driver : neo4j.Driver object
    The neo4j database connection pool
property_key : str
    A target property key for result filtering

Returns
-------
list
    A list of public collection dicts, or a list of the `property_key` values
    of those collections when `property_key` is given
"""
def get_public_collections(neo4j_driver, property_key = None):
    # Collect either a single property of each node or the whole node
    collected = f"e.{property_key}" if property_key else "e"

    # COLLECT() returns a list
    # apoc.coll.toSet() returns a set containing unique nodes
    query = ("MATCH (e:Collection) "
             "WHERE e.registered_doi IS NOT NULL AND e.doi_url IS NOT NULL "
             f"RETURN apoc.coll.toSet(COLLECT({collected})) AS {record_field_name}")

    logger.info("======get_public_collections() query======")
    logger.info(query)

    with neo4j_driver.session() as session:
        record = session.read_transaction(schema_neo4j_queries.execute_readonly_tx, query)

        # Nothing matched (or an empty collection came back): empty list
        if not (record and record[record_field_name]):
            return []

        if property_key:
            # Just return the list of property values from each entity node
            return record[record_field_name]

        # Convert the list of nodes to a list of dicts
        return schema_neo4j_queries.nodes_to_dicts(record[record_field_name])

"""
Retrieve the ancestor organ(s) of a given entity
Expand Down

0 comments on commit 77bd01a

Please sign in to comment.