Skip to content

Commit

Permalink
Merge pull request #563 from hubmapconsortium/Derek-Furst/get-multi-r…
Browse files Browse the repository at this point in the history
…evision

Derek furst/get multi revision
  • Loading branch information
yuanzhou authored Nov 13, 2023
2 parents cd3b08d + 126b8a6 commit 80134b3
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 1 deletion.
45 changes: 44 additions & 1 deletion entity-api-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1874,6 +1874,49 @@ paths:
description: The target dataset could not be found
'500':
description: Internal error
'/datasets/{id}/multi-revisions':
get:
summary: 'Retrieve a list of all multi revisions of a dataset from the id of any dataset in the chain. E.g.: If there are 5 revisions, and the id for revision 4 is given, a list of revisions 1-5 will be returned in reverse order (newest first). Non-public access is only required to retrieve information on non-published datasets. Output will be a list of dictionaries. Each dictionary contains the dataset revision number and its list of uuids. Optionally, the full dataset can be included for each.'
parameters:
- name: id
in: path
description: The unique identifier of entity. This identifier can be either an HuBMAP ID (e.g. HBM123.ABCD.456) or UUID
required: true
schema:
type: string
- name: include_dataset
in: query
description: A case insensitive string. Any value besides true will have no effect. If the string is 'true', the full dataset for each revision will be included in the response
required: false
schema:
type: string
enum: ['true', 'false']
responses:
'200':
description: The list of revised datasets that the referenced dataset is a member of including the index number of the revision, where 1 is the oldest version of any revision chain
content:
application/json:
schema:
type: object
properties:
uuid:
type: string
description: The uuid of a dataset
revision_number:
type: integer
description: The number in the revision chain of this dataset where 1 is the oldest revision
dataset:
$ref: '#/components/schemas/Dataset'
'400':
description: Invalid or misformatted entity identifier, or the given entity is not a Dataset
'401':
description: The user's token has expired or the user did not supply a valid token
'403':
description: The user is not authorized to query the revision number of the given dataset.
'404':
description: The target dataset could not be found
'500':
description: Internal error
'/datasets/{id}/revisions':
get:
summary: 'From a given ID of a versioned dataset, retrieve a list of every dataset in the chain ordered from most recent to oldest. The revision number, as well as the dataset uuid will be included. An optional parameter ?include_dataset=true will include the full dataset for each revision as well. Public/Consortium access rules apply, if is for a non-public dataset and no token or a token without membership in HuBMAP-Read group is sent with the request then a 403 response should be returned. If the given id is published, but later revisions are not and the user is not in HuBMAP-Read group, only published revisions will be returned. The field next_revision_uuid will not be returned if the next revision is unpublished'
Expand All @@ -1890,7 +1933,7 @@ paths:
required: false
schema:
type: string
enum: ['true', 'false']
enum: [ 'true', 'false' ]
responses:
'200':
description: The list of revised datasets that the referenced dataset is a member of including the index number of the revision, where 1 is the oldest version of any revision chain
Expand Down
94 changes: 94 additions & 0 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2279,6 +2279,100 @@ def get_dataset_revision_number(id):
return jsonify(revision_number)


"""
Retrieve a list of all multi revisions of a dataset from the id of any dataset in the chain.
E.g: If there are 5 revisions, and the id for revision 4 is given, a list of revisions
1-5 will be returned in reverse order (newest first). Non-public access is only required to
retrieve information on non-published datasets. Output will be a list of dictionaries. Each dictionary
contains the dataset revision number and its list of uuids. Optionally, the full dataset can be included for each.
By default, only the revision number and uuids are included. To include the full dataset, the query
parameter "include_dataset" can be given with the value of "true". If this parameter is not included or
is set to false, the dataset will not be included. For example, to include the full datasets for each revision,
use '/datasets/<id>/multi-revisions?include_dataset=true'. To omit the datasets, either set include_dataset=false, or
simply do not include this parameter.
Parameters
----------
id : str
The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target dataset
Returns
-------
list
The list of revision datasets
"""
@app.route('/entities/<id>/multi-revisions', methods=['GET'])
@app.route('/datasets/<id>/multi-revisions', methods=['GET'])
def get_multi_revisions_list(id):
    # By default, only return each revision's uuid (property_key = 'uuid').
    # A case-insensitive query parameter include_dataset=true switches to
    # returning the full dataset for each revision (property_key = None).
    # request.args.get() already yields None/default when the parameter or
    # the whole query string is absent, so no separate request.args guard
    # is needed.
    include_dataset = request.args.get('include_dataset', '')
    property_key = None if include_dataset.lower() == 'true' else 'uuid'

    # Token is not required, but if an invalid token provided,
    # we need to tell the client with a 401 error
    validate_token_if_auth_header_exists(request)

    # Use the internal token to query the target entity
    # since public entities don't require user token
    token = get_internal_token()

    # Query target entity against uuid-api and neo4j and return as a dict if exists
    entity_dict = query_target_entity(id, token)
    normalized_entity_type = entity_dict['entity_type']

    # Revisions only exist for Datasets (or subtypes of Dataset)
    if not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
        abort_bad_req("The entity is not a Dataset. Found entity type:" + normalized_entity_type)

    # Only published/public datasets don't require token
    if entity_dict['status'].lower() != DATASET_STATUS_PUBLISHED:
        # Token is required and the user must belong to HuBMAP-READ group
        token = get_user_token(request, non_public_access_required=True)

    # By now, either the entity is public accessible or
    # the user token has the correct access level
    # Get the all the sorted (DESC based on creation timestamp) revisions.
    # Result shape is [next_revisions, prev_revisions]; see
    # app_neo4j_queries.get_sorted_multi_revisions.
    sorted_revisions_list = app_neo4j_queries.get_sorted_multi_revisions(neo4j_driver_instance, entity_dict['uuid'],
                                                                         fetch_all=user_in_hubmap_read_group(request),
                                                                         property_key=property_key)

    # Skip some of the properties that are time-consuming to generate via triggers
    properties_to_skip = [
        'direct_ancestors',
        'collections',
        'upload',
        'title'
    ]

    normalized_revisions_list = []
    # Merge into a single newest-first sequence: the "next" (newer) revisions
    # already come newest-first; the "prev" (older) revisions are reversed so
    # the oldest ends up last.
    sorted_revisions_list_merged = sorted_revisions_list[0] + sorted_revisions_list[1][::-1]

    if property_key is None:
        # Full datasets requested: run triggers and normalize each revision
        # group for the response
        for revision in sorted_revisions_list_merged:
            complete_revision_list = schema_manager.get_complete_entities_list(token, revision, properties_to_skip)
            normal = schema_manager.normalize_entities_list_for_response(complete_revision_list)
            normalized_revisions_list.append(normal)
    else:
        # uuid-only mode: the query already returned just the uuids
        normalized_revisions_list = sorted_revisions_list_merged

    # Now all we need to do is to compose the result list.
    # revision_number counts down so that 1 is the oldest revision.
    results = []
    revision_number = len(normalized_revisions_list)
    for revision in normalized_revisions_list:
        result = {
            'revision_number': revision_number,
            'uuids': revision
        }
        results.append(result)
        revision_number -= 1

    return jsonify(results)



"""
Retract a published dataset with a retraction reason and sub status
Expand Down
60 changes: 60 additions & 0 deletions src/app_neo4j_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,66 @@ def get_sorted_revisions(neo4j_driver, uuid):
return results


"""
Get all revisions for a given dataset uuid and sort them in descending order based on their creation time
Parameters
----------
neo4j_driver : neo4j.Driver object
The neo4j database connection pool
uuid : str
The uuid of target entity
fetch_all : bool
Whether to fetch all Datasets or only include Published
property_key : str
Return only a particular property from the cypher query, None for return all
Returns
-------
list
    A two-dimensional list [next_revisions<list<list>>, prev_revisions<list<list>>] (newer revisions first, matching the order of the RETURN clause)
"""


def get_sorted_multi_revisions(neo4j_driver, uuid, fetch_all=True, property_key=False):
    results = []
    # Unless the caller may see everything, restrict both directions of the
    # revision chain to Published datasets
    match_case = '' if fetch_all is True else 'AND prev.status = "Published" AND next.status = "Published" '
    # When a property_key is given (falsy default returns whole nodes),
    # collect only that single property from each node
    collect_prop = f".{property_key}" if property_key else ''

    # NOTE(review): uuid is interpolated directly into the Cypher string.
    # It comes from an upstream uuid-api/neo4j lookup so it is expected to be
    # a validated uuid, but a parameterized query ($uuid) would be safer.
    query = (
        "MATCH (e:Dataset), (next:Dataset), (prev:Dataset),"
        "p = (e)-[:REVISION_OF *0..]->(prev),"
        "n = (e)<-[:REVISION_OF *0..]-(next) "
        f"WHERE e.uuid='{uuid}' {match_case}"
        "WITH length(p) AS p_len, prev, length(n) AS n_len, next "
        "ORDER BY prev.created_timestamp, next.created_timestamp DESC "
        f"WITH p_len, collect(distinct prev{collect_prop}) AS prev_revisions, n_len, collect(distinct next{collect_prop}) AS next_revisions "
        f"RETURN [collect(distinct next_revisions), collect(distinct prev_revisions)] AS {record_field_name}"
    )

    # Fixed copy-paste: previously logged as get_sorted_revisions()
    logger.info("======get_sorted_multi_revisions() query======")
    logger.info(query)

    with neo4j_driver.session() as session:
        record = session.read_transaction(schema_neo4j_queries.execute_readonly_tx, query)

        if record and record[record_field_name] and len(record[record_field_name]) > 0:
            # The target dataset appears in both the "next" and "prev"
            # collections; drop the duplicate from the end of the next list
            record[record_field_name][0].pop()
            if property_key:
                # Single-property mode: nothing to convert, return as-is
                return record[record_field_name]

            for collection in record[record_field_name]:  # two collections: next, prev
                revs = []
                for rev in collection:  # each collection list contains revision lists, so 2 dimensional array
                    # Convert the list of neo4j nodes to a list of dicts
                    nodes_to_dicts = schema_neo4j_queries.nodes_to_dicts(rev)
                    revs.append(nodes_to_dicts)

                results.append(revs)

    return results


"""
Get all previous revisions of the target entity by uuid
Expand Down

0 comments on commit 80134b3

Please sign in to comment.