Skip to content

Commit

Permalink
Merge pull request #563 from hubmapconsortium/Derek-Furst/get-multi-r…
Browse files Browse the repository at this point in the history
…evision

Derek furst/get multi revision
  • Loading branch information
yuanzhou authored Nov 13, 2023
2 parents cd3b08d + 126b8a6 commit 80134b3
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 1 deletion.
45 changes: 44 additions & 1 deletion entity-api-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1874,6 +1874,49 @@ paths:
description: The target dataset could not be found
'500':
description: Internal error
'/datasets/{id}/multi-revisions':
get:
summary: 'Retrieve a list of all multi revisions of a dataset from the id of any dataset in the chain. E.g.: If there are 5 revisions, and the id for revision 4 is given, a list of revisions 1-5 will be returned in reverse order (newest first). Non-public access is only required to retrieve information on non-published datasets. Output will be a list of dictionaries. Each dictionary contains the dataset revision number and its list of uuids. Optionally, the full dataset can be included for each.'
parameters:
- name: id
in: path
description: The unique identifier of entity. This identifier can be either an HuBMAP ID (e.g. HBM123.ABCD.456) or UUID
required: true
schema:
type: string
- name: include_dataset
in: query
description: A case insensitive string. Any value besides true will have no effect. If the string is 'true', the full dataset for each revision will be included in the response
required: false
schema:
type: string
enum: ['true', 'false']
responses:
'200':
description: The list of revised datasets that the referenced dataset is a member of including the index number of the revision, where 1 is the oldest version of any revision chain
content:
application/json:
schema:
type: object
properties:
uuid:
type: string
description: The uuid of a dataset
revision_number:
type: integer
description: The number in the revision chain of this dataset where 1 is the oldest revision
dataset:
$ref: '#/components/schemas/Dataset'
'400':
description: Invalid or misformatted entity identifier, or the given entity is not a Dataset
'401':
description: The user's token has expired or the user did not supply a valid token
'403':
description: The user is not authorized to query the revision number of the given dataset.
'404':
description: The target dataset could not be found
'500':
description: Internal error
'/datasets/{id}/revisions':
get:
summary: 'From a given ID of a versioned dataset, retrieve a list of every dataset in the chain ordered from most recent to oldest. The revision number, as well as the dataset uuid will be included. An optional parameter ?include_dataset=true will include the full dataset for each revision as well. Public/Consortium access rules apply, if is for a non-public dataset and no token or a token without membership in HuBMAP-Read group is sent with the request then a 403 response should be returned. If the given id is published, but later revisions are not and the user is not in HuBMAP-Read group, only published revisions will be returned. The field next_revision_uuid will not be returned if the next revision is unpublished'
Expand All @@ -1890,7 +1933,7 @@ paths:
required: false
schema:
type: string
enum: ['true', 'false']
enum: [ 'true', 'false' ]
responses:
'200':
description: The list of revised datasets that the referenced dataset is a member of including the index number of the revision, where 1 is the oldest version of any revision chain
Expand Down
94 changes: 94 additions & 0 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2279,6 +2279,100 @@ def get_dataset_revision_number(id):
return jsonify(revision_number)


"""
Retrieve a list of all multi revisions of a dataset from the id of any dataset in the chain.
E.g: If there are 5 revisions, and the id for revision 4 is given, a list of revisions
1-5 will be returned in reverse order (newest first). Non-public access is only required to
retrieve information on non-published datasets. Output will be a list of dictionaries. Each dictionary
contains the dataset revision number and its list of uuids. Optionally, the full dataset can be included for each.
By default, only the revision number and uuids are included. To include the full dataset, the query
parameter "include_dataset" can be given with the value of "true". If this parameter is not included or
is set to false, the dataset will not be included. For example, to include the full datasets for each revision,
use '/datasets/<id>/multi-revisions?include_dataset=true'. To omit the datasets, either set include_dataset=false, or
simply do not include this parameter.
Parameters
----------
id : str
The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target dataset
Returns
-------
list
The list of revision datasets
"""
@app.route('/entities/<id>/multi-revisions', methods=['GET'])
@app.route('/datasets/<id>/multi-revisions', methods=['GET'])
def get_multi_revisions_list(id):
    # By default, only return each revision's uuid (property_key = 'uuid').
    # A case-insensitive query parameter include_dataset=true switches to
    # returning the full dataset for each revision (property_key = None).
    # request.args.get() already yields None/default when the parameter or
    # the whole query string is absent, so no separate request.args guard
    # is needed.
    include_dataset = request.args.get('include_dataset', '')
    property_key = None if include_dataset.lower() == 'true' else 'uuid'

    # Token is not required, but if an invalid token provided,
    # we need to tell the client with a 401 error
    validate_token_if_auth_header_exists(request)

    # Use the internal token to query the target entity
    # since public entities don't require user token
    token = get_internal_token()

    # Query target entity against uuid-api and neo4j and return as a dict if exists
    entity_dict = query_target_entity(id, token)
    normalized_entity_type = entity_dict['entity_type']

    # Revisions only exist for Datasets (or subtypes of Dataset)
    if not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
        abort_bad_req("The entity is not a Dataset. Found entity type:" + normalized_entity_type)

    # Only published/public datasets don't require token
    if entity_dict['status'].lower() != DATASET_STATUS_PUBLISHED:
        # Token is required and the user must belong to HuBMAP-READ group
        token = get_user_token(request, non_public_access_required=True)

    # By now, either the entity is public accessible or
    # the user token has the correct access level
    # Get the all the sorted (DESC based on creation timestamp) revisions.
    # Result shape is [next_revisions, prev_revisions]; see
    # app_neo4j_queries.get_sorted_multi_revisions.
    sorted_revisions_list = app_neo4j_queries.get_sorted_multi_revisions(neo4j_driver_instance, entity_dict['uuid'],
                                                                         fetch_all=user_in_hubmap_read_group(request),
                                                                         property_key=property_key)

    # Skip some of the properties that are time-consuming to generate via triggers
    properties_to_skip = [
        'direct_ancestors',
        'collections',
        'upload',
        'title'
    ]

    normalized_revisions_list = []
    # Merge into a single newest-first sequence: the "next" (newer) revisions
    # already come newest-first; the "prev" (older) revisions are reversed so
    # the oldest ends up last.
    sorted_revisions_list_merged = sorted_revisions_list[0] + sorted_revisions_list[1][::-1]

    if property_key is None:
        # Full datasets requested: run triggers and normalize each revision
        # group for the response
        for revision in sorted_revisions_list_merged:
            complete_revision_list = schema_manager.get_complete_entities_list(token, revision, properties_to_skip)
            normal = schema_manager.normalize_entities_list_for_response(complete_revision_list)
            normalized_revisions_list.append(normal)
    else:
        # uuid-only mode: the query already returned just the uuids
        normalized_revisions_list = sorted_revisions_list_merged

    # Now all we need to do is to compose the result list.
    # revision_number counts down so that 1 is the oldest revision.
    results = []
    revision_number = len(normalized_revisions_list)
    for revision in normalized_revisions_list:
        result = {
            'revision_number': revision_number,
            'uuids': revision
        }
        results.append(result)
        revision_number -= 1

    return jsonify(results)



"""
Retract a published dataset with a retraction reason and sub status
Expand Down
60 changes: 60 additions & 0 deletions src/app_neo4j_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,66 @@ def get_sorted_revisions(neo4j_driver, uuid):
return results


"""
Get all revisions for a given dataset uuid and sort them in descending order based on their creation time
Parameters
----------
neo4j_driver : neo4j.Driver object
The neo4j database connection pool
uuid : str
The uuid of target entity
fetch_all : bool
Whether to fetch all Datasets or only include Published
property_key : str
Return only a particular property from the cypher query, None for return all
Returns
-------
list
    A two-dimensional list [next_revisions<list<list>>, prev_revisions<list<list>>] (newer revisions first, matching the order of the RETURN clause)
"""


def get_sorted_multi_revisions(neo4j_driver, uuid, fetch_all=True, property_key=False):
    results = []
    # Unless the caller may see everything, restrict both directions of the
    # revision chain to Published datasets
    match_case = '' if fetch_all is True else 'AND prev.status = "Published" AND next.status = "Published" '
    # When a property_key is given (falsy default returns whole nodes),
    # collect only that single property from each node
    collect_prop = f".{property_key}" if property_key else ''

    # NOTE(review): uuid is interpolated directly into the Cypher string.
    # It comes from an upstream uuid-api/neo4j lookup so it is expected to be
    # a validated uuid, but a parameterized query ($uuid) would be safer.
    query = (
        "MATCH (e:Dataset), (next:Dataset), (prev:Dataset),"
        "p = (e)-[:REVISION_OF *0..]->(prev),"
        "n = (e)<-[:REVISION_OF *0..]-(next) "
        f"WHERE e.uuid='{uuid}' {match_case}"
        "WITH length(p) AS p_len, prev, length(n) AS n_len, next "
        "ORDER BY prev.created_timestamp, next.created_timestamp DESC "
        f"WITH p_len, collect(distinct prev{collect_prop}) AS prev_revisions, n_len, collect(distinct next{collect_prop}) AS next_revisions "
        f"RETURN [collect(distinct next_revisions), collect(distinct prev_revisions)] AS {record_field_name}"
    )

    # Fixed copy-paste: previously logged as get_sorted_revisions()
    logger.info("======get_sorted_multi_revisions() query======")
    logger.info(query)

    with neo4j_driver.session() as session:
        record = session.read_transaction(schema_neo4j_queries.execute_readonly_tx, query)

        if record and record[record_field_name] and len(record[record_field_name]) > 0:
            # The target dataset appears in both the "next" and "prev"
            # collections; drop the duplicate from the end of the next list
            record[record_field_name][0].pop()
            if property_key:
                # Single-property mode: nothing to convert, return as-is
                return record[record_field_name]

            for collection in record[record_field_name]:  # two collections: next, prev
                revs = []
                for rev in collection:  # each collection list contains revision lists, so 2 dimensional array
                    # Convert the list of neo4j nodes to a list of dicts
                    nodes_to_dicts = schema_neo4j_queries.nodes_to_dicts(rev)
                    revs.append(nodes_to_dicts)

                results.append(revs)

    return results


"""
Get all previous revisions of the target entity by uuid
Expand Down

0 comments on commit 80134b3

Please sign in to comment.