Skip to content

Commit

Permalink
Merge pull request #329 from sennetconsortium/tjmadonna/314-synchroni…
Browse files Browse the repository at this point in the history
…ze-component-dataset-status

Tjmadonna/314 synchronize component dataset status
  • Loading branch information
maxsibilla authored Apr 11, 2024
2 parents 8864887 + 3722582 commit bcf2c3d
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 24 deletions.
38 changes: 22 additions & 16 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,15 +213,18 @@ def close_neo4j_driver(error):
try:
# The schema_manager is a singleton module
# Pass in auth_helper_instance, neo4j_driver instance, and file_upload_helper instance
schema_manager.initialize(app.config['SCHEMA_YAML_FILE'],
app.config['UUID_API_URL'],
app.config['INGEST_API_URL'],
app.config['SEARCH_API_URL'],
auth_helper_instance,
neo4j_driver_instance,
app.ubkg,
memcached_client_instance,
app.config['MEMCACHED_PREFIX'])
schema_manager.initialize(
valid_yaml_file=app.config['SCHEMA_YAML_FILE'],
uuid_api_url=app.config['UUID_API_URL'],
entity_api_url=app.config['ENTITY_API_URL'],
ingest_api_url=app.config['INGEST_API_URL'],
search_api_url=app.config['SEARCH_API_URL'],
auth_helper_instance=auth_helper_instance,
neo4j_driver_instance=neo4j_driver_instance,
ubkg_instance=app.ubkg,
memcached_client_instance=memcached_client_instance,
memcached_prefix=app.config['MEMCACHED_PREFIX']
)

logger.info("Initialized schema_manager module successfully :)")
# Use a broad catch-all here
Expand Down Expand Up @@ -1284,6 +1287,10 @@ def update_entity(id):
normalized_status = schema_manager.normalize_status(json_data_dict["status"])
json_data_dict["status"] = normalized_status

has_updated_status = False
if 'status' in json_data_dict and json_data_dict['status']:
has_updated_status = True

# Normalize user provided status
if "sub_status" in json_data_dict:
normalized_status = schema_manager.normalize_status(json_data_dict["sub_status"])
Expand All @@ -1298,7 +1305,6 @@ def update_entity(id):
# Normalize user provided entity_type
normalized_entity_type = schema_manager.normalize_entity_type(entity_dict['entity_type'])


verify_ubkg_properties(json_data_dict)

# Note, we don't support entity level validators on entity update via PUT
Expand All @@ -1307,7 +1313,7 @@ def update_entity(id):
# Validate request json against the yaml schema
# Pass in the entity_dict for missing required key check, this is different from creating new entity
try:
schema_manager.validate_json_data_against_schema('ENTITIES', json_data_dict, normalized_entity_type, existing_entity_dict = entity_dict)
schema_manager.validate_json_data_against_schema('ENTITIES', json_data_dict, normalized_entity_type, existing_entity_dict=entity_dict)
except schema_errors.SchemaValidationException as e:
# No need to log the validation errors
abort_bad_req(str(e))
Expand Down Expand Up @@ -1342,7 +1348,7 @@ def update_entity(id):
# Generate 'before_update_trigger' data and update the entity details in Neo4j
merged_updated_dict = update_object_details('ENTITIES', request, normalized_entity_type, user_token, json_data_dict, entity_dict)

# Handle linkages update via `after_update_trigger` methods
# Handle linkages update via `after_update_trigger` methods
if has_direct_ancestor_uuid:
after_update(normalized_entity_type, user_token, merged_updated_dict)
elif normalized_entity_type in ['Dataset', 'Publication']:
Expand All @@ -1360,7 +1366,7 @@ def update_entity(id):
merged_updated_dict = update_object_details('ENTITIES', request, normalized_entity_type, user_token, json_data_dict, entity_dict)

# Handle linkages update via `after_update_trigger` methods
if has_direct_ancestor_uuids:
if has_direct_ancestor_uuids or has_updated_status:
after_update(normalized_entity_type, user_token, merged_updated_dict)
elif normalized_entity_type == 'Upload':
has_dataset_uuids_to_link = False
Expand Down Expand Up @@ -1392,11 +1398,11 @@ def update_entity(id):
merged_updated_dict = update_object_details('ENTITIES', request, normalized_entity_type, user_token, json_data_dict, entity_dict)

# Handle linkages update via `after_update_trigger` methods
if has_dataset_uuids_to_link or has_dataset_uuids_to_unlink:
if has_dataset_uuids_to_link or has_updated_status:
after_update(normalized_entity_type, user_token, merged_updated_dict)
elif normalized_entity_type == 'Collection':
entity_visibility = _get_entity_visibility( normalized_entity_type=normalized_entity_type
,entity_dict=entity_dict)
entity_visibility = _get_entity_visibility(normalized_entity_type=normalized_entity_type, entity_dict=entity_dict)

# Prohibit update of an existing Collection if it meets criteria of being visible to public e.g. has DOI.
if entity_visibility == DataVisibilityEnum.PUBLIC:
logger.info(f"Attempt to update {normalized_entity_type} with id={id} which has visibility {entity_visibility}.")
Expand Down
5 changes: 5 additions & 0 deletions src/instance/app.cfg.example
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ UUID_API_URL = 'http://uuid-api:8080'
# Works regardless of the trailing slash
INGEST_API_URL = 'http://ingest-api:8080'

# URL for talking to Entity API (default for Localhost)
# This is the same URL base where entity-api is running. This is useful in places where a call for one entity
# necessitates subsequent calls for other entities.
ENTITY_API_URL = 'http://localhost:5002'

# URL for talking to Search API (default value used for docker deployment, no token needed)
# Don't use localhost since search-api is running on a different container
# Point to remote URL for non-docker development
Expand Down
4 changes: 3 additions & 1 deletion src/schema/provenance_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -369,11 +369,13 @@ ENTITIES:
before_property_update_validators:
- validate_application_header_before_property_update
- validate_dataset_status_value
- validate_status_changed
- validate_dataset_not_component
generated: true
description: "One of: New|Processing|QA|Published|Error|Hold|Invalid|Incomplete"
before_create_trigger: set_dataset_status_new
after_create_trigger: set_status_history
after_update_trigger: set_status_history
after_update_trigger: update_status
status_history:
type: list
description: "A list of all status change events. Each entry in the list is a dictionary containing the change_timestamp, changed_by_email, previous_status, new_status"
Expand Down
5 changes: 5 additions & 0 deletions src/schema/schema_constants.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from enum import Enum


class SchemaConstants(object):
# Expire the request cache after the time-to-live (seconds), default 4 hours
REQUEST_CACHE_TTL = 14400
MEMCACHED_TTL = 7200

# Constants used by validators
INGEST_API_APP = 'ingest-api'
COMPONENT_DATASET = 'component-dataset'
INGEST_PIPELINE_APP = 'ingest-pipeline'
INGEST_PORTAL_APP = 'portal-ui'
# HTTP header names are case-insensitive
SENNET_APP_HEADER = 'X-SenNet-Application'
INTERNAL_TRIGGER = 'X-Internal-Trigger'
DATASET_STATUS_PUBLISHED = 'published'

# Used by triggers, all lowercase for easy comparison
Expand All @@ -25,6 +29,7 @@ class SchemaConstants(object):
ALLOWED_DATASET_STATUSES = ['new', 'processing', 'published', 'qa', 'error', 'hold', 'invalid', 'submitted', 'incomplete']
ALLOWED_UPLOAD_STATUSES = ['new', 'valid', 'invalid', 'error', 'reorganized', 'processing', 'submitted', 'incomplete']


# Define an enumeration to classify an entity's visibility, which can be combined with
# authorization info when verify operations on a request.
class DataVisibilityEnum(Enum):
Expand Down
31 changes: 26 additions & 5 deletions src/schema/schema_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import yaml
import logging
import requests
from flask import Response
from datetime import datetime

from flask import Response
from hubmap_commons.file_helper import ensureTrailingSlashURL

# Don't confuse urllib (Python native library) with urllib3 (3rd-party library, requests also uses urllib3)
from requests.packages.urllib3.exceptions import InsecureRequestWarning

Expand All @@ -15,9 +17,6 @@
from schema.schema_constants import SchemaConstants
from schema import schema_neo4j_queries

# HuBMAP commons
from hubmap_commons.hm_auth import AuthHelper

# Atlas Consortia commons
from atlas_consortia_commons.rest import *

Expand All @@ -36,6 +35,7 @@
# A single leading underscore means you're not supposed to access it "from the outside"
_schema = None
_uuid_api_url = None
_entity_api_url = None
_ingest_api_url = None
_search_api_url = None
_auth_helper = None
Expand Down Expand Up @@ -66,6 +66,7 @@

def initialize(valid_yaml_file,
uuid_api_url,
entity_api_url,
ingest_api_url,
search_api_url,
auth_helper_instance,
Expand All @@ -77,6 +78,7 @@ def initialize(valid_yaml_file,
# Specify as module-scope variables
global _schema
global _uuid_api_url
global _entity_api_url
global _ingest_api_url
global _search_api_url
global _auth_helper
Expand All @@ -93,6 +95,13 @@ def initialize(valid_yaml_file,
_ingest_api_url = ingest_api_url
_search_api_url = search_api_url

if entity_api_url is not None:
_entity_api_url = entity_api_url
else:
msg = f"Unable to initialize schema manager with entity_api_url={entity_api_url}."
logger.critical(msg=msg)
raise Exception(msg)

# Get the helper instances
_auth_helper = auth_helper_instance
_neo4j_driver = neo4j_driver_instance
Expand Down Expand Up @@ -2032,4 +2041,16 @@ def delete_memcached_cache(uuids_list):

_memcached_client.delete_many(cache_keys)

logger.info(f"Deleted cache by key: {', '.join(cache_keys)}")
logger.info(f"Deleted cache by key: {', '.join(cache_keys)}")


def get_entity_api_url():
    """ Get the entity-api URL to be used by trigger methods.

    Reads the module-level `_entity_api_url` set by `initialize()`; no `global`
    declaration is needed because the variable is only read, never rebound.

    Returns
    -------
    str
        The entity-api URL ending with a trailing slash
    """
    return ensureTrailingSlashURL(_entity_api_url)
70 changes: 68 additions & 2 deletions src/schema/schema_triggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1824,6 +1824,71 @@ def set_was_derived_from(property_key, normalized_type, user_token, existing_dat
raise


def update_status(property_key, normalized_type, user_token, existing_data_dict, new_data_dict):
    """
    Trigger event method that runs every function involved with updating the status value.

    Records the status change in the entity's status_history and then propagates the
    new status to any component (multi-assay split) child datasets.

    Parameters
    ----------
    property_key : str
        The target property key
    normalized_type : str
        One of the types defined in the schema yaml: Dataset
    user_token: str
        The user's globus nexus token
    existing_data_dict : dict
        A dictionary that contains all existing entity properties
    new_data_dict : dict
        A merged dictionary that contains all possible input data to be used
    """
    # Run each status-related trigger in order: history first, then child sync
    status_triggers = (set_status_history, sync_component_dataset_status)
    for trigger_func in status_triggers:
        trigger_func(property_key, normalized_type, user_token, existing_data_dict, new_data_dict)


def sync_component_dataset_status(property_key, normalized_type, user_token, existing_data_dict, new_data_dict):
    """
    Function that changes the status of component datasets when their parent multi-assay dataset's status changes.

    For every child whose creating activity is 'Multi-Assay Split', issues a PUT to
    entity-api carrying the internal-trigger header so the child's own component-dataset
    validator is bypassed.

    Parameters
    ----------
    property_key : str
        The target property key
    normalized_type : str
        One of the types defined in the schema yaml: Dataset
    user_token: str
        The user's globus nexus token
    existing_data_dict : dict
        A dictionary that contains all existing entity properties
    new_data_dict : dict
        A merged dictionary that contains all possible input data to be used

    Raises
    ------
    KeyError
        If 'uuid' or 'status' is missing from existing_data_dict
    """

    if 'uuid' not in existing_data_dict:
        raise KeyError("Missing 'uuid' key in 'existing_data_dict' during calling 'sync_component_dataset_status()' trigger method.")
    uuid = existing_data_dict['uuid']
    if 'status' not in existing_data_dict:
        raise KeyError("Missing 'status' key in 'existing_data_dict' during calling 'sync_component_dataset_status()' trigger method.")
    status = existing_data_dict['status']

    # Hoist the driver lookup out of the loop; the same instance serves every child query
    neo4j_driver = schema_manager.get_neo4j_driver_instance()
    children_uuids_list = schema_neo4j_queries.get_children(neo4j_driver, uuid, property_key='uuid')
    status_body = {"status": status}

    for child_uuid in children_uuids_list:
        creation_action = schema_neo4j_queries.get_entity_creation_action_activity(neo4j_driver, child_uuid)
        if creation_action == 'Multi-Assay Split':
            # Update the status of the child entities via entity-api so the child's own
            # triggers and validators run
            url = schema_manager.get_entity_api_url() + 'entities/' + child_uuid
            header = schema_manager._create_request_headers(user_token)
            header[SchemaConstants.SENNET_APP_HEADER] = SchemaConstants.INGEST_API_APP
            header[SchemaConstants.INTERNAL_TRIGGER] = SchemaConstants.COMPONENT_DATASET
            response = requests.put(url=url, headers=header, json=status_body)
            if response.status_code != 200:
                # Best-effort: log and continue so one failed child doesn't block the rest
                logger.error(f"Failed to update status of child entity {child_uuid} when parent dataset status changed: {response.text}")


####################################################################################################
## Trigger methods specific to Collection - DO NOT RENAME
####################################################################################################
Expand Down Expand Up @@ -2801,6 +2866,7 @@ def source_metadata_display_value(metadata_item: dict) -> str:
####################################################################################################
## Trigger methods shared by Dataset, Upload, and Publication - DO NOT RENAME
####################################################################################################

def set_status_history(property_key, normalized_type, user_token, existing_data_dict, new_data_dict):
new_status_history = []
status_entry = {}
Expand Down Expand Up @@ -2839,5 +2905,5 @@ def set_status_history(property_key, normalized_type, user_token, existing_data_
new_status_history.append(status_entry)
entity_data_dict = {"status_history": new_status_history}

schema_neo4j_queries.update_entity(schema_manager.get_neo4j_driver_instance(), normalized_type, entity_data_dict,
uuid)
schema_neo4j_queries.update_entity(schema_manager.get_neo4j_driver_instance(), normalized_type, entity_data_dict, uuid)

55 changes: 55 additions & 0 deletions src/schema/schema_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,61 @@ def validate_group_name(property_key, normalized_entity_type, request, existing_
raise ValueError("Invalid group in 'assigned_to_group_name'. Must be a data provider")


def validate_status_changed(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
    """
    Validate that status, if included in new_data_dict, is different from the existing status value

    Parameters
    ----------
    property_key : str
        The target property key
    normalized_entity_type : str
        One of the entity types defined in the schema yaml: Dataset
    request: Flask request object
        The instance of Flask request passed in from application request
    existing_data_dict : dict
        A dictionary that contains all existing entity properties
    new_data_dict : dict
        The json data in request body, already after the regular validations

    Raises
    ------
    KeyError
        If 'status' is missing from existing_data_dict
    ValueError
        If the requested status equals the existing status (case-insensitive)
    """

    if 'status' not in existing_data_dict:
        raise KeyError("Missing 'status' key in 'existing_data_dict' during calling 'validate_status_changed()' validator method.")

    # Only allow 'status' in new_data_dict if it's different than the existing status value
    # (comparison is case-insensitive since statuses are normalized elsewhere)
    if existing_data_dict['status'].lower() == new_data_dict['status'].lower():
        raise ValueError(f"Status value is already {existing_data_dict['status']}, cannot change to "
                         f"{new_data_dict['status']}. If no change, do not include status field in update")


def validate_dataset_not_component(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
    """
    Validate that a given dataset is not a component of a multi-assay split parent dataset before allowing status to be
    updated. If a component dataset needs to be updated, update it via its parent multi-assay dataset

    Parameters
    ----------
    property_key : str
        The target property key
    normalized_entity_type : str
        One of the entity types defined in the schema yaml: Dataset
    request: Flask request object
        The instance of Flask request passed in from application request
    existing_data_dict : dict
        A dictionary that contains all existing entity properties
    new_data_dict : dict
        The json data in request body, already after the regular validations

    Raises
    ------
    ValueError
        If the dataset was created by a 'Multi-Assay Split' activity and the request
        did not originate from the internal component-dataset trigger
    """
    headers = request.headers
    # Requests issued by the sync_component_dataset_status trigger carry the
    # internal-trigger header and are allowed through; all other callers may not
    # change status on a component dataset directly
    if headers.get(SchemaConstants.INTERNAL_TRIGGER) != SchemaConstants.COMPONENT_DATASET:
        neo4j_driver_instance = schema_manager.get_neo4j_driver_instance()
        uuid = existing_data_dict['uuid']
        creation_action = schema_neo4j_queries.get_entity_creation_action_activity(neo4j_driver_instance, uuid)
        if creation_action == 'Multi-Assay Split':
            raise ValueError(f"Unable to modify existing {existing_data_dict['entity_type']}"
                             f" {existing_data_dict['uuid']}. Can not change status on component datasets directly. Status"
                             f" change must occur on parent multi-assay split dataset")


####################################################################################################
## Internal Functions
####################################################################################################
Expand Down
5 changes: 5 additions & 0 deletions test/config/app.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ UUID_API_URL = 'http://uuid-api:8080'
# Works regardless of the trailing slash
INGEST_API_URL = 'http://ingest-api:8080'

# URL for talking to Entity API (default for Localhost)
# This is the same URL base where entity-api is running. This is useful in places where a call for one entity
# necessitates subsequent calls for other entities.
ENTITY_API_URL = 'http://entity-api:5002'

# URL for talking to Search API (default value used for docker deployment, no token needed)
# Don't use localhost since search-api is running on a different container
# Point to remote URL for non-docker development
Expand Down

0 comments on commit bcf2c3d

Please sign in to comment.