Skip to content

Commit

Permalink
Merge pull request #329 from sennetconsortium/tjmadonna/314-synchroni…
Browse files Browse the repository at this point in the history
…ze-component-dataset-status

Tjmadonna/314 synchronize component dataset status
  • Loading branch information
maxsibilla authored Apr 11, 2024
2 parents 8864887 + 3722582 commit bcf2c3d
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 24 deletions.
38 changes: 22 additions & 16 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,15 +213,18 @@ def close_neo4j_driver(error):
try:
# The schema_manager is a singleton module
# Pass in auth_helper_instance, neo4j_driver instance, and file_upload_helper instance
schema_manager.initialize(app.config['SCHEMA_YAML_FILE'],
app.config['UUID_API_URL'],
app.config['INGEST_API_URL'],
app.config['SEARCH_API_URL'],
auth_helper_instance,
neo4j_driver_instance,
app.ubkg,
memcached_client_instance,
app.config['MEMCACHED_PREFIX'])
schema_manager.initialize(
valid_yaml_file=app.config['SCHEMA_YAML_FILE'],
uuid_api_url=app.config['UUID_API_URL'],
entity_api_url=app.config['ENTITY_API_URL'],
ingest_api_url=app.config['INGEST_API_URL'],
search_api_url=app.config['SEARCH_API_URL'],
auth_helper_instance=auth_helper_instance,
neo4j_driver_instance=neo4j_driver_instance,
ubkg_instance=app.ubkg,
memcached_client_instance=memcached_client_instance,
memcached_prefix=app.config['MEMCACHED_PREFIX']
)

logger.info("Initialized schema_manager module successfully :)")
# Use a broad catch-all here
Expand Down Expand Up @@ -1284,6 +1287,10 @@ def update_entity(id):
normalized_status = schema_manager.normalize_status(json_data_dict["status"])
json_data_dict["status"] = normalized_status

has_updated_status = False
if 'status' in json_data_dict and json_data_dict['status']:
has_updated_status = True

# Normalize user provided status
if "sub_status" in json_data_dict:
normalized_status = schema_manager.normalize_status(json_data_dict["sub_status"])
Expand All @@ -1298,7 +1305,6 @@ def update_entity(id):
# Normalize user provided entity_type
normalized_entity_type = schema_manager.normalize_entity_type(entity_dict['entity_type'])


verify_ubkg_properties(json_data_dict)

# Note, we don't support entity level validators on entity update via PUT
Expand All @@ -1307,7 +1313,7 @@ def update_entity(id):
# Validate request json against the yaml schema
# Pass in the entity_dict for missing required key check, this is different from creating new entity
try:
schema_manager.validate_json_data_against_schema('ENTITIES', json_data_dict, normalized_entity_type, existing_entity_dict = entity_dict)
schema_manager.validate_json_data_against_schema('ENTITIES', json_data_dict, normalized_entity_type, existing_entity_dict=entity_dict)
except schema_errors.SchemaValidationException as e:
# No need to log the validation errors
abort_bad_req(str(e))
Expand Down Expand Up @@ -1342,7 +1348,7 @@ def update_entity(id):
# Generate 'before_update_trigger' data and update the entity details in Neo4j
merged_updated_dict = update_object_details('ENTITIES', request, normalized_entity_type, user_token, json_data_dict, entity_dict)

# Handle linkages update via `after_update_trigger` methods
# Handle linkages update via `after_update_trigger` methods
if has_direct_ancestor_uuid:
after_update(normalized_entity_type, user_token, merged_updated_dict)
elif normalized_entity_type in ['Dataset', 'Publication']:
Expand All @@ -1360,7 +1366,7 @@ def update_entity(id):
merged_updated_dict = update_object_details('ENTITIES', request, normalized_entity_type, user_token, json_data_dict, entity_dict)

# Handle linkages update via `after_update_trigger` methods
if has_direct_ancestor_uuids:
if has_direct_ancestor_uuids or has_updated_status:
after_update(normalized_entity_type, user_token, merged_updated_dict)
elif normalized_entity_type == 'Upload':
has_dataset_uuids_to_link = False
Expand Down Expand Up @@ -1392,11 +1398,11 @@ def update_entity(id):
merged_updated_dict = update_object_details('ENTITIES', request, normalized_entity_type, user_token, json_data_dict, entity_dict)

# Handle linkages update via `after_update_trigger` methods
if has_dataset_uuids_to_link or has_dataset_uuids_to_unlink:
if has_dataset_uuids_to_link or has_updated_status:
after_update(normalized_entity_type, user_token, merged_updated_dict)
elif normalized_entity_type == 'Collection':
entity_visibility = _get_entity_visibility( normalized_entity_type=normalized_entity_type
,entity_dict=entity_dict)
entity_visibility = _get_entity_visibility(normalized_entity_type=normalized_entity_type, entity_dict=entity_dict)

# Prohibit update of an existing Collection if it meets criteria of being visible to public e.g. has DOI.
if entity_visibility == DataVisibilityEnum.PUBLIC:
logger.info(f"Attempt to update {normalized_entity_type} with id={id} which has visibility {entity_visibility}.")
Expand Down
5 changes: 5 additions & 0 deletions src/instance/app.cfg.example
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ UUID_API_URL = 'http://uuid-api:8080'
# Works regardless of the trailing slash
INGEST_API_URL = 'http://ingest-api:8080'

# URL for talking to Entity API (default for Localhost)
# This is the same URL base where entity-api is running. This is useful in places where a call for one entity
# necessitates subsequent calls for other entities.
ENTITY_API_URL = 'http://localhost:5002'

# URL for talking to Search API (default value used for docker deployment, no token needed)
# Don't use localhost since search-api is running on a different container
# Point to remote URL for non-docker development
Expand Down
4 changes: 3 additions & 1 deletion src/schema/provenance_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -369,11 +369,13 @@ ENTITIES:
before_property_update_validators:
- validate_application_header_before_property_update
- validate_dataset_status_value
- validate_status_changed
- validate_dataset_not_component
generated: true
description: "One of: New|Processing|QA|Published|Error|Hold|Invalid|Incomplete"
before_create_trigger: set_dataset_status_new
after_create_trigger: set_status_history
after_update_trigger: set_status_history
after_update_trigger: update_status
status_history:
type: list
description: "A list of all status change events. Each entry in the list is a dictionary containing the change_timestamp, changed_by_email, previous_status, new_status"
Expand Down
5 changes: 5 additions & 0 deletions src/schema/schema_constants.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from enum import Enum


class SchemaConstants(object):
# Expire the request cache after the time-to-live (seconds), default 4 hours
REQUEST_CACHE_TTL = 14400
MEMCACHED_TTL = 7200

# Constants used by validators
INGEST_API_APP = 'ingest-api'
COMPONENT_DATASET = 'component-dataset'
INGEST_PIPELINE_APP = 'ingest-pipeline'
INGEST_PORTAL_APP = 'portal-ui'
# HTTP header names are case-insensitive
SENNET_APP_HEADER = 'X-SenNet-Application'
INTERNAL_TRIGGER = 'X-Internal-Trigger'
DATASET_STATUS_PUBLISHED = 'published'

# Used by triggers, all lowercase for easy comparison
Expand All @@ -25,6 +29,7 @@ class SchemaConstants(object):
ALLOWED_DATASET_STATUSES = ['new', 'processing', 'published', 'qa', 'error', 'hold', 'invalid', 'submitted', 'incomplete']
ALLOWED_UPLOAD_STATUSES = ['new', 'valid', 'invalid', 'error', 'reorganized', 'processing', 'submitted', 'incomplete']


# Define an enumeration to classify an entity's visibility, which can be combined with
# authorization info when verify operations on a request.
class DataVisibilityEnum(Enum):
Expand Down
31 changes: 26 additions & 5 deletions src/schema/schema_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import yaml
import logging
import requests
from flask import Response
from datetime import datetime

from flask import Response
from hubmap_commons.file_helper import ensureTrailingSlashURL

# Don't confuse urllib (Python native library) with urllib3 (3rd-party library, requests also uses urllib3)
from requests.packages.urllib3.exceptions import InsecureRequestWarning

Expand All @@ -15,9 +17,6 @@
from schema.schema_constants import SchemaConstants
from schema import schema_neo4j_queries

# HuBMAP commons
from hubmap_commons.hm_auth import AuthHelper

# Atlas Consortia commons
from atlas_consortia_commons.rest import *

Expand All @@ -36,6 +35,7 @@
# A single leading underscore means you're not supposed to access it "from the outside"
_schema = None
_uuid_api_url = None
_entity_api_url = None
_ingest_api_url = None
_search_api_url = None
_auth_helper = None
Expand Down Expand Up @@ -66,6 +66,7 @@

def initialize(valid_yaml_file,
uuid_api_url,
entity_api_url,
ingest_api_url,
search_api_url,
auth_helper_instance,
Expand All @@ -77,6 +78,7 @@ def initialize(valid_yaml_file,
# Specify as module-scope variables
global _schema
global _uuid_api_url
global _entity_api_url
global _ingest_api_url
global _search_api_url
global _auth_helper
Expand All @@ -93,6 +95,13 @@ def initialize(valid_yaml_file,
_ingest_api_url = ingest_api_url
_search_api_url = search_api_url

if entity_api_url is not None:
_entity_api_url = entity_api_url
else:
msg = f"Unable to initialize schema manager with entity_api_url={entity_api_url}."
logger.critical(msg=msg)
raise Exception(msg)

# Get the helper instances
_auth_helper = auth_helper_instance
_neo4j_driver = neo4j_driver_instance
Expand Down Expand Up @@ -2032,4 +2041,16 @@ def delete_memcached_cache(uuids_list):

_memcached_client.delete_many(cache_keys)

logger.info(f"Deleted cache by key: {', '.join(cache_keys)}")
logger.info(f"Deleted cache by key: {', '.join(cache_keys)}")


def get_entity_api_url():
    """ Get the entity-api URL to be used by trigger methods.

    Reads the module-level `_entity_api_url` set by `initialize()`; no `global`
    declaration is needed because the variable is only read, never rebound.

    Returns
    -------
    str
        The entity-api URL ending with a trailing slash
    """
    return ensureTrailingSlashURL(_entity_api_url)
70 changes: 68 additions & 2 deletions src/schema/schema_triggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1824,6 +1824,71 @@ def set_was_derived_from(property_key, normalized_type, user_token, existing_dat
raise


def update_status(property_key, normalized_type, user_token, existing_data_dict, new_data_dict):
    """
    Trigger event method that runs every function involved with updating the status value.

    Records the status change in the entity's status_history and then propagates the
    new status to any component (multi-assay split) child datasets.

    Parameters
    ----------
    property_key : str
        The target property key
    normalized_type : str
        One of the types defined in the schema yaml: Dataset
    user_token: str
        The user's globus nexus token
    existing_data_dict : dict
        A dictionary that contains all existing entity properties
    new_data_dict : dict
        A merged dictionary that contains all possible input data to be used
    """
    # Run each status-related trigger in order: history first, then child sync
    status_triggers = (set_status_history, sync_component_dataset_status)
    for trigger_func in status_triggers:
        trigger_func(property_key, normalized_type, user_token, existing_data_dict, new_data_dict)


def sync_component_dataset_status(property_key, normalized_type, user_token, existing_data_dict, new_data_dict):
    """
    Function that changes the status of component datasets when their parent multi-assay dataset's status changes.

    For every child whose creating activity is 'Multi-Assay Split', issues a PUT to
    entity-api carrying the internal-trigger header so the child's own component-dataset
    validator is bypassed.

    Parameters
    ----------
    property_key : str
        The target property key
    normalized_type : str
        One of the types defined in the schema yaml: Dataset
    user_token: str
        The user's globus nexus token
    existing_data_dict : dict
        A dictionary that contains all existing entity properties
    new_data_dict : dict
        A merged dictionary that contains all possible input data to be used

    Raises
    ------
    KeyError
        If 'uuid' or 'status' is missing from existing_data_dict
    """

    if 'uuid' not in existing_data_dict:
        raise KeyError("Missing 'uuid' key in 'existing_data_dict' during calling 'sync_component_dataset_status()' trigger method.")
    uuid = existing_data_dict['uuid']
    if 'status' not in existing_data_dict:
        raise KeyError("Missing 'status' key in 'existing_data_dict' during calling 'sync_component_dataset_status()' trigger method.")
    status = existing_data_dict['status']

    # Hoist the driver lookup out of the loop; the same instance serves every child query
    neo4j_driver = schema_manager.get_neo4j_driver_instance()
    children_uuids_list = schema_neo4j_queries.get_children(neo4j_driver, uuid, property_key='uuid')
    status_body = {"status": status}

    for child_uuid in children_uuids_list:
        creation_action = schema_neo4j_queries.get_entity_creation_action_activity(neo4j_driver, child_uuid)
        if creation_action == 'Multi-Assay Split':
            # Update the status of the child entities via entity-api so the child's own
            # triggers and validators run
            url = schema_manager.get_entity_api_url() + 'entities/' + child_uuid
            header = schema_manager._create_request_headers(user_token)
            header[SchemaConstants.SENNET_APP_HEADER] = SchemaConstants.INGEST_API_APP
            header[SchemaConstants.INTERNAL_TRIGGER] = SchemaConstants.COMPONENT_DATASET
            response = requests.put(url=url, headers=header, json=status_body)
            if response.status_code != 200:
                # Best-effort: log and continue so one failed child doesn't block the rest
                logger.error(f"Failed to update status of child entity {child_uuid} when parent dataset status changed: {response.text}")


####################################################################################################
## Trigger methods specific to Collection - DO NOT RENAME
####################################################################################################
Expand Down Expand Up @@ -2801,6 +2866,7 @@ def source_metadata_display_value(metadata_item: dict) -> str:
####################################################################################################
## Trigger methods shared by Dataset, Upload, and Publication - DO NOT RENAME
####################################################################################################

def set_status_history(property_key, normalized_type, user_token, existing_data_dict, new_data_dict):
new_status_history = []
status_entry = {}
Expand Down Expand Up @@ -2839,5 +2905,5 @@ def set_status_history(property_key, normalized_type, user_token, existing_data_
new_status_history.append(status_entry)
entity_data_dict = {"status_history": new_status_history}

schema_neo4j_queries.update_entity(schema_manager.get_neo4j_driver_instance(), normalized_type, entity_data_dict,
uuid)
schema_neo4j_queries.update_entity(schema_manager.get_neo4j_driver_instance(), normalized_type, entity_data_dict, uuid)

55 changes: 55 additions & 0 deletions src/schema/schema_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,61 @@ def validate_group_name(property_key, normalized_entity_type, request, existing_
raise ValueError("Invalid group in 'assigned_to_group_name'. Must be a data provider")


def validate_status_changed(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
    """
    Validate that status, if included in new_data_dict, is different from the existing status value

    Parameters
    ----------
    property_key : str
        The target property key
    normalized_entity_type : str
        One of the entity types defined in the schema yaml: Dataset
    request: Flask request object
        The instance of Flask request passed in from application request
    existing_data_dict : dict
        A dictionary that contains all existing entity properties
    new_data_dict : dict
        The json data in request body, already after the regular validations

    Raises
    ------
    KeyError
        If 'status' is missing from existing_data_dict
    ValueError
        If the requested status equals the existing status (case-insensitive)
    """

    if 'status' not in existing_data_dict:
        raise KeyError("Missing 'status' key in 'existing_data_dict' during calling 'validate_status_changed()' validator method.")

    # Only allow 'status' in new_data_dict if it's different than the existing status value
    # (comparison is case-insensitive since statuses are normalized elsewhere)
    if existing_data_dict['status'].lower() == new_data_dict['status'].lower():
        raise ValueError(f"Status value is already {existing_data_dict['status']}, cannot change to "
                         f"{new_data_dict['status']}. If no change, do not include status field in update")


def validate_dataset_not_component(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
    """
    Validate that a given dataset is not a component of a multi-assay split parent dataset before allowing status to be
    updated. If a component dataset needs to be updated, update it via its parent multi-assay dataset

    Parameters
    ----------
    property_key : str
        The target property key
    normalized_entity_type : str
        One of the entity types defined in the schema yaml: Dataset
    request: Flask request object
        The instance of Flask request passed in from application request
    existing_data_dict : dict
        A dictionary that contains all existing entity properties
    new_data_dict : dict
        The json data in request body, already after the regular validations

    Raises
    ------
    ValueError
        If the dataset was created by a 'Multi-Assay Split' activity and the request
        did not originate from the internal component-dataset trigger
    """
    headers = request.headers
    # Requests issued by the sync_component_dataset_status trigger carry the
    # internal-trigger header and are allowed through; all other callers may not
    # change status on a component dataset directly
    if headers.get(SchemaConstants.INTERNAL_TRIGGER) != SchemaConstants.COMPONENT_DATASET:
        neo4j_driver_instance = schema_manager.get_neo4j_driver_instance()
        uuid = existing_data_dict['uuid']
        creation_action = schema_neo4j_queries.get_entity_creation_action_activity(neo4j_driver_instance, uuid)
        if creation_action == 'Multi-Assay Split':
            raise ValueError(f"Unable to modify existing {existing_data_dict['entity_type']}"
                             f" {existing_data_dict['uuid']}. Can not change status on component datasets directly. Status"
                             f" change must occur on parent multi-assay split dataset")


####################################################################################################
## Internal Functions
####################################################################################################
Expand Down
5 changes: 5 additions & 0 deletions test/config/app.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ UUID_API_URL = 'http://uuid-api:8080'
# Works regardless of the trailing slash
INGEST_API_URL = 'http://ingest-api:8080'

# URL for talking to Entity API (default for Localhost)
# This is the same URL base where entity-api is running. This is useful in places where a call for one entity
# necessitates subsequent calls for other entities.
ENTITY_API_URL = 'http://entity-api:5002'

# URL for talking to Search API (default value used for docker deployment, no token needed)
# Don't use localhost since search-api is running on a different container
# Point to remote URL for non-docker development
Expand Down

0 comments on commit bcf2c3d

Please sign in to comment.