Merge pull request #548 from hubmapconsortium/Derek-Furst/multiple-components-datasets

Derek furst/multiple components datasets
yuanzhou authored Oct 23, 2023
2 parents 33ebe05 + c14692e commit 1fa3e6d
Showing 3 changed files with 371 additions and 0 deletions.
44 changes: 44 additions & 0 deletions entity-api-spec.yaml
@@ -2512,3 +2512,47 @@ paths:
description: The given dataset is unpublished and the user does not have the authorization to view it.
'500':
description: Internal error
  '/datasets/components':
    post:
      summary: Create multiple component datasets from a single Multi-Assay ancestor
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              properties:
                creation_action:
                  type: string
                  description: the action event that will describe the activity node. Allowed values are "Multi-Assay Split"
                group_uuid:
                  type: string
                  description: the group uuid for the new component datasets
                direct_ancestor_uuids:
                  type: array
                  items:
                    type: string
                  description: a list containing the uuid of the parent multi-assay dataset
                datasets:
                  type: array
                  items:
                    $ref: '#/components/schemas/Dataset'
      responses:
        '200':
          description: The entities were successfully created and are returned.
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/Dataset'
        '400':
          description: Invalid input.
        '404':
          description: Not found. No matching datasets were found, or none were found that the user is authorized to see.
        '401':
          description: The user's token has expired or the user did not supply a valid token.
        '403':
          description: The given dataset is unpublished and the user does not have the authorization to view it.
        '500':
          description: Internal error
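
For reference, a minimal sketch of exercising the new endpoint with the Python requests library. The base URL, token, uuids, and dataset field values are placeholder assumptions, and the X-Hubmap-Application header value is likewise assumed; each component carries dataset_link_abs_dir because the handler below requires it even though the schema above does not list it:

import requests

# All values below are hypothetical placeholders
BASE_URL = "https://entity.api.hubmapconsortium.org"
TOKEN = "<globus-groups-token>"

payload = {
    "creation_action": "Multi-Assay Split",
    "group_uuid": "<group-uuid>",
    # Must be a list containing exactly one ancestor uuid
    "direct_ancestor_uuids": ["<multi-assay-dataset-uuid>"],
    "datasets": [
        {
            # Required by the handler, stripped before schema validation
            "dataset_link_abs_dir": "<abs-dir-for-component-1>",
            "contains_human_genetic_sequences": False,
        },
        {
            "dataset_link_abs_dir": "<abs-dir-for-component-2>",
            "contains_human_genetic_sequences": False,
        },
    ],
}

response = requests.post(
    f"{BASE_URL}/datasets/components",
    headers={
        "Authorization": f"Bearer {TOKEN}",
        "X-Hubmap-Application": "ingest-api",  # assumed app header, validated before creation
    },
    json=payload,
)
response.raise_for_status()
created_datasets = response.json()  # list of the two newly created component datasets

Appending ?return_all_properties=true to the URL makes the handler skip its property filtering, as the code below shows.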
260 changes: 260 additions & 0 deletions src/app.py
@@ -27,6 +27,7 @@
from schema import schema_manager
from schema import schema_errors
from schema import schema_triggers
from schema import schema_validators
from schema import schema_neo4j_queries
from schema.schema_constants import SchemaConstants
from schema.schema_constants import DataVisibilityEnum
@@ -3828,6 +3829,151 @@ def paired_dataset(id):
return jsonify(out_list), 200


"""
Create multiple component datasets from a single Multi-Assay ancestor
Input
-----
json
A json object with the fields:
creation_action
- type: str
    - description: the action event that will describe the activity node. Allowed values are: "Multi-Assay Split"
group_uuid
- type: str
- description: the group uuid for the new component datasets
direct_ancestor_uuids
    - type: list
    - description: a list containing the uuid of the parent multi-assay dataset
datasets
    - type: list
    - description: the datasets to be created. The only difference between these and normal datasets is the additional field "dataset_link_abs_dir" on each dataset
Returns
--------
json array
List of the newly created datasets represented as dictionaries.
"""
@app.route('/datasets/components', methods=['POST'])
def multiple_components():
if READ_ONLY_MODE:
forbidden_error("Access not granted when entity-api in READ-ONLY mode")
# If an invalid token provided, we need to tell the client with a 401 error, rather
# than a 500 error later if the token is not good.
validate_token_if_auth_header_exists(request)
# Get user token from Authorization header
user_token = get_user_token(request)
try:
schema_validators.validate_application_header_before_entity_create("Dataset", request)
except Exception as e:
bad_request_error(str(e))
require_json(request)

    ######## Validate top level properties ########

# Verify that each required field is in the json_data_dict, and that there are no other fields
json_data_dict = request.get_json()
required_fields = ['creation_action', 'group_uuid', 'direct_ancestor_uuids', 'datasets']
for field in required_fields:
if field not in json_data_dict:
raise bad_request_error(f"Missing required field {field}")
for field in json_data_dict:
if field not in required_fields:
raise bad_request_error(f"Request body contained unexpected field {field}")

# validate creation_action
allowable_creation_actions = ['Multi-Assay Split']
if json_data_dict.get('creation_action') not in allowable_creation_actions:
bad_request_error(f"creation_action {json_data_dict.get('creation_action')} not recognized. Allowed values are: {COMMA_SEPARATOR.join(allowable_creation_actions)}")

# While we accept a list of direct_ancestor_uuids, we currently only allow a single direct ancestor so verify that there is only 1
direct_ancestor_uuids = json_data_dict.get('direct_ancestor_uuids')
    if direct_ancestor_uuids is None or not isinstance(direct_ancestor_uuids, list) or len(direct_ancestor_uuids) != 1:
        bad_request_error("Required field 'direct_ancestor_uuids' must be a list containing exactly 1 item: a string representing the uuid of the direct ancestor")

# validate existence of direct ancestors.
for direct_ancestor_uuid in direct_ancestor_uuids:
direct_ancestor_dict = query_target_entity(direct_ancestor_uuid, user_token)
if direct_ancestor_dict.get('entity_type').lower() != "dataset":
bad_request_error(f"Direct ancestor is of type: {direct_ancestor_dict.get('entity_type')}. Must be of type 'dataset'.")

# validate that there are 2 and only 2 datasets in the dataset list
if len(json_data_dict.get('datasets')) != 2:
bad_request_error(f"'datasets' field must contain 2 component datasets.")

# Validate all datasets using existing schema with triggers and validators
for dataset in json_data_dict.get('datasets'):
        # dataset_link_abs_dir is not part of the entity creation, will not be stored in neo4j and does not
        # require validation. Remove it here and add it back after validation. We do the same when creating
        # the entities. This keeps each dataset_link_abs_dir paired with its dataset instead of maintaining
        # separate lists and tracking which value belongs to which dataset.
dataset_link_abs_dir = dataset.pop('dataset_link_abs_dir', None)
if not dataset_link_abs_dir:
bad_request_error(f"Missing required field in datasets: dataset_link_abs_dir")
dataset['group_uuid'] = json_data_dict.get('group_uuid')
dataset['direct_ancestor_uuids'] = direct_ancestor_uuids
try:
schema_manager.validate_json_data_against_schema(dataset, 'Dataset')
except schema_errors.SchemaValidationException as e:
# No need to log validation errors
bad_request_error(str(e))
# Execute property level validators defined in the schema yaml before entity property creation
# Use empty dict {} to indicate there's no existing_data_dict
try:
schema_manager.execute_property_level_validators('before_property_create_validators', "Dataset", request, {}, dataset)
# Currently only ValueError
except ValueError as e:
bad_request_error(e)

# Add back in dataset_link_abs_dir
dataset['dataset_link_abs_dir'] = dataset_link_abs_dir

dataset_list = create_multiple_component_details(request, "Dataset", user_token, json_data_dict.get('datasets'), json_data_dict.get('creation_action'))

    # We wait until after the new datasets are linked to their ancestor before performing the remaining
    # post-creation linkages. This way, in the event of unforeseen errors, we don't have orphaned nodes.
for dataset in dataset_list:
schema_triggers.set_status_history('status', 'Dataset', user_token, dataset, {})

properties_to_skip = [
'direct_ancestors',
'collections',
'upload',
'title',
'previous_revision_uuid',
'next_revision_uuid'
]

if bool(request.args):
# The parsed query string value is a string 'true'
return_all_properties = request.args.get('return_all_properties')

if (return_all_properties is not None) and (return_all_properties.lower() == 'true'):
properties_to_skip = []

normalized_complete_entity_list = []
for dataset in dataset_list:
# Remove dataset_link_abs_dir once more before entity creation
dataset_link_abs_dir = dataset.pop('dataset_link_abs_dir', None)
# Generate the filtered or complete entity dict to send back
complete_dict = schema_manager.get_complete_entity_result(user_token, dataset, properties_to_skip)

# Will also filter the result based on schema
        normalized_complete_dict = schema_manager.normalize_entity_result_for_response(complete_dict)

        # Also index the new entity node in elasticsearch via search-api
        logger.info(f"Re-indexing for creation of {complete_dict['entity_type']}"
                    f" with UUID {complete_dict['uuid']}")
reindex_entity(complete_dict['uuid'], user_token)
# Add back in dataset_link_abs_dir one last time
normalized_complete_dict['dataset_link_abs_dir'] = dataset_link_abs_dir
normalized_complete_entity_list.append(normalized_complete_dict)

    return jsonify(normalized_complete_entity_list), 200
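
The pop-then-restore handling of dataset_link_abs_dir in the handler above is easy to miss, so here is the pattern in isolation (field values hypothetical):

# Sketch of the pop/validate/re-add pattern used for 'dataset_link_abs_dir'.
# The field travels with its dataset dict but is excluded from schema
# validation and is never stored in neo4j.
dataset = {
    "dataset_link_abs_dir": "/hypothetical/abs/dir",  # transient, filesystem-only
    "contains_human_genetic_sequences": False,
}

# Remove before validation so the schema never sees the transient field
dataset_link_abs_dir = dataset.pop("dataset_link_abs_dir", None)
if not dataset_link_abs_dir:
    raise ValueError("Missing required field in datasets: dataset_link_abs_dir")

# ... schema validation and before_create triggers operate on `dataset` here ...

# Restore afterwards so the value stays paired with this dataset
dataset["dataset_link_abs_dir"] = dataset_link_abs_dir

Keeping the value attached to its own dict avoids maintaining parallel lists and tracking which directory belongs to which dataset.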



####################################################################################################
## Internal Functions
####################################################################################################
@@ -4346,6 +4492,120 @@ def create_multiple_samples_details(request, normalized_entity_type, user_token,
return new_ids_dict_list


"""
Create multiple dataset nodes and relationships with the source entity node
Parameters
----------
request : flask.Request object
The incoming request
normalized_entity_type : str
One of the normalized entity types: Dataset, Collection, Sample, Donor
user_token : str
    The user's Globus groups token
json_data_dict_list : list
    List of dataset objects as dictionaries
creation_action : str
The creation action for the new activity node.
Returns
-------
list
A list of all the newly created datasets with generated fields represented as dictionaries
"""
def create_multiple_component_details(request, normalized_entity_type, user_token, json_data_dict_list, creation_action):
# Get user info based on request
user_info_dict = schema_manager.get_user_info(request)
direct_ancestor = json_data_dict_list[0].get('direct_ancestor_uuids')[0]
# Create new ids for the new entity
try:
# we only need the json data from one of the datasets. The info will be the same for both, so we just grab the first in the list
new_ids_dict_list = schema_manager.create_hubmap_ids(normalized_entity_type, json_data_dict_list[0], user_token, user_info_dict, len(json_data_dict_list))
# When group_uuid is provided by user, it can be invalid
except KeyError as e:
# Log the full stack trace, prepend a line with our message
logger.exception(e)
bad_request_error(e)
except requests.exceptions.RequestException as e:
msg = f"Failed to create new HuBMAP ids via the uuid-api service"
logger.exception(msg)

# Due to the use of response.raise_for_status() in schema_manager.create_hubmap_ids()
# we can access the status codes from the exception
status_code = e.response.status_code

        if status_code == 400:
            bad_request_error(e.response.text)
        elif status_code == 404:
            not_found_error(e.response.text)
        else:
            internal_server_error(e.response.text)
datasets_dict_list = []
for i in range(len(json_data_dict_list)):
# Remove dataset_link_abs_dir once more before entity creation
dataset_link_abs_dir = json_data_dict_list[i].pop('dataset_link_abs_dir', None)
# Combine each id dict into each dataset in json_data_dict_list
new_data_dict = {**json_data_dict_list[i], **user_info_dict, **new_ids_dict_list[i]}
try:
# Use {} since no existing dict
generated_before_create_trigger_data_dict = schema_manager.generate_triggered_data('before_create_trigger', normalized_entity_type, user_token, {}, new_data_dict)
# If one of the before_create_trigger methods fails, we can't create the entity
except schema_errors.BeforeCreateTriggerException:
# Log the full stack trace, prepend a line with our message
msg = "Failed to execute one of the 'before_create_trigger' methods, can't create the entity"
logger.exception(msg)
internal_server_error(msg)
except schema_errors.NoDataProviderGroupException:
# Log the full stack trace, prepend a line with our message
if 'group_uuid' in json_data_dict_list[i]:
msg = "Invalid 'group_uuid' value, can't create the entity"
else:
msg = "The user does not have the correct Globus group associated with, can't create the entity"

logger.exception(msg)
bad_request_error(msg)
except schema_errors.UnmatchedDataProviderGroupException:
# Log the full stack trace, prepend a line with our message
msg = "The user does not belong to the given Globus group, can't create the entity"
logger.exception(msg)
forbidden_error(msg)
except schema_errors.MultipleDataProviderGroupException:
# Log the full stack trace, prepend a line with our message
msg = "The user has mutiple Globus groups associated with, please specify one using 'group_uuid'"
logger.exception(msg)
bad_request_error(msg)
except KeyError as e:
# Log the full stack trace, prepend a line with our message
logger.exception(e)
bad_request_error(e)
except Exception as e:
logger.exception(e)
internal_server_error(e)
merged_dict = {**json_data_dict_list[i], **generated_before_create_trigger_data_dict}

        # Filter out the merged_dict by getting rid of the transient properties (not to be stored)
        # and properties with None value. Transient properties are ones where the returned target
        # property key differs from the original key in the trigger method, e.g., Donor.image_files_to_add
filtered_merged_dict = schema_manager.remove_transient_and_none_values(merged_dict, normalized_entity_type)
dataset_dict = {**filtered_merged_dict, **new_ids_dict_list[i]}
dataset_dict['dataset_link_abs_dir'] = dataset_link_abs_dir
datasets_dict_list.append(dataset_dict)

activity_data_dict = schema_manager.generate_activity_data(normalized_entity_type, user_token, user_info_dict)
activity_data_dict['creation_action'] = creation_action
try:
created_datasets = app_neo4j_queries.create_multiple_datasets(neo4j_driver_instance, datasets_dict_list, activity_data_dict, direct_ancestor)
except TransactionError:
msg = "Failed to create multiple samples"
# Log the full stack trace, prepend a line with our message
logger.exception(msg)
# Terminate and let the users know
internal_server_error(msg)


return created_datasets
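
The actual node and relationship creation is delegated to app_neo4j_queries.create_multiple_datasets, which is not shown in this diff. Purely as an illustration of the provenance shape being built -- a single Activity node fed by the one ancestor and feeding both components, created in one transaction -- here is a rough sketch using the neo4j Python driver. The labels, relationship types, and query text are assumptions based on the general HuBMAP provenance model, not the real implementation:

from neo4j import GraphDatabase

# Illustrative only: labels and relationship types are assumed, not verified
SKETCH_QUERY = """
MATCH (ancestor:Dataset {uuid: $ancestor_uuid})
CREATE (a:Activity {uuid: $activity_uuid, creation_action: $creation_action})
CREATE (ancestor)-[:ACTIVITY_INPUT]->(a)
WITH a
UNWIND $datasets AS ds
CREATE (d:Dataset)
SET d = ds
CREATE (a)-[:ACTIVITY_OUTPUT]->(d)
RETURN collect(d) AS created
"""

def sketch_create_components(driver, ancestor_uuid, activity_uuid, creation_action, datasets):
    # A single write transaction, so a mid-flight failure leaves no orphaned
    # nodes, mirroring the TransactionError handling above
    with driver.session() as session:
        return session.execute_write(
            lambda tx: tx.run(
                SKETCH_QUERY,
                ancestor_uuid=ancestor_uuid,
                activity_uuid=activity_uuid,
                creation_action=creation_action,
                datasets=datasets,
            ).single()["created"]
        )

# Usage (hypothetical connection values):
# driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "<password>"))
# created = sketch_create_components(driver, "<ancestor-uuid>", "<activity-uuid>",
#                                    "Multi-Assay Split", [{...}, {...}])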

"""
Execute 'after_create_trigger' methods