diff --git a/entity-api-spec.yaml b/entity-api-spec.yaml
index 3c1909dd..606f9f06 100644
--- a/entity-api-spec.yaml
+++ b/entity-api-spec.yaml
@@ -2485,3 +2485,47 @@ paths:
         description: The given dataset is unpublished and the user does not have the authorization to view it.
       '500':
         description: Internal error
+  '/datasets/components':
+    post:
+      summary: Create multiple component datasets from a single Multi-Assay ancestor
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                creation_action:
+                  type: string
+                  description: The action event that will describe the activity node. The only allowed value is "Multi-Assay Split"
+                group_uuid:
+                  type: string
+                  description: The group uuid for the new component datasets
+                direct_ancestor_uuids:
+                  type: array
+                  items:
+                    type: string
+                  description: The uuids of the direct ancestors. Currently the list must contain exactly one uuid, that of the parent multi-assay dataset
+                datasets:
+                  type: array
+                  items:
+                    $ref: '#/components/schemas/Dataset'
+
+      responses:
+        '200':
+          description: The entities were successfully created and are returned.
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: '#/components/schemas/Dataset'
+
+        '400':
+          description: Invalid input.
+        '401':
+          description: The user's token has expired or the user did not supply a valid token.
+        '403':
+          description: The given dataset is unpublished and the user does not have the authorization to view it.
+        '404':
+          description: Not found. No matching datasets were found, or none were found that the user is authorized to see.
+        '500':
+          description: Internal error
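For reference, a call to the new endpoint might look like the following sketch. The base URL, token, uuids, and dataset fields are illustrative placeholders, not values taken from this change, and the application header name is assumed to be whatever schema_validators.validate_application_header_before_entity_create() expects.

```python
import requests

BASE_URL = "https://entity-api.example.org"           # placeholder deployment URL
HEADERS = {
    "Authorization": "Bearer <globus-groups-token>",  # placeholder token
    "X-Hubmap-Application": "ingest-api",             # assumed application header name
}

payload = {
    "creation_action": "Multi-Assay Split",
    "group_uuid": "<group-uuid>",
    # Must be a list containing exactly one uuid: the parent multi-assay dataset
    "direct_ancestor_uuids": ["<parent-multi-assay-dataset-uuid>"],
    # Exactly two component datasets; each carries the extra dataset_link_abs_dir field
    "datasets": [
        {
            "dataset_link_abs_dir": "/abs/path/to/component-1",
            # ...plus the Dataset schema fields required for creation...
        },
        {
            "dataset_link_abs_dir": "/abs/path/to/component-2",
            # ...plus the Dataset schema fields required for creation...
        },
    ],
}

response = requests.post(f"{BASE_URL}/datasets/components", headers=HEADERS, json=payload)
response.raise_for_status()
new_component_datasets = response.json()  # list of the two newly created datasets
```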
diff --git a/src/app.py b/src/app.py
index 94c614d3..29ad7072 100644
--- a/src/app.py
+++ b/src/app.py
@@ -27,6 +27,7 @@
 from schema import schema_manager
 from schema import schema_errors
 from schema import schema_triggers
+from schema import schema_validators
 from schema import schema_neo4j_queries
 from schema.schema_constants import SchemaConstants
 from schema.schema_constants import DataVisibilityEnum
@@ -3669,6 +3670,151 @@ def paired_dataset(id):
     return jsonify(out_list), 200
 
+"""
+Create multiple component datasets from a single Multi-Assay ancestor
+
+Input
+-----
+json
+    A json object with the fields:
+        creation_action
+        - type: str
+        - description: the action event that will describe the activity node. The only allowed value is "Multi-Assay Split"
+        group_uuid
+        - type: str
+        - description: the group uuid for the new component datasets
+        direct_ancestor_uuids
+        - type: list
+        - description: a list containing a single uuid, that of the parent multi-assay dataset
+        datasets
+        - type: list
+        - description: the datasets to be created. The only difference between these and normal datasets is the extra field "dataset_link_abs_dir"
+
+Returns
+-------
+json array
+    List of the newly created datasets represented as dictionaries.
+"""
+@app.route('/datasets/components', methods=['POST'])
+def multiple_components():
+    if READ_ONLY_MODE:
+        forbidden_error("Access not granted when entity-api in READ-ONLY mode")
+
+    # If an invalid token is provided, tell the client with a 401 error now, rather
+    # than failing with a 500 error later when the token turns out to be bad.
+    validate_token_if_auth_header_exists(request)
+
+    # Get user token from Authorization header
+    user_token = get_user_token(request)
+
+    try:
+        schema_validators.validate_application_header_before_entity_create("Dataset", request)
+    except Exception as e:
+        bad_request_error(str(e))
+
+    require_json(request)
+
+    ######### validate top level properties ########
+
+    # Verify that each required field is in the json_data_dict, and that there are no other fields
+    json_data_dict = request.get_json()
+    required_fields = ['creation_action', 'group_uuid', 'direct_ancestor_uuids', 'datasets']
+    for field in required_fields:
+        if field not in json_data_dict:
+            bad_request_error(f"Missing required field {field}")
+    for field in json_data_dict:
+        if field not in required_fields:
+            bad_request_error(f"Request body contained unexpected field {field}")
+
+    # Validate creation_action
+    allowable_creation_actions = ['Multi-Assay Split']
+    if json_data_dict.get('creation_action') not in allowable_creation_actions:
+        bad_request_error(f"creation_action {json_data_dict.get('creation_action')} not recognized. Allowed values are: {COMMA_SEPARATOR.join(allowable_creation_actions)}")
+
+    # While we accept a list of direct_ancestor_uuids, we currently only allow a single direct ancestor, so verify that there is exactly 1
+    direct_ancestor_uuids = json_data_dict.get('direct_ancestor_uuids')
+    if direct_ancestor_uuids is None or not isinstance(direct_ancestor_uuids, list) or len(direct_ancestor_uuids) != 1:
+        bad_request_error("Required field 'direct_ancestor_uuids' must be a list containing exactly 1 item: a string representing the uuid of the direct ancestor")
+
+    # Validate existence of direct ancestors
+    for direct_ancestor_uuid in direct_ancestor_uuids:
+        direct_ancestor_dict = query_target_entity(direct_ancestor_uuid, user_token)
+        if direct_ancestor_dict.get('entity_type').lower() != "dataset":
+            bad_request_error(f"Direct ancestor is of type: {direct_ancestor_dict.get('entity_type')}. Must be of type 'dataset'.")
+
+    # Validate that there are 2 and only 2 datasets in the dataset list
+    if len(json_data_dict.get('datasets')) != 2:
+        bad_request_error("'datasets' field must contain exactly 2 component datasets.")
+
+    # Validate all datasets using the existing schema with triggers and validators
+    for dataset in json_data_dict.get('datasets'):
+        # dataset_link_abs_dir is not part of the entity creation, will not be stored in neo4j, and does not require
+        # validation. Remove it here and add it back after validation. We do the same when creating the entities.
+        # Doing this makes it easier to keep dataset_link_abs_dir with its associated dataset, instead of maintaining
+        # additional lists and tracking which value is tied to which dataset.
+        dataset_link_abs_dir = dataset.pop('dataset_link_abs_dir', None)
+        if not dataset_link_abs_dir:
+            bad_request_error("Missing required field in datasets: dataset_link_abs_dir")
+        dataset['group_uuid'] = json_data_dict.get('group_uuid')
+        dataset['direct_ancestor_uuids'] = direct_ancestor_uuids
+        try:
+            schema_manager.validate_json_data_against_schema(dataset, 'Dataset')
+        except schema_errors.SchemaValidationException as e:
+            # No need to log validation errors
+            bad_request_error(str(e))
+
+        # Execute property level validators defined in the schema yaml before entity property creation
+        # Use empty dict {} to indicate there's no existing_data_dict
+        try:
+            schema_manager.execute_property_level_validators('before_property_create_validators', "Dataset", request, {}, dataset)
+        # Currently only ValueError
+        except ValueError as e:
+            bad_request_error(e)
+
+        # Add back in dataset_link_abs_dir
+        dataset['dataset_link_abs_dir'] = dataset_link_abs_dir
+
+    dataset_list = create_multiple_component_details(request, "Dataset", user_token, json_data_dict.get('datasets'), json_data_dict.get('creation_action'))
+
+    # We wait until after the new datasets are linked to their ancestor before performing the remaining post-creation
+    # linkages. This way, in the event of unforeseen errors, we don't have orphaned nodes.
+    for dataset in dataset_list:
+        schema_triggers.set_status_history('status', 'Dataset', user_token, dataset, {})
+
+    properties_to_skip = [
+        'direct_ancestors',
+        'collections',
+        'upload',
+        'title',
+        'previous_revision_uuid',
+        'next_revision_uuid'
+    ]
+
+    if bool(request.args):
+        # The parsed query string value is a string 'true'
+        return_all_properties = request.args.get('return_all_properties')
+
+        if (return_all_properties is not None) and (return_all_properties.lower() == 'true'):
+            properties_to_skip = []
+
+    normalized_complete_entity_list = []
+    for dataset in dataset_list:
+        # Pop dataset_link_abs_dir so it is not treated as an entity property while building the response
+        dataset_link_abs_dir = dataset.pop('dataset_link_abs_dir', None)
+        # Generate the filtered or complete entity dict to send back
+        complete_dict = schema_manager.get_complete_entity_result(user_token, dataset, properties_to_skip)
+
+        # Will also filter the result based on schema
+        normalized_complete_dict = schema_manager.normalize_entity_result_for_response(complete_dict)
+
+        # Also index the new entity node in elasticsearch via search-api
+        logger.log(logging.INFO,
+                   f"Re-indexing for creation of {complete_dict['entity_type']}"
+                   f" with UUID {complete_dict['uuid']}")
+        reindex_entity(complete_dict['uuid'], user_token)
+
+        # Add back in dataset_link_abs_dir one last time
+        normalized_complete_dict['dataset_link_abs_dir'] = dataset_link_abs_dir
+        normalized_complete_entity_list.append(normalized_complete_dict)
+
+    return jsonify(normalized_complete_entity_list)
+
+
 ####################################################################################################
 ## Internal Functions
 ####################################################################################################
@@ -4149,6 +4295,120 @@ def create_multiple_samples_details(request, normalized_entity_type, user_token,
     return new_ids_dict_list
 
+"""
+Create multiple dataset nodes and relationships with the source entity node
+
+Parameters
+----------
+request : flask.Request object
+    The incoming request
+normalized_entity_type : str
+    One of the normalized entity types; for this endpoint always Dataset
+user_token: str
+    The user's globus groups token
+json_data_dict_list: list
+    List of dataset objects as dictionaries
+creation_action : str
+    The creation action for the new activity node.
+
+Returns
+-------
+list
+    A list of all the newly created datasets with generated fields represented as dictionaries
+"""
+def create_multiple_component_details(request, normalized_entity_type, user_token, json_data_dict_list, creation_action):
+    # Get user info based on request
+    user_info_dict = schema_manager.get_user_info(request)
+    direct_ancestor = json_data_dict_list[0].get('direct_ancestor_uuids')[0]
+    # Create new ids for the new entities
+    try:
+        # We only need the json data from one of the datasets. The info will be the same for both, so we just grab the first in the list
+        new_ids_dict_list = schema_manager.create_hubmap_ids(normalized_entity_type, json_data_dict_list[0], user_token, user_info_dict, len(json_data_dict_list))
+    # When group_uuid is provided by user, it can be invalid
+    except KeyError as e:
+        # Log the full stack trace, prepend a line with our message
+        logger.exception(e)
+        bad_request_error(e)
+    except requests.exceptions.RequestException as e:
+        msg = "Failed to create new HuBMAP ids via the uuid-api service"
+        logger.exception(msg)
+
+        # Due to the use of response.raise_for_status() in schema_manager.create_hubmap_ids()
+        # we can access the status codes from the exception
+        status_code = e.response.status_code
+
+        if status_code == 400:
+            bad_request_error(e.response.text)
+        elif status_code == 404:
+            not_found_error(e.response.text)
+        else:
+            internal_server_error(e.response.text)
+
+    datasets_dict_list = []
+    for i in range(len(json_data_dict_list)):
+        # Remove dataset_link_abs_dir before entity creation; it is added back to the result afterwards
+        dataset_link_abs_dir = json_data_dict_list[i].pop('dataset_link_abs_dir', None)
+        # Combine each id dict into each dataset in json_data_dict_list
+        new_data_dict = {**json_data_dict_list[i], **user_info_dict, **new_ids_dict_list[i]}
+        try:
+            # Use {} since no existing dict
+            generated_before_create_trigger_data_dict = schema_manager.generate_triggered_data('before_create_trigger', normalized_entity_type, user_token, {}, new_data_dict)
+        # If one of the before_create_trigger methods fails, we can't create the entity
+        except schema_errors.BeforeCreateTriggerException:
+            # Log the full stack trace, prepend a line with our message
+            msg = "Failed to execute one of the 'before_create_trigger' methods, can't create the entity"
+            logger.exception(msg)
+            internal_server_error(msg)
+        except schema_errors.NoDataProviderGroupException:
+            # Log the full stack trace, prepend a line with our message
+            if 'group_uuid' in json_data_dict_list[i]:
+                msg = "Invalid 'group_uuid' value, can't create the entity"
+            else:
+                msg = "The user does not have the correct Globus group associated with their account, can't create the entity"
+
+            logger.exception(msg)
+            bad_request_error(msg)
+        except schema_errors.UnmatchedDataProviderGroupException:
+            # Log the full stack trace, prepend a line with our message
+            msg = "The user does not belong to the given Globus group, can't create the entity"
+            logger.exception(msg)
+            forbidden_error(msg)
+        except schema_errors.MultipleDataProviderGroupException:
+            # Log the full stack trace, prepend a line with our message
+            msg = "The user has multiple Globus groups associated with their account, please specify one using 'group_uuid'"
+            logger.exception(msg)
+            bad_request_error(msg)
+        except KeyError as e:
+            # Log the full stack trace, prepend a line with our message
+            logger.exception(e)
+            bad_request_error(e)
+        except Exception as e:
+            logger.exception(e)
+            internal_server_error(e)
+
+        merged_dict = {**json_data_dict_list[i], **generated_before_create_trigger_data_dict}
+
+        # Filter out the merged_dict by getting rid of the transient properties (not to be stored)
+        # and properties with None value
+        # 'Transient' means the returned target property key is different from the original key
+        # in the trigger method, e.g., Donor.image_files_to_add
+        filtered_merged_dict = schema_manager.remove_transient_and_none_values(merged_dict, normalized_entity_type)
+        dataset_dict = {**filtered_merged_dict, **new_ids_dict_list[i]}
+        dataset_dict['dataset_link_abs_dir'] = dataset_link_abs_dir
+        datasets_dict_list.append(dataset_dict)
+
+    activity_data_dict = schema_manager.generate_activity_data(normalized_entity_type, user_token, user_info_dict)
+    activity_data_dict['creation_action'] = creation_action
+    try:
+        created_datasets = app_neo4j_queries.create_multiple_datasets(neo4j_driver_instance, datasets_dict_list, activity_data_dict, direct_ancestor)
+    except TransactionError:
+        msg = "Failed to create multiple datasets"
+        # Log the full stack trace, prepend a line with our message
+        logger.exception(msg)
+        # Terminate and let the users know
+        internal_server_error(msg)
+
+    return created_datasets
+
 """
 Execute 'after_create_trigger' methods
diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py
index 98fd674c..129f528c 100644
--- a/src/app_neo4j_queries.py
+++ b/src/app_neo4j_queries.py
@@ -192,6 +192,73 @@ def create_multiple_samples(neo4j_driver, samples_dict_list, activity_data_dict,
         raise TransactionError(msg)
 
+"""
+Create multiple dataset nodes in neo4j
+
+Parameters
+----------
+neo4j_driver : neo4j.Driver object
+    The neo4j database connection pool
+datasets_dict_list : list
+    A list of dicts containing the generated data of each dataset to be created
+activity_data_dict : dict
+    The dict containing generated activity data
+direct_ancestor_uuid : str
+    The uuid of the direct ancestor to be linked to
+
+Returns
+-------
+list
+    A list of dicts of the newly created dataset nodes, each including its dataset_link_abs_dir
+"""
+def create_multiple_datasets(neo4j_driver, datasets_dict_list, activity_data_dict, direct_ancestor_uuid):
+    try:
+        with neo4j_driver.session() as session:
+            entity_dict = {}
+
+            tx = session.begin_transaction()
+
+            activity_uuid = activity_data_dict['uuid']
+
+            # Step 1: create the Activity node
+            schema_neo4j_queries.create_activity_tx(tx, activity_data_dict)
+
+            # Step 2: create relationship from the source (direct ancestor) entity node to this Activity node
+            schema_neo4j_queries.create_relationship_tx(tx, direct_ancestor_uuid, activity_uuid, 'ACTIVITY_INPUT', '->')
+
+            # Step 3: create each new dataset node and link it to the Activity node at the same time
+            output_dicts_list = []
+            for dataset_dict in datasets_dict_list:
+                # Remove dataset_link_abs_dir before node creation; it is returned to the caller but never stored in neo4j
+                dataset_link_abs_dir = dataset_dict.pop('dataset_link_abs_dir', None)
+                node_properties_map = schema_neo4j_queries.build_properties_map(dataset_dict)
+
+                query = (f"MATCH (a:Activity) "
+                         f"WHERE a.uuid = '{activity_uuid}' "
+                         # Always define the Entity label in addition to the target `entity_type` label
+                         f"CREATE (e:Entity:Dataset {node_properties_map}) "
+                         f"CREATE (a)-[:ACTIVITY_OUTPUT]->(e) "
+                         f"RETURN e AS {record_field_name}")
+
+                logger.info("======create_multiple_datasets() individual query======")
+                logger.info(query)
+
+                result = tx.run(query)
+                record = result.single()
+                entity_node = record[record_field_name]
+                entity_dict = schema_neo4j_queries.node_to_dict(entity_node)
+                entity_dict['dataset_link_abs_dir'] = dataset_link_abs_dir
+                output_dicts_list.append(entity_dict)
+
+            # Then commit the transaction
+            tx.commit()
+
+            return output_dicts_list
+    except TransactionError as te:
+        msg = f"TransactionError from calling create_multiple_datasets(): {te.value}"
+        # Log the full stack trace, prepend a line with our message
+        logger.exception(msg)
+
+        if not tx.closed():
+            logger.info("Failed to commit create_multiple_datasets() transaction, rollback")
+
+            tx.rollback()
+
+        raise TransactionError(msg)
+
 """
 Get all revisions for a given dataset uuid and sort them in descending order based on their creation time
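An aside on the CREATE statement in create_multiple_datasets() above: the node properties are interpolated into the Cypher text via build_properties_map(). Purely as an illustration (not part of this change, and the helper name below is hypothetical), the same statement could be issued with bound parameters, assuming the dataset dict holds only primitive or array property values:

```python
def create_dataset_node_tx(tx, activity_uuid, dataset_dict, record_field_name="result"):
    """Sketch only: create one Dataset node linked to an existing Activity node,
    passing the node properties as bound parameters instead of interpolating them."""
    query = (f"MATCH (a:Activity {{uuid: $activity_uuid}}) "
             f"CREATE (e:Entity:Dataset) "
             f"SET e = $props "
             f"CREATE (a)-[:ACTIVITY_OUTPUT]->(e) "
             f"RETURN e AS {record_field_name}")
    record = tx.run(query, activity_uuid=activity_uuid, props=dataset_dict).single()
    return dict(record[record_field_name])
```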
@@ -512,8 +579,9 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
     if published_only:
         published_only_query_string = f" AND toUpper(ds.status) = 'PUBLISHED'"
         published_only_revisions_string = f" WHERE toUpper(rev.status) = 'PUBLISHED'"
-    query = (f"MATCH (ds:Dataset)<-[:ACTIVITY_OUTPUT]-(a)<-[:ACTIVITY_INPUT]-(firstSample:Sample)<-[*]-(donor:Donor)"
-             f"WHERE not (ds)-[:REVISION_OF]->(:Dataset)"
+    query = (f"MATCH (ds:Dataset)<-[:ACTIVITY_OUTPUT]-(a:Activity)<-[*]-(firstSample:Sample)<-[*]-(donor:Donor)"
+             f" WHERE not (ds)-[:REVISION_OF]->(:Dataset)"
+             f" AND NOT toLower(a.creation_action) ENDS WITH 'process'"
              f"{group_uuid_query_string}"
              f"{dataset_status_query_string}"
              f"{published_only_query_string}"
@@ -531,7 +599,8 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
              f" {organ_query_string} (donor)-[:ACTIVITY_INPUT]->(oa)-[:ACTIVITY_OUTPUT]->(organ:Sample {{sample_category:'organ'}})-[*]->(ds)"
              f" {organ_where_clause}"
              f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, COLLECT(DISTINCT organ) AS ORGAN "
-             f" OPTIONAL MATCH (ds)-[:ACTIVITY_INPUT]->(a3)-[:ACTIVITY_OUTPUT]->(processed_dataset:Dataset)"
+             f" OPTIONAL MATCH (ds)-[*]->(a3)-[:ACTIVITY_OUTPUT]->(processed_dataset:Dataset)"
+             f" WHERE toLower(a3.creation_action) ENDS WITH 'process'"
              f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
              f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
              f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
@@ -617,7 +686,7 @@
 """
 def get_individual_prov_info(neo4j_driver, dataset_uuid):
     query = (f"MATCH (ds:Dataset {{uuid: '{dataset_uuid}'}})<-[*]-(firstSample:Sample)<-[*]-(donor:Donor)"
-             f" WHERE (:Dataset)<-[]-()<-[]-(firstSample)"
+             f" WHERE (:Dataset)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(firstSample)"
              f" WITH ds, COLLECT(distinct donor) AS DONOR, COLLECT(distinct firstSample) AS FIRSTSAMPLE"
              f" OPTIONAL MATCH (ds)<-[*]-(metaSample:Sample)"
              f" WHERE NOT metaSample.metadata IS NULL AND NOT TRIM(metaSample.metadata) = ''"
@@ -628,7 +697,8 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
              # specimen_type -> sample_category 12/15/2022
              f" OPTIONAL match (donor)-[:ACTIVITY_INPUT]->(oa)-[:ACTIVITY_OUTPUT]->(organ:Sample {{sample_category:'organ'}})-[*]->(ds)"
              f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, COLLECT(distinct organ) AS ORGAN "
-             f" OPTIONAL MATCH (ds)-[:ACTIVITY_INPUT]->(a3)-[:ACTIVITY_OUTPUT]->(processed_dataset:Dataset)"
+             f" OPTIONAL MATCH (ds)-[*]->(a3)-[:ACTIVITY_OUTPUT]->(processed_dataset:Dataset)"
+             f" WHERE toLower(a3.creation_action) ENDS WITH 'process'"
              f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
              f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
              f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
@@ -800,7 +870,7 @@
         # specimen_type -> sample_category 12/15/2022
         f" OPTIONAL MATCH (s)<-[*]-(organ:Sample{{sample_category: 'organ'}})"
         f" WITH s, organ, d"
-        f" MATCH (s)<-[]-()<-[]-(da)"
+        f" MATCH (s)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(da)"
         f" RETURN s.uuid, s.lab_tissue_sample_id, s.group_name, s.created_by_user_email, s.metadata, s.rui_location,"
         f" d.uuid, d.metadata, organ.uuid, organ.sample_category, organ.metadata, da.uuid, da.entity_type, "
         f"s.sample_category, organ.organ, s.organ, s.hubmap_id, s.submission_id, organ.hubmap_id, organ.submission_id, "
@@ -909,7 +979,7 @@ def get_paired_dataset(neo4j_driver, uuid, data_type, search_depth):
         number_of_jumps = f"*..{search_depth}"
     data_type = f"['{data_type}']"
     query = (
-        f'MATCH (ds:Dataset)<-[*]-(s:Sample) WHERE ds.uuid = "{uuid}" AND (:Dataset)<-[]-()<-[]-(s)'
+        f'MATCH (ds:Dataset)<-[*]-(s:Sample) WHERE ds.uuid = "{uuid}" AND (:Dataset)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(s)'
        f'MATCH (ods)<-[{number_of_jumps}]-(s) WHERE ods.data_types = "{data_type}"'
        f'return ods.uuid as uuid, ods.status as status'
    )
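For context on the query changes above: the split creates the pattern (parent multi-assay Dataset)-[:ACTIVITY_INPUT]->(Activity {creation_action: 'Multi-Assay Split'})-[:ACTIVITY_OUTPUT]->(component Dataset), which is presumably why the provenance queries now walk variable-length paths and filter on creation_action. A small illustrative lookup of the component datasets of a parent (the function name and uuid are placeholders, not part of this change):

```python
# Illustrative only: list the component datasets split from a multi-assay parent,
# following the ACTIVITY_INPUT / ACTIVITY_OUTPUT pattern built by create_multiple_datasets().
COMPONENTS_QUERY = (
    "MATCH (parent:Dataset {uuid: $parent_uuid})"
    "-[:ACTIVITY_INPUT]->(a:Activity {creation_action: 'Multi-Assay Split'})"
    "-[:ACTIVITY_OUTPUT]->(component:Dataset) "
    "RETURN component.uuid AS uuid, component.hubmap_id AS hubmap_id"
)

def get_component_datasets(neo4j_driver, parent_uuid):
    with neo4j_driver.session() as session:
        result = session.run(COMPONENTS_QUERY, parent_uuid=parent_uuid)
        return [dict(record) for record in result]
```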