From 8851db0b84f3f6c9d9cfd94e9c22f9b6cbd0773c Mon Sep 17 00:00:00 2001 From: Karl Burke Date: Mon, 4 Dec 2023 10:38:57 -0500 Subject: [PATCH 1/2] Initial commit of Dataset dataset_type support for create, read, update and indexing, prior to YAML-to-UBKG transition. Leaving in TODO markup for ongoing work on replacing Dataset dataset_types. --- src/app.py | 55 +++++++++++--- src/app_neo4j_queries.py | 5 +- src/schema/provenance_schema.yaml | 8 ++ src/schema/schema_errors.py | 3 + src/schema/schema_manager.py | 120 ++++++++++++++++++++++++++++-- src/schema/schema_validators.py | 33 +++++++- 6 files changed, 203 insertions(+), 21 deletions(-) diff --git a/src/app.py b/src/app.py index 4cbc05e8..7ed9f226 100644 --- a/src/app.py +++ b/src/app.py @@ -60,6 +60,7 @@ # Remove trailing slash / from URL base to avoid "//" caused by config with trailing slash app.config['UUID_API_URL'] = app.config['UUID_API_URL'].strip('/') app.config['INGEST_API_URL'] = app.config['INGEST_API_URL'].strip('/') +app.config['ONTOLOGY_API_URL'] = app.config['ONTOLOGY_API_URL'].strip('/') app.config['SEARCH_API_URL_LIST'] = [url.strip('/') for url in app.config['SEARCH_API_URL_LIST']] # This mode when set True disables the PUT and POST calls, used on STAGE to make entity-api READ-ONLY @@ -192,12 +193,20 @@ def http_internal_server_error(e): ## Schema initialization #################################################################################################### + try: + try: + _schema_yaml_file = app.config['SCHEMA_YAML_FILE'] + except KeyError as ke: + logger.error("Expected configuration failed to load %s from app_config=%s.", ke, app.config) + raise Exception("Expected configuration failed to load. See the logs.") + # The schema_manager is a singleton module # Pass in auth_helper_instance, neo4j_driver instance, and memcached_client_instance schema_manager.initialize(app.config['SCHEMA_YAML_FILE'], app.config['UUID_API_URL'], app.config['INGEST_API_URL'], + app.config['ONTOLOGY_API_URL'], auth_helper_instance, neo4j_driver_instance, memcached_client_instance, @@ -263,7 +272,6 @@ def http_internal_server_error(e): DATASET_STATUS_PUBLISHED = SchemaConstants.DATASET_STATUS_PUBLISHED COMMA_SEPARATOR = ',' - #################################################################################################### ## API Endpoints #################################################################################################### @@ -933,6 +941,8 @@ def create_entity(entity_type): # Currently only ValueError except ValueError as e: bad_request_error(e) + except schema_errors.UnimplementedValidatorException as uve: + internal_server_error(uve) # Additional validation for Sample entities if normalized_entity_type == 'Sample': @@ -2671,7 +2681,8 @@ def get_prov_info(): HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified' HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email' HEADER_DATASET_LAB_ID = 'lab_id_or_name' - HEADER_DATASET_DATA_TYPES = 'dataset_data_types' + HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required + HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type' HEADER_DATASET_PORTAL_URL = 'dataset_portal_url' HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id' HEADER_FIRST_SAMPLE_SUBMISSION_ID = 'first_sample_submission_id' @@ -2698,11 +2709,12 @@ def get_prov_info(): HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url' HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids' + # TODO-Eliminate 
HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required. headers = [ HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME, HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL, HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID, - HEADER_DATASET_DATA_TYPES, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, + HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE, HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID, HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID, @@ -2809,6 +2821,8 @@ def get_prov_info(): # Data type codes are replaced with data type descriptions assay_description_list = [] + # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. for item in dataset['data_types']: try: assay_description_list.append(assay_types_dict[item]['description']) @@ -2829,6 +2843,8 @@ def get_prov_info(): # If return_format was not equal to json, json arrays must be converted into comma separated lists for the tsv if return_json is False: internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types']) + # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('', 'dataset').replace('', dataset['uuid']) @@ -3063,7 +3079,8 @@ def get_prov_info_for_dataset(id): HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified' HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email' HEADER_DATASET_LAB_ID = 'lab_id_or_name' - HEADER_DATASET_DATA_TYPES = 'dataset_data_types' + HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required + HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type' HEADER_DATASET_PORTAL_URL = 'dataset_portal_url' HEADER_DATASET_SAMPLES = 'dataset_samples' HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id' @@ -3090,11 +3107,12 @@ def get_prov_info_for_dataset(id): HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status' HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url' + # TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required. 
headers = [ HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME, HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL, HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID, - HEADER_DATASET_DATA_TYPES, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, + HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE, HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID, HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID, @@ -3147,6 +3165,8 @@ def get_prov_info_for_dataset(id): # Data type codes are replaced with data type descriptions assay_description_list = [] + # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. for item in dataset['data_types']: try: assay_description_list.append(assay_types_dict[item]['description']) @@ -3165,6 +3185,11 @@ def get_prov_info_for_dataset(id): internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types'] if return_json is False: internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types']) + # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. + + internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_type'] + internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('', 'dataset').replace( '', dataset['uuid']) if dataset['first_sample'] is not None: @@ -3335,7 +3360,7 @@ def get_prov_info_for_dataset(id): ------- json a json array. Each item in the array corresponds to a dataset. Each dataset has the values: dataset_group_name, - organ_type, dataset_data_types, and dataset_status, each of which is a string. + organ_type, dataset_data_types, and dataset_status, each of which is a string. # TODO-integrate dataset_dataset_type to documentation. """ @app.route('/datasets/sankey_data', methods=['GET']) @@ -3343,7 +3368,8 @@ def sankey_data(): # String constants HEADER_DATASET_GROUP_NAME = 'dataset_group_name' HEADER_ORGAN_TYPE = 'organ_type' - HEADER_DATASET_DATA_TYPES = 'dataset_data_types' + HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required + HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type' HEADER_DATASET_STATUS = 'dataset_status' with open('sankey_mapping.json') as f: @@ -3389,6 +3415,9 @@ def sankey_data(): internal_dict = collections.OrderedDict() internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME] internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[dataset[HEADER_ORGAN_TYPE]]['description'].lower() + + # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. 
# Data type codes are replaced with data type descriptions assay_description = "" try: @@ -3412,6 +3441,8 @@ def sankey_data(): internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]] if internal_dict[HEADER_DATASET_DATA_TYPES] in mapping_dict.keys(): internal_dict[HEADER_DATASET_DATA_TYPES] = mapping_dict[internal_dict[HEADER_DATASET_DATA_TYPES]] + # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. # Each dataset's dictionary is added to the list to be returned dataset_sankey_list.append(internal_dict) @@ -3594,16 +3625,17 @@ def get_sample_prov_info(): json an array of each unpublished dataset. fields: ("data_types", "donor_hubmap_id", "donor_submission_id", "hubmap_id", "organ", "organization", - "provider_experiment_id", "uuid") + "provider_experiment_id", "uuid") # TODO-integrate dataset_dataset_type to documentation. tsv a text/tab-seperated-value document including each unpublished dataset. fields: ("data_types", "donor_hubmap_id", "donor_submission_id", "hubmap_id", "organ", "organization", - "provider_experiment_id", "uuid") + "provider_experiment_id", "uuid") # TODO-integrate dataset_dataset_type to documentation. """ @app.route('/datasets/unpublished', methods=['GET']) def unpublished(): # String constraints - HEADER_DATA_TYPES = "data_types" + HEADER_DATA_TYPES = "data_types" # TODO-eliminate when HEADER_DATASET_TYPE is required + HEADER_DATASET_TYPE = 'dataset_type' HEADER_ORGANIZATION = "organization" HEADER_UUID = "uuid" HEADER_HUBMAP_ID = "hubmap_id" @@ -3612,8 +3644,9 @@ def unpublished(): HEADER_SUBMISSION_ID = "donor_submission_id" HEADER_PROVIDER_EXPERIMENT_ID = "provider_experiment_id" + # TODO-Eliminate HEADER_DATA_TYPES once HEADER_DATASET_TYPE is required. 
headers = [ - HEADER_DATA_TYPES, HEADER_ORGANIZATION, HEADER_UUID, HEADER_HUBMAP_ID, HEADER_ORGAN, HEADER_DONOR_HUBMAP_ID, + HEADER_DATA_TYPES, HEADER_DATASET_TYPE, HEADER_ORGANIZATION, HEADER_UUID, HEADER_HUBMAP_ID, HEADER_ORGAN, HEADER_DONOR_HUBMAP_ID, HEADER_SUBMISSION_ID, HEADER_PROVIDER_EXPERIMENT_ID ] diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py index 586edf45..87183315 100644 --- a/src/app_neo4j_queries.py +++ b/src/app_neo4j_queries.py @@ -736,7 +736,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only): f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET" f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name," f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, " - f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") + f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") # TODO replace ds.data_types with ds.dataset_type when required logger.info("======get_prov_info() query======") logger.info(query) @@ -834,7 +834,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid): f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET" f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name," f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, " - f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET") + f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, ds.dataset_type") logger.info("======get_prov_info() query======") logger.info(query) @@ -891,6 +891,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid): node_dict = schema_neo4j_queries.node_to_dict(entry) content_sixteen.append(node_dict) record_dict['processed_dataset'] = content_sixteen + record_dict['dataset_type'] = record_contents[17] if record_contents[17] is not None else '' return record_dict diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml index 05174972..0413cf76 100644 --- a/src/schema/provenance_schema.yaml +++ b/src/schema/provenance_schema.yaml @@ -332,6 +332,14 @@ ENTITIES: type: list required_on_create: true # Only required for create via POST, not update via PUT description: "The data or assay types contained in this dataset as a json array of strings. Each is an assay code from [assay types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/assay_types.yaml)." + dataset_type: + before_property_create_validators: + - validate_recognized_dataset_type + before_property_update_validators: + - validate_recognized_dataset_type + type: string + required_on_create: false # Once replaces data_types, will be required for create via POST, not update via PUT + description: "The assay types of this Dataset. Valid values are from UBKG are queried by schema_manager.get_valueset_dataset_type() using the Ontology API." 
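+      # Illustrative values: "RNASeq", or "CODEX [cytokit, image_pyramid]"; only the portion
+      # before an optional bracketed suffix is checked against the UBKG valueset by
+      # validate_recognized_dataset_type.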
collections: type: list transient: true diff --git a/src/schema/schema_errors.py b/src/schema/schema_errors.py index 47690b52..90a900a0 100644 --- a/src/schema/schema_errors.py +++ b/src/schema/schema_errors.py @@ -1,4 +1,7 @@ +class UnimplementedValidatorException(Exception): + pass + class SchemaValidationException(Exception): pass diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 0cafdecf..2c6dde8e 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -31,6 +31,7 @@ _schema = None _uuid_api_url = None _ingest_api_url = None +_ontology_api_url = None _auth_helper = None _neo4j_driver = None _memcached_client = None @@ -65,6 +66,7 @@ def initialize(valid_yaml_file, uuid_api_url, ingest_api_url, + ontology_api_url, auth_helper_instance, neo4j_driver_instance, memcached_client_instance, @@ -73,14 +75,33 @@ def initialize(valid_yaml_file, global _schema global _uuid_api_url global _ingest_api_url + global _ontology_api_url global _auth_helper global _neo4j_driver global _memcached_client global _memcached_prefix _schema = load_provenance_schema(valid_yaml_file) - _uuid_api_url = uuid_api_url - _ingest_api_url = ingest_api_url + if uuid_api_url is not None: + _uuid_api_url = uuid_api_url + else: + msg = f"Unable to initialize schema manager with uuid_api_url={uuid_api_url}." + logger.critical(msg=msg) + raise Exception(msg) + + if ingest_api_url is not None: + _ingest_api_url = ingest_api_url + else: + msg = f"Unable to initialize schema manager with ingest_api_url={ingest_api_url}." + logger.critical(msg=msg) + raise Exception(msg) + + if ontology_api_url is not None: + _ontology_api_url = ontology_api_url + else: + msg = f"Unable to initialize schema manager with ontology_api_url={ontology_api_url}." + logger.critical(msg=msg) + raise Exception(msg) # Get the helper instances _auth_helper = auth_helper_instance @@ -838,8 +859,8 @@ def execute_entity_level_validator(validator_type, normalized_entity_type, reque raise schema_errors.MissingApplicationHeaderException(e) except schema_errors.InvalidApplicationHeaderException as e: raise schema_errors.InvalidApplicationHeaderException(e) - except Exception: - msg = f"Failed to call the {validator_type} method: {validator_method_name} defiend for entity {normalized_entity_type}" + except Exception as e: + msg = f"Failed to call the {validator_type} method: {validator_method_name} defined for entity {normalized_entity_type}" # Log the full stack trace, prepend a line with our message logger.exception(msg) @@ -892,11 +913,16 @@ def execute_property_level_validators(validator_type, normalized_entity_type, re raise schema_errors.InvalidApplicationHeaderException(e) except ValueError as ve: raise ValueError(ve) - except Exception as e: + except schema_errors.UnimplementedValidatorException as uve: msg = f"Failed to call the {validator_type} method: {validator_method_name} defined for entity {normalized_entity_type} on property {key}" # Log the full stack trace, prepend a line with our message + logger.exception(f"{msg}. {str(uve)}") + raise uve + except Exception as e: + msg = f"Unexpected exception @TODO-KBKBKB calling {validator_type} method: {validator_method_name} defined for entity {normalized_entity_type} on property {key}" + # Log the full stack trace, prepend a line with our message logger.exception(f"{msg}. 
{str(e)}") - + raise e """ Get a list of entity types that can be used as derivation source in the schmea yaml @@ -1234,6 +1260,86 @@ def get_hubmap_ids(id): raise requests.exceptions.RequestException(response.text) +""" +Helper function to use the Ontology API to retrieve a valueset from UBKG containing +allowed values for soft assays, which can be set on the beginning of (part before +square brackets containing anything) the Dataset dataset_type field. + +Examples of valid dataset_type values are "RNASeq" and "CODEX [cytokit, image_pyramid]" + +Parameters +---------- +N/A: This help encapsulates hard-coded strings for soft assay values from the HUBMAP + source vocabulary of UBKG. + +Returns +------- +List of String values for each element in the UBKG valueset for valid dataset_type soft assay entries. +['Histology','Molecular Cartography',...] +""" +def get_dataset_type_valueset_list(): + # Use the Ontology API to get JSON for allowed terms. + ubkg_valueset = get_valueset(parent_vocabulary_sab='HUBMAP' + ,parent_vocabulary_valueset_code='C003041' + ,value_preferred_vocabulary_sab='HUBMAP') + # Extract the term elements from the JSON into a list to be returned. + return [v['term'] for v in ubkg_valueset] + +""" +Use the Ontology API valueset endpoint to retrieve the UBKG valueset for a particular +"parent" vocabulary & term. The preferred vocabulary which each "child" element of the valueset +comes from is also specified. + +Parameters +---------- +parent_vocabulary_sab: The source vocabulary (SAB) recognized by UBKG to which parent_vocabulary_valueset_code belongs. + +parent_vocabulary_valueset_code: A code from parent_vocabulary_sab which is the parent of all elements of the valueset. + +value_preferred_vocabulary_sab: The source vocabulary (SAB) preferred for each term in the dataset. It is common, but +not required, that parent_vocabulary_sab and value_preferred_vocabulary_sab are the same i.e. specify a parent code +from the HUBMAP vocabulary and return terms from the HUBMAP vocabulary. +@TODO-KBKBKB determine if it is advisable to check the "sab" element of each term dictionary the Ontology API returns or if UBKG assures coverage such that we would never get a "sab" element which did not match value_preferred_vocabulary_sab. + +Returns +------- +JSON response from the Ontology API, which is a list of dictionaries, each containing "code", "sab", and "term" elements. +[ + {"code": "C003047", "sab": "HUBMAP", "term": "Histology"}, + {"code": "C003051", "sab": "HUBMAP", "term": "Molecular Cartography"}, + ... 
+] +""" +def get_valueset(parent_vocabulary_sab, parent_vocabulary_valueset_code, value_preferred_vocabulary_sab): + global _ontology_api_url + + target_url = f"{_ontology_api_url}/valueset" \ + f"?parent_sab={parent_vocabulary_sab}" \ + f"&parent_code={parent_vocabulary_valueset_code}" \ + f"&child_sabs={value_preferred_vocabulary_sab}" + + # Use Memcached to improve performance + response = make_request_get(target_url, internal_token_used = True) + + # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes + response.raise_for_status() + + if response.status_code == 200: + return response.json() + else: + msg = f"Unable to make a request to query the UBKG via ontology-api: {target_url}" + # Log the full stack trace, prepend a line with our message + logger.exception(msg) + + logger.debug("======get_valueset() status code from ontology-api======") + logger.debug(response.status_code) + + logger.debug("======get_valueset() response text from ontology-api======") + logger.debug(response.text) + + # Also bubble up the error message from uuid-api + raise requests.exceptions.RequestException(response.text) + """ Create a set of new ids for the new entity to be created @@ -1794,4 +1900,4 @@ def _create_request_headers(user_token): auth_header_name: auth_scheme + ' ' + user_token } - return headers_dict \ No newline at end of file + return headers_dict diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py index 8c1269a5..336ec8f1 100644 --- a/src/schema/schema_validators.py +++ b/src/schema/schema_validators.py @@ -1,3 +1,5 @@ +import re + import yaml import logging import requests @@ -41,6 +43,36 @@ def validate_application_header_before_entity_create(normalized_entity_type, req ## Property Level Validators #################################################################################################### + +""" +@TODO-KBKBKB redo doc... +Validate the specified value for a Dataset's dataset_type is in the valueset UBKG recognizes. + +Parameters +---------- +property_key : str + The target property key +normalized_type : str + Submission +request: Flask request object + The instance of Flask request passed in from application request +existing_data_dict : dict + A dictionary that contains all existing entity properties +new_data_dict : dict + The json data in request body, already after the regular validations +""" +def validate_recognized_dataset_type(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict): + # If the proposed Dataset dataset_type ends with something in square brackets, anything inside + # those square brackets are acceptable at the end of the string. Simply validate the start. + proposed_dataset_type_prefix = re.sub(pattern='[ ]*\[.*]$', repl='', string=new_data_dict['dataset_type']) + target_list = schema_manager.get_dataset_type_valueset_list() + + if proposed_dataset_type_prefix not in target_list: + raise ValueError(f"Proposed Dataset dataset_type '{proposed_dataset_type_prefix}'" + f" is not recognized in the existing ontology." 
+ f" Valid values are: {str(target_list)}.") + + """ Validate the target list has no duplicated items @@ -605,4 +637,3 @@ def _get_tissue_types(): # Also bubble up the error message raise requests.exceptions.RequestException(response.text) - From 741b91d9996976e9bb513028bb6df067dc0cd213 Mon Sep 17 00:00:00 2001 From: Karl Burke Date: Tue, 5 Dec 2023 15:45:11 -0500 Subject: [PATCH 2/2] Revise regular expression and re.sub() command so exactly one space is accepted between a UBKG-recognized soft assay type and square brackets containing values not validated. --- src/schema/schema_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py index a72b74ed..8eb4cb95 100644 --- a/src/schema/schema_validators.py +++ b/src/schema/schema_validators.py @@ -64,7 +64,7 @@ def validate_application_header_before_entity_create(normalized_entity_type, req def validate_recognized_dataset_type(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict): # If the proposed Dataset dataset_type ends with something in square brackets, anything inside # those square brackets are acceptable at the end of the string. Simply validate the start. - proposed_dataset_type_prefix = re.sub(pattern='[ ]*\[.*]$', repl='', string=new_data_dict['dataset_type']) + proposed_dataset_type_prefix = re.sub(pattern='(\S)\s\[.*\]$', repl=r'\1', string=new_data_dict['dataset_type']) target_list = schema_manager.get_dataset_type_valueset_list() if proposed_dataset_type_prefix not in target_list: