From 8851db0b84f3f6c9d9cfd94e9c22f9b6cbd0773c Mon Sep 17 00:00:00 2001 From: Karl Burke Date: Mon, 4 Dec 2023 10:38:57 -0500 Subject: [PATCH 1/2] Initial commit of Dataset dataset_type support for create, read, update and indexing, prior to YAML-to-UBKG transition. Leaving in TODO markup for ongoing work on replacing Dataset dataset_types. --- src/app.py | 55 +++++++++++--- src/app_neo4j_queries.py | 5 +- src/schema/provenance_schema.yaml | 8 ++ src/schema/schema_errors.py | 3 + src/schema/schema_manager.py | 120 ++++++++++++++++++++++++++++-- src/schema/schema_validators.py | 33 +++++++- 6 files changed, 203 insertions(+), 21 deletions(-) diff --git a/src/app.py b/src/app.py index 4cbc05e8..7ed9f226 100644 --- a/src/app.py +++ b/src/app.py @@ -60,6 +60,7 @@ # Remove trailing slash / from URL base to avoid "//" caused by config with trailing slash app.config['UUID_API_URL'] = app.config['UUID_API_URL'].strip('/') app.config['INGEST_API_URL'] = app.config['INGEST_API_URL'].strip('/') +app.config['ONTOLOGY_API_URL'] = app.config['ONTOLOGY_API_URL'].strip('/') app.config['SEARCH_API_URL_LIST'] = [url.strip('/') for url in app.config['SEARCH_API_URL_LIST']] # This mode when set True disables the PUT and POST calls, used on STAGE to make entity-api READ-ONLY @@ -192,12 +193,20 @@ def http_internal_server_error(e): ## Schema initialization #################################################################################################### + try: + try: + _schema_yaml_file = app.config['SCHEMA_YAML_FILE'] + except KeyError as ke: + logger.error("Expected configuration failed to load %s from app_config=%s.", ke, app.config) + raise Exception("Expected configuration failed to load. See the logs.") + # The schema_manager is a singleton module # Pass in auth_helper_instance, neo4j_driver instance, and memcached_client_instance schema_manager.initialize(app.config['SCHEMA_YAML_FILE'], app.config['UUID_API_URL'], app.config['INGEST_API_URL'], + app.config['ONTOLOGY_API_URL'], auth_helper_instance, neo4j_driver_instance, memcached_client_instance, @@ -263,7 +272,6 @@ def http_internal_server_error(e): DATASET_STATUS_PUBLISHED = SchemaConstants.DATASET_STATUS_PUBLISHED COMMA_SEPARATOR = ',' - #################################################################################################### ## API Endpoints #################################################################################################### @@ -933,6 +941,8 @@ def create_entity(entity_type): # Currently only ValueError except ValueError as e: bad_request_error(e) + except schema_errors.UnimplementedValidatorException as uve: + internal_server_error(uve) # Additional validation for Sample entities if normalized_entity_type == 'Sample': @@ -2671,7 +2681,8 @@ def get_prov_info(): HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified' HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email' HEADER_DATASET_LAB_ID = 'lab_id_or_name' - HEADER_DATASET_DATA_TYPES = 'dataset_data_types' + HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required + HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type' HEADER_DATASET_PORTAL_URL = 'dataset_portal_url' HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id' HEADER_FIRST_SAMPLE_SUBMISSION_ID = 'first_sample_submission_id' @@ -2698,11 +2709,12 @@ def get_prov_info(): HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url' HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids' + # TODO-Eliminate 
HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required. headers = [ HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME, HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL, HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID, - HEADER_DATASET_DATA_TYPES, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, + HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE, HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID, HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID, @@ -2809,6 +2821,8 @@ def get_prov_info(): # Data type codes are replaced with data type descriptions assay_description_list = [] + # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. for item in dataset['data_types']: try: assay_description_list.append(assay_types_dict[item]['description']) @@ -2829,6 +2843,8 @@ def get_prov_info(): # If return_format was not equal to json, json arrays must be converted into comma separated lists for the tsv if return_json is False: internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types']) + # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('', 'dataset').replace('', dataset['uuid']) @@ -3063,7 +3079,8 @@ def get_prov_info_for_dataset(id): HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified' HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email' HEADER_DATASET_LAB_ID = 'lab_id_or_name' - HEADER_DATASET_DATA_TYPES = 'dataset_data_types' + HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required + HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type' HEADER_DATASET_PORTAL_URL = 'dataset_portal_url' HEADER_DATASET_SAMPLES = 'dataset_samples' HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id' @@ -3090,11 +3107,12 @@ def get_prov_info_for_dataset(id): HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status' HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url' + # TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required. 
headers = [ HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME, HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL, HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID, - HEADER_DATASET_DATA_TYPES, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, + HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE, HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID, HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID, @@ -3147,6 +3165,8 @@ def get_prov_info_for_dataset(id): # Data type codes are replaced with data type descriptions assay_description_list = [] + # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. for item in dataset['data_types']: try: assay_description_list.append(assay_types_dict[item]['description']) @@ -3165,6 +3185,11 @@ def get_prov_info_for_dataset(id): internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types'] if return_json is False: internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types']) + # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. + + internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_type'] + internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('', 'dataset').replace( '', dataset['uuid']) if dataset['first_sample'] is not None: @@ -3335,7 +3360,7 @@ def get_prov_info_for_dataset(id): ------- json a json array. Each item in the array corresponds to a dataset. Each dataset has the values: dataset_group_name, - organ_type, dataset_data_types, and dataset_status, each of which is a string. + organ_type, dataset_data_types, and dataset_status, each of which is a string. # TODO-integrate dataset_dataset_type to documentation. """ @app.route('/datasets/sankey_data', methods=['GET']) @@ -3343,7 +3368,8 @@ def sankey_data(): # String constants HEADER_DATASET_GROUP_NAME = 'dataset_group_name' HEADER_ORGAN_TYPE = 'organ_type' - HEADER_DATASET_DATA_TYPES = 'dataset_data_types' + HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required + HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type' HEADER_DATASET_STATUS = 'dataset_status' with open('sankey_mapping.json') as f: @@ -3389,6 +3415,9 @@ def sankey_data(): internal_dict = collections.OrderedDict() internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME] internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[dataset[HEADER_ORGAN_TYPE]]['description'].lower() + + # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. 
# Data type codes are replaced with data type descriptions assay_description = "" try: @@ -3412,6 +3441,8 @@ def sankey_data(): internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]] if internal_dict[HEADER_DATASET_DATA_TYPES] in mapping_dict.keys(): internal_dict[HEADER_DATASET_DATA_TYPES] = mapping_dict[internal_dict[HEADER_DATASET_DATA_TYPES]] + # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, + # and once dataset['dataset_type'] is required and dataset['data_types'] removed. # Each dataset's dictionary is added to the list to be returned dataset_sankey_list.append(internal_dict) @@ -3594,16 +3625,17 @@ def get_sample_prov_info(): json an array of each unpublished dataset. fields: ("data_types", "donor_hubmap_id", "donor_submission_id", "hubmap_id", "organ", "organization", - "provider_experiment_id", "uuid") + "provider_experiment_id", "uuid") # TODO-integrate dataset_dataset_type to documentation. tsv a text/tab-seperated-value document including each unpublished dataset. fields: ("data_types", "donor_hubmap_id", "donor_submission_id", "hubmap_id", "organ", "organization", - "provider_experiment_id", "uuid") + "provider_experiment_id", "uuid") # TODO-integrate dataset_dataset_type to documentation. """ @app.route('/datasets/unpublished', methods=['GET']) def unpublished(): # String constraints - HEADER_DATA_TYPES = "data_types" + HEADER_DATA_TYPES = "data_types" # TODO-eliminate when HEADER_DATASET_TYPE is required + HEADER_DATASET_TYPE = 'dataset_type' HEADER_ORGANIZATION = "organization" HEADER_UUID = "uuid" HEADER_HUBMAP_ID = "hubmap_id" @@ -3612,8 +3644,9 @@ def unpublished(): HEADER_SUBMISSION_ID = "donor_submission_id" HEADER_PROVIDER_EXPERIMENT_ID = "provider_experiment_id" + # TODO-Eliminate HEADER_DATA_TYPES once HEADER_DATASET_TYPE is required. 
headers = [ - HEADER_DATA_TYPES, HEADER_ORGANIZATION, HEADER_UUID, HEADER_HUBMAP_ID, HEADER_ORGAN, HEADER_DONOR_HUBMAP_ID, + HEADER_DATA_TYPES, HEADER_DATASET_TYPE, HEADER_ORGANIZATION, HEADER_UUID, HEADER_HUBMAP_ID, HEADER_ORGAN, HEADER_DONOR_HUBMAP_ID, HEADER_SUBMISSION_ID, HEADER_PROVIDER_EXPERIMENT_ID ] diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py index 586edf45..87183315 100644 --- a/src/app_neo4j_queries.py +++ b/src/app_neo4j_queries.py @@ -736,7 +736,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only): f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET" f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name," f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, " - f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") + f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") # TODO replace ds.data_types with ds.dataset_type when required logger.info("======get_prov_info() query======") logger.info(query) @@ -834,7 +834,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid): f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET" f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name," f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, " - f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET") + f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, ds.dataset_type") logger.info("======get_prov_info() query======") logger.info(query) @@ -891,6 +891,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid): node_dict = schema_neo4j_queries.node_to_dict(entry) content_sixteen.append(node_dict) record_dict['processed_dataset'] = content_sixteen + record_dict['dataset_type'] = record_contents[17] if record_contents[17] is not None else '' return record_dict diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml index 05174972..0413cf76 100644 --- a/src/schema/provenance_schema.yaml +++ b/src/schema/provenance_schema.yaml @@ -332,6 +332,14 @@ ENTITIES: type: list required_on_create: true # Only required for create via POST, not update via PUT description: "The data or assay types contained in this dataset as a json array of strings. Each is an assay code from [assay types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/assay_types.yaml)." + dataset_type: + before_property_create_validators: + - validate_recognized_dataset_type + before_property_update_validators: + - validate_recognized_dataset_type + type: string + required_on_create: false # Once replaces data_types, will be required for create via POST, not update via PUT + description: "The assay types of this Dataset. Valid values are from UBKG are queried by schema_manager.get_valueset_dataset_type() using the Ontology API." 
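+      # Illustrative values: "RNASeq", or "CODEX [cytokit, image_pyramid]"; only the portion
+      # before an optional bracketed suffix is checked against the UBKG valueset by
+      # validate_recognized_dataset_type.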
collections: type: list transient: true diff --git a/src/schema/schema_errors.py b/src/schema/schema_errors.py index 47690b52..90a900a0 100644 --- a/src/schema/schema_errors.py +++ b/src/schema/schema_errors.py @@ -1,4 +1,7 @@ +class UnimplementedValidatorException(Exception): + pass + class SchemaValidationException(Exception): pass diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 0cafdecf..2c6dde8e 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -31,6 +31,7 @@ _schema = None _uuid_api_url = None _ingest_api_url = None +_ontology_api_url = None _auth_helper = None _neo4j_driver = None _memcached_client = None @@ -65,6 +66,7 @@ def initialize(valid_yaml_file, uuid_api_url, ingest_api_url, + ontology_api_url, auth_helper_instance, neo4j_driver_instance, memcached_client_instance, @@ -73,14 +75,33 @@ def initialize(valid_yaml_file, global _schema global _uuid_api_url global _ingest_api_url + global _ontology_api_url global _auth_helper global _neo4j_driver global _memcached_client global _memcached_prefix _schema = load_provenance_schema(valid_yaml_file) - _uuid_api_url = uuid_api_url - _ingest_api_url = ingest_api_url + if uuid_api_url is not None: + _uuid_api_url = uuid_api_url + else: + msg = f"Unable to initialize schema manager with uuid_api_url={uuid_api_url}." + logger.critical(msg=msg) + raise Exception(msg) + + if ingest_api_url is not None: + _ingest_api_url = ingest_api_url + else: + msg = f"Unable to initialize schema manager with ingest_api_url={ingest_api_url}." + logger.critical(msg=msg) + raise Exception(msg) + + if ontology_api_url is not None: + _ontology_api_url = ontology_api_url + else: + msg = f"Unable to initialize schema manager with ontology_api_url={ontology_api_url}." + logger.critical(msg=msg) + raise Exception(msg) # Get the helper instances _auth_helper = auth_helper_instance @@ -838,8 +859,8 @@ def execute_entity_level_validator(validator_type, normalized_entity_type, reque raise schema_errors.MissingApplicationHeaderException(e) except schema_errors.InvalidApplicationHeaderException as e: raise schema_errors.InvalidApplicationHeaderException(e) - except Exception: - msg = f"Failed to call the {validator_type} method: {validator_method_name} defiend for entity {normalized_entity_type}" + except Exception as e: + msg = f"Failed to call the {validator_type} method: {validator_method_name} defined for entity {normalized_entity_type}" # Log the full stack trace, prepend a line with our message logger.exception(msg) @@ -892,11 +913,16 @@ def execute_property_level_validators(validator_type, normalized_entity_type, re raise schema_errors.InvalidApplicationHeaderException(e) except ValueError as ve: raise ValueError(ve) - except Exception as e: + except schema_errors.UnimplementedValidatorException as uve: msg = f"Failed to call the {validator_type} method: {validator_method_name} defined for entity {normalized_entity_type} on property {key}" # Log the full stack trace, prepend a line with our message + logger.exception(f"{msg}. {str(uve)}") + raise uve + except Exception as e: + msg = f"Unexpected exception @TODO-KBKBKB calling {validator_type} method: {validator_method_name} defined for entity {normalized_entity_type} on property {key}" + # Log the full stack trace, prepend a line with our message logger.exception(f"{msg}. 
{str(e)}") - + raise e """ Get a list of entity types that can be used as derivation source in the schmea yaml @@ -1234,6 +1260,86 @@ def get_hubmap_ids(id): raise requests.exceptions.RequestException(response.text) +""" +Helper function to use the Ontology API to retrieve a valueset from UBKG containing +allowed values for soft assays, which can be set on the beginning of (part before +square brackets containing anything) the Dataset dataset_type field. + +Examples of valid dataset_type values are "RNASeq" and "CODEX [cytokit, image_pyramid]" + +Parameters +---------- +N/A: This help encapsulates hard-coded strings for soft assay values from the HUBMAP + source vocabulary of UBKG. + +Returns +------- +List of String values for each element in the UBKG valueset for valid dataset_type soft assay entries. +['Histology','Molecular Cartography',...] +""" +def get_dataset_type_valueset_list(): + # Use the Ontology API to get JSON for allowed terms. + ubkg_valueset = get_valueset(parent_vocabulary_sab='HUBMAP' + ,parent_vocabulary_valueset_code='C003041' + ,value_preferred_vocabulary_sab='HUBMAP') + # Extract the term elements from the JSON into a list to be returned. + return [v['term'] for v in ubkg_valueset] + +""" +Use the Ontology API valueset endpoint to retrieve the UBKG valueset for a particular +"parent" vocabulary & term. The preferred vocabulary which each "child" element of the valueset +comes from is also specified. + +Parameters +---------- +parent_vocabulary_sab: The source vocabulary (SAB) recognized by UBKG to which parent_vocabulary_valueset_code belongs. + +parent_vocabulary_valueset_code: A code from parent_vocabulary_sab which is the parent of all elements of the valueset. + +value_preferred_vocabulary_sab: The source vocabulary (SAB) preferred for each term in the dataset. It is common, but +not required, that parent_vocabulary_sab and value_preferred_vocabulary_sab are the same i.e. specify a parent code +from the HUBMAP vocabulary and return terms from the HUBMAP vocabulary. +@TODO-KBKBKB determine if it is advisable to check the "sab" element of each term dictionary the Ontology API returns or if UBKG assures coverage such that we would never get a "sab" element which did not match value_preferred_vocabulary_sab. + +Returns +------- +JSON response from the Ontology API, which is a list of dictionaries, each containing "code", "sab", and "term" elements. +[ + {"code": "C003047", "sab": "HUBMAP", "term": "Histology"}, + {"code": "C003051", "sab": "HUBMAP", "term": "Molecular Cartography"}, + ... 
+] +""" +def get_valueset(parent_vocabulary_sab, parent_vocabulary_valueset_code, value_preferred_vocabulary_sab): + global _ontology_api_url + + target_url = f"{_ontology_api_url}/valueset" \ + f"?parent_sab={parent_vocabulary_sab}" \ + f"&parent_code={parent_vocabulary_valueset_code}" \ + f"&child_sabs={value_preferred_vocabulary_sab}" + + # Use Memcached to improve performance + response = make_request_get(target_url, internal_token_used = True) + + # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes + response.raise_for_status() + + if response.status_code == 200: + return response.json() + else: + msg = f"Unable to make a request to query the UBKG via ontology-api: {target_url}" + # Log the full stack trace, prepend a line with our message + logger.exception(msg) + + logger.debug("======get_valueset() status code from ontology-api======") + logger.debug(response.status_code) + + logger.debug("======get_valueset() response text from ontology-api======") + logger.debug(response.text) + + # Also bubble up the error message from uuid-api + raise requests.exceptions.RequestException(response.text) + """ Create a set of new ids for the new entity to be created @@ -1794,4 +1900,4 @@ def _create_request_headers(user_token): auth_header_name: auth_scheme + ' ' + user_token } - return headers_dict \ No newline at end of file + return headers_dict diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py index 8c1269a5..336ec8f1 100644 --- a/src/schema/schema_validators.py +++ b/src/schema/schema_validators.py @@ -1,3 +1,5 @@ +import re + import yaml import logging import requests @@ -41,6 +43,36 @@ def validate_application_header_before_entity_create(normalized_entity_type, req ## Property Level Validators #################################################################################################### + +""" +@TODO-KBKBKB redo doc... +Validate the specified value for a Dataset's dataset_type is in the valueset UBKG recognizes. + +Parameters +---------- +property_key : str + The target property key +normalized_type : str + Submission +request: Flask request object + The instance of Flask request passed in from application request +existing_data_dict : dict + A dictionary that contains all existing entity properties +new_data_dict : dict + The json data in request body, already after the regular validations +""" +def validate_recognized_dataset_type(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict): + # If the proposed Dataset dataset_type ends with something in square brackets, anything inside + # those square brackets are acceptable at the end of the string. Simply validate the start. + proposed_dataset_type_prefix = re.sub(pattern='[ ]*\[.*]$', repl='', string=new_data_dict['dataset_type']) + target_list = schema_manager.get_dataset_type_valueset_list() + + if proposed_dataset_type_prefix not in target_list: + raise ValueError(f"Proposed Dataset dataset_type '{proposed_dataset_type_prefix}'" + f" is not recognized in the existing ontology." 
+ f" Valid values are: {str(target_list)}.") + + """ Validate the target list has no duplicated items @@ -605,4 +637,3 @@ def _get_tissue_types(): # Also bubble up the error message raise requests.exceptions.RequestException(response.text) - From 741b91d9996976e9bb513028bb6df067dc0cd213 Mon Sep 17 00:00:00 2001 From: Karl Burke Date: Tue, 5 Dec 2023 15:45:11 -0500 Subject: [PATCH 2/2] Revise regular expression and re.sub() command so exactly one space is accepted between a UBKG-recognized soft assay type and square brackets containing values not validated. --- src/schema/schema_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py index a72b74ed..8eb4cb95 100644 --- a/src/schema/schema_validators.py +++ b/src/schema/schema_validators.py @@ -64,7 +64,7 @@ def validate_application_header_before_entity_create(normalized_entity_type, req def validate_recognized_dataset_type(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict): # If the proposed Dataset dataset_type ends with something in square brackets, anything inside # those square brackets are acceptable at the end of the string. Simply validate the start. - proposed_dataset_type_prefix = re.sub(pattern='[ ]*\[.*]$', repl='', string=new_data_dict['dataset_type']) + proposed_dataset_type_prefix = re.sub(pattern='(\S)\s\[.*\]$', repl=r'\1', string=new_data_dict['dataset_type']) target_list = schema_manager.get_dataset_type_valueset_list() if proposed_dataset_type_prefix not in target_list: