From 95b243e3da97d3c9138507ca598e64acb056cf28 Mon Sep 17 00:00:00 2001
From: DerekFurstPitt <drf57@pitt.edu>
Date: Fri, 12 Jan 2024 15:04:23 -0500
Subject: [PATCH 1/2] Modified prov-info and <id>/prov-info to use dataset_type
 instead of data_types. samples/prov-info didn't appear to include any
 references to data_types. Also applied the same change that was made in the
 data-sankey branch: validate_organ_code now accepts an optional argument
 containing the organ dictionary, so that the ontology-api doesn't need to be
 called thousands of times.

---
 src/app.py               | 69 +++++++---------------------------------
 src/app_neo4j_queries.py | 15 +++------
 2 files changed, 15 insertions(+), 69 deletions(-)
diff --git a/src/app.py b/src/app.py
index 8c5b3090..3ab79a66 100644
--- a/src/app.py
+++ b/src/app.py
@@ -2674,7 +2674,6 @@ def get_prov_info():
     HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
     HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
     HEADER_DATASET_LAB_ID = 'lab_id_or_name'
-    HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
     HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
     HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
     HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id'
@@ -2702,12 +2701,11 @@ def get_prov_info():
     HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'
     HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids'
 
-    # TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
     headers = [
         HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
         HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
         HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
-        HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
+        HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
         HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
         HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
         HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -2721,7 +2719,7 @@ def get_prov_info():
     # Token is not required, but if an invalid token is provided,
     # we need to tell the client with a 401 error
     validate_token_if_auth_header_exists(request)
-
+    organ_types_dict = schema_manager.get_organ_types()
     if user_in_hubmap_read_group(request):
         published_only = False
 
@@ -2797,29 +2795,7 @@ def get_prov_info():
         internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0)))
         internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email']
         internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id']
-
-        # Data type codes are replaced with data type descriptions
-        assay_description_list = []
-        # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
-        # and once dataset['dataset_type'] is required and dataset['data_types'] removed.
-        for item in dataset['data_types']:
-            try:
-                assay_description_list.append(assay_types_dict[item]['description'])
-            except KeyError:
-                logger.exception(f"Data type {item} not found in resulting assay types via ontology-api")
-
-                # Just use the data type value
-                assay_description_list.append(item)
-
-        dataset['data_types'] = assay_description_list
-        internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']
-
-        # If return_format was not equal to json, json arrays must be converted into comma separated lists for the tsv
-        if return_json is False:
-            internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
-        # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
-        # and once dataset['dataset_type'] is required and dataset['data_types'] removed.
-
+        internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type']
         internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace('<identifier>', dataset['uuid'])
 
         # first_sample properties are retrieved from its own dictionary
@@ -2860,7 +2836,7 @@ def get_prov_info():
                 distinct_organ_uuid_list.append(item['uuid'])
 
                 organ_code = item['organ'].upper()
-                validate_organ_code(organ_code)
+                validate_organ_code(organ_code, organ_types_dict)
 
                 distinct_organ_type_list.append(organ_types_dict[organ_code].lower())
             internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list
@@ -3016,7 +2992,7 @@ def get_prov_info_for_dataset(id):
     # Token is not required, but if an invalid token provided,
     # we need to tell the client with a 401 error
     validate_token_if_auth_header_exists(request)
-
+    organ_types_dict = schema_manager.get_organ_types()
     # Use the internal token to query the target entity
     # since public entities don't require user token
     token = get_internal_token()
@@ -3055,7 +3031,6 @@ def get_prov_info_for_dataset(id):
     HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
     HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
     HEADER_DATASET_LAB_ID = 'lab_id_or_name'
-    HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
     HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
     HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
     HEADER_DATASET_SAMPLES = 'dataset_samples'
@@ -3083,12 +3058,11 @@ def get_prov_info_for_dataset(id):
     HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status'
     HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'
 
-    # TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
     headers = [
         HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
         HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
         HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
-        HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
+        HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
         HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
         HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
         HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -3124,28 +3098,7 @@ def get_prov_info_for_dataset(id):
     internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0)))
     internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email']
     internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id']
-
-    # Data type codes are replaced with data type descriptions
-    assay_description_list = []
-    # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
-    # and once dataset['dataset_type'] is required and dataset['data_types'] removed.
-    for item in dataset['data_types']:
-        try:
-            assay_description_list.append(assay_types_dict[item]['description'])
-        except KeyError:
-            logger.exception(f"Data type {item} not found in resulting assay types via ontology-api")
-
-            # Just use the data type value
-            assay_description_list.append(item)
-
-    dataset['data_types'] = assay_description_list
-    internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']
-    if return_json is False:
-        internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
-    # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
-    # and once dataset['dataset_type'] is required and dataset['data_types'] removed.
-
-    internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_type']
+    internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type']
 
     internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace(
         '<identifier>', dataset['uuid'])
@@ -3185,7 +3138,7 @@ def get_prov_info_for_dataset(id):
             distinct_organ_uuid_list.append(item['uuid'])
 
             organ_code = item['organ'].upper()
-            validate_organ_code(organ_code)
+            validate_organ_code(organ_code, organ_types_dict )
 
             distinct_organ_type_list.append(organ_types_dict[organ_code].lower())
         internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list
@@ -4807,13 +4760,13 @@ def access_level_prefix_dir(dir_name):
 ----------
 organ_code : str
 """
-def validate_organ_code(organ_code):
+def validate_organ_code(organ_code, organ_types_dict=None):
+    if organ_types_dict is None:
+        organ_types_dict = schema_manager.get_organ_types()
     if not organ_code.isalpha() or not len(organ_code) == 2:
         internal_server_error(f"Invalid organ code {organ_code}. Must be 2-letter alphabetic code")
 
     try:
-        organ_types_dict = schema_manager.get_organ_types()
-
         if organ_code.upper() not in organ_types_dict:
             not_found_error(f"Unable to find organ code {organ_code} via the ontology-api")
     except requests.exceptions.RequestException:
diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py
index 87183315..bb203133 100644
--- a/src/app_neo4j_queries.py
+++ b/src/app_neo4j_queries.py
@@ -736,7 +736,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
              f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
              f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
              f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
-             f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") # TODO replace ds.data_types with ds.dataset_type when required
+             f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET, REVISIONS")
 
     logger.info("======get_prov_info() query======")
     logger.info(query)
@@ -782,10 +782,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
             record_dict['last_modified_timestamp'] = record_contents[11]
             record_dict['last_modified_user_email'] = record_contents[12]
             record_dict['lab_dataset_id'] = record_contents[13]
-            data_types = record_contents[14]
-            data_types = data_types.replace("'", '"')
-            data_types = json.loads(data_types)
-            record_dict['data_types'] = data_types
+            record_dict['dataset_dataset_type'] = record_contents[14]
             content_fifteen = []
             for entry in record_contents[15]:
                 node_dict = schema_neo4j_queries.node_to_dict(entry)
@@ -834,7 +831,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
              f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
              f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
              f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
-             f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, ds.dataset_type")
+             f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET")
     logger.info("======get_prov_info() query======")
     logger.info(query)
 
@@ -877,10 +874,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
             record_dict['last_modified_timestamp'] = record_contents[11]
             record_dict['last_modified_user_email'] = record_contents[12]
             record_dict['lab_dataset_id'] = record_contents[13]
-            data_types = record_contents[14]
-            data_types = data_types.replace("'", '"')
-            data_types = json.loads(data_types)
-            record_dict['data_types'] = data_types
+            record_dict['dataset_dataset_type'] = record_contents[14]
             content_fifteen = []
             for entry in record_contents[15]:
                 node_dict = schema_neo4j_queries.node_to_dict(entry)
@@ -891,7 +885,6 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
                 node_dict = schema_neo4j_queries.node_to_dict(entry)
                 content_sixteen.append(node_dict)
             record_dict['processed_dataset'] = content_sixteen
-            record_dict['dataset_type'] = record_contents[17] if record_contents[17] is not None else ''
     return record_dict
 
 

From 22a8337e07339da282138d6e9fe34bdb600c4c2b Mon Sep 17 00:00:00 2001
From: DerekFurstPitt <drf57@pitt.edu>
Date: Tue, 16 Jan 2024 23:18:29 -0500
Subject: [PATCH 2/2] Removed duplicate organ_types_dict assignments

---
 src/app.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/app.py b/src/app.py
index 3ab79a66..8d00c48f 100644
--- a/src/app.py
+++ b/src/app.py
@@ -2725,7 +2725,6 @@ def get_prov_info():
 
     # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
     # because that would require using a urllib request for each dataset
-    organ_types_dict = schema_manager.get_organ_types()
 
     # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids
     # having to access the resource for every dataset.
@@ -3074,7 +3073,6 @@ def get_prov_info_for_dataset(id):
 
     # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
     # because that would require using a urllib request for each dataset
-    organ_types_dict = schema_manager.get_organ_types()
 
     # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids
     # having to access the resource for every dataset.