Derek furst/fix put trigger #607

Merged · 4 commits · Jan 31, 2024
100 changes: 17 additions & 83 deletions src/app.py
@@ -2674,7 +2674,6 @@ def get_prov_info():
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id'
@@ -2702,12 +2701,11 @@ def get_prov_info():
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'
HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -2721,13 +2719,12 @@ def get_prov_info():
# Token is not required, but if an invalid token is provided,
# we need to tell the client with a 401 error
validate_token_if_auth_header_exists(request)

organ_types_dict = schema_manager.get_organ_types()
if user_in_hubmap_read_group(request):
published_only = False

# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
# because that would require using a urllib request for each dataset
organ_types_dict = schema_manager.get_organ_types()

# As above, we parse the assay type yaml here rather than calling the special method for it because this avoids
# having to access the resource for every dataset.
@@ -2797,29 +2794,7 @@ def get_prov_info():
internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0)))
internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email']
internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id']

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
except KeyError:
logger.exception(f"Data type {item} not found in resulting assay types via ontology-api")

# Just use the data type value
assay_description_list.append(item)

dataset['data_types'] = assay_description_list
internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']

# If return_format was not equal to json, json arrays must be converted into comma separated lists for the tsv
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type']
internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace('<identifier>', dataset['uuid'])

# first_sample properties are retrieved from its own dictionary
@@ -2860,7 +2835,7 @@ def get_prov_info():
distinct_organ_uuid_list.append(item['uuid'])

organ_code = item['organ'].upper()
validate_organ_code(organ_code)
validate_organ_code(organ_code, organ_types_dict)

distinct_organ_type_list.append(organ_types_dict[organ_code].lower())
internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list
@@ -3016,7 +2991,7 @@ def get_prov_info_for_dataset(id):
# Token is not required, but if an invalid token is provided,
# we need to tell the client with a 401 error
validate_token_if_auth_header_exists(request)

organ_types_dict = schema_manager.get_organ_types()
# Use the internal token to query the target entity
# since public entities don't require user token
token = get_internal_token()
@@ -3055,7 +3030,6 @@ def get_prov_info_for_dataset(id):
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_DATASET_SAMPLES = 'dataset_samples'
@@ -3083,12 +3057,11 @@ def get_prov_info_for_dataset(id):
HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status'
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -3100,7 +3073,6 @@ def get_prov_info_for_dataset(id):

# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
# because that would require using a urllib request for each dataset
organ_types_dict = schema_manager.get_organ_types()

# As above, we parse the assay type yaml here rather than calling the special method for it because this avoids
# having to access the resource for every dataset.
@@ -3124,28 +3096,7 @@ def get_prov_info_for_dataset(id):
internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0)))
internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email']
internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id']

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
except KeyError:
logger.exception(f"Data type {item} not found in resulting assay types via ontology-api")

# Just use the data type value
assay_description_list.append(item)

dataset['data_types'] = assay_description_list
internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_type']
internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type']

internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace(
'<identifier>', dataset['uuid'])
@@ -3185,7 +3136,7 @@ def get_prov_info_for_dataset(id):
distinct_organ_uuid_list.append(item['uuid'])

organ_code = item['organ'].upper()
validate_organ_code(organ_code)
validate_organ_code(organ_code, organ_types_dict)

distinct_organ_type_list.append(organ_types_dict[organ_code].lower())
internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list
@@ -3326,12 +3277,9 @@ def sankey_data():
# String constants
HEADER_DATASET_GROUP_NAME = 'dataset_group_name'
HEADER_ORGAN_TYPE = 'organ_type'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_STATUS = 'dataset_status'

with open('sankey_mapping.json') as f:
mapping_dict = json.load(f)
# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
# because that would require using a urllib request for each dataset
organ_types_dict = schema_manager.get_organ_types()
@@ -3359,41 +3307,27 @@ def sankey_data():
internal_dict = collections.OrderedDict()
internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME]

# TODO BEGIN evaluate elimination of this block once dataset['dataset_type'] is required and dataset['data_types'] removed.
organ_code = dataset[HEADER_ORGAN_TYPE].upper()
validate_organ_code(organ_code)
validate_organ_code(organ_code, organ_types_dict)

internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[organ_code].lower()
# Data type codes are replaced with data type descriptions
assay_description = ""
try:
assay_description = assay_types_dict[dataset[HEADER_DATASET_DATA_TYPES]]['description']
except KeyError:
logger.exception(f"Data type {dataset[HEADER_DATASET_DATA_TYPES]} not found in resulting assay types via ontology-api")

# Just use the data type value
assay_description = dataset[HEADER_DATASET_DATA_TYPES]

internal_dict[HEADER_DATASET_DATA_TYPES] = assay_description
internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset[HEADER_DATASET_DATASET_TYPE]

# Replace applicable Group Name and Data type with the value needed for the sankey via the mapping_dict
internal_dict[HEADER_DATASET_STATUS] = dataset['dataset_status']
if internal_dict[HEADER_DATASET_GROUP_NAME] in mapping_dict.keys():
internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]
if internal_dict[HEADER_DATASET_DATA_TYPES] in mapping_dict.keys():
internal_dict[HEADER_DATASET_DATA_TYPES] = mapping_dict[internal_dict[HEADER_DATASET_DATA_TYPES]]
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
# if internal_dict[HEADER_DATASET_GROUP_NAME] in mapping_dict.keys():
# internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]

# Each dataset's dictionary is added to the list to be returned
dataset_sankey_list.append(internal_dict)

if MEMCACHED_MODE:
# Cache the result
memcached_client_instance.set(cache_key, dataset_sankey_list, expire = SchemaConstants.MEMCACHED_TTL)
else:
logger.info(f'Using the cached sankey data at time {datetime.now()}')

return jsonify(dataset_sankey_list)


@@ -4807,13 +4741,13 @@ def access_level_prefix_dir(dir_name):
----------
organ_code : str
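organ_types_dict : dict, optional
    Preloaded organ types mapping; when omitted, the function falls back to
    schema_manager.get_organ_types()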
"""
def validate_organ_code(organ_code):
def validate_organ_code(organ_code, organ_types_dict=None):
if organ_types_dict is None:
organ_types_dict = schema_manager.get_organ_types()
if not organ_code.isalpha() or not len(organ_code) == 2:
internal_server_error(f"Invalid organ code {organ_code}. Must be 2-letter alphabetic code")

try:
organ_types_dict = schema_manager.get_organ_types()

if organ_code.upper() not in organ_types_dict:
not_found_error(f"Unable to find organ code {organ_code} via the ontology-api")
except requests.exceptions.RequestException:
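For reference, the pattern this refactor introduces: validate_organ_code now takes the lookup dictionary as an optional parameter, so bulk endpoints such as get_prov_info can fetch the organ types once and reuse them across every dataset in the loop. A minimal runnable sketch, where fetch_organ_types() and the raised exceptions are hypothetical stand-ins for schema_manager.get_organ_types() and the Flask error helpers (internal_server_error, not_found_error):

def fetch_organ_types() -> dict:
    # Stand-in for the ontology-api lookup, e.g. {'LK': 'Kidney (Left)'}
    return {'LK': 'Kidney (Left)', 'RK': 'Kidney (Right)', 'HT': 'Heart'}

def validate_organ_code(organ_code: str, organ_types_dict: dict = None):
    # Fetch the mapping only when the caller did not preload one
    if organ_types_dict is None:
        organ_types_dict = fetch_organ_types()
    if not organ_code.isalpha() or len(organ_code) != 2:
        raise ValueError(f"Invalid organ code {organ_code}. Must be 2-letter alphabetic code")
    if organ_code.upper() not in organ_types_dict:
        raise KeyError(f"Unable to find organ code {organ_code} via the ontology-api")

# Bulk callers fetch once and reuse the dict for the whole loop
organ_types = fetch_organ_types()
for code in ('LK', 'HT'):
    validate_organ_code(code, organ_types)

Threading the dictionary through the call keeps the per-dataset loop free of repeated ontology-api round trips, which is the motivation stated in the comments above.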
26 changes: 6 additions & 20 deletions src/app_neo4j_queries.py
@@ -736,7 +736,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") # TODO replace ds.data_types with ds.dataset_type when required
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET, REVISIONS")

logger.info("======get_prov_info() query======")
logger.info(query)
@@ -782,10 +782,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
record_dict['last_modified_timestamp'] = record_contents[11]
record_dict['last_modified_user_email'] = record_contents[12]
record_dict['lab_dataset_id'] = record_contents[13]
data_types = record_contents[14]
data_types = data_types.replace("'", '"')
data_types = json.loads(data_types)
record_dict['data_types'] = data_types
record_dict['dataset_dataset_type'] = record_contents[14]
content_fifteen = []
for entry in record_contents[15]:
node_dict = schema_neo4j_queries.node_to_dict(entry)
@@ -834,7 +831,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, ds.dataset_type")
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET")
logger.info("======get_prov_info() query======")
logger.info(query)

@@ -877,10 +874,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
record_dict['last_modified_timestamp'] = record_contents[11]
record_dict['last_modified_user_email'] = record_contents[12]
record_dict['lab_dataset_id'] = record_contents[13]
data_types = record_contents[14]
data_types = data_types.replace("'", '"')
data_types = json.loads(data_types)
record_dict['data_types'] = data_types
record_dict['dataset_dataset_type'] = record_contents[14]
content_fifteen = []
for entry in record_contents[15]:
node_dict = schema_neo4j_queries.node_to_dict(entry)
@@ -891,7 +885,6 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
node_dict = schema_neo4j_queries.node_to_dict(entry)
content_sixteen.append(node_dict)
record_dict['processed_dataset'] = content_sixteen
record_dict['dataset_type'] = record_contents[17] if record_contents[17] is not None else ''
return record_dict


@@ -942,7 +935,7 @@ def get_sankey_info(neo4j_driver):
query = (f"MATCH (ds:Dataset)<-[]-(a)<-[]-(:Sample)"
# specimen_type -> sample_category 12/15/2022
f"MATCH (donor)-[:ACTIVITY_INPUT]->(oa)-[:ACTIVITY_OUTPUT]->(organ:Sample {{sample_category:'organ'}})-[*]->(ds)"
f"RETURN distinct ds.group_name, organ.organ, ds.data_types, ds.status, ds. uuid order by ds.group_name")
f"RETURN distinct ds.group_name, organ.organ, ds.dataset_type, ds.status, ds. uuid order by ds.group_name")
logger.info("======get_sankey_info() query======")
logger.info(query)
with neo4j_driver.session() as session:
@@ -958,14 +951,7 @@ def get_sankey_info(neo4j_driver):
record_contents.append(item)
record_dict['dataset_group_name'] = record_contents[0]
record_dict['organ_type'] = record_contents[1]
data_types_list = record_contents[2]
data_types_list = data_types_list.replace("'", '"')
data_types_list = json.loads(data_types_list)
data_types = data_types_list[0]
if (len(data_types_list)) > 1:
if (data_types_list[0] == "scRNAseq-10xGenomics-v3" and data_types_list[1] == "snATACseq") or (data_types_list[1] == "scRNAseq-10xGenomics-v3" and data_types_list[0] == "snATACseq"):
data_types = "scRNA-seq (10x Genomics v3),snATAC-seq"
record_dict['dataset_data_types'] = data_types
record_dict['dataset_dataset_type'] = record_contents[2]
record_dict['dataset_status'] = record_contents[3]
list_of_dictionaries.append(record_dict)
return list_of_dictionaries
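The corresponding simplification on the query side: ds.data_types came back as a stringified Python list that had to be quote-swapped and JSON-parsed, while the scalar ds.dataset_type returned by the new queries is stored directly. A before/after sketch; the field names match the diff, but the sample values are hypothetical:

import json

# Before: ds.data_types arrived as a stringified Python list
old_value = "['scRNAseq-10xGenomics-v3', 'snATACseq']"
data_types = old_value.replace("'", '"')  # single -> double quotes so json.loads accepts it
record_dict = {'data_types': json.loads(data_types)}

# After: ds.dataset_type is a plain string and needs no parsing step
new_value = 'RNAseq'
record_dict = {'dataset_dataset_type': new_value}

The quote swap was also fragile, since any value containing an apostrophe would produce invalid JSON, which is one more reason the scalar field is a simpler contract.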