Modified data sankey endpoint. Uses dataset_type rather than data_types.

Various conversions and mappings are no longer required. Also fixed a previous inefficiency that caused the function to take minutes to run; it now takes only seconds. The slowdown was caused by unnecessary calls to ontology. It was fixed by modifying validate_organ_code to take an optional argument that supplies the organ dictionary.
hubmapconsortium · Jan 11, 2024 · e2643ae · e2643ae
1 parent f2d1ca4
commit e2643ae
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 71 deletions.
diff --git a/src/app.py b/src/app.py
@@ -3326,12 +3326,9 @@ def sankey_data():
     # String constants
     HEADER_DATASET_GROUP_NAME = 'dataset_group_name'
     HEADER_ORGAN_TYPE = 'organ_type'
-    HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
     HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
     HEADER_DATASET_STATUS = 'dataset_status'
 
-    with open('sankey_mapping.json') as f:
-        mapping_dict = json.load(f)
     # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
     # because that would require using a urllib request for each dataset
     organ_types_dict = schema_manager.get_organ_types()
@@ -3359,41 +3356,27 @@ def sankey_data():
             internal_dict = collections.OrderedDict()
             internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME]
 
-            # TODO BEGIN evaluate elimination of this block once dataset['dataset_type'] is required and dataset['data_types'] removed.
             organ_code = dataset[HEADER_ORGAN_TYPE].upper()
-            validate_organ_code(organ_code)
+            validate_organ_code(organ_code, organ_types_dict)
 
             internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[organ_code].lower()
-            # Data type codes are replaced with data type descriptions
-            assay_description = ""
-            try:
-                assay_description = assay_types_dict[dataset[HEADER_DATASET_DATA_TYPES]]['description']
-            except KeyError:
-                logger.exception(f"Data type {dataset[HEADER_DATASET_DATA_TYPES]} not found in resulting assay types via ontology-api")
 
-                # Just use the data type value
-                assay_description = dataset[HEADER_DATASET_DATA_TYPES]
-
-            internal_dict[HEADER_DATASET_DATA_TYPES] = assay_description
+            internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset[HEADER_DATASET_DATASET_TYPE]
 
             # Replace applicable Group Name and Data type with the value needed for the sankey via the mapping_dict
             internal_dict[HEADER_DATASET_STATUS] = dataset['dataset_status']
-            if internal_dict[HEADER_DATASET_GROUP_NAME] in mapping_dict.keys():
-                internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]
-            if internal_dict[HEADER_DATASET_DATA_TYPES] in mapping_dict.keys():
-                internal_dict[HEADER_DATASET_DATA_TYPES] = mapping_dict[internal_dict[HEADER_DATASET_DATA_TYPES]]
-            # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
-            # and once dataset['dataset_type'] is required and dataset['data_types'] removed.
+            # if internal_dict[HEADER_DATASET_GROUP_NAME] in mapping_dict.keys():
+            #     internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]
 
             # Each dataset's dictionary is added to the list to be returned
             dataset_sankey_list.append(internal_dict)
-        
+
         if MEMCACHED_MODE:
             # Cache the result
             memcached_client_instance.set(cache_key, dataset_sankey_list, expire = SchemaConstants.MEMCACHED_TTL)
     else:
         logger.info(f'Using the cached sankey data at time {datetime.now()}')
-        
+
     return jsonify(dataset_sankey_list)
 
 
@@ -4807,13 +4790,13 @@ def access_level_prefix_dir(dir_name):
 ----------
 organ_code : str
 """
-def validate_organ_code(organ_code):
+def validate_organ_code(organ_code, organ_types_dict=None):
+    if organ_types_dict is None:
+        organ_types_dict = schema_manager.get_organ_types()
     if not organ_code.isalpha() or not len(organ_code) == 2:
         internal_server_error(f"Invalid organ code {organ_code}. Must be 2-letter alphabetic code")
 
     try:
-        organ_types_dict = schema_manager.get_organ_types()
-
         if organ_code.upper() not in organ_types_dict:
             not_found_error(f"Unable to find organ code {organ_code} via the ontology-api")
     except requests.exceptions.RequestException:

diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py
@@ -942,7 +942,7 @@ def get_sankey_info(neo4j_driver):
     query = (f"MATCH (ds:Dataset)<-[]-(a)<-[]-(:Sample)"
              # specimen_type -> sample_category 12/15/2022
              f"MATCH (donor)-[:ACTIVITY_INPUT]->(oa)-[:ACTIVITY_OUTPUT]->(organ:Sample {{sample_category:'organ'}})-[*]->(ds)"
-             f"RETURN distinct ds.group_name, organ.organ, ds.data_types, ds.status, ds. uuid order by ds.group_name")
+             f"RETURN distinct ds.group_name, organ.organ, ds.dataset_type, ds.status, ds. uuid order by ds.group_name")
     logger.info("======get_sankey_info() query======")
     logger.info(query)
     with neo4j_driver.session() as session:
@@ -958,14 +958,7 @@ def get_sankey_info(neo4j_driver):
                 record_contents.append(item)
             record_dict['dataset_group_name'] = record_contents[0]
             record_dict['organ_type'] = record_contents[1]
-            data_types_list = record_contents[2]
-            data_types_list = data_types_list.replace("'", '"')
-            data_types_list = json.loads(data_types_list)
-            data_types = data_types_list[0]
-            if (len(data_types_list)) > 1:
-                if (data_types_list[0] == "scRNAseq-10xGenomics-v3" and data_types_list[1] == "snATACseq") or (data_types_list[1] == "scRNAseq-10xGenomics-v3" and data_types_list[0] == "snATACseq"):
-                    data_types = "scRNA-seq (10x Genomics v3),snATAC-seq"
-            record_dict['dataset_data_types'] = data_types
+            record_dict['dataset_dataset_type'] = record_contents[2]
             record_dict['dataset_status'] = record_contents[3]
             list_of_dictionaries.append(record_dict)
         return list_of_dictionaries

diff --git a/src/sankey_mapping.json b/src/sankey_mapping.json