Skip to content

Commit

Permalink
Modified data sankey endpoint. Uses dataset_type rather than data_types.
Browse files Browse the repository at this point in the history
Various conversions and mappings are no longer required. Also fixed a previous
inefficiency that caused the function to take minutes to run; it now takes
only seconds. The slowdown was caused by unnecessary calls to ontology. The
fix modifies validate_organ_code to take an optional argument that supplies
the organ dictionary.
  • Loading branch information
DerekFurstPitt committed Jan 11, 2024
1 parent f2d1ca4 commit e2643ae
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 71 deletions.
35 changes: 9 additions & 26 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3326,12 +3326,9 @@ def sankey_data():
# String constants
HEADER_DATASET_GROUP_NAME = 'dataset_group_name'
HEADER_ORGAN_TYPE = 'organ_type'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_STATUS = 'dataset_status'

with open('sankey_mapping.json') as f:
mapping_dict = json.load(f)
# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
# because that would require using a urllib request for each dataset
organ_types_dict = schema_manager.get_organ_types()
Expand Down Expand Up @@ -3359,41 +3356,27 @@ def sankey_data():
internal_dict = collections.OrderedDict()
internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME]

# TODO BEGIN evaluate elimination of this block once dataset['dataset_type'] is required and dataset['data_types'] removed.
organ_code = dataset[HEADER_ORGAN_TYPE].upper()
validate_organ_code(organ_code)
validate_organ_code(organ_code, organ_types_dict)

internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[organ_code].lower()
# Data type codes are replaced with data type descriptions
assay_description = ""
try:
assay_description = assay_types_dict[dataset[HEADER_DATASET_DATA_TYPES]]['description']
except KeyError:
logger.exception(f"Data type {dataset[HEADER_DATASET_DATA_TYPES]} not found in resulting assay types via ontology-api")

# Just use the data type value
assay_description = dataset[HEADER_DATASET_DATA_TYPES]

internal_dict[HEADER_DATASET_DATA_TYPES] = assay_description
internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset[HEADER_DATASET_DATASET_TYPE]

# Replace applicable Group Name and Data type with the value needed for the sankey via the mapping_dict
internal_dict[HEADER_DATASET_STATUS] = dataset['dataset_status']
if internal_dict[HEADER_DATASET_GROUP_NAME] in mapping_dict.keys():
internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]
if internal_dict[HEADER_DATASET_DATA_TYPES] in mapping_dict.keys():
internal_dict[HEADER_DATASET_DATA_TYPES] = mapping_dict[internal_dict[HEADER_DATASET_DATA_TYPES]]
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
# if internal_dict[HEADER_DATASET_GROUP_NAME] in mapping_dict.keys():
# internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]

# Each dataset's dictionary is added to the list to be returned
dataset_sankey_list.append(internal_dict)

if MEMCACHED_MODE:
# Cache the result
memcached_client_instance.set(cache_key, dataset_sankey_list, expire = SchemaConstants.MEMCACHED_TTL)
else:
logger.info(f'Using the cached sankey data at time {datetime.now()}')

return jsonify(dataset_sankey_list)


Expand Down Expand Up @@ -4807,13 +4790,13 @@ def access_level_prefix_dir(dir_name):
----------
organ_code : str
"""
def validate_organ_code(organ_code):
def validate_organ_code(organ_code, organ_types_dict=None):
if organ_types_dict is None:
organ_types_dict = schema_manager.get_organ_types()
if not organ_code.isalpha() or not len(organ_code) == 2:
internal_server_error(f"Invalid organ code {organ_code}. Must be 2-letter alphabetic code")

try:
organ_types_dict = schema_manager.get_organ_types()

if organ_code.upper() not in organ_types_dict:
not_found_error(f"Unable to find organ code {organ_code} via the ontology-api")
except requests.exceptions.RequestException:
Expand Down
11 changes: 2 additions & 9 deletions src/app_neo4j_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,7 @@ def get_sankey_info(neo4j_driver):
query = (f"MATCH (ds:Dataset)<-[]-(a)<-[]-(:Sample)"
# specimen_type -> sample_category 12/15/2022
f"MATCH (donor)-[:ACTIVITY_INPUT]->(oa)-[:ACTIVITY_OUTPUT]->(organ:Sample {{sample_category:'organ'}})-[*]->(ds)"
f"RETURN distinct ds.group_name, organ.organ, ds.data_types, ds.status, ds. uuid order by ds.group_name")
f"RETURN distinct ds.group_name, organ.organ, ds.dataset_type, ds.status, ds. uuid order by ds.group_name")
logger.info("======get_sankey_info() query======")
logger.info(query)
with neo4j_driver.session() as session:
Expand All @@ -958,14 +958,7 @@ def get_sankey_info(neo4j_driver):
record_contents.append(item)
record_dict['dataset_group_name'] = record_contents[0]
record_dict['organ_type'] = record_contents[1]
data_types_list = record_contents[2]
data_types_list = data_types_list.replace("'", '"')
data_types_list = json.loads(data_types_list)
data_types = data_types_list[0]
if (len(data_types_list)) > 1:
if (data_types_list[0] == "scRNAseq-10xGenomics-v3" and data_types_list[1] == "snATACseq") or (data_types_list[1] == "scRNAseq-10xGenomics-v3" and data_types_list[0] == "snATACseq"):
data_types = "scRNA-seq (10x Genomics v3),snATAC-seq"
record_dict['dataset_data_types'] = data_types
record_dict['dataset_dataset_type'] = record_contents[2]
record_dict['dataset_status'] = record_contents[3]
list_of_dictionaries.append(record_dict)
return list_of_dictionaries
Expand Down
36 changes: 0 additions & 36 deletions src/sankey_mapping.json

This file was deleted.

0 comments on commit e2643ae

Please sign in to comment.