From 95b243e3da97d3c9138507ca598e64acb056cf28 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Fri, 12 Jan 2024 15:04:23 -0500 Subject: [PATCH 1/2] modified prov-info and /prov-info to use dataset_type instead of data types. samples/prov-info didn't appear to include any references. Also applied an identical change to what was done in the data-sankey branch to make validate_organ_code accept an optional argument that includes the organ dictionary so that the ontology-api doesn't need to be called thousands of times. --- src/app.py | 69 +++++++--------------------------------- src/app_neo4j_queries.py | 15 +++------ 2 files changed, 15 insertions(+), 69 deletions(-) diff --git a/src/app.py b/src/app.py index 8c5b3090..3ab79a66 100644 --- a/src/app.py +++ b/src/app.py @@ -2674,7 +2674,6 @@ def get_prov_info(): HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified' HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email' HEADER_DATASET_LAB_ID = 'lab_id_or_name' - HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type' HEADER_DATASET_PORTAL_URL = 'dataset_portal_url' HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id' @@ -2702,12 +2701,11 @@ def get_prov_info(): HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url' HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids' - # TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [ HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME, HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL, HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID, - HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, + HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE, HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID, HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID, @@ -2721,7 +2719,7 @@ def get_prov_info(): # Token is not required, but if an invalid token is provided, # we need to tell the client with a 401 error validate_token_if_auth_header_exists(request) - + organ_types_dict = schema_manager.get_organ_types() if user_in_hubmap_read_group(request): published_only = False @@ -2797,29 +2795,7 @@ def get_prov_info(): internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0))) internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email'] internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id'] - - # Data type codes are replaced with data type descriptions - assay_description_list = [] - # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, - # and once dataset['dataset_type'] is required and dataset['data_types'] removed. 
- for item in dataset['data_types']: - try: - assay_description_list.append(assay_types_dict[item]['description']) - except KeyError: - logger.exception(f"Data type {item} not found in resulting assay types via ontology-api") - - # Just use the data type value - assay_description_list.append(item) - - dataset['data_types'] = assay_description_list - internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types'] - - # If return_format was not equal to json, json arrays must be converted into comma separated lists for the tsv - if return_json is False: - internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types']) - # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, - # and once dataset['dataset_type'] is required and dataset['data_types'] removed. - + internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type'] internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('', 'dataset').replace('', dataset['uuid']) # first_sample properties are retrieved from its own dictionary @@ -2860,7 +2836,7 @@ def get_prov_info(): distinct_organ_uuid_list.append(item['uuid']) organ_code = item['organ'].upper() - validate_organ_code(organ_code) + validate_organ_code(organ_code, organ_types_dict) distinct_organ_type_list.append(organ_types_dict[organ_code].lower()) internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list @@ -3016,7 +2992,7 @@ def get_prov_info_for_dataset(id): # Token is not required, but if an invalid token provided, # we need to tell the client with a 401 error validate_token_if_auth_header_exists(request) - + organ_types_dict = schema_manager.get_organ_types() # Use the internal token to query the target entity # since public entities don't require user token token = get_internal_token() @@ -3055,7 +3031,6 @@ def get_prov_info_for_dataset(id): HEADER_DATASET_DATE_TIME_MODIFIED = 
'dataset_date_time_modified' HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email' HEADER_DATASET_LAB_ID = 'lab_id_or_name' - HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type' HEADER_DATASET_PORTAL_URL = 'dataset_portal_url' HEADER_DATASET_SAMPLES = 'dataset_samples' @@ -3083,12 +3058,11 @@ def get_prov_info_for_dataset(id): HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status' HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url' - # TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required. headers = [ HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME, HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL, HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID, - HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, + HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID, HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE, HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID, HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID, @@ -3124,28 +3098,7 @@ def get_prov_info_for_dataset(id): internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0))) internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email'] internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id'] - - # Data type codes are replaced with data type descriptions - assay_description_list = [] - # TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on 
https://github.com/hubmapconsortium/entity-api/issues/494, - # and once dataset['dataset_type'] is required and dataset['data_types'] removed. - for item in dataset['data_types']: - try: - assay_description_list.append(assay_types_dict[item]['description']) - except KeyError: - logger.exception(f"Data type {item} not found in resulting assay types via ontology-api") - - # Just use the data type value - assay_description_list.append(item) - - dataset['data_types'] = assay_description_list - internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types'] - if return_json is False: - internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types']) - # TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494, - # and once dataset['dataset_type'] is required and dataset['data_types'] removed. - - internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_type'] + internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type'] internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('', 'dataset').replace( '', dataset['uuid']) @@ -3185,7 +3138,7 @@ def get_prov_info_for_dataset(id): distinct_organ_uuid_list.append(item['uuid']) organ_code = item['organ'].upper() - validate_organ_code(organ_code) + validate_organ_code(organ_code, organ_types_dict ) distinct_organ_type_list.append(organ_types_dict[organ_code].lower()) internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list @@ -4807,13 +4760,13 @@ def access_level_prefix_dir(dir_name): ---------- organ_code : str """ -def validate_organ_code(organ_code): +def validate_organ_code(organ_code, organ_types_dict=None): + if organ_types_dict is None: + organ_types_dict = schema_manager.get_organ_types() if not organ_code.isalpha() or not len(organ_code) == 2: internal_server_error(f"Invalid organ code {organ_code}. 
Must be 2-letter alphabetic code") try: - organ_types_dict = schema_manager.get_organ_types() - if organ_code.upper() not in organ_types_dict: not_found_error(f"Unable to find organ code {organ_code} via the ontology-api") except requests.exceptions.RequestException: diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py index 87183315..bb203133 100644 --- a/src/app_neo4j_queries.py +++ b/src/app_neo4j_queries.py @@ -736,7 +736,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only): f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET" f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name," f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, " - f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") # TODO replace ds.data_types with ds.dataset_type when required + f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET, REVISIONS") logger.info("======get_prov_info() query======") logger.info(query) @@ -782,10 +782,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only): record_dict['last_modified_timestamp'] = record_contents[11] record_dict['last_modified_user_email'] = record_contents[12] record_dict['lab_dataset_id'] = record_contents[13] - data_types = record_contents[14] - data_types = data_types.replace("'", '"') - data_types = json.loads(data_types) - record_dict['data_types'] = data_types + record_dict['dataset_dataset_type'] = record_contents[14] content_fifteen = [] for entry in record_contents[15]: node_dict = schema_neo4j_queries.node_to_dict(entry) @@ -834,7 +831,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid): f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET" f" RETURN ds.uuid, 
FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name," f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, " - f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, ds.dataset_type") + f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET") logger.info("======get_prov_info() query======") logger.info(query) @@ -877,10 +874,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid): record_dict['last_modified_timestamp'] = record_contents[11] record_dict['last_modified_user_email'] = record_contents[12] record_dict['lab_dataset_id'] = record_contents[13] - data_types = record_contents[14] - data_types = data_types.replace("'", '"') - data_types = json.loads(data_types) - record_dict['data_types'] = data_types + record_dict['dataset_dataset_type'] = record_contents[14] content_fifteen = [] for entry in record_contents[15]: node_dict = schema_neo4j_queries.node_to_dict(entry) @@ -891,7 +885,6 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid): node_dict = schema_neo4j_queries.node_to_dict(entry) content_sixteen.append(node_dict) record_dict['processed_dataset'] = content_sixteen - record_dict['dataset_type'] = record_contents[17] if record_contents[17] is not None else '' return record_dict From 22a8337e07339da282138d6e9fe34bdb600c4c2b Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Tue, 16 Jan 2024 23:18:29 -0500 Subject: [PATCH 2/2] removed duplicate organ_type_dict assignments --- src/app.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/app.py b/src/app.py index 3ab79a66..8d00c48f 100644 --- a/src/app.py +++ b/src/app.py @@ -2725,7 +2725,6 @@ def get_prov_info(): # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - organ_types_dict = 
schema_manager.get_organ_types() # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids # having to access the resource for every dataset. @@ -3074,7 +3073,6 @@ def get_prov_info_for_dataset(id): # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - organ_types_dict = schema_manager.get_organ_types() # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids # having to access the resource for every dataset.