From 8be60a6901913ad581dd213cb53cf9241e61d0b7 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Fri, 10 Nov 2023 14:26:40 -0500 Subject: [PATCH 01/12] Replace oragns and assaytypes yamls with ontology-api calls --- src/app.py | 104 ++++++++------------------------- src/instance/app.cfg.example | 4 ++ src/schema/schema_constants.py | 8 ++- src/schema/schema_manager.py | 91 +++++++++++++++++++++++++++-- src/schema/schema_triggers.py | 72 +++-------------------- 5 files changed, 130 insertions(+), 149 deletions(-) diff --git a/src/app.py b/src/app.py index 7878d726..ce39b5bc 100644 --- a/src/app.py +++ b/src/app.py @@ -60,6 +60,7 @@ # Remove trailing slash / from URL base to avoid "//" caused by config with trailing slash app.config['UUID_API_URL'] = app.config['UUID_API_URL'].strip('/') app.config['INGEST_API_URL'] = app.config['INGEST_API_URL'].strip('/') +app.config['ONTOLOGY_API_URL'] = app.config['ONTOLOGY_API_URL'].strip('/') app.config['SEARCH_API_URL_LIST'] = [url.strip('/') for url in app.config['SEARCH_API_URL_LIST']] # This mode when set True disables the PUT and POST calls, used on STAGE to make entity-api READ-ONLY @@ -198,6 +199,7 @@ def http_internal_server_error(e): schema_manager.initialize(app.config['SCHEMA_YAML_FILE'], app.config['UUID_API_URL'], app.config['INGEST_API_URL'], + app.config['ONTOLOGY_API_URL'], auth_helper_instance, neo4j_driver_instance, memcached_client_instance, @@ -2623,26 +2625,12 @@ def get_prov_info(): # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - organ_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + organ_types_dict = schema_manager.get_organ_types() # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids # having to access the resource for every dataset. - response = schema_manager.make_request_get(SchemaConstants.ASSAY_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - assay_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - + assay_types_dict = schema_manager.get_assay_types() + # Processing and validating query parameters accepted_arguments = ['format', 'organ', 'has_rui_info', 'dataset_status', 'group_uuid'] return_json = False @@ -3007,25 +2995,11 @@ def get_prov_info_for_dataset(id): # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - organ_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + organ_types_dict = schema_manager.get_organ_types() # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids # having to access the resource for every dataset. - response = schema_manager.make_request_get(SchemaConstants.ASSAY_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - assay_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + assay_types_dict = schema_manager.get_assay_types() hubmap_ids = schema_manager.get_hubmap_ids(id) @@ -3251,25 +3225,11 @@ def sankey_data(): mapping_dict = json.load(f) # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - organ_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + organ_types_dict = schema_manager.get_organ_types() # As above, we parse te assay type yaml here rather than calling the special method for it because this avoids # having to access the resource for every dataset. - response = schema_manager.make_request_get(SchemaConstants.ASSAY_TYPES_YAML) - - if response.status_code == 200: - yaml_file = response.text - try: - assay_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + assay_types_dict = schema_manager.get_assay_types() # Instantiation of the list dataset_sankey_list dataset_sankey_list = [] @@ -3377,14 +3337,16 @@ def get_sample_prov_info(): # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description # because that would require using a urllib request for each dataset - response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) + # response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) - if response.status_code == 200: - yaml_file = response.text - try: - organ_types_dict = yaml.safe_load(yaml_file) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) + # if response.status_code == 200: + # yaml_file = response.text + # try: + # organ_types_dict = yaml.safe_load(yaml_file) + # except yaml.YAMLError as e: + # raise yaml.YAMLError(e) + + organ_types_dict = schema_manager.get_organ_types() # Processing and validating query parameters accepted_arguments = ['group_uuid'] @@ -4744,34 +4706,18 @@ def access_level_prefix_dir(dir_name): Returns nothing. Raises bad_request_error is organ code not found on organ_types.yaml """ def validate_organ_code(organ_code): - yaml_file_url = SchemaConstants.ORGAN_TYPES_YAML - - # Use Memcached to improve performance - response = schema_manager.make_request_get(yaml_file_url) - - if response.status_code == 200: - yaml_file = response.text + try: + organ_types_dict = schema_manager.get_organ_types() - try: - organ_types_dict = yaml.safe_load(response.text) - - if organ_code.upper() not in organ_types_dict: - bad_request_error(f"Invalid organ code. Must be 2 digit code specified {yaml_file_url}") - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - else: - msg = f"Unable to fetch the: {yaml_file_url}" + if organ_code.upper() not in organ_types_dict: + bad_request_error(f"Invalid organ code. Must be 2 digit code") + except: + msg = f"Failed to validate the organ code: {organ_code}" # Log the full stack trace, prepend a line with our message logger.exception(msg) - logger.debug("======validate_organ_code() status code======") - logger.debug(response.status_code) - - logger.debug("======validate_organ_code() response text======") - logger.debug(response.text) - # Terminate and let the users know - internal_server_error(f"Failed to validate the organ code: {organ_code}") + internal_server_error(msg) #################################################################################################### diff --git a/src/instance/app.cfg.example b/src/instance/app.cfg.example index 0c55f5bd..839972dc 100644 --- a/src/instance/app.cfg.example +++ b/src/instance/app.cfg.example @@ -28,6 +28,10 @@ UUID_API_URL = 'http://uuid-api:8080' # Works regardless of the trailing slash INGEST_API_URL = 'https://ingest-api.dev.hubmapconsortium.org' +# URL for talking to Ontology API (default for DEV) +# Works regardless of the trailing slash +ONTOLOGY_API_URL = 'https://ontology-api.dev.hubmapconsortium.org' + # A list of URLs for talking to multiple Search API instances (default value used for docker deployment, no token needed) # Works regardless of the trailing slash / SEARCH_API_URL_LIST = ['http://search-api:8080'] diff --git a/src/schema/schema_constants.py b/src/schema/schema_constants.py index 5e4ad332..b426827c 100644 --- a/src/schema/schema_constants.py +++ b/src/schema/schema_constants.py @@ -13,9 +13,11 @@ class SchemaConstants(object): ACCESS_LEVEL_CONSORTIUM = 'consortium' ACCESS_LEVEL_PROTECTED = 'protected' - # Yaml file to parse organ description - ORGAN_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/organ_types.yaml' - ASSAY_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml' + UUID_API_ID_ENDPOINT = '/uuid' + INGEST_API_FILE_COMMIT_ENDPOINT = '/file-commit' + INGEST_API_FILE_REMOVE_ENDPOINT = '/file-remove' + ONTOLOGY_API_ASSAY_TYPES_ENDPOINT = '/assaytype?application_context=HuBMAP' + ONTOLOGY_API_ORGAN_TYPES_ENDPOINT = '/organs?application_context=HuBMAP' # For generating Sample.tissue_type TISSUE_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml' diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 2386b013..5018e567 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -50,9 +50,11 @@ valid_yaml_file : file A valid yaml file uuid_api_url : str - The uuid-api URL + The uuid-api base URL ingest_api_url : str - The ingest-api URL + The ingest-api base URL +ontology_api_url : str + The ontology-api base URL auth_helper_instance : AuthHelper The auth helper instance neo4j_driver_instance : neo4j_driver @@ -65,6 +67,7 @@ def initialize(valid_yaml_file, uuid_api_url, ingest_api_url, + ontology_api_url, auth_helper_instance, neo4j_driver_instance, memcached_client_instance, @@ -73,6 +76,7 @@ def initialize(valid_yaml_file, global _schema global _uuid_api_url global _ingest_api_url + global _ontology_api_url global _auth_helper global _neo4j_driver global _memcached_client @@ -81,6 +85,7 @@ def initialize(valid_yaml_file, _schema = load_provenance_schema(valid_yaml_file) _uuid_api_url = uuid_api_url _ingest_api_url = ingest_api_url + _ontology_api_url = ontology_api_url # Get the helper instances _auth_helper = auth_helper_instance @@ -1202,7 +1207,7 @@ def get_user_info(request): def get_hubmap_ids(id): global _uuid_api_url - target_url = _uuid_api_url + '/uuid/' + id + target_url = _uuid_api_url + schema_constants.UUID_API_ID_ENDPOINT + '/' + id # Use Memcached to improve performance response = make_request_get(target_url, internal_token_used = True) @@ -1365,7 +1370,7 @@ def create_hubmap_ids(normalized_class, json_data_dict, user_token, user_info_di logger.info(json_to_post) # Disable ssl certificate verification - target_url = _uuid_api_url + '/uuid' + target_url = _uuid_api_url + schema_constants.UUID_API_ID_ENDPOINT response = requests.post(url = target_url, headers = request_headers, json = json_to_post, verify = False, params = query_parms) # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes @@ -1764,6 +1769,84 @@ def delete_memcached_cache(uuids_list): logger.info(f"Deleted cache by key: {', '.join(cache_keys)}") +""" +Retrive the organ types from ontology-api + +Returns +------- +dict + The available organ types +""" +def get_organ_types(): + global _ontology_api_url + + target_url = _ontology_api_url + '/organs?application_context=HuBMAP' + + # Use Memcached to improve performance + response = make_request_get(target_url, internal_token_used = True) + + # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes + response.raise_for_status() + + if response.status_code == 200: + ids_dict = response.json() + return ids_dict + else: + # uuid-api will also return 400 if the given id is invalid + # We'll just hanle that and all other cases all together here + msg = f"Unable to make a request to query the id via uuid-api: {id}" + # Log the full stack trace, prepend a line with our message + logger.exception(msg) + + logger.debug("======get_organ_types() status code from ontology-api======") + logger.debug(response.status_code) + + logger.debug("======get_organ_types() response text from ontology-api======") + logger.debug(response.text) + + # Also bubble up the error message from ontology-api + raise requests.exceptions.RequestException(response.text) + + +""" +Retrive the assay types from ontology-api + +Returns +------- +dict + The available assay types +""" +def get_assay_types(): + global _ontology_api_url + + target_url = _ontology_api_url + '/assaytype?application_context=HuBMAP' + + # Use Memcached to improve performance + response = make_request_get(target_url, internal_token_used = True) + + # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes + response.raise_for_status() + + if response.status_code == 200: + ids_dict = response.json() + return ids_dict + else: + # uuid-api will also return 400 if the given id is invalid + # We'll just hanle that and all other cases all together here + msg = f"Unable to make a request to query the id via uuid-api: {id}" + # Log the full stack trace, prepend a line with our message + logger.exception(msg) + + logger.debug("======get_assay_types() status code from ontology-api======") + logger.debug(response.status_code) + + logger.debug("======get_assay_types() response text from ontology-api======") + logger.debug(response.text) + + # Also bubble up the error message from ontology-api + raise requests.exceptions.RequestException(response.text) + + #################################################################################################### ## Internal functions #################################################################################################### diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 38cf61dc..d831670a 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -1184,7 +1184,7 @@ def commit_thumbnail_file(property_key, normalized_type, user_token, existing_da entity_uuid = existing_data_dict['uuid'] # Commit the thumbnail file via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + '/file-commit' + ingest_api_target_url = schema_manager.get_ingest_api_url() + schema_constants.INGEST_API_FILE_COMMIT_ENDPOINT # Example: {"temp_file_id":"dzevgd6xjs4d5grmcp4n"} thumbnail_file_dict = new_data_dict[property_key] @@ -1286,7 +1286,7 @@ def delete_thumbnail_file(property_key, normalized_type, user_token, existing_da file_info_dict = generated_dict[target_property_key] # Remove the thumbnail file via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + '/file-remove' + ingest_api_target_url = schema_manager.get_ingest_api_url() + schema_constants.INGEST_API_FILE_REMOVE_ENDPOINT # ingest-api's /file-remove takes a list of files to remove # In this case, we only need to remove the single thumbnail file @@ -1994,7 +1994,7 @@ def _commit_files(target_property_key, property_key, normalized_type, user_token entity_uuid = existing_data_dict['uuid'] # Commit the files via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + '/file-commit' + ingest_api_target_url = schema_manager.get_ingest_api_url() + schema_constants.INGEST_API_FILE_COMMIT_ENDPOINT for file_info in new_data_dict[property_key]: temp_file_id = file_info['temp_file_id'] @@ -2104,7 +2104,7 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token file_uuids.append(file_uuid) # Remove the files via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + '/file-remove' + ingest_api_target_url = schema_manager.get_ingest_api_url() + schema_constants.INGEST_API_FILE_REMOVE_ENDPOINT json_to_post = { 'entity_uuid': entity_uuid, @@ -2143,39 +2143,10 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token str: The corresponding assay type description """ def _get_assay_type_description(assay_type): - yaml_file_url = SchemaConstants.ASSAY_TYPES_YAML + assay_types_dict = schema_manager.get_assay_types() - # Use Memcached to improve performance - response = schema_manager.make_request_get(yaml_file_url) - - if response.status_code == 200: - yaml_file = response.text - - try: - assay_types_dict = yaml.safe_load(response.text) - - if assay_type in assay_types_dict: - return assay_types_dict[assay_type]['description'].lower() - else: - # Check the 'alt-names' list if not found in the top-level keys - for key in assay_types_dict: - if assay_type in assay_types_dict[key]['alt-names']: - return assay_types_dict[key]['description'].lower() - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - else: - msg = f"Unable to fetch the: {yaml_file_url}" - # Log the full stack trace, prepend a line with our message - logger.exception(msg) - - logger.debug("======_get_assay_type_description() status code======") - logger.debug(response.status_code) - - logger.debug("======_get_assay_type_description() response text======") - logger.debug(response.text) - - # Also bubble up the error message - raise requests.exceptions.RequestException(response.text) + if assay_type in assay_types_dict: + return assay_types_dict[assay_type]['description'].lower() """ @@ -2230,32 +2201,7 @@ def _get_combined_assay_type_description(data_types): str: The organ code description """ def _get_organ_description(organ_code): - yaml_file_url = SchemaConstants.ORGAN_TYPES_YAML - - # Use Memcached to improve performance - response = schema_manager.make_request_get(yaml_file_url) - - if response.status_code == 200: - yaml_file = response.text - - try: - organ_types_dict = yaml.safe_load(response.text) - return organ_types_dict[organ_code]['description'].lower() - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - else: - msg = f"Unable to fetch the: {yaml_file_url}" - # Log the full stack trace, prepend a line with our message - logger.exception(msg) - - logger.debug("======_get_organ_description() status code======") - logger.debug(response.status_code) - - logger.debug("======_get_organ_description() response text======") - logger.debug(response.text) - - # Also bubble up the error message - raise requests.exceptions.RequestException(response.text) - + organ_types_dict = schema_manager.get_organ_types() + return organ_types_dict[organ_code]['description'].lower() From 83a4b57c220eae960b35d27be8db49974ac9062c Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Fri, 10 Nov 2023 20:30:39 -0500 Subject: [PATCH 02/12] Remove tissue_type, specimen_type, and realted pieces --- src/schema/provenance_schema.yaml | 28 --------- src/schema/schema_constants.py | 5 -- src/schema/schema_triggers.py | 99 ------------------------------- src/schema/schema_validators.py | 71 ---------------------- 4 files changed, 203 deletions(-) diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml index b72bf3a1..7e43d41a 100644 --- a/src/schema/provenance_schema.yaml +++ b/src/schema/provenance_schema.yaml @@ -897,33 +897,6 @@ ENTITIES: - validate_sample_category before_property_update_validators: - validate_sample_category - - # No logner required on create, specimen_type -> sample_category 12/15/2022 - specimen_type: - type: string - #required_on_create: true # Only required for create via POST, not update via PUT - description: "A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)" - # Validate the given value against the definitions: https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml - # Disabled validation 12/15/2022 - # before_property_create_validators: - # - validate_specimen_type - # before_property_update_validators: - # - validate_specimen_type - specimen_type_other: - type: string - description: "The user provided sample type if the 'other' sample_type is chosen." - - - # specimen_type no logner required on create, will remove this field when removing specimen_type - # Simply always set to 'Unknown' and no need to update 12/15/2022 - tissue_type: - type: string - generated: true # Can not be updated via the PUT - #auto_update: true # Will always update automatically if the entity gets updated - description: 'The type of the tissue based on the mapping between type (Block/Section/Suspension) and the specimen_type, default is Unknown' - before_create_trigger: set_tissue_type - #before_update_trigger: set_tissue_type - portal_metadata_upload_files: type: json_string description: "A list of relative paths to metadata files" @@ -949,7 +922,6 @@ ENTITIES: immutable: true description: "The displayname of globus group which the user who created this entity is a member of" before_create_trigger: set_group_name - # Should be required on create only when specimen_type==organ organ: type: string description: "Organ code specifier, only set if sample_type == organ. Valid values found in: [organ types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/organ_types.yaml)" diff --git a/src/schema/schema_constants.py b/src/schema/schema_constants.py index b426827c..e34bd10a 100644 --- a/src/schema/schema_constants.py +++ b/src/schema/schema_constants.py @@ -2,13 +2,11 @@ class SchemaConstants(object): MEMCACHED_TTL = 7200 - # Constants used by validators INGEST_API_APP = 'ingest-api' INGEST_PIPELINE_APP = 'ingest-pipeline' HUBMAP_APP_HEADER = 'X-Hubmap-Application' DATASET_STATUS_PUBLISHED = 'published' - # Used by triggers, all lowercase for easy comparision ACCESS_LEVEL_PUBLIC = 'public' ACCESS_LEVEL_CONSORTIUM = 'consortium' ACCESS_LEVEL_PROTECTED = 'protected' @@ -19,9 +17,6 @@ class SchemaConstants(object): ONTOLOGY_API_ASSAY_TYPES_ENDPOINT = '/assaytype?application_context=HuBMAP' ONTOLOGY_API_ORGAN_TYPES_ENDPOINT = '/organs?application_context=HuBMAP' - # For generating Sample.tissue_type - TISSUE_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml' - DOI_BASE_URL = 'https://doi.org/' # Define an enumeration to classify an entity's visibility, which can be combined with diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index d831670a..093ed3a6 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -1562,105 +1562,6 @@ def get_sample_direct_ancestor(property_key, normalized_type, user_token, existi return property_key, schema_manager.normalize_entity_result_for_response(direct_ancestor_dict) -""" -Trigger event method of generating the type of the tissue based on the mapping between type (Block/Section/Suspension) and the specimen_type -This method applies to both the create and update triggers - -Rererence: - - https://docs.google.com/spreadsheets/d/1OODo8QK852txSNSmfIe0ua4A7nPFSgKq6h46grmrpto/edit#gid=0 - - https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml - -Parameters ----------- -property_key : str - The target property key of the value to be generated -normalized_type : str - One of the types defined in the schema yaml: Sample -user_token: str - The user's globus nexus token -existing_data_dict : dict - A dictionary that contains all existing entity properties -new_data_dict : dict - A merged dictionary that contains all possible input data to be used - -Returns -------- -str: The target property key -str: The type of the tissue -""" -def set_tissue_type(property_key, normalized_type, user_token, existing_data_dict, new_data_dict): - # specimen_type is no logner required on create 12/15/2022, set to Unknown - # Default to use 'Unknown' - tissue_type = 'Unknown' - - # # The `specimen_type` field is required on entity creation via POST - # # thus should be available on existing entity update via PUT - # # We do a double check here just in case - # if ('specimen_type' not in new_data_dict) and ('specimen_type' not in existing_data_dict): - # raise KeyError("Missing 'specimen_type' key in both 'new_data_dict' and 'existing_data_dict' during calling 'set_tissue_type()' trigger method.") - - # # Always calculate the tissue_type value no matter new creation or update existing - # # The `specimen_type` field can be used in a PUT - # # But if it's not in the request JSON of a PUT, it must be in the existing data - # if 'specimen_type' in new_data_dict: - # # The `specimen_type` value validation is handled in the `schema_validators.validate_specimen_type()` - # # and that gets called before this trigger method - # specimen_type = new_data_dict['specimen_type'].lower() - # else: - # # Use lowercase in case someone manually updated the neo4j filed with incorrect case - # specimen_type = existing_data_dict['specimen_type'].lower() - - # # Categories: Block, Section, Suspension - # block_category = [ - # 'pbmc', - # 'biopsy', - # 'segment', - # 'ffpe_block', - # 'organ_piece', - # 'fresh_tissue', - # 'clarity_hydrogel', - # 'fixed_tissue_piece', - # 'fresh_frozen_tissue', - # 'fresh_frozen_oct_block', - # 'formalin_fixed_oct_block', - # 'pfa_fixed_frozen_oct_block', - # 'flash_frozen_liquid_nitrogen', - # 'frozen_cell_pellet_buffy_coat' - # ] - - # section_category = [ - # 'ffpe_slide', - # 'fixed_frozen_section_slide', - # 'fresh_frozen_section_slide', - # 'fresh_frozen_tissue_section', - # 'cryosections_curls_rnalater', - # 'cryosections_curls_from_fresh_frozen_oct' - # ] - - # suspension_category = [ - # 'gdna', - # 'serum', - # 'plasma', - # 'nuclei', - # 'protein', - # 'rna_total', - # 'cell_lysate', - # 'tissue_lysate', - # 'sequence_library', - # 'ran_poly_a_enriched', - # 'single_cell_cryopreserved' - # ] - - # # Capitalized type, default is 'Unknown' if no match - # if specimen_type in block_category: - # tissue_type = 'Block' - # elif specimen_type in section_category: - # tissue_type = 'Section' - # elif specimen_type in suspension_category: - # tissue_type = 'Suspension' - - return property_key, tissue_type - #################################################################################################### ## Trigger methods specific to Publication - DO NOT RENAME diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py index 8c1269a5..cedd2d53 100644 --- a/src/schema/schema_validators.py +++ b/src/schema/schema_validators.py @@ -430,33 +430,6 @@ def validate_upload_status_value(property_key, normalized_entity_type, request, raise ValueError(f"Invalid status value: {new_status}") -""" -NOTE: TO BE REMOVED when we remove specimen_type field - -Validate the provided value of Sample.specimen_type on create via POST and update via PUT - -Parameters ----------- -property_key : str - The target property key -normalized_type : str - Submission -request: Flask request object - The instance of Flask request passed in from application request -existing_data_dict : dict - A dictionary that contains all existing entity properties -new_data_dict : dict - The json data in request body, already after the regular validations -""" -def validate_specimen_type(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict): - # Use lowercase for comparison - defined_tissue_types = _get_tissue_types() - specimen_type = new_data_dict[property_key].lower() - - if specimen_type not in defined_tissue_types: - raise ValueError(f"Invalid specimen_type value: {specimen_type}") - - """ Validate the provided value of Sample.sample_category on create via POST and update via PUT @@ -562,47 +535,3 @@ def _validate_application_header(applications_allowed, request_headers): msg = f"Unable to proceed due to invalid {SchemaConstants.HUBMAP_APP_HEADER} header value: {app_header}" raise schema_errors.InvalidApplicationHeaderException(msg) - -""" -Get the complete list of defined tissue types - -Returns -------- -list: The list of defined tissue types -""" -def _get_tissue_types(): - yaml_file_url = SchemaConstants.TISSUE_TYPES_YAML - - # Use Memcached to improve performance - response = schema_manager.make_request_get(yaml_file_url) - - if response.status_code == 200: - yaml_file = response.text - - try: - tissue_types_dict = yaml.safe_load(response.text) - - # We don't need the description here, just a list of tissue types - # Note: dict.keys() returns a dict, need to typecast to list - tissue_types_list = list(tissue_types_dict.keys()) - - # Add the 'other' - tissue_types_list.append('other') - - return tissue_types_list - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - else: - msg = f"Unable to fetch the: {yaml_file_url}" - # Log the full stack trace, prepend a line with our message - logger.exception(msg) - - logger.debug("======_get_tissue_types() status code======") - logger.debug(response.status_code) - - logger.debug("======_get_tissue_types() response text======") - logger.debug(response.text) - - # Also bubble up the error message - raise requests.exceptions.RequestException(response.text) - From 031c4b548bfe2b332ce325ed05c1ef98327f9718 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 20 Nov 2023 09:26:38 -0500 Subject: [PATCH 03/12] Remove irrelevant logging comments --- src/app.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/app.py b/src/app.py index ce39b5bc..3f90d7e8 100644 --- a/src/app.py +++ b/src/app.py @@ -44,9 +44,6 @@ global logger # Set logging format and level (default is warning) -# All the API logging is forwarded to the uWSGI server and gets written into the log file `log/uwsgi-entity-api.log` -# Log rotation is handled via logrotate on the host system with a configuration file -# Do NOT handle log file and rotation via the Python logging to avoid issues with multi-worker processes logging.basicConfig(format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S') # Use `getLogger()` instead of `getLogger(__name__)` to apply the config to the root logger From 393d40f1d5d5bf3da115f7dc021fa17fa13aae2b Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Tue, 21 Nov 2023 10:20:08 -0500 Subject: [PATCH 04/12] Code cleanup --- entity-api-spec.yaml | 53 ------------------ src/app.py | 12 ----- src/schema/schema_manager.py | 1 - src/schema/schema_neo4j_queries.py | 17 ------ .../api-template-test/entity-Template.yaml | 54 ------------------- .../example-yaml-templates/sample-schema.yaml | 9 ---- 6 files changed, 146 deletions(-) diff --git a/entity-api-spec.yaml b/entity-api-spec.yaml index 8fd04211..fbfc5548 100644 --- a/entity-api-spec.yaml +++ b/entity-api-spec.yaml @@ -406,59 +406,6 @@ components: - section - suspension description: "A code representing the type of specimen. Must be an organ, block, section, or suspension" - specimen_type: - type: string - enum: - - atacseq - - biopsy - - blood - - cell_lysate - - clarity_hydrogel - - codex - - cryosections_curls_from_fresh_frozen_oct - - cryosections_curls_rnalater - - ffpe_block - - ffpe_slide - - fixed_frozen_section_slide - - fixed_tissue_piece - - flash_frozen_liquid_nitrogen - - formalin_fixed_oct_block - - fresh_frozen_oct_block - - fresh_frozen_section_slide - - fresh_frozen_tissue - - fresh_frozen_tissue_section - - fresh_tissue - - frozen_cell_pellet_buffy_coat - - gdna - - module - - nuclei - - nuclei_rnalater - - organ - - organ_piece - - other - - pbmc - - pfa_fixed_frozen_oct_block - - plasma - - protein - - ran_poly_a_enriched - - rna_total - - rnalater_treated_and_stored - - rnaseq - - scatacseq - - scrnaseq - - segment - - seqfish - - sequence_library - - serum - - single_cell_cryopreserved - - snatacseq - - snrnaseq - - tissue_lysate - - wgs - description: "DEPRECATED: No longer a required field. A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)" - specimen_type_other: - type: string - description: "The user provided sample type if the 'other' sample_type is chosen." protocol_url: type: string description: "The protocols.io doi url pointing the protocol under wich the sample was obtained and/or prepared." diff --git a/src/app.py b/src/app.py index ae8ad938..00f1d523 100644 --- a/src/app.py +++ b/src/app.py @@ -436,7 +436,6 @@ def get_ancestor_organs(id): bad_request_error(f"Unable to get the ancestor organs for this: {normalized_entity_type}," " supported entity types: Sample, Dataset, Publication") - # specimen_type -> sample_category 12/15/2022 if normalized_entity_type == 'Sample' and entity_dict['sample_category'].lower() == 'organ': bad_request_error("Unable to get the ancestor organ of an organ.") @@ -939,7 +938,6 @@ def create_entity(entity_type): # Check existence of the direct ancestor (either another Sample or Donor) direct_ancestor_dict = query_target_entity(direct_ancestor_uuid, user_token) - # specimen_type -> sample_category 12/15/2022 # `sample_category` is required on create sample_category = json_data_dict['sample_category'].lower() @@ -1112,7 +1110,6 @@ def create_multiple_samples(count): # sample's direct ancestor is a Donor. # Must be one of the codes from: https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/organ_types.yaml if direct_ancestor_dict['entity_type'] == 'Donor': - # specimen_type -> sample_category 12/15/2022 # `sample_category` is required on create if json_data_dict['sample_category'].lower() != 'organ': bad_request_error("The sample_category must be organ since the direct ancestor is a Donor") @@ -2828,8 +2825,6 @@ def get_prov_info(): first_sample_hubmap_id_list.append(item['hubmap_id']) first_sample_submission_id_list.append(item['submission_id']) first_sample_uuid_list.append(item['uuid']) - - # specimen_type -> sample_category 12/15/2022 first_sample_type_list.append(item['sample_category']) first_sample_portal_url_list.append(app.config['DOI_REDIRECT_URL'].replace('', 'sample').replace('', item['uuid'])) @@ -3148,8 +3143,6 @@ def get_prov_info_for_dataset(id): first_sample_hubmap_id_list.append(item['hubmap_id']) first_sample_submission_id_list.append(item['submission_id']) first_sample_uuid_list.append(item['uuid']) - - # specimen_type -> sample_category 12/15/2022 first_sample_type_list.append(item['sample_category']) first_sample_portal_url_list.append( @@ -3267,7 +3260,6 @@ def get_prov_info_for_dataset(id): else: requested_samples = {} for uuid in dataset_samples.keys(): - # specimen_type -> sample_category 12/15/2022 if dataset_samples[uuid]['sample_category'] in include_samples: requested_samples[uuid] = dataset_samples[uuid] internal_dict[HEADER_DATASET_SAMPLES] = requested_samples @@ -3479,7 +3471,6 @@ def get_sample_prov_info(): organ_hubmap_id = sample['organ_hubmap_id'] organ_submission_id = sample['organ_submission_id'] else: - # sample_specimen_type -> sample_category 12/15/2022 if sample['sample_category'] == "organ": organ_uuid = sample['sample_uuid'] organ_type = organ_types_dict[sample['sample_organ']]['description'].lower() @@ -3507,10 +3498,7 @@ def get_sample_prov_info(): internal_dict[HEADER_SAMPLE_HAS_METADATA] = sample_has_metadata internal_dict[HEADER_SAMPLE_HAS_RUI_INFO] = sample_has_rui_info internal_dict[HEADER_SAMPLE_DIRECT_ANCESTOR_ID] = sample['sample_ancestor_id'] - - # sample_specimen_type -> sample_category 12/15/2022 internal_dict[HEADER_SAMPLE_TYPE] = sample['sample_category'] - internal_dict[HEADER_SAMPLE_HUBMAP_ID] = sample['sample_hubmap_id'] internal_dict[HEADER_SAMPLE_SUBMISSION_ID] = sample['sample_submission_id'] internal_dict[HEADER_SAMPLE_DIRECT_ANCESTOR_ENTITY_TYPE] = sample['sample_ancestor_entity'] diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index e66c6f21..1f0d79b5 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -1356,7 +1356,6 @@ def create_hubmap_ids(normalized_class, json_data_dict, user_token, user_info_di parent_id = json_data_dict['direct_ancestor_uuid'] json_to_post['parent_ids'] = [parent_id] - # specimen_type -> sample_category 12/15/2022 # 'Sample.sample_category' is marked as `required_on_create` in the schema yaml if json_data_dict['sample_category'].lower() == 'organ': # The 'organ' field containing the 2 digit organ code is required in this case diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index 8da148b3..231375b3 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -442,24 +442,8 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid): donor_metadata = None with neo4j_driver.session() as session: - # Old time-consuming single query, it takes a significant amounts of DB hits - # query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) " - # f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) " - # f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata") - - # logger.info("======get_dataset_organ_and_donor_info() query======") - # logger.info(query) - - # with neo4j_driver.session() as session: - # record = session.read_transaction(execute_readonly_tx, query) - - # if record: - # organ_name = record['organ_name'] - # donor_metadata = record['donor_metadata'] - # To improve the query performance, we implement the two-step queries to drastically reduce the DB hits sample_query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample) " - # specimen_type -> sample_category 12/15/2022 f"WHERE e.uuid='{uuid}' AND s.sample_category='organ' AND EXISTS(s.organ) " f"RETURN DISTINCT s.organ AS organ_name, s.uuid AS sample_uuid") @@ -473,7 +457,6 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid): sample_uuid = sample_record['sample_uuid'] donor_query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(a:Activity)<-[:ACTIVITY_INPUT]-(d:Donor) " - # specimen_type -> sample_category 12/15/2022 f"WHERE s.uuid='{sample_uuid}' AND s.sample_category='organ' AND EXISTS(s.organ) " f"RETURN DISTINCT d.metadata AS donor_metadata") diff --git a/src/schema_templating/example-yaml-templates/api-template-test/entity-Template.yaml b/src/schema_templating/example-yaml-templates/api-template-test/entity-Template.yaml index 5a6739f7..ab8ec463 100644 --- a/src/schema_templating/example-yaml-templates/api-template-test/entity-Template.yaml +++ b/src/schema_templating/example-yaml-templates/api-template-test/entity-Template.yaml @@ -398,60 +398,6 @@ x-ref-components: - consortium - public description: "One of the values: public, consortium." - specimen_type: - type: string - enum: - - atacseq - - biopsy - - blood - - cell_lysate - - clarity_hydrogel - - codex - - cryosections_curls_from_fresh_frozen_oct - - cryosections_curls_rnalater - - ffpe_block - - ffpe_slide - - fixed_frozen_section_slide - - fixed_tissue_piece - - flash_frozen_liquid_nitrogen - - formalin_fixed_oct_block - - fresh_frozen_oct_block - - fresh_frozen_section_slide - - fresh_frozen_tissue - - fresh_frozen_tissue_section - - fresh_tissue - - frozen_cell_pellet_buffy_coat - - gdna - - module - - nuclei - - nuclei_rnalater - - organ - - organ_piece - - other - - pbmc - - pfa_fixed_frozen_oct_block - - plasma - - protein - - ran_poly_a_enriched - - rna_total - - rnalater_treated_and_stored - - rnaseq - - scatacseq - - scrnaseq - - segment - - seqfish - - sequence_library - - serum - - sequence_library - - single_cell_cryopreserved - - snatacseq - - snrnaseq - - tissue_lysate - - wgs - description: "A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)" - specimen_type_other: - type: string - description: "The user provided sample type if the 'other' sample_type is chosen." protocol_url: type: string description: "The protocols.io doi url pointing the protocol under wich the sample was obtained and/or prepared." diff --git a/src/schema_templating/example-yaml-templates/sample-schema.yaml b/src/schema_templating/example-yaml-templates/sample-schema.yaml index 1f5ee751..2b6f3f11 100644 --- a/src/schema_templating/example-yaml-templates/sample-schema.yaml +++ b/src/schema_templating/example-yaml-templates/sample-schema.yaml @@ -72,15 +72,6 @@ Sample: - consortium - public description: "One of the values: public, consortium." - specimen_type: - type: string - enum: - X-replace-enum-list: - enum-file-ref: https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml - description: "A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)" - specimen_type_other: - type: string - description: "The user provided sample type if the 'other' sample_type is chosen." protocol_url: type: string description: "The protocols.io doi url pointing the protocol under wich the sample was obtained and/or prepared." From 8ec3fd0687bf58d266243f0a4d8847c6d7f587d1 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Tue, 21 Nov 2023 11:02:28 -0500 Subject: [PATCH 05/12] Fix constants reference --- src/schema/schema_manager.py | 4 ++-- src/schema/schema_triggers.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 1f0d79b5..cf247934 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -1211,7 +1211,7 @@ def get_user_info(request): def get_hubmap_ids(id): global _uuid_api_url - target_url = _uuid_api_url + schema_constants.UUID_API_ID_ENDPOINT + '/' + id + target_url = _uuid_api_url + SchemaConstants.UUID_API_ID_ENDPOINT + '/' + id # Use Memcached to improve performance response = make_request_get(target_url, internal_token_used = True) @@ -1373,7 +1373,7 @@ def create_hubmap_ids(normalized_class, json_data_dict, user_token, user_info_di logger.info(json_to_post) # Disable ssl certificate verification - target_url = _uuid_api_url + schema_constants.UUID_API_ID_ENDPOINT + target_url = _uuid_api_url + SchemaConstants.UUID_API_ID_ENDPOINT response = requests.post(url = target_url, headers = request_headers, json = json_to_post, verify = False, params = query_parms) # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 5908fc05..182e5614 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -1194,7 +1194,7 @@ def commit_thumbnail_file(property_key, normalized_type, user_token, existing_da entity_uuid = existing_data_dict['uuid'] # Commit the thumbnail file via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + schema_constants.INGEST_API_FILE_COMMIT_ENDPOINT + ingest_api_target_url = schema_manager.get_ingest_api_url() + SchemaConstants.INGEST_API_FILE_COMMIT_ENDPOINT # Example: {"temp_file_id":"dzevgd6xjs4d5grmcp4n"} thumbnail_file_dict = new_data_dict[property_key] @@ -1296,7 +1296,7 @@ def delete_thumbnail_file(property_key, normalized_type, user_token, existing_da file_info_dict = generated_dict[target_property_key] # Remove the thumbnail file via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + schema_constants.INGEST_API_FILE_REMOVE_ENDPOINT + ingest_api_target_url = schema_manager.get_ingest_api_url() + SchemaConstants.INGEST_API_FILE_REMOVE_ENDPOINT # ingest-api's /file-remove takes a list of files to remove # In this case, we only need to remove the single thumbnail file @@ -1905,7 +1905,7 @@ def _commit_files(target_property_key, property_key, normalized_type, user_token entity_uuid = existing_data_dict['uuid'] # Commit the files via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + schema_constants.INGEST_API_FILE_COMMIT_ENDPOINT + ingest_api_target_url = schema_manager.get_ingest_api_url() + SchemaConstants.INGEST_API_FILE_COMMIT_ENDPOINT for file_info in new_data_dict[property_key]: temp_file_id = file_info['temp_file_id'] @@ -2015,7 +2015,7 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token file_uuids.append(file_uuid) # Remove the files via ingest-api call - ingest_api_target_url = schema_manager.get_ingest_api_url() + schema_constants.INGEST_API_FILE_REMOVE_ENDPOINT + ingest_api_target_url = schema_manager.get_ingest_api_url() + SchemaConstants.INGEST_API_FILE_REMOVE_ENDPOINT json_to_post = { 'entity_uuid': entity_uuid, From 97d2c90e74afb53a9879e01f7bb72c7139812ac1 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 27 Nov 2023 11:20:51 -0500 Subject: [PATCH 06/12] Fix incomptiable result format --- src/schema/schema_constants.py | 2 +- src/schema/schema_manager.py | 53 ++++++++++++++++++++++++++++------ src/schema/schema_triggers.py | 23 ++------------- 3 files changed, 48 insertions(+), 30 deletions(-) diff --git a/src/schema/schema_constants.py b/src/schema/schema_constants.py index e34bd10a..111f1ed1 100644 --- a/src/schema/schema_constants.py +++ b/src/schema/schema_constants.py @@ -15,7 +15,7 @@ class SchemaConstants(object): INGEST_API_FILE_COMMIT_ENDPOINT = '/file-commit' INGEST_API_FILE_REMOVE_ENDPOINT = '/file-remove' ONTOLOGY_API_ASSAY_TYPES_ENDPOINT = '/assaytype?application_context=HuBMAP' - ONTOLOGY_API_ORGAN_TYPES_ENDPOINT = '/organs?application_context=HuBMAP' + ONTOLOGY_API_ORGAN_TYPES_ENDPOINT = '/organs/by-code?application_context=HuBMAP' DOI_BASE_URL = 'https://doi.org/' diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index cf247934..d619669e 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -1778,12 +1778,22 @@ def delete_memcached_cache(uuids_list): Returns ------- dict - The available organ types + The available organ types in the following format: + + { + "AO": "Aorta", + "BD": "Blood", + "BL": "Bladder", + "BM": "Bone Marrow", + "BR": "Brain", + "HT": "Heart", + ... + } """ def get_organ_types(): global _ontology_api_url - target_url = _ontology_api_url + '/organs?application_context=HuBMAP' + target_url = _ontology_api_url + SchemaConstants.ONTOLOGY_API_ORGAN_TYPES_ENDPOINT # Use Memcached to improve performance response = make_request_get(target_url, internal_token_used = True) @@ -1795,8 +1805,6 @@ def get_organ_types(): ids_dict = response.json() return ids_dict else: - # uuid-api will also return 400 if the given id is invalid - # We'll just hanle that and all other cases all together here msg = f"Unable to make a request to query the id via uuid-api: {id}" # Log the full stack trace, prepend a line with our message logger.exception(msg) @@ -1817,12 +1825,32 @@ def get_organ_types(): Returns ------- dict - The available assay types + The available assay types by name in the following format: + + { + "10x-multiome": { + "contains_pii": true, + "description": "10x Multiome", + "name": "10x-multiome", + "primary": true, + "vis_only": false, + "vitessce_hints": [] + }, + "AF": { + "contains_pii": false, + "description": "Autofluorescence Microscopy", + "name": "AF", + "primary": true, + "vis_only": false, + "vitessce_hints": [] + }, + ... + } """ def get_assay_types(): global _ontology_api_url - target_url = _ontology_api_url + '/assaytype?application_context=HuBMAP' + target_url = _ontology_api_url + SchemaConstants.ONTOLOGY_API_ASSAY_TYPES_ENDPOINT # Use Memcached to improve performance response = make_request_get(target_url, internal_token_used = True) @@ -1831,8 +1859,15 @@ def get_assay_types(): response.raise_for_status() if response.status_code == 200: - ids_dict = response.json() - return ids_dict + assay_types_by_name = {} + result_dict = response.json() + + # Due to the json envelop being used int the json result + assay_types_list = result_dict['result'] + for assay_type_dict in assay_types_list: + assay_types_dict_by_name[assay_type_dict['name']] = assay_type_dict + + return assay_types_by_name else: # uuid-api will also return 400 if the given id is invalid # We'll just hanle that and all other cases all together here @@ -1876,4 +1911,4 @@ def _create_request_headers(user_token): auth_header_name: auth_scheme + ' ' + user_token } - return headers_dict \ No newline at end of file + return headers_dict diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 182e5614..606e9af2 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -1024,9 +1024,10 @@ def get_dataset_title(property_key, normalized_type, user_token, existing_data_d # Parse the organ description if organ_name is not None: try: - # The organ_name is the two-letter code only set if specimen_type == 'organ' + # The organ_name is the two-letter code only set for 'organ' # Convert the two-letter code to a description - organ_desc = _get_organ_description(organ_name) + organ_types_dict = schema_manager.get_organ_types() + organ_desc = organ_types_dict[organ_name].lower() except (yaml.YAMLError, requests.exceptions.RequestException) as e: raise Exception(e) @@ -2098,21 +2099,3 @@ def _get_combined_assay_type_description(data_types): return assay_type_desc - -""" -Get the organ description based on the given organ code - -Parameters ----------- -organ_code : str - The two-letter organ code - -Returns -------- -str: The organ code description -""" -def _get_organ_description(organ_code): - organ_types_dict = schema_manager.get_organ_types() - return organ_types_dict[organ_code]['description'].lower() - - From 5cd9a073b823545a91a29a79e0c346e77ffc52f3 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 27 Nov 2023 11:25:23 -0500 Subject: [PATCH 07/12] Fix code comments --- src/schema/schema_manager.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index d619669e..a4f36612 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -1802,10 +1802,9 @@ def get_organ_types(): response.raise_for_status() if response.status_code == 200: - ids_dict = response.json() - return ids_dict + return response.json() else: - msg = f"Unable to make a request to query the id via uuid-api: {id}" + msg = "Unable to make a request to query the organ types via ontology-api: {id}" # Log the full stack trace, prepend a line with our message logger.exception(msg) @@ -1869,9 +1868,7 @@ def get_assay_types(): return assay_types_by_name else: - # uuid-api will also return 400 if the given id is invalid - # We'll just hanle that and all other cases all together here - msg = f"Unable to make a request to query the id via uuid-api: {id}" + msg = "Unable to make a request to query the assay types via ontology-api" # Log the full stack trace, prepend a line with our message logger.exception(msg) From d63b25a328938296a58e3a66ca737afdb56dd416 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 27 Nov 2023 19:50:03 -0500 Subject: [PATCH 08/12] Remove parsing on alt-names --- src/app.py | 45 ++++++++++++---------------------- src/schema/schema_constants.py | 4 +-- src/schema/schema_manager.py | 6 ++--- 3 files changed, 19 insertions(+), 36 deletions(-) diff --git a/src/app.py b/src/app.py index 00f1d523..5e18cb59 100644 --- a/src/app.py +++ b/src/app.py @@ -2794,17 +2794,12 @@ def get_prov_info(): for item in dataset['data_types']: try: assay_description_list.append(assay_types_dict[item]['description']) - # Some data types aren't given by their code in the assay types yaml and are instead given as an alt name. - # In these cases, we have to search each assay type and see if the given code matches any alternate names. except KeyError: - valid_key = False - for each in assay_types_dict: - if valid_key is False: - if item in assay_types_dict[each]['alt-names']: - assay_description_list.append(assay_types_dict[each]['description']) - valid_key = True - if valid_key is False: - assay_description_list.append(item) + logger.exception(f"Data type {item} not found in resulting assay types via ontology-api") + + # Just use the data type value + assay_description_list.append(item) + dataset['data_types'] = assay_description_list internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types'] @@ -3116,17 +3111,12 @@ def get_prov_info_for_dataset(id): for item in dataset['data_types']: try: assay_description_list.append(assay_types_dict[item]['description']) - # Some data types aren't given by their code in the assay types yaml and are instead given as an alt name. - # In these cases, we have to search each assay type and see if the given code matches any alternate names. except KeyError: - valid_key = False - for each in assay_types_dict: - if valid_key is False: - if item in assay_types_dict[each]['alt-names']: - assay_description_list.append(assay_types_dict[each]['description']) - valid_key = True - if valid_key is False: - assay_description_list.append(item) + logger.exception(f"Data type {item} not found in resulting assay types via ontology-api") + + # Just use the data type value + assay_description_list.append(item) + dataset['data_types'] = assay_description_list internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types'] if return_json is False: @@ -3342,17 +3332,12 @@ def sankey_data(): assay_description = "" try: assay_description = assay_types_dict[dataset[HEADER_DATASET_DATA_TYPES]]['description'] - # Some data types aren't given by their code in the assay types yaml and are instead given as an alt name. - # In these cases, we have to search each assay type and see if the given code matches any alternate names. except KeyError: - valid_key = False - for each in assay_types_dict: - if valid_key is False: - if dataset[HEADER_DATASET_DATA_TYPES] in assay_types_dict[each]['alt-names']: - assay_description = assay_types_dict[each]['description'] - valid_key = True - if valid_key is False: - assay_description = dataset[HEADER_DATASET_DATA_TYPES] + logger.exception(f"Data type {dataset[HEADER_DATASET_DATA_TYPES]} not found in resulting assay types via ontology-api") + + # Just use the data type value + assay_description = dataset[HEADER_DATASET_DATA_TYPES] + internal_dict[HEADER_DATASET_DATA_TYPES] = assay_description # Replace applicable Group Name and Data type with the value needed for the sankey via the mapping_dict diff --git a/src/schema/schema_constants.py b/src/schema/schema_constants.py index 111f1ed1..6a471587 100644 --- a/src/schema/schema_constants.py +++ b/src/schema/schema_constants.py @@ -14,8 +14,8 @@ class SchemaConstants(object): UUID_API_ID_ENDPOINT = '/uuid' INGEST_API_FILE_COMMIT_ENDPOINT = '/file-commit' INGEST_API_FILE_REMOVE_ENDPOINT = '/file-remove' - ONTOLOGY_API_ASSAY_TYPES_ENDPOINT = '/assaytype?application_context=HuBMAP' - ONTOLOGY_API_ORGAN_TYPES_ENDPOINT = '/organs/by-code?application_context=HuBMAP' + ONTOLOGY_API_ASSAY_TYPES_ENDPOINT = '/assaytype?application_context=HUBMAP' + ONTOLOGY_API_ORGAN_TYPES_ENDPOINT = '/organs/by-code?application_context=HUBMAP' DOI_BASE_URL = 'https://doi.org/' diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index a4f36612..bba74e03 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -1804,9 +1804,8 @@ def get_organ_types(): if response.status_code == 200: return response.json() else: - msg = "Unable to make a request to query the organ types via ontology-api: {id}" # Log the full stack trace, prepend a line with our message - logger.exception(msg) + logger.exception("Unable to make a request to query the organ types via ontology-api") logger.debug("======get_organ_types() status code from ontology-api======") logger.debug(response.status_code) @@ -1868,9 +1867,8 @@ def get_assay_types(): return assay_types_by_name else: - msg = "Unable to make a request to query the assay types via ontology-api" # Log the full stack trace, prepend a line with our message - logger.exception(msg) + logger.exception("Unable to make a request to query the assay types via ontology-api") logger.debug("======get_assay_types() status code from ontology-api======") logger.debug(response.status_code) From ab37ce0f4256daf791223c02434fca8e7c881853 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 27 Nov 2023 20:02:34 -0500 Subject: [PATCH 09/12] Fix var name --- src/schema/schema_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index bba74e03..edab58a7 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -1863,7 +1863,7 @@ def get_assay_types(): # Due to the json envelop being used int the json result assay_types_list = result_dict['result'] for assay_type_dict in assay_types_list: - assay_types_dict_by_name[assay_type_dict['name']] = assay_type_dict + assay_types_by_name[assay_type_dict['name']] = assay_type_dict return assay_types_by_name else: From 68d233ea6bf75164332e01c8e7b1fb85e53920d9 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 27 Nov 2023 20:17:41 -0500 Subject: [PATCH 10/12] Fix sankey data and prov-info --- src/app.py | 10 +++++----- src/schema/schema_triggers.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/app.py b/src/app.py index 5e18cb59..307160d7 100644 --- a/src/app.py +++ b/src/app.py @@ -2845,7 +2845,7 @@ def get_prov_info(): distinct_organ_hubmap_id_list.append(item['hubmap_id']) distinct_organ_submission_id_list.append(item['submission_id']) distinct_organ_uuid_list.append(item['uuid']) - distinct_organ_type_list.append(organ_types_dict[item['organ']]['description'].lower()) + distinct_organ_type_list.append(organ_types_dict[item['organ']].lower()) internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list internal_dict[HEADER_ORGAN_SUBMISSION_ID] = distinct_organ_submission_id_list internal_dict[HEADER_ORGAN_UUID] = distinct_organ_uuid_list @@ -3157,7 +3157,7 @@ def get_prov_info_for_dataset(id): distinct_organ_hubmap_id_list.append(item['hubmap_id']) distinct_organ_submission_id_list.append(item['submission_id']) distinct_organ_uuid_list.append(item['uuid']) - distinct_organ_type_list.append(organ_types_dict[item['organ']]['description'].lower()) + distinct_organ_type_list.append(organ_types_dict[item['organ']].lower()) internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list internal_dict[HEADER_ORGAN_SUBMISSION_ID] = distinct_organ_submission_id_list internal_dict[HEADER_ORGAN_UUID] = distinct_organ_uuid_list @@ -3327,7 +3327,7 @@ def sankey_data(): for dataset in sankey_info: internal_dict = collections.OrderedDict() internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME] - internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[dataset[HEADER_ORGAN_TYPE]]['description'].lower() + internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[dataset[HEADER_ORGAN_TYPE]].lower() # Data type codes are replaced with data type descriptions assay_description = "" try: @@ -3452,13 +3452,13 @@ def get_sample_prov_info(): organ_submission_id = None if sample['organ_uuid'] is not None: organ_uuid = sample['organ_uuid'] - organ_type = organ_types_dict[sample['organ_organ_type']]['description'].lower() + organ_type = organ_types_dict[sample['organ_organ_type']].lower() organ_hubmap_id = sample['organ_hubmap_id'] organ_submission_id = sample['organ_submission_id'] else: if sample['sample_category'] == "organ": organ_uuid = sample['sample_uuid'] - organ_type = organ_types_dict[sample['sample_organ']]['description'].lower() + organ_type = organ_types_dict[sample['sample_organ']].lower() organ_hubmap_id = sample['sample_hubmap_id'] organ_submission_id = sample['sample_submission_id'] diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 606e9af2..3446da2f 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -1086,6 +1086,7 @@ def get_dataset_title(property_key, normalized_type, user_token, existing_data_d return property_key, generated_title + """ Trigger event method of getting the uuid of the previous revision dataset if exists From 9779fcf527dd3bfcf16031dacaca5959bf947de9 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 27 Nov 2023 21:47:58 -0500 Subject: [PATCH 11/12] Organ code validation --- src/app.py | 57 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/src/app.py b/src/app.py index 307160d7..86aa9e3b 100644 --- a/src/app.py +++ b/src/app.py @@ -951,9 +951,8 @@ def create_entity(entity_type): # A valid organ code must be present in the `organ` field if ('organ' not in json_data_dict) or (json_data_dict['organ'].strip() == ''): bad_request_error("A valid organ code is required when registering an organ associated with a Donor") - - # Must be one of the defined organ codes - # https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/organ_types.yaml + + # Must be a 2-letter alphabetic code and can be found in UBKG ontology-api validate_organ_code(json_data_dict['organ']) else: if 'organ' in json_data_dict: @@ -2845,7 +2844,11 @@ def get_prov_info(): distinct_organ_hubmap_id_list.append(item['hubmap_id']) distinct_organ_submission_id_list.append(item['submission_id']) distinct_organ_uuid_list.append(item['uuid']) - distinct_organ_type_list.append(organ_types_dict[item['organ']].lower()) + + organ_code = item['organ'].upper() + validate_organ_code(organ_code) + + distinct_organ_type_list.append(organ_types_dict[organ_code].lower()) internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list internal_dict[HEADER_ORGAN_SUBMISSION_ID] = distinct_organ_submission_id_list internal_dict[HEADER_ORGAN_UUID] = distinct_organ_uuid_list @@ -3157,7 +3160,11 @@ def get_prov_info_for_dataset(id): distinct_organ_hubmap_id_list.append(item['hubmap_id']) distinct_organ_submission_id_list.append(item['submission_id']) distinct_organ_uuid_list.append(item['uuid']) - distinct_organ_type_list.append(organ_types_dict[item['organ']].lower()) + + organ_code = item['organ'].upper() + validate_organ_code(organ_code) + + distinct_organ_type_list.append(organ_types_dict[organ_code].lower()) internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list internal_dict[HEADER_ORGAN_SUBMISSION_ID] = distinct_organ_submission_id_list internal_dict[HEADER_ORGAN_UUID] = distinct_organ_uuid_list @@ -3327,7 +3334,11 @@ def sankey_data(): for dataset in sankey_info: internal_dict = collections.OrderedDict() internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME] - internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[dataset[HEADER_ORGAN_TYPE]].lower() + + organ_code = dataset[HEADER_ORGAN_TYPE].upper() + validate_organ_code(organ_code) + + internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[organ_code].lower() # Data type codes are replaced with data type descriptions assay_description = "" try: @@ -3408,17 +3419,6 @@ def get_sample_prov_info(): if user_in_hubmap_read_group(request): public_only = False - # Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description - # because that would require using a urllib request for each dataset - # response = schema_manager.make_request_get(SchemaConstants.ORGAN_TYPES_YAML) - - # if response.status_code == 200: - # yaml_file = response.text - # try: - # organ_types_dict = yaml.safe_load(yaml_file) - # except yaml.YAMLError as e: - # raise yaml.YAMLError(e) - organ_types_dict = schema_manager.get_organ_types() # Processing and validating query parameters @@ -3452,13 +3452,21 @@ def get_sample_prov_info(): organ_submission_id = None if sample['organ_uuid'] is not None: organ_uuid = sample['organ_uuid'] - organ_type = organ_types_dict[sample['organ_organ_type']].lower() + + organ_code = sample['organ_organ_type'].upper() + validate_organ_code(organ_code) + + organ_type = organ_types_dict[organ_code].lower() organ_hubmap_id = sample['organ_hubmap_id'] organ_submission_id = sample['organ_submission_id'] else: if sample['sample_category'] == "organ": organ_uuid = sample['sample_uuid'] - organ_type = organ_types_dict[sample['sample_organ']].lower() + + organ_code = sample['sample_organ'].upper() + validate_organ_code(organ_code) + + organ_type = organ_types_dict[organ_code].lower() organ_hubmap_id = sample['sample_hubmap_id'] organ_submission_id = sample['sample_submission_id'] @@ -4764,22 +4772,21 @@ def access_level_prefix_dir(dir_name): """ -Ensures that a given organ code matches what is found on the organ_types yaml document +Ensures that a given organ code is 2-letter alphabetic and can be found int the UBKG ontology-api Parameters ---------- organ_code : str - -Returns -------- -Returns nothing. Raises bad_request_error is organ code not found on organ_types.yaml """ def validate_organ_code(organ_code): + if not organ_code.isalpha() or not len(organ_code) == 2: + internal_server_error(f"Invalid organ code {organ_code}. Must be 2-letter alphabetic code") + try: organ_types_dict = schema_manager.get_organ_types() if organ_code.upper() not in organ_types_dict: - bad_request_error(f"Invalid organ code. Must be 2 digit code") + internal_server_error(f"Unable to find organ code {organ_code} via the ontology-api") except: msg = f"Failed to validate the organ code: {organ_code}" # Log the full stack trace, prepend a line with our message From ebfecad4bcf06b9e9f722e5362f7f2ffbab44321 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 27 Nov 2023 21:53:14 -0500 Subject: [PATCH 12/12] Further tweaks to organ code validation --- src/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/app.py b/src/app.py index 86aa9e3b..ab01761f 100644 --- a/src/app.py +++ b/src/app.py @@ -4786,8 +4786,8 @@ def validate_organ_code(organ_code): organ_types_dict = schema_manager.get_organ_types() if organ_code.upper() not in organ_types_dict: - internal_server_error(f"Unable to find organ code {organ_code} via the ontology-api") - except: + not_found_error(f"Unable to find organ code {organ_code} via the ontology-api") + except requests.exceptions.RequestException: msg = f"Failed to validate the organ code: {organ_code}" # Log the full stack trace, prepend a line with our message logger.exception(msg)