Merge branch 'yuanzhou/yaml-to-ubkg' of http://github.com/hubmapconsortium/entity-api into kburke/addDatasetTypeAttribute
hubmapconsortium · Dec 5, 2023 · 2258dad · 2258dad
2 parents 8851db0 + ebfecad
commit 2258dad
Show file tree

Hide file tree

Showing 11 changed files with 194 additions and 566 deletions.
diff --git a/entity-api-spec.yaml b/entity-api-spec.yaml
@@ -406,59 +406,6 @@ components:
             - section
             - suspension
           description: "A code representing the type of specimen. Must be an organ, block, section, or suspension"
-        specimen_type:
-          type: string
-          enum:
-            - atacseq
-            - biopsy
-            - blood
-            - cell_lysate
-            - clarity_hydrogel
-            - codex
-            - cryosections_curls_from_fresh_frozen_oct
-            - cryosections_curls_rnalater
-            - ffpe_block
-            - ffpe_slide
-            - fixed_frozen_section_slide
-            - fixed_tissue_piece
-            - flash_frozen_liquid_nitrogen
-            - formalin_fixed_oct_block
-            - fresh_frozen_oct_block
-            - fresh_frozen_section_slide
-            - fresh_frozen_tissue
-            - fresh_frozen_tissue_section
-            - fresh_tissue
-            - frozen_cell_pellet_buffy_coat
-            - gdna
-            - module
-            - nuclei
-            - nuclei_rnalater
-            - organ
-            - organ_piece
-            - other
-            - pbmc
-            - pfa_fixed_frozen_oct_block
-            - plasma
-            - protein
-            - ran_poly_a_enriched
-            - rna_total
-            - rnalater_treated_and_stored
-            - rnaseq
-            - scatacseq
-            - scrnaseq
-            - segment
-            - seqfish
-            - sequence_library
-            - serum
-            - single_cell_cryopreserved
-            - snatacseq
-            - snrnaseq
-            - tissue_lysate
-            - wgs
-          description: "DEPRECATED:  No longer a required field. A code representing the type of specimen.  Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)"
-        specimen_type_other:
-          type: string
-          description: "The user provided sample type if the 'other' sample_type is chosen."
         protocol_url:
           type: string
           description: "The protocols.io doi url pointing the protocol under wich the sample was obtained and/or prepared."

diff --git a/src/app.py b/src/app.py
diff --git a/src/instance/app.cfg.example b/src/instance/app.cfg.example
@@ -28,6 +28,10 @@ UUID_API_URL = 'http://uuid-api:8080'
 # Works regardless of the trailing slash
 INGEST_API_URL = 'https://ingest-api.dev.hubmapconsortium.org'
 
+# URL for talking to Ontology API (default for DEV)
+# Works regardless of the trailing slash
+ONTOLOGY_API_URL = 'https://ontology-api.dev.hubmapconsortium.org'
+
 # A list of URLs for talking to multiple Search API instances (default value used for docker deployment, no token needed)
 # Works regardless of the trailing slash /
 SEARCH_API_URL_LIST = ['http://search-api:8080']

diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml
@@ -909,33 +909,6 @@ ENTITIES:
           - validate_sample_category
         before_property_update_validators:
           - validate_sample_category
-
-      # No logner required on create, specimen_type -> sample_category 12/15/2022
-      specimen_type:
-        type: string
-        #required_on_create: true # Only required for create via POST, not update via PUT
-        description: "A code representing the type of specimen.  Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)"
-        # Validate the given value against the definitions: https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml
-        # Disabled validation 12/15/2022
-        # before_property_create_validators:
-        #   - validate_specimen_type
-        # before_property_update_validators:
-        #   - validate_specimen_type
-      specimen_type_other:
-        type: string
-        description: "The user provided sample type if the 'other' sample_type is chosen."
-
-
-      # specimen_type no logner required on create, will remove this field when removing specimen_type
-      # Simply always set to 'Unknown' and no need to update 12/15/2022
-      tissue_type:
-        type: string
-        generated: true # Can not be updated via the PUT
-        #auto_update: true # Will always update automatically if the entity gets updated
-        description: 'The type of the tissue based on the mapping between type (Block/Section/Suspension) and the specimen_type, default is Unknown'
-        before_create_trigger: set_tissue_type
-        #before_update_trigger: set_tissue_type
-
       portal_metadata_upload_files:
         type: json_string
         description: "A list of relative paths to metadata files"
@@ -961,7 +934,6 @@ ENTITIES:
         immutable: true
         description: "The displayname of globus group which the user who created this entity is a member of"
         before_create_trigger: set_group_name
-      # Should be required on create only when specimen_type==organ
       organ:
         type: string
         description: "Organ code specifier, only set if sample_type == organ.  Valid values found in: [organ types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/organ_types.yaml)"

diff --git a/src/schema/schema_constants.py b/src/schema/schema_constants.py
@@ -2,23 +2,20 @@
 class SchemaConstants(object):
     MEMCACHED_TTL = 7200
 
-    # Constants used by validators
     INGEST_API_APP = 'ingest-api'
     INGEST_PIPELINE_APP = 'ingest-pipeline'
     HUBMAP_APP_HEADER = 'X-Hubmap-Application'
     DATASET_STATUS_PUBLISHED = 'published'
 
-    # Used by triggers, all lowercase for easy comparision
     ACCESS_LEVEL_PUBLIC = 'public'
     ACCESS_LEVEL_CONSORTIUM = 'consortium'
     ACCESS_LEVEL_PROTECTED = 'protected'
 
-    # Yaml file to parse organ description
-    ORGAN_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/organ_types.yaml'
-    ASSAY_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml'
-
-    # For generating Sample.tissue_type
-    TISSUE_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml'
+    UUID_API_ID_ENDPOINT = '/uuid'
+    INGEST_API_FILE_COMMIT_ENDPOINT = '/file-commit'
+    INGEST_API_FILE_REMOVE_ENDPOINT = '/file-remove'
+    ONTOLOGY_API_ASSAY_TYPES_ENDPOINT = '/assaytype?application_context=HUBMAP'
+    ONTOLOGY_API_ORGAN_TYPES_ENDPOINT = '/organs/by-code?application_context=HUBMAP'
 
     DOI_BASE_URL = 'https://doi.org/'
 

diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py
@@ -51,9 +51,11 @@
 valid_yaml_file : file
     A valid yaml file
 uuid_api_url : str
-    The uuid-api URL
+    The uuid-api base URL
 ingest_api_url : str
-    The ingest-api URL
+    The ingest-api base URL
+ontology_api_url : str
+    The ontology-api base URL
 auth_helper_instance : AuthHelper
     The auth helper instance
 neo4j_driver_instance : neo4j_driver
@@ -1232,7 +1234,7 @@ def get_user_info(request):
 def get_hubmap_ids(id):
     global _uuid_api_url
 
-    target_url = _uuid_api_url + '/uuid/' + id
+    target_url = _uuid_api_url + SchemaConstants.UUID_API_ID_ENDPOINT + '/' + id
 
     # Use Memcached to improve performance
     response = make_request_get(target_url, internal_token_used = True)
@@ -1457,7 +1459,6 @@ def create_hubmap_ids(normalized_class, json_data_dict, user_token, user_info_di
             parent_id = json_data_dict['direct_ancestor_uuid']
             json_to_post['parent_ids'] = [parent_id]
 
-            # specimen_type -> sample_category 12/15/2022
             # 'Sample.sample_category' is marked as `required_on_create` in the schema yaml
             if json_data_dict['sample_category'].lower() == 'organ':
                 # The 'organ' field containing the 2 digit organ code is required in this case
@@ -1475,7 +1476,7 @@ def create_hubmap_ids(normalized_class, json_data_dict, user_token, user_info_di
     logger.info(json_to_post)
 
     # Disable ssl certificate verification
-    target_url = _uuid_api_url + '/uuid'
+    target_url = _uuid_api_url + SchemaConstants.UUID_API_ID_ENDPOINT
     response = requests.post(url = target_url, headers = request_headers, json = json_to_post, verify = False, params = query_parms)
 
     # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes
@@ -1874,6 +1875,114 @@ def delete_memcached_cache(uuids_list):
         logger.info(f"Deleted cache by key: {', '.join(cache_keys)}")
 
 
+"""
+Retrieve the organ types from ontology-api
+
+Returns
+-------
+dict
+    The available organ types in the following format:
+
+    {
+        "AO": "Aorta",
+        "BD": "Blood",
+        "BL": "Bladder",
+        "BM": "Bone Marrow",
+        "BR": "Brain",
+        "HT": "Heart",
+        ...
+    }
+"""
+def get_organ_types():
+    global _ontology_api_url
+
+    target_url = _ontology_api_url + SchemaConstants.ONTOLOGY_API_ORGAN_TYPES_ENDPOINT
+
+    # Fetch via make_request_get(), which (per the original note) uses Memcached to improve performance
+    response = make_request_get(target_url, internal_token_used = True)
+
+    # Invoke .raise_for_status(); an HTTPError is raised for error status codes, so only non-error responses continue
+    response.raise_for_status()
+
+    if response.status_code == 200:
+        return response.json()  # parsed JSON body, returned to the caller as-is
+    else:
+        # NOTE(review): no exception is being handled here, so logger.exception() has no stack trace to attach; logger.error() may be intended
+        logger.exception("Unable to make a request to query the organ types via ontology-api")
+
+        logger.debug("======get_organ_types() status code from ontology-api======")
+        logger.debug(response.status_code)
+
+        logger.debug("======get_organ_types() response text from ontology-api======")
+        logger.debug(response.text)
+
+        # Bubble up the response text from ontology-api to the caller as a RequestException
+        raise requests.exceptions.RequestException(response.text)
+
+
+"""
+Retrieve the assay types from ontology-api
+
+Returns
+-------
+dict
+    The available assay types by name in the following format:
+
+    {
+        "10x-multiome": {
+            "contains_pii": true,
+            "description": "10x Multiome",
+            "name": "10x-multiome",
+            "primary": true,
+            "vis_only": false,
+            "vitessce_hints": []
+        },
+        "AF": {
+            "contains_pii": false,
+            "description": "Autofluorescence Microscopy",
+            "name": "AF",
+            "primary": true,
+            "vis_only": false,
+            "vitessce_hints": []
+        },
+        ...
+    }
+"""
+def get_assay_types():
+    global _ontology_api_url
+
+    target_url = _ontology_api_url + SchemaConstants.ONTOLOGY_API_ASSAY_TYPES_ENDPOINT
+
+    # Fetch via make_request_get(), which (per the original note) uses Memcached to improve performance
+    response = make_request_get(target_url, internal_token_used = True)
+
+    # Invoke .raise_for_status(); an HTTPError is raised for error status codes, so only non-error responses continue
+    response.raise_for_status()
+
+    if response.status_code == 200:
+        assay_types_by_name = {}
+        result_dict = response.json()
+
+        # The JSON result is an envelope; the list of assay type dicts sits under its 'result' key
+        assay_types_list = result_dict['result']
+        for assay_type_dict in assay_types_list:
+            assay_types_by_name[assay_type_dict['name']] = assay_type_dict  # index each dict by its 'name'
+
+        return assay_types_by_name
+    else:
+        # NOTE(review): no exception is being handled here, so logger.exception() has no stack trace to attach; logger.error() may be intended
+        logger.exception("Unable to make a request to query the assay types via ontology-api")
+
+        logger.debug("======get_assay_types() status code from ontology-api======")
+        logger.debug(response.status_code)
+
+        logger.debug("======get_assay_types() response text from ontology-api======")
+        logger.debug(response.text)
+
+        # Bubble up the response text from ontology-api to the caller as a RequestException
+        raise requests.exceptions.RequestException(response.text)
+
+
 ####################################################################################################
 ## Internal functions
 ####################################################################################################

diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py
@@ -442,24 +442,8 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid):
     donor_metadata = None
 
     with neo4j_driver.session() as session:
-        # Old time-consuming single query, it takes a significant amounts of DB hits
-        # query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) "
-        #          f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
-        #          f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata")
-
-        # logger.info("======get_dataset_organ_and_donor_info() query======")
-        # logger.info(query)
-
-        # with neo4j_driver.session() as session:
-        #     record = session.read_transaction(execute_readonly_tx, query)
-
-        #     if record:
-        #         organ_name = record['organ_name']
-        #         donor_metadata = record['donor_metadata']
-
         # To improve the query performance, we implement the two-step queries to drastically reduce the DB hits
         sample_query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample) "
-                        # specimen_type -> sample_category 12/15/2022
                         f"WHERE e.uuid='{uuid}' AND s.sample_category='organ' AND EXISTS(s.organ) "
                         f"RETURN DISTINCT s.organ AS organ_name, s.uuid AS sample_uuid")
 
@@ -473,7 +457,6 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid):
             sample_uuid = sample_record['sample_uuid']
 
             donor_query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(a:Activity)<-[:ACTIVITY_INPUT]-(d:Donor) "
-                           # specimen_type -> sample_category 12/15/2022
                            f"WHERE s.uuid='{sample_uuid}' AND s.sample_category='organ' AND EXISTS(s.organ) "
                            f"RETURN DISTINCT d.metadata AS donor_metadata")