Skip to content

Commit

Permalink
Merge branch 'yuanzhou/yaml-to-ubkg' of http://github.com/hubmapconso…
Browse files Browse the repository at this point in the history
…rtium/entity-api into kburke/addDatasetTypeAttribute
  • Loading branch information
Karl Burke committed Dec 5, 2023
2 parents 8851db0 + ebfecad commit 2258dad
Show file tree
Hide file tree
Showing 11 changed files with 194 additions and 566 deletions.
53 changes: 0 additions & 53 deletions entity-api-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -406,59 +406,6 @@ components:
- section
- suspension
description: "A code representing the type of specimen. Must be an organ, block, section, or suspension"
specimen_type:
type: string
enum:
- atacseq
- biopsy
- blood
- cell_lysate
- clarity_hydrogel
- codex
- cryosections_curls_from_fresh_frozen_oct
- cryosections_curls_rnalater
- ffpe_block
- ffpe_slide
- fixed_frozen_section_slide
- fixed_tissue_piece
- flash_frozen_liquid_nitrogen
- formalin_fixed_oct_block
- fresh_frozen_oct_block
- fresh_frozen_section_slide
- fresh_frozen_tissue
- fresh_frozen_tissue_section
- fresh_tissue
- frozen_cell_pellet_buffy_coat
- gdna
- module
- nuclei
- nuclei_rnalater
- organ
- organ_piece
- other
- pbmc
- pfa_fixed_frozen_oct_block
- plasma
- protein
- ran_poly_a_enriched
- rna_total
- rnalater_treated_and_stored
- rnaseq
- scatacseq
- scrnaseq
- segment
- seqfish
- sequence_library
- serum
- single_cell_cryopreserved
- snatacseq
- snrnaseq
- tissue_lysate
- wgs
description: "DEPRECATED: No longer a required field. A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)"
specimen_type_other:
type: string
description: "The user provided sample type if the 'other' sample_type is chosen."
protocol_url:
type: string
description: "The protocols.io doi url pointing to the protocol under which the sample was obtained and/or prepared."
Expand Down
201 changes: 60 additions & 141 deletions src/app.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions src/instance/app.cfg.example
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ UUID_API_URL = 'http://uuid-api:8080'
# Works regardless of the trailing slash
INGEST_API_URL = 'https://ingest-api.dev.hubmapconsortium.org'

# URL for talking to Ontology API (default for DEV)
# Works regardless of the trailing slash
ONTOLOGY_API_URL = 'https://ontology-api.dev.hubmapconsortium.org'

# A list of URLs for talking to multiple Search API instances (default value used for docker deployment, no token needed)
# Works regardless of the trailing slash /
SEARCH_API_URL_LIST = ['http://search-api:8080']
Expand Down
28 changes: 0 additions & 28 deletions src/schema/provenance_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -909,33 +909,6 @@ ENTITIES:
- validate_sample_category
before_property_update_validators:
- validate_sample_category

# No longer required on create, specimen_type -> sample_category 12/15/2022
specimen_type:
type: string
#required_on_create: true # Only required for create via POST, not update via PUT
description: "A code representing the type of specimen. Must be one of the codes specified in: [tissue sample types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml)"
# Validate the given value against the definitions: https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml
# Disabled validation 12/15/2022
# before_property_create_validators:
# - validate_specimen_type
# before_property_update_validators:
# - validate_specimen_type
specimen_type_other:
type: string
description: "The user provided sample type if the 'other' sample_type is chosen."


# specimen_type no longer required on create, will remove this field when removing specimen_type
# Simply always set to 'Unknown' and no need to update 12/15/2022
tissue_type:
type: string
generated: true # Can not be updated via the PUT
#auto_update: true # Will always update automatically if the entity gets updated
description: 'The type of the tissue based on the mapping between type (Block/Section/Suspension) and the specimen_type, default is Unknown'
before_create_trigger: set_tissue_type
#before_update_trigger: set_tissue_type

portal_metadata_upload_files:
type: json_string
description: "A list of relative paths to metadata files"
Expand All @@ -961,7 +934,6 @@ ENTITIES:
immutable: true
description: "The displayname of globus group which the user who created this entity is a member of"
before_create_trigger: set_group_name
# Should be required on create only when specimen_type==organ
organ:
type: string
description: "Organ code specifier, only set if sample_type == organ. Valid values found in: [organ types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/organ_types.yaml)"
Expand Down
13 changes: 5 additions & 8 deletions src/schema/schema_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,20 @@
class SchemaConstants(object):
MEMCACHED_TTL = 7200

# Constants used by validators
INGEST_API_APP = 'ingest-api'
INGEST_PIPELINE_APP = 'ingest-pipeline'
HUBMAP_APP_HEADER = 'X-Hubmap-Application'
DATASET_STATUS_PUBLISHED = 'published'

# Used by triggers, all lowercase for easy comparison
ACCESS_LEVEL_PUBLIC = 'public'
ACCESS_LEVEL_CONSORTIUM = 'consortium'
ACCESS_LEVEL_PROTECTED = 'protected'

# Yaml file to parse organ description
ORGAN_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/organ_types.yaml'
ASSAY_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/assay_types.yaml'

# For generating Sample.tissue_type
TISSUE_TYPES_YAML = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/tissue_sample_types.yaml'
UUID_API_ID_ENDPOINT = '/uuid'
INGEST_API_FILE_COMMIT_ENDPOINT = '/file-commit'
INGEST_API_FILE_REMOVE_ENDPOINT = '/file-remove'
ONTOLOGY_API_ASSAY_TYPES_ENDPOINT = '/assaytype?application_context=HUBMAP'
ONTOLOGY_API_ORGAN_TYPES_ENDPOINT = '/organs/by-code?application_context=HUBMAP'

DOI_BASE_URL = 'https://doi.org/'

Expand Down
119 changes: 114 additions & 5 deletions src/schema/schema_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,11 @@
valid_yaml_file : file
A valid yaml file
uuid_api_url : str
The uuid-api URL
The uuid-api base URL
ingest_api_url : str
The ingest-api URL
The ingest-api base URL
ontology_api_url : str
The ontology-api base URL
auth_helper_instance : AuthHelper
The auth helper instance
neo4j_driver_instance : neo4j_driver
Expand Down Expand Up @@ -1232,7 +1234,7 @@ def get_user_info(request):
def get_hubmap_ids(id):
global _uuid_api_url

target_url = _uuid_api_url + '/uuid/' + id
target_url = _uuid_api_url + SchemaConstants.UUID_API_ID_ENDPOINT + '/' + id

# Use Memcached to improve performance
response = make_request_get(target_url, internal_token_used = True)
Expand Down Expand Up @@ -1457,7 +1459,6 @@ def create_hubmap_ids(normalized_class, json_data_dict, user_token, user_info_di
parent_id = json_data_dict['direct_ancestor_uuid']
json_to_post['parent_ids'] = [parent_id]

# specimen_type -> sample_category 12/15/2022
# 'Sample.sample_category' is marked as `required_on_create` in the schema yaml
if json_data_dict['sample_category'].lower() == 'organ':
# The 'organ' field containing the 2 digit organ code is required in this case
Expand All @@ -1475,7 +1476,7 @@ def create_hubmap_ids(normalized_class, json_data_dict, user_token, user_info_di
logger.info(json_to_post)

# Disable ssl certificate verification
target_url = _uuid_api_url + '/uuid'
target_url = _uuid_api_url + SchemaConstants.UUID_API_ID_ENDPOINT
response = requests.post(url = target_url, headers = request_headers, json = json_to_post, verify = False, params = query_parms)

# Invoke .raise_for_status(), an HTTPError will be raised with certain status codes
Expand Down Expand Up @@ -1874,6 +1875,114 @@ def delete_memcached_cache(uuids_list):
logger.info(f"Deleted cache by key: {', '.join(cache_keys)}")


"""
Retrieve the organ types from ontology-api

Returns
-------
dict
    The available organ types in the following format:
    {
        "AO": "Aorta",
        "BD": "Blood",
        "BL": "Bladder",
        "BM": "Bone Marrow",
        "BR": "Brain",
        "HT": "Heart",
        ...
    }

Raises
------
requests.exceptions.HTTPError
    Raised by `.raise_for_status()` for the status codes it treats as errors
requests.exceptions.RequestException
    Raised with the ontology-api response text for any other non-200 status
"""
def get_organ_types():
    global _ontology_api_url

    target_url = _ontology_api_url + SchemaConstants.ONTOLOGY_API_ORGAN_TYPES_ENDPOINT

    # Use Memcached to improve performance
    response = make_request_get(target_url, internal_token_used = True)

    # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes
    response.raise_for_status()

    if response.status_code == 200:
        return response.json()

    # Only reached for a non-200 status that .raise_for_status() did not reject.
    # There is no active exception at this point, so logger.exception() would only
    # append a meaningless "NoneType: None" trace; log a plain error instead.
    logger.error("Unable to make a request to query the organ types via ontology-api")

    logger.debug("======get_organ_types() status code from ontology-api======")
    logger.debug(response.status_code)

    logger.debug("======get_organ_types() response text from ontology-api======")
    logger.debug(response.text)

    # Also bubble up the error message from ontology-api
    raise requests.exceptions.RequestException(response.text)


"""
Retrieve the assay types from ontology-api

Returns
-------
dict
    The available assay types by name in the following format:
    {
        "10x-multiome": {
            "contains_pii": true,
            "description": "10x Multiome",
            "name": "10x-multiome",
            "primary": true,
            "vis_only": false,
            "vitessce_hints": []
        },
        "AF": {
            "contains_pii": false,
            "description": "Autofluorescence Microscopy",
            "name": "AF",
            "primary": true,
            "vis_only": false,
            "vitessce_hints": []
        },
        ...
    }

Raises
------
requests.exceptions.HTTPError
    Raised by `.raise_for_status()` for the status codes it treats as errors
requests.exceptions.RequestException
    Raised with the ontology-api response text for any other non-200 status
"""
def get_assay_types():
    global _ontology_api_url

    target_url = _ontology_api_url + SchemaConstants.ONTOLOGY_API_ASSAY_TYPES_ENDPOINT

    # Use Memcached to improve performance
    response = make_request_get(target_url, internal_token_used = True)

    # Invoke .raise_for_status(), an HTTPError will be raised with certain status codes
    response.raise_for_status()

    if response.status_code == 200:
        # The list of assay types is wrapped in a json envelope under the 'result' key
        assay_types_list = response.json()['result']

        # Index each assay type dict by its 'name' (a later duplicate name wins)
        return {assay_type_dict['name']: assay_type_dict for assay_type_dict in assay_types_list}

    # Only reached for a non-200 status that .raise_for_status() did not reject.
    # There is no active exception at this point, so logger.exception() would only
    # append a meaningless "NoneType: None" trace; log a plain error instead.
    logger.error("Unable to make a request to query the assay types via ontology-api")

    logger.debug("======get_assay_types() status code from ontology-api======")
    logger.debug(response.status_code)

    logger.debug("======get_assay_types() response text from ontology-api======")
    logger.debug(response.text)

    # Also bubble up the error message from ontology-api
    raise requests.exceptions.RequestException(response.text)


####################################################################################################
## Internal functions
####################################################################################################
Expand Down
17 changes: 0 additions & 17 deletions src/schema/schema_neo4j_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,24 +442,8 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid):
donor_metadata = None

with neo4j_driver.session() as session:
# Old time-consuming single query, it takes a significant amounts of DB hits
# query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) "
# f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
# f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata")

# logger.info("======get_dataset_organ_and_donor_info() query======")
# logger.info(query)

# with neo4j_driver.session() as session:
# record = session.read_transaction(execute_readonly_tx, query)

# if record:
# organ_name = record['organ_name']
# donor_metadata = record['donor_metadata']

# To improve the query performance, we implement the two-step queries to drastically reduce the DB hits
sample_query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample) "
# specimen_type -> sample_category 12/15/2022
f"WHERE e.uuid='{uuid}' AND s.sample_category='organ' AND EXISTS(s.organ) "
f"RETURN DISTINCT s.organ AS organ_name, s.uuid AS sample_uuid")

Expand All @@ -473,7 +457,6 @@ def get_dataset_organ_and_donor_info(neo4j_driver, uuid):
sample_uuid = sample_record['sample_uuid']

donor_query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(a:Activity)<-[:ACTIVITY_INPUT]-(d:Donor) "
# specimen_type -> sample_category 12/15/2022
f"WHERE s.uuid='{sample_uuid}' AND s.sample_category='organ' AND EXISTS(s.organ) "
f"RETURN DISTINCT d.metadata AS donor_metadata")

Expand Down
Loading

0 comments on commit 2258dad

Please sign in to comment.