Merge pull request #588 from sennetconsortium/maxsibilla/issue-574
Porting over UBKG assaytype/rulechain support
maxsibilla authored Nov 20, 2024
2 parents f194ec8 + d0e9ee5 commit 2dbf07c
Showing 5 changed files with 223 additions and 142 deletions.
4 changes: 4 additions & 0 deletions src/instance/app.cfg.example
@@ -107,5 +107,9 @@ UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'
UBKG_ENDPOINT_VALUESET = 'valueset?parent_sab=SENNET&parent_code={code}&child_sabs=SENNET'
UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":{"code": "C000008", "key": "organs", "endpoint": "organs?application_context=SENNET"}, "entities": "C000012", "source_types":"C050020"}'

# UBKG Integration Configs for Rule Chain
UBKG_INTEGRATION_ENDPOINT = 'http://gateway.dev.hubmapconsortium.org:8181/'
APPLICATION_CONTEXT = 'SENNET'

# URI from which to load the assay classifier rules.
RULE_CHAIN_URI = 'https://raw.githubusercontent.com/sennetconsortium/ingest-api/main/src/routes/assayclassifier/testing_rule_chain.json'
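
For reference, the two new settings are combined by get_data_from_ubkg() (added in src/lib/rule_chain.py below) into the UBKG assayclasses request URL. A minimal sketch of that construction, using a made-up placeholder code "C200150":

import urllib.parse

UBKG_INTEGRATION_ENDPOINT = "http://gateway.dev.hubmapconsortium.org:8181/"
APPLICATION_CONTEXT = "SENNET"

# Build the same URL shape that get_data_from_ubkg() requests below
query = urllib.parse.urlencode({"application_context": APPLICATION_CONTEXT})
url = f"{UBKG_INTEGRATION_ENDPOINT}assayclasses/C200150?{query}"
# -> http://gateway.dev.hubmapconsortium.org:8181/assayclasses/C200150?application_context=SENNET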
79 changes: 74 additions & 5 deletions src/lib/rule_chain.py
@@ -15,9 +15,41 @@
SCHEMA_FILE = "rule_chain_schema.json"
SCHEMA_BASE_URI = "http://schemata.hubmapconsortium.org/"


rule_chain = None

# Have to translate pre-UBKG keys to UBKG keys
# Format is:
# "Key before UBKG integration": "UBKG Key"
pre_integration_to_ubkg_translation = {
'vitessce-hints': 'vitessce_hints',
'dir-schema': 'dir_schema',
'tbl-schema': 'tbl_schema',
'contains-pii': 'contains_full_genetic_sequences',
'dataset-type': 'dataset_type',
'is-multi-assay': 'is_multiassay',
'pipeline-shorthand': 'pipeline_shorthand',
'must-contain': 'must_contain',
}

# These are the keys returned by the rule chain before UBKG integration.
# We will return the UBKG data in this format as well for the MVP.
# This is to avoid too much churn for end-users.
# We set primary manually so ignore it.
pre_integration_keys = [
'assaytype',
'vitessce-hints',
'dir-schema',
'tbl-schema',
'contains-pii',
# 'primary',
'dataset-type',
'description',
'is-multi-assay',
'pipeline-shorthand',
'must-contain',
"process_state"
]
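
A key with no entry in the translation dict falls through unchanged; standardize_results() below relies on this via a .get(key, key) lookup. For example:

pre_integration_to_ubkg_translation.get('contains-pii', 'contains-pii')  # 'contains_full_genetic_sequences'
pre_integration_to_ubkg_translation.get('assaytype', 'assaytype')        # 'assaytype' (no translation needed)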


def initialize_rule_chain():
"""Initialize the rule chain from the source URI.
@@ -79,9 +111,9 @@ def calculate_data_types(entity: Entity) -> list[str]:
# the data_types field is not empty and not a list of empty strings
# If it has a value it must be an old derived dataset so use that to match the rules
if (
    hasattr(entity, "data_types")
    and entity.data_types
    and set(entity.data_types) != {""}
):
data_types = entity.data_types
# Moving forward (2024) we are no longer using data_types for derived datasets.
@@ -134,7 +166,7 @@ def build_entity_metadata(entity: Union[Entity, dict]) -> dict:
# The primary publication will always have metadata,
# so we have to do the association here.
if entity.entity_type == "Publication":
metadata["data_types"] = calculate_data_types(entity)
metadata["data_types"] = calculate_data_types(entity)

# If there is no metadata, then it must be a derived dataset
else:
@@ -150,6 +182,43 @@ def build_entity_metadata(entity: Union[Entity, dict]) -> dict:
return metadata


def apply_source_type_transformations(source_type: str, rule_value_set: dict) -> dict:
# If we get more complicated transformations we should consider refactoring.
# For now, this should suffice.
if source_type.upper() == "MOUSE":
rule_value_set["contains-pii"] = False

return rule_value_set
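
A quick illustration of the override; the input dict here is hypothetical:

rules = {"assaytype": "rnaseq", "contains-pii": True}
apply_source_type_transformations("Mouse", rules)
# rules["contains-pii"] is now False; the dict is mutated in place and also returned.
# Any other source type passes through unchanged.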


def get_data_from_ubkg(ubkg_code: str) -> dict:
query = urllib.parse.urlencode({"application_context": current_app.config['APPLICATION_CONTEXT']})
ubkg_api_url = f"{current_app.config['UBKG_INTEGRATION_ENDPOINT']}assayclasses/{ubkg_code}?{query}"
req = urllib.request.Request(ubkg_api_url)
try:
with urllib.request.urlopen(req) as response:
response_data = response.read().decode("utf-8")
except urllib.error.URLError as excp:
print(f"Error getting extra info from UBKG {excp}")
return {}

return json.loads(response_data)
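
On a failed lookup the function returns {}, so callers can safely chain .get("value", {}), as the routes below do. A usage sketch, again with a placeholder code:

assay_class = get_data_from_ubkg("C200150")  # placeholder; real codes come from rules_json["ubkg_code"]
value = assay_class.get("value", {})         # {} when the request failed or nothing matched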


def standardize_results(rule_chain_json: dict, ubkg_json: dict) -> dict:
# Initialize this with conditional logic to set 'primary' true or false.
ubkg_transformed_json = {
"primary": ubkg_json.get("process_state") == "primary"
}

for pre_integration_key in pre_integration_keys:
ubkg_key = pre_integration_to_ubkg_translation.get(pre_integration_key, pre_integration_key)
ubkg_value = ubkg_json.get(ubkg_key)
ubkg_transformed_json[pre_integration_key] = ubkg_value

return rule_chain_json | ubkg_transformed_json
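
A worked example with assumed payloads. Because the right-hand side of the dict union wins, UBKG values override same-named rule-chain keys, and any key absent from the UBKG payload comes back as None:

rule_chain_json = {"assaytype": "rnaseq", "ubkg_code": "C200150"}
ubkg_json = {"assaytype": "rnaseq", "process_state": "primary",
             "contains_full_genetic_sequences": True}
merged = standardize_results(rule_chain_json, ubkg_json)
# merged["primary"] is True, merged["contains-pii"] is True,
# and keys ubkg_json lacks (e.g. "dir-schema") are None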


class NoMatchException(Exception):
pass

26 changes: 17 additions & 9 deletions src/routes/assayclassifier/__init__.py
@@ -16,6 +16,8 @@
build_entity_metadata,
calculate_assay_info,
initialize_rule_chain,
get_data_from_ubkg,
standardize_results
)
from lib.services import get_entity

@@ -30,7 +32,7 @@ def get_ds_assaytype(ds_uuid: str):
token = get_token()
entity = get_entity(ds_uuid, token)
metadata = build_entity_metadata(entity)
rule_value_set = calculate_assay_info(metadata)
rules_json = calculate_assay_info(metadata)

if sources := entity.sources:
source_type = ""
@@ -39,9 +41,12 @@
# If there is a single Human source_type, treat this as a Human case
if source_type.upper() == "HUMAN":
break
apply_source_type_transformations(source_type, rule_value_set)
apply_source_type_transformations(source_type, rules_json)

return jsonify(rule_value_set)
ubkg_value_json = get_data_from_ubkg(rules_json.get("ubkg_code")).get("value", {})
merged_json = standardize_results(rules_json, ubkg_value_json)
merged_json["ubkg_json"] = ubkg_value_json
return jsonify(merged_json)
except ValueError as excp:
logger.error(excp, exc_info=True)
return Response("Bad parameter: {excp}", 400)
@@ -97,21 +102,21 @@ def get_ds_rule_metadata(ds_uuid: str):
)


def apply_source_type_transformations(source_type: str, rule_value_set: dict) -> dict:
def apply_source_type_transformations(source_type: str, rules_json: dict) -> dict:
# If we get more complicated transformations we should consider refactoring.
# For now, this should suffice.
if "MOUSE" in source_type.upper():
rule_value_set["contains-pii"] = False
rules_json["contains-pii"] = False

return rule_value_set
return rules_json


@assayclassifier_blueprint.route("/assaytype", methods=["POST"])
@require_valid_token()
@require_json(param="metadata")
def get_assaytype_from_metadata(token: str, user: User, metadata: dict):
try:
rule_value_set = calculate_assay_info(metadata)
rules_json = calculate_assay_info(metadata)

if parent_sample_ids := metadata.get("parent_sample_id"):
source_type = ""
@@ -123,8 +128,11 @@ def get_assaytype_from_metadata(token: str, user: User, metadata: dict):
if source_type.upper() == "HUMAN":
break

apply_source_type_transformations(source_type, rule_value_set)
return jsonify(rule_value_set)
apply_source_type_transformations(source_type, rules_json)
ubkg_value_json = get_data_from_ubkg(rules_json.get("ubkg_code")).get("value", {})
merged_json = standardize_results(rules_json, ubkg_value_json)
merged_json["ubkg_json"] = ubkg_value_json
return jsonify(merged_json)
except ResponseException as re:
logger.error(re, exc_info=True)
return re.response
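
With these changes, both endpoints return the merged rule-chain/UBKG payload plus the raw UBKG value under "ubkg_json". A usage sketch, assuming the GET route is mounted at /assaytype/<ds_uuid> and that host, UUID, and token are illustrative:

import requests

resp = requests.get(
    "https://ingest.api.sennetconsortium.org/assaytype/<ds_uuid>",  # illustrative host and dataset UUID
    headers={"Authorization": "Bearer <token>"},
)
info = resp.json()
print(info["assaytype"], info["primary"], info["ubkg_json"])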