Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve automated Zooma mappings #411

Merged
merged 4 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions cmat/trait_mapping/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
from collections import Counter

from cmat.clinvar_xml_io import ClinVarTrait
from cmat.trait_mapping.ols import get_uri_from_exact_match
from cmat.trait_mapping.output import output_trait
from cmat.trait_mapping.oxo import get_oxo_results
from cmat.trait_mapping.oxo import uris_to_oxo_format
from cmat.trait_mapping.trait import Trait
from cmat.trait_mapping.trait import Trait, OntologyEntry
from cmat.trait_mapping.trait_names_parsing import parse_trait_names
from cmat.trait_mapping.zooma import get_zooma_results

Expand All @@ -32,8 +33,9 @@ def get_uris_for_oxo(zooma_result_list: list) -> set:

def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: list, oxo_distance: int, target_ontology: str = 'EFO') -> Trait:
"""
Process a single trait. Find any mappings in Zooma. If there are no high confidence Zooma
mappings that are in EFO then query OxO with any high confidence mappings not in EFO.
Process a single trait. First look for an exact string match in the target ontology and return immediately if found.
Otherwise find any mappings in Zooma. If there are no high confidence Zooma mappings that are in EFO then query OxO
with any high confidence mappings not in EFO.

:param trait: The trait to be processed.
:param filters: A dictionary of filters to use for querying Zooma.
Expand All @@ -47,7 +49,12 @@ def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list:
"""
logger.debug('Processing trait {}'.format(trait.name))

trait.zooma_result_list = get_zooma_results(trait.name, filters, zooma_host, target_ontology)
string_match_uri = get_uri_from_exact_match(trait.name.lower(), target_ontology)
if string_match_uri:
trait.finished_mapping_set.add(OntologyEntry(string_match_uri, trait.name.lower()))
return trait

trait.zooma_result_list = get_zooma_results(trait.name.lower(), filters, zooma_host, target_ontology)
trait.process_zooma_results()
if (trait.is_finished
or len(trait.zooma_result_list) == 0
Expand Down
29 changes: 29 additions & 0 deletions cmat/trait_mapping/ols.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from functools import lru_cache
import logging
import requests
Expand Down Expand Up @@ -121,3 +122,31 @@ def get_replacement_term(uri: str, ontology: str = 'EFO') -> str:
if response_json["term_replaced_by"] is not None:
return response_json["term_replaced_by"]
return ""


@lru_cache(maxsize=16384)
@retry(exceptions=(ConnectionError, requests.RequestException), tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def get_uri_from_exact_match(text, ontology='EFO'):
    """
    Finds URI from target ontology for a given text based on exact string match.

    The query is passed to ``requests`` as parameters rather than being formatted into the URL, so that
    trait names containing reserved characters (``&``, ``#``, ``+``, ``=`` ...) are percent-encoded
    instead of corrupting the query string.

    :param text: String to search for
    :param ontology: ID of target ontology to query (default EFO)
    :return: URI of matching term or None if not found (no match, or more than one distinct IRI matches)
    :raises requests.HTTPError: from ``raise_for_status`` on an error response, if it is still failing
                                once the retry decorator stops retrying
    """
    # URLs always use forward slashes, so join explicitly rather than with os.path.join
    search_url = f"{OLS_SERVER.rstrip('/')}/api/search"
    query_params = {
        'ontology': ontology,
        'q': text,
        'queryFields': 'label',
        'exact': 'true',
    }
    response = requests.get(search_url, params=query_params)
    response.raise_for_status()
    data = response.json()

    wanted_label = text.lower()
    candidates = set()
    # A payload without 'response' or 'docs' is treated the same as an empty result list
    for doc in data.get('response', {}).get('docs', []):
        label = doc.get('label')
        # Check that we've found the term exactly (strict case-insensitive string match)
        if isinstance(label, str) and label.lower() == wanted_label:
            candidates.add(doc['iri'])

    # Only return a result if we can find it unambiguously
    if len(candidates) == 1:
        return candidates.pop()
    logger.warning(f'Could not find an IRI for {text}')
    return None
11 changes: 5 additions & 6 deletions cmat/trait_mapping/trait.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,11 @@ def process_zooma_results(self):
Check whether any Zooma mappings can be output as a finished ontology mapping.
Put any finished mappings in finished_mapping_set
"""
for mapping in self.zooma_result_list:
if mapping.confidence.lower() != "high":
continue

for mapping in mapping.mapping_list:
if mapping.in_ontology and mapping.is_current:
for zooma_result in self.zooma_result_list:
for mapping in zooma_result.mapping_list:
# Accept current mappings in the target ontology with either high-confidence or exact string matches
if mapping.in_ontology and mapping.is_current and (zooma_result.confidence.lower() == "high"
or zooma_result.zooma_label.lower() == self.name.lower()):
ontology_entry = OntologyEntry(mapping.uri, mapping.ontology_label)
self.finished_mapping_set.add(ontology_entry)

Expand Down
3 changes: 2 additions & 1 deletion tests/pipelines/resources/expected/consequences_snp.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@
17:65558594:G:A ENSG00000168646 AXIN2 synonymous_variant
17:6556374:C:T ENSG00000091622 PITPNM3 intron_variant
17:6628328:A:G ENSG00000198920 KIAA0753 synonymous_variant
17:6628328:A:G ENSG00000282936 3_prime_UTR_variant
17:6693178:A:AACACACACACAC ENSG00000141485 SLC13A5 splice_polypyrimidine_tract_variant
17:7224245:T:C ENSG00000072778 ACADVL splice_donor_variant
17:73201307:A:G ENSG00000166685 COG1 missense_variant
Expand Down Expand Up @@ -900,7 +901,7 @@
4:120785105:A:G ENSG00000138738 PRDM5 splice_polypyrimidine_tract_variant
4:121847458:G:T ENSG00000138686 BBS7 missense_variant
4:121853086:C:T ENSG00000138686 BBS7 missense_variant
4:122979387:G:T ENSG00000145375 SPATA5 splice_donor_variant
4:122979387:G:T ENSG00000145375 AFG2A splice_donor_variant
4:127881908:G:T ENSG00000142731 PLK4 missense_variant
4:127930737:G:T ENSG00000164073 MFSD8 missense_variant
4:128959797:C:A ENSG00000151466 SCLT1 intron_variant
Expand Down
1 change: 1 addition & 0 deletions tests/pipelines/resources/expected/evidence_strings.json
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@
{"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["uncertain significance"], "confidence": "criteria provided, single submitter", "studyId": "RCV001373139", "releaseDate": "2021-04-13", "targetFromSourceId": "ENSG00000163930", "variantFunctionalConsequenceId": "SO_0001583", "variantId": "3_52402628_G_A", "cohortPhenotypes": ["BAP1 tumor predisposition syndrome", "BAP1-related tumor predisposition syndrome", "Tumor predisposition syndrome", "Tumor susceptibility linked to germline BAP1 mutations"], "diseaseFromSource": "BAP1-related tumor predisposition syndrome", "diseaseFromSourceId": "C3280492", "diseaseFromSourceMappedId": "MONDO_0013692", "variantHgvsId": "NC_000003.12:g.52402628G>A"}
{"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["uncertain significance"], "confidence": "criteria provided, single submitter", "studyId": "RCV001373139", "releaseDate": "2021-04-13", "targetFromSourceId": "ENSG00000163930", "variantFunctionalConsequenceId": "SO_0001583", "variantId": "3_52402628_G_A", "cohortPhenotypes": ["BAP1 tumor predisposition syndrome", "BAP1-related tumor predisposition syndrome", "Tumor predisposition syndrome", "Tumor susceptibility linked to germline BAP1 mutations"], "diseaseFromSource": "BAP1-related tumor predisposition syndrome", "diseaseFromSourceId": "C3280492", "diseaseFromSourceMappedId": "Orphanet_289539", "variantHgvsId": "NC_000003.12:g.52402628G>A"}
{"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["benign"], "confidence": "criteria provided, single submitter", "studyId": "RCV001730858", "releaseDate": "2022-02-20", "targetFromSourceId": "ENSG00000198920", "variantFunctionalConsequenceId": "SO_0001819", "variantId": "17_6628328_A_G", "cohortPhenotypes": ["Joubert syndrome 38"], "diseaseFromSource": "Joubert syndrome 38", "diseaseFromSourceId": "C5561958", "variantHgvsId": "NC_000017.11:g.6628328A>G"}
{"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["benign"], "confidence": "criteria provided, single submitter", "studyId": "RCV001730858", "releaseDate": "2022-02-20", "targetFromSourceId": "ENSG00000282936", "variantFunctionalConsequenceId": "SO_0001624", "variantId": "17_6628328_A_G", "cohortPhenotypes": ["Joubert syndrome 38"], "diseaseFromSource": "Joubert syndrome 38", "diseaseFromSourceId": "C5561958", "variantHgvsId": "NC_000017.11:g.6628328A>G"}
{"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["likely benign"], "confidence": "criteria provided, single submitter", "studyId": "RCV002057147", "releaseDate": "2022-06-09", "targetFromSourceId": "ENSG00000115904", "variantFunctionalConsequenceId": "SO_0001627", "variantId": "2_39120324_C_A", "variantRsId": "rs368569135", "cohortPhenotypes": ["Noonan spectrum disorder", "RASopathy", "rasopathies"], "diseaseFromSource": "RASopathy", "diseaseFromSourceId": "C5555857", "diseaseFromSourceMappedId": "EFO_1001502", "variantHgvsId": "NC_000002.12:g.39120324C>A"}
{"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["pathogenic"], "confidence": "criteria provided, single submitter", "studyId": "RCV002247286", "releaseDate": "2022-06-09", "targetFromSourceId": "ENSG00000198712", "variantFunctionalConsequenceId": "SO_0001631", "variantId": "MT_7512_T_C", "variantRsId": "rs199474817", "cohortPhenotypes": ["COX deficiency", "Complex 4 mitochondrial respiratory chain deficiency", "Complex IV deficiency", "Cytochrome-c oxidase deficiency", "Cytochrome-c oxidase deficiency disease", "Deficiency of mitochondrial respiratory chain complex4", "MITOCHONDRIAL COMPLEX IV DEFICIENCY, NUCLEAR TYPE 1", "Mitochondrial complex IV deficiency"], "diseaseFromSource": "Cytochrome-c oxidase deficiency disease", "diseaseFromSourceId": "C5435656", "diseaseFromSourceMappedId": "MONDO_0009068", "variantHgvsId": "NC_012920.1:m.7512T>C"}
{"alleleOrigins": ["germline"], "datasourceId": "eva", "datatypeId": "genetic_association", "clinicalSignificances": ["pathogenic"], "confidence": "criteria provided, single submitter", "studyId": "RCV002247286", "releaseDate": "2022-06-09", "targetFromSourceId": "ENSG00000198786", "variantFunctionalConsequenceId": "SO_0001631", "variantId": "MT_7512_T_C", "variantRsId": "rs199474817", "cohortPhenotypes": ["COX deficiency", "Complex 4 mitochondrial respiratory chain deficiency", "Complex IV deficiency", "Cytochrome-c oxidase deficiency", "Cytochrome-c oxidase deficiency disease", "Deficiency of mitochondrial respiratory chain complex4", "MITOCHONDRIAL COMPLEX IV DEFICIENCY, NUCLEAR TYPE 1", "Mitochondrial complex IV deficiency"], "diseaseFromSource": "Cytochrome-c oxidase deficiency disease", "diseaseFromSourceId": "C5435656", "diseaseFromSourceMappedId": "MONDO_0009068", "variantHgvsId": "NC_012920.1:m.7512T>C"}
Expand Down
27 changes: 26 additions & 1 deletion tests/trait_mapping/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

import pytest

from cmat.trait_mapping.main import parse_traits, process_traits
from cmat.trait_mapping.main import parse_traits, process_traits, process_trait
from cmat.trait_mapping.trait import Trait


def get_test_resource(resource_name):
Expand Down Expand Up @@ -62,3 +63,27 @@ def test_main():
mapped_terms = {x[0] for x in output_mappings}
curation_terms = {x[0] for x in output_curation}
assert len(mapped_terms) + len(curation_terms) == len(all_terms)


def test_process_trait_exact_match():
    """
    Check that an exact string match only finishes a trait when the match is in the requested target ontology.
    """
    # Exact match with MONDO:0009061 (in EFO and Mondo)
    name = 'Cystic Fibrosis'

    # Arguments shared by both calls: our default Zooma filters, the Zooma host, and OxO switched off
    # (no targets, distance 0)
    default_filters = {'ontologies': 'efo,ordo,hp,mondo',
                       'required': 'cttv,eva-clinvar,clinvar-xrefs,gwas',
                       'preferred': 'eva-clinvar,cttv,gwas,clinvar-xrefs'}
    shared_args = (default_filters, 'https://www.ebi.ac.uk', [], 0)

    # Target ontology contains a term with exactly this label, so the trait should be marked as finished
    mapped_to_efo = process_trait(Trait(name, None, None), *shared_args, target_ontology='efo')
    assert mapped_to_efo.is_finished

    # Zooma finds an exact match in one of its ontologies, but not in the requested target ontology,
    # so the trait should not be marked as finished and still needs to be curated
    mapped_to_hp = process_trait(Trait(name, None, None), *shared_args, target_ontology='hp')
    assert not mapped_to_hp.is_finished
Loading