Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add broad/narrow curation to interface #158

Merged
merged 7 commits into from
Sep 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 34 additions & 35 deletions scripts/generate_vo_mesh_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import bioontologies
import gilda
import pyobo
import pyobo.gilda_utils
from bioontologies.obograph import Node
from tqdm import tqdm

from biomappings import PredictionTuple
Expand All @@ -14,39 +16,38 @@

def main():
"""Generate mappings from between VO and MeSH."""
mesh_grounder = pyobo.gilda_utils.get_grounder("mesh")
provenance = get_script_url(__file__)
graph = (
bioontologies.get_obograph_by_prefix(
"vo", check=False, json_path="/Users/cthoyt/Desktop/vo.json"
)
.guess("vo")
.standardize()
)
graph = bioontologies.get_obograph_by_prefix("vo", check=False).guess("vo").standardize()
rows = []
extracted_mesh = 0
for node in tqdm(graph.nodes, unit="node", unit_scale=True):
if not node.lbl or node.prefix != "vo":
if not node.name or node.prefix != "vo":
continue
if node.meta:
found_mesh = False
for p in node.meta.basicPropertyValues or []:
if p.pred_prefix == "rdfs" and p.pred_identifier == "seeAlso":
values = [value.strip().replace(" ", "") for value in p.val.strip().split(";")]
for p in node.meta.properties or []:
if not p.predicate:
continue
if p.predicate.curie == "rdfs:seeAlso":
values = [
value.strip().replace(" ", "") for value in p.value_raw.strip().split(";")
]
# print(node.luid, values)
for value in values:
# TODO this is place to extract oher mapping types
# TODO this is place to extract other mapping types
if not value.lower().startswith("mesh:"):
continue
mesh_id = value.split(":", 1)[1].strip()
mesh_name = pyobo.get_name("mesh", mesh_id)
if not mesh_name:
tqdm.write(f"No mesh name for vo:{node.luid} mapped to mesh:{mesh_id}")
tqdm.write(f"No mesh name for vo:{node.name} mapped to mesh:{mesh_id}")
continue
rows.append(
PredictionTuple(
"vo",
node.luid,
node.lbl,
node.prefix,
node.identifier,
node.name,
"skos:exactMatch",
"mesh",
mesh_id,
Expand All @@ -61,41 +62,39 @@ def main():
if found_mesh:
continue

_ground(node, rows, provenance)
_ground(mesh_grounder, node, rows, provenance)

append_prediction_tuples(rows)
print(f"extracted {extracted_mesh} mesh mappings. should be abount 65")
print(f"extracted {extracted_mesh} mesh mappings. should be about 65")


def _ground(node, rows, provenance):
texts = [node.lbl]
def _ground(grounder: gilda.Grounder, node: Node, rows, provenance):
texts = [node.name]
# VO doesn't store its synonyms using standard predicates, so look in
# IAO_0000118 (alternate label) values, and in IAO_0000116 (editor note)
# values that start with the string prefix "synonym:"
if node.meta:
for p in node.meta.basicPropertyValues or []:
if p.pred_prefix == "iao" and p.pred_identifier == "0000118":
texts.append(p.val)
if (
p.pred_prefix == "iao"
and p.pred_identifier == "0000116"
and p.val.startswith("synonym:")
):
texts.append(p.val.removeprefix("synonym:").strip())
for p in node.meta.properties or []:
if not p.predicate:
continue
if p.predicate.curie == "iao:0000118":
texts.append(p.value_raw)
elif p.predicate.curie == "iao:0000116" and p.value_raw.startswith("synonym:"):
texts.append(p.value_raw.removeprefix("synonym:").strip())

for text in [node.lbl, *(s.val for s in node.synonyms)]:
for scored_match in gilda.ground(text, namespaces=["MESH"]):
for text in [node.name, *(s.value for s in node.synonyms)]:
for scored_match in grounder.ground(text):
rows.append(
PredictionTuple(
"vo",
node.luid,
node.lbl,
node.prefix,
node.identifier,
node.name,
"skos:exactMatch",
scored_match.term.db.lower(),
scored_match.term.id,
scored_match.term.entry_name,
"semapv:LexicalMatching",
scored_match.score,
round(scored_match.score, 2),
provenance,
)
)
Expand Down
3 changes: 3 additions & 0 deletions src/biomappings/resources/incorrect.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -1406,6 +1406,7 @@ uberon UBERON:0012245 silk skos:exactMatch bto BTO:0002854 corn silk semapv:Manu
uberon UBERON:0022469 primary olfactory cortex skos:exactMatch mesh D066194 Olfactory Cortex semapv:ManualMappingCuration orcid:0000-0001-9439-5346
uberon UBERON:2001977 pad skos:exactMatch mesh D058729 Peripheral Arterial Disease semapv:ManualMappingCuration orcid:0000-0001-9439-5346
umls C0006142 Malignant neoplasm of breast skos:exactMatch mesh D001943 Breast Neoplasms semapv:ManualMappingCuration orcid:0000-0002-6601-2165
vo 0000189 colony forming unit skos:exactMatch mesh D013234 Stem Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/28ac41/scripts/generate_vo_mesh_mappings.py 0.54
vo 0004075 PBT skos:exactMatch mesh D001803 Blood Transfusion semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/4b2628/scripts/generate_vo_mesh_mappings.py 0.5555555555555556
vo 0004075 PBT skos:exactMatch mesh D013601 T-Lymphocytes semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/4b2628/scripts/generate_vo_mesh_mappings.py 0.5555555555555556
vo 0004075 PBT skos:exactMatch mesh D061766 Proton Therapy semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/4b2628/scripts/generate_vo_mesh_mappings.py 0.5555555555555556
Expand Down Expand Up @@ -1433,7 +1434,9 @@ vo 0010927 ORF skos:exactMatch mesh D004474 Ecthyma, Contagious semapv:ManualMap
vo 0010944 Eae skos:exactMatch mesh D004681 Encephalomyelitis, Autoimmune, Experimental semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/4b2628/scripts/generate_vo_mesh_mappings.py 0.5400948258091115
vo 0010971 SurA skos:exactMatch uberon UBERON:0003823 hindlimb zeugopod semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching mira 0.5209235209235209
vo 0010988 IroN skos:exactMatch chebi CHEBI:18248 iron atom semapv:ManualMappingCuration orcid:0000-0003-4423-4370
vo 0010988 IroN skos:exactMatch mesh D007501 Iron semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/28ac41/scripts/generate_vo_mesh_mappings.py 0.74
vo 0010997 IroN skos:exactMatch chebi CHEBI:18248 iron atom semapv:ManualMappingCuration orcid:0000-0003-4423-4370
vo 0010997 IroN skos:exactMatch mesh D007501 Iron semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/28ac41/scripts/generate_vo_mesh_mappings.py 0.74
vo 0011021 CP skos:exactMatch chebi CHEBI:3380 captopril semapv:ManualMappingCuration orcid:0000-0003-4423-4370
vo 0011021 CP skos:exactMatch hp HP:0100021 Cerebral palsy semapv:ManualMappingCuration orcid:0000-0003-4423-4370
vo 0011021 CP skos:exactMatch mesh D002547 Cerebral Palsy semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/4b2628/scripts/generate_vo_mesh_mappings.py 0.5555555555555556
Expand Down
Loading
Loading