Improve redundancy filtering (#141)
Previously, redundant-mapping filtering only checked whether the tuples were identical. Now, mappings are
grouped by their canonical tuple, and a key function picks the "best" mapping from each group. For now,
this key is a simple function. Later, it can be extended to take into account confidence in the curator or
the prediction methodology, or the data model could be extended to track the date of prediction/curation so
that only the earliest record is kept.
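In essence, the new filtering groups mappings by their canonical tuple and keeps only the preferred member of each group. The sketch below condenses that idea into a standalone function; the remove_redundant name and its canonical_key/score parameters are illustrative stand-ins, while the commented usage mirrors the get_canonical_tuple helper and the ORCID-based scoring visible in the diff further down:

from collections import defaultdict

def remove_redundant(mappings, canonical_key, score):
    """Keep only the highest-scoring mapping for each canonical tuple."""
    groups = defaultdict(list)
    for mapping in mappings:
        # Group each row (a dict) under its canonical tuple
        groups[canonical_key(mapping)].append(mapping)
    # Within each group, keep the row the scoring function prefers,
    # e.g. manually curated (ORCID-attributed) over script-generated sources
    return [max(group, key=score) for group in groups.values()]

# Example usage (illustrative):
# best = remove_redundant(rows, canonical_key=get_canonical_tuple,
#                         score=lambda m: m["source"].startswith("orcid"))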
cthoyt authored Aug 14, 2023
1 parent ebe90a2 commit f74c651
Showing 3 changed files with 62 additions and 53 deletions.
93 changes: 62 additions & 31 deletions src/biomappings/resources/__init__.py
@@ -22,6 +22,7 @@

import bioregistry
from tqdm.auto import tqdm
from typing_extensions import Literal

from biomappings.utils import OVERRIDE_MIRIAM, RESOURCE_PATH, get_canonical_tuple

@@ -166,6 +167,9 @@ def target_curie(self) -> str:
return f"{self.target_prefix}:{self.target_identifier}"


Mappings = Iterable[Mapping[str, str]]


def get_resource_file_path(fname) -> Path:
"""Get a resource by its file name."""
return RESOURCE_PATH.joinpath(fname)
@@ -184,13 +188,13 @@ def _clean(header, row):


def _write_helper(
header: Sequence[str], lod: Iterable[Mapping[str, str]], path: Union[str, Path], mode: str
header: Sequence[str], mappings: Mappings, path: Union[str, Path], mode: Literal["w", "a"]
) -> None:
lod = sorted(lod, key=mapping_sort_key)
mappings = sorted(mappings, key=mapping_sort_key)
with open(path, mode) as file:
if mode == "w":
print(*header, sep="\t", file=file) # noqa:T201
for line in lod:
for line in mappings:
print(*[line[k] or "" for k in header], sep="\t", file=file) # noqa:T201


@@ -216,15 +220,15 @@ def load_mappings(*, path: Optional[Path] = None) -> List[Dict[str, str]]:


def append_true_mappings(
m: Iterable[Mapping[str, str]],
mappings: Mappings,
*,
sort: bool = True,
path: Optional[Path] = None,
) -> None:
"""Append new lines to the mappings table."""
if path is None:
path = TRUE_MAPPINGS_PATH
_write_helper(MAPPINGS_HEADER, m, path, mode="a")
_write_curated(mappings, path=path, mode="a")
if sort:
lint_true_mappings(path=path)

@@ -234,16 +238,25 @@ def append_true_mapping_tuples(mappings: Iterable[MappingTuple]) -> None:
append_true_mappings(mapping.as_dict() for mapping in set(mappings))


def write_true_mappings(m: Iterable[Mapping[str, str]], *, path: Optional[Path] = None) -> None:
def write_true_mappings(mappings: Mappings, *, path: Optional[Path] = None) -> None:
"""Write mappigns to the true mappings file."""
_write_helper(MAPPINGS_HEADER, m, path or TRUE_MAPPINGS_PATH, mode="w")
_write_curated(mappings=mappings, path=path or TRUE_MAPPINGS_PATH, mode="w")


def _write_curated(mappings: Mappings, *, path: Path, mode: Literal["w", "a"]):
_write_helper(MAPPINGS_HEADER, mappings, path, mode=mode)


def lint_true_mappings(*, standardize: bool = False, path: Optional[Path] = None) -> None:
"""Lint the true mappings file."""
mappings = load_mappings(path=path)
mappings = _remove_redundant(mappings, MappingTuple, standardize=standardize)
write_true_mappings(sorted(mappings, key=mapping_sort_key), path=path)
_lint_curated_mappings(standardize=standardize, path=path or TRUE_MAPPINGS_PATH)


def _lint_curated_mappings(path: Path, *, standardize: bool = False) -> None:
"""Lint the true mappings file."""
mappings = _load_table(path)
mappings = _remove_redundant(mappings, standardize=standardize)
_write_helper(MAPPINGS_HEADER, mappings, path, mode="w")


FALSE_MAPPINGS_PATH = get_resource_file_path("incorrect.tsv")
@@ -255,29 +268,27 @@ def load_false_mappings(*, path: Optional[Path] = None) -> List[Dict[str, str]]:


def append_false_mappings(
m: Iterable[Mapping[str, str]],
mappings: Mappings,
*,
sort: bool = True,
path: Optional[Path] = None,
) -> None:
"""Append new lines to the false mappings table."""
if path is None:
path = FALSE_MAPPINGS_PATH
_write_helper(MAPPINGS_HEADER, m, path, mode="a")
_write_curated(mappings=mappings, path=path, mode="a")
if sort:
lint_false_mappings(path=path)


def write_false_mappings(m: Iterable[Mapping[str, str]], *, path: Optional[Path] = None) -> None:
def write_false_mappings(mappings: Mappings, *, path: Optional[Path] = None) -> None:
"""Write mappings to the false mappings file."""
_write_helper(MAPPINGS_HEADER, m, path or FALSE_MAPPINGS_PATH, mode="w")
_write_helper(MAPPINGS_HEADER, mappings, path or FALSE_MAPPINGS_PATH, mode="w")


def lint_false_mappings(*, standardize: bool = False, path: Optional[Path] = None) -> None:
"""Lint the false mappings file."""
mappings = load_false_mappings(path=path)
mappings = _remove_redundant(mappings, MappingTuple, standardize=standardize)
write_false_mappings(sorted(mappings, key=mapping_sort_key), path=path)
_lint_curated_mappings(standardize=standardize, path=path or FALSE_MAPPINGS_PATH)


UNSURE_PATH = get_resource_file_path("unsure.tsv")
@@ -289,29 +300,27 @@ def load_unsure(*, path: Optional[Path] = None) -> List[Dict[str, str]]:


def append_unsure_mappings(
m: Iterable[Mapping[str, str]],
mappings: Mappings,
*,
sort: bool = True,
path: Optional[Path] = None,
) -> None:
"""Append new lines to the "unsure" mappings table."""
if path is None:
path = UNSURE_PATH
_write_helper(MAPPINGS_HEADER, m, path, mode="a")
_write_curated(mappings, path=path, mode="a")
if sort:
lint_unsure_mappings(path=path)


def write_unsure_mappings(m: Iterable[Mapping[str, str]], *, path: Optional[Path] = None) -> None:
def write_unsure_mappings(mappings: Mappings, *, path: Optional[Path] = None) -> None:
"""Write mappings to the unsure mappings file."""
_write_helper(MAPPINGS_HEADER, m, path or UNSURE_PATH, mode="w")
_write_helper(MAPPINGS_HEADER, mappings, path or UNSURE_PATH, mode="w")


def lint_unsure_mappings(*, standardize: bool = False, path: Optional[Path] = None) -> None:
"""Lint the unsure mappings file."""
mappings = load_unsure(path=path)
mappings = _remove_redundant(mappings, MappingTuple, standardize=standardize)
write_unsure_mappings(sorted(mappings, key=mapping_sort_key), path=path)
_lint_curated_mappings(standardize=standardize, path=path or UNSURE_PATH)


PREDICTIONS_PATH = get_resource_file_path("predictions.tsv")
@@ -322,9 +331,9 @@ def load_predictions(*, path: Optional[Path] = None) -> List[Dict[str, str]]:
return _load_table(path or PREDICTIONS_PATH)


def write_predictions(m: Iterable[Mapping[str, str]], *, path: Optional[Path] = None) -> None:
def write_predictions(mappings: Mappings, *, path: Optional[Path] = None) -> None:
"""Write new content to the predictions table."""
_write_helper(PREDICTIONS_HEADER, m, path or PREDICTIONS_PATH, mode="w")
_write_helper(PREDICTIONS_HEADER, mappings, path or PREDICTIONS_PATH, mode="w")


def append_prediction_tuples(
@@ -344,7 +353,7 @@ def append_prediction_tuples(


def append_predictions(
mappings: Iterable[Mapping[str, str]],
mappings: Mappings,
*,
deduplicate: bool = True,
sort: bool = True,
@@ -397,14 +406,36 @@ def lint_predictions(standardize: bool = False) -> None:
)
if get_canonical_tuple(mapping) not in curated_mappings
]
mappings = _remove_redundant(mappings, PredictionTuple, standardize=standardize)
write_predictions(sorted(mappings, key=mapping_sort_key))
mappings = _remove_redundant(mappings, standardize=standardize)
mappings = sorted(mappings, key=mapping_sort_key)
write_predictions(mappings)


def _remove_redundant(mappings, tuple_cls, standardize: bool = False):
def _remove_redundant(mappings, *, standardize: bool = False):
if standardize:
mappings = _standardize_mappings(mappings)
return (mapping.as_dict() for mapping in {tuple_cls.from_dict(mapping) for mapping in mappings})
dd = defaultdict(list)
for mapping in mappings:
dd[get_canonical_tuple(mapping)].append(mapping)
return [max(mappings, key=_pick_best) for mappings in dd.values()]


def _pick_best(mapping: Dict[str, str]) -> int:
"""Assign a value for this mapping.
:param mapping: A mapping dictionary
:returns: An integer, where higher means a better choice.
This function is currently simple, but can later be extended to
account for several other things including:
- confidence in the curator
- prediction methodology
- date of prediction/curation (to keep the earliest)
"""
if mapping["source"].startswith("orcid"):
return 1
return 0


def _standardize_mappings(mappings, *, progress: bool = True):
1 change: 0 additions & 1 deletion src/biomappings/resources/incorrect.tsv
@@ -1430,7 +1430,6 @@ vo 0011021 CP skos:exactMatch mesh D002547 Cerebral Palsy semapv:ManualMappingCu
vo 0011021 CP skos:exactMatch pr PR:P00450 ceruloplasmin (human) semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching mira 0.5555555555555556
vo 0011021 CP skos:exactMatch uberon UBERON:0001886 choroid plexus semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching mira 0.5555555555555556
vo 0011021 CP skos:exactMatch uberon UBERON:0005343 cortical plate semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching mira 0.5555555555555556
vo 0011026 RAP-1 skos:exactMatch idomal 0001123 RAP-1 semapv:ManualMappingCuration orcid:0000-0003-4423-4370
vo 0011050 Tax skos:exactMatch mesh D013660 Taxes semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/4b2628/scripts/generate_vo_mesh_mappings.py 0.5555555555555556
vo 0011081 Hpd skos:exactMatch pr PR:000008730 4-hydroxyphenylpyruvate dioxygenase semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching mira 0.5400948258091115
vo 0011081 Hpd skos:exactMatch pr PR:P32754 4-hydroxyphenylpyruvate dioxygenase (human) semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching mira 0.5400948258091115
