From f3c1c65c369dc98b9969ce9aa399f195723acc18 Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Tue, 4 Jun 2024 14:55:09 +0100 Subject: [PATCH 01/14] removed report generation from model pack build as should now be done via interface --- kazu/utils/build_and_test_model_packs.py | 31 ------------------------ 1 file changed, 31 deletions(-) diff --git a/kazu/utils/build_and_test_model_packs.py b/kazu/utils/build_and_test_model_packs.py index 4bf412ee..53cbe317 100644 --- a/kazu/utils/build_and_test_model_packs.py +++ b/kazu/utils/build_and_test_model_packs.py @@ -18,8 +18,6 @@ #: A default timeout in seconds for Ray to finish building the model packs within. #: This is equal to 3 hours DEFAULT_RAY_TIMEOUT = 180.0 * 60 -GLOBAL_CONFLICT_REPORT_DIR = "global_parser_conflict_reports" -GLOBAL_CONFLICT_REPORT_FN = "global_string_match_conflicts.txt" @dataclass @@ -75,7 +73,6 @@ def __init__( maybe_base_configuration_path: Optional[Path], skip_tests: bool, zip_pack: bool, - run_global_conflict_report: bool, ): """A ModelPackBuilder is a helper class to assist in the building of a model pack. @@ -96,7 +93,6 @@ def __init__( :param maybe_base_configuration_path: if this pack requires the base configuration, specify path :param skip_tests: don't run any tests :param zip_pack: zip the pack at the end (requires the 'zip' CLI tool) - :param run_global_conflict_report: writes reports in the model pack about inter and intra parser resource conflicts. 
""" if logging_config_path is not None: fileConfig(logging_config_path) @@ -110,7 +106,6 @@ def __init__( self.model_pack_build_path = self.build_dir.joinpath(self.target_model_pack_path.name) os.environ["KAZU_MODEL_PACK"] = str(self.model_pack_build_path) self.build_config = self.load_build_configuration() - self.run_global_conflict_report = run_global_conflict_report def __repr__(self): """For nice log messages.""" @@ -137,8 +132,6 @@ def build_model_pack(self) -> Path: config_name="config", overrides=["hydra/job_logging=none", "hydra/hydra_logging=none"], ) - if self.run_global_conflict_report: - self.write_resource_conflict_reports(cfg) self.build_caches_and_run_sanity_checks(cfg) if not self.skip_tests: self.run_acceptance_tests(cfg) @@ -316,18 +309,6 @@ def report_tested_dependencies(self): with self.model_pack_build_path.joinpath("tested_dependencies.txt").open(mode="w") as f: f.write(dependencies) - def write_resource_conflict_reports(self, cfg: DictConfig) -> None: - self.logger.info("creating resource conflict reports") - - global_report_dir = self.model_pack_build_path.joinpath(GLOBAL_CONFLICT_REPORT_DIR) - global_report_dir.mkdir() - - for parser in instantiate(cfg.ontologies.parsers, _convert_="all").values(): - _, resource_report = parser.populate_metadata_db_and_resolve_string_resources() - resource_report.write_reports_for_parser( - path=global_report_dir, parser_name=parser.name - ) - @ray.remote(num_cpus=1) class ModelPackBuilderActor(ModelPackBuilder): @@ -344,7 +325,6 @@ def build_all_model_packs( max_parallel_build: Optional[int], debug: bool = False, ray_timeout: Optional[float] = DEFAULT_RAY_TIMEOUT, - run_global_conflict_report: bool = False, ) -> None: """Build multiple model packs. @@ -358,8 +338,6 @@ def build_all_model_packs( None, use all available CPUs :param debug: Disables Ray parallelization, enabling the use of debugger tools :param ray_timeout: A timeout for Ray to complete model pack building within. 
Defaults to :attr:`~DEFAULT_RAY_TIMEOUT` - :param run_global_conflict_report: Checks the strings associated configured for string matching across - each parser, and reports any inconsistencies. :return: """ if not output_dir.is_dir(): @@ -394,7 +372,6 @@ def build_all_model_packs( target_model_pack_path=model_pack_path, build_dir=output_dir, skip_tests=skip_tests, - run_global_conflict_report=run_global_conflict_report, ) if not debug: futures.append(cast(ray.ObjectRef, builder.build_model_pack.remote())) @@ -514,13 +491,6 @@ def wait_for_model_pack_completion( type=float, required=False, ) - parser.add_argument( - "--run_global_conflict_report", - action="store_true", - help="Checks the strings associated configured for string matching across and within each parser," - f" and reports any inconsistencies. These are reported in a directory called {GLOBAL_CONFLICT_REPORT_DIR}" - " in the model pack root. WARNING: this may cause a spike in memory usage.", - ) args = parser.parse_args() @@ -534,5 +504,4 @@ def wait_for_model_pack_completion( max_parallel_build=args.max_parallel_build, debug=args.debug, ray_timeout=args.ray_timeout, - run_global_conflict_report=args.run_global_conflict_report, ) From b6d27c3e4d7b51522c6e853e36852f5654b3d26a Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 12:36:35 +0100 Subject: [PATCH 02/14] Kazu resource tool docs update --- docs/conf.py | 2 ++ docs/index.rst | 1 + docs/kazu_resource_tool.rst | 21 +++++++++++++++++++++ docs/ontology_parser.rst | 2 +- 4 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 docs/kazu_resource_tool.rst diff --git a/docs/conf.py b/docs/conf.py index 7980e6c7..8b6958ac 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -259,4 +259,6 @@ def linkcode_resolve(domain, info): ("py:class", "_regex.Pattern"), ("py:class", "urllib3.util.retry.Retry"), ("py:class", "gliner.GLiNER"), + # no sphinx for streamlit + ("py:class", "streamlit.delta_generator.DeltaGenerator"), ] diff --git 
a/docs/index.rst b/docs/index.rst index b3f8ec46..6fcd22ce 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,6 +18,7 @@ Welcome to Kazu's documentation! Kazu Data Model Visualising results in Label Studio The OntologyParser + The Kazu Resource Tool Curating a knowledge base for NER and Linking Scaling with Ray Kazu as a WebService diff --git a/docs/kazu_resource_tool.rst b/docs/kazu_resource_tool.rst new file mode 100644 index 00000000..8d5678d7 --- /dev/null +++ b/docs/kazu_resource_tool.rst @@ -0,0 +1,21 @@ +.. _kazu_resource_tool: + +The Kazu Resource Tool +================================ + +The Kazu Resource Tool is a command line tool that can be used to find and resolve the various issues that can arise when working with :class:`.OntologyStringResource`\s in a Kazu project, +as described at the end of the :ref:`ontology_parser` section. + +Its main purpose is to help you find and fix issues with your resource files, and to help you understand how your resources are configured. + +First, ensure that ``KAZU_MODEL_PACK`` is set in your environment, as described in the :ref:`quickstart` section. + +Now, from the root of your Kazu project, you can run the Kazu Resource Tool with the following command: + +.. code-block:: console + + $ streamlit run krt/Introduction.py + +This will start a Streamlit server, which will allow you to interact with the Kazu Resource Tool in your browser. + +Further instructions are described in the Streamlit app itself. diff --git a/docs/ontology_parser.rst b/docs/ontology_parser.rst index ffe8e831..ee9af2b6 100644 --- a/docs/ontology_parser.rst +++ b/docs/ontology_parser.rst @@ -273,7 +273,7 @@ The flow of an ontology parser to handling the underlying strings is as follows: a :class:`.OntologyResourceSetCompleteReport`, describing the differences between the old and the new versions. The results are then used to supplement the existing :class:`.OntologyStringResource`\s for the new version. 
-We are working on a simple tool to guide the user through each of these stages, which will be available in a future release. +To assist with the above, Kazu provides a simple Streamlit tool :ref:`kazu_resource_tool` to help with the curation process. To explore the other capabilities of the :class:`.OntologyParser`, such as synonym generation and ID filtering, please refer to the API documentation. From 0736c39a7f49aa7b4664729f0423e1e4b57ef16e Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 12:37:20 +0100 Subject: [PATCH 03/14] added streamlit as a dev dependency --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 009fb5c6..83a80e49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,7 +117,9 @@ dev = [ "towncrier>=23.10.0", "ray>=1.10.0", # required for parsing wikimedia data for disambiguation - 'mwparserfromhell' + 'mwparserfromhell', + # required for krt + 'streamlit' ] [build-system] From 3c9b4d97ac1769f6d9ffcf04878c0a99719a1a2d Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 14:10:57 +0100 Subject: [PATCH 04/14] added curations_injections to dummy parser to simplify process of testing human curation overrides --- kazu/tests/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kazu/tests/utils.py b/kazu/tests/utils.py index 036f2e0b..f208eb1d 100644 --- a/kazu/tests/utils.py +++ b/kazu/tests/utils.py @@ -26,6 +26,7 @@ OntologyParser, ) from kazu.ontology_preprocessing.synonym_generation import CombinatorialSynonymGenerator +from kazu.ontology_preprocessing.curation_utils import dump_ontology_string_resources TEST_ASSETS_PATH = Path(__file__).parent.joinpath("test_assets") @@ -104,7 +105,7 @@ def __init__( autocurator: Optional[AutoCurator] = None, curations_path: Optional[str] = None, global_actions: Optional[GlobalParserActions] = None, - run_upgrade_report: bool = False, + curations_injections: 
Optional[set[OntologyStringResource]] = None, ): """ @@ -121,13 +122,18 @@ def __init__( :param autocurator: :param curations_path: :param global_actions: - :param run_upgrade_report: + :param curations_injections: resources to inject into the curations path for testing """ if in_path == "": temp_parent = tempfile.mkdtemp() in_path = tempfile.mkdtemp(dir=temp_parent) elif len(os.listdir(in_path)) != 0: raise ValueError("DummyParser used with non-empty directory. This is problematic") + assert in_path is not None + if curations_injections: + curations_path = str(Path(in_path).joinpath("curations").absolute()) + dump_ontology_string_resources(curations_injections, curations_path) + super().__init__( in_path, entity_class, From fcd37ab60b1d4ddfa85ecc5041a4fe79f6cb1e1e Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 14:12:09 +0100 Subject: [PATCH 05/14] added ResourceManager for managing changes to resources via an interactive application --- kazu/krt/resource_manager.py | 138 +++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 kazu/krt/resource_manager.py diff --git a/kazu/krt/resource_manager.py b/kazu/krt/resource_manager.py new file mode 100644 index 00000000..643997c6 --- /dev/null +++ b/kazu/krt/resource_manager.py @@ -0,0 +1,138 @@ +import logging +from collections import defaultdict +from pathlib import Path +from typing import Optional, Iterable + +from kazu.data import ( + OntologyStringResource, +) +from kazu.ontology_preprocessing.base import OntologyParser +from kazu.ontology_preprocessing.curation_utils import ( + OntologyStringConflictAnalyser, + load_ontology_string_resources, + dump_ontology_string_resources, + OntologyResourceSetCompleteReport, +) + + +class ResourceManager: + """The ResourceManager class is responsible for managing resources in the streamlit + application. + + It manages the global state of all :class:`.OntologyStringResource`\\s. 
+ It is also responsible for saving and updates to these resources in the configured model pack. + """ + + def __init__(self, parsers: Iterable[OntologyParser]) -> None: + """Initializes the ResourceManager instance. + + Sets up dictionaries for managing resources and parsers, and loads resources + from each parser. + """ + self.parser_to_curations: defaultdict[str, set[OntologyStringResource]] = defaultdict(set) + # since duplicate resources may exist in multiple parsers, this mapping controls that + self.resource_to_parsers: defaultdict[OntologyStringResource, set[str]] = defaultdict(set) + # where to save the serialised resources to + self.parser_to_path: dict[str, Path] = {} + # parser to report + self.parser_to_report: dict[str, OntologyResourceSetCompleteReport] = {} + self.parsers: dict[str, OntologyParser] = {} + + for parser in parsers: + self.parsers[parser.name] = parser + logging.info(f"loading data from parser: {parser.name}") + if parser.curations_path is None: + logging.warning( + "Parser %s has no curations path and will not be loaded", parser.name + ) + continue + self.parser_to_path[parser.name] = parser.curations_path + _, resource_report = parser.populate_metadata_db_and_resolve_string_resources() + self.parser_to_report[parser.name] = resource_report + # we need the clean resources and the conflicted resources from the parser + for resource in resource_report.final_conflict_report.clean_resources: + self.resource_to_parsers[resource].add(parser.name) + for resource_set in resource_report.final_conflict_report.case_conflicts: + for resource in resource_set: + self.resource_to_parsers[resource].add(parser.name) + if resource_report.human_conflict_report: + for resource in resource_report.human_conflict_report.clean_resources: + self.resource_to_parsers[resource].add(parser.name) + for resource_set in resource_report.human_conflict_report.case_conflicts: + for resource in resource_set: + self.resource_to_parsers[resource].add(parser.name) + if 
resource_report.merge_report: + for r1, r2 in resource_report.merge_report.resources_with_discrepancies: + self.resource_to_parsers[r1].add(parser.name) + self.resource_to_parsers[r2].add(parser.name) + + if parser.curations_path: + self.parser_to_curations[parser.name].update( + load_ontology_string_resources(parser.curations_path) + ) + + logging.info("building synonym lookup...") + + self.synonym_lookup = OntologyStringConflictAnalyser.build_synonym_defaultdict( + self.resource_to_parsers.keys() + ) + + def parser_count(self) -> int: + """Returns the number of parsers loaded by the ResourceManager instance. + + :return: + """ + + return len(self.parsers) + + def sync_resources( + self, + original_resource: Optional[OntologyStringResource], + new_resource: OntologyStringResource, + parser_name: str, + ) -> None: + """Synchronizes resources within the internal state. + + If an original resource is provided, it is removed from the resource + dictionaries and the new resource is added. If no original resource is provided, + only the new resource is added. Note that no action is taken if + original_resource == new_resource + + :param original_resource: The original resource to be replaced. If None, no + resource is replaced. + :param new_resource: The new resource to be added. + :param parser_name: The name of the parser that is handling the resource. 
+ :return: + """ + resources_are_equal = False + if original_resource: + # Only do something if the resource has actually changed + resources_are_equal = original_resource == new_resource + if not resources_are_equal: + for synonym in original_resource.all_strings(): + self.synonym_lookup[synonym.lower()].discard(original_resource) + + self.parser_to_curations[parser_name].discard(original_resource) + # it may have already been popped, or not exist + try: + self.resource_to_parsers.pop(original_resource) + except KeyError: + pass + + if not original_resource or not resources_are_equal: + for synonym in new_resource.all_strings(): + self.synonym_lookup[synonym.lower()].add(new_resource) + + self.parser_to_curations[parser_name].add(new_resource) + self.resource_to_parsers[new_resource].add(parser_name) + + def save(self) -> Iterable[str]: + """Saves updated resources to the model pack. + + :return: + """ + + for parser_name, curation_set in self.parser_to_curations.items(): + path = self.parser_to_path[parser_name] + yield f"Saving updated resources to {path}" + dump_ontology_string_resources(curation_set, path, force=True) From c372184e846749bb0bae896e97f50f56ce247cd4 Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 14:13:06 +0100 Subject: [PATCH 06/14] added krt.utils with various useful functions for streamlit --- kazu/krt/utils.py | 107 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 kazu/krt/utils.py diff --git a/kazu/krt/utils.py b/kazu/krt/utils.py new file mode 100644 index 00000000..f39af8e9 --- /dev/null +++ b/kazu/krt/utils.py @@ -0,0 +1,107 @@ +import dataclasses +import os + +import pandas as pd +import streamlit as st +from hydra import initialize_config_dir, compose +from hydra.utils import instantiate +from kazu.data import MentionConfidence, OntologyStringResource +from kazu.krt.resource_manager import ResourceManager +from omegaconf import DictConfig +from 
kazu.ontology_preprocessing.base import OntologyParser + + +def load_config() -> DictConfig: + """Loads the configuration for the ResourceManager instance. + + :return: + """ + conf_dir = os.environ["KAZU_MODEL_PACK"] + "/conf" + from kazu.utils.constants import HYDRA_VERSION_BASE + + with initialize_config_dir(version_base=HYDRA_VERSION_BASE, config_dir=str(conf_dir)): + cfg = compose(config_name="config", overrides=[]) + return cfg + + +@st.cache_resource(show_spinner="Loading parsers") +def load_parsers() -> list[OntologyParser]: + cfg = load_config() + parsers = [] + for parser in cfg.ontologies.parsers.values(): + parsers.append(instantiate(parser)) + return parsers + + +@st.cache_resource(show_spinner="Building the ResourceManager") +def get_resource_manager() -> ResourceManager: + return ResourceManager(load_parsers()) + + +def create_new_resource_with_updated_synonyms( + new_conf: MentionConfidence, new_cs: bool, resource: OntologyStringResource +) -> OntologyStringResource: + """Create a new :class:`.OntologyStringResource` with updated :class:`.Synonym`\\s. + + This function takes a :class:`.MentionConfidence`\\, a boolean value, and an + :class:`.OntologyStringResource` object as inputs. It creates a new :class:`.OntologyStringResource` + object with updated synonyms based on the provided :class:`.MentionConfidence` and boolean + values. The new_conf parameter represents the new :class:`.MentionConfidence` to be set for + the :class:`.Synonym`\\s. The new_cs parameter represents whether the :class:`.Synonym`\\s should be case + sensitive or not. The resource parameter is the original :class:`.OntologyStringResource` + object whose :class:`.Synonym`\\s are to be updated. 
+ + :param new_conf: + :param new_cs: + :param resource: + :return: + """ + new_original_syns = set() + for syn in resource.original_synonyms: + new_original_syns.add( + dataclasses.replace(syn, case_sensitive=new_cs, mention_confidence=new_conf) + ) + new_alternative_syns = set() + for syn in resource.alternative_synonyms: + new_alternative_syns.add( + dataclasses.replace(syn, case_sensitive=new_cs, mention_confidence=new_conf) + ) + + return dataclasses.replace( + resource, + original_synonyms=frozenset(new_original_syns), + alternative_synonyms=frozenset(new_alternative_syns), + ) + + +def resource_to_df(resource: OntologyStringResource) -> pd.DataFrame: + """Convert an :class:`.OntologyStringResource` to a :class:`~pandas.DataFrame` for + display in Streamlit. + + This function takes an OntologyStringResource object as input and transforms it into a pandas DataFrame. + The DataFrame has four columns: `type`, `text`, `confidence`, and `case_sensitive`. The `type` column + indicates whether the synonym is "original" or "alternative". The `text` column contains the synonym text. + The `confidence` column contains the mention confidence, and the `case_sensitive` column indicates + whether the synonym is case sensitive or not. 
+ + :param resource: + :return: + """ + data = [ + ( + "original", + syn.text, + syn.mention_confidence.name, + syn.case_sensitive, + ) + for syn in resource.original_synonyms + ] + [ + ( + "alternative", + syn.text, + syn.mention_confidence.name, + syn.case_sensitive, + ) + for syn in resource.alternative_synonyms + ] + return pd.DataFrame.from_records(data, columns=["type", "text", "confidence", "case_sensitive"]) From 58792e24f69f4811d5de975ceaed354a3361e357 Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 14:14:10 +0100 Subject: [PATCH 07/14] added krt.string_editor_utils, including ResourceConflictManager for finding and working with ResourceManager --- kazu/krt/string_editor/utils.py | 244 ++++++++++++++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 kazu/krt/string_editor/utils.py diff --git a/kazu/krt/string_editor/utils.py b/kazu/krt/string_editor/utils.py new file mode 100644 index 00000000..f1c55879 --- /dev/null +++ b/kazu/krt/string_editor/utils.py @@ -0,0 +1,244 @@ +import logging +from collections import defaultdict +from enum import Enum, auto +from typing import Iterable, Optional + +import pandas as pd +from kazu.data import OntologyStringResource, MentionConfidence +from kazu.krt.resource_manager import ResourceManager +from kazu.krt.utils import ( + create_new_resource_with_updated_synonyms, + resource_to_df, +) +from kazu.ontology_preprocessing.curation_utils import OntologyStringConflictAnalyser + + +class CaseConflictResolutionRequest(Enum): + PESSIMISTIC = auto() + OPTIMISTIC = auto() + CUSTOM = auto() + + +class ResourceConflict: + """This class represents a conflict in resources. + + It provides methods to resolve these conflicts optimistically or pessimistically. + """ + + def __init__(self, conflict_dict: dict[str, Iterable[OntologyStringResource]]): + """ + :param conflict_dict: A dictionary mapping parser names to a list of resources that are in conflict. 
+ """ + self.parser_to_resource_to_resolution: defaultdict[ + str, dict[OntologyStringResource, Optional[OntologyStringResource]] + ] = defaultdict(lambda: dict()) + self.parser_names = set() + self.forms_to_parser = defaultdict(set) + self.string_set = set() + self.confidences = set() + self.cs = set() + for parser_name, resources in conflict_dict.items(): + self.parser_names.add(parser_name) + for resource in resources: + self.parser_to_resource_to_resolution[parser_name][resource] = None + for syn in resource.active_ner_synonyms(): + self.forms_to_parser[syn].add(parser_name) + self.string_set.add(syn.text) + self.confidences.add(syn.mention_confidence) + self.cs.add(syn.case_sensitive) + + def _shortest_string_len(self) -> int: + """Get the length of the shortest string in the conflict. + + :return: The length of the shortest string. + """ + return min(len(x) for x in self.string_set) + + def __lt__(self, other: "ResourceConflict") -> bool: + """Compare this ResourceConflict with another based on the length of the + shortest string in the conflict. + + :param other: The other ResourceConflict to compare with. + :return: True if this ResourceConflict's shortest string is shorter than the + other's, False otherwise. + """ + return self._shortest_string_len() < other._shortest_string_len() + + def batch_resolve(self, optimistic: bool) -> None: + """Resolve the conflict in batch, changing the parameters of all synonyms at + once to share the same values. + + The optimistic param indicates the minimum case sensitivity and maximum mention + confidence should be chosen (otherwise vice-versa) + + :param optimistic: Whether to resolve the conflict optimistically or + pessimistically. 
+ """ + + new_cs = min(self.cs) if optimistic else max(self.cs) + new_conf = max(self.confidences) if optimistic else min(self.confidences) + self._resolve(new_cs=new_cs, new_conf=new_conf) + + def _resolve(self, new_cs: bool, new_conf: MentionConfidence) -> None: + """Resolve the conflict by creating a new resource with updated synonyms. + + :param new_cs: The new case sensitivity to use. + :param new_conf: The new mention confidence to use. + """ + for parser_name, resolution_dict in self.parser_to_resource_to_resolution.items(): + for resource in resolution_dict.keys(): + new_resource = create_new_resource_with_updated_synonyms(new_conf, new_cs, resource) + self.parser_to_resource_to_resolution[parser_name][resource] = new_resource + + +class ResourceConflictManager: + """This class is responsible for managing :class:`.ResourceConflict`\\s in + resources. + + It provides methods to find conflicts in resources, sync resources for resolved + string conflicts and find new conflicts. + """ + + def __init__( + self, + manager: ResourceManager, + ): + """Initialize with a :class:`.ResourceManager`\\. + + :param manager: The :class:`.ResourceManager` to use. + """ + self.manager = manager + self.unresolved_conflicts: dict[int, ResourceConflict] = {} + self.unresolved_conflicts_by_parser: dict[ + tuple[str, OntologyStringResource], ResourceConflict + ] = {} + self._init_conflict_maps() + + def sync_resources_for_resolved_resource_conflict_and_find_new_conflicts( + self, conflict: ResourceConflict + ) -> None: + """Sync resources for a resolved conflict and find new conflicts. + + This will refresh the internal map of conflicts. + + :param conflict: a resolved :class:`.ResourceConflict`\\. 
+ """ + self._sync_resources(conflict) + self._check_resolved_conflict_for_new_conflicts(conflict) + + def _init_conflict_maps(self) -> None: + for i, conflict in enumerate( + self._find_conflicts_in_resources(self.manager.resource_to_parsers.keys()) + ): + self.unresolved_conflicts[i] = conflict + self._update_conflict_parser_map((conflict,)) + + def _update_conflict_parser_map(self, unresolved_conflicts: Iterable[ResourceConflict]) -> None: + """Update the conflict parser map with unresolved conflicts. + + :param unresolved_conflicts: The unresolved conflicts to add to the map. + """ + for conflict in unresolved_conflicts: + for parser_name, resolution_dict in conflict.parser_to_resource_to_resolution.items(): + for resource in resolution_dict.keys(): + self.unresolved_conflicts_by_parser[(parser_name, resource)] = conflict + + def _find_conflicts_in_resources( + self, resources: Iterable[OntologyStringResource] + ) -> set[ResourceConflict]: + """Find conflicts in resources. + + :param resources: The resources to check for conflicts against other resources + currently loaded in the internal resource manager. + :return: A set of ResourceConflicts. 
+ """ + logging.info("Looking for conflicts across all parsers...") + ( + case_conflicts, + _, + ) = OntologyStringConflictAnalyser.check_for_case_conflicts_across_resources( + resources # type: ignore[arg-type] # dict_keys isn't a subtype of builtin set + ) + unresolved_conflicts = set() + for conflict_set in case_conflicts: + conflict_dict = defaultdict(set) + for conflict_resource in conflict_set: + parser_name: str + for parser_name in self.manager.resource_to_parsers[conflict_resource]: + conflict_dict[parser_name].add(conflict_resource) + + conflict = ResourceConflict(dict(conflict_dict)) + unresolved_conflicts.add(conflict) + return unresolved_conflicts + + def _sync_resources( + self, + conflict: ResourceConflict, + ) -> None: + """When synching resources, we are implicitly creating a new human curation, as + the autogenerated set should never be manually changed. + + Therefore, we only care about + :param conflict: + :return: + """ + for parser_name, resolution_map in conflict.parser_to_resource_to_resolution.items(): + for original_resource, new_resource in resolution_map.items(): + assert new_resource is not None + # pop the key as is no longer valid + try: + self.unresolved_conflicts_by_parser.pop( + ( + parser_name, + original_resource, + ) + ) + except KeyError: + pass + + self.manager.sync_resources( + original_resource=original_resource, + new_resource=new_resource, + parser_name=parser_name, + ) + + def _check_resolved_conflict_for_new_conflicts(self, conflict: ResourceConflict) -> None: + parser_name: str + for parser_name, resolution_map in conflict.parser_to_resource_to_resolution.items(): + for new_resource in resolution_map.values(): + assert new_resource is not None + new_conflicts = self._find_conflicts_in_resources([new_resource]) + if new_conflicts: + self._update_conflict_parser_map(new_conflicts) + self.unresolved_conflicts = { + i: conflict + for i, conflict in enumerate(set(self.unresolved_conflicts_by_parser.values())) + } + + def 
class SynonymDiscrepancy:
    """This class represents a discrepancy between a human-generated
    :class:`.OntologyStringResource` and an auto-generated
    :class:`.OntologyStringResource`.

    It provides methods to automatically resolve the discrepancy, convert the resources
    to a DataFrame, and get an example string for display.
    """

    def __init__(
        self, human_resource: OntologyStringResource, auto_resource: OntologyStringResource
    ):
        """
        :param human_resource: the human-curated resource involved in the discrepancy
        :param auto_resource: the auto-generated resource involved in the discrepancy
        """
        self.auto_resource = auto_resource
        self.human_resource = human_resource

    def auto_resolve(self) -> Optional[OntologyStringResource]:
        """This method attempts to automatically resolve discrepancies between human and
        auto resources.

        It first creates a set of tuples containing the mention confidence and case
        sensitivity for all :class:`.Synonym`\\s in the human resource. If there is more than one
        unique tuple in the set, it means there are discrepancies in the human resource
        itself, and the method returns None. If there is exactly one unique tuple, it
        means all synonyms in the human resource have the same mention confidence and
        case sensitivity. In this case, it updates all forms of the auto resource with
        this mention confidence and case sensitivity, and returns the updated auto
        resource.

        :return: The updated auto resource if discrepancies can be auto-resolved, which
            can be used as a human override. None otherwise.
        """
        # Collect the distinct (confidence, case-sensitivity) pairs used by the human curation.
        human_aspects = set(
            (
                x.mention_confidence,
                x.case_sensitive,
            )
            for x in self.human_resource.all_synonyms()
        )
        if len(human_aspects) != 1:
            # The human resource is internally inconsistent: cannot auto-resolve.
            return None
        else:
            new_conf, new_cs = next(iter(human_aspects))
            return create_new_resource_with_updated_synonyms(
                new_conf=new_conf, new_cs=new_cs, resource=self.auto_resource
            )

    def dataframe(self) -> pd.DataFrame:
        """Converts the human and auto resources to DataFrames, merges them, and returns
        the rows with any null values (i.e. discrepancies)

        :return: A :class:`~pandas.DataFrame` representing the discrepancies between the human and auto
            resources.
        """
        human_df = resource_to_df(self.human_resource)
        auto_df = resource_to_df(self.auto_resource)
        # Outer merge so that synonyms present on only one side survive with NaNs in the
        # other side's columns; those NaN rows are exactly the discrepancies.
        merged = pd.merge(
            human_df, auto_df, how="outer", on=["type", "text"], suffixes=("_human", "_auto")
        )
        return merged[merged.isnull().any(axis=1)]

    def example_string(self) -> str:
        """Returns an example string from the human resource's original synonyms."""
        # original_synonyms is a (non-empty) frozenset; any element serves as a display example.
        return next(iter(self.human_resource.original_synonyms)).text


class ResourceDiscrepancyManger:
    """This class manages :class:`.SynonymDiscrepancy`\\s between human-generated
    resources and auto-generated resources.

    It provides methods to automatically resolve all discrepancies, commit changes to
    the resources, and get a summary DataFrame.

    .. note::
        The class name is misspelled ("Manger") but is imported under this name
        elsewhere, so renaming it would be a breaking change.
    """

    def __init__(
        self,
        parser_name: str,
        manager: ResourceManager,
    ):
        """Initializes the ResourceDiscrepancyManager.

        :param parser_name: The name of the parser used to generate the resources.
        :param manager: The :class:`.ResourceManager` object used to manage the resources.
        """
        self.manager = manager
        self.parser_name = parser_name
        report = manager.parser_to_report[parser_name]
        if not report.merge_report:
            todo = set()
        else:

            todo = set(
                SynonymDiscrepancy(human_resource=human_curation, auto_resource=autocuration)
                for human_curation, autocuration in report.merge_report.resources_with_discrepancies
            )

        # NOTE(review): `todo` is a set, so the integer ids produced by enumerate are
        # arbitrary and not stable across processes — they are only meaningful within
        # this instance's lifetime.
        self.unresolved_discrepancies: dict[int, SynonymDiscrepancy] = {
            i: discrepancy for i, discrepancy in enumerate(todo)
        }

    def apply_autofix_to_all(self) -> None:
        """Attempts to automatically resolve all discrepancies.

        If a discrepancy can be auto-resolved, it syncs the resources and removes the
        discrepancy from the internal unresolved discrepancies list.
        """
        # Iterate over a snapshot of the keys, since we pop entries while iterating.
        for i in list(self.unresolved_discrepancies):
            discrepancy = self.unresolved_discrepancies[i]
            maybe_new_resource = discrepancy.auto_resolve()
            if maybe_new_resource is not None:
                self.manager.sync_resources(
                    original_resource=discrepancy.human_resource,
                    new_resource=maybe_new_resource,
                    parser_name=self.parser_name,
                )
                self.unresolved_discrepancies.pop(i)

    def commit(
        self,
        original_human_resource: OntologyStringResource,
        new_resource: OntologyStringResource,
        index: int,
    ) -> None:
        """Commits changes to the resources and removes the discrepancy from the
        internal todo list.

        :param original_human_resource: The original human-generated resource.
        :param new_resource: The new resource to replace the original one.
        :param index: The index of the discrepancy.
        """
        self.manager.sync_resources(
            original_resource=original_human_resource,
            new_resource=new_resource,
            parser_name=self.parser_name,
        )
        self.unresolved_discrepancies.pop(index)

    def summary_df(self) -> pd.DataFrame:
        """Returns a :class:`pandas.DataFrame` summarizing the unresolved discrepancies.

        :return: A DataFrame with columns for id, example text, and the number of unique
            synonyms in the human and auto resources.
        """
        data = []
        for i, discrepancy in self.unresolved_discrepancies.items():
            data.append(
                {
                    "id": i,
                    "example_text": discrepancy.example_string(),
                    "human_resource_unique_synonyms": len(
                        set(discrepancy.human_resource.all_synonyms())
                    ),
                    "auto_resource_unique_synonyms": len(
                        set(discrepancy.auto_resource.all_synonyms())
                    ),
                }
            )
        return pd.DataFrame(data)
+ """ + data = [] + for i, discrepancy in self.unresolved_discrepancies.items(): + data.append( + { + "id": i, + "example_text": discrepancy.example_string(), + "human_resource_unique_synonyms": len( + set(discrepancy.human_resource.all_synonyms()) + ), + "auto_resource_unique_synonyms": len( + set(discrepancy.auto_resource.all_synonyms()) + ), + } + ) + return pd.DataFrame(data) From 3b290b1cfb387aa8772d6e8473e1046f33622c55 Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 14:15:21 +0100 Subject: [PATCH 09/14] tests for krt managers --- kazu/tests/test_krt_managers.py | 173 ++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 kazu/tests/test_krt_managers.py diff --git a/kazu/tests/test_krt_managers.py b/kazu/tests/test_krt_managers.py new file mode 100644 index 00000000..04a98ea6 --- /dev/null +++ b/kazu/tests/test_krt_managers.py @@ -0,0 +1,173 @@ +import dataclasses + +from kazu.data import OntologyStringBehaviour, MentionConfidence, Synonym, OntologyStringResource +from kazu.krt.resource_manager import ResourceManager +from kazu.krt.string_editor.utils import ResourceConflictManager +from kazu.krt.resource_discrepancy_editor.utils import ResourceDiscrepancyManger +from kazu.tests.utils import DummyParser + + +def init_test_resource_manager() -> ResourceManager: + p1 = DummyParser(name="test_parser1") + p1.curations_path = p1.ontology_auto_generated_resources_set_path + p1.populate_databases() + p2 = DummyParser(name="test_parser2") + p1.curations_path = p1.ontology_auto_generated_resources_set_path + p2.populate_databases() + parsers = [p1, p2] + rm = ResourceManager(parsers) + return rm + + +def init_test_string_conflict_manager() -> ResourceConflictManager: + + conflict1_p1 = OntologyStringResource( + original_synonyms=frozenset( + {Synonym(text="4", case_sensitive=False, mention_confidence=MentionConfidence.PROBABLE)} + ), + behaviour=OntologyStringBehaviour.ADD_FOR_NER_AND_LINKING, + 
alternative_synonyms=frozenset(), + associated_id_sets=None, + autocuration_results=None, + comment=None, + ) + conflict1_p2 = OntologyStringResource( + original_synonyms=frozenset( + {Synonym(text="4", case_sensitive=True, mention_confidence=MentionConfidence.PROBABLE)} + ), + behaviour=OntologyStringBehaviour.ADD_FOR_NER_AND_LINKING, + alternative_synonyms=frozenset(), + associated_id_sets=None, + autocuration_results=None, + comment=None, + ) + conflict2_p1 = OntologyStringResource( + original_synonyms=frozenset( + { + Synonym( + text="two", case_sensitive=False, mention_confidence=MentionConfidence.PROBABLE + ) + } + ), + behaviour=OntologyStringBehaviour.ADD_FOR_NER_AND_LINKING, + alternative_synonyms=frozenset(), + associated_id_sets=None, + autocuration_results=None, + comment=None, + ) + conflict2_p2 = OntologyStringResource( + original_synonyms=frozenset( + { + Synonym( + text="two", case_sensitive=True, mention_confidence=MentionConfidence.PROBABLE + ) + } + ), + behaviour=OntologyStringBehaviour.ADD_FOR_NER_AND_LINKING, + alternative_synonyms=frozenset(), + associated_id_sets=None, + autocuration_results=None, + comment=None, + ) + p1 = DummyParser( + name="test_parser1", + curations_injections=set([conflict1_p1, conflict1_p2, conflict2_p1, conflict2_p2]), + ) + p1.populate_databases() + p2 = DummyParser( + name="test_parser2", + curations_injections=set([conflict1_p1, conflict1_p2, conflict2_p1, conflict2_p2]), + ) + p2.populate_databases() + rm = ResourceManager([p1, p2]) + scm = ResourceConflictManager(manager=rm) + return scm + + +def init_discrepancy_manager() -> ResourceDiscrepancyManger: + d1 = OntologyStringResource( + original_synonyms=frozenset( + { + Synonym( + text="one", case_sensitive=False, mention_confidence=MentionConfidence.PROBABLE + ), + Synonym( + text="one-", case_sensitive=False, mention_confidence=MentionConfidence.PROBABLE + ), + } + ), + behaviour=OntologyStringBehaviour.ADD_FOR_NER_AND_LINKING, + 
alternative_synonyms=frozenset(), + associated_id_sets=None, + autocuration_results=None, + comment=None, + ) + d2 = OntologyStringResource( + original_synonyms=frozenset( + { + Synonym( + text="two", case_sensitive=True, mention_confidence=MentionConfidence.PROBABLE + ), + Synonym( + text="two-", case_sensitive=True, mention_confidence=MentionConfidence.PROBABLE + ), + } + ), + behaviour=OntologyStringBehaviour.ADD_FOR_NER_AND_LINKING, + alternative_synonyms=frozenset(), + associated_id_sets=None, + autocuration_results=None, + comment=None, + ) + p1 = DummyParser( + name="test_parser1", + curations_injections=set([d1, d2]), + ) + p1.populate_databases() + rm = ResourceManager([p1]) + dm = ResourceDiscrepancyManger(parser_name="test_parser1", manager=rm) + return dm + + +def test_resource_manager_sync(): + rm = init_test_resource_manager() + for parser in rm.parsers.values(): + old_resources = parser.populate_metadata_db_and_resolve_string_resources()[ + 1 + ].final_conflict_report.clean_resources + for old_resource in old_resources: + new_resource = dataclasses.replace( + old_resource, behaviour=OntologyStringBehaviour.DROP_FOR_LINKING + ) + rm.sync_resources( + original_resource=old_resource, new_resource=new_resource, parser_name=parser.name + ) + assert old_resource not in rm.resource_to_parsers + assert new_resource in rm.resource_to_parsers + + +def test_string_conflict_manager_sync(): + scm = init_test_string_conflict_manager() + new_resources: set[OntologyStringResource] = set() + assert len(scm.unresolved_conflicts) == 2 + for conflict in scm.unresolved_conflicts.values(): + conflict.batch_resolve(optimistic=True) + scm.sync_resources_for_resolved_resource_conflict_and_find_new_conflicts(conflict) + for resolution_dict in conflict.parser_to_resource_to_resolution.values(): + for resource in resolution_dict.values(): + if resource is not None: + new_resources.add(resource) + assert len(scm.unresolved_conflicts) == 0 + assert 
new_resources.issubset(scm.manager.resource_to_parsers) + + +def test_discrepancy_manager_sync(): + dm = init_discrepancy_manager() + new_resources: set[OntologyStringResource] = set() + assert len(dm.unresolved_discrepancies) == 2 + for index, discrepancy in list(dm.unresolved_discrepancies.items()): + new_resource = discrepancy.auto_resolve() + assert new_resource is not None + new_resources.add(new_resource) + dm.commit(discrepancy.human_resource, new_resource, index) + assert new_resources.issubset(dm.manager.resource_to_parsers) From 750f2bc4c3c09439f9180b600a0f878609f15ff8 Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 14:19:10 +0100 Subject: [PATCH 10/14] added streamlit multipage app and various components --- kazu/krt/__init__.py | 0 kazu/krt/components.py | 539 ++++++++++++++++++ .../resource_discrepancy_editor/__init__.py | 0 .../resource_discrepancy_editor/components.py | 214 +++++++ kazu/krt/string_editor/__init__.py | 0 kazu/krt/string_editor/components.py | 220 +++++++ krt/Introduction.py | 28 + krt/pages/1_fix_resource_discrepancies.py | 44 ++ .../2_manage_string_matching_configuration.py | 27 + krt/pages/__init__.py | 0 10 files changed, 1072 insertions(+) create mode 100644 kazu/krt/__init__.py create mode 100644 kazu/krt/components.py create mode 100644 kazu/krt/resource_discrepancy_editor/__init__.py create mode 100644 kazu/krt/resource_discrepancy_editor/components.py create mode 100644 kazu/krt/string_editor/__init__.py create mode 100644 kazu/krt/string_editor/components.py create mode 100644 krt/Introduction.py create mode 100644 krt/pages/1_fix_resource_discrepancies.py create mode 100644 krt/pages/2_manage_string_matching_configuration.py create mode 100644 krt/pages/__init__.py diff --git a/kazu/krt/__init__.py b/kazu/krt/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/kazu/krt/components.py b/kazu/krt/components.py new file mode 100644 index 00000000..c12b6baf --- /dev/null +++ 
import dataclasses
import time
from collections import defaultdict
from typing import Iterable, Optional, Any, cast, Callable, Union

import pandas as pd
import streamlit as st
from kazu.data import (
    OntologyStringResource,
    MentionConfidence,
    Synonym,
    AssociatedIdSets,
    EquivalentIdSet,
    OntologyStringBehaviour,
)
from kazu.krt.utils import get_resource_manager
from streamlit.delta_generator import DeltaGenerator
from streamlit.elements.lib.column_config_utils import ColumnDataKind

# NOTE(review): _apply_dataframe_edits is a private streamlit API and may break on
# streamlit upgrades — pin the streamlit version or confirm on upgrade.
from streamlit.elements.widgets.data_editor import _apply_dataframe_edits


def save() -> None:
    """This function saves the state of the resource manager and displays success
    messages in the sidebar.

    It creates placeholders for each message returned by the resource manager's save
    method. After a delay of 2 seconds, it empties all the placeholders.

    :return:
    """
    placeholders = []
    for msg in get_resource_manager().save():
        placeholder = st.sidebar.empty()
        # NOTE(review): the second .success() call appears to replace "Done!" in the
        # placeholder, so only `msg` is ultimately visible — confirm this is intended.
        placeholder.success("Done!")
        placeholder.success(msg)
        placeholders.append(placeholder)
    time.sleep(2)
    for pl in placeholders:
        pl.empty()


def show_save_button() -> None:
    """Render a sidebar "Save" button wired to :func:`save`."""
    st.sidebar.button("Save", on_click=save)


def show_reset_button(reset_func: Callable[..., None]) -> None:
    """Render a sidebar "Reset" button wired to the given callback.

    :param reset_func: callback invoked when the button is clicked
    """
    st.sidebar.button("Reset", on_click=reset_func)


class PlaceholderResource:
    """This class provides static methods to manage a placeholder resource in the
    session state."""

    # session_state key under which the (parser_name, resource) tuple is stored
    PLACEHOLDER_RESOURCE = "PLACEHOLDER_RESOURCE"

    @staticmethod
    def create_placeholder_resource(text: str, parser_name: str) -> None:
        """Creates a placeholder resource with a given text and parser name, and stores
        it in the session state.

        :param text: the text to be used in the Synonym instance
        :param parser_name: The name of the parser to be associated with the placeholder
            resource.
        :return:
        """
        syn = Synonym(
            text=text,
            case_sensitive=False,
            mention_confidence=MentionConfidence.PROBABLE,
        )
        placeholder = OntologyStringResource(
            behaviour=OntologyStringBehaviour.ADD_FOR_NER_AND_LINKING,
            original_synonyms=frozenset((syn,)),
        )
        st.session_state[PlaceholderResource.PLACEHOLDER_RESOURCE] = parser_name, placeholder

    @staticmethod
    def get_placeholder_resource() -> tuple[str, OntologyStringResource]:
        """Return the (parser_name, resource) tuple previously stored by
        :meth:`create_placeholder_resource`. Raises ``KeyError`` if none exists."""
        return cast(
            tuple[str, OntologyStringResource],
            st.session_state[PlaceholderResource.PLACEHOLDER_RESOURCE],
        )

    @staticmethod
    def delete_placeholder() -> None:
        """Remove the placeholder tuple from the session state."""
        del st.session_state[PlaceholderResource.PLACEHOLDER_RESOURCE]


class ResourceEditor:
    """Streamlit form components for editing :class:`.OntologyStringResource`\\s.

    Widget state is stored in ``st.session_state`` under keys built by
    :meth:`_get_key`; the ``extract_*`` methods read that state back to build new
    resources.
    """

    # suffixes used to namespace session_state keys per widget type
    ASSOCIATE_ID_SET_EDITOR = "ASSOCIATE_ID_SET_EDITOR"
    ASSOCIATE_ID_SET_DF = "ASSOCIATE_ID_SET_DF"
    BEHAVIOUR_SELECTOR = "BEHAVIOUR_SELECTOR"
    CONFIDENCE_SELECTOR = "CONFIDENCE_SELECTOR"
    CASE_SELECTOR = "CASE_SELECTOR"

    @staticmethod
    def submit_form_for_addition() -> None:
        """Submits the form for addition of a new resource.

        Extracts the placeholder resource and parser name from the session state. Then,
        it extracts the form data from the state and syncs the resources. Finally, it
        deletes the placeholder resource from the session state.

        :return:
        """
        parser_name, placeholder_resource = PlaceholderResource.get_placeholder_resource()
        for original_resource, new_resource in ResourceEditor.extract_form_data_from_state(
            parser_name=parser_name, resources={placeholder_resource}
        ):
            # original_resource is None as it's only a placeholder that allows us to extract the form details
            get_resource_manager().sync_resources(
                original_resource=None,
                new_resource=new_resource,
                parser_name=parser_name,
            )
        PlaceholderResource.delete_placeholder()

    @staticmethod
    def display_case_sensitivity_selector(
        row: list[DeltaGenerator],
        row_index: int,
        default_syn: Optional[Synonym],
        key: Optional[Any] = None,
    ) -> bool:
        """Displays a radio button selector for case sensitivity.

        The default value is determined by the case sensitivity of the provided synonym.

        :param row: list of streamlit columns to render into
        :param row_index: index of the column to render the widget in
        :param default_syn: synonym providing the default selection, if any
        :param key: optional session_state key for the widget
        :return: the selected case sensitivity
        """
        options = [True, False]
        if default_syn:
            index = options.index(default_syn.case_sensitive)
        else:
            index = 0
        return cast(
            bool, row[row_index].radio("case sensitive", options=options, index=index, key=key)
        )

    @staticmethod
    def display_confidence_selector(
        row: list[DeltaGenerator],
        row_index: int,
        default_syn: Optional[Synonym],
        key: Optional[Any] = None,
    ) -> MentionConfidence:
        """Displays a radio button selector for confidence.

        The default value is determined by the confidence of the provided synonym.

        :param row: list of streamlit columns to render into
        :param row_index: index of the column to render the widget in
        :param default_syn: synonym providing the default selection, if any
        :param key: optional session_state key for the widget
        :return: the selected confidence
        """
        options = list(MentionConfidence)
        if default_syn:
            index = options.index(default_syn.mention_confidence)
        else:
            index = 0
        return cast(
            MentionConfidence,
            row[row_index].radio("confidence", options=options, index=index, key=key),
        )

    @staticmethod
    def display_case_sensitivity_and_confidence_selector(
        row: list[DeltaGenerator], default_syn: Optional[Synonym]
    ) -> tuple[bool, MentionConfidence]:
        """Displays selectors for both case sensitivity and confidence.

        Returns a tuple of the selected values.

        :param row: list of streamlit columns; column 0 gets the case selector,
            column 1 the confidence selector
        :param default_syn: synonym providing the default selections, if any
        :return: (case_sensitive, confidence)
        """
        cs = ResourceEditor.display_case_sensitivity_selector(
            row=row, row_index=0, default_syn=default_syn
        )
        conf = ResourceEditor.display_confidence_selector(
            row=row, row_index=1, default_syn=default_syn
        )
        return cs, conf

    @staticmethod
    def display_synonym_options_container_with_defaults(
        resource: OntologyStringResource, synonym: Synonym, parser_name: str
    ) -> None:
        """Displays a container with the synonym string and selectors for case
        sensitivity and confidence.

        The default values for the selectors are determined by the provided synonym.

        :param resource: resource the synonym belongs to (used for widget keys)
        :param synonym: synonym to display
        :param parser_name: parser the resource belongs to (used for widget keys)
        :return:
        """
        st.markdown(f"""synonym string:\n> {synonym.text}""")
        row = st.columns([2, 2])
        cs_key = ResourceEditor._get_key(
            parser_name=parser_name,
            resource=resource,
            synonym=synonym,
            suffix=ResourceEditor.CASE_SELECTOR,
        )
        ResourceEditor.display_case_sensitivity_selector(
            row=row, row_index=0, default_syn=synonym, key=cs_key
        )
        conf_key = ResourceEditor._get_key(
            parser_name=parser_name,
            resource=resource,
            synonym=synonym,
            suffix=ResourceEditor.CONFIDENCE_SELECTOR,
        )
        ResourceEditor.display_confidence_selector(
            row=row, row_index=1, default_syn=synonym, key=conf_key
        )

    @staticmethod
    def display_synonym_editor(
        resources: Iterable[OntologyStringResource], parser_name: str
    ) -> None:
        """Displays an editor for each synonym in the provided resources.

        Each editor is contained within a bordered container.
        :param resources: resources whose synonyms should be editable
        :param parser_name: parser the resources belong to (used for widget keys)
        :return:
        """
        for resource in resources:
            for synonym in resource.all_synonyms():
                with st.container(border=True):
                    ResourceEditor.display_synonym_options_container_with_defaults(
                        resource=resource, synonym=synonym, parser_name=parser_name
                    )

    @staticmethod
    def _build_parser_lookup(
        resources: Iterable[OntologyStringResource],
    ) -> dict[str, set[OntologyStringResource]]:
        """Builds a lookup dictionary mapping parser names to sets of resources.

        :param resources: resources to group by their owning parser(s)
        :return: mapping of parser name to the resources associated with it
        """
        parser_lookup = defaultdict(set)
        for resource in resources:
            parser_names = get_resource_manager().resource_to_parsers[resource]
            for parser_name in parser_names:
                parser_lookup[parser_name].add(resource)
        return parser_lookup

    @staticmethod
    def _display_resource_editor_components(
        resources: Iterable[OntologyStringResource], maybe_parser_name: Optional[str] = None
    ) -> None:
        """Displays the components of the resource editor.

        If a parser name is provided, only resources associated with that parser are
        displayed. Otherwise, resources for all parsers are displayed.

        :param resources: resources to render editors for
        :param maybe_parser_name: optional parser name to restrict the display to
        :return:
        """
        data: defaultdict[str, set[OntologyStringResource]] = defaultdict(set)
        if maybe_parser_name:
            data[maybe_parser_name].update(resources)
        else:
            data.update(ResourceEditor._build_parser_lookup(resources))
        for parser_name, resource_set in data.items():
            st.markdown(
                "### [OntologyStringResource Editor](https://astrazeneca.github.io/KAZU/_autosummary/kazu.data.html#kazu.data.OntologyStringResource)"
            )
            st.write(f"Parser name: {parser_name}")
            for resource in resource_set:
                ResourceEditor._show_behaviour_selector(parser_name, resource)
                with st.container(border=True):
                    st.markdown(
                        "##### [Equivalent Id Set Editor](https://astrazeneca.github.io/KAZU/_autosummary/kazu.data.html#kazu.data.EquivalentIdSet)"
                    )
                    with st.container(border=True):
                        df = ResourceEditor._build_df_from_id_sets(resource)
                        ResourceEditor._show_id_set_data_editor(df, parser_name, resource)
                with st.container(border=True):
                    st.markdown(
                        "##### [Synonym Editor](https://astrazeneca.github.io/KAZU/_autosummary/kazu.data.html#kazu.data.Synonym)"
                    )
                    for synonym in resource.all_synonyms():
                        with st.container(border=True):
                            ResourceEditor.display_synonym_options_container_with_defaults(
                                resource=resource, synonym=synonym, parser_name=parser_name
                            )

    @staticmethod
    def _show_behaviour_selector(parser_name: str, resource: OntologyStringResource) -> None:
        """Render a radio selector for the resource's :class:`.OntologyStringBehaviour`,
        defaulting to the resource's current behaviour."""
        behaviour_options = {x.name: i for i, x in enumerate(OntologyStringBehaviour)}
        st.radio(
            label="select behaviour",
            options=OntologyStringBehaviour,
            key=ResourceEditor._get_key(
                parser_name=parser_name, resource=resource, suffix=ResourceEditor.BEHAVIOUR_SELECTOR
            ),
            index=behaviour_options[resource.behaviour.name],
        )

    @staticmethod
    def _build_df_from_id_sets(resource: OntologyStringResource) -> pd.DataFrame:
        """Flatten the resource's associated id sets into a DataFrame with columns
        (source, id, equivalent_id_set_id); empty if none are configured."""
        if not resource.associated_id_sets:
            st.write("No associated id set overrides configured for this resource.")
            df = pd.DataFrame([], columns=["source", "id", "equivalent_id_set_id"])
        else:
            data_lst: list[dict[str, Optional[Union[str, int]]]] = []
            for i, id_set in enumerate(resource.associated_id_sets):
                for idx, source in id_set.ids_and_source:
                    data_lst.append({"source": source, "id": idx, "equivalent_id_set_id": i})
            df = pd.DataFrame(data_lst)
        return df

    @staticmethod
    def _show_id_set_data_editor(
        df: pd.DataFrame, parser_name: str, resource: OntologyStringResource
    ) -> None:
        """Render a data editor for the id-set DataFrame, also stashing the pristine
        DataFrame in session_state so edits can be applied to it later."""
        st.session_state[
            ResourceEditor._get_key(
                parser_name=parser_name,
                resource=resource,
                suffix=ResourceEditor.ASSOCIATE_ID_SET_DF,
            )
        ] = df
        # NOTE(review): the editor's key suffix is itself a full _get_key(...) string,
        # which looks accidental — however _update_df_with_edits builds the identical
        # nested key, so writer and reader agree. Confirm before simplifying.
        st.data_editor(
            df,
            num_rows="dynamic",
            key=ResourceEditor._get_key(
                parser_name=parser_name,
                resource=resource,
                suffix=ResourceEditor._get_key(
                    parser_name=parser_name,
                    resource=resource,
                    suffix=ResourceEditor.ASSOCIATE_ID_SET_EDITOR,
                ),
            ),
        )

    @staticmethod
    def display_resource_editor(
        resources: Iterable[OntologyStringResource],
        maybe_parser_name: Optional[str] = None,
        on_click_override: Optional[Callable[..., None]] = None,
        args: Optional[tuple[Any, ...]] = None,
    ) -> None:
        """Render the full resource editor inside a streamlit form.

        :param resources: resources to edit
        :param maybe_parser_name: optional parser name to restrict the display to
        :param on_click_override: optional submit callback replacing the default
            :meth:`submit_form_for_edits`
        :param args: args passed to the override callback on submit
        :return:
        """
        with st.form("resource_editor"):
            ResourceEditor._display_resource_editor_components(
                resources=resources, maybe_parser_name=maybe_parser_name
            )
            if on_click_override:
                st.form_submit_button("Submit", on_click=on_click_override, args=args)
            else:
                st.form_submit_button(
                    "Submit", on_click=ResourceEditor.submit_form_for_edits, args=(resources,)
                )

    @staticmethod
    def _get_key(
        parser_name: str,
        resource: OntologyStringResource,
        synonym: Optional[Synonym] = None,
        suffix: Optional[str] = None,
    ) -> str:
        # Builds a session_state key unique per parser/resource/synonym/widget.
        # Relies on the resource's private _id field and the Synonym repr.
        return f"{parser_name}{resource._id}{synonym}{suffix}"

    @staticmethod
    def _extract_associated_id_set_from_df(df: pd.DataFrame) -> Optional[AssociatedIdSets]:
        """Rebuild an :class:`.AssociatedIdSets` from the (possibly edited) id-set
        DataFrame; returns None when no rows yield any equivalent id sets."""
        equiv_id_sets = set()
        # NOTE(review): .groups values are index labels; the df built by
        # _build_df_from_id_sets has a default RangeIndex so .iloc works here —
        # revisit if the df ever acquires a non-default index.
        groups = df.groupby("equivalent_id_set_id").groups
        for groupid, group in groups.items():
            if groupid is not None:
                ids_and_source = set()
                group_df = df.iloc[group]
                for i, row in group_df.iterrows():
                    source = row["source"]
                    idx = row["id"]
                    # skip rows the user left partially blank in the data editor
                    if idx and source:
                        ids_and_source.add(
                            (
                                idx,
                                source,
                            )
                        )
                equiv_id_sets.add(EquivalentIdSet(ids_and_source=frozenset(ids_and_source)))
        if equiv_id_sets:
            return AssociatedIdSets(frozenset(equiv_id_sets))
        else:
            return None

    @staticmethod
    def submit_form_for_edits(resources: set[OntologyStringResource]) -> None:
        """Default submit handler: extract edited state for every resource (grouped by
        parser) and sync each original/new pair via the resource manager."""
        for parser_name, resource_set in ResourceEditor._build_parser_lookup(resources).items():
            for original_resource, new_resource in ResourceEditor.extract_form_data_from_state(
                parser_name=parser_name, resources=resource_set
            ):
                get_resource_manager().sync_resources(
                    original_resource=original_resource,
                    new_resource=new_resource,
                    parser_name=parser_name,
                )

    @staticmethod
    def extract_form_data_from_state(
        parser_name: str, resources: Iterable[OntologyStringResource]
    ) -> Iterable[tuple[OntologyStringResource, OntologyStringResource]]:
        """Read the widget state for each resource and yield (original, updated)
        resource pairs reflecting the user's edits."""
        for resource in resources:
            new_behaviour = st.session_state[
                ResourceEditor._get_key(
                    parser_name=parser_name,
                    resource=resource,
                    suffix=ResourceEditor.BEHAVIOUR_SELECTOR,
                )
            ]

            # the pristine id-set df stashed by _show_id_set_data_editor
            initial_df = st.session_state[
                ResourceEditor._get_key(
                    parser_name=parser_name,
                    resource=resource,
                    suffix=ResourceEditor.ASSOCIATE_ID_SET_DF,
                )
            ]

            # apply the user's data_editor edits to the pristine df in place
            ResourceEditor._update_df_with_edits(initial_df, parser_name, resource)

            maybe_assoc_id_set = ResourceEditor._extract_associated_id_set_from_df(initial_df)
            new_alts, new_originals = ResourceEditor._extract_new_synonyms_from_state(
                parser_name, resource
            )
            new_resource = dataclasses.replace(
                resource,
                behaviour=new_behaviour,
                associated_id_sets=maybe_assoc_id_set,
                original_synonyms=frozenset(new_originals),
                alternative_synonyms=frozenset(new_alts),
            )
            yield resource, new_resource

    @staticmethod
    def extract_updated_synonym_data_from_state(
        parser_name: str, resource: OntologyStringResource, synonym: Synonym
    ) -> Synonym:
        """Return a copy of the synonym updated with the confidence and case
        sensitivity currently selected in the widgets."""
        conf = st.session_state[
            ResourceEditor._get_key(
                parser_name=parser_name,
                resource=resource,
                synonym=synonym,
                suffix=ResourceEditor.CONFIDENCE_SELECTOR,
            )
        ]
        cs = st.session_state[
            ResourceEditor._get_key(
                parser_name=parser_name,
                resource=resource,
                synonym=synonym,
                suffix=ResourceEditor.CASE_SELECTOR,
            )
        ]
        return dataclasses.replace(synonym, mention_confidence=conf, case_sensitive=cs)

    @staticmethod
    def _extract_new_synonyms_from_state(
        parser_name: str, resource: OntologyStringResource
    ) -> tuple[set[Synonym], set[Synonym]]:
        """Extract updated alternative and original synonym sets from widget state.

        :return: (new_alternative_synonyms, new_original_synonyms)
        """
        new_originals = set()
        for synonym in resource.original_synonyms:
            new_originals.add(
                ResourceEditor.extract_updated_synonym_data_from_state(
                    parser_name, resource, synonym
                )
            )
        new_alts = set()
        for synonym in resource.alternative_synonyms:
            new_alts.add(
                ResourceEditor.extract_updated_synonym_data_from_state(
                    parser_name, resource, synonym
                )
            )
        return new_alts, new_originals

    @staticmethod
    def _update_df_with_edits(
        initial_df: pd.DataFrame, parser_name: str, resource: OntologyStringResource
    ) -> None:
        """Apply the data_editor's recorded edits (in session_state) to the pristine
        DataFrame in place, using streamlit's private _apply_dataframe_edits."""
        # must mirror the nested key construction used in _show_id_set_data_editor
        edits = st.session_state[
            ResourceEditor._get_key(
                parser_name=parser_name,
                resource=resource,
                suffix=ResourceEditor._get_key(
                    parser_name=parser_name,
                    resource=resource,
                    suffix=ResourceEditor.ASSOCIATE_ID_SET_EDITOR,
                ),
            )
        ]
        _apply_dataframe_edits(
            df=initial_df,
            data_editor_state=edits,
            dataframe_schema={
                "source": ColumnDataKind.STRING,
                "id": ColumnDataKind.STRING,
                "equivalent_id_set_id": ColumnDataKind.STRING,
            },
        )


class ParserSelector:
    """Streamlit selectbox for choosing a parser known to the resource manager."""

    # session_state key for the selectbox value
    PARSER_SELECTOR = "PARSER_SELECTOR"

    @staticmethod
    def display_parser_selector(exclude: Optional[set[str]] = None) -> None:
        """Render the parser selectbox.

        :param exclude: optional parser names to omit from the choices
        """
        manager = get_resource_manager()
        choices = (
            manager.parser_to_report.keys()
            if not exclude
            else set(manager.parser_to_report.keys()).difference(exclude)
        )
        st.selectbox(
            "SELECT PARSER",
            options=choices,
            index=None,
            key=ParserSelector.PARSER_SELECTOR,
        )

    @staticmethod
    def get_selected_parser_name() -> Optional[str]:
        """Return the currently selected parser name, or None if nothing selected."""
        return st.session_state.get(ParserSelector.PARSER_SELECTOR)
get_resource_manager() + choices = ( + manager.parser_to_report.keys() + if not exclude + else set(manager.parser_to_report.keys()).difference(exclude) + ) + st.selectbox( + "SELECT PARSER", + options=choices, + index=None, + key=ParserSelector.PARSER_SELECTOR, + ) + + @staticmethod + def get_selected_parser_name() -> Optional[str]: + return st.session_state.get(ParserSelector.PARSER_SELECTOR) diff --git a/kazu/krt/resource_discrepancy_editor/__init__.py b/kazu/krt/resource_discrepancy_editor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/kazu/krt/resource_discrepancy_editor/components.py b/kazu/krt/resource_discrepancy_editor/components.py new file mode 100644 index 00000000..c604ce42 --- /dev/null +++ b/kazu/krt/resource_discrepancy_editor/components.py @@ -0,0 +1,214 @@ +from typing import cast + +import streamlit as st +from kazu.data import MentionConfidence +from kazu.krt.components import ( + ResourceEditor, + ParserSelector, +) +from kazu.krt.resource_discrepancy_editor.utils import ( + SynonymDiscrepancy, + ResourceDiscrepancyManger, +) +from kazu.krt.utils import create_new_resource_with_updated_synonyms, get_resource_manager + + +@st.cache_resource +def get_resource_merge_manager(parser_name: str) -> ResourceDiscrepancyManger: + manager = get_resource_manager() + return ResourceDiscrepancyManger( + parser_name=parser_name, + manager=manager, + ) + + +def reset() -> None: + # type ignore as mypy doesn't pick up annotations + get_resource_manager.clear() # type: ignore[attr-defined] + get_resource_merge_manager.clear() # type: ignore[attr-defined] + + +def get_resource_merge_manager_for_parser(parser_name: str) -> ResourceDiscrepancyManger: + return get_resource_merge_manager(parser_name=parser_name) + + +class ResourceDiscrepancyResolutionForm: + """This class is used to handle the resolution of resource discrepancies. + + It provides a form interface for the user to interact with and resolve + discrepancies. 
+ """ + + ATTEMPT_AUTOFIX = "ATTEMPT_AUTOFIX" + AUTOFIX_ALREADY_ATTEMPTED = "AUTOFIX_ALREADY_ATTEMPTED" + + @staticmethod + def display_main_form(manager: ResourceDiscrepancyManger) -> None: + """Display the main form for resolving resource discrepancies. + + :param manager: + :return: + """ + st.write("select a row to resolve a discrepancy") + event = st.dataframe( + manager.summary_df(), + use_container_width=True, + selection_mode="single-row", + on_select="rerun", + hide_index=True, + column_config={"id": None}, + ) + if not ResourceDiscrepancyResolutionForm._autofix_has_been_attempted(): + ResourceDiscrepancyResolutionForm._display_attempt_autofix_button() + ResourceDiscrepancyResolutionForm._run_autofix_if_requested(manager) + + for row_id in event.get("selection", {}).get("rows", []): + ResourceDiscrepancyResolutionForm._display_discrepancy_form_for_selected_index( + index=row_id, manager=manager + ) + + @staticmethod + def _reset_form() -> None: + """Reset the form by setting the AUTOFIX_ALREADY_ATTEMPTED session state to + False.""" + st.session_state[ResourceDiscrepancyResolutionForm.AUTOFIX_ALREADY_ATTEMPTED] = False + + @staticmethod + def _display_attempt_autofix_button() -> None: + """Display a button for the user to attempt to autofix discrepancies. + + The button is disabled if autofix has already been attempted. + """ + st.button( + "Attempt to Autofix discrepancies", + key=ResourceDiscrepancyResolutionForm.ATTEMPT_AUTOFIX, + disabled=st.session_state[ResourceDiscrepancyResolutionForm.AUTOFIX_ALREADY_ATTEMPTED], + ) + + @staticmethod + def _run_autofix_if_requested(manager: ResourceDiscrepancyManger) -> None: + """If the user has requested to run autofix, apply autofix to all resources and + rerun the script. 
+ + :param manager: + :return: + """ + if st.session_state.get(ResourceDiscrepancyResolutionForm.ATTEMPT_AUTOFIX): + manager.apply_autofix_to_all() + st.session_state[ResourceDiscrepancyResolutionForm.AUTOFIX_ALREADY_ATTEMPTED] = True + st.rerun() + + @staticmethod + def set_autofix_session_state() -> None: + if ResourceDiscrepancyResolutionForm.AUTOFIX_ALREADY_ATTEMPTED not in st.session_state: + st.session_state[ResourceDiscrepancyResolutionForm.AUTOFIX_ALREADY_ATTEMPTED] = False + + @staticmethod + def _autofix_has_been_attempted() -> bool: + return cast( + bool, st.session_state[ResourceDiscrepancyResolutionForm.AUTOFIX_ALREADY_ATTEMPTED] + ) + + @staticmethod + def _submit_form_batch( + conf: MentionConfidence, cs: bool, conflict: SynonymDiscrepancy, index: int + ) -> None: + """Submit the form to resolve a discrepancy. + + All synonyms will be updated with the provided case sensitivity and confidence. + + :param conf: + :param cs: + :param conflict: + :param index: + :return: + """ + new_resource = create_new_resource_with_updated_synonyms( + new_conf=conf, new_cs=cs, resource=conflict.auto_resource + ) + parser_name = ParserSelector.get_selected_parser_name() + if parser_name: + flow = get_resource_merge_manager_for_parser(parser_name) + assert flow is not None + flow.commit( + original_human_resource=conflict.human_resource, + new_resource=new_resource, + index=index, + ) + ResourceDiscrepancyResolutionForm._reset_form() + + @staticmethod + def _submit_form_individual(discrepancy: SynonymDiscrepancy, index: int) -> None: + """Submit the form to resolve a discrepancy with individual edits. 
+ + :param discrepancy: + :param index: + :return: + """ + parser_name = ParserSelector.get_selected_parser_name() + assert parser_name is not None + flow = get_resource_merge_manager_for_parser(parser_name) + assert flow is not None + for _, new_resource in ResourceEditor.extract_form_data_from_state( + parser_name=parser_name, resources={discrepancy.auto_resource} + ): + + flow.commit( + original_human_resource=discrepancy.human_resource, + new_resource=new_resource, + index=index, + ) + ResourceDiscrepancyResolutionForm._reset_form() + + @staticmethod + def _display_discrepancy_form_for_selected_index( + index: int, manager: ResourceDiscrepancyManger + ) -> None: + """Display the discrepancy form for the selected index. + + :param index: + :param manager: + :return: + """ + discrepancy = manager.unresolved_discrepancies[index] + st.write(discrepancy.dataframe()) + form = st.radio("select a form", options=["apply to all", "edit individual"]) + if form == "apply to all": + ResourceDiscrepancyResolutionForm._display_batch_edit_form(discrepancy, index) + else: + ResourceEditor.display_resource_editor( + resources={discrepancy.auto_resource}, + on_click_override=ResourceDiscrepancyResolutionForm._submit_form_individual, + args=( + discrepancy, + index, + ), + ) + + @staticmethod + def _display_batch_edit_form(discrepancy: SynonymDiscrepancy, index: int) -> None: + """Display the batch edit form for the given discrepancy. 
@st.cache_resource
def get_manager() -> ResourceConflictManager:
    """Create (and cache for the server process) the ResourceConflictManager.

    Cached with ``st.cache_resource`` so the conflict analysis is built once
    rather than on every streamlit script rerun.
    """
    return ResourceConflictManager(manager=get_resource_manager())


def reset() -> None:
    """Clear the cached resource and conflict managers so they are rebuilt
    on next access."""
    # type ignore as mypy doesn't pick up annotations
    get_resource_manager.clear()  # type: ignore[attr-defined]
    get_manager.clear()  # type: ignore[attr-defined]
    @staticmethod
    def submit_form_for_individual_conflicts(conflicts: Iterable[ResourceConflict]) -> None:
        """Apply the user's chosen resolution to each conflict and resync.

        For OPTIMISTIC/PESSIMISTIC choices the conflict is resolved
        wholesale via ``batch_resolve``; otherwise the per-resource edits
        captured in the streamlit form state are read back through
        :class:`ResourceEditor` and recorded individually.

        :param conflicts: the conflicts to resolve.
        :return:
        """
        for conflict in conflicts:
            logging.info(f"submit form {id(conflict)}")
            if (
                st.session_state.get(StringConflictForm.STRING_CONFLICT_BATCH_APPLY)
                is CaseConflictResolutionRequest.OPTIMISTIC
            ):
                conflict.batch_resolve(True)
            elif (
                st.session_state.get(StringConflictForm.STRING_CONFLICT_BATCH_APPLY)
                is CaseConflictResolutionRequest.PESSIMISTIC
            ):
                conflict.batch_resolve(False)
            else:
                # custom resolution: pull the per-synonym edits out of the
                # form state and record them against each original resource
                for parser_name, resource_dict in conflict.parser_to_resource_to_resolution.items():

                    for (
                        original_resource,
                        new_resource,
                    ) in ResourceEditor.extract_form_data_from_state(
                        parser_name=parser_name, resources=resource_dict.keys()
                    ):
                        conflict.parser_to_resource_to_resolution[parser_name][
                            original_resource
                        ] = new_resource
            flow = get_manager()
            flow.sync_resources_for_resolved_resource_conflict_and_find_new_conflicts(conflict)

        # drop the submit flag and the cached dataframe so the conflict
        # summary is rebuilt on the next rerun
        if "submit_batch" in st.session_state:
            del st.session_state["submit_batch"]
        del st.session_state[StringConflictForm.DATAFRAME]
df.iloc[[row_id]]["id"].values[0] + conflict = get_manager().unresolved_conflicts[conflict_id] + resources = set( + resource + for resource_dict in conflict.parser_to_resource_to_resolution.values() + for resource in resource_dict + ) + ResourceEditor.display_resource_editor( + resources=resources, + on_click_override=StringConflictForm.submit_form_for_individual_conflicts, + args=([conflict],), + ) + + @staticmethod + def batch_conflict_resolution_form() -> None: + df = st.session_state[StringConflictForm.DATAFRAME] + resolution_choices = list(CaseConflictResolutionRequest) + resolution_choices.remove(CaseConflictResolutionRequest.CUSTOM) + st.radio( + "select a resolution", + options=resolution_choices, + key=StringConflictForm.STRING_CONFLICT_BATCH_APPLY, + ) + st.write("select rows to apply the resolution to") + event = st.dataframe( + df, + use_container_width=True, + selection_mode="multi-row", + on_select="rerun", + hide_index=True, + column_config={"id": None}, + key=StringConflictForm.DATAFRAME_SELECTION, + ) + conflicts = [] + for row_id in event.get("selection", {}).get("rows", []): + conflict_id = df.iloc[[row_id]]["id"].values[0] + conflict = get_manager().unresolved_conflicts[conflict_id] + conflicts.append(conflict) + + if conflicts: + disabled = False + else: + disabled = True + submitted = st.button("submit batch", key="submit_batch", disabled=disabled) + if submitted: + StringConflictForm.submit_form_for_batch_conflict_resolution(conflicts) + st.rerun() + + @staticmethod + def resolve_conflicts_form() -> None: + with st.container(border=True): + st.write(f"""{len(get_manager().unresolved_conflicts)} remaining discrepancies""") + st.radio( + "mode", + options=["apply to all", "edit individual"], + index=0, + key=StringConflictForm.FORM_PICKER, + ) + if StringConflictForm.DATAFRAME not in st.session_state: + maybe_df = get_manager().summary_df() + if not maybe_df.empty: + maybe_df = maybe_df.sort_values(by="string len", ascending=False) + 
st.session_state[StringConflictForm.DATAFRAME] = maybe_df + st.rerun() + + elif st.session_state[StringConflictForm.DATAFRAME].empty: + st.write("no conflicts found") + else: + + if st.session_state[StringConflictForm.FORM_PICKER] == "apply to all": + StringConflictForm.batch_conflict_resolution_form() + else: + df = st.session_state[StringConflictForm.DATAFRAME] + st.dataframe( + df, + use_container_width=True, + selection_mode="single-row", + on_select="rerun", + hide_index=True, + column_config={"conflict": None}, + key=StringConflictForm.DATAFRAME_SELECTION, + ) + StringConflictForm.individual_conflict_resolution_form() + + @staticmethod + def search_and_edit_synonyms_form() -> None: + with st.container(border=True): + text = st.text_input("search for a string") + if text: + manager = get_resource_manager() + maybe_resources = manager.synonym_lookup.get(text.lower()) + if not maybe_resources: + st.write("No existing resources found that contain this string") + StringConflictForm.add_new_resource_form(text) + else: + st.write( + "One or more resources already exists for this string. 
    @staticmethod
    def add_new_resource_form(text: str) -> None:
        """Render the form for creating a brand new resource for *text*.

        The user first picks a parser; a placeholder resource seeded with
        *text* is then created and opened in the resource editor, whose
        submission adds it to the chosen parser.

        :param text: the synonym string the new resource should match.
        :return:
        """
        ParserSelector.display_parser_selector()
        maybe_parser_name = ParserSelector.get_selected_parser_name()
        if maybe_parser_name:
            parser = get_resource_manager().parsers[maybe_parser_name]
            st.write(f"selected parser is {parser.name}, entity class: {parser.entity_class}")
            PlaceholderResource.create_placeholder_resource(text=text, parser_name=parser.name)
            parser_name, placeholder_resource = PlaceholderResource.get_placeholder_resource()
            ResourceEditor.display_resource_editor(
                resources={placeholder_resource},
                maybe_parser_name=parser_name,
                on_click_override=ResourceEditor.submit_form_for_addition,
            )
+ - Check out [Kazu on github](https://github.com/AstraZeneca/KAZU) + - [Documentation here](https://astrazeneca.github.io/KAZU/index.html) +""" +) diff --git a/krt/pages/1_fix_resource_discrepancies.py b/krt/pages/1_fix_resource_discrepancies.py new file mode 100644 index 00000000..89d33005 --- /dev/null +++ b/krt/pages/1_fix_resource_discrepancies.py @@ -0,0 +1,44 @@ +import streamlit as st +from kazu.krt.components import ParserSelector, show_save_button, show_reset_button +from kazu.krt.resource_discrepancy_editor.components import ( + get_resource_merge_manager_for_parser, + ResourceDiscrepancyResolutionForm, + reset, +) + + +show_save_button() +show_reset_button(reset) + +with st.expander("Description", expanded=True): + st.markdown( + """### Correct Resource Discrepancy Issues. + + Discrepancies can occur when there are inconsistencies between the + autogenerated ontology resources and their human overrides. This can occur: + + 1) After an ontology version update, where the generated + resources have changed but their human overrides haven't. + + 2) After the configuration of the [Autocurator](https://astrazeneca.github.io/KAZU/_autosummary/kazu.ontology_preprocessing.autocuration.html#kazu.ontology_preprocessing.autocuration.AutoCurator) has changed. + + 3) If the implementation of the [StringNormalizer](https://astrazeneca.github.io/KAZU/_autosummary/kazu.utils.string_normalizer.html#kazu.utils.string_normalizer.StringNormalizer) has changed. + + + This tool finds these discrepancies and prompts the user for a fix. + When saved, the necessary human resources are updated in the model pack. 
+ """ + ) + + +ParserSelector.display_parser_selector() +ResourceDiscrepancyResolutionForm.set_autofix_session_state() +parser_name = ParserSelector.get_selected_parser_name() + +if parser_name: + manager = get_resource_merge_manager_for_parser(parser_name) + with st.container(border=True): + if manager.summary_df().empty: + st.write(f"""{len(manager.unresolved_discrepancies)} discrepancies remaining""") + else: + ResourceDiscrepancyResolutionForm.display_main_form(manager) diff --git a/krt/pages/2_manage_string_matching_configuration.py b/krt/pages/2_manage_string_matching_configuration.py new file mode 100644 index 00000000..cb295f50 --- /dev/null +++ b/krt/pages/2_manage_string_matching_configuration.py @@ -0,0 +1,27 @@ +import streamlit as st +from kazu.krt.components import ( + show_save_button, + show_reset_button, +) +from kazu.krt.string_editor.components import ( + StringConflictForm, + reset, +) + +st.markdown("# String Matching Management") +show_save_button() +show_reset_button(reset) +st.write( + """This page modifies the configuration of OntologyStringResources, and finds and fixes conflicts.""" +) +choice = st.radio( + "Choose one", + options=["resolve_conflicts", "modify or add curation"], + index=0, +) + + +if choice == "resolve_conflicts": + StringConflictForm.resolve_conflicts_form() +else: + StringConflictForm.search_and_edit_synonyms_form() diff --git a/krt/pages/__init__.py b/krt/pages/__init__.py new file mode 100644 index 00000000..e69de29b From f577961885506447b45327bba5bb5e8b67d14a9d Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 14:19:43 +0100 Subject: [PATCH 11/14] postScriptGC = false as otherwise streamlit runs very slowly --- .streamlit/config.toml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .streamlit/config.toml diff --git a/.streamlit/config.toml b/.streamlit/config.toml new file mode 100644 index 00000000..1b5b282e --- /dev/null +++ b/.streamlit/config.toml @@ -0,0 +1,4 @@ +[runner] + 
+postScriptGC = false +magicEnabled = false From 67ae19a7801e05af3934796d1e411dae47ff3627 Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 14:47:42 +0100 Subject: [PATCH 12/14] made a bunch of methods private that didn't need to be public --- kazu/krt/components.py | 28 ++++++++++++++-------------- kazu/krt/string_editor/components.py | 22 +++++++++++----------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/kazu/krt/components.py b/kazu/krt/components.py index c12b6baf..387898f0 100644 --- a/kazu/krt/components.py +++ b/kazu/krt/components.py @@ -116,7 +116,7 @@ def submit_form_for_addition() -> None: PlaceholderResource.delete_placeholder() @staticmethod - def display_case_sensitivity_selector( + def _display_case_sensitivity_selector( row: list[DeltaGenerator], row_index: int, default_syn: Optional[Synonym], @@ -142,7 +142,7 @@ def display_case_sensitivity_selector( ) @staticmethod - def display_confidence_selector( + def _display_confidence_selector( row: list[DeltaGenerator], row_index: int, default_syn: Optional[Synonym], @@ -180,16 +180,16 @@ def display_case_sensitivity_and_confidence_selector( :param default_syn: :return: """ - cs = ResourceEditor.display_case_sensitivity_selector( + cs = ResourceEditor._display_case_sensitivity_selector( row=row, row_index=0, default_syn=default_syn ) - conf = ResourceEditor.display_confidence_selector( + conf = ResourceEditor._display_confidence_selector( row=row, row_index=1, default_syn=default_syn ) return cs, conf @staticmethod - def display_synonym_options_container_with_defaults( + def _display_synonym_options_container_with_defaults( resource: OntologyStringResource, synonym: Synonym, parser_name: str ) -> None: """Displays a container with the synonym string and selectors for case @@ -210,7 +210,7 @@ def display_synonym_options_container_with_defaults( synonym=synonym, suffix=ResourceEditor.CASE_SELECTOR, ) - ResourceEditor.display_case_sensitivity_selector( + 
ResourceEditor._display_case_sensitivity_selector( row=row, row_index=0, default_syn=synonym, key=cs_key ) conf_key = ResourceEditor._get_key( @@ -219,7 +219,7 @@ def display_synonym_options_container_with_defaults( synonym=synonym, suffix=ResourceEditor.CONFIDENCE_SELECTOR, ) - ResourceEditor.display_confidence_selector( + ResourceEditor._display_confidence_selector( row=row, row_index=1, default_syn=synonym, key=conf_key ) @@ -237,7 +237,7 @@ def display_synonym_editor( for resource in resources: for synonym in resource.all_synonyms(): with st.container(border=True): - ResourceEditor.display_synonym_options_container_with_defaults( + ResourceEditor._display_synonym_options_container_with_defaults( resource=resource, synonym=synonym, parser_name=parser_name ) @@ -295,7 +295,7 @@ def _display_resource_editor_components( ) for synonym in resource.all_synonyms(): with st.container(border=True): - ResourceEditor.display_synonym_options_container_with_defaults( + ResourceEditor._display_synonym_options_container_with_defaults( resource=resource, synonym=synonym, parser_name=parser_name ) @@ -364,7 +364,7 @@ def display_resource_editor( st.form_submit_button("Submit", on_click=on_click_override, args=args) else: st.form_submit_button( - "Submit", on_click=ResourceEditor.submit_form_for_edits, args=(resources,) + "Submit", on_click=ResourceEditor._submit_form_for_edits, args=(resources,) ) @staticmethod @@ -401,7 +401,7 @@ def _extract_associated_id_set_from_df(df: pd.DataFrame) -> Optional[AssociatedI return None @staticmethod - def submit_form_for_edits(resources: set[OntologyStringResource]) -> None: + def _submit_form_for_edits(resources: set[OntologyStringResource]) -> None: for parser_name, resource_set in ResourceEditor._build_parser_lookup(resources).items(): for original_resource, new_resource in ResourceEditor.extract_form_data_from_state( parser_name=parser_name, resources=resource_set @@ -449,7 +449,7 @@ def extract_form_data_from_state( yield resource, 
new_resource @staticmethod - def extract_updated_synonym_data_from_state( + def _extract_updated_synonym_data_from_state( parser_name: str, resource: OntologyStringResource, synonym: Synonym ) -> Synonym: conf = st.session_state[ @@ -477,14 +477,14 @@ def _extract_new_synonyms_from_state( new_originals = set() for synonym in resource.original_synonyms: new_originals.add( - ResourceEditor.extract_updated_synonym_data_from_state( + ResourceEditor._extract_updated_synonym_data_from_state( parser_name, resource, synonym ) ) new_alts = set() for synonym in resource.alternative_synonyms: new_alts.add( - ResourceEditor.extract_updated_synonym_data_from_state( + ResourceEditor._extract_updated_synonym_data_from_state( parser_name, resource, synonym ) ) diff --git a/kazu/krt/string_editor/components.py b/kazu/krt/string_editor/components.py index e5d755d1..4f5c8d27 100644 --- a/kazu/krt/string_editor/components.py +++ b/kazu/krt/string_editor/components.py @@ -33,7 +33,7 @@ class StringConflictForm: FORM_PICKER = "FORM_PICKER" @staticmethod - def submit_form_for_batch_conflict_resolution(conflicts: Iterable[ResourceConflict]) -> None: + def _submit_form_for_batch_conflict_resolution(conflicts: Iterable[ResourceConflict]) -> None: for conflict in conflicts: logging.info(f"submit form {id(conflict)}") if ( @@ -54,7 +54,7 @@ def submit_form_for_batch_conflict_resolution(conflicts: Iterable[ResourceConfli del st.session_state[StringConflictForm.DATAFRAME] @staticmethod - def submit_form_for_individual_conflicts(conflicts: Iterable[ResourceConflict]) -> None: + def _submit_form_for_individual_conflicts(conflicts: Iterable[ResourceConflict]) -> None: for conflict in conflicts: logging.info(f"submit form {id(conflict)}") if ( @@ -87,7 +87,7 @@ def submit_form_for_individual_conflicts(conflicts: Iterable[ResourceConflict]) del st.session_state[StringConflictForm.DATAFRAME] @staticmethod - def individual_conflict_resolution_form() -> None: + def 
_individual_conflict_resolution_form() -> None: row_ids = ( st.session_state[StringConflictForm.DATAFRAME_SELECTION] .get("selection", {}) @@ -105,12 +105,12 @@ def individual_conflict_resolution_form() -> None: ) ResourceEditor.display_resource_editor( resources=resources, - on_click_override=StringConflictForm.submit_form_for_individual_conflicts, + on_click_override=StringConflictForm._submit_form_for_individual_conflicts, args=([conflict],), ) @staticmethod - def batch_conflict_resolution_form() -> None: + def _batch_conflict_resolution_form() -> None: df = st.session_state[StringConflictForm.DATAFRAME] resolution_choices = list(CaseConflictResolutionRequest) resolution_choices.remove(CaseConflictResolutionRequest.CUSTOM) @@ -141,7 +141,7 @@ def batch_conflict_resolution_form() -> None: disabled = True submitted = st.button("submit batch", key="submit_batch", disabled=disabled) if submitted: - StringConflictForm.submit_form_for_batch_conflict_resolution(conflicts) + StringConflictForm._submit_form_for_batch_conflict_resolution(conflicts) st.rerun() @staticmethod @@ -166,7 +166,7 @@ def resolve_conflicts_form() -> None: else: if st.session_state[StringConflictForm.FORM_PICKER] == "apply to all": - StringConflictForm.batch_conflict_resolution_form() + StringConflictForm._batch_conflict_resolution_form() else: df = st.session_state[StringConflictForm.DATAFRAME] st.dataframe( @@ -178,7 +178,7 @@ def resolve_conflicts_form() -> None: column_config={"conflict": None}, key=StringConflictForm.DATAFRAME_SELECTION, ) - StringConflictForm.individual_conflict_resolution_form() + StringConflictForm._individual_conflict_resolution_form() @staticmethod def search_and_edit_synonyms_form() -> None: @@ -189,7 +189,7 @@ def search_and_edit_synonyms_form() -> None: maybe_resources = manager.synonym_lookup.get(text.lower()) if not maybe_resources: st.write("No existing resources found that contain this string") - StringConflictForm.add_new_resource_form(text) + 
StringConflictForm._add_new_resource_form(text) else: st.write( "One or more resources already exists for this string. You can edit them here:" @@ -202,10 +202,10 @@ def search_and_edit_synonyms_form() -> None: if mode == "edit existing": ResourceEditor.display_resource_editor(maybe_resources) else: - StringConflictForm.add_new_resource_form(text) + StringConflictForm._add_new_resource_form(text) @staticmethod - def add_new_resource_form(text: str) -> None: + def _add_new_resource_form(text: str) -> None: ParserSelector.display_parser_selector() maybe_parser_name = ParserSelector.get_selected_parser_name() if maybe_parser_name: From c30b6e1903b35cc8c640e3a2d2ed55e24edd7937 Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 14:48:24 +0100 Subject: [PATCH 13/14] removed unused method display_synonym_editor --- kazu/krt/components.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/kazu/krt/components.py b/kazu/krt/components.py index 387898f0..0fb98e7d 100644 --- a/kazu/krt/components.py +++ b/kazu/krt/components.py @@ -223,24 +223,6 @@ def _display_synonym_options_container_with_defaults( row=row, row_index=1, default_syn=synonym, key=conf_key ) - @staticmethod - def display_synonym_editor( - resources: Iterable[OntologyStringResource], parser_name: str - ) -> None: - """Displays an editor for each synonym in the provided resources. - - Each editor is contained within a bordered container. 
- :param resources: - :param parser_name: - :return: - """ - for resource in resources: - for synonym in resource.all_synonyms(): - with st.container(border=True): - ResourceEditor._display_synonym_options_container_with_defaults( - resource=resource, synonym=synonym, parser_name=parser_name - ) - @staticmethod def _build_parser_lookup( resources: Iterable[OntologyStringResource], From 1a8476baf4d3526ac1bad7f89f2840d502d86546 Mon Sep 17 00:00:00 2001 From: Richard Jackson Date: Thu, 13 Jun 2024 15:32:05 +0100 Subject: [PATCH 14/14] added mock_kazu_disk_cache_on_parsers to krt tests to prevent cache pollution --- kazu/tests/test_krt_managers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kazu/tests/test_krt_managers.py b/kazu/tests/test_krt_managers.py index 04a98ea6..7e91dc25 100644 --- a/kazu/tests/test_krt_managers.py +++ b/kazu/tests/test_krt_managers.py @@ -129,7 +129,7 @@ def init_discrepancy_manager() -> ResourceDiscrepancyManger: return dm -def test_resource_manager_sync(): +def test_resource_manager_sync(mock_kazu_disk_cache_on_parsers): rm = init_test_resource_manager() for parser in rm.parsers.values(): old_resources = parser.populate_metadata_db_and_resolve_string_resources()[ @@ -146,7 +146,7 @@ def test_resource_manager_sync(): assert new_resource in rm.resource_to_parsers -def test_string_conflict_manager_sync(): +def test_string_conflict_manager_sync(mock_kazu_disk_cache_on_parsers): scm = init_test_string_conflict_manager() new_resources: set[OntologyStringResource] = set() assert len(scm.unresolved_conflicts) == 2 @@ -161,7 +161,7 @@ def test_string_conflict_manager_sync(): assert new_resources.issubset(scm.manager.resource_to_parsers) -def test_discrepancy_manager_sync(): +def test_discrepancy_manager_sync(mock_kazu_disk_cache_on_parsers): dm = init_discrepancy_manager() new_resources: set[OntologyStringResource] = set() assert len(dm.unresolved_discrepancies) == 2