multilabel ner training #79

Merged · Dec 6, 2024

Commits (35)
87000fd
updated label studio API to be compatible with latest version, which …
RichJackson Oct 23, 2024
e18f4fd
LLMNER step now has options to parse results from structured outputs,…
RichJackson Oct 23, 2024
b04a1f6
added additional Entity filter actions to support clean parsing of LL…
RichJackson Oct 23, 2024
072088c
added missing param to LSWebUtils
RichJackson Oct 23, 2024
31f9638
fixed type hint for update view
RichJackson Oct 23, 2024
8a553bc
added training code for multilabel NER
RichJackson Oct 23, 2024
fe87417
added pyarrow to mypy overrides
RichJackson Oct 23, 2024
1b2070b
added epoch completion fraction to TrainingConfig
RichJackson Oct 24, 2024
8a5615a
fixed bug with non-json serialised ents in metadata not appearing in …
RichJackson Oct 24, 2024
6ee718a
minor code cleanups
RichJackson Oct 24, 2024
ebb8629
added tests for new LLMNER cleanup actions
RichJackson Oct 24, 2024
622bf4c
added annotate_with_llm.py example script
RichJackson Oct 24, 2024
ed750c3
added example configs for training and annotate_with_llm
RichJackson Oct 24, 2024
b560cf0
fix: make parameter configurable for different architectures
paluchasz Oct 24, 2024
3502309
renamed eval to test so we can add an eval stage at some future point…
RichJackson Oct 24, 2024
4f883a3
fixed docstrings for modelling.py
RichJackson Oct 24, 2024
524836c
fixed param keys for test_path and test_data_cache_dir
RichJackson Oct 24, 2024
d29a6c2
added scaling and training documentation, with required nitpicks
RichJackson Oct 24, 2024
64f4600
keys_to_use is now inferred from architecture
RichJackson Oct 24, 2024
a2cf1ec
towncrier: scaling and NER training
RichJackson Oct 24, 2024
2a6bafa
added missing entity_key config param to StructuredOutputResultParser
RichJackson Oct 24, 2024
a283f2e
default epoch_completion_fraction_before_evals is now 0.75 of an epoch
RichJackson Oct 24, 2024
e26487c
removed unused keys_to_use from config
RichJackson Oct 24, 2024
64c9ffb
false pos and neg items now correctly updated
RichJackson Oct 24, 2024
2efd307
fixed mypy ignore for some strange linux issue
RichJackson Oct 24, 2024
e239c64
refactor: reusability and edge cases
paluchasz Nov 7, 2024
1e03b9b
feat: add predict script to test trained model
paluchasz Nov 7, 2024
1836f1d
refactor: delete specific params
paluchasz Nov 7, 2024
4855298
refactor: separate out parquet to kazu docs conversion
paluchasz Nov 15, 2024
505e6c9
fix: remove custom paths from config
paluchasz Nov 15, 2024
1b5b6e9
fix: issue with fps and move function out of class
paluchasz Nov 25, 2024
167107d
refactor: retrieve labels from the model
paluchasz Nov 25, 2024
5734563
feat: add separate eval stage
paluchasz Nov 25, 2024
74cd82c
refactor: move functions to a reusable module
paluchasz Nov 25, 2024
9b4f3ba
refactor: save out predictions in eval script
paluchasz Dec 3, 2024
1 change: 1 addition & 0 deletions docs/_changelog.d/+scaling.feature.rst
@@ -0,0 +1 @@
added scaling kazu with Ray docs and example.
1 change: 1 addition & 0 deletions docs/_changelog.d/+training.feature.rst
@@ -0,0 +1 @@
added multilabel NER training example and config.
16 changes: 15 additions & 1 deletion docs/conf.py
@@ -213,6 +213,19 @@ def linkcode_resolve(domain: str, info: dict[str, Any]) -> Union[str, None]:
nitpick_ignore = [
# this doesn't appear to have an entry in the transformers docs for some reason.
("py:class", "transformers.models.bert.modeling_bert.BertPreTrainedModel"),
("py:class", "transformers.models.bert.modeling_bert.BertForTokenClassification"),
("py:class", "transformers.configuration_utils.PretrainedConfig"),
(
"py:class",
"transformers.models.deberta_v2.modeling_deberta_v2.DebertaV2ForTokenClassification",
),
(
"py:class",
"transformers.models.distilbert.modeling_distilbert.DistilBertForTokenClassification",
),
# pytorch doesn't have an objects.inv file, so we can't link to it directly
("py:obj", "torch.LongTensor"),
("py:class", "torch.LongTensor"),
# the kazu.utils.grouping.Key TypeVar tries to generate this automatically.
# Sphinx doesn't find it because the class is in _typeshed, which doesn't exist at runtime.
# We link to _typeshed docs from the docstring anyway, so this is fine for the user.
@@ -247,8 +260,9 @@ def linkcode_resolve(domain: str, info: dict[str, Any]) -> Union[str, None]:
# pydantic uses mkdocs, not Sphinx, and doesn't seem to have full API docs
("py:class", "pydantic.main.BaseModel"),
# ray does have sphinx docs (at https://docs.ray.io/en/latest/ , but we don't need them for anything else)
# but it doesn't have a reference in its docs for ObjectRef (suprisingly)
# but it doesn't have a reference in its docs for a bunch of stuff (suprisingly)
("py:class", "ray._raylet.ObjectRef"),
("py:class", "ray.util.queue.Queue"),
# regex doesn't seem to have API docs at all
("py:class", "_regex.Pattern"),
("py:class", "urllib3.util.retry.Retry"),
1 change: 1 addition & 0 deletions docs/index.rst
@@ -21,6 +21,7 @@ Welcome to Kazu's documentation!
The Kazu Resource Tool <kazu_resource_tool>
Curating a knowledge base for NER and Linking <curating_a_knowledgebase>
Scaling with Ray <scaling_kazu>
Building a multilabel NER model with Kazu <training_multilabel_ner>
Kazu as a WebService <kazu_webservice>
Using Kazu as a library <kazu_as_a_library>
Development Setup <development_setup>
3 changes: 3 additions & 0 deletions docs/quickstart.rst
@@ -1,3 +1,6 @@
.. _quickstart:


Quickstart
==========

39 changes: 37 additions & 2 deletions docs/scaling_kazu.rst
@@ -1,2 +1,37 @@
TBA
====
.. _scaling_kazu:

Scaling with Ray
=================


Usually, we want to run Kazu over a large number of documents, so we need a framework to handle the distributed processing.

`Ray <https://www.ray.io//>`_ is a simple-to-use, actor-style framework that works extremely well for this. In this example,
we demonstrate how Ray can be used to scale Kazu over multiple cores.

.. note::
Ray can also be used in a multi-node environment for extreme scaling. Please refer to the Ray docs for this.



Overview
-----------

We'll use the Kazu :class:`.LLMNERStep` with some clean-up actions to build a Kazu pipeline. We'll then create multiple
Ray actors to instantiate this pipeline, and feed those actors Kazu :class:`.Document`\s through a :class:`ray.util.queue.Queue`\.
The actors will process the documents and write the results to another :class:`ray.util.queue.Queue`\. The main process will then
read from this second queue and write the results to disk.
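
A minimal, self-contained sketch of this pattern is shown below. It is illustrative only: a trivial
callable stands in for the Kazu pipeline, and plain strings stand in for :class:`.Document`\s, so that
the sketch runs on its own.

.. code-block:: python

    import ray
    from ray.util.queue import Queue

    ray.init()


    @ray.remote
    class PipelineActor:
        # In the real script this would hold a Kazu pipeline (LLMNERStep plus
        # clean-up actions) instantiated from the Hydra config; a trivial
        # callable stands in here so the sketch is self-contained.
        def __init__(self, in_queue: Queue, out_queue: Queue) -> None:
            self.pipeline = lambda docs: [doc.upper() for doc in docs]
            self.in_queue = in_queue
            self.out_queue = out_queue

        def run(self) -> None:
            while True:
                batch = self.in_queue.get()  # blocks until work (or a sentinel) arrives
                if batch is None:  # sentinel: no more work
                    break
                self.out_queue.put(self.pipeline(batch))


    in_queue, out_queue = Queue(maxsize=100), Queue(maxsize=100)
    actors = [PipelineActor.remote(in_queue, out_queue) for _ in range(4)]
    run_refs = [actor.run.remote() for actor in actors]

    batches = [["doc one", "doc two"], ["doc three"]]  # stand-ins for batches of kazu Documents
    for batch in batches:
        in_queue.put(batch)
    for _ in actors:
        in_queue.put(None)  # one sentinel per actor

    # the main process drains the output queue; the real script writes the results to disk
    results = [out_queue.get() for _ in range(len(batches))]
    ray.get(run_refs)
    print(results)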

The code for this orchestration is in ``scripts/examples/annotate_with_llm.py`` and the configuration is in
``scripts/examples/conf/annotate_with_llm/default.yaml``.

The script can be executed with:

.. code-block:: console

$ python scripts/examples/annotate_with_llm.py --config-path /<fully qualified>/kazu/scripts/examples/conf hydra.job.chdir=True


.. note::
You will need to add values for the configuration keys marked ``???``, such as your input directory, Vertex config, etc.
43 changes: 43 additions & 0 deletions docs/training_multilabel_ner.rst
@@ -0,0 +1,43 @@
Build an amazing NER model from LLM-annotated data!
====================================================

Intro
-----

LLMs are REALLY good at BioNER (with some gentle guidance). However, they may be too expensive to use over large corpora of
documents. Instead, we can train classical multi-label BERT-style classifiers using data produced from LLMs (licence restrictions notwithstanding).

This document briefly describes the workflow to do this.


Creating training data
-----------------------

First, we need an LLM to annotate a bunch of documents for us, and we may need to clean up its sometimes unpredictable output.
To do this, follow the instructions described in :ref:`scaling_kazu`\. Then split the data into ``train/test/eval`` folders, as sketched below.
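
As a minimal sketch, a random 80/10/10 split of the annotated document JSON files could look like the
following (the directory names here are assumptions; use whatever layout your annotation run produced):

.. code-block:: python

    import random
    import shutil
    from pathlib import Path

    random.seed(42)
    source = Path("llm_annotated_docs")  # directory written by the annotation run (assumed name)
    files = sorted(source.glob("*.json"))
    random.shuffle(files)

    n = len(files)
    splits = {
        "train": files[: int(0.8 * n)],
        "test": files[int(0.8 * n) : int(0.9 * n)],
        "eval": files[int(0.9 * n) :],
    }
    for split_name, split_files in splits.items():
        target = Path("data") / split_name
        target.mkdir(parents=True, exist_ok=True)
        for f in split_files:
            shutil.copy(f, target / f.name)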

Running the training
---------------------

We need the script ``kazu/training/train_script.py`` and the configuration from ``scripts/examples/conf/multilabel_ner_training/default.yaml``.


.. note::
This script expects you to have an instance of `LabelStudio <https://labelstud.io//>`_ running, so you can visualise the
results after each evaluation step. We recommend Docker for this.


Then run the script with:



.. code-block:: console

$ python -m training.train_script --config-path /<fully qualified>/kazu/scripts/examples/conf hydra.job.chdir=True \
multilabel_ner_training.test_path=<path to test docs> \
multilabel_ner_training.train_path=<path to train docs> \
multilabel_ner_training.training_data_cache_dir=<path to training data dir to cache docs> \
multilabel_ner_training.test_data_cache_dir=<path to test data dir to cache docs> \
multilabel_ner_training.label_studio_manager.headers.Authorisation="Token <your ls token>"

More options are available via :class:`kazu.training.config.TrainingConfig`\.
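
To see what can be overridden without digging through the source, something like the following should
work (a sketch only; it assumes :class:`kazu.training.config.TrainingConfig` is a dataclass):

.. code-block:: python

    import dataclasses

    from kazu.training.config import TrainingConfig

    # print each configurable field with its type and default (if any)
    for field in dataclasses.fields(TrainingConfig):
        default = field.default if field.default is not dataclasses.MISSING else "<required>"
        print(f"{field.name}: {field.type} = {default}")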
66 changes: 39 additions & 27 deletions kazu/annotation/label_studio.py
@@ -36,19 +36,27 @@ class KazuToLabelStudioConverter:
"""

@classmethod
def convert_multiple_docs_to_tasks(cls, docs: Iterable[set[Document]]) -> Iterable[dict]:
def convert_multiple_docs_to_tasks(cls, docs: Iterable[list[Document]]) -> Iterable[dict]:
"""If you want to utilise multiple annotation views in label studio, you can
supply an iterable of sets of kazu documents annotated by different pipelines.
The entity information from each will be added to an independent annotation set
in label studio.

index 0 of the sublist is assumed to be ground truth.

:param docs:
:return:
"""
for differently_annotated_parallel_docs in docs:
all_tasks = (
cls.convert_single_doc_to_tasks(doc) for doc in differently_annotated_parallel_docs
all_tasks = []
all_tasks.append(
cls.convert_single_doc_to_tasks(differently_annotated_parallel_docs[0], True)
)
all_tasks.extend(
cls.convert_single_doc_to_tasks(doc, False)
for doc in differently_annotated_parallel_docs[1:]
)

for parallel_tasks in zip(*all_tasks, strict=True):
first_task = parallel_tasks[0]
other_tasks = parallel_tasks[1:]
@@ -61,36 +69,42 @@ def convert_multiple_docs_to_tasks(cls, docs: Iterable[set[Document]]) -> Iterab
# The extend here results in a list with a set of annotations for every original doc,
# which is what we want to signal to label studio 'this task has been annotated differently
# by several different annotation processes
result["annotations"].extend(chain(t["annotations"] for t in other_tasks))
result["annotations"].extend(chain(t["annotations"][0] for t in other_tasks))
yield result

@classmethod
def convert_single_doc_to_tasks(cls, doc: Document) -> Iterable[dict]:
doc_id = doc.idx
def convert_single_doc_to_tasks(cls, doc: Document, ground_truth: bool) -> Iterable[dict]:

for i, section in enumerate(doc.sections):
idx = f"{doc_id}_{section.name}_{i}"
data = {}
data: dict[str, Any] = {}
data["text"] = section.text
data["id"] = idx
meta: dict[str, Any] = {}
meta["doc_id"] = doc.idx
meta["section_sequence"] = i
data["meta"] = meta
annotations = []
result_values = cls._create_label_studio_labels(section.entities, section.text)
annotation = {"id": idx, "result": result_values}
annotation: dict[str, Any] = {"result": result_values}
if ground_truth:
annotation["ground_truth"] = True
else:
annotation["ground_truth"] = False
annotations.append(annotation)
yield {"data": data, "annotations": annotations}

@classmethod
def convert_docs_to_tasks(cls, docs: list[Document]) -> list[dict]:
return [task for doc in docs for task in cls.convert_single_doc_to_tasks(doc)]
return [task for doc in docs for task in cls.convert_single_doc_to_tasks(doc, False)]

@staticmethod
def _create_label_studio_labels(
entities: list[Entity],
text: str,
) -> list[dict]:
result_values: list[dict] = []
region_id = 0
for ent in entities:
ent_hash = hash(ent)
prev_region_id: Optional[str] = None
prev_region_id: Optional[int] = None
if len(ent.spans) > 2:
logger.warning(
"""Currently we can't handle entities with 3 spans.
@@ -101,29 +115,27 @@ def _create_label_studio_labels(
Adding this warning as a safeguard"""
)
for span in ent.spans:
region_id_str = f"{ent_hash}_{span}"
match = text[span.start : span.end]
ner_region = KazuToLabelStudioConverter._create_ner_region(
ent, region_id_str, span, match
ent, region_id, span, match
)
result_values.append(ner_region)
result_normalisation_value = KazuToLabelStudioConverter._create_mapping_region(
ent, region_id_str, span, match
ent, region_id, span, match
)
result_values.append(result_normalisation_value)
if prev_region_id is not None:
result_values.append(
KazuToLabelStudioConverter._create_non_contig_entity_links(
prev_region_id, region_id_str
prev_region_id, region_id
)
)
prev_region_id = region_id_str
prev_region_id = region_id
region_id += 1
return result_values

@staticmethod
def _create_non_contig_entity_links(
from_id: str, to_id: str
) -> dict[str, Union[str, list[str]]]:
def _create_non_contig_entity_links(from_id: int, to_id: int) -> dict[str, Any]:
return {
"from_id": from_id,
"to_id": to_id,
@@ -134,7 +146,7 @@ def _create_non_contig_entity_links(

@staticmethod
def _create_mapping_region(
ent: Entity, region_id: str, span: CharSpan, match: str
ent: Entity, region_id: int, span: CharSpan, match: str
) -> dict[str, Any]:

return {
@@ -162,7 +174,7 @@ def _create_mapping_region(

@staticmethod
def _create_ner_region(
ent: Entity, region_id: str, span: CharSpan, match: str
ent: Entity, region_id: int, span: CharSpan, match: str
) -> dict[str, Any]:
return {
"id": region_id,
@@ -552,7 +564,7 @@ def update_view(self, view: LabelStudioAnnotationView, docs: list[Document]) ->
pass

@overload
def update_view(self, view: LabelStudioAnnotationView, docs: list[set[Document]]) -> None:
def update_view(self, view: LabelStudioAnnotationView, docs: list[list[Document]]) -> None:
pass

def update_view(self, view, docs): # type: ignore[no-untyped-def]
@@ -565,7 +577,7 @@ def update_view(self, view, docs): # type: ignore[no-untyped-def]
information will form a seperate annotation set in label studio.
:return:
"""
if isinstance(docs[0], set):
if isinstance(docs[0], list):
tasks = list(KazuToLabelStudioConverter.convert_multiple_docs_to_tasks(docs))
else:
tasks = KazuToLabelStudioConverter.convert_docs_to_tasks(docs)
@@ -589,7 +601,7 @@ def update_tasks(self, docs: list[Document]) -> None:
pass

@overload
def update_tasks(self, docs: list[set[Document]]) -> None:
def update_tasks(self, docs: list[list[Document]]) -> None:
pass

def update_tasks(self, docs): # type: ignore[no-untyped-def]
@@ -601,7 +613,7 @@ def update_tasks(self, docs): # type: ignore[no-untyped-def]
information will form a seperate annotation set in label studio.
:return:
"""
if isinstance(docs[0], set):
if isinstance(docs[0], list):
tasks = list(KazuToLabelStudioConverter.convert_multiple_docs_to_tasks(docs))
else:
tasks = KazuToLabelStudioConverter.convert_docs_to_tasks(docs)