multilabel ner training #79

Merged · Dec 6, 2024

Commits (35)
87000fd
updated label studio API to be compatible with latest version, which …
RichJackson Oct 23, 2024
e18f4fd
LLMNER step now has options to parse results from structured outputs,…
RichJackson Oct 23, 2024
b04a1f6
added additional Entity filter actions to support clean parsing of LL…
RichJackson Oct 23, 2024
072088c
added missing param to LSWebUtils
RichJackson Oct 23, 2024
31f9638
fixed type hint for update view
RichJackson Oct 23, 2024
8a553bc
added training code for multilabel NER
RichJackson Oct 23, 2024
fe87417
added pyarrow to mypy overrides
RichJackson Oct 23, 2024
1b2070b
added epoch completion fraction to TrainingConfig
RichJackson Oct 24, 2024
8a5615a
fixed bug with non-json serialised ents in metadata not appearing in …
RichJackson Oct 24, 2024
6ee718a
minor code cleanups
RichJackson Oct 24, 2024
ebb8629
added tests for new LLMNER cleanup actions
RichJackson Oct 24, 2024
622bf4c
added annotate_with_llm.py example script
RichJackson Oct 24, 2024
ed750c3
added example configs for training and annotate_with_llm
RichJackson Oct 24, 2024
b560cf0
fix: make parameter configurable for different architectures
paluchasz Oct 24, 2024
3502309
renamed eval to test so we can add an eval stage at some future point…
RichJackson Oct 24, 2024
4f883a3
fixed docstrings for modelling.py
RichJackson Oct 24, 2024
524836c
fixed param keys for test_path and test_data_cache_dir
RichJackson Oct 24, 2024
d29a6c2
added scaling and training documentation, with required nitpicks
RichJackson Oct 24, 2024
64f4600
keys_to_use is now inferred from architecture
RichJackson Oct 24, 2024
a2cf1ec
towncrier: scaling and NER training
RichJackson Oct 24, 2024
2a6bafa
added missing entity_key config param to StructuredOutputResultParser
RichJackson Oct 24, 2024
a283f2e
default epoch_completion_fraction_before_evals is now 0.75 of an epoch
RichJackson Oct 24, 2024
e26487c
removed unused keys_to_use from config
RichJackson Oct 24, 2024
64c9ffb
false pos and neg items now correctly updated
RichJackson Oct 24, 2024
2efd307
fixed mypy ignore for some strange linux issue
RichJackson Oct 24, 2024
e239c64
refactor: reusability and edge cases
paluchasz Nov 7, 2024
1e03b9b
feat: add predict script to test trained model
paluchasz Nov 7, 2024
1836f1d
refactor: delete specific params
paluchasz Nov 7, 2024
4855298
refactor: separate out parquet to kazu docs conversion
paluchasz Nov 15, 2024
505e6c9
fix: remove custom paths from config
paluchasz Nov 15, 2024
1b5b6e9
fix: issue with fps and move function out of class
paluchasz Nov 25, 2024
167107d
refactor: retrieve labels from the model
paluchasz Nov 25, 2024
5734563
feat: add separate eval stage
paluchasz Nov 25, 2024
74cd82c
refactor: move functions to a reusable module
paluchasz Nov 25, 2024
9b4f3ba
refactor: save out predictions in eval script
paluchasz Dec 3, 2024
1 change: 1 addition & 0 deletions docs/_changelog.d/+scaling.feature.rst
@@ -0,0 +1 @@
added scaling kazu with Ray docs and example.
1 change: 1 addition & 0 deletions docs/_changelog.d/+training.feature.rst
@@ -0,0 +1 @@
added multilabel NER training example and config.
16 changes: 15 additions & 1 deletion docs/conf.py
@@ -213,6 +213,19 @@ def linkcode_resolve(domain: str, info: dict[str, Any]) -> Union[str, None]:
nitpick_ignore = [
# this doesn't appear to have an entry in the transformers docs for some reason.
("py:class", "transformers.models.bert.modeling_bert.BertPreTrainedModel"),
("py:class", "transformers.models.bert.modeling_bert.BertForTokenClassification"),
("py:class", "transformers.configuration_utils.PretrainedConfig"),
(
"py:class",
"transformers.models.deberta_v2.modeling_deberta_v2.DebertaV2ForTokenClassification",
),
(
"py:class",
"transformers.models.distilbert.modeling_distilbert.DistilBertForTokenClassification",
),
# pytorch doesn't have an objects.inv file, so we can't link to it directly
("py:obj", "torch.LongTensor"),
("py:class", "torch.LongTensor"),
# the kazu.utils.grouping.Key TypeVar tries to generate this automatically.
# Sphinx doesn't find it because the class is in _typeshed, which doesn't exist at runtime.
# We link to _typeshed docs from the docstring anyway, so this is fine for the user.
@@ -247,8 +260,9 @@ def linkcode_resolve(domain: str, info: dict[str, Any]) -> Union[str, None]:
# pydantic uses mkdocs, not Sphinx, and doesn't seem to have full API docs
("py:class", "pydantic.main.BaseModel"),
# ray does have sphinx docs (at https://docs.ray.io/en/latest/ , but we don't need them for anything else)
# but it doesn't have a reference in its docs for ObjectRef (suprisingly)
# but it doesn't have a reference in its docs for a bunch of stuff (suprisingly)
("py:class", "ray._raylet.ObjectRef"),
("py:class", "ray.util.queue.Queue"),
# regex doesn't seem to have API docs at all
("py:class", "_regex.Pattern"),
("py:class", "urllib3.util.retry.Retry"),
1 change: 1 addition & 0 deletions docs/index.rst
@@ -21,6 +21,7 @@ Welcome to Kazu's documentation!
The Kazu Resource Tool <kazu_resource_tool>
Curating a knowledge base for NER and Linking <curating_a_knowledgebase>
Scaling with Ray <scaling_kazu>
Building a multilabel NER model with Kazu <training_multilabel_ner>
Kazu as a WebService <kazu_webservice>
Using Kazu as a library <kazu_as_a_library>
Development Setup <development_setup>
3 changes: 3 additions & 0 deletions docs/quickstart.rst
@@ -1,3 +1,6 @@
.. _quickstart:


Quickstart
==========

39 changes: 37 additions & 2 deletions docs/scaling_kazu.rst
@@ -1,2 +1,37 @@
TBA
====
.. _scaling_kazu:

Scaling with Ray
=================


Usually, we want to run Kazu over a large number of documents, so we need a framework to handle the distributed processing.

`Ray <https://www.ray.io//>`_ is a simple-to-use, actor-style framework that works extremely well for this. In this example,
we demonstrate how Ray can be used to scale Kazu over multiple cores.

.. note::
Ray can also be used in a multi-node environment for extreme scaling. Please refer to the Ray docs for this.



Overview
-----------

We'll use the Kazu :class:`.LLMNERStep` with some clean-up actions to build a Kazu pipeline. We'll then create multiple
Ray actors to instantiate this pipeline, and feed those actors Kazu :class:`.Document`\s through a :class:`ray.util.queue.Queue`\.
The actors will process the documents and write the results to another :class:`ray.util.queue.Queue`\. The main process will then
read from this second queue and write the results to disk.
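
A minimal, self-contained sketch of this pattern is shown below. It is illustrative only: a trivial
callable stands in for the Kazu pipeline, and plain strings stand in for :class:`.Document`\s, so that
the sketch runs on its own.

.. code-block:: python

    import ray
    from ray.util.queue import Queue

    ray.init()


    @ray.remote
    class PipelineActor:
        # In the real script this would hold a Kazu pipeline (LLMNERStep plus
        # clean-up actions) instantiated from the Hydra config; a trivial
        # callable stands in here so the sketch is self-contained.
        def __init__(self, in_queue: Queue, out_queue: Queue) -> None:
            self.pipeline = lambda docs: [doc.upper() for doc in docs]
            self.in_queue = in_queue
            self.out_queue = out_queue

        def run(self) -> None:
            while True:
                batch = self.in_queue.get()  # blocks until work (or a sentinel) arrives
                if batch is None:  # sentinel: no more work
                    break
                self.out_queue.put(self.pipeline(batch))


    in_queue, out_queue = Queue(maxsize=100), Queue(maxsize=100)
    actors = [PipelineActor.remote(in_queue, out_queue) for _ in range(4)]
    run_refs = [actor.run.remote() for actor in actors]

    batches = [["doc one", "doc two"], ["doc three"]]  # stand-ins for batches of kazu Documents
    for batch in batches:
        in_queue.put(batch)
    for _ in actors:
        in_queue.put(None)  # one sentinel per actor

    # the main process drains the output queue; the real script writes the results to disk
    results = [out_queue.get() for _ in range(len(batches))]
    ray.get(run_refs)
    print(results)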

The code for this orchestration is in ``scripts/examples/annotate_with_llm.py`` and the configuration is in
``scripts/examples/conf/annotate_with_llm/default.yaml``.

The script can be executed with:

.. code-block:: console

$ python scripts/examples/annotate_with_llm.py --config-path /<fully qualified>/kazu/scripts/examples/conf hydra.job.chdir=True


.. note::
You will need to add values for the configuration keys marked ``???``, such as your input directory, Vertex config, etc.
43 changes: 43 additions & 0 deletions docs/training_multilabel_ner.rst
@@ -0,0 +1,43 @@
Build an amazing NER model from LLM-annotated data!
====================================================

Intro
-----

LLMs are REALLY good at BioNER (with some gentle guidance). However, they may be too expensive to use over large corpora of
documents. Instead, we can train classical multi-label BERT-style classifiers using data produced from LLMs (licence restrictions notwithstanding).

This document briefly describes the workflow to do this.


Creating training data
-----------------------

First, we need an LLM to annotate a bunch of documents for us, and we may need to clean up its sometimes unpredictable output.
To do this, follow the instructions described in :ref:`scaling_kazu`\. Then split the data into ``train/test/eval`` folders, as sketched below.
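
As a minimal sketch, a random 80/10/10 split of the annotated document JSON files could look like the
following (the directory names here are assumptions; use whatever layout your annotation run produced):

.. code-block:: python

    import random
    import shutil
    from pathlib import Path

    random.seed(42)
    source = Path("llm_annotated_docs")  # directory written by the annotation run (assumed name)
    files = sorted(source.glob("*.json"))
    random.shuffle(files)

    n = len(files)
    splits = {
        "train": files[: int(0.8 * n)],
        "test": files[int(0.8 * n) : int(0.9 * n)],
        "eval": files[int(0.9 * n) :],
    }
    for split_name, split_files in splits.items():
        target = Path("data") / split_name
        target.mkdir(parents=True, exist_ok=True)
        for f in split_files:
            shutil.copy(f, target / f.name)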

Running the training
---------------------

We need the script ``kazu/training/train_script.py`` and the configuration from ``scripts/examples/conf/multilabel_ner_training/default.yaml``.


.. note::
This script expects you to have an instance of `LabelStudio <https://labelstud.io//>`_ running, so you can visualise the
results after each evaluation step. We recommend Docker for this.


Then run the script with:



.. code-block:: console

$ python -m training.train_script --config-path /<fully qualified>/kazu/scripts/examples/conf hydra.job.chdir=True \
multilabel_ner_training.test_path=<path to test docs> \
multilabel_ner_training.train_path=<path to train docs> \
multilabel_ner_training.training_data_cache_dir=<path to training data dir to cache docs> \
multilabel_ner_training.test_data_cache_dir=<path to test data dir to cache docs> \
multilabel_ner_training.label_studio_manager.headers.Authorisation="Token <your ls token>"

More options are available via :class:`kazu.training.config.TrainingConfig`\.
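
To see what can be overridden without digging through the source, something like the following should
work (a sketch only; it assumes :class:`kazu.training.config.TrainingConfig` is a dataclass):

.. code-block:: python

    import dataclasses

    from kazu.training.config import TrainingConfig

    # print each configurable field with its type and default (if any)
    for field in dataclasses.fields(TrainingConfig):
        default = field.default if field.default is not dataclasses.MISSING else "<required>"
        print(f"{field.name}: {field.type} = {default}")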
66 changes: 39 additions & 27 deletions kazu/annotation/label_studio.py
@@ -36,19 +36,27 @@ class KazuToLabelStudioConverter:
"""

@classmethod
def convert_multiple_docs_to_tasks(cls, docs: Iterable[set[Document]]) -> Iterable[dict]:
def convert_multiple_docs_to_tasks(cls, docs: Iterable[list[Document]]) -> Iterable[dict]:
"""If you want to utilise multiple annotation views in label studio, you can
supply an iterable of sets of kazu documents annotated by different pipelines.
The entity information from each will be added to an independent annotation set
in label studio.

index 0 of the sublist is assumed to be ground truth.

:param docs:
:return:
"""
for differently_annotated_parallel_docs in docs:
all_tasks = (
cls.convert_single_doc_to_tasks(doc) for doc in differently_annotated_parallel_docs
all_tasks = []
all_tasks.append(
cls.convert_single_doc_to_tasks(differently_annotated_parallel_docs[0], True)
)
all_tasks.extend(
cls.convert_single_doc_to_tasks(doc, False)
for doc in differently_annotated_parallel_docs[1:]
)

for parallel_tasks in zip(*all_tasks, strict=True):
first_task = parallel_tasks[0]
other_tasks = parallel_tasks[1:]
@@ -61,36 +69,42 @@ def convert_multiple_docs_to_tasks(cls, docs: Iterable[set[Document]]) -> Iterab
# The extend here results in a list with a set of annotations for every original doc,
# which is what we want to signal to label studio 'this task has been annotated differently
# by several different annotation processes
result["annotations"].extend(chain(t["annotations"] for t in other_tasks))
result["annotations"].extend(chain(t["annotations"][0] for t in other_tasks))
yield result

@classmethod
def convert_single_doc_to_tasks(cls, doc: Document) -> Iterable[dict]:
doc_id = doc.idx
def convert_single_doc_to_tasks(cls, doc: Document, ground_truth: bool) -> Iterable[dict]:

for i, section in enumerate(doc.sections):
idx = f"{doc_id}_{section.name}_{i}"
data = {}
data: dict[str, Any] = {}
data["text"] = section.text
data["id"] = idx
meta: dict[str, Any] = {}
meta["doc_id"] = doc.idx
meta["section_sequence"] = i
data["meta"] = meta
annotations = []
result_values = cls._create_label_studio_labels(section.entities, section.text)
annotation = {"id": idx, "result": result_values}
annotation: dict[str, Any] = {"result": result_values}
if ground_truth:
annotation["ground_truth"] = True
else:
annotation["ground_truth"] = False
annotations.append(annotation)
yield {"data": data, "annotations": annotations}

@classmethod
def convert_docs_to_tasks(cls, docs: list[Document]) -> list[dict]:
return [task for doc in docs for task in cls.convert_single_doc_to_tasks(doc)]
return [task for doc in docs for task in cls.convert_single_doc_to_tasks(doc, False)]

@staticmethod
def _create_label_studio_labels(
entities: list[Entity],
text: str,
) -> list[dict]:
result_values: list[dict] = []
region_id = 0
for ent in entities:
ent_hash = hash(ent)
prev_region_id: Optional[str] = None
prev_region_id: Optional[int] = None
if len(ent.spans) > 2:
logger.warning(
"""Currently we can't handle entities with 3 spans.
@@ -101,29 +115,27 @@ def _create_label_studio_labels(
Adding this warning as a safeguard"""
)
for span in ent.spans:
region_id_str = f"{ent_hash}_{span}"
match = text[span.start : span.end]
ner_region = KazuToLabelStudioConverter._create_ner_region(
ent, region_id_str, span, match
ent, region_id, span, match
)
result_values.append(ner_region)
result_normalisation_value = KazuToLabelStudioConverter._create_mapping_region(
ent, region_id_str, span, match
ent, region_id, span, match
)
result_values.append(result_normalisation_value)
if prev_region_id is not None:
result_values.append(
KazuToLabelStudioConverter._create_non_contig_entity_links(
prev_region_id, region_id_str
prev_region_id, region_id
)
)
prev_region_id = region_id_str
prev_region_id = region_id
region_id += 1
return result_values

@staticmethod
def _create_non_contig_entity_links(
from_id: str, to_id: str
) -> dict[str, Union[str, list[str]]]:
def _create_non_contig_entity_links(from_id: int, to_id: int) -> dict[str, Any]:
return {
"from_id": from_id,
"to_id": to_id,
@@ -134,7 +146,7 @@ def _create_non_contig_entity_links(

@staticmethod
def _create_mapping_region(
ent: Entity, region_id: str, span: CharSpan, match: str
ent: Entity, region_id: int, span: CharSpan, match: str
) -> dict[str, Any]:

return {
@@ -162,7 +174,7 @@ def _create_mapping_region(

@staticmethod
def _create_ner_region(
ent: Entity, region_id: str, span: CharSpan, match: str
ent: Entity, region_id: int, span: CharSpan, match: str
) -> dict[str, Any]:
return {
"id": region_id,
@@ -552,7 +564,7 @@ def update_view(self, view: LabelStudioAnnotationView, docs: list[Document]) ->
pass

@overload
def update_view(self, view: LabelStudioAnnotationView, docs: list[set[Document]]) -> None:
def update_view(self, view: LabelStudioAnnotationView, docs: list[list[Document]]) -> None:
pass

def update_view(self, view, docs): # type: ignore[no-untyped-def]
@@ -565,7 +577,7 @@ def update_view(self, view, docs): # type: ignore[no-untyped-def]
information will form a seperate annotation set in label studio.
:return:
"""
if isinstance(docs[0], set):
if isinstance(docs[0], list):
tasks = list(KazuToLabelStudioConverter.convert_multiple_docs_to_tasks(docs))
else:
tasks = KazuToLabelStudioConverter.convert_docs_to_tasks(docs)
@@ -589,7 +601,7 @@ def update_tasks(self, docs: list[Document]) -> None:
pass

@overload
def update_tasks(self, docs: list[set[Document]]) -> None:
def update_tasks(self, docs: list[list[Document]]) -> None:
pass

def update_tasks(self, docs): # type: ignore[no-untyped-def]
@@ -601,7 +613,7 @@ def update_tasks(self, docs): # type: ignore[no-untyped-def]
information will form a seperate annotation set in label studio.
:return:
"""
if isinstance(docs[0], set):
if isinstance(docs[0], list):
tasks = list(KazuToLabelStudioConverter.convert_multiple_docs_to_tasks(docs))
else:
tasks = KazuToLabelStudioConverter.convert_docs_to_tasks(docs)