
Commit

Merge pull request #5 from wellcometrust/feature/ivyleavedtoadflax/prodigy_utilities

Add deep_reference_parser utilities
ivyleavedtoadflax authored Feb 17, 2020
2 parents 67efc81 + 0855a5c commit dc40e8d
Showing 21 changed files with 2,071 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -165,7 +165,7 @@ To train your own models you will need to define the model hyperparameters in a
python -m deep_reference_parser train test.ini
```

-Data must be prepared in the following tab separated format (tsv). We may publish further tools in the future to assist in the preparation of data following annotation. In this case the data the data for reference span ddetection follows an IOBE schema.
+Data must be prepared in the following tab separated format (tsv). We use [prodi.gy](https://prodi.gy) for annotations. Some utilities to help with manual annotations and various format conversions are available in the [prodigy](./prodigy/) module. Data for reference span detection follows an IOBE schema.

You must provide the train/test/validation data splits in this format in pre-prepared files that are defined in the config file.

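For illustration, a fragment of the tsv format mentioned in the new README text above might look like the following, with one token and its tab separated label per line. The tokens and exact IOBE tag names shown here are hypothetical and depend on the annotation scheme used.

```
1	o
.	o
Smith	b-r
J	i-r
2019	i-r
.	e-r
```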
6 changes: 3 additions & 3 deletions deep_reference_parser/__main__.py
@@ -9,12 +9,12 @@
import plac
import sys
from wasabi import msg
-from deep_reference_parser.train import train
-from deep_reference_parser.predict import predict
+from .train import train
+from .predict import predict

commands = {
"train": train,
"predict": predict,
"train": train,
}

if len(sys.argv) == 1:
1 change: 1 addition & 0 deletions deep_reference_parser/io/__init__.py
@@ -0,0 +1 @@
from .io import read_jsonl, write_jsonl
63 changes: 63 additions & 0 deletions deep_reference_parser/io/io.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
# coding: utf-8

"""
Utilities for loading and saving data from various formats
"""

import json

from ..logger import logger


def write_jsonl(input_data, output_file):
"""
Write a dict to jsonl (line delimited json)
Output format will look like:
```
{'a': 0}
{'b': 1}
{'c': 2}
{'d': 3}
```
Args:
input_data(dict): A dict to be written to json.
output_file(str): Filename to which the jsonl will be saved.
"""

with open(output_file, 'w') as fb:

# Check if a dict (and convert to list if so)

        if isinstance(input_data, dict):
            input_data = list(input_data.values())

# Write out to jsonl file

logger.debug('Writing %s lines to %s', len(input_data), output_file)

for i in input_data:
json_ = json.dumps(i) + '\n'
fb.write(json_)


def _yield_jsonl(file_name):
    with open(file_name, "r") as fb:
        for row in fb:
            yield json.loads(row)


def read_jsonl(input_file):
"""Create a list from a jsonl file
Args:
input_file(str): File to be loaded.
"""

out = list(_yield_jsonl(input_file))

logger.debug('Read %s lines from %s', len(out), input_file)

return out
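Taken together, these helpers support a simple round trip. A minimal sketch (the file name is hypothetical):

```
from deep_reference_parser.io import read_jsonl, write_jsonl

# Write a list of dicts to line delimited json, one object per line.
docs = [{"text": "Reference 1"}, {"text": "Reference 2"}]
write_jsonl(docs, "docs.jsonl")

# read_jsonl parses each line back into a dict and returns a list.
assert read_jsonl("docs.jsonl") == docs
```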
42 changes: 42 additions & 0 deletions deep_reference_parser/prodigy/README.md
@@ -0,0 +1,42 @@
# Prodigy utilities

The `deep_reference_parser.prodigy` module contains a number of utility functions for working with annotations created in [prodi.gy](http://prodi.gy).

The individual functions can be accessed with the usual `import deep_reference_parser.prodigy` logic, but they can also be called from the command line:

```
$ python -m deep_reference_parser.prodigy
Using TensorFlow backend.
ℹ Available commands
annotate_numbered_refs, prodigy_to_tsv, reach_to_prodigy,
refs_to_token_annotations
```

|Name|Description|
|---|---|
|reach_to_prodigy|Converts a jsonl of reference sections output by Reach into a jsonl containing prodigy format documents.|
|annotate_numbered_refs|Takes numbered reference sections extracted by Reach, and roughly annotates the references by splitting the reference lines apart on the numbers.|
|prodigy_to_tsv|Converts a jsonl file of prodigy documents to a tab separated values (tsv) file where each token and its associated label occupy a line.|
|refs_to_token_annotations|Takes a jsonl of reference sections in prodigy format that have been manually annotated to the reference level, and converts the references into token level annotations based on the IOBE schema, saving a new file of prodigy documents to jsonl.|

Help for each of these commands is available via the `--help` flag, e.g.:

```
$ python -m deep_reference_parser.prodigy prodigy_to_tsv --help
Using TensorFlow backend.
usage: deep_reference_parser prodigy_to_tsv [-h] input_file output_file
Convert token annotated jsonl to token annotated tsv ready for use in the
Rodrigues model.
positional arguments:
input_file Path to jsonl file containing prodigy docs.
output_file Path to output tsv file.
optional arguments:
-h, --help show this help message and exit
```
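A typical invocation might then look like the following (the file names are hypothetical):

```
$ python -m deep_reference_parser.prodigy prodigy_to_tsv annotated_docs.jsonl token_labels.tsv
```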

2 changes: 2 additions & 0 deletions deep_reference_parser/prodigy/__init__.py
@@ -0,0 +1,2 @@
from .spacy_doc_to_prodigy import SpacyDocToProdigy
from .reference_to_token_annotations import TokenTagger
33 changes: 33 additions & 0 deletions deep_reference_parser/prodigy/__main__.py
@@ -0,0 +1,33 @@
# coding: utf8

"""
Modified from https://github.com/explosion/spaCy/blob/master/spacy/__main__.py
"""

if __name__ == "__main__":
import plac
import sys
from wasabi import msg
from .numbered_reference_annotator import annotate_numbered_references
from .prodigy_to_tsv import prodigy_to_tsv
from .reach_to_prodigy import reach_to_prodigy
from .reference_to_token_annotations import reference_to_token_annotations

commands = {
"annotate_numbered_refs": annotate_numbered_references,
"prodigy_to_tsv": prodigy_to_tsv,
"reach_to_prodigy": reach_to_prodigy,
"refs_to_token_annotations": reference_to_token_annotations,
}

if len(sys.argv) == 1:
msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = "deep_reference_parser %s" % command

if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
available = "Available: {}".format(", ".join(commands))
msg.fail("Unknown command: {}".format(command), available, exits=1)
149 changes: 149 additions & 0 deletions deep_reference_parser/prodigy/numbered_reference_annotator.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python3
# coding: utf-8

import re

import plac

from ..io import read_jsonl, write_jsonl
from ..logger import logger

REGEX = r"\n{1,2}(?:(?:\s)|(?:\(|\[))?(?:\d{1,2})(?:(?:\.\)|\.\]|\]\n|\.|\s)|(?:\]|\)))(\s+)?(?:\n)?(?:\s+)?(?!Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"
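# Reading of the pattern above (an assumption about its intent, not documented
# in the original): it matches the numbering that precedes each reference,
# e.g. "\n1. ", "\n[2] " or "\n(3) ", while the trailing negative lookahead
# avoids splitting on dates such as "12 Jan".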

class NumberedReferenceAnnotator:
"""
Takes reference sections with numeric labelling scraped by Reach in prodigy
format, and labels the references as spans by splitting them using regex.
    Note that you must identify numbered reference sections first. This can be
done with a simple textcat model trained in prodigy.
"""

def __init__(self):

self.regex = r""

def run(self, docs, regex=REGEX):

self.regex = regex

for doc in docs:

spans = self.label_numbered_references(doc["text"], doc["tokens"])
doc["spans"] = spans

yield doc

def label_numbered_references(self, text, tokens):

        # Search for numbered reference markers using the regex

splits = list(re.finditer(self.regex, text))
spans = []

for index in range(0, len(splits) - 1):

# Calculate the approximate start and end of the reference using
# the character offsets returned by re.finditer.

start = splits[index].end()
end = splits[index + 1].start()
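            # Note that because references are taken between consecutive
            # markers, any text following the final numbering marker is not
            # labelled as a span.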

# Calculate which is the closest token to the character offset
# returned above.

token_start = self._find_closest_token(tokens, start, "start")
token_end = self._find_closest_token(tokens, end, "end")

# To avoid the possibility of mismatches between the character
# offset and the token offset, reset the character offsets
# based on the token offsets.

start = self._get_token_offset(tokens, token_start, "start")
end = self._get_token_offset(tokens, token_end, "end")

# Create dict and append

span = {
"start": start,
"end": end,
"token_start": token_start,
"token_end": token_end,
"label": "BE"
}

spans.append(span)

return spans


def _find_closest_token(self, tokens, char_offset, pos_string):
"""
        Find the id of the token whose start/end is closest to `char_offset`
        Args:
            tokens: A list of token dicts from a prodigy document.
            char_offset(int): A character offset relating to either the start or the
                end of a token.
            pos_string(str): One of ["start", "end"] denoting whether `char_offset`
                is the start or the end of a token
"""
token_map = self._token_start_mapper(tokens, pos_string)
token_key = self._find_closest_number(token_map.keys(), char_offset)

return token_map[token_key]

def _get_token_offset(self, tokens, token_id, pos_string):
"""
Return the character offset for the token with id == token_id
"""

token_match = (token[pos_string] for token in tokens if token["id"] == token_id)

return next(token_match, None)

def _find_closest_number(self, numbers, number):
""" Find the closest match in a list of numbers when presented with
a number
"""

return min(numbers, key=lambda x:abs(x - number))

def _token_start_mapper(self, tokens, pos_string):
""" Map token id by the token start/end position
"""

return {token[pos_string]:token["id"] for token in tokens}


@plac.annotations(
input_file=(
"Path to jsonl file containing numbered reference sections as docs.",
"positional",
None,
str
),
output_file=(
"Path to output jsonl file containing prodigy docs with numbered references labelled.",
"positional",
None,
str
)
)
def annotate_numbered_references(input_file, output_file):
"""
Takes reference sections with numeric labelling scraped by Reach in prodigy
format, and labels the references as spans by splitting them using regex.
"""

numbered_reference_sections = read_jsonl(input_file)

logger.info("Loaded %s prodigy docs", len(numbered_reference_sections))

nra = NumberedReferenceAnnotator()
    docs = list(nra.run(numbered_reference_sections))

    write_jsonl(docs, output_file)

logger.info("Wrote %s annotated references to %s", len(docs),
output_file)
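Below is a minimal sketch of driving the annotator from Python rather than from the command line (the file names are hypothetical):

```
from deep_reference_parser.io import read_jsonl, write_jsonl
from deep_reference_parser.prodigy.numbered_reference_annotator import (
    NumberedReferenceAnnotator,
)

# Load prodigy docs (with "text" and "tokens" fields) that each contain a
# numbered reference section.
docs = read_jsonl("numbered_sections.jsonl")

# run() is a generator that yields each doc with a "spans" list attached,
# one span per reference found by the regex splitter.
nra = NumberedReferenceAnnotator()
annotated = list(nra.run(docs))

write_jsonl(annotated, "annotated_sections.jsonl")
```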
