
Commit

Merge pull request #5 from wellcometrust/feature/ivyleavedtoadflax/prodigy_utilities

Add deep_reference_parser utilities
ivyleavedtoadflax authored Feb 17, 2020
2 parents 67efc81 + 0855a5c commit dc40e8d
Showing 21 changed files with 2,071 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -165,7 +165,7 @@ To train your own models you will need to define the model hyperparameters in a
python -m deep_reference_parser train test.ini
```

-Data must be prepared in the following tab separated format (tsv). We may publish further tools in the future to assist in the preparation of data following annotation. In this case the data the data for reference span ddetection follows an IOBE schema.
+Data must be prepared in the following tab separated format (tsv). We use [prodi.gy](https://prodi.gy) for annotations. Some utilities to help with manual annotations and various format conversions are available in the [prodigy](./prodigy/) module. Data for reference span detection follows an IOBE schema.

You must provide the train/test/validation data splits in this format in pre-prepared files that are defined in the config file.

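For illustration, a fragment of the tsv format mentioned in the new README text above might look like the following, with one token and its tab separated label per line. The tokens and exact IOBE tag names shown here are hypothetical and depend on the annotation scheme used.

```
1	o
.	o
Smith	b-r
J	i-r
2019	i-r
.	e-r
```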
6 changes: 3 additions & 3 deletions deep_reference_parser/__main__.py
@@ -9,12 +9,12 @@
import plac
import sys
from wasabi import msg
-from deep_reference_parser.train import train
-from deep_reference_parser.predict import predict
+from .train import train
+from .predict import predict

commands = {
"train": train,
"predict": predict,
"train": train,
}

if len(sys.argv) == 1:
1 change: 1 addition & 0 deletions deep_reference_parser/io/__init__.py
@@ -0,0 +1 @@
from .io import read_jsonl, write_jsonl
63 changes: 63 additions & 0 deletions deep_reference_parser/io/io.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
# coding: utf-8

"""
Utilities for loading and saving data from various formats
"""

import json

from ..logger import logger


def write_jsonl(input_data, output_file):
"""
Write a dict to jsonl (line delimited json)
Output format will look like:
```
{'a': 0}
{'b': 1}
{'c': 2}
{'d': 3}
```
Args:
input_data(dict): A dict to be written to json.
output_file(str): Filename to which the jsonl will be saved.
"""

with open(output_file, 'w') as fb:

# Check if a dict (and convert to list if so)

        if isinstance(input_data, dict):
            input_data = list(input_data.values())

# Write out to jsonl file

logger.debug('Writing %s lines to %s', len(input_data), output_file)

for i in input_data:
json_ = json.dumps(i) + '\n'
fb.write(json_)


def _yield_jsonl(file_name):
    with open(file_name, "r") as fb:
        for row in fb:
            yield json.loads(row)


def read_jsonl(input_file):
"""Create a list from a jsonl file
Args:
input_file(str): File to be loaded.
"""

out = list(_yield_jsonl(input_file))

logger.debug('Read %s lines from %s', len(out), input_file)

return out
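Taken together, these helpers support a simple round trip. A minimal sketch (the file name is hypothetical):

```
from deep_reference_parser.io import read_jsonl, write_jsonl

# Write a list of dicts to line delimited json, one object per line.
docs = [{"text": "Reference 1"}, {"text": "Reference 2"}]
write_jsonl(docs, "docs.jsonl")

# read_jsonl parses each line back into a dict and returns a list.
assert read_jsonl("docs.jsonl") == docs
```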
42 changes: 42 additions & 0 deletions deep_reference_parser/prodigy/README.md
@@ -0,0 +1,42 @@
# Prodigy utilities

The `deep_reference_parser.prodigy` module contains a number of utility functions for working with annotations created in [prodi.gy](http://prodi.gy).

The individual functions can be accessed with the usual `import deep_reference_parser.prodigy` logic, but they can also be called from the command line:

```
$ python -m deep_reference_parser.prodigy
Using TensorFlow backend.
ℹ Available commands
annotate_numbered_refs, prodigy_to_tsv, reach_to_prodigy,
refs_to_token_annotations
```

|Name|Description|
|---|---|
|reach_to_prodigy|Converts a jsonl of reference sections output by Reach into a jsonl containing prodigy format documents.|
|annotate_numbered_refs|Takes numbered reference sections extracted by Reach, and roughly annotates the references by splitting the reference lines apart on the numbers.|
|prodigy_to_tsv|Converts a jsonl file of prodigy documents to a tab separated values (tsv) file where each token and its associated label occupy a line.|
|refs_to_token_annotations|Takes a jsonl of reference sections in prodigy format that have been manually annotated to the reference level, and converts the references into token level annotations based on the IOBE schema, saving a new file of prodigy documents to jsonl.|

Help for each of these commands is available via the `--help` flag, e.g.:

```
$ python -m deep_reference_parser.prodigy prodigy_to_tsv --help
Using TensorFlow backend.
usage: deep_reference_parser prodigy_to_tsv [-h] input_file output_file
Convert token annotated jsonl to token annotated tsv ready for use in the
Rodrigues model.
positional arguments:
input_file Path to jsonl file containing prodigy docs.
output_file Path to output tsv file.
optional arguments:
-h, --help show this help message and exit
```
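A typical invocation might then look like the following (the file names are hypothetical):

```
$ python -m deep_reference_parser.prodigy prodigy_to_tsv annotated_docs.jsonl token_labels.tsv
```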

2 changes: 2 additions & 0 deletions deep_reference_parser/prodigy/__init__.py
@@ -0,0 +1,2 @@
from .spacy_doc_to_prodigy import SpacyDocToProdigy
from .reference_to_token_annotations import TokenTagger
33 changes: 33 additions & 0 deletions deep_reference_parser/prodigy/__main__.py
@@ -0,0 +1,33 @@
# coding: utf8

"""
Modified from https://github.com/explosion/spaCy/blob/master/spacy/__main__.py
"""

if __name__ == "__main__":
import plac
import sys
from wasabi import msg
from .numbered_reference_annotator import annotate_numbered_references
from .prodigy_to_tsv import prodigy_to_tsv
from .reach_to_prodigy import reach_to_prodigy
from .reference_to_token_annotations import reference_to_token_annotations

commands = {
"annotate_numbered_refs": annotate_numbered_references,
"prodigy_to_tsv": prodigy_to_tsv,
"reach_to_prodigy": reach_to_prodigy,
"refs_to_token_annotations": reference_to_token_annotations,
}

if len(sys.argv) == 1:
msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = "deep_reference_parser %s" % command

if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
available = "Available: {}".format(", ".join(commands))
msg.fail("Unknown command: {}".format(command), available, exits=1)
149 changes: 149 additions & 0 deletions deep_reference_parser/prodigy/numbered_reference_annotator.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python3
# coding: utf-8

import re

import plac

from ..io import read_jsonl, write_jsonl
from ..logger import logger

REGEX = r"\n{1,2}(?:(?:\s)|(?:\(|\[))?(?:\d{1,2})(?:(?:\.\)|\.\]|\]\n|\.|\s)|(?:\]|\)))(\s+)?(?:\n)?(?:\s+)?(?!Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"
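# Reading of the pattern above (an assumption about its intent, not documented
# in the original): it matches the numbering that precedes each reference,
# e.g. "\n1. ", "\n[2] " or "\n(3) ", while the trailing negative lookahead
# avoids splitting on dates such as "12 Jan".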

class NumberedReferenceAnnotator:
"""
Takes reference sections with numeric labelling scraped by Reach in prodigy
format, and labels the references as spans by splitting them using regex.
    Note that you must identify numbered reference sections first. This can be
done with a simple textcat model trained in prodigy.
"""

def __init__(self):

self.regex = r""

def run(self, docs, regex=REGEX):

self.regex = regex

for doc in docs:

spans = self.label_numbered_references(doc["text"], doc["tokens"])
doc["spans"] = spans

yield doc

def label_numbered_references(self, text, tokens):

        # Search for numbered reference markers using the regex

splits = list(re.finditer(self.regex, text))
spans = []

for index in range(0, len(splits) - 1):

# Calculate the approximate start and end of the reference using
# the character offsets returned by re.finditer.

start = splits[index].end()
end = splits[index + 1].start()
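            # Note that because references are taken between consecutive
            # markers, any text following the final numbering marker is not
            # labelled as a span.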

# Calculate which is the closest token to the character offset
# returned above.

token_start = self._find_closest_token(tokens, start, "start")
token_end = self._find_closest_token(tokens, end, "end")

# To avoid the possibility of mismatches between the character
# offset and the token offset, reset the character offsets
# based on the token offsets.

start = self._get_token_offset(tokens, token_start, "start")
end = self._get_token_offset(tokens, token_end, "end")

# Create dict and append

span = {
"start": start,
"end": end,
"token_start": token_start,
"token_end": token_end,
"label": "BE"
}

spans.append(span)

return spans


def _find_closest_token(self, tokens, char_offset, pos_string):
"""
        Find the id of the token whose start/end is closest to `char_offset`
        Args:
            tokens: A list of token dicts from a prodigy document.
            char_offset(int): A character offset relating to either the start or the
                end of a token.
            pos_string(str): One of ["start", "end"] denoting whether `char_offset`
                is the start or the end of a token
"""
token_map = self._token_start_mapper(tokens, pos_string)
token_key = self._find_closest_number(token_map.keys(), char_offset)

return token_map[token_key]

def _get_token_offset(self, tokens, token_id, pos_string):
"""
Return the character offset for the token with id == token_id
"""

token_match = (token[pos_string] for token in tokens if token["id"] == token_id)

return next(token_match, None)

def _find_closest_number(self, numbers, number):
""" Find the closest match in a list of numbers when presented with
a number
"""

return min(numbers, key=lambda x:abs(x - number))

def _token_start_mapper(self, tokens, pos_string):
""" Map token id by the token start/end position
"""

return {token[pos_string]:token["id"] for token in tokens}


@plac.annotations(
input_file=(
"Path to jsonl file containing numbered reference sections as docs.",
"positional",
None,
str
),
output_file=(
"Path to output jsonl file containing prodigy docs with numbered references labelled.",
"positional",
None,
str
)
)
def annotate_numbered_references(input_file, output_file):
"""
Takes reference sections with numeric labelling scraped by Reach in prodigy
format, and labels the references as spans by splitting them using regex.
"""

numbered_reference_sections = read_jsonl(input_file)

logger.info("Loaded %s prodigy docs", len(numbered_reference_sections))

nra = NumberedReferenceAnnotator()
    docs = list(nra.run(numbered_reference_sections))

    write_jsonl(docs, output_file)

logger.info("Wrote %s annotated references to %s", len(docs),
output_file)
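Below is a minimal sketch of driving the annotator from Python rather than from the command line (the file names are hypothetical):

```
from deep_reference_parser.io import read_jsonl, write_jsonl
from deep_reference_parser.prodigy.numbered_reference_annotator import (
    NumberedReferenceAnnotator,
)

# Load prodigy docs (with "text" and "tokens" fields) that each contain a
# numbered reference section.
docs = read_jsonl("numbered_sections.jsonl")

# run() is a generator that yields each doc with a "spans" list attached,
# one span per reference found by the regex splitter.
nra = NumberedReferenceAnnotator()
annotated = list(nra.run(docs))

write_jsonl(annotated, "annotated_sections.jsonl")
```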
