-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from wellcometrust/feature/ivyleavedtoadflax/prodigy_utilities: Add deep_reference_parser utilities
- Loading branch information
Showing
21 changed files
with
2,071 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .io import read_jsonl, write_jsonl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/usr/bin/env python3 | ||
# coding: utf-8 | ||
|
||
""" | ||
Utilities for loading and saving data from various formats | ||
""" | ||
|
||
import json | ||
|
||
from ..logger import logger | ||
|
||
|
||
def write_jsonl(input_data, output_file):
    """
    Write an iterable of dicts to jsonl (line delimited json).

    Output format will look like:

    ```
    {'a': 0}
    {'b': 1}
    {'c': 2}
    {'d': 3}
    ```

    Args:
        input_data(list|dict): A list of json-serialisable items, each written
            on its own line. A dict may also be passed, in which case only its
            values are written and its keys are discarded.
        output_file(str): Filename to which the jsonl will be saved.
    """
    # If passed a dict, serialise its values only; the keys are discarded.
    if isinstance(input_data, dict):
        input_data = list(input_data.values())

    logger.debug('Writing %s lines to %s', len(input_data), output_file)

    with open(output_file, 'w') as fb:
        # One JSON document per line (line delimited json).
        for item in input_data:
            fb.write(json.dumps(item) + '\n')
|
||
|
||
def _yield_jsonl(file_name):
    """Lazily yield decoded documents from a jsonl file.

    Args:
        file_name(str): Path to a line delimited json file.

    Yields:
        The object parsed from each line of the file.
    """
    # Use a context manager so the file handle is closed when the generator
    # is exhausted (the original bare open() leaked the handle).
    with open(file_name, "r") as fb:
        for row in fb:
            yield json.loads(row)
|
||
|
||
def read_jsonl(input_file):
    """Create a list from a jsonl file.

    Args:
        input_file(str): File to be loaded.
    """
    documents = []

    # Materialise the lazy generator into a list.
    for document in _yield_jsonl(input_file):
        documents.append(document)

    logger.debug('Read %s lines from %s', len(documents), input_file)

    return documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Prodigy utilities | ||
|
||
The `deep_reference_parser.prodigy` module contains a number of utility functions for working with annotations created in [prodi.gy](http://prodi.gy). | ||
|
||
The individual functions can be accessed with the usual `import deep_reference_parser.prodigy` logic, but can also be accessed on the command line with:
|
||
``` | ||
$ python -m deep_reference_parser.prodigy | ||
Using TensorFlow backend. | ||
ℹ Available commands | ||
annotate_numbered_refs, prodigy_to_tsv, reach_to_prodigy, | ||
refs_to_token_annotations | ||
``` | ||
|
||
|Name|Description| | ||
|---|---| | ||
|reach_to_prodigy|Converts a jsonl of reference sections output by reach into a jsonl containing prodigy format documents.| | ||
|annotate_numbered_refs|Takes numbered reference sections extracted by Reach, and roughly annotates the references by splitting the reference lines apart on the numbers.|
|prodigy_to_tsv|Converts a jsonl file of prodigy documents to a tab separated values (tsv) file where each token and its associated label occupy a line.| | ||
|refs_to_token_annotations|Takes a jsonl of annotated reference sections in prodigy format that have been manually annotated to the reference level, and converts the references into token level annotations based on the IOBE schema, saving a new file or prodigy documents to jsonl.| | ||
|
||
Help for each of these commands can be sought with the `--help` flag, e.g.: | ||
|
||
``` | ||
$ python -m deep_reference_parser.prodigy prodigy_to_tsv --help | ||
Using TensorFlow backend. | ||
usage: deep_reference_parser prodigy_to_tsv [-h] input_file output_file | ||
Convert token annotated jsonl to token annotated tsv ready for use in the | ||
Rodrigues model. | ||
positional arguments: | ||
input_file Path to jsonl file containing prodigy docs. | ||
output_file Path to output tsv file. | ||
optional arguments: | ||
-h, --help show this help message and exit | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .spacy_doc_to_prodigy import SpacyDocToProdigy | ||
from .reference_to_token_annotations import TokenTagger |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# coding: utf8 | ||
|
||
""" | ||
Modified from https://github.com/explosion/spaCy/blob/master/spacy/__main__.py | ||
""" | ||
|
||
if __name__ == "__main__":
    import plac
    import sys
    from wasabi import msg
    from .numbered_reference_annotator import annotate_numbered_references
    from .prodigy_to_tsv import prodigy_to_tsv
    from .reach_to_prodigy import reach_to_prodigy
    from .reference_to_token_annotations import reference_to_token_annotations

    # Map sub-command names (as typed on the command line) to the plac
    # annotated functions that implement them.
    commands = {
        "annotate_numbered_refs": annotate_numbered_references,
        "prodigy_to_tsv": prodigy_to_tsv,
        "reach_to_prodigy": reach_to_prodigy,
        "refs_to_token_annotations": reference_to_token_annotations,
    }

    # With no sub-command given, list the available ones and exit
    # (wasabi's exits=1 terminates the process).
    if len(sys.argv) == 1:
        msg.info("Available commands", ", ".join(commands), exits=1)

    # Pop the sub-command and rewrite argv[0] so that plac's generated
    # usage/help text reads "deep_reference_parser <command>".
    command = sys.argv.pop(1)
    sys.argv[0] = "deep_reference_parser %s" % command

    if command in commands:
        # plac builds the argument parser from the function's annotations
        # and dispatches the remaining argv to it.
        plac.call(commands[command], sys.argv[1:])
    else:
        available = "Available: {}".format(", ".join(commands))
        msg.fail("Unknown command: {}".format(command), available, exits=1)
149 changes: 149 additions & 0 deletions
149
deep_reference_parser/prodigy/numbered_reference_annotator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
# coding: utf-8 | ||
#!/usr/bin/env python3 | ||
|
||
import re | ||
|
||
import plac | ||
|
||
from ..io import read_jsonl, write_jsonl | ||
from ..logger import logger | ||
|
||
REGEX = r"\n{1,2}(?:(?:\s)|(?:\(|\[))?(?:\d{1,2})(?:(?:\.\)|\.\]|\]\n|\.|\s)|(?:\]|\)))(\s+)?(?:\n)?(?:\s+)?(?!Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"


class NumberedReferenceAnnotator:
    """
    Labels individual references as spans within reference sections that were
    scraped by Reach (in prodigy format) and carry numeric reference markers,
    by splitting the text on those markers with a regular expression.

    Note that numbered reference sections must be identified first, for
    example with a simple textcat model trained in prodigy.
    """

    def __init__(self):
        # Pattern used to locate numeric reference markers; replaced by the
        # caller-supplied pattern on each call to run().
        self.regex = r""

    def run(self, docs, regex=REGEX):
        """Yield each doc with its "spans" set to the detected references.

        Args:
            docs: Iterable of prodigy docs with "text" and "tokens" keys.
            regex(str): Pattern matching the numeric reference markers.
        """
        self.regex = regex

        for doc in docs:
            doc["spans"] = self.label_numbered_references(
                doc["text"], doc["tokens"]
            )
            yield doc

    def label_numbered_references(self, text, tokens):
        """Build span dicts for the references found between regex matches."""
        # Each pair of consecutive marker matches brackets one reference.
        matches = list(re.finditer(self.regex, text))
        spans = []

        for left, right in zip(matches, matches[1:]):
            # Approximate character offsets taken from the marker matches.
            char_start = left.end()
            char_end = right.start()

            # Snap those offsets to the nearest token boundaries...
            token_start = self._find_closest_token(tokens, char_start, "start")
            token_end = self._find_closest_token(tokens, char_end, "end")

            # ...then recompute the character offsets from the chosen tokens,
            # so the character and token offsets cannot disagree.
            char_start = self._get_token_offset(tokens, token_start, "start")
            char_end = self._get_token_offset(tokens, token_end, "end")

            spans.append({
                "start": char_start,
                "end": char_end,
                "token_start": token_start,
                "token_end": token_end,
                "label": "BE"
            })

        return spans

    def _find_closest_token(self, tokens, char_offset, pos_string):
        """
        Return the id of the token whose start/end is closest to char_offset.

        Args:
            tokens: A list of token dicts from a prodigy document.
            char_offset(int): A character offset relating to either the start
                or the end of a token.
            pos_string(str): One of ["start", "end"] denoting whether
                `char_offset` is a start or the end of a token.
        """
        position_to_id = self._token_start_mapper(tokens, pos_string)
        closest = self._find_closest_number(position_to_id.keys(), char_offset)

        return position_to_id[closest]

    def _get_token_offset(self, tokens, token_id, pos_string):
        """
        Return the character offset for the token with id == token_id,
        or None if no such token exists.
        """
        for token in tokens:
            if token["id"] == token_id:
                return token[pos_string]

        return None

    def _find_closest_number(self, numbers, number):
        """Return the member of numbers nearest to number (first on ties)."""
        return min(numbers, key=lambda candidate: abs(candidate - number))

    def _token_start_mapper(self, tokens, pos_string):
        """Map each token's start/end character position to its token id."""
        return {token[pos_string]: token["id"] for token in tokens}
|
||
|
||
@plac.annotations(
    input_file=(
        "Path to jsonl file containing numbered reference sections as docs.",
        "positional",
        None,
        str
    ),
    output_file=(
        "Path to output jsonl file containing prodigy docs with numbered references labelled.",
        "positional",
        None,
        str
    )
)
def annotate_numbered_references(input_file, output_file):
    """
    Takes reference sections with numeric labelling scraped by Reach in prodigy
    format, and labels the references as spans by splitting them using regex.

    Args:
        input_file(str): Path to a jsonl file of prodigy docs containing
            numbered reference sections.
        output_file(str): Path to which the annotated docs are written.
    """
    numbered_reference_sections = read_jsonl(input_file)

    logger.info("Loaded %s prodigy docs", len(numbered_reference_sections))

    nra = NumberedReferenceAnnotator()

    # BUG FIX: run is a method and must be called, not subscripted
    # (was `nra.run[...]`, a TypeError at runtime).
    docs = list(nra.run(numbered_reference_sections))

    # BUG FIX: write_jsonl takes the data to write as its first argument
    # (was `write_jsonl(output_file)`, which wrote nothing and raised).
    write_jsonl(docs, output_file)

    logger.info("Wrote %s annotated references to %s", len(docs),
                output_file)
Oops, something went wrong.