Merge pull request #19 from wellcometrust/reorganise
Reorganise sub-module structure
ivyleavedtoadflax authored Mar 18, 2020
2 parents d8ec4df + 2e7da2e commit ae2fcf2
Showing 15 changed files with 504 additions and 622 deletions.
16 changes: 6 additions & 10 deletions deep_reference_parser/__init__.py
@@ -2,9 +2,9 @@
# distracting on the command line. These lines here (while undesirable)
# reduce the level of verbosity.

import os
import sys
import warnings
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

@@ -19,20 +19,16 @@

from .common import download_model_artefact
from .deep_reference_parser import DeepReferenceParser
from .logger import logger
from .model_utils import get_config
from .reference_utils import (
break_into_chunks,
labels_to_prodigy,
from .io import (
load_tsv,
prodigy_to_conll,
prodigy_to_lists,
read_jsonl,
read_pickle,
write_json,
write_jsonl,
write_pickle,
write_to_csv,
write_txt,
write_tsv,
)
from .logger import logger
from .model_utils import get_config
from .reference_utils import break_into_chunks
from .tokens_to_references import tokens_to_references
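The I/O helpers now live in the new io sub-module but remain re-exported at the package root, so both import paths below should resolve to the same functions. A minimal sketch, assuming the package is installed:

    # Both forms resolve to the same functions: the package root
    # re-exports what deep_reference_parser.io defines.
    from deep_reference_parser import load_tsv, write_tsv
    from deep_reference_parser.io import load_tsv, write_tsv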
2 changes: 1 addition & 1 deletion deep_reference_parser/__version__.py
@@ -1,5 +1,5 @@
__name__ = "deep_reference_parser"
__version__ = "2020.3.0"
__version__ = "2020.3.1"
__description__ = "Deep learning model for finding and parsing references"
__url__ = "https://github.com/wellcometrust/deep_reference_parser"
__author__ = "Wellcome Trust DataLabs Team"
4 changes: 2 additions & 2 deletions deep_reference_parser/deep_reference_parser.py
@@ -47,7 +47,7 @@
save_confusion_matrix,
word2vec_embeddings,
)
from .reference_utils import load_tsv, read_pickle, write_pickle, write_to_csv
from .io import load_tsv, read_pickle, write_pickle, write_to_csv


class DeepReferenceParser:
@@ -456,7 +456,7 @@ def build_model(

self.model = model

logger.debug(self.model.summary(line_length=150))
# logger.debug(self.model.summary(line_length=150))

def train_model(
self, epochs=25, batch_size=100, early_stopping_patience=5, metric="val_f1"
3 changes: 2 additions & 1 deletion deep_reference_parser/io/__init__.py
@@ -1 +1,2 @@
from .io import read_jsonl, write_jsonl
from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle,
write_to_csv, write_tsv)
133 changes: 133 additions & 0 deletions deep_reference_parser/io/io.py
@@ -6,9 +6,74 @@
"""

import json
import pickle
import csv
import os
import pandas as pd

from ..logger import logger

def _split_list_by_linebreaks(tokens):
"""Cycle through a list of tokens (or labels) and split them into lists
based on the presence of Nones or more likely math.nan caused by converting
pd.DataFrame columns to lists.
"""
out = []
tokens_gen = iter(tokens)
while True:
try:
token = next(tokens_gen)
if isinstance(token, str) and token:
out.append(token)
else:
yield out
out = []
except StopIteration:
if out:
yield out
break
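In other words, the generator yields one sub-list per contiguous run of non-empty strings, treating anything else (None, or the NaN that pandas inserts for blank lines) as a separator. A small sketch with made-up input:

    import math

    tokens = ["References", "1", math.nan, "WHO", "treatment"]
    print(list(_split_list_by_linebreaks(tokens)))
    # [['References', '1'], ['WHO', 'treatment']]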

def load_tsv(filepath, split_char="\t"):
"""
Load and return the data stored in the given path.
Expects data in the following format (tab separations).
References o o
o o
1 o o
. o o
o o
WHO title b-r
treatment title i-r
guidelines title i-r
for title i-r
drug title i-r
- title i-r
resistant title i-r
tuberculosis title i-r
, title i-r
2016 title i-r
Args:
filepath (str): Path to the data.
split_char(str): Character to be used to split each line of the
document.
Returns:
A tuple of lists, one per column in the file; each list holds one
sub-list of tokens (or labels) per example.
"""

df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
out = [list(_split_list_by_linebreaks(column)) for _, column in df.iteritems()]

logger.info("Loaded %s training examples", len(out[0]))

return tuple(out)
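A usage sketch for a file shaped like the docstring example above (one token column and two label columns); the filename and the unpacked names are hypothetical:

    tokens, labels_1, labels_2 = load_tsv("references.tsv")
    # tokens[0] is the first example as a list of strings,
    # with labels_1[0] and labels_2[0] aligned to it token for token.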

def write_jsonl(input_data, output_file):
"""
@@ -61,3 +61,71 @@ def read_jsonl(input_file):
logger.debug("Read %s lines from %s", len(out), input_file)

return out


def write_to_csv(filename, columns, rows):
"""
Create a .csv file from data given as columns and rows
Args:
filename(str): Path to the output .csv file
columns(list): Columns of the csv file (First row of the file)
rows: Data to write into the csv file, given per row
"""

with open(filename, "w") as csvfile:
wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
wr.writerow(columns)

for row in rows:
wr.writerow(row)
logger.info("Wrote results to %s", filename)
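A minimal call sketch with made-up columns and rows:

    write_to_csv(
        "predictions.csv",
        ["token", "label"],
        [["WHO", "b-r"], ["2016", "i-r"]],
    )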


def write_pickle(input_data, output_file, path=None):
"""
Write an object to pickle
Args:
input_data(dict): A dict to be pickled.
output_file(str): A filename or path to which the pickle will be saved.
path(str): A string which will be prepended onto `output_file` with
`os.path.join()`. Obviates the need for lengthy `os.path.join`
statements each time this function is called.
"""

if path:
output_file = os.path.join(path, output_file)

with open(output_file, "wb") as fb:
pickle.dump(input_data, fb)


def read_pickle(input_file, path=None):
"""Create a list from a jsonl file
Args:
input_file(str): File to be loaded.
path(str): A string which will be prepended onto `input_file` with
`os.path.join()`. Obviates the need for lengthy `os.path.join`
statements each time this function is called.
"""

if path:
input_file = os.path.join(path, input_file)

with open(input_file, "rb") as fb:
out = pickle.load(fb)

logger.debug("Read data from %s", input_file)

return out
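A round-trip sketch for the two pickle helpers; the directory and filename are hypothetical, and the directory is assumed to exist already:

    write_pickle({"word2ind": {"WHO": 1}}, "indices.pickle", path="models")
    indices = read_pickle("indices.pickle", path="models")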

def write_tsv(token_label_pairs, output_path):
"""
Write rows of token/label pairs to a .tsv file on disk
"""
with open(output_path, "w") as fb:
writer = csv.writer(fb, delimiter="\t")
writer.writerows(token_label_pairs)
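A usage sketch mirroring the format that load_tsv reads back; an empty tuple writes the blank line that separates examples:

    pairs = [("WHO", "b-r"), ("2016", "i-r"), ()]
    write_tsv(pairs, "train.tsv")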
2 changes: 2 additions & 0 deletions deep_reference_parser/prodigy/__init__.py
@@ -6,3 +6,5 @@
from .reach_to_prodigy import ReachToProdigy, reach_to_prodigy
from .reference_to_token_annotations import TokenTagger, reference_to_token_annotations
from .spacy_doc_to_prodigy import SpacyDocToProdigy
from .misc import prodigy_to_conll
from .labels_to_prodigy import labels_to_prodigy
57 changes: 57 additions & 0 deletions deep_reference_parser/prodigy/labels_to_prodigy.py
@@ -0,0 +1,57 @@
def labels_to_prodigy(tokens, labels):
"""
Convert a list of tokens and labels like those used by Rodrigues et al.
into Prodigy-format dicts.
Args:
tokens (list): A list of tokens.
labels (list): A list of labels relating to `tokens`.
Returns:
A list of prodigy format dicts containing annotated data.
"""

prodigy_data = []

all_token_index = 0

for line_index, line in enumerate(tokens):
prodigy_example = {}

tokens = []
spans = []
token_start_offset = 0

for token_index, token in enumerate(line):

token_end_offset = token_start_offset + len(token)

tokens.append(
{
"text": token,
"id": token_index,
"start": token_start_offset,
"end": token_end_offset,
}
)

spans.append(
{
"label": labels[line_index][token_index : token_index + 1][0],
"start": token_start_offset,
"end": token_end_offset,
"token_start": token_index,
"token_end": token_index,
}
)

prodigy_example["text"] = " ".join(line)
prodigy_example["tokens"] = tokens
prodigy_example["spans"] = spans
prodigy_example["meta"] = {"line": line_index}

token_start_offset = token_end_offset + 1

prodigy_data.append(prodigy_example)

return prodigy_data
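A sketch on one made-up two-token line:

    examples = labels_to_prodigy([["WHO", "2016"]], [["b-r", "i-r"]])
    # examples[0]["text"]  -> "WHO 2016"
    # examples[0]["spans"] -> one span per token, carrying its label and
    #                         character offsets into the joined text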
38 changes: 38 additions & 0 deletions deep_reference_parser/prodigy/misc.py
@@ -0,0 +1,38 @@
import spacy


def _join_prodigy_tokens(text):
"""Return all prodigy tokens in a single string
"""

return "\n".join([str(i) for i in text])


def prodigy_to_conll(docs):
"""
Expects a list of dicts loaded from a jsonl file
"""

nlp = spacy.load("en_core_web_sm")
texts = [doc["text"] for doc in docs]
docs = list(nlp.tokenizer.pipe(texts))

out = [_join_prodigy_tokens(i) for i in docs]

out_str = "DOCSTART\n\n" + "\n\n".join(out)

return out_str


def prodigy_to_lists(docs):
"""
Expects a list of dicts loaded from a jsonl file
"""

nlp = spacy.load("en_core_web_sm")
texts = [doc["text"] for doc in docs]
docs = list(nlp.tokenizer.pipe(texts))

out = [[str(token) for token in doc] for doc in docs]

return out
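A sketch of both helpers on a one-document batch, assuming the en_core_web_sm model is installed:

    docs = [{"text": "WHO treatment guidelines, 2016"}]
    print(prodigy_to_conll(docs))   # DOCSTART, then one token per line
    print(prodigy_to_lists(docs))   # [['WHO', 'treatment', 'guidelines', ',', '2016']]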
4 changes: 1 addition & 3 deletions deep_reference_parser/prodigy/prodigy_to_tsv.py
@@ -20,7 +20,7 @@

msg = Printer()

ROWS_TO_PRINT=15
ROWS_TO_PRINT = 15


class TokenLabelPairs:
@@ -375,8 +375,6 @@ def prodigy_to_tsv(

with open(output_file, "w") as fb:
writer = csv.writer(fb, delimiter="\t")
# Write DOCSTART and a blank line
# writer.writerows([("DOCSTART", None), (None, None)])
writer.writerows(merged_pairs)

# Print out the first ROWS_TO_PRINT rows as a sense check