Skip to content

Commit

Permalink
Merge pull request #17 from wellcometrust/feature/ivyleavedtoadflax/refactor_load_tsv
Browse files Browse the repository at this point in the history

Refactor load_tsv to cover multitask case
  • Loading branch information
ivyleavedtoadflax authored Mar 18, 2020
2 parents 085f0fb + 87c098c commit d8ec4df
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 132 deletions.
1 change: 0 additions & 1 deletion deep_reference_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from .reference_utils import (
break_into_chunks,
labels_to_prodigy,
load_data,
load_tsv,
prodigy_to_conll,
prodigy_to_lists,
Expand Down
171 changes: 42 additions & 129 deletions deep_reference_parser/reference_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,161 +8,74 @@
import json
import os
import pickle
import pandas as pd

import spacy

from .logger import logger


def load_data(filepath):
def split_list_by_linebreaks(tokens):
    """Split a flat list of tokens (or labels) into per-example lists.

    Cycle through ``tokens`` and group consecutive non-empty strings,
    starting a new group whenever a separator is seen. A separator is
    anything that is not a non-empty string: None, the empty string, or
    more likely ``math.nan`` caused by converting blank pd.DataFrame
    rows to lists.

    Args:
        tokens: Iterable of tokens/labels interspersed with separators.

    Yields:
        list: The next group of consecutive non-empty string tokens.
            NOTE: a separator always flushes the current group, so a
            leading or doubled separator yields an empty list; only a
            trailing separator is suppressed.
    """
    out = []
    tokens_gen = iter(tokens)
    while True:
        try:
            token = next(tokens_gen)
            if isinstance(token, str) and token:
                out.append(token)
            else:
                # Separator: flush the current group (possibly empty).
                yield out
                out = []
        except StopIteration:
            # Flush a final group only if it is non-empty, so a file
            # ending without a blank line still yields its last example.
            if out:
                yield out
            break

def load_tsv(filepath, split_char="\t"):
    """
    Load and return the data stored in the given path.

    Adapted from: https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing

    Expects data in the following format (tab separated):

    References	o	o

    1	o	o
    .	o	o

    WHO	title	b-r
    treatment	title	i-r
    guidelines	title	i-r
    for	title	i-r
    drug	title	i-r
    -	title	i-r
    resistant	title	i-r
    tuberculosis	title	i-r
    ,	title	i-r
    2016	title	i-r

    The data is structured as follows:
    * Each line contains a token in the first column and any number of
      label columns after it, separated by ``split_char``.
    * There is an empty line after each example (e.g. each reference).

    Args:
        filepath (str): Path to the data.
        split_char (str): Character to be used to split each line of the
            document.

    Returns:
        tuple: One list per column found in filepath — the first
        contains token sequences, the rest the corresponding label
        sequences.
    """
    # skip_blank_lines=False keeps the blank separator lines as NaN rows
    # so split_list_by_linebreaks can split each column back into
    # per-example lists.
    df = pd.read_csv(filepath, delimiter=split_char, header=None,
                     skip_blank_lines=False)

    # NOTE: df.items() replaces df.iteritems(), which is deprecated and
    # was removed in pandas 2.0.
    out = [list(split_list_by_linebreaks(column)) for _, column in df.items()]

    logger.info("Loaded %s training examples", len(out[0]))

    return tuple(out)


def prodigy_to_conll(docs):
Expand Down
1 change: 1 addition & 0 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ def get_path(p):
TEST_REFERENCES = get_path("test_data/test_references.txt")
TEST_TSV_PREDICT = get_path("test_data/test_tsv_predict.tsv")
TEST_TSV_TRAIN = get_path("test_data/test_tsv_train.tsv")
TEST_LOAD_TSV = get_path("test_data/test_load_tsv.tsv")
18 changes: 18 additions & 0 deletions tests/test_data/test_load_tsv.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
the i-r a
focus i-r a
in i-r a
Daloa i-r a
, i-r a
Côte i-r a
d’Ivoire]. i-r a

Bulletin i-r a
de i-r a
la i-r a
Société i-r a
de i-r a
Pathologie i-r a

Exotique i-r a
et i-r a

80 changes: 78 additions & 2 deletions tests/test_reference_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
prodigy_to_conll,
write_tsv,
yield_token_label_pairs,
split_list_by_linebreaks,
)

from .common import TEST_TSV_PREDICT, TEST_TSV_TRAIN
from .common import TEST_TSV_PREDICT, TEST_TSV_TRAIN, TEST_LOAD_TSV


def test_prodigy_to_conll():
Expand Down Expand Up @@ -75,6 +76,14 @@ def test_load_tsv_train():

actual = load_tsv(TEST_TSV_TRAIN)

assert len(actual[0][0]) == len(expected[0][0])
assert len(actual[0][1]) == len(expected[0][1])
assert len(actual[0][2]) == len(expected[0][2])

assert len(actual[1][0]) == len(expected[1][0])
assert len(actual[1][1]) == len(expected[1][1])
assert len(actual[1][2]) == len(expected[1][2])

assert actual == expected


Expand Down Expand Up @@ -109,13 +118,59 @@ def test_load_tsv_predict():
["Bulletin", "de", "la", "Société", "de", "Pathologie"],
["Exotique", "et"],
],
[[], [], [],],
)

actual = load_tsv(TEST_TSV_PREDICT)

assert actual == expected

def test_load_tsv_train_multiple_labels():
    """
    Loads TEST_LOAD_TSV, which has a token column plus two label
    columns, with blank lines separating the three references:
    ```
    the	i-r	a
    focus	i-r	a
    in	i-r	a
    Daloa	i-r	a
    ,	i-r	a
    Côte	i-r	a
    d’Ivoire].	i-r	a

    Bulletin	i-r	a
    de	i-r	a
    la	i-r	a
    Société	i-r	a
    de	i-r	a
    Pathologie	i-r	a

    Exotique	i-r	a
    et	i-r	a
    ```
    """

    expected = (
        [
            ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."],
            ["Bulletin", "de", "la", "Société", "de", "Pathologie"],
            ["Exotique", "et"],
        ],
        [
            ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"],
            ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"],
            ["i-r", "i-r"],
        ],
        [
            ["a", "a", "a", "a", "a", "a", "a"],
            ["a", "a", "a", "a", "a", "a"],
            ["a", "a"],
        ],
    )

    actual = load_tsv(TEST_LOAD_TSV)

    assert actual == expected


def test_yield_toke_label_pairs():

Expand Down Expand Up @@ -197,3 +252,24 @@ def test_break_into_chunks():
actual = break_into_chunks(before, max_words=2)

assert expected == actual

def test_split_list_by_linebreaks():
    """A None separator splits the tokens into two groups."""

    lst = ["a", "b", "c", None, "d"]
    expected = [["a", "b", "c"], ["d"]]

    # split_list_by_linebreaks is a generator, so materialise it before
    # comparing. The original test computed actual but never asserted.
    actual = list(split_list_by_linebreaks(lst))

    assert actual == expected

def test_list_by_linebreaks_ending_in_None():
    """NaN splits mid-list; a trailing None flushes the final group
    without producing an empty trailing group."""

    lst = ["a", "b", "c", float("nan"), "d", None]
    expected = [["a", "b", "c"], ["d"]]

    # Materialise the generator; the original test had no assertion.
    actual = list(split_list_by_linebreaks(lst))

    assert actual == expected

def test_list_by_linebreaks_starting_in_None():
    """A leading None flushes the (still empty) current group, so the
    first yielded group is [].

    NOTE(review): the original expected value omitted the leading []
    and had no assertion, so the mismatch with the implementation went
    unnoticed — confirm whether leading empty groups are intended.
    """

    lst = [None, "a", "b", "c", None, "d"]
    expected = [[], ["a", "b", "c"], ["d"]]

    actual = list(split_list_by_linebreaks(lst))

    assert actual == expected

0 comments on commit d8ec4df

Please sign in to comment.