-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #19 from wellcometrust/reorganise
Reorganise sub-module structure
- Loading branch information
Showing
15 changed files
with
504 additions
and
622 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
from .io import read_jsonl, write_jsonl | ||
from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle, | ||
write_to_csv, write_tsv) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
def labels_to_prodigy(tokens, labels):
    """
    Convert parallel lists of tokens and labels, like those used by
    Rodrigues et al, into prodigy format dicts.

    Each inner list of `tokens` becomes one prodigy example whose "text" is
    the tokens joined by single spaces. Every token produces one entry in
    "tokens" and one single-token entry in "spans"; character offsets are
    computed against that joined text.

    Args:
        tokens (list): A list of lists of token strings, one inner list per
            line/example.
        labels (list): A list of lists of labels relating to `tokens`;
            `labels[i][j]` is the label of `tokens[i][j]`.

    Returns:
        A list of prodigy format dicts containing annotated data, one per
        inner list of `tokens`. An empty inner list yields an example with
        an empty "text", "tokens" and "spans".

    Raises:
        IndexError: If `labels` (given as nested lists) has no entry for a
            token present in `tokens`.
    """

    prodigy_data = []

    for line_index, line in enumerate(tokens):
        # Named `line_tokens` so the `tokens` parameter is not shadowed.
        line_tokens = []
        spans = []
        token_start_offset = 0

        for token_index, token in enumerate(line):
            token_end_offset = token_start_offset + len(token)

            line_tokens.append(
                {
                    "text": token,
                    "id": token_index,
                    "start": token_start_offset,
                    "end": token_end_offset,
                }
            )

            # Every token gets its own span, so the span starts and ends on
            # the same token index.
            spans.append(
                {
                    "label": labels[line_index][token_index],
                    "start": token_start_offset,
                    "end": token_end_offset,
                    "token_start": token_index,
                    "token_end": token_index,
                }
            )

            # +1 skips the single space that joins consecutive tokens in
            # the example text.
            token_start_offset = token_end_offset + 1

        # Built once per line (not once per token) so that lines without
        # tokens still produce a complete example.
        prodigy_data.append(
            {
                "text": " ".join(line),
                "tokens": line_tokens,
                "spans": spans,
                "meta": {"line": line_index},
            }
        )

    return prodigy_data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import spacy | ||
|
||
|
||
def _join_prodigy_tokens(text): | ||
"""Return all prodigy tokens in a single string | ||
""" | ||
|
||
return "\n".join([str(i) for i in text]) | ||
|
||
|
||
def prodigy_to_conll(docs):
    """
    Render prodigy documents as a single token-per-line string.

    Expects a list of jsons loaded from a jsonl. Only the "text" key of each
    document is read; the text is re-tokenised with the tokenizer of spaCy's
    "en_core_web_sm" model.

    Args:
        docs (list): A list of dicts, each with a "text" key.

    Returns:
        A string starting with "DOCSTART", followed by one block per
        document (one token per line), with blocks separated by a blank line.
    """

    tokenizer = spacy.load("en_core_web_sm").tokenizer
    raw_texts = [entry["text"] for entry in docs]

    blocks = []
    for tokenised in tokenizer.pipe(raw_texts):
        blocks.append(_join_prodigy_tokens(tokenised))

    body = "\n\n".join(blocks)

    return "DOCSTART\n\n" + body
|
||
|
||
def prodigy_to_lists(docs):
    """
    Convert prodigy documents into lists of token strings.

    Expects a list of jsons loaded from a jsonl. Only the "text" key of each
    document is read; the text is re-tokenised with the tokenizer of spaCy's
    "en_core_web_sm" model.

    Args:
        docs (list): A list of dicts, each with a "text" key.

    Returns:
        A list with one inner list per document, holding that document's
        tokens as strings.
    """

    tokenizer = spacy.load("en_core_web_sm").tokenizer
    raw_texts = [entry["text"] for entry in docs]

    token_lists = []
    for tokenised in tokenizer.pipe(raw_texts):
        token_lists.append([str(piece) for piece in tokenised])

    return token_lists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.