forked from BramVanroy/spacy_conll
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
124 lines (101 loc) · 4.6 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from typing import Dict, List, Optional, Union
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.vocab import Vocab
try:
import pandas as pd # noqa: F401
PD_AVAILABLE = True
except ImportError:
PD_AVAILABLE = False
try:
import spacy_stanza # noqa: F401
STANZA_AVAILABLE = True
except ImportError:
STANZA_AVAILABLE = False
try:
import spacy_udpipe # noqa: F401
UDPIPE_AVAILABLE = True
except ImportError:
UDPIPE_AVAILABLE = False
def init_parser(
model_or_lang: str,
parser: str,
*,
is_tokenized: bool = False,
disable_sbd: bool = False,
parser_opts: Optional[Dict] = None,
**kwargs,
) -> Language:
"""Initialise a spacy-wrapped parser given a language or model and some options.
:param model_or_lang: language model to use (must be installed for spaCy but will be automatically downloaded for
stanza and UDPipe)
:param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options are
'spacy', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need to be
installed, e.g. spacy-stanza
:param is_tokenized: indicates whether your text has already been tokenized (space-seperated). When using 'spacy',
this option also disabled sentence segmentation completely. For stanza, sentence segmentation will *only*
to be done by splitting on new lines. See the documentation:
https://stanfordnlp.github.io/stanza/tokenize.html. This optioon does not affect UDPipe.
:param disable_sbd: disables spaCy automatic sentence boundary detection (only works for spaCy)
:param parser_opts: will be passed to the core pipeline. For spacy, it will be passed to its
`.load()` initialisations, for stanza `pipeline_opts` is passed to its `.load_pipeline()`
initialisations. UDPipe does not have any keyword arguments
:param kwargs: options to be passed to the ConllFormatter initialisation
:return: an initialised Language object; the parser
"""
parser_opts = {} if parser_opts is None else parser_opts
if parser == "spacy":
exclude = ["senter", "sentencizer"] if disable_sbd or is_tokenized else []
nlp = spacy.load(model_or_lang, exclude=exclude, **parser_opts)
if is_tokenized:
nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
if disable_sbd or is_tokenized:
nlp.add_pipe("disable_sbd", before="parser")
elif parser == "stanza":
import spacy_stanza # noqa: F811
import stanza
verbose = parser_opts.pop("verbose", False)
stanza.download(model_or_lang, verbose=verbose)
nlp = spacy_stanza.load_pipeline(
model_or_lang, verbose=verbose, tokenize_pretokenized=is_tokenized, **parser_opts
)
elif parser == "udpipe":
import spacy_udpipe # noqa: F811
spacy_udpipe.download(model_or_lang)
nlp = spacy_udpipe.load(model_or_lang)
else:
raise ValueError("Unexpected value for 'parser'. Options are: 'spacy', 'stanza', 'udpipe'")
nlp.add_pipe("conll_formatter", config=kwargs, last=True)
return nlp
class SpacyPretokenizedTokenizer:
"""Custom tokenizer to be used in spaCy when the text is already pretokenized."""
def __init__(self, vocab: Vocab):
"""Initialize tokenizer with a given vocab
:param vocab: an existing vocabulary (see https://spacy.io/api/vocab)
"""
self.vocab = vocab
def __call__(self, inp: Union[List[str], str]) -> Doc:
"""Call the tokenizer on input `inp`.
:param inp: either a string to be split on whitespace, or a list of tokens
:return: the created Doc object
"""
if isinstance(inp, str):
words = inp.split()
spaces = [True] * (len(words) - 1) + ([True] if inp[-1].isspace() else [False])
return Doc(self.vocab, words=words, spaces=spaces)
elif isinstance(inp, list):
return Doc(self.vocab, words=inp)
else:
raise ValueError("Unexpected input format. Expected string to be split on whitespace, or list of tokens.")
@Language.factory("disable_sbd")
class SpacyDisableSentenceSegmentation:
"""Disables spaCy's dependency-based sentence boundary detection. In addition, senter and sentencizer components
need to be disabled as well."""
def __init__(self, nlp: Language, name: str):
self.nlp = nlp
self.name = name
def __call__(self, doc: Doc) -> Doc:
for token in doc:
token.is_sent_start = False
return doc