forked from eellak/gsoc2018-spacy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconftest.py
220 lines (150 loc) · 5.55 KB
/
conftest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# coding: utf-8
from __future__ import unicode_literals
from io import StringIO, BytesIO
from pathlib import Path
import pytest
from .util import load_test_model
from ..tokens import Doc
from ..strings import StringStore
from .. import util
# These languages are used for generic tokenizer tests – only add a language
# here if it's using spaCy's tokenizer (not a different library)
# TODO: re-implement generic tokenizer tests
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'tr', 'ar', 'ut', 'tt',
'xx']
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_sm'],
'fr': ['fr_core_news_sm'],
'xx': ['xx_ent_web_sm'],
'en_core_web_md': ['en_core_web_md'],
'es_core_news_md': ['es_core_news_md']}
# only used for tests that require loading the models
# in all other cases, use specific instances
@pytest.fixture(params=_models['en'])
def EN(request):
return load_test_model(request.param)
@pytest.fixture(params=_models['de'])
def DE(request):
return load_test_model(request.param)
@pytest.fixture(params=_models['fr'])
def FR(request):
return load_test_model(request.param)
@pytest.fixture()
def RU(request):
pymorphy = pytest.importorskip('pymorphy2')
return util.get_lang_class('ru')()
#@pytest.fixture(params=_languages)
#def tokenizer(request):
#lang = util.get_lang_class(request.param)
#return lang.Defaults.create_tokenizer()
@pytest.fixture
def tokenizer():
return util.get_lang_class('xx').Defaults.create_tokenizer()
@pytest.fixture
def en_tokenizer():
return util.get_lang_class('en').Defaults.create_tokenizer()
@pytest.fixture
def en_vocab():
return util.get_lang_class('en').Defaults.create_vocab()
@pytest.fixture
def en_parser(en_vocab):
nlp = util.get_lang_class('en')(en_vocab)
return nlp.create_pipe('parser')
@pytest.fixture
def es_tokenizer():
return util.get_lang_class('es').Defaults.create_tokenizer()
@pytest.fixture
def de_tokenizer():
return util.get_lang_class('de').Defaults.create_tokenizer()
@pytest.fixture
def fr_tokenizer():
return util.get_lang_class('fr').Defaults.create_tokenizer()
@pytest.fixture
def hu_tokenizer():
return util.get_lang_class('hu').Defaults.create_tokenizer()
@pytest.fixture
def fi_tokenizer():
return util.get_lang_class('fi').Defaults.create_tokenizer()
@pytest.fixture
def ro_tokenizer():
return util.get_lang_class('ro').Defaults.create_tokenizer()
@pytest.fixture
def id_tokenizer():
return util.get_lang_class('id').Defaults.create_tokenizer()
@pytest.fixture
def sv_tokenizer():
return util.get_lang_class('sv').Defaults.create_tokenizer()
@pytest.fixture
def bn_tokenizer():
return util.get_lang_class('bn').Defaults.create_tokenizer()
@pytest.fixture
def ga_tokenizer():
return util.get_lang_class('ga').Defaults.create_tokenizer()
@pytest.fixture
def he_tokenizer():
return util.get_lang_class('he').Defaults.create_tokenizer()
@pytest.fixture
def nb_tokenizer():
return util.get_lang_class('nb').Defaults.create_tokenizer()
@pytest.fixture
def da_tokenizer():
return util.get_lang_class('da').Defaults.create_tokenizer()
@pytest.fixture
def ja_tokenizer():
janome = pytest.importorskip("MeCab")
return util.get_lang_class('ja').Defaults.create_tokenizer()
@pytest.fixture
def th_tokenizer():
pythainlp = pytest.importorskip("pythainlp")
return util.get_lang_class('th').Defaults.create_tokenizer()
@pytest.fixture
def tr_tokenizer():
return util.get_lang_class('tr').Defaults.create_tokenizer()
@pytest.fixture
def tt_tokenizer():
return util.get_lang_class('tt').Defaults.create_tokenizer()
@pytest.fixture
def ar_tokenizer():
return util.get_lang_class('ar').Defaults.create_tokenizer()
@pytest.fixture
def ur_tokenizer():
return util.get_lang_class('ur').Defaults.create_tokenizer()
@pytest.fixture
def ru_tokenizer():
pymorphy = pytest.importorskip('pymorphy2')
return util.get_lang_class('ru').Defaults.create_tokenizer()
@pytest.fixture
def stringstore():
return StringStore()
@pytest.fixture
def en_entityrecognizer():
return util.get_lang_class('en').Defaults.create_entity()
@pytest.fixture
def text_file():
return StringIO()
@pytest.fixture
def text_file_b():
return BytesIO()
def pytest_addoption(parser):
parser.addoption("--models", action="store_true",
help="include tests that require full models")
parser.addoption("--vectors", action="store_true",
help="include word vectors tests")
parser.addoption("--slow", action="store_true",
help="include slow tests")
for lang in _languages + ['all']:
parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang)
for model in _models:
if model not in _languages:
parser.addoption("--%s" % model, action="store_true", help="Use %s model" % model)
def pytest_runtest_setup(item):
for opt in ['models', 'vectors', 'slow']:
if opt in item.keywords and not item.config.getoption("--%s" % opt):
pytest.skip("need --%s option to run" % opt)
# Check if test is marked with models and has arguments set, i.e. specific
# language. If so, skip test if flag not set.
if item.get_marker('models'):
for arg in item.get_marker('models').args:
if not item.config.getoption("--%s" % arg) and not item.config.getoption("--all"):
pytest.skip("need --%s or --all option to run" % arg)