from __future__ import annotations
from typing import List, Optional, Union, Tuple
class SearchParameters:
"""An instance of this class holds a configuration for variant search."""
def __init__(self, **kwargs):
"""Weights to assign to various computations done in the :class:`VariantModel`.
Values that are not provided as keyword arguments will be set to their defaults.
Weights don't necessarily have to sum to one if you provide them all, it will be normalised later.
Keyword Arguments
-------------------
max_anagram_distance: Union[int,float,Tuple[float,int]]
Maximum anagram distance: the difference in characters, regardless of order.
Must be an integer expressing an absolute value, a float in the range 0-1 expressing a ratio, or a two-tuple (float, int) expressing a ratio with an absolute limit.
max_edit_distance: Union[int,float,Tuple[float,int]]
Maximum edit distance according to Damerau-Levenshtein; insertions, deletions, substitutions and transpositions all have the same cost (1). It is recommended to set this value slightly lower than the maximum anagram distance.
Must be an integer expressing an absolute value, a float in the range 0-1 expressing a ratio, or a two-tuple (float, int) expressing a ratio with an absolute limit.
max_matches: int
Number of matches to return per input; set to 0 for unlimited if you want to exhaustively return every possibility within the specified anagram and edit distance.
score_threshold: float
Require scores to meet this threshold; results scoring lower are pruned.
cutoff_threshold: float
Cut-off threshold: if the best score is this factor greater than a candidate's score, the ranking is cut off at that point and lower-scoring results are not included. A value around 2 is recommended.
stop_criterion: bool
Determines when to stop searching for matches. Enabling this can speed up the process at the cost of lower accuracy.
max_ngram: int
Maximum ngram order (1 for unigrams, 2 for bigrams, etc.).
lm_order: int
Maximum ngram order for language models (2 for bigrams, etc.).
max_seq: int
Maximum number of candidate sequences to take along to the language modelling stage
single_thread: bool
Use only a single thread instead of leveraging multiple cores (lowers resource use and performance).
context_weight: float
Weight attributed to the language model in relation to the variant model (e.g. 2.0 = twice
as much weight) when considering input context and rescoring.
variantmodel_weight: float
Weight attributed to the variant model in finding the most likely sequence
lm_weight: float
Weight attributed to the language model in finding the most likely sequence
contextrules_weight: float
Weight attributed to the context rules model in finding the most likely sequence
freq_weight: float
Weight attributed to the frequency information in frequency reranking, in relation to the similarity component (0 = disabled).
consolidate_matches: bool
Consolidate matches and extract a single most likely sequence; if set to False, all possible matches (including overlapping ones) are returned.
unicodeoffsets: bool
Output text offsets in Unicode points rather than UTF-8 byte offsets.
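
Example
-------
A minimal sketch; the values shown are illustrative, not recommended defaults::

    params = SearchParameters(max_edit_distance=2, max_ngram=3, max_matches=10)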
"""
def get_max_anagram_distance(self) -> Union[int,float,Tuple[float,int]]:
"""
Returns the maximum anagram distance: the difference in characters, regardless of order.
Either an integer expressing an absolute value, a float in the range 0-1 expressing a ratio, or a two-tuple (float, int) expressing a ratio with an absolute limit.
"""
def get_edit_distance(self) -> Union[int,float,Tuple[float,int]]:
"""
Returns the maximum edit distance according to Damerau-Levenshtein; insertions, deletions, substitutions and transpositions all have the same cost (1). It is recommended to set this value slightly lower than the maximum anagram distance.
Either an integer expressing an absolute value, a float in the range 0-1 expressing a ratio, or a two-tuple (float, int) expressing a ratio with an absolute limit.
"""
def get_max_matches(self) -> int:
"""Returns number of matches to return per input (set to 0 for unlimited if you want to exhaustively return every possibility within the specified anagram and edit distance)"""
def get_score_threshold(self) -> float:
"""Require scores to meet this threshold, they are pruned otherwise"""
def get_cutoff_threshold(self) -> float:
"""Cut-off threshold: if a score in the ranking is a specific factor greater than the best score, the ranking will be cut-off at that point and the score not included. Should be set to a value like 2."""
def get_stop_criterion(self) -> bool:
"""Determines when to stop searching for matches. Enabling this can speed up the process at the
cost of lower accuracy"""
def get_max_ngram(self) -> int:
"""Maximum ngram order (1 for unigrams, 2 for bigrams, etc..)."""
def get_lm_order(self) -> int:
"""Maximum ngram order for Language Models (2 for bigrams, etc..)."""
def get_max_seq(self) -> int:
"""Maximum number of candidate sequences to take along to the language modelling stage"""
def get_single_thread(self) -> bool:
"""Use only a single-thread instead of leveraging multiple cores (lowers resource use and
performance)"""
def get_context_weight(self) -> float:
"""Weight attributed to the language model in relation to the variant model (e.g. 2.0 = twice
as much weight) when considering input context and rescoring."""
def get_variantmodel_weight(self) -> float:
"""Weight attributed to the variant model in finding the most likely sequence"""
def get_lm_weight(self) -> float:
"""Weight attributed to the language model in finding the most likely sequence"""
def get_contextrules_weight(self) -> float:
"""Weight attributed to the context rules model in finding the most likely sequence"""
def get_freq_weight(self) -> float:
"""Weight attributed to the frequency information in frequency reranking, in relation to
the similarity component. 0 = disabled)"""
def get_consolidate_matches(self) -> bool:
"""Consolidate matches and extract a single most likely sequence, if set
to false, all possible matches (including overlapping ones) are returned."""
def get_unicodeoffsets(self) -> bool:
"""Output text offsets in unicode points rather than UTF-8 byte offsets"""
def to_dict(self) -> dict:
"""Returns all parameters in a dictionary"""
class VocabParams:
"""Configuration passed when loading vocabularies (lexicons, frequency lists) etc"""
def __init__(self, **kwargs):
"""Configuration passed when loading vocabularies (lexicons, frequency lists) etc.
Keyword Arguments
--------------------
text_column: int
Column containing the Text (if any, 0-indexed)
freq_column: int
Column containing the frequency (if any, 0-indexed)
freq_handling: str
Frequency handling in case of duplicate items (may be across multiple lexicons), can be "sum","max","min","replace"
vocabtype: str
"NONE", "INDEXED", "TRANSPARENT" or "LM"
"""
class Weights:
"""Holds the weights for the :class:`VariantModel`"""
def __init__(self, **kwargs):
"""Weights to assign to various computations done in the :class:`VariantModel`.
Values that are not provided as keyword arguments will be set to their defaults.
Weights don't necessarily have to sum to one; they will be normalised later.
Keyword Arguments
-------------------
ld: float
Weight for the Levenshtein (or Damerau-Levenshtein) distance
lcs: float
Weight for the Longest common substring length
prefix: float
Weight for the prefix length
suffix: float
Weight for the suffix length
case: float
Weight to assign to difference in case (lowercase/uppercase)
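
Example
-------
A minimal sketch; the values are illustrative and will be normalised::

    weights = Weights(ld=1.0, lcs=1.0, prefix=0.25, suffix=0.25, case=0.1)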
"""
def get_ld(self) -> float:
"""Returns the weight for the Levenshtein (or Damarau-Levenshtein) distance"""
def get_lcs(self) -> float:
"""Returns the weight for the Longest common substring length"""
def get_prefix(self) -> float:
"""Returns the weight for the prefix length"""
def get_suffix(self) -> float:
"""Returns the weight for the suffix length"""
def get_case(self) -> float:
"""Returns the weight for the case differences"""
def set_ld(self, value:float):
"""Sets the weight for the Levenshtein (or Damarau-Levenshtein) distance"""
def set_lcs(self, value: float):
"""Sets the weight for the Longest common substring length"""
def set_prefix(self, value: float):
"""Sets the weight for the prefix length"""
def set_suffix(self, value: float):
"""Sets the weight for the suffix length"""
def set_case(self, value: float):
"""Sets the weight for the case differences"""
def to_dict(self) -> dict:
"""Returns all weights as a dictionary"""
class VariantModel:
"""The VariantModel is the most high-level model of analiticcl, it holds all data required for variant matching."""
def __init__(self, alphabet_file: str, weights: Weights, debug: int = 0):
"""Instantiate a new variant model
Parameters
--------------
alphabet_file: str
Path to the alphabet file to load for this model
weights: Weights
Weights for the model
debug: int
Debug level
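
Example
-------
A minimal sketch; the alphabet file path is illustrative::

    model = VariantModel("alphabet.tsv", Weights(), debug=0)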
"""
def build(self):
"""
Build the anagram index (and secondary index) so the model
is ready for variant matching
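
Example
-------
A typical sequence (the file name is illustrative): load data first, then build so the model is ready for matching::

    model.read_lexicon("lexicon.tsv")
    model.build()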
"""
def add_to_vocabulary(self, text: str, frequency: Optional[int], params: VocabParams):
"""
Add an item to the vocabulary. This is a lower-level interface.
"""
def read_vocabulary(self, filename: str, params: VocabParams):
"""
Load a vocabulary (a lexicon or corpus-derived lexicon) from a TSV file.
It may contain frequency information. This is a lower-level interface.
The parameters define which value is read from which column.
"""
def add_contextrule(self, pattern: str, score: float, tag: List[str], tagoffset: List[str]):
"""Add a single context rule, consisting of a pattern, a score, and optional tags with tag offsets. This is a lower-level interface; see also read_contextrules()."""
def read_lexicon(self, filename: str):
"""
High-level convenience function that loads a lexicon and makes it available to the model.
Wraps around read_vocabulary() with default parameters.
"""
def read_lm(self, filename: str):
"""
High-level convenience function that loads a language model and makes it available to the model.
Wraps around read_vocabulary() with default parameters.
"""
def read_variants(self, filename: str):
"""
Load a weighted variant list. Set transparent to true if this is an error list and you
don't want the variants themselves to be returned when matching (i.e. they are transparent).
"""
def read_confusiblelist(self, filename: str):
"""
Load a confusable list
"""
def read_contextrules(self, filename: str):
"""
Load context rules from a TSV file
"""
def __contains__(self, text: str):
"""Is this exact text in a loaded lexicon?"""
def find_variants(self, input: str, params: SearchParameters) -> List[dict]:
"""Find variants in the vocabulary for a given string (in its totality), returns a list of variants with scores and their source lexicons"""
def find_variants_par(self, input: List[str], params: SearchParameters) -> List[dict]:
"""Find variants in the vocabulary for all multiple string items at once, provided in in the input list. Returns a list of variants with scores and their source lexicons. Will use parallellisation under the hood."""
def find_all_matches(self, text: str, params: SearchParameters) -> List[dict]:
"""Searches a text and returns all highest-ranking variants found in the text"""
def set_confusables_before_pruning(self):
"""
Configure the model to match against known confusables prior to pruning on maximum weight.
This corresponds to the `--early-confusables` option for the CLI version
"""