forked from winkjs/wink-nlp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbm25-vectorizer.js
406 lines (367 loc) · 15 KB
/
bm25-vectorizer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-nlp”.
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
var allowed = require( './allowed.js' );
var its = require( '../src/its.js' );
var helper = require( '../src/helper.js' );
// Norm computation helper connstants and functions.
const L2 = 'l2';
const NONE = 'none';
const normFn = Object.create( null );
/**
* Computes absolute value of `v` for l1 norm.
*
* @param {number} v values whose abs is computed.
* @return {number} absolute value of v.
*/
normFn.l1 = ( ( v ) => ( Math.abs( v ) ) );
/**
* Computes square of `v` for l2 norm.
*
* @param {number} v values whose square is computed.
* @return {number} square of v.
*/
normFn.l2 = ( ( v ) => ( v * v ) );
/**
* Returns 0, used when norm is none.
*
* @return {number} 0 value.
*/
normFn.none = ( () => ( 0 ) );
/**
* Validates the input config's numeric parameter, if in valid the returns
* the default value.
* @param {Number} num input number to be validated.
* @param {Number} numDefault default value that is used if the input num is invalid.
* @param {Number} min used for testing num < min.
* @param {[type]} max used for testing num > max.
* @return {[type]} input number of default value.
*/
const getValidCfgNum = function ( num, numDefault, min, max ) {
return (
( num === null ) ||
( num === undefined ) ||
( typeof num !== 'number' ) ||
( num < min || num > max )
) ? numDefault : num;
}; // getValidCfgNum()
// # bm25Vectorizer
/**
* Creates an instance of BM25-based vectorizer using the `config`.
*
* @param {object} config defines values of various BM25 configuration params —
* k, k1, b and normalization scheme — none, l1, or l2.
* @return {undefined} Nothing!
*/
var bm25Vectorizer = function ( config ) {
const cfg = ( helper.isObject( config ) ) ? config : Object.create( null );
// Setup BM25 Parameters.
const k = getValidCfgNum( cfg.k, 1, 0, 100 );
const k1 = getValidCfgNum( cfg.k1, 1.2, 0, 100 );
const b = getValidCfgNum( cfg.b, 0.75, 0, 1 );
// Setup precision.
const precision = getValidCfgNum( cfg.precision, 6, 1, 12 );
// Setup norm.
const norm = (
( cfg.norm === null ) ||
( cfg.norm === undefined ) ||
( normFn[ cfg.norm ] === undefined )
) ? NONE : cfg.norm;
// Document Id; it is incremented at the end of each `learn()` call.
var docId = 0;
// Term frequencies. The raw `tf` builds up here during learning. Subsequently
// the same is updated using BM25 formula during wieght computation.
var tf = [];
// Length of each document in terms of number of tokens.
var docLength = [];
// Sum of all document's length, used for average DL computation.
var sumOfAllDLs = 0;
// Inverse Document Frequency. The raw `idf` builds up here during learning.
// Subsequently the same is updated using BM25 formula during wieght computation.
var idf = Object.create( null );
// Norms value for each document.
var norms = [];
// Terms or features — typically to be used alongwith vector or doc term matrix
// by the developer.
var terms = [];
// Useful in avoiding re-computation of final weights.
var weightsComputed = false;
// Returned!
var methods = Object.create( null );
// ## computeWeights
/**
* Computes & updates the TF and the IDF as per the BM25 formulae.
*
* @return {undefined} Nothing!
*/
var computeWeights = function () {
// If weights have been computed, then re-computation is not allowed.
if ( weightsComputed ) return;
if ( docId === 0 ) throw Error( 'wink-nlp: this operation doesn\'t make sense without any learning; use learn() API first.' );
// Set the average document length used for normalization.
const avgDL = sumOfAllDLs / docId;
// Compute IDF.
for ( const t in idf ) { // eslint-disable-line guard-for-in
// Equation 3.3 in "Probabilistic Relevance Framework BM25 and Beyond" by
// Stephen Robertson and Hugo Zaragoza
idf[ t ] = +Math.log( ( ( docId - idf[ t ] + 0.5 ) / ( idf[ t ] + 0.5 ) ) + k ).toFixed( precision );
}
// Compute the TF for every documet.
for ( let id = 0; id < docId; id += 1 ) {
for ( const t in tf[ id ] ) { // eslint-disable-line guard-for-in
// Equation 3.15 with modification described in section on "Some Variations on BM25"
// (3.5.1) of "Probabilistic Relevance Framework BM25 and Beyond" by
// Stephen Robertson and Hugo Zaragoza
// IDF * ((k + 1) * tf) / (k * (1.0 - b + b * (|d|/avgDl)) + tf)
tf[ id ][ t ] = idf[ t ] * ( ( k1 + 1 ) * tf[ id ][ t ] ) / ( ( k1 * ( 1 - b + ( b * ( docLength[ id ] / avgDL ) ) ) ) + tf[ id ][ t ] );
// Compute the norm incrementally; will eventually use it as the divisor
// at the time of final computation in the next loop.
norms[ id ] += normFn[ norm ]( tf[ id ][ t ] );
}
// Compute norm, if none then we set norms[ id ] to 1 and divide by 1 has
// no effect!
if ( norm === L2 ) {
norms[ id ] = Math.sqrt( norms[ id ] );
} else if ( norm === NONE ) norms[ id ] = 1;
for ( const t in tf[ id ] ) { // eslint-disable-line guard-for-in
tf[ id ][ t ] = +( tf[ id ][ t ] / norms[ id ] ).toFixed( precision );
}
}
// Extract terms sorted alphabetically — vectors & matrix follow this order.
terms = Object.keys( idf ).sort();
// Set weightsComputed.
weightsComputed = true;
}; // computeWeights()
// ## learn
/**
*
* Builds raw TF and IDF from the tokenized input document. Throws error if
* call to out has been made — because out updates the raw tf-idf values as
* per the BM25 formulae.
*
* @param {string[]} tokens tokenized document, usually obtained via winkNLP.
* @return {undefined} nothing.
*/
methods.learn = function ( tokens ) {
if ( weightsComputed ) throw Error( 'wink-nlp: learn can not be used after a call to out() API in BM25 Vectorizer' );
// It will contain the TF i.e.bag-of-words for this document — `docId`.
const bow = Object.create( null );
// Set the length of this document.
docLength[ docId ] = tokens.length;
// Build TF and keep updating the IDF.
for ( let i = 0; i < tokens.length; i += 1 ) { // eslint-disable-line guard-for-in
const t = tokens[ i ];
if ( bow[ t ] === undefined ) {
bow[ t ] = 1;
// Increment doc count for this token; note it will happen only once whenever
// we encounter a token unnseen so far.
idf[ t ] = 1 + ( idf[ t ] || 0 );
} else {
// Token has been seen, simply increment.
bow[ t ] += 1;
}
}
// Save this TF at `docId` location.
tf.push( bow );
norms.push( 0 );
// Update sum, will be used to compute the average DL later.
sumOfAllDLs += docLength[ docId ];
// Get ready to process the next document.
docId += 1;
}; // learn()
// ## out
/**
* Produces out put according to the input function. Operates at vecorizer level.
*
* @param {function} f a function that determins the output format/content.
* @return {array} array containing either strings, objects or arrays.
*/
methods.out = function ( f ) {
computeWeights();
// Pass `docId` & `sumOfAllDLs` in additionn to `tf`, `idf` & `terms`; this
// is needed while saving the model JSON.
if ( allowed.its4BM25.has( f ) ) return f( tf, idf, terms, docId, sumOfAllDLs );
// In case of innvalid `f`, fall back to the default method — `docBOWArray`.
return its.docBOWArray( tf, idf, terms, docId, sumOfAllDLs );
}; // out()
// ## doc
/**
* Returns the APIs applicable to the document specified by its index.
*
* @param {numebr} n index of the document.
* @return {object} object containing `out()` and `length()` methods.
*/
methods.doc = function ( n ) {
const api = Object.create( null );
// ## doc.out
/**
* Produces out put according to the input function. Operates at the document
* level.
*
* @param {function} f a function that determins the output format/content.
* @return {array} array containing either strings, objects or arrays.
*/
api.out = function ( f ) {
computeWeights();
if ( f === its.bow ) return f( tf[ n ] );
if ( f === its.vector ) {
const arr = new Array( terms.length );
for ( let i = 0; i < terms.length; i += 1 ) {
arr[ i ] = tf[ n ][ terms[ i ] ] || 0;
}
return arr;
}
if ( f === its.tf ) return f( tf[ n ] );
return tf[ n ]; // its.bow — default fall back.
}; // doc.out()
// ## doc.length
/**
*
* Returns the number of unique tokens in the n<sup>th</sup> document.
*
* @return {number} the number of unique tokens in the document.
*/
api.length = function () {
return ( tf.length === 0 ) ? 0 : Object.keys( tf[ n ] ).length;
}; // doc.length()
return api;
}; // doc()
// ## vectorOf
/**
* Computes the vector of the input document given in form of tokens using
* the tf-idf learned so far.
* @param {string[]} tokens tokenized document, usually obtained via winkNLP.
* @return {number[]} its vector.
*/
methods.vectorOf = function ( tokens ) {
computeWeights();
const arr = new Array( terms.length );
const bow = Object.create( null );
const avgDL = sumOfAllDLs / docId;
let thisNorm = 0;
for ( let i = 0; i < tokens.length; i += 1 ) {
const t = tokens[ i ];
bow[ t ] = 1 + ( bow[ t ] || 0 );
}
for ( let i = 0; i < terms.length; i += 1 ) {
const t = terms[ i ];
arr[ i ] = bow[ t ] || 0;
arr[ i ] = idf[ t ] * ( ( k1 + 1 ) * arr[ i ] ) / ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + arr[ i ] );
thisNorm += normFn[ norm ]( arr[ i ] );
}
if ( norm === L2 ) {
thisNorm = Math.sqrt( thisNorm );
} else if ( norm === NONE ) thisNorm = 1;
// `thisNorm || 1` ensures that there is no attempt to divide by zero!
// This may happen if all tokens are unseen.
return arr.map( ( v ) => +( v / ( thisNorm || 1 ) ).toFixed( precision ) );
}; // vectorOf()
// ## bowOf
/**
* Computes the bag-of-words (bowOf) of the input document, using the tf-idf
* learned so far. If `processOOV` is true then for OOV token's frequency is
* computed and its `idf` is assumed to be **1**; otherwise all OOVs are ignored.
* @param {string[]} tokens tokenized text, usually obtained via winkNLP.
* @param {boolean} processOOV true — process OOV, false — ignore OOV (default).
* @return {object} its bow.
*/
methods.bowOf = function ( tokens, processOOV = false ) {
computeWeights();
const bow = Object.create( null );
const avgDL = sumOfAllDLs / docId;
let thisNorm = 0;
if ( typeof processOOV !== 'boolean' ) {
throw Error( 'wink-nlp: processOOV must be a boolean.' );
}
for ( let i = 0; i < tokens.length; i += 1 ) {
const t = tokens[ i ];
// `processOOV` true means count every term otherwise count only if it is
// in the vocabulary i.e. `idf`.
if ( processOOV ) {
bow[ t ] = 1 + ( bow[ t ] || 0 );
} else if ( idf[ t ] ) bow[ t ] = 1 + ( bow[ t ] || 0 );
}
for ( const t in bow ) { // eslint-disable-line guard-for-in
// `bow` tokens are determined by `processOOV` i.e. if true it will contain
// OOVs also otherwise it will not have any OOV. On the other hand `idf`
// always contains all the seen tokens. Therefore when `processOOV` is true,
// the `idf[ t ]` for all OOV will be taken as **1** (highest possible value).
bow[ t ] = ( idf[ t ] || 1 ) * ( ( k1 + 1 ) * bow[ t ] ) / ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + bow[ t ] );
thisNorm += normFn[ norm ]( bow[ t ] );
}
if ( norm === L2 ) {
thisNorm = Math.sqrt( thisNorm );
} else if ( norm === NONE ) thisNorm = 1;
for ( const t in bow ) { // eslint-disable-line guard-for-in
// Unlike in `vectorOf`, `thisNorm || 1` is not needed here as bow will be
// empty if `thisNorm` is zero!
bow[ t ] = +( bow[ t ] / thisNorm ).toFixed( precision );
}
return bow;
}; // bowOf()
methods.config = ( () => ( { k: k, k1: k1, b: b, norm: norm } ) );
// ## loadModel
/**
* Loads the input model JSON into the BM25's respective data structure. Throws
* error if invalid JSON or model is passed. Sets `weightsComputed` to true to
* prevent further learning.
* @param {string} json Input model's JSON string.
* @return {void} Nothing!
*/
methods.loadModel = function ( json ) {
// Used to check presence of required fields; `uid` is checked separately.
const modelFields = [ 'docId', 'tf', 'idf', 'terms', 'sumOfAllDLs' ];
let model;
if ( docId > 0 ) throw Error( 'wink-nlp: can not load model after learning.' );
try {
model = JSON.parse( json );
} catch (e) {
throw Error( `wink-nlp: invalid input JSON:\n\t${e}\n\n` );
}
if ( helper.isObject( model ) && ( Object.keys( model ).length === 6 ) && ( model.uid === 'WinkNLP-BM25Vectorizer-Model/1.0.0' ) ) {
// Check presence of all required fields.
modelFields.forEach( ( f ) => {
if ( model[ f ] === undefined ) throw Error( 'wink-nlp: invalid model format/version' );
} );
// All good, set fields.
docId = model.docId;
tf = model.tf;
idf = model.idf;
terms = model.terms;
sumOfAllDLs = model.sumOfAllDLs;
// To prevent further learning.
weightsComputed = true;
} else {
throw Error( 'wink-nlp: invalid model format/version' );
}
}; // loadModel()
return methods;
}; // bm25Vectorizer()
module.exports = bm25Vectorizer;