seql_learn.cpp
/*
* Author: Georgiana Ifrim ([email protected])
* SEQL: Sequence Learner
 * This library trains ElasticNet-regularized Logistic Regression and L2-loss (squared-hinge-loss) SVM for classifying sequences in the feature space of all possible
 * subsequences in the given training set.
 * Elastic Net regularizer: alpha * L1 + (1 - alpha) * L2, which combines the L1 and L2 penalty effects. L1 influences the sparsity of the model, L2 corrects potentially high
 * coefficients resulting from feature correlation (see Regularization Paths for Generalized Linear Models via Coordinate Descent, by Friedman et al., 2010).
*
* The user can influence the outcome classification model by specifying the following parameters:
* [-o objective] (objective function; choice between logistic regression and squared-hinge-svm. By default: logistic regression.)
* [-T maxitr] (number of optimization iterations; by default this is set using a convergence threshold on the aggregated change in score predictions.)
* [-l minpat] (constraint on the min length of any feature)
* [-L maxpat] (constraint on the max length of any feature)
* [-m minsup] (constraint on the min support of any feature, i.e. number of sequences containing the feature)
 * [-g maxgap] (number of consecutive gaps or wildcards allowed in a feature, e.g. a**b is a feature of size 4 with any 2 characters in the middle)
* [-n token_type] (word or character-level token to allow sequences such as 'ab cd ab' and 'abcdab')
 * [-C regularizer_value] (value of the regularization parameter; a higher value means more regularization)
 * [-a alpha] (weight on the L1 vs L2 regularizer; alpha=0.5 means equal weight for L1 and L2)
 * [-r traversal_strategy] (BFS or DFS traversal of the search space; BFS by default)
* [-c convergence_threshold] (stopping threshold for optimisation iterations based on change in aggregated score predictions)
* [-v verbosity] (amount of printed detail about the model)
*
* License:
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation.
*
*/
/* The objective function is: loss(x, y, beta) + C * ElasticNetReg(alpha, beta).
 */
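/* Example invocation (illustrative only; the flags shown are those documented
 * above, while the positional arguments -- a training file and an output model
 * file -- are assumptions about the surrounding tooling, not confirmed by this file):
 *   ./seql_learn -o 0 -C 1 -a 0.2 -g 1 -v 1 train.txt seql.model
 */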
#include <cfloat>
#include <iostream>
#include <cmath>
#include <fstream>
#include <vector>
#include <string>
#include <strstream>
#include <algorithm>
#include <numeric>
#include <functional>
#include <map>
#include <set>
#include <iterator>
#include <cstdlib>
#include <cstring>
#include <unistd.h>
#include "common.h"
#include "sys/time.h"
#include <list>
using namespace std;
class SeqLearner {
private:
// Best ngram rule.
struct rule_t {
// Gradient value for this ngram.
double gradient;
// Length of ngram.
unsigned int size;
// Ngram label.
std::string ngram;
// Ngram support, e.g. docids where it occurs in the collection.
std::vector <unsigned int> loc;
friend bool operator < (const rule_t &r1, const rule_t &r2)
{
return r1.ngram < r2.ngram;
}
};
// The search space rooted at this ngram.
struct space_t {
// Last docid.
int last_docid;
// Pointer to previous ngram.
space_t *prev;
// Label of node where extension is done.
string ne;
// Total list of occurrences in the entire collection for an ngram. A sort of expanded inverted index.
std::vector <int> loc;
// Vector of ngrams which are extensions of current ngram.
std::vector <space_t *> next;
// Add a doc_id and position of occurrence to the total list of occurrences,
// for this ngram.
// Encode the doc_id in the vector of locations.
// Negative entry means new doc_id.
void add (unsigned int docid, int pos) {
if (last_docid != (int)docid) loc.push_back (-(int)(docid+1));
loc.push_back (pos);
last_docid = (int)docid;
//cout << "\ndocid:" << docid << " pos:" << pos;
}
// Shrink the list of total occurrences to contain just the support doc_ids, instead of doc_ids and occurrence positions.
void shrink () {
std::vector<int> tmp;
for (unsigned int i = 0; i < loc.size(); ++i) {
if (loc[i] < 0) tmp.push_back (loc[i]);
}
loc = tmp;
std::vector<int>(loc).swap (loc); // shrink
last_docid = -1;
}
// Return the support of current ngram.
// Simply count the negative loc as doc_ids.
unsigned int support () {
unsigned int result = 0;
for (unsigned int i = 0; i < loc.size(); ++i) {
if (loc[i] < 0) ++result;
}
return result;
}
// Constructor for space_t.
space_t(): last_docid(-1), prev(0) {};
};
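// Illustrative example of the loc encoding used by add()/shrink()/support()
// (derived from the code above, with hypothetical data): if the ngram occurs in
// docid 0 at positions 3 and 7, and in docid 2 at position 1, then after the
// add() calls loc = [-1, 3, 7, -3, 1]; support() counts the negative entries
// and returns 2; shrink() keeps only [-1, -3].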
// Entire collection of documents, each doc represented as a string.
// The collection is a vector of strings.
std::vector < string > transaction;
// True classes.
std::vector < int > y;
// The fraction 1 / (1 + exp(y_i * beta^T * x_i)) used in the gradient computation.
std::vector < long double > exp_fraction;
// Per document, the prediction score beta^T * x_i, i.e. the sum of the selected beta coordinates for the features occurring in document i.
std::vector < double > sum_best_beta;
// The scalar product obtained with the optimal beta according to the line search for best step size.
std::vector < double > sum_best_beta_opt;
// Regularized loss function: loss + C * elasticnet_reg
// SLR loss function: log(1+exp(-yi*beta^t*xi))
// Squared Hinge SVM loss function: sum_{i|1-yi*beta^t*xi > 0} (1 - yi*beta^t*xi)^2
long double loss;
long double old_loss; //keep loss in previous iteration for checking convergence
std::map <string, double> features_cache;
map<string, double>::iterator features_it;
// Objective function. For now choice between logistic regression and l2 (Squared Hinge Loss).
unsigned int objective;
// Regularizer value.
double C;
// Weight on L1 vs L2 regularizer.
double alpha;
// The sum of squared values of all non-zero beta_j.
double sum_squared_betas;
// The sum of abs values of all non-zero beta_j.
double sum_abs_betas;
std::set <string> single_node_minsup_cache;
// Current rule.
rule_t rule;
// Current suboptimal gradient.
double tau;
// Max length for an ngram.
unsigned int maxpat;
// Min length for an ngram.
unsigned int minpat;
// Min support for an ngram.
unsigned int minsup;
// Allow features with gaps. Treat a gap as an additional unigram.
// Max # of consecutive gaps allowed in a feature.
unsigned int maxgap;
// Total number of times the pruning condition is checked
unsigned int total;
// Total number of times the pruning condition is satisfied.
unsigned int pruned;
// Total number of times the best rule is updated.
unsigned int rewritten;
// Convergence threshold on aggregated change in score predictions.
// Used to automatically set the number of optimisation iterations.
double convergence_threshold;
// Verbosity level: 0 - print no information,
// 1 - print profiling information,
// 2 - print statistics on model and obj-fct-value per iteration
// > 2 - print details about search for best n-gram and pruning process
int verbosity;
// Type of token
bool token_type;
// Traversal strategy: BFS or DFS.
bool traversal_strategy;
// Profiling variables.
struct timeval t;
struct timeval t_origin;
struct timeval t_start_iter;
//long double LDBL_MAX = numeric_limits<long double>::max();
// Read the input training documents, "true_class document" per line.
// A line in the training file can be: "+1 a b c"
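// For example, a small word-level training file could look like (labels are +1/-1,
// separated from the document by a tab or space):
//   +1 a b c a
//   -1 b d c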
bool read (const char *filename) {
// Set the max line/document size (10 MB).
const int kMaxLineSize = 10000000;
char *line = new char[kMaxLineSize];
char *column[5];
unsigned int num_pos = 0;
unsigned int num_neg = 0;
string doc;
std::istream *ifs = 0;
if (!strcmp(filename, "-")) {
ifs = &std::cin;
} else {
ifs = new std::ifstream(filename);
if (!*ifs) {
std::cerr << "seql_learn" << " " << filename << " No such file or directory" << std::endl;
return -1;
}
}
if (! ifs) return false;
cout << "\n\nread() input data....";
while (ifs->getline (line, kMaxLineSize)) {
if (line[0] == '\0' || line[0] == ';') continue;
if (line[strlen(line) - 1 ] == '\r')
line[strlen(line) - 1 ] = '\0';
//cout << "\nline size: " << strlen(line);
//cout.flush();
if (2 != tokenize (line, "\t ", column, 2)) {
std::cerr << "FATAL: Format Error: " << line << std::endl;
return false;
}
// Prepare class. _y is +1/-1.
int _y = atoi (column[0]);
y.push_back (_y);
if (_y > 0) num_pos++;
else num_neg++;
// Prepare doc. Assumes column[1] is the original text, e.g. no bracketing of original doc.
doc.assign(column[1]);
transaction.push_back(doc);
//cout << "\nscanning docid: " << transaction.size() - 1 << " class y:" << _y << " doc:" << doc <<"*";
cout.flush();
}
cout << "\n# positive samples: " << num_pos;
cout << "\n# negative samples: " << num_neg;
cout << "\nend read() input data....";
delete [] line;
return true;
}
// For the current ngram, compute the gradient value and check the pruning conditions.
// Update the current optimal ngram.
bool can_prune (space_t *space, unsigned int size) {
//struct timeval t_start;
//gettimeofday(&t_start, NULL);
++total;
// Upper bound for the positive class.
double upos = 0;
// Upper bound for the negative class.
double uneg = 0;
// Gradient value at current ngram.
double gradient = 0;
// Support of current ngram.
unsigned int support = 0;
//string reversed_ngram;
string ngram;
std::vector <int>& loc = space->loc;
// Compute the gradient and the upper bound on gradient of extensions.
for (unsigned int i = 0; i < loc.size(); ++i) {
if (loc[i] >= 0) continue;
++support;
unsigned int j = (unsigned int)(-loc[i]) - 1;
// Choose objective function. 0: SLR, 2: l2-SVM.
if (objective == 0) { //SLR
// From differentiation we get a minus sign in front of the sum over documents.
gradient -= y[j] * exp_fraction[j];
if (y[j] > 0) {
upos -= y[j] * exp_fraction[j];
} else {
uneg -= y[j] * exp_fraction[j];
}
} //end SLR (logistic regression)
/* if (objective == 1) { //L1-SVM (Hinge loss)
if (1 - y[j] * sum_best_beta[j] > 0) {
gradient -= y[j];
if (y[j] > 0) {
upos -= y[j];
} else {
uneg -= y[j];
}
}
} //end l1-SVM */
if (objective == 2) { //L2-SVM
if (1 - y[j] * sum_best_beta[j] > 0) {
gradient -= 2 * y[j] * (1 - y[j] * sum_best_beta[j]);
if (y[j] > 0) {
upos -= 2 * y[j] * (1 - y[j] * sum_best_beta[j]);
} else {
uneg -= 2 * y[j] * (1 - y[j] * sum_best_beta[j]);
}
}
} //end l2-SVM
}
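// (Sketch of the per-document gradient terms accumulated above, for documents
//  where the current ngram occurs; s_j = beta^T * x_j denotes sum_best_beta[j]:
//    SLR:    d/dbeta log(1 + exp(-y_j * s_j)) = -y_j / (1 + exp(y_j * s_j)) = -y_j * exp_fraction[j]
//    L2-SVM: d/dbeta (1 - y_j * s_j)^2        = -2 * y_j * (1 - y_j * s_j)   when 1 - y_j * s_j > 0.
//  upos collects the terms from positive documents and uneg those from negative documents.)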
//************ Debug info *****************
if (C != 0) {
// Retrieve the current ngram
if (!token_type) { // If word-level token: a bb cd a bb
for (space_t *t = space; t; t = t->prev) {
ngram = " " + t->ne + ngram;
}
// skip the space in front of the ngram
ngram.assign(ngram.substr(1));
} else { //char-level tokens: abbcdabb
for (space_t *t = space; t; t = t->prev) {
ngram = t->ne + ngram;
}
}
if (verbosity > 3) {
cout << "\ncurrent ngram rule: " << ngram;
cout << "\nlocation size: " << space->loc.size() << "\n";
//for (unsigned int i = 0; i < space->loc.size(); ++i)
// cout << space->loc[i] << " ";
cout << "\ngradient (before regularizer): " << gradient;
cout << "\nupos (before regularizer): " << upos;
cout << "\nuneg (before regularizer): " << uneg;
cout << "\ntau: " << tau;
}
} // end if (C != 0)
//*****************************************************
// Assume we can prune until the bound says otherwise.
bool flag_can_prune = 1;
// In case C != 0 we need to update the gradient and check the exact bound
// for non-zero features.
if ( C != 0 ) {
double current_upos = 0;
double current_uneg = 0;
// Retrieve the beta_j coefficient of this feature. If beta_j is non-zero,
// update the gradient: gradient += C * [alpha*sign(beta_j) + (1-alpha)*beta_j];
// lower_bound returns an iterator to the first key >= the given key.
features_it = features_cache.lower_bound(ngram);
// If there are keys starting with this prefix (this ngram starts at pos 0 in existing feature).
if (features_it != features_cache.end() && features_it->first.find(ngram) == 0) {
// If found an exact match for the key.
// add regularizer to gradient.
if (features_it->first.compare(ngram) == 0) {
int sign = abs(features_it->second)/features_it->second;
gradient += C * (alpha * sign + (1-alpha) * features_it->second);
}
if (verbosity > 3) {
cout << "\ngradient after regularizer: " << gradient;
}
// Check if current feature s_j is a prefix of any non-zero features s_j'.
// Check exact bound for every such non-zero feature.
while (features_it != features_cache.end() && features_it->first.find(ngram) == 0) {
int sign = abs(features_it->second)/features_it->second;
current_upos = upos + C * (alpha * sign + (1-alpha) * features_it->second);
current_uneg = uneg + C * (alpha * sign + (1-alpha) * features_it->second);
if (verbosity > 3) {
cout << "\nexisting feature starting with current ngram rule prefix: "
<< features_it->first << ", " << features_it->second << ", sign: " << sign;
cout << "\ncurrent_upos: " << current_upos;
cout << "\ncurrent_uneg: " << current_uneg;
cout << "\ntau: " << tau;
}
// Check bound. If any non-zero feature starting with current ngram as a prefix
// can still qualify for selection in the model,
// we cannot prune the search space.
if (std::max (abs(current_upos), abs(current_uneg)) > tau ) {
flag_can_prune = 0;
break;
}
++features_it;
}
} else {
// If there are no features in the model starting with this prefix, check regular bound.
if (std::max (abs(upos), abs(uneg)) > tau ) {
flag_can_prune = 0;
}
}
} // If C = 0, check regular bound.
else {
if (std::max (abs(upos), abs(uneg)) > tau ) {
flag_can_prune = 0;
}
}
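// (Rationale for the bound: any extension of the current ngram occurs only in a
//  subset of the documents containing the ngram, so its gradient is a sub-sum of
//  the per-document terms above. Taking only the positive-class terms gives upos
//  and only the negative-class terms gives uneg, so the gradient of any extension
//  lies between them and |gradient| <= max(|upos|, |uneg|). If that maximum is
//  <= tau, no extension can beat the current best feature and the subtree is pruned.)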
if (support < minsup || flag_can_prune) {
++pruned;
if (verbosity > 3) {
cout << "\n\nminsup || upper bound: pruned!\n";
}
return true;
}
double g = std::abs (gradient);
// If current ngram better than previous best ngram, update optimal ngram.
// Check min length requirement.
if (g > tau && size >= minpat) {
// || g == tau && size < rule.size) {
++rewritten;
tau = g;
rule.gradient = gradient;
rule.size = size;
if (C == 0) { // Retrieve the best ngram. If C != 0 this is already done.
if (!token_type) { // If word-level token: a bb cd a bb
// traverse ngram going from the end to the front
for (space_t *t = space; t; t = t->prev) {
ngram = " " + t->ne + ngram;
}
// skip the space in front of the ngram
ngram.assign(ngram.substr(1));
} else { //char-level tokens: abbcdabb
for (space_t *t = space; t; t = t->prev) {
ngram = t->ne + ngram;
}
}
} //end C==0.
rule.ngram = ngram;
if (verbosity >= 3) {
cout << "\n\ncurrent best ngram rule: " << rule.ngram;
cout << "\ngradient: " << gradient << "\n";
}
rule.loc.clear ();
for (unsigned int i = 0; i < space->loc.size(); ++i) {
// Keep the doc ids where the best ngram appears.
if (space->loc[i] < 0) rule.loc.push_back ((unsigned int)(-space->loc[i]) - 1);
}
}
//cout << "\nend can_prune...";
//gettimeofday(&t, NULL);
//cout << " ( " << t.tv_sec - t_start.tv_sec << " seconds; " << (t.tv_sec - t_start.tv_sec) / 60.0 << " minutes )";
return false;
}
// Try to grow the ngram to next level, and prune the appropriate extensions.
// The growth is done breadth-first, i.e. grow all unigrams to bi-grams, then all bi-grams to tri-grams, etc.
void span_bfs (space_t *space, std::vector<space_t *>& new_space, unsigned int size) {
//struct timeval t_start;
//gettimeofday(&t_start, NULL);
//cout << "\nspan next level....";
//cout.flush();
std::vector <space_t *>& next = space->next;
// If working with gaps.
// Check if number of consecutive gaps exceeds the max allowed.
if (maxgap) {
unsigned int num_consec_gaps = 0;
for (space_t *t = space; t; t = t->prev) {
if (t->ne.compare("*") == 0)
num_consec_gaps ++;
else break;
}
if (num_consec_gaps > maxgap) return;
}
if (next.size() == 1 && next[0] == 0) {
return;
}
else {
// If there are candidates for extension, iterate through them, and try to prune some.
if (! next.empty()) {
if (verbosity > 4)
cout << "\n !next.empty()";
for (std::vector<space_t*>::iterator it = next.begin(); it != next.end(); ++it) {
// If the last token is a gap, skip checking gradient and pruning bound, since this is the same as for the prev ngram without the gap token.
// E.g., if we checked the gradient and bounds for "a" and didn't prune it, then the gradient and bounds for "a*" will be the same,
// so we can safely skip recomputing the gradient and bounds.
if ((*it)->ne.compare("*") == 0) {
if (verbosity > 4)
cout << "\nFeature ends in *, skipping gradient and bound computation. Extending to next dfs level.";
new_space.push_back ((*it));
} else if (! can_prune (*it, size)) {
new_space.push_back ((*it));
}
}
} else {
//cout << "\nnext.empty";
// Prepare possible extensions.
unsigned int docid = 0;
// Candidates obtained by extension.
std::map<string, space_t> candidates;
std::vector <int>& loc = space->loc;
// Iterate through the inverted index of the current feature.
for (unsigned int i = 0; i < loc.size(); ++i) {
if (loc[i] < 0) {
docid = (unsigned int)(-loc[i]) - 1;
//cout << "\n\ndocid: " << docid;
continue;
}
// the start position of this unigram in the document
unsigned int pos = loc[i];
// current unigram where extension is done
string unigram = space->ne;
if (verbosity > 4) {
cout << "\ncurrent pos and start char: " << pos << " " << transaction[docid][pos];
cout << "\ncurrent unigram to be extended (space->ne):" << unigram;
}
string next_unigram;
// If not re-initialized, it should fail.
unsigned int pos_start_next_unigram = transaction[docid].size();
//cout << "\npos: " << pos;
if (pos + unigram.size() < transaction[docid].size()) { //unigram is not in the end of doc, thus it can be extended.
if (verbosity > 4) {
cout << "\npotential extension...";
}
if (!token_type) { // Word level token.
// Find the first space after current unigram position.
unsigned int pos_space = pos + unigram.size();
// Skip over consecutive spaces.
while ( (pos_space < transaction[docid].size()) && isspace(transaction[docid][pos_space]) ) {
pos_space++;
}
// Check if doc ends in spaces, rather than a unigram.
if (pos_space == transaction[docid].size()) {
//cout <<"\ndocument with docid" << docid << " ends in (consecutive) space(s)...move to next doc";
//std::exit(-1);
continue;
} else {
pos_start_next_unigram = pos_space; //stopped at first non-space after consecutive spaces
size_t pos_next_space = transaction[docid].find(' ', pos_start_next_unigram + 1);
// Case1: the next unigram is in the end of the doc, no second space found.
if (pos_next_space == string::npos) {
next_unigram.assign(transaction[docid].substr(pos_start_next_unigram));
} else { //Case2: the next unigram is inside the doc.
next_unigram.assign(transaction[docid].substr(pos_start_next_unigram, pos_next_space - pos_start_next_unigram));
}
}
} else { // Char level token. Skip spaces.
if (!isspace(transaction[docid][pos + 1])) {
//cout << "\nnext char is not space";
next_unigram = transaction[docid][pos + 1]; //next unigram is next non-space char
pos_start_next_unigram = pos + 1;
} else { // If next char is space.
unsigned int pos_space = pos + 1;
// Skip consecutive spaces.
while ((pos_space < transaction[docid].size()) && isspace(transaction[docid][pos_space])) {
pos_space++;
}
// Check if doc ends in space, rather than a unigram.
if (pos_space == transaction[docid].size()) {
//cout <<"\ndocument with docid" << docid << " ends in (consecutive) space(s)...move to next doc";
//std::exit(-1);
continue;
}
/* //disallow using char-tokenization for space separated tokens.
else {
pos_start_next_unigram = pos_space;
//cout << "\nnext next char is not space";
next_unigram = transaction[docid][pos_start_next_unigram];
} */
}
} //end char level token
if (next_unigram.empty()) {
cout <<"\nFATAL...in span_bfs() next_unigram for extension of current unigram" << unigram << " is empty...exit";
std::exit(-1);
} else {
if (verbosity > 4) {
cout << "\nnext_unigram for extension:" << next_unigram;
cout << "\npos: " << pos_start_next_unigram << " " << transaction[docid][pos_start_next_unigram];
}
}
if (minsup == 1 || single_node_minsup_cache.find (next_unigram) != single_node_minsup_cache.end()) {
//cout << "\nunigram: " << unigram <<" extension: " << next_unigram;
candidates[next_unigram].add (docid, pos_start_next_unigram);
}
if (maxgap) {
// If we allow gaps, we treat a gap as an additional unigram "*".
// Its inverted index will simply be a copy of the inverted index of the original features.
// Example, for original feature "a", we extend to "a *", where inverted index of "a *" is simply
// a copy of the inverted index of "a", except for positions where "a" is the last char in the doc.
candidates["*"].add (docid, pos_start_next_unigram);
}
} //end generating candidates for the current pos
} //end iteration through inverted index (docids and pos) to find candidates
// Keep only doc_ids for occurrences of current ngram.
space->shrink ();
//cout << "\nfinal candidates: ";
//cout.flush();
// Prepare the candidate extensions.
for (std::map <string, space_t >::iterator it = candidates.begin();
it != candidates.end(); ++it) {
space_t* c = new space_t;
c->loc = it->second.loc;
std::vector<int>(c->loc).swap(c->loc);
c->ne = it->first;
c->prev = space;
c->next.clear ();
// Keep all the extensions of the current feature for future iterations.
// If we need to save memory we could sacrifice this storage.
next.push_back (c);
//cout << "\nnode extended: " << space->ne;
//cout << "\nnode for extention: " << it->first;
// If the last token is a gap, skip checking gradient and pruning bound, since this is the same as for ngram without last gap token.
// E.g., if we checked the gradient and bounds for "a" and didn't prune it, then the gradient and bounds for "a*" will be the same,
// so we can safely skip recomputing the gradient and bounds.
if (c->ne.compare("*") == 0) {
if (verbosity >= 3)
cout << "\nFeature ends in *, skipping gradient and bound computation. Extending to next bfs level.";
new_space.push_back (c);
} else if (! can_prune (c, size)) {
new_space.push_back (c);
}
}
if (next.empty()) {
next.push_back (0);
}
std::vector<space_t *>(next).swap (next);
} //end generation of candidates when they weren't stored already.
}
//cout << "\nend span next level....";
//gettimeofday(&t, NULL);
//cout << " ( " << t.tv_sec - t_start.tv_sec << " seconds; " << (t.tv_sec - t_start.tv_sec) / 60.0 << " minutes )";
return;
}
// Try to grow the ngram to next level, and prune the appropriate extensions.
// The growth is done depth-first rather than breadth-first, i.e. grow each candidate to its longest unpruned sequence.
void span_dfs (space_t *space, unsigned int size) {
//struct timeval t_start;
//gettimeofday(&t_start, NULL);
//cout << "\nspan next level....";
//cout.flush();
std::vector <space_t *>& next = space->next;
// Check if ngram larger than maxsize allowed.
if (size > maxpat) return;
// If using gaps.
// Check if number of consecutive gaps exceeds the max allowed.
if (maxgap) {
unsigned int num_consec_gaps = 0;
for (space_t *t = space; t; t = t->prev) {
if (t->ne.compare("*") == 0)
num_consec_gaps ++;
else break;
}
if (num_consec_gaps > maxgap) return;
}
if (next.size() == 1 && next[0] == 0) {
return;
}
else {
//{
// If the extensions are already computed, iterate through them and check pruning condition.
if (! next.empty()) {
if (verbosity >= 3)
cout << "\n !next.empty()";
for (std::vector<space_t*>::iterator it = next.begin(); it != next.end(); ++it) {
if ((*it)->ne.compare("*") == 0) {
if (verbosity >= 3)
cout << "\nFeature ends in *, skipping gradient and bound computation. Extending to next dfs level.";
// Expand each candidate DFS wise.
span_dfs(*it, size + 1);
} else if (! can_prune (*it, size)) {
// Expand each candidate DFS wise.
span_dfs(*it, size + 1);
}
}
} else {
// Extend the ngram to its next level.
//cout << "\nnext.empty";
// Prepare possible extensions.
unsigned int docid = 0;
// Candidates obtained by extension.
std::map<string, space_t> candidates;
std::vector <int>& loc = space->loc;
// Iterate through the inverted index of the current feature.
for (unsigned int i = 0; i < loc.size(); ++i) {
if (loc[i] < 0) {
docid = (unsigned int)(-loc[i]) - 1;
//cout << "\n\ndocid: " << docid;
continue;
}
// the start position of this unigram in the document
unsigned int pos = loc[i];
// current unigram where extension is done
string unigram = space->ne;
if (verbosity > 4) {
cout << "\ncurrent pos and start char: " << pos << " " << transaction[docid][pos];
cout << "\ncurrent unigram to be extended (space->ne):" << unigram;
}
string next_unigram;
// If not re-initialized, it should fail.
unsigned int pos_start_next_unigram = transaction[docid].size();
//cout << "\npos: " << pos;
if (pos + unigram.size() < transaction[docid].size()) { //unigram is not in the end of doc, thus it can be extended.
if (verbosity > 4) {
cout << "\npotential extension...";
}
if (!token_type) { // Word level token.
// Find the first space after current unigram position.
unsigned int pos_space = pos + unigram.size();
// Skip over consecutive spaces.
while ( (pos_space < transaction[docid].size()) && isspace(transaction[docid][pos_space]) ) {
pos_space++;
}
// Check if doc ends in spaces, rather than a unigram.
if (pos_space == transaction[docid].size()) {
//cout <<"\ndocument with docid" << docid << " ends in (consecutive) space(s)...move to next doc";
//std::exit(-1);
continue;
} else {
pos_start_next_unigram = pos_space; //stopped at first non-space after consecutive spaces
size_t pos_next_space = transaction[docid].find(' ', pos_start_next_unigram + 1);
// Case1: the next unigram is in the end of the doc, no second space found.
if (pos_next_space == string::npos) {
next_unigram.assign(transaction[docid].substr(pos_start_next_unigram));
} else { //Case2: the next unigram is inside the doc.
next_unigram.assign(transaction[docid].substr(pos_start_next_unigram, pos_next_space - pos_start_next_unigram));
}
}
} else { // Char level token. Skip spaces.
if (!isspace(transaction[docid][pos + 1])) {
//cout << "\nnext char is not space";
next_unigram = transaction[docid][pos + 1]; //next unigram is next non-space char
pos_start_next_unigram = pos + 1;
} else { // If next char is space.
unsigned int pos_space = pos + 1;
// Skip consecutive spaces.
while ((pos_space < transaction[docid].size()) && isspace(transaction[docid][pos_space])) {
pos_space++;
}
// Check if doc ends in space, rather than a unigram.
if (pos_space == transaction[docid].size()) {
//cout <<"\ndocument with docid" << docid << " ends in (consecutive) space(s)...move to next doc";
//std::exit(-1);
continue;
}
/* //disallow using char-tokenization for space separated tokens.
else {
pos_start_next_unigram = pos_space;
//cout << "\nnext next char is not space";
next_unigram = transaction[docid][pos_start_next_unigram];
} */
}
} //end char level token
if (next_unigram.empty()) {
cout <<"\nFATAL...in span_dfs() next_unigram for extension of current unigram" << unigram << " is empty...exit";
std::exit(-1);
} else {
if (verbosity > 4) {
cout << "\nnext_unigram for extension:" << next_unigram;
cout << "\npos: " << pos_start_next_unigram << " " << transaction[docid][pos_start_next_unigram];
}
}
if (minsup == 1 || single_node_minsup_cache.find (next_unigram) != single_node_minsup_cache.end()) {
//cout << "\nunigram: " << unigram <<" extension: " << next_unigram;
candidates[next_unigram].add (docid, pos_start_next_unigram);
}
if (maxgap) {
// If we allow gaps, we treat a gap as an additional unigram "*".
// Its inverted index will simply be a copy of the inverted index of the original features.
// Example, for original feature "a", we extend to "a *", where inverted index of "a *" is simply
// a copy of the inverted index of "a", except for positions where "a" is the last char in the doc.
candidates["*"].add (docid, pos_start_next_unigram);
}
} //end generating candidates for the current pos
} //end iteration through inverted index (docids and pos) to find candidates
// Keep only doc_ids for occurrences of current ngram.
space->shrink ();
// If no candidate extensions were found, return.
// if (candidates.empty()) {
// if (verbosity >= 3)
// cout << "\n candidates is empty()!... return";
// return;
// }
//cout << "\nfinal candidates: ";
//cout.flush();
// Prepare the candidate extensions.
for (std::map <string, space_t >::iterator it = candidates.begin();
it != candidates.end(); ++it) {
space_t* c = new space_t;
c->loc = it->second.loc;
std::vector<int>(c->loc).swap(c->loc);
c->ne = it->first;
c->prev = space;
c->next.clear ();
// Keep all the extensions of the current feature for future iterations.
// If we need to save memory we could sacrifice this storage.
next.push_back (c);
//cout << "\nnode extended: " << space->ne;
//cout << "\nnode for extention: " << it->first;
// If the last token is a gap, skip checking gradient and pruning bound, since this is the same as for ngram without last gap token.
// E.g., if we checked the gradient and bounds for "a" and didn't prune it, then the gradient and bounds for "a*" will be the same,
// so we can safely skip recomputing the gradient and bounds.
if (c->ne.compare("*") == 0) {
if (verbosity >= 3)
cout << "\nFeature ends in *, skipping gradient and bound computation. Extending to next dfs level.";
span_dfs(c, size + 1);
} else // If this doesn't end in *, then check gradient and pruning condition.
if (! can_prune (c, size)) {
span_dfs(c, size + 1);
}
}
if (next.empty()) {
next.push_back (0);
}
std::vector<space_t *>(next).swap (next);
}
}
//cout << "\nend span next level....";
//gettimeofday(&t, NULL);
//cout << " ( " << t.tv_sec - t_start.tv_sec << " seconds; " << (t.tv_sec - t_start.tv_sec) / 60.0 << " minutes )";
return;
}
// Line search method. Search for step size that minimizes loss.
// Compute the loss at the middle point of the range, beta_n1, and
// at the midpoints of the two sub-ranges (beta_n0, beta_n1) and (beta_n1, beta_n2).
// Compare the loss at the 3 points, and choose the range of 3 points
// which contains the minimum. Repeat until the range spanned by the 3 points is small enough,
// i.e. the range approximates well the vector where the loss function is minimized.
// Return the middle point of the best range.
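// Illustrative sketch of one shrinking step over the three points, as described
// in the comment above (the exact update happens in the loop body below):
//   n0 ---- mid(n0,n1) ---- n1 ---- mid(n1,n2) ---- n2
// Among {mid(n0,n1), n1, mid(n1,n2)}, the point with the smallest loss becomes the
// new middle point and its two neighbours become the new endpoints; repeat until
// the spanned range falls below a small threshold.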
void find_best_range(vector<double>& sum_best_beta_n0, vector<double>& sum_best_beta_n1, vector<double>& sum_best_beta_n2,
vector<double>& sum_best_beta_mid_n0_n1, vector<double>& sum_best_beta_mid_n1_n2,
rule_t& rule, vector<double>* sum_best_beta_opt) {
//struct timeval t, t_start;
//gettimeofday(&t_start, NULL);
// if (verbosity > 3)
// cout << "\n In find_best_range()!";
//cout.flush();
double min_range_size = 1e-3;
double current_range_size = 0;
int current_interpolation_iter = 0;
long double loss_mid_n0_n1 = 0;
long double loss_mid_n1_n2 = 0;
long double loss_n1 = 0;
for (unsigned int i = 0; i < transaction.size(); ++i) {
if (verbosity > 4) {
cout << "\nsum_best_beta_n0[i]: " << sum_best_beta_n0[i];
cout << "\nsum_best_beta_n1[i]: " << sum_best_beta_n1[i];
cout << "\nsum_best_beta_n2[i]: " << sum_best_beta_n2[i];
}
current_range_size += abs(sum_best_beta_n2[i] - sum_best_beta_n0[i]);
}
if (verbosity > 3)
cout << "\ncurrent range size: " << current_range_size;
double beta_coef_n1 = 0;
double beta_coef_mid_n0_n1 = 0;
double beta_coef_mid_n1_n2 = 0;
if (C != 0 && sum_squared_betas != 0) {
features_it = features_cache.find(rule.ngram);
}
// Start interpolation loop.
while (current_range_size > min_range_size) {
if (verbosity > 3)
cout << "\ncurrent interpolation iteration: " << current_interpolation_iter;
for (unsigned int i = 0; i < transaction.size(); ++i) { //loop through training samples
sum_best_beta_mid_n0_n1[i] = (sum_best_beta_n0[i] + sum_best_beta_n1[i]) / 2;
sum_best_beta_mid_n1_n2[i] = (sum_best_beta_n1[i] + sum_best_beta_n2[i]) / 2;
if ( C != 0) {
beta_coef_n1 = sum_best_beta_n1[rule.loc[0]] - sum_best_beta[rule.loc[0]];
beta_coef_mid_n0_n1 = sum_best_beta_mid_n0_n1[rule.loc[0]] - sum_best_beta[rule.loc[0]];
beta_coef_mid_n1_n2 = sum_best_beta_mid_n1_n2[rule.loc[0]] - sum_best_beta[rule.loc[0]];
}
if (verbosity > 4) {
cout << "\nsum_best_beta_mid_n0_n1[i]: " << sum_best_beta_mid_n0_n1[i];
cout << "\nsum_best_beta_mid_n1_n2[i]: " << sum_best_beta_mid_n1_n2[i];
}
if (objective == 0 ) { //SLR
if (-y[i] * sum_best_beta_n1[i] > 8000) {
loss_n1 += log(LDBL_MAX);
} else {
loss_n1 += log(1 + exp(-y[i] * sum_best_beta_n1[i]));
}
if (-y[i] * sum_best_beta_mid_n0_n1[i] > 8000) {
loss_mid_n0_n1 += log(LDBL_MAX);
} else {
loss_mid_n0_n1 += log(1 + exp(-y[i] * sum_best_beta_mid_n0_n1[i]));
}
if (-y[i] * sum_best_beta_mid_n1_n2[i] > 8000) {
loss_mid_n1_n2 += log(LDBL_MAX);
} else {
loss_mid_n1_n2 += log(1 + exp(-y[i] * sum_best_beta_mid_n1_n2[i]));
}
} //end SLR
/*
if (objective == 1) { //L1-SVM
if (1 - y[i] * sum_best_beta_n1[i] > 0)
loss_n1 += 1 - y[i] * sum_best_beta_n1[i];
if (1 - y[i] * sum_best_beta_mid_n0_n1[i] > 0)
loss_mid_n0_n1 += 1 - y[i] * sum_best_beta_mid_n0_n1[i];
if (1 - y[i] * sum_best_beta_mid_n1_n2[i] > 0)
loss_mid_n1_n2 += 1 - y[i] * sum_best_beta_mid_n1_n2[i];
} //end l1-SVM
*/
if (objective == 2) { //l2-SVM
if (1 - y[i] * sum_best_beta_n1[i] > 0)
loss_n1 += pow(1 - y[i] * sum_best_beta_n1[i], 2);
if (1 - y[i] * sum_best_beta_mid_n0_n1[i] > 0)
loss_mid_n0_n1 += pow(1 - y[i] * sum_best_beta_mid_n0_n1[i], 2);
if (1 - y[i] * sum_best_beta_mid_n1_n2[i] > 0)
loss_mid_n1_n2 += pow(1 - y[i] * sum_best_beta_mid_n1_n2[i], 2);
} //end l2-SVM
} //end loop through training samples.
if ( C != 0 ) {
// Add the Elastic Net regularization term.
// If this is the first ngram selected.