-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpattern_random.cpp
67 lines (64 loc) · 2.09 KB
/
pattern_random.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <sdsl/suffix_arrays.hpp>
using namespace sdsl;
using namespace std;
uint64_t my_rand64()
{
int_vector<> v(1);
util::set_random_bits(v);
return v[0];
}
namespace {

// Parses `text` as a plain non-negative decimal integer.
// Returns true and stores the result in `value` on success; returns false
// (leaving `value` untouched) if the text is empty, does not start with a
// digit, has trailing characters, or does not fit into the integer range.
bool parse_u64(const char* text, uint64_t& value)
{
    // Requiring a leading digit rejects signs and whitespace, which strtoull
    // would otherwise accept (a leading '-' is silently wrapped around).
    if (text == nullptr || *text < '0' || *text > '9') {
        return false;
    }
    errno = 0;
    char* end = nullptr;
    const unsigned long long parsed = std::strtoull(text, &end, 10);
    if (errno != 0 || *end != '\0') {
        return false;
    }
    value = parsed;
    return true;
}

} // namespace

// Samples random patterns of a fixed length from a text until their
// cumulative number of occurrences reaches a threshold.
//
// Arguments: input_file tmp_dir pat_len min_cum_occ pat_file occ_file
//   - The patterns are written to pat_file behind a single header line.
//   - The occurrence count of each pattern is written to occ_file, one per line.
// Returns 0 on success and 1 on a usage error, invalid argument, unwritable
// output file, or a pattern length that is too long for the text.
int main(int argc, char** argv)
{
    if (argc < 7) {
        printf("Usage: %s input_file tmp_dir pat_len min_cum_occ pat_file occ_file\n",argv[0]);
        printf(" (1) Build an index for the text T stored in input_file.\n");
        printf("     Temporary files are stored in tmp_dir.\n");
        printf(" (2) Set cum_occ=0\n");
        printf(" (3) Select a random position in T and extract a pattern P\n");
        printf("     of length pat_len from this position.\n");
        printf(" (4) Get x, the number of occurrences of P in T.\n");
        printf(" (5) cum_occ+=x; if cum_occ>=min_cum_occ stop sampling.\n");
        printf("     Otherwise go to step (3).\n");
        printf(" (6) Write the patterns in Pizza&Chili format into pat_file.\n");
        printf(" (7) Write the #occurrences of each pattern into occ_file.\n");
        return 1;
    }

    uint64_t min_cum_occ = 0;
    if (!parse_u64(argv[4], min_cum_occ)) {
        std::cerr << "min_cum_occ='" << argv[4] << "' is not a valid non-negative integer." << std::endl;
        return 1;
    }
    uint64_t pat_len = 0;
    // A zero length must be rejected at runtime: start_idx+pat_len-1 below
    // would wrap around, and an assert vanishes in NDEBUG builds.
    if (!parse_u64(argv[3], pat_len) || pat_len == 0) {
        std::cerr << "pat_len='" << argv[3] << "' is not a valid positive integer." << std::endl;
        return 1;
    }

    // Open both outputs before the index construction so that an unwritable
    // path is reported before any of that work is done.
    ofstream out(argv[5]);
    if (!out) {
        std::cerr << "Could not open pat_file '" << argv[5] << "' for writing." << std::endl;
        return 1;
    }
    ofstream occ_out(argv[6]);
    if (!occ_out) {
        std::cerr << "Could not open occ_file '" << argv[6] << "' for writing." << std::endl;
        return 1;
    }

    // (1) Build the index; temporary files go to tmp_dir (argv[2]).
    string file = util::basename(argv[1]);
    csa_wt<> csa;
    cache_config cconfig(false, argv[2], file);
    construct(csa, argv[1], cconfig, 1);
    if (pat_len>=csa.size()) {
        std::cout << "pat_len=" << pat_len << " too long." << std::endl;
        return 1;
    }

    // (2) Nothing sampled yet.
    vector<string> pattern;
    uint64_t cum_sum = 0;
    uint64_t number = 0;
    do {
        // (3) pat_len < csa.size() was checked above, so the modulus is >= 1.
        uint64_t start_idx = my_rand64()%(csa.size()-pat_len);
        auto pat = extract(csa, start_idx, start_idx+pat_len-1);
        // (4)
        uint64_t x = count(csa, pat.begin(), pat.end());
        occ_out << x << '\n';
        // (5)
        cum_sum += x;
        ++number;
        // The pattern is no longer needed locally, so move it into the list.
        pattern.push_back(std::move(pat));
    } while (cum_sum < min_cum_occ);

    // (6) Header line, then all patterns back to back without separators
    // (every pattern has the same length pat_len).
    out<<"# number="<<number<<" length="<<pat_len<<" file="<<argv[1];
    out<<" forbidden="<<std::endl;
    for (const auto& p : pattern) {
        out<<p;
    }

    // Flush explicitly so that write errors are detected and reported.
    out.flush();
    occ_out.flush();
    if (!out || !occ_out) {
        std::cerr << "Error while writing the output files." << std::endl;
        return 1;
    }
    return 0;
}