Skip to content

Commit

Permalink
improve ac-automaton
Browse files Browse the repository at this point in the history
  • Loading branch information
thallium committed Jun 26, 2023
1 parent 8c8e125 commit 2c0fd94
Showing 1 changed file with 24 additions and 12 deletions.
36 changes: 24 additions & 12 deletions src/string/ac-automaton.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
#include <queue>
/** Modified from:
* https://github.com/kth-competitive-programming/kactl/blob/master/content/strings/AhoCorasick.h
* Try to handle duplicated patterns beforehand, otherwise change 'end' to
* vector; empty patterns are not allowed. Time: construction takes $O(26N)$,
* If there are no duplicated patterns, just call the constructor; otherwise handle them beforehand
* yourself, or use the return value of insert.
* Empty patterns are not allowed.
* Time: construction takes $O(26N)$,
* where $N =$ sum of lengths of patterns. find(x) is $O(N)$, where N = length of
* x. find_all is $O(N+M)$ where M is the number of occurrences of all patterns (up to N*sqrt(N)) */

Expand All @@ -14,8 +16,7 @@ struct AhoCorasick {
struct Node {
// back: failure link, points to longest suffix that is in the trie.
// end: longest pattern that ends here, is -1 if no pattern ends here.
// nmatches: number of (patterns that is a suffix of current
// node)/(duplicated patterns), depends on needs.
// nmatches: number of patterns that are a suffix of the current node
// output: output link, points to the longest pattern that is a suffix
// of current node
int back, end = -1, nmatches = 0, output = -1;
Expand All @@ -33,7 +34,9 @@ struct AhoCorasick {
build();
}

void insert(const std::string &s, int j) { // j: id of string s
// returns -1 if there's no duplicated pattern already in the trie
// returns the id of the duplicated pattern otherwise
int insert(const std::string &s, int j) { // j: id of string s
assert(!s.empty());
int n = 0;
for (char c : s) {
Expand All @@ -43,13 +46,19 @@ struct AhoCorasick {
}
n = N[n].next[c - first];
}
if (N[n].end != -1) {
return N[n].end;
}
N[n].end = j;
N[n].nmatches++;
return -1;
}

void build() {
// adds a dummy node so the root node can be correctly handled
N[0].back = (int)N.size();
N.emplace_back(0);

std::queue<int> q;
q.push(0);
while (!q.empty()) {
Expand All @@ -64,14 +73,13 @@ struct AhoCorasick {
// if prev is an end node, then set output to prev node,
// otherwise set to output link of prev node
N[v].output = N[fail].end == -1 ? N[fail].output : fail;
// if we don't want to distinguish info of patterns that is
// a suffix of current node, we can add info to the next
// node like this: nxt.nmatches+=N[pnx].nmatches;
N[v].nmatches += N[fail].nmatches;
q.push(v);
}
}
}
}

// for each position, finds the longest pattern that ends here
std::vector<int> find(const std::string &text) {
int len = (int)text.size();
Expand All @@ -83,14 +91,17 @@ struct AhoCorasick {
}
return res;
}
// for each position, finds the all that ends here

// for each position, finds all patterns that end here
std::vector<std::vector<int>> find_all(const std::string &text) {
int len = (int)text.size();
std::vector<std::vector<int>> res(len);
int n = 0;
for (int i = 0; i < len; i++) {
n = N[n].next[text[i] - first];
res[i].push_back(N[n].end);
if (N[n].end != -1) {
res[i].push_back(N[n].end);
}
for (int ind = N[n].output; ind != -1; ind = N[ind].output) {
assert(N[ind].end != -1);
res[i].push_back(N[ind].end);
Expand All @@ -99,8 +110,9 @@ struct AhoCorasick {
return res;
}

std::vector<int> find_cnt(const std::string& text, int n) {
std::vector<int> cnt(n);
// finds the number of occurrences of each pattern
std::vector<int> find_cnt(const std::string& text, int num_of_patterns) {
std::vector<int> cnt(num_of_patterns);
int p = 0;
for (auto c : text) {
p = N[p].next[c - first];
Expand Down

0 comments on commit 2c0fd94

Please sign in to comment.