Skip to content

Commit

Permalink
(#271) implemented capture groups usefulness checking
Browse files Browse the repository at this point in the history
+ refactoring
  • Loading branch information
xendalm committed Jun 18, 2024
1 parent 42cb212 commit 833cccd
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 63 deletions.
2 changes: 1 addition & 1 deletion apps/MetamorphicTestsApp/src/MetamorphicTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ std::string MetamorphicTests::generate_bregex(RegexGenerator& rg, int cells_num)
condition = false;
}
if (condition)
condition &= r.check_refs();
condition &= r.check_refs_and_memory_writers_usefulness();
} while (!condition);

return rgx_str;
Expand Down
21 changes: 21 additions & 0 deletions apps/UnitTestsApp/src/UnitTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,27 @@ TEST(TestParsing, MFA_equivalence) {
});
}

// Table-driven check of BackRefRegex::check_refs_and_memory_writers_usefulness():
// every backreference regex below is paired with the verdict the checker must return.
TEST(TestBrgexChecker, CheckRefsAndMWs) {
    using Case = std::tuple<string, bool>;
    const vector<Case> cases = {
        {"[a]:1&1", true},
        {"&1[a]:1", false},
        {"(&1[a]:1)*", true},
        {"&1[a]:1&1", false},
        {"[a]:1&1[a]:2", false},
        {"&2[a]:1&1[a]:2", false},
        {"(&2[a]:1&1[a]:2)*", true},
        {"(&1[a]:1[a]:1&1[a]:2)*", false},
        {"(&2[a]:1&1[a]:2)*[a]:3*", false},
        {"(&2[a]:1&1[a]:2)*[a]:3*&3", true},
    };

    // The assertion stays inside a lambda on purpose: a failing ASSERT_EQ returns only
    // from the lambda, so the remaining cases are still evaluated and reported.
    const auto verify_case = [](const Case& current) {
        const auto& [pattern, expected_verdict] = current;
        // Attach the regex text to any failure message produced below.
        SCOPED_TRACE(pattern);
        ASSERT_EQ(BackRefRegex(pattern).check_refs_and_memory_writers_usefulness(),
                  expected_verdict);
    };

    for (const Case& current : cases)
        verify_case(current);
}

TEST(TestReverse, BRegex_Reverse) {
ASSERT_TRUE(BackRefRegex::equal(BackRefRegex("([a*b]:1&1|b&1)").reverse(),
BackRefRegex("[ba*]:1&1|&1b")));
Expand Down
11 changes: 8 additions & 3 deletions libs/Objects/include/Objects/BackRefRegex.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class BackRefRegex : public AlgExpression {

// возвращает вектор листьев дерева
// устанавливает для них in_lin_cells, first_in_cells и last_in_cells
// линеаризует memoryWriters
void preorder_traversal(
std::vector<BackRefRegex*>& terms, // NOLINT(runtime/references)
int& lin_counter, // NOLINT(runtime/references)
Expand Down Expand Up @@ -79,15 +80,17 @@ class BackRefRegex : public AlgExpression {
CellSet>>>& // NOLINT(runtime/references)
) const;

// преобразует star в conc (раскрывает каждую итерацию один раз) и линеаризует memoryWriter
// преобразует star в conc (раскрывает каждую итерацию один раз) и линеаризует memoryWriters
void unfold_iterations(int& number); // NOLINT(runtime/references)
// рекурсивно проверяет, является ли регулярное выражение ацикличным
bool _is_acreg(
std::unordered_set<int>, std::unordered_set<int>,
std::unordered_map<int, std::unordered_set<int>>&) const; // NOLINT(runtime/references)

void linearize_refs(int& number); // NOLINT(runtime/references)
void _check_refs(std::unordered_set<int>&, std::unordered_set<int>&) const;
void _check_memory_writers(std::unordered_map<int, std::unordered_set<int>>&,
std::unordered_set<int>&, // NOLINT(runtime/references)
std::unordered_set<int>&) const; // NOLINT(runtime/references)

// меняет порядок конкатенаций в дереве (swap term_l и term_r)
void _reverse(std::unordered_map<int, BackRefRegex*>&); // NOLINT(runtime/references)
Expand Down Expand Up @@ -123,6 +126,8 @@ class BackRefRegex : public AlgExpression {
// обращение выражения (для СНФ)
BackRefRegex reverse(iLogTemplate* log = nullptr) const;
// проверяет, что каждая ссылка может следовать за записью в память (соответствующую ячейку)
bool check_refs() const;
// и что каждый memoryWriter не будет однозначно переинициализирован без возможности
// сослаться на него (существует хотя бы один путь, в котором присутствует ссылка на него)
bool check_refs_and_memory_writers_usefulness() const;
BackRefRegex rewrite_aci() const;
};
2 changes: 1 addition & 1 deletion libs/Objects/include/Objects/MemoryFiniteAutomaton.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ class MemoryFiniteAutomaton : public AbstractMachine {
std::pair<int, bool> _parse_slow(const std::string&, Matcher*) const;
std::pair<int, bool> _parse(const std::string&, Matcher*) const;

// поиск множества состояний НКА,
// поиск множества состояний MFA,
// достижимых из множества состояний по eps-переходам
std::tuple<std::set<int>, std::unordered_set<int>, MFATransition::MemoryActions>
get_eps_closure(const std::set<int>& indices) const;
Expand Down
103 changes: 56 additions & 47 deletions libs/Objects/src/BackRefRegex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -438,8 +438,6 @@ void BackRefRegex::preorder_traversal(vector<BackRefRegex*>& terms, int& lin_cou
vector<CellSet>& last_in_cells,
unordered_set<int> cur_in_lin_cells,
CellSet cur_first_in_cells, CellSet cur_last_in_cells) {
bool l_contains_eps, r_contains_eps;

switch (type) {
case alt:
cast(term_l)->preorder_traversal(terms,
Expand All @@ -459,9 +457,9 @@ void BackRefRegex::preorder_traversal(vector<BackRefRegex*>& terms, int& lin_cou
cur_first_in_cells,
cur_last_in_cells);
return;
case conc:
l_contains_eps = cast(term_l)->contains_eps();
r_contains_eps = cast(term_r)->contains_eps();
case conc: {
bool l_contains_eps = cast(term_l)->contains_eps();
bool r_contains_eps = cast(term_r)->contains_eps();
cast(term_l)->preorder_traversal(terms,
lin_counter,
in_lin_cells,
Expand All @@ -479,6 +477,7 @@ void BackRefRegex::preorder_traversal(vector<BackRefRegex*>& terms, int& lin_cou
l_contains_eps ? cur_first_in_cells : CellSet(),
cur_last_in_cells);
return;
}
case star:
cast(term_l)->preorder_traversal(terms,
lin_counter,
Expand Down Expand Up @@ -517,17 +516,16 @@ void BackRefRegex::preorder_traversal(vector<BackRefRegex*>& terms, int& lin_cou
}

void BackRefRegex::calculate_may_be_eps(unordered_map<int, vector<BackRefRegex*>>& memory_writers) {
unordered_map<int, vector<BackRefRegex*>> memory_writers_copy;
unordered_map<int, vector<BackRefRegex*>>::iterator it_ref_to;
switch (type) {
case alt:
memory_writers_copy = memory_writers;
case alt: {
auto memory_writers_copy = memory_writers;
cast(term_l)->calculate_may_be_eps(memory_writers);
cast(term_r)->calculate_may_be_eps(memory_writers_copy);
for (const auto& [num, refs_to] : memory_writers_copy)
for (const auto& memory_writer : refs_to)
memory_writers[num].push_back(memory_writer);
return;
}
case conc:
cast(term_l)->calculate_may_be_eps(memory_writers);
cast(term_r)->calculate_may_be_eps(memory_writers);
Expand All @@ -540,14 +538,14 @@ void BackRefRegex::calculate_may_be_eps(unordered_map<int, vector<BackRefRegex*>
memory_writers[cell_number] = {this};
cast(term_l)->calculate_may_be_eps(memory_writers);
return;
case ref:
it_ref_to = memory_writers.find(cell_number);
if (it_ref_to != memory_writers.end())
case ref: {
if (auto it_ref_to = memory_writers.find(cell_number); it_ref_to != memory_writers.end())
for (const auto& memory_writer : it_ref_to->second)
may_be_eps |= memory_writer->contains_eps();
else
may_be_eps = true;
return;
}
default:
return;
}
Expand Down Expand Up @@ -802,9 +800,6 @@ void BackRefRegex::get_cells_under_iteration(unordered_set<int>& iteration_over_
void BackRefRegex::get_follow(
vector<vector<tuple<int, unordered_set<int>, CellSet>>>& following_states) const {
vector<pair<AlgExpression*, ToResetMap>> first, last;
unordered_set<int> iteration_over_cells;
CellSet iteration_over_empty_cells;
pair<bool, ToResetMap> is_eps;
switch (type) {
case Type::alt:
cast(term_l)->get_follow(following_states);
Expand All @@ -824,11 +819,12 @@ void BackRefRegex::get_follow(
}
}
return;
case Type::star:
case Type::star: {
cast(term_l)->get_follow(following_states);
is_eps = contains_eps_tracking_resets();
pair<bool, ToResetMap> is_eps = contains_eps_tracking_resets();
last = cast(term_l)->get_last_nodes_tracking_resets();
first = cast(term_l)->get_first_nodes_tracking_resets();
unordered_set<int> iteration_over_cells;
get_cells_under_iteration(iteration_over_cells);
for (auto& [i, last_to_reset] : last) {
for (auto& [j, first_to_reset] : first) {
Expand All @@ -843,6 +839,7 @@ void BackRefRegex::get_follow(
}
}
return;
}
case Type::memoryWriter:
return cast(term_l)->get_follow(following_states);
default:
Expand Down Expand Up @@ -979,18 +976,17 @@ void BackRefRegex::unfold_iterations(int& number) {

bool BackRefRegex::_is_acreg(unordered_set<int> in_cells, unordered_set<int> in_lin_cells,
unordered_map<int, unordered_set<int>>& refs_in_cells) const {
unordered_map<int, unordered_set<int>>::iterator refs_in_cell;
unordered_map<int, unordered_set<int>> refs_in_cells_copy;
switch (type) {
case alt:
refs_in_cells_copy = refs_in_cells;
case alt: {
auto refs_in_cells_copy = refs_in_cells;
if (!cast(term_l)->_is_acreg(in_cells, in_lin_cells, refs_in_cells))
return false;
if (!cast(term_r)->_is_acreg(in_cells, in_lin_cells, refs_in_cells_copy))
return false;
for (const auto& [num, refs] : refs_in_cells_copy)
refs_in_cells[num].insert(refs.begin(), refs.end());
return true;
}
case conc:
if (!cast(term_l)->_is_acreg(in_cells, in_lin_cells, refs_in_cells))
return false;
Expand All @@ -1001,8 +997,8 @@ bool BackRefRegex::_is_acreg(unordered_set<int> in_cells, unordered_set<int> in_
refs_in_cells[cell_number] = {lin_number};
return cast(term_l)->_is_acreg(in_cells, in_lin_cells, refs_in_cells);
case ref:
refs_in_cell = refs_in_cells.find(cell_number);
if (refs_in_cell != refs_in_cells.end()) {
if (auto refs_in_cell = refs_in_cells.find(cell_number);
refs_in_cell != refs_in_cells.end()) {
for (auto cell_lin_num : in_lin_cells)
// если ссылается на те же линеаризованные memoryWriter, в которых находится сама
if (refs_in_cell->second.count(cell_lin_num))
Expand Down Expand Up @@ -1051,54 +1047,67 @@ void BackRefRegex::linearize_refs(int& number) {
cast(term_l)->linearize_refs(number);
break;
case ref:
symbol.linearize(number);
number++;
symbol.linearize(number++);
break;
default:
break;
}
}

void BackRefRegex::_check_refs(unordered_set<int>& found, unordered_set<int>& found_for_lin) const {
unordered_set<int> found_copy;
void BackRefRegex::_check_memory_writers(
unordered_map<int, unordered_set<int>>& found_memory_writers,
unordered_set<int>& refs_check_set, unordered_set<int>& memory_writers_check_set) const {
switch (type) {
case alt:
found_copy = found;
cast(term_l)->_check_refs(found, found_for_lin);
cast(term_r)->_check_refs(found_copy, found_for_lin);
found.insert(found_copy.begin(), found_copy.end());
case alt: {
auto found_copy = found_memory_writers;
cast(term_l)->_check_memory_writers(
found_memory_writers, refs_check_set, memory_writers_check_set);
cast(term_r)->_check_memory_writers(found_copy, refs_check_set, memory_writers_check_set);
for (const auto& [memory_writer_cell_number, memory_writer_lin_numbers] : found_copy) {
found_memory_writers[memory_writer_cell_number].insert(
memory_writer_lin_numbers.begin(), memory_writer_lin_numbers.end());
}
break;
}
case conc:
cast(term_l)->_check_refs(found, found_for_lin);
cast(term_r)->_check_refs(found, found_for_lin);
cast(term_l)->_check_memory_writers(
found_memory_writers, refs_check_set, memory_writers_check_set);
cast(term_r)->_check_memory_writers(
found_memory_writers, refs_check_set, memory_writers_check_set);
break;
case memoryWriter:
found.insert(cell_number);
cast(term_l)->_check_refs(found, found_for_lin);
found_memory_writers[cell_number] = {lin_number};
cast(term_l)->_check_memory_writers(
found_memory_writers, refs_check_set, memory_writers_check_set);
break;
case ref:
if (found.count(cell_number))
found_for_lin.insert(symbol.last_linearization_number());
if (auto it = found_memory_writers.find(cell_number); it != found_memory_writers.end()) {
refs_check_set.insert(symbol.last_linearization_number());
for (const auto& memory_writer_lin_num : it->second)
memory_writers_check_set.insert(memory_writer_lin_num);
}
break;
default:
break;
}
}

bool BackRefRegex::check_refs() const {
bool BackRefRegex::check_refs_and_memory_writers_usefulness() const {
BackRefRegex temp(*this);

int lin_counter = 0;
temp.linearize_refs(lin_counter);
int refs_lin_counter = 0;
temp.linearize_refs(refs_lin_counter);

int n = 0;
temp.unfold_iterations(n);
int memory_writers_lin_counter = 0;
temp.unfold_iterations(memory_writers_lin_counter);

unordered_set<int> found;
unordered_set<int> found_for_lin;
temp._check_refs(found, found_for_lin);
unordered_map<int, unordered_set<int>> found_memory_writers;
unordered_set<int> refs_check_set;
unordered_set<int> memory_writers_check_set;
temp._check_memory_writers(found_memory_writers, refs_check_set, memory_writers_check_set);

return found_for_lin.size() == lin_counter;
return refs_check_set.size() == refs_lin_counter &&
memory_writers_check_set.size() == memory_writers_lin_counter;
}

void BackRefRegex::_reverse(unordered_map<int, BackRefRegex*>& memory_writers) {
Expand Down
5 changes: 2 additions & 3 deletions libs/Objects/src/MemoryFiniteAutomaton.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1073,11 +1073,10 @@ pair<unordered_set<string>, unordered_set<string>> MemoryFiniteAutomaton::genera
unordered_set<TraversalState, TraversalState::Hasher> visited_states;
while (!current_states.empty()) {
unordered_set<TraversalState, TraversalState::Hasher> following_states;
for (const auto& state_to_process : current_states) {
if (visited_states.count(state_to_process))
for (auto cur_state : current_states) {
if (visited_states.count(cur_state))
continue;

auto cur_state = state_to_process;
cur_state.process_mutations();
const MFAState* state = cur_state.state;
if (state->is_terminal) {
Expand Down
14 changes: 6 additions & 8 deletions libs/Objects/src/Regex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,6 @@ vector<FAState> Regex::_to_thompson(const Alphabet& root_alphabet) const {
vector<FAState> fa_left;
// список состояний и макс индекс состояния для правого автомата относительно операции
vector<FAState> fa_right;
// автомат для отрицания, строится обычный томпсон и берется дополнение
FiniteAutomaton fa_negative;
vector<FAState> fa_negative_states;

switch (type) {
case Type::eps:
Expand Down Expand Up @@ -262,11 +259,11 @@ vector<FAState> Regex::_to_thompson(const Alphabet& root_alphabet) const {

fa_states.emplace_back(int(fa_left.size()) + 1, true);
return fa_states;
case Type::negative:
case Type::negative: {
// строим автомат для отрицания
fa_negative_states = Regex::cast(term_l)->_to_thompson(root_alphabet);

fa_negative = FiniteAutomaton(0, fa_negative_states, root_alphabet);
vector<FAState> fa_negative_states = Regex::cast(term_l)->_to_thompson(root_alphabet);
// автомат для отрицания, строится обычный томпсон и берется дополнение
FiniteAutomaton fa_negative = FiniteAutomaton(0, fa_negative_states, root_alphabet);
fa_negative = fa_negative.minimize();
// берем дополнение автомата
fa_negative = fa_negative.complement();
Expand All @@ -287,6 +284,7 @@ vector<FAState> Regex::_to_thompson(const Alphabet& root_alphabet) const {

// возвращаем состояния и макс индекс
return fa_negative.states;
}
default:
break;
}
Expand Down Expand Up @@ -750,7 +748,7 @@ bool Regex::derivative_with_respect_to_sym(Regex* respected_sym, const Regex* re
result.type = Type::conc;
if (result.term_l == nullptr)
result.term_l = new Regex();
bool answer = derivative_with_respect_to_sym(
answer = derivative_with_respect_to_sym(
respected_sym, Regex::cast(reg_e->term_l), *Regex::cast(result.term_l));
result.term_r = reg_e->make_copy();
return answer;
Expand Down

0 comments on commit 833cccd

Please sign in to comment.