From 833cccd5df16553d815da123856477e94eb8a75a Mon Sep 17 00:00:00 2001 From: Alexander Delman Date: Tue, 18 Jun 2024 07:46:36 +0300 Subject: [PATCH] (#271) implemented capture groups usefulness checking + refactoring --- .../src/MetamorphicTests.cpp | 2 +- apps/UnitTestsApp/src/UnitTests.cpp | 21 ++++ libs/Objects/include/Objects/BackRefRegex.h | 11 +- .../include/Objects/MemoryFiniteAutomaton.h | 2 +- libs/Objects/src/BackRefRegex.cpp | 103 ++++++++++-------- libs/Objects/src/MemoryFiniteAutomaton.cpp | 5 +- libs/Objects/src/Regex.cpp | 14 +-- 7 files changed, 95 insertions(+), 63 deletions(-) diff --git a/apps/MetamorphicTestsApp/src/MetamorphicTests.cpp b/apps/MetamorphicTestsApp/src/MetamorphicTests.cpp index 4724a6a0..ad76036e 100644 --- a/apps/MetamorphicTestsApp/src/MetamorphicTests.cpp +++ b/apps/MetamorphicTestsApp/src/MetamorphicTests.cpp @@ -89,7 +89,7 @@ std::string MetamorphicTests::generate_bregex(RegexGenerator& rg, int cells_num) condition = false; } if (condition) - condition &= r.check_refs(); + condition &= r.check_refs_and_memory_writers_usefulness(); } while (!condition); return rgx_str; diff --git a/apps/UnitTestsApp/src/UnitTests.cpp b/apps/UnitTestsApp/src/UnitTests.cpp index 48338755..93f98c8b 100644 --- a/apps/UnitTestsApp/src/UnitTests.cpp +++ b/apps/UnitTestsApp/src/UnitTests.cpp @@ -759,6 +759,27 @@ TEST(TestParsing, MFA_equivalence) { }); } +TEST(TestBrgexChecker, CheckRefsAndMWs) { + using Test = std::tuple; + vector tests = { + {"[a]:1&1", true}, + {"&1[a]:1", false}, + {"(&1[a]:1)*", true}, + {"&1[a]:1&1", false}, + {"[a]:1&1[a]:2", false}, + {"&2[a]:1&1[a]:2", false}, + {"(&2[a]:1&1[a]:2)*", true}, + {"(&1[a]:1[a]:1&1[a]:2)*", false}, + {"(&2[a]:1&1[a]:2)*[a]:3*", false}, + {"(&2[a]:1&1[a]:2)*[a]:3*&3", true}, + }; + for_each(tests.begin(), tests.end(), [](const Test& test) { + auto [rgx, expected_res] = test; + SCOPED_TRACE(rgx); + ASSERT_EQ(BackRefRegex(rgx).check_refs_and_memory_writers_usefulness(), expected_res); + }); +} + TEST(TestReverse, BRegex_Reverse) { ASSERT_TRUE(BackRefRegex::equal(BackRefRegex("([a*b]:1&1|b&1)").reverse(), BackRefRegex("[ba*]:1&1|&1b"))); diff --git a/libs/Objects/include/Objects/BackRefRegex.h b/libs/Objects/include/Objects/BackRefRegex.h index b282b08a..1b102268 100644 --- a/libs/Objects/include/Objects/BackRefRegex.h +++ b/libs/Objects/include/Objects/BackRefRegex.h @@ -51,6 +51,7 @@ class BackRefRegex : public AlgExpression { // возвращает вектор листьев дерева // устанавливает для них in_lin_cells, first_in_cells и last_in_cells + // линеаризует memoryWriters void preorder_traversal( std::vector& terms, // NOLINT(runtime/references) int& lin_counter, // NOLINT(runtime/references) @@ -79,7 +80,7 @@ class BackRefRegex : public AlgExpression { CellSet>>>& // NOLINT(runtime/references) ) const; - // преобразует star в conc (раскрывает каждую итерацию один раз) и линеаризует memoryWriter + // преобразует star в conc (раскрывает каждую итерацию один раз) и линеаризует memoryWriters void unfold_iterations(int& number); // NOLINT(runtime/references) // рекурсивно проверяет, является ли регулярное выражение ацикличным bool _is_acreg( @@ -87,7 +88,9 @@ class BackRefRegex : public AlgExpression { std::unordered_map>&) const; // NOLINT(runtime/references) void linearize_refs(int& number); // NOLINT(runtime/references) - void _check_refs(std::unordered_set&, std::unordered_set&) const; + void _check_memory_writers(std::unordered_map>&, + std::unordered_set&, // NOLINT(runtime/references) + std::unordered_set&) const; // NOLINT(runtime/references) // меняет порядок конкатенаций в дереве (swap term_l и term_r) void _reverse(std::unordered_map&); // NOLINT(runtime/references) @@ -123,6 +126,8 @@ class BackRefRegex : public AlgExpression { // обращение выражения (для СНФ) BackRefRegex reverse(iLogTemplate* log = nullptr) const; // проверяет, что каждая ссылка может следовать за записью в память (соответствующую ячейку) - bool check_refs() const; + // и что каждый memoryWriter не будет однозначно переинициализирован без возможности + // сослаться на него (существует хотя бы один путь, в котором присутствует ссылка на него) + bool check_refs_and_memory_writers_usefulness() const; BackRefRegex rewrite_aci() const; }; \ No newline at end of file diff --git a/libs/Objects/include/Objects/MemoryFiniteAutomaton.h b/libs/Objects/include/Objects/MemoryFiniteAutomaton.h index 68650853..a354febc 100644 --- a/libs/Objects/include/Objects/MemoryFiniteAutomaton.h +++ b/libs/Objects/include/Objects/MemoryFiniteAutomaton.h @@ -147,7 +147,7 @@ class MemoryFiniteAutomaton : public AbstractMachine { std::pair _parse_slow(const std::string&, Matcher*) const; std::pair _parse(const std::string&, Matcher*) const; - // поиск множества состояний НКА, + // поиск множества состояний MFA, // достижимых из множества состояний по eps-переходам std::tuple, std::unordered_set, MFATransition::MemoryActions> get_eps_closure(const std::set& indices) const; diff --git a/libs/Objects/src/BackRefRegex.cpp b/libs/Objects/src/BackRefRegex.cpp index 1c2e3faf..4d43df99 100644 --- a/libs/Objects/src/BackRefRegex.cpp +++ b/libs/Objects/src/BackRefRegex.cpp @@ -438,8 +438,6 @@ void BackRefRegex::preorder_traversal(vector& terms, int& lin_cou vector& last_in_cells, unordered_set cur_in_lin_cells, CellSet cur_first_in_cells, CellSet cur_last_in_cells) { - bool l_contains_eps, r_contains_eps; - switch (type) { case alt: cast(term_l)->preorder_traversal(terms, @@ -459,9 +457,9 @@ void BackRefRegex::preorder_traversal(vector& terms, int& lin_cou cur_first_in_cells, cur_last_in_cells); return; - case conc: - l_contains_eps = cast(term_l)->contains_eps(); - r_contains_eps = cast(term_r)->contains_eps(); + case conc: { + bool l_contains_eps = cast(term_l)->contains_eps(); + bool r_contains_eps = cast(term_r)->contains_eps(); cast(term_l)->preorder_traversal(terms, lin_counter, in_lin_cells, @@ -479,6 +477,7 @@ void BackRefRegex::preorder_traversal(vector& terms, int& lin_cou l_contains_eps ? cur_first_in_cells : CellSet(), cur_last_in_cells); return; + } case star: cast(term_l)->preorder_traversal(terms, lin_counter, @@ -517,17 +516,16 @@ void BackRefRegex::preorder_traversal(vector& terms, int& lin_cou } void BackRefRegex::calculate_may_be_eps(unordered_map>& memory_writers) { - unordered_map> memory_writers_copy; - unordered_map>::iterator it_ref_to; switch (type) { - case alt: - memory_writers_copy = memory_writers; + case alt: { + auto memory_writers_copy = memory_writers; cast(term_l)->calculate_may_be_eps(memory_writers); cast(term_r)->calculate_may_be_eps(memory_writers_copy); for (const auto& [num, refs_to] : memory_writers_copy) for (const auto& memory_writer : refs_to) memory_writers[num].push_back(memory_writer); return; + } case conc: cast(term_l)->calculate_may_be_eps(memory_writers); cast(term_r)->calculate_may_be_eps(memory_writers); @@ -540,14 +538,14 @@ void BackRefRegex::calculate_may_be_eps(unordered_map memory_writers[cell_number] = {this}; cast(term_l)->calculate_may_be_eps(memory_writers); return; - case ref: - it_ref_to = memory_writers.find(cell_number); - if (it_ref_to != memory_writers.end()) + case ref: { + if (auto it_ref_to = memory_writers.find(cell_number); it_ref_to != memory_writers.end()) for (const auto& memory_writer : it_ref_to->second) may_be_eps |= memory_writer->contains_eps(); else may_be_eps = true; return; + } default: return; } @@ -802,9 +800,6 @@ void BackRefRegex::get_cells_under_iteration(unordered_set& iteration_over_ void BackRefRegex::get_follow( vector, CellSet>>>& following_states) const { vector> first, last; - unordered_set iteration_over_cells; - CellSet iteration_over_empty_cells; - pair is_eps; switch (type) { case Type::alt: cast(term_l)->get_follow(following_states); @@ -824,11 +819,12 @@ void BackRefRegex::get_follow( } } return; - case Type::star: + case Type::star: { cast(term_l)->get_follow(following_states); - is_eps = contains_eps_tracking_resets(); + pair is_eps = contains_eps_tracking_resets(); last = cast(term_l)->get_last_nodes_tracking_resets(); first = cast(term_l)->get_first_nodes_tracking_resets(); + unordered_set iteration_over_cells; get_cells_under_iteration(iteration_over_cells); for (auto& [i, last_to_reset] : last) { for (auto& [j, first_to_reset] : first) { @@ -843,6 +839,7 @@ void BackRefRegex::get_follow( } } return; + } case Type::memoryWriter: return cast(term_l)->get_follow(following_states); default: @@ -979,11 +976,9 @@ void BackRefRegex::unfold_iterations(int& number) { bool BackRefRegex::_is_acreg(unordered_set in_cells, unordered_set in_lin_cells, unordered_map>& refs_in_cells) const { - unordered_map>::iterator refs_in_cell; - unordered_map> refs_in_cells_copy; switch (type) { - case alt: - refs_in_cells_copy = refs_in_cells; + case alt: { + auto refs_in_cells_copy = refs_in_cells; if (!cast(term_l)->_is_acreg(in_cells, in_lin_cells, refs_in_cells)) return false; if (!cast(term_r)->_is_acreg(in_cells, in_lin_cells, refs_in_cells_copy)) @@ -991,6 +986,7 @@ bool BackRefRegex::_is_acreg(unordered_set in_cells, unordered_set in_ for (const auto& [num, refs] : refs_in_cells_copy) refs_in_cells[num].insert(refs.begin(), refs.end()); return true; + } case conc: if (!cast(term_l)->_is_acreg(in_cells, in_lin_cells, refs_in_cells)) return false; @@ -1001,8 +997,8 @@ bool BackRefRegex::_is_acreg(unordered_set in_cells, unordered_set in_ refs_in_cells[cell_number] = {lin_number}; return cast(term_l)->_is_acreg(in_cells, in_lin_cells, refs_in_cells); case ref: - refs_in_cell = refs_in_cells.find(cell_number); - if (refs_in_cell != refs_in_cells.end()) { + if (auto refs_in_cell = refs_in_cells.find(cell_number); + refs_in_cell != refs_in_cells.end()) { for (auto cell_lin_num : in_lin_cells) // если ссылается на те же линеаризованные memoryWriter, в которых находится сама if (refs_in_cell->second.count(cell_lin_num)) @@ -1051,54 +1047,67 @@ void BackRefRegex::linearize_refs(int& number) { cast(term_l)->linearize_refs(number); break; case ref: - symbol.linearize(number); - number++; + symbol.linearize(number++); break; default: break; } } -void BackRefRegex::_check_refs(unordered_set& found, unordered_set& found_for_lin) const { - unordered_set found_copy; +void BackRefRegex::_check_memory_writers( + unordered_map>& found_memory_writers, + unordered_set& refs_check_set, unordered_set& memory_writers_check_set) const { switch (type) { - case alt: - found_copy = found; - cast(term_l)->_check_refs(found, found_for_lin); - cast(term_r)->_check_refs(found_copy, found_for_lin); - found.insert(found_copy.begin(), found_copy.end()); + case alt: { + auto found_copy = found_memory_writers; + cast(term_l)->_check_memory_writers( + found_memory_writers, refs_check_set, memory_writers_check_set); + cast(term_r)->_check_memory_writers(found_copy, refs_check_set, memory_writers_check_set); + for (const auto& [memory_writer_cell_number, memory_writer_lin_numbers] : found_copy) { + found_memory_writers[memory_writer_cell_number].insert( + memory_writer_lin_numbers.begin(), memory_writer_lin_numbers.end()); + } break; + } case conc: - cast(term_l)->_check_refs(found, found_for_lin); - cast(term_r)->_check_refs(found, found_for_lin); + cast(term_l)->_check_memory_writers( + found_memory_writers, refs_check_set, memory_writers_check_set); + cast(term_r)->_check_memory_writers( + found_memory_writers, refs_check_set, memory_writers_check_set); break; case memoryWriter: - found.insert(cell_number); - cast(term_l)->_check_refs(found, found_for_lin); + found_memory_writers[cell_number] = {lin_number}; + cast(term_l)->_check_memory_writers( + found_memory_writers, refs_check_set, memory_writers_check_set); break; case ref: - if (found.count(cell_number)) - found_for_lin.insert(symbol.last_linearization_number()); + if (auto it = found_memory_writers.find(cell_number); it != found_memory_writers.end()) { + refs_check_set.insert(symbol.last_linearization_number()); + for (const auto& memory_writer_lin_num : it->second) + memory_writers_check_set.insert(memory_writer_lin_num); + } break; default: break; } } -bool BackRefRegex::check_refs() const { +bool BackRefRegex::check_refs_and_memory_writers_usefulness() const { BackRefRegex temp(*this); - int lin_counter = 0; - temp.linearize_refs(lin_counter); + int refs_lin_counter = 0; + temp.linearize_refs(refs_lin_counter); - int n = 0; - temp.unfold_iterations(n); + int memory_writers_lin_counter = 0; + temp.unfold_iterations(memory_writers_lin_counter); - unordered_set found; - unordered_set found_for_lin; - temp._check_refs(found, found_for_lin); + unordered_map> found_memory_writers; + unordered_set refs_check_set; + unordered_set memory_writers_check_set; + temp._check_memory_writers(found_memory_writers, refs_check_set, memory_writers_check_set); - return found_for_lin.size() == lin_counter; + return refs_check_set.size() == refs_lin_counter && + memory_writers_check_set.size() == memory_writers_lin_counter; } void BackRefRegex::_reverse(unordered_map& memory_writers) { diff --git a/libs/Objects/src/MemoryFiniteAutomaton.cpp b/libs/Objects/src/MemoryFiniteAutomaton.cpp index 2c789c99..1a7877e3 100644 --- a/libs/Objects/src/MemoryFiniteAutomaton.cpp +++ b/libs/Objects/src/MemoryFiniteAutomaton.cpp @@ -1073,11 +1073,10 @@ pair, unordered_set> MemoryFiniteAutomaton::genera unordered_set visited_states; while (!current_states.empty()) { unordered_set following_states; - for (const auto& state_to_process : current_states) { - if (visited_states.count(state_to_process)) + for (auto cur_state : current_states) { + if (visited_states.count(cur_state)) continue; - auto cur_state = state_to_process; cur_state.process_mutations(); const MFAState* state = cur_state.state; if (state->is_terminal) { diff --git a/libs/Objects/src/Regex.cpp b/libs/Objects/src/Regex.cpp index 762f5bb3..6d4cf255 100644 --- a/libs/Objects/src/Regex.cpp +++ b/libs/Objects/src/Regex.cpp @@ -145,9 +145,6 @@ vector Regex::_to_thompson(const Alphabet& root_alphabet) const { vector fa_left; // список состояний и макс индекс состояния для правого автомата относительно операции vector fa_right; - // автомат для отрицания, строится обычный томпсон и берется дополнение - FiniteAutomaton fa_negative; - vector fa_negative_states; switch (type) { case Type::eps: @@ -262,11 +259,11 @@ vector Regex::_to_thompson(const Alphabet& root_alphabet) const { fa_states.emplace_back(int(fa_left.size()) + 1, true); return fa_states; - case Type::negative: + case Type::negative: { // строим автомат для отрицания - fa_negative_states = Regex::cast(term_l)->_to_thompson(root_alphabet); - - fa_negative = FiniteAutomaton(0, fa_negative_states, root_alphabet); + vector fa_negative_states = Regex::cast(term_l)->_to_thompson(root_alphabet); + // автомат для отрицания, строится обычный томпсон и берется дополнение + FiniteAutomaton fa_negative = FiniteAutomaton(0, fa_negative_states, root_alphabet); fa_negative = fa_negative.minimize(); // берем дополнение автомата fa_negative = fa_negative.complement(); @@ -287,6 +284,7 @@ vector Regex::_to_thompson(const Alphabet& root_alphabet) const { // возвращаем состояния и макс индекс return fa_negative.states; + } default: break; } @@ -750,7 +748,7 @@ bool Regex::derivative_with_respect_to_sym(Regex* respected_sym, const Regex* re result.type = Type::conc; if (result.term_l == nullptr) result.term_l = new Regex(); - bool answer = derivative_with_respect_to_sym( + answer = derivative_with_respect_to_sym( respected_sym, Regex::cast(reg_e->term_l), *Regex::cast(result.term_l)); result.term_r = reg_e->make_copy(); return answer;