From b664de480493604bb32f5a058b5ff0a27a855e98 Mon Sep 17 00:00:00 2001
From: James Yang
Date: Fri, 17 Jan 2020 19:31:51 -0500
Subject: [PATCH 01/23] Add trie implementation and lexer with trie

---
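Design note: the new Lexer is a maximal-munch (longest-match) scanner over a
keyword trie. Each character either advances the trie (buffered as a keyword
candidate) or, on a mismatch, the lexer backtracks to the longest accepting
prefix seen, emits that symbol (preceded by any pending TEXT token), and
reprocesses the leftover characters. A minimal driving loop, for illustration
only -- the file-reading wiring around it is an assumption, not part of this
patch:

    docgen::core::Lexer lexer;
    for (char c : std::string("/// foo\n")) {
        lexer.process(c);      // feed one character at a time
    }
    while (auto token = lexer.next_token()) {
        handle(*token);        // hypothetical consumer of name/content
    }

Known gap: input still buffered when the stream ends (e.g. a trailing keyword)
is not emitted yet; end-of-input handling is addressed later in this series
(see PATCH 05).
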
 src/core/lexer_trie.hpp | 163 ++++++++++++++++++++++++++++++++
 src/core/status.hpp     |   5 +-
 src/core/symbol.hpp     |  15 ++-
 src/core/token.hpp      |   4 +-
 src/core/trie.hpp       | 201 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 378 insertions(+), 10 deletions(-)
 create mode 100644 src/core/lexer_trie.hpp
 create mode 100644 src/core/trie.hpp

diff --git a/src/core/lexer_trie.hpp b/src/core/lexer_trie.hpp
new file mode 100644
index 0000000..4bd9eb5
--- /dev/null
+++ b/src/core/lexer_trie.hpp
@@ -0,0 +1,163 @@
+#pragma once
+#include <core/symbol.hpp>
+#include <core/token.hpp>
+#include <core/status.hpp>
+#include <core/trie.hpp>
+
+namespace docgen {
+namespace core {
+
+struct Lexer
+{
+    using symbol_t = Symbol;
+    using token_t = Token<symbol_t>;
+    using status_t = Status<token_t>;
+
+    Lexer();
+
+    void process(char c);
+    std::optional<token_t> next_token();
+
+private:
+
+    bool is_backtracking() const;
+    void set_backtracking();
+    void reset_backtracking();
+    void backtrack(char c);
+
+    enum class State : bool {
+        backtrack,
+        non_backtrack
+    };
+
+    Trie<symbol_t> trie_;
+    std::string text_;
+    std::string buf_;
+    State state_ = State::non_backtrack;
+    status_t status_;
+};
+
+///////////////////////////////////
+// Lexer Implementation
+///////////////////////////////////
+
+Lexer::Lexer()
+    : trie_({
+            {"\n", Symbol::NEWLINE},
+            {";", Symbol::SEMICOLON},
+            {" ", Symbol::WHITESPACE},
+            {"\t", Symbol::WHITESPACE},
+            {"\v", Symbol::WHITESPACE},
+            {"\r", Symbol::WHITESPACE},
+            {"\f", Symbol::WHITESPACE},
+            {"*", Symbol::STAR},
+            {"{", Symbol::OPEN_BRACE},
+            {"}", Symbol::CLOSE_BRACE},
+            {"///", Symbol::BEGIN_SLINE_COMMENT},
+            {"/*!", Symbol::BEGIN_SBLOCK_COMMENT},
+            {"//", Symbol::BEGIN_NLINE_COMMENT},
+            {"/*", Symbol::BEGIN_NBLOCK_COMMENT},
+            {"*/", Symbol::END_BLOCK_COMMENT},
+            {"@sdesc", Symbol::SDESC}
+        })
+{
+    // TODO: reserve space for status_.tokens?
+}
+
+inline void Lexer::process(char c)
+{
+    // if current state is accepting
+    if (trie_.is_accept()) {
+        if (!this->is_backtracking()) {
+            this->set_backtracking();
+        }
+        // ignore contents in buffer up until now
+        // this optimization can be done because we look for longest match
+        buf_.clear();
+    }
+
+    auto it = trie_.get_children().find(c);
+
+    // if transition exists
+    if (it != trie_.get_children().end()) {
+        buf_.push_back(c);
+        trie_.transition(c);
+        return;
+    }
+
+    // otherwise, no transition exists
+
+    // if not backtracking
+    if (!this->is_backtracking()) {
+        text_.append(buf_);
+        text_.push_back(c);
+        buf_.clear();
+        trie_.reset();
+        return;
+    }
+
+    // otherwise, currently backtracking
+    this->backtrack(c);
+}
+
+inline bool Lexer::is_backtracking() const
+{
+    return state_ == State::backtrack;
+}
+
+inline void Lexer::set_backtracking()
+{
+    state_ = State::backtrack;
+}
+
+inline void Lexer::reset_backtracking()
+{
+    state_ = State::non_backtrack;
+}
+
+inline void Lexer::backtrack(char c)
+{
+    // reset to non-backtracking
+    this->reset_backtracking();
+
+    // tokenize and clear text
+    if (!text_.empty()) {
+        status_.tokens.emplace(symbol_t::TEXT, std::move(text_));
+        text_.clear();
+    }
+
+    // tokenize symbol
+    for (uint32_t i = 0; i < buf_.size(); ++i) {
+        trie_.back_transition();
+    }
+    assert(trie_.is_accept());
+    auto opt_symbol = trie_.get_symbol();
+    assert(static_cast<bool>(opt_symbol));
+    status_.tokens.emplace(*opt_symbol);
+
+    // move and clear buf_ to temp
+    std::string reprocess_str(std::move(buf_));
+    buf_.clear();
+    reprocess_str.push_back(c);
+
+    // reset trie
+    trie_.reset();
+
+    // reprocess the rest
+    for (char c : reprocess_str) {
+        this->process(c);
+    }
+}
+
+inline std::optional<Lexer::token_t> Lexer::next_token()
+{
+    if (!status_.tokens.empty()) {
+        token_t token = std::move(status_.tokens.front());
+        status_.tokens.pop();
+        return token;
+    }
+    return {};
+}
+
+} // namespace core
+} // namespace docgen
diff --git a/src/core/status.hpp b/src/core/status.hpp
index d29dda4..9294ea7 100644
--- a/src/core/status.hpp
+++ b/src/core/status.hpp
@@ -1,5 +1,6 @@
 #pragma once
-#include <vector>
+#include <queue>
+#include <core/token.hpp>
 
 namespace docgen {
 namespace core {
@@ -8,7 +9,7 @@ template <class TokenType>
 struct Status
 {
     using token_t = TokenType;
-    using token_arr_t = std::vector<token_t>;
+    using token_arr_t = std::queue<token_t>;
 
     token_arr_t tokens;
 };
diff --git a/src/core/symbol.hpp b/src/core/symbol.hpp
index 6dc5762..b19e1f7 100644
--- a/src/core/symbol.hpp
+++ b/src/core/symbol.hpp
@@ -10,16 +10,19 @@ enum class Symbol {
     // single-char tokens
     END_OF_FILE,
     NEWLINE,
+    WHITESPACE,
     SEMICOLON,
     STAR,
     OPEN_BRACE,
     CLOSE_BRACE,
     // string tokens
-    BEGIN_LINE_COMMENT,
-    BEGIN_BLOCK_COMMENT,
+    BEGIN_SLINE_COMMENT,
+    BEGIN_SBLOCK_COMMENT,
+    BEGIN_NLINE_COMMENT,
+    BEGIN_NBLOCK_COMMENT,
     END_BLOCK_COMMENT,
     // special tags
-    TAGNAME,
+    SDESC,
     // default
     TEXT
 };
@@ -31,8 +34,10 @@ static MAPBOX_ETERNAL_CONSTEXPR const auto symbol_map =
         {Symbol::STAR, "*"},
         {Symbol::OPEN_BRACE, "{"},
         {Symbol::CLOSE_BRACE, "}"},
-        {Symbol::BEGIN_LINE_COMMENT, "///"},
-        {Symbol::BEGIN_BLOCK_COMMENT, "/*!"},
+        {Symbol::BEGIN_SLINE_COMMENT, "///"},
+        {Symbol::BEGIN_SBLOCK_COMMENT, "/*!"},
+        {Symbol::BEGIN_NLINE_COMMENT, "//"},
+        {Symbol::BEGIN_NBLOCK_COMMENT, "/*"},
         {Symbol::END_BLOCK_COMMENT, "*/"},
     });
diff --git a/src/core/token.hpp b/src/core/token.hpp
index ac83ec0..cc02f37 100644
--- a/src/core/token.hpp
+++ b/src/core/token.hpp
@@ -10,10 +10,9 @@ struct Token
 {
     using symbol_t = SymbolType;
 
-    Token(symbol_t name, std::string&& content, uint32_t leading_ws_count=0)
+    Token(symbol_t name, std::string&& content)
         : name(name)
         , content(std::move(content))
-        , leading_ws_count(leading_ws_count)
     {}
 
     Token(symbol_t name)
@@ -25,7 +24,6 @@ struct Token
 
     symbol_t name;
     std::string content;
-    uint32_t leading_ws_count;
 };
 
 template <>
diff --git a/src/core/trie.hpp b/src/core/trie.hpp
new file mode 100644
index 0000000..71dbb5c
--- /dev/null
+++ b/src/core/trie.hpp
@@ -0,0 +1,201 @@
+#pragma once
+#include <string_view>
+#include <unordered_map>
+#include <optional>
+#include <functional>
+#include <utility>
+#include <exceptions/exceptions.hpp>
+
+namespace docgen {
+namespace core {
+
+template <class SymbolType>
+struct Trie
+{
+private:
+    using pair_t = std::pair<std::string_view, SymbolType>;
+
+public:
+
+    // Constructs trie node from a list of pairs of string and symbol.
+    // The string must be of type std::string_view and it must not be empty.
+    // The symbol must be of type SymbolType.
+    Trie(const std::initializer_list<pair_t>&);
+
+    // Delete compiler-generated copy/move ctor/assignment.
+    // This ensures that Trie objects are only constructible from an
+    // initializer list of (string, symbol) pairs.
+    Trie(const Trie&) =delete;
+    Trie(Trie&&) =delete;
+    Trie& operator=(const Trie&) =delete;
+    Trie& operator=(Trie&&) =delete;
+
+    void transition(char c);
+    void back_transition();
+    bool is_accept() const;
+    auto get_children();
+    bool is_reset() const;
+    void reset();
+    const std::optional<SymbolType>& get_symbol() const;
+
+private:
+
+    struct TrieNode
+    {
+        // Insert str from current node to update the trie structure.
+        // The string str is read starting from idx.
+        void insert(const std::pair<std::string_view, SymbolType>&, size_t = 0);
+
+        // Returns if current node is an accepting state.
+        bool is_accept() const;
+
+        // Returns the optional symbol associated with current node.
+        // Symbol will be active if is_accept is true.
+        const std::optional<SymbolType>& get_symbol() const;
+
+        std::unordered_map<char, TrieNode>& get_children();
+
+        std::optional<std::reference_wrapper<TrieNode>> get_parent();
+
+    private:
+
+        enum class State : bool {
+            accept,
+            non_accept
+        };
+
+        State state_ = State::non_accept;     // indicates accepting node or not
+        std::optional<SymbolType> symbol_;    // symbol for accepting node
+        std::unordered_map<char, TrieNode> children_;  // current node's children
+        TrieNode* parent_ptr_ = nullptr;      // current node's parent (nullptr for root)
+    };
+
+    TrieNode root_;                                       // root of Trie
+    std::reference_wrapper<TrieNode> curr_node_ = root_;  // current node
+};
+
+////////////////////////////////////////////////////////////////
+// TrieNode Implementation
+////////////////////////////////////////////////////////////////
+
+template <class SymbolType>
+inline void
+Trie<SymbolType>::TrieNode::insert(const pair_t& pair, size_t idx)
+{
+    const auto& str = std::get<0>(pair);
+
+    // if the string has been fully consumed, mark current node as accepting
+    // (note: indexing a string_view at its size would be undefined behavior)
+    if (idx == str.size()) {
+        state_ = State::accept;
+        symbol_ = std::get<1>(pair);
+    }
+
+    else {
+        auto& child = children_[str[idx]];
+        child.parent_ptr_ = this;
+        child.insert(pair, idx + 1);
+    }
+}
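+
+// Illustration (sketch, not code): inserting {"//", A}, {"///", B}, {"/*", C}
+// produces the trie below, where (X) marks an accepting node holding symbol X:
+//
+//   root -- '/' -- '/'(A) -- '/'(B)
+//             \
+//              '*'(C)
+//
+// A lookup walks one edge per character; the lexer later backtracks to the
+// deepest accepting node seen, which is the longest match.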
+
+template <class SymbolType>
+inline bool
+Trie<SymbolType>::TrieNode::is_accept() const
+{
+    return state_ == State::accept;
+}
+
+template <class SymbolType>
+inline const std::optional<SymbolType>&
+Trie<SymbolType>::TrieNode::get_symbol() const
+{
+    return symbol_;
+}
+
+template <class SymbolType>
+inline std::unordered_map<char, typename Trie<SymbolType>::TrieNode>&
+Trie<SymbolType>::TrieNode::get_children()
+{
+    return children_;
+}
+
+template <class SymbolType>
+inline std::optional<std::reference_wrapper<typename Trie<SymbolType>::TrieNode>>
+Trie<SymbolType>::TrieNode::get_parent()
+{
+    if (parent_ptr_) {
+        return *parent_ptr_;
+    }
+    return {};
+}
+
+////////////////////////////////////////////////////////////////
+// Trie Implementation
+////////////////////////////////////////////////////////////////
+
+template <class SymbolType>
+inline
+Trie<SymbolType>::Trie(const std::initializer_list<pair_t>& pairs)
+    : root_()
+{
+    for (auto it = pairs.begin(); it != pairs.end(); ++it) {
+        if (it->first.empty()) {
+            throw exceptions::control_flow_error("strings must be non-empty");
+        }
+        root_.insert(*it);
+    }
+}
+
+template <class SymbolType>
+inline void
+Trie<SymbolType>::transition(char c)
+{
+    curr_node_ = curr_node_.get().get_children().at(c);
+}
+
+template <class SymbolType>
+inline bool
+Trie<SymbolType>::is_accept() const
+{
+    return curr_node_.get().is_accept();
+}
+
+template <class SymbolType>
+inline auto
+Trie<SymbolType>::get_children()
+{
+    return curr_node_.get().get_children();
+}
+
+template <class SymbolType>
+inline bool
+Trie<SymbolType>::is_reset() const
+{
+    return &(curr_node_.get()) == &root_;
+}
+
+template <class SymbolType>
+inline void
+Trie<SymbolType>::reset()
+{
+    curr_node_ = root_;
+}
+
+template <class SymbolType>
+inline void
+Trie<SymbolType>::back_transition()
+{
+    auto&& opt_parent = curr_node_.get().get_parent();
+    if (!opt_parent) {
+        throw exceptions::control_flow_error("Attempt to back transition past the root");
+    }
+    curr_node_ = *opt_parent;
+}
+
+template <class SymbolType>
+inline const std::optional<SymbolType>&
+Trie<SymbolType>::get_symbol() const
+{
+    return curr_node_.get().get_symbol();
+}
+
+} // namespace core
+} // namespace docgen

From be0b02bcb81177cf8c4d5014f752317295ba44c8 Mon Sep 17 00:00:00 2001
From: James Yang
Date: Fri, 17 Jan 2020 21:03:48 -0500
Subject: [PATCH 02/23] Remove unnecessary lexer_routines and rename
 lexer_trie to lexer

---
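With both "//" and "///" (and "/*" / "/*!") registered, longest match selects
the more specific symbol; the names presumably read S = special (doc) comment
and N = normal comment. For example, a doc line would lex roughly as follows
(illustrative sketch; whitespace tokens elided):

    // input : "/// @sdesc brief\n"
    // tokens: BEGIN_SLINE_COMMENT, SDESC, TEXT("brief"), NEWLINE

while "// @sdesc brief\n" would start with BEGIN_NLINE_COMMENT instead.
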
 src/core/lexer.hpp          | 170 +++++++++++++++++---
 src/core/lexer_routines.hpp | 300 ------------------------------------
 src/core/lexer_trie.hpp     | 163 --------------------
 src/core/symbol.hpp         |   4 +
 4 files changed, 154 insertions(+), 483 deletions(-)
 delete mode 100644 src/core/lexer_routines.hpp
 delete mode 100644 src/core/lexer_trie.hpp

diff --git a/src/core/lexer.hpp b/src/core/lexer.hpp
index 56f647e..40706ab 100644
--- a/src/core/lexer.hpp
+++ b/src/core/lexer.hpp
@@ -1,37 +1,167 @@
-#pragma once
-#include "lexer_routines.hpp"
+#pragma once
+#include <core/symbol.hpp>
+#include <core/token.hpp>
+#include <core/status.hpp>
+#include <core/trie.hpp>
 
 namespace docgen {
 namespace core {
 
 struct Lexer
 {
-    using symbol_t = lexer_details::symbol_t;
-    using file_reader = lexer_details::file_reader;
-    using status_t = lexer_details::status_t;
+    using symbol_t = Symbol;
+    using token_t = Token<symbol_t>;
+    using status_t = Status<token_t>;
 
-    Lexer(FILE* file)
-        : reader_(file)
-    {
-        status_.tokens.reserve(DEFAULT_TOKEN_ARR_SIZE);
+    Lexer();
+
+    void process(char c);
+    std::optional<token_t> next_token();
+
+private:
+
+    bool is_backtracking() const;
+    void set_backtracking();
+    void reset_backtracking();
+    void backtrack(char c);
+
+    enum class State : bool {
+        backtrack,
+        non_backtrack
+    };
+
+    Trie<symbol_t> trie_;
+    std::string text_;
+    std::string buf_;
+    State state_ = State::non_backtrack;
+    status_t status_;
+};
+
+///////////////////////////////////
+// Lexer Implementation
+///////////////////////////////////
+
+Lexer::Lexer()
+    : trie_({
+            {"\n", Symbol::NEWLINE},
+            {" ", Symbol::WHITESPACE},
+            {"\t", Symbol::WHITESPACE},
+            {"\v", Symbol::WHITESPACE},
+            {"\r", Symbol::WHITESPACE},
+            {"\f", Symbol::WHITESPACE},
+            {";", Symbol::SEMICOLON},
+            {"#", Symbol::HASHTAG},
+            {"*", Symbol::STAR},
+            {"{", Symbol::OPEN_BRACE},
+            {"}", Symbol::CLOSE_BRACE},
+            {"///", Symbol::BEGIN_SLINE_COMMENT},
+            {"/*!", Symbol::BEGIN_SBLOCK_COMMENT},
+            {"//", Symbol::BEGIN_NLINE_COMMENT},
+            {"/*", Symbol::BEGIN_NBLOCK_COMMENT},
+            {"*/", Symbol::END_BLOCK_COMMENT},
+            {"@sdesc", Symbol::SDESC},
+            {"@tparam", Symbol::TPARAM},
+            {"@param", Symbol::PARAM},
+            {"@return", Symbol::RETURN}
+        })
+{
+    // TODO: reserve space for status_.tokens?
+}
+
+inline void Lexer::process(char c)
+{
+    // if current state is accepting
+    if (trie_.is_accept()) {
+        if (!this->is_backtracking()) {
+            this->set_backtracking();
+        }
+        // ignore contents in buffer up until now
+        // this optimization can be done because we look for longest match
+        buf_.clear();
     }
 
-    void process()
-    {
-        lexer_details::process(reader_, status_);
+    auto it = trie_.get_children().find(c);
+
+    // if transition exists
+    if (it != trie_.get_children().end()) {
+        buf_.push_back(c);
+        trie_.transition(c);
+        return;
     }
+
+    // otherwise, no transition exists
 
-    const status_t::token_arr_t& get_tokens() const
-    {
-        return status_.tokens;
+    // if not backtracking
+    if (!this->is_backtracking()) {
+        text_.append(buf_);
+        text_.push_back(c);
+        buf_.clear();
+        trie_.reset();
+        return;
     }
+
+    // otherwise, currently backtracking
+    this->backtrack(c);
+}
 
-private:
-    static constexpr size_t DEFAULT_TOKEN_ARR_SIZE = 50;
+inline bool Lexer::is_backtracking() const
+{
+    return state_ == State::backtrack;
+}
 
-    file_reader reader_;
-    status_t status_; // keeps track of last token value (enum)
-};
+inline void Lexer::set_backtracking()
+{
+    state_ = State::backtrack;
+}
+
+inline void Lexer::reset_backtracking()
+{
+    state_ = State::non_backtrack;
+}
+
+inline void Lexer::backtrack(char c)
+{
+    // reset to non-backtracking
+    this->reset_backtracking();
+
+    // tokenize and clear text
+    if (!text_.empty()) {
+        status_.tokens.emplace(symbol_t::TEXT, std::move(text_));
+        text_.clear();
+    }
+
+    // tokenize symbol
+    for (uint32_t i = 0; i < buf_.size(); ++i) {
+        trie_.back_transition();
+    }
+    assert(trie_.is_accept());
+    auto opt_symbol = trie_.get_symbol();
+    assert(static_cast<bool>(opt_symbol));
+    status_.tokens.emplace(*opt_symbol);
+
+    // move and clear buf_ to temp string for reprocessing
+    std::string reprocess_str(std::move(buf_));
+    buf_.clear();
+    reprocess_str.push_back(c);
+
+    // reset trie
+    trie_.reset();
+
+    // reprocess the rest
+    for (char c : reprocess_str) {
+        this->process(c);
+    }
+}
+
+inline std::optional<Lexer::token_t> Lexer::next_token()
+{
+    if (!status_.tokens.empty()) {
+        token_t token = std::move(status_.tokens.front());
+        status_.tokens.pop();
+        return token;
+    }
+    return {};
+}
 
 } // namespace core
 } // namespace docgen
diff --git a/src/core/lexer_routines.hpp b/src/core/lexer_routines.hpp
deleted file mode 100644
index 06e658f..0000000
--- a/src/core/lexer_routines.hpp
+++ /dev/null
@@ -1,300 +0,0 @@
-#pragma once
-#include "token.hpp"
-#include "status.hpp"
-#include "symbol.hpp"
-#include "io/file_reader.hpp"
-#include "tag_set.hpp"
-#include <cctype>
-
-namespace docgen {
-namespace core {
-namespace lexer_details {
-
-static constexpr size_t DEFAULT_STRING_RESERVE_SIZE = 50;
-
-using file_reader = io::file_reader;
-using symbol_t = Symbol;
-using token_t = Token<symbol_t>;
-using status_t = Status<token_t>;
-
-// Reads and ignores chars until func(c) evaluates to false or reading terminates,
-// where c is the current char read.
-// Returns the last char read that terminated the function.
-template <class Termination>
-inline int ignore_until(file_reader& reader, Termination func)
-{
-    int c = 0;
-    while (((c = reader.read()) != file_reader::termination) && func(c));
-    return c;
-}
-
-// Reads and stores chars until func(c) evaluates to false or reading terminates,
-// where c is the current char read.
-// Returns the last char read that terminated the function.
-template <class Termination>
-inline int read_until(file_reader& reader, Termination func, std::string& line)
-{
-    int c = 0;
-    line.reserve(DEFAULT_STRING_RESERVE_SIZE);
-    while (((c = reader.read()) != file_reader::termination) && func(c)) {
-        line.push_back(c);
-    }
-    return c;
-}
-
-// Trims all leading and trailing whitespaces (one of " \t\n\v\f\r") from line.
-// Line is directly modified.
-// Returns leading whitespace count of original line.
-inline uint32_t trim(std::string& line)
-{
-    static constexpr const char* whitespaces = " \t\n\v\f\r";
-
-    // find first non-whitespace
-    const auto begin = line.find_first_not_of(whitespaces);
-
-    // find last non-whitespace
-    const auto end = line.find_last_not_of(whitespaces);
-
-    // If substring invalid, simply clear line and return length of string.
-    // By symmetry, begin and end will be npos if and only if the string only
-    // consists of whitespaces. In this case, the leading whitespace count is
-    // simply the length of the string.
-    if (begin == std::string::npos && end == std::string::npos) {
-        uint32_t leading_ws_count = line.size();
-        line.clear();
-        return leading_ws_count;
-    }
-
-    // otherwise, replace with substring
-    line = line.substr(begin, end - begin + 1);
-
-    return begin;    // number of leading whitespaces
-}
-
-// Trims text, tokenizes it, clears it, and reserves DEFAULT_STRING_RESERVE_SIZE.
-// (Trimmed) text is only tokenized if it is non-empty.
-inline void tokenize_text(std::string& text, status_t& status)
-{
-    // trim whitespaces from text first
-    uint32_t leading_whitespace_count = trim(text);
-    // tokenize current TEXT only if it is non-empty
-    if (!text.empty()) {
-        status.tokens.emplace_back(symbol_t::TEXT, std::move(text), leading_whitespace_count);
-    }
-    // clear and reserve
-    text.clear();
-    text.reserve(DEFAULT_STRING_RESERVE_SIZE);
-}
-
-// If c is one of single-char special tokens (see symbol.hpp),
-// then text is first tokenized then the single-char special token.
-// The tokens are appended to status.tokens in this order.
-// Otherwise, no operations are performed.
-// Returns true if and only if a single-char special token was created.
-inline bool process_char(int c, std::string& text, status_t& status)
-{
-    switch (c) {
-        case '\n':
-            tokenize_text(text, status);
-            status.tokens.emplace_back(symbol_t::NEWLINE);
-            return true;
-        case ';':
-            tokenize_text(text, status);
-            status.tokens.emplace_back(symbol_t::SEMICOLON);
-            return true;
-        case '{':
-            tokenize_text(text, status);
-            status.tokens.emplace_back(symbol_t::OPEN_BRACE);
-            return true;
-        case '}':
-            tokenize_text(text, status);
-            status.tokens.emplace_back(symbol_t::CLOSE_BRACE);
-            return true;
-        default:
-            return false;
-    }
-}
-
-// If tag name is not a valid one, assume it is simply text.
-// It is expected that the caller immediately read "@" before calling.
-inline void tokenize_tag_name(std::string& text, file_reader& reader, status_t& status)
-{
-    static constexpr const auto is_alpha =
-        [](char x) {return isalpha(x);};
-
-    // parse tag
-    std::string tagname;
-    int c = read_until(reader, is_alpha, tagname);
-    reader.back(c);
-
-    // if valid tag, append text token then token with tag name
-    if (tag_set.find(tagname) != tag_set.end()) {
-        tokenize_text(text, status);
-        status.tokens.emplace_back(symbol_t::TAGNAME, std::move(tagname));
-    }
-
-    // otherwise, assume part of text: append "@" then tag name to text
-    else {
-        text.push_back('@');
-        text.append(tagname);
-    }
-}
-
-// If c is '@', try to tokenize tag name.
-// Behavior is the same as tokenize_tag_name.
-// Returns true if and only if c is '@'.
-inline bool process_tag_name(int c, std::string& text,
-                             file_reader& reader, status_t& status)
-{
-    if (c == '@') {
-        tokenize_tag_name(text, reader, status);
-        return true;
-    }
-    return false;
-}
-
-// It is expected that caller has read the string "//" immediately before calling.
-inline void process_line_comment(std::string& text, file_reader& reader, status_t& status)
-{
-    static constexpr const auto is_not_newline =
-        [](char x) {return x != '\n';};
-
-    int c = reader.read();
-
-    if (c == '/') {
-        c = reader.read();
-        // valid triple-slash comment
-        if (isspace(c)) {
-            tokenize_text(text, status);
-            status.tokens.emplace_back(symbol_t::BEGIN_LINE_COMMENT);
-            reader.back(c);    // in case it's a single-char token
-        }
-        // invalid triple-slash comment
-        else {
-            // no need to read back since c cannot be a whitespace and we ignore anyway
-            ignore_until(reader, is_not_newline);
-        }
-    }
-
-    // invalid triple-slash comment
-    else {
-        reader.back(c);    // the character just read may be '\n'
-        ignore_until(reader, is_not_newline);
-    }
-}
-
-// It is expected that caller has read the string "/*" immediately before calling.
-inline void process_block_comment(std::string& text, file_reader& reader, status_t& status)
-{
-    const auto is_not_end_block =
-        [&](char x) {return (x != '*') || (reader.peek() != '/');};
-
-    int c = reader.read();
-
-    if (c == '!') {
-        c = reader.read();
-        // valid block comment: tokenize text then begin block comment symbol
-        if (isspace(c)) {
-            tokenize_text(text, status);
-            status.tokens.emplace_back(symbol_t::BEGIN_BLOCK_COMMENT);
-            reader.back(c);    // may be special single-char token
-        }
-        // regular block comment: ignore text until end and stop tokenizing
-        else {
-            ignore_until(reader, is_not_end_block);
-            reader.read();    // read the '/'
-        }
-    }
-
-    // regular block comment
-    else {
-        ignore_until(reader, is_not_end_block);    // stops after reading '*' in "*/"
-        reader.read();    // read the '/' after
-    }
-}
-
-// If c is not '/' or '*', then no operation done and returns false.
-// If c is '/', and if it's a possible line comment ("//") then same as process_line_comment;
-// if it's a possible block comment ("/*") then same as process_block_comment;
-// otherwise, text is updated to include all characters read.
-//
-// If c is '*', and if it is the ending of a block comment ("*/"), text tokenized then END_BLOCK_COMMENT;
-// otherwise, text tokenized then STAR.
-//
-// In any case, returns true if first char has been processed.
-inline bool process_string(int c, std::string& text,
-                           file_reader& reader, status_t& status)
-{
-    // possibly beginning of line or block comment
-    if (c == '/') {
-        c = reader.read();
-        if (c == '/') {
-            process_line_comment(text, reader, status);
-        }
-        else if (c == '*') {
-            process_block_comment(text, reader, status);
-        }
-        else {
-            text.push_back('/');
-            text.push_back(c);
-        }
-        return true;
-    }
-
-    // possibly ending block comment or a star that can be ignored in the middle of a block comment
-    else if (c == '*') {
-        c = reader.read();
-        if (c == '/') {
-            tokenize_text(text, status);
-            status.tokens.emplace_back(symbol_t::END_BLOCK_COMMENT);
-        }
-        else {
-            tokenize_text(text, status);
-            status.tokens.emplace_back(symbol_t::STAR);
-            reader.back(c);
-        }
-        return true;
-    }
-
-    return false;
-}
-
-inline void process(file_reader& reader, status_t& status)
-{
-    std::string text;
-    text.reserve(DEFAULT_STRING_RESERVE_SIZE);
-    int c = 0;
-    bool processed = false;
-
-    while ((c = reader.read()) != file_reader::termination) {
-
-        // process special single-char
-        processed = process_char(c, text, status);
-        if (processed) {
-            continue;
-        }
-
-        // process tag name
-        processed = process_tag_name(c, text, reader, status);
-        if (processed) {
-            continue;
-        }
-
-        // process string tokens
-        processed = process_string(c, text, reader, status);
-        if (processed) {
-            continue;
-        }
-
-        // otherwise, no special symbol -> push to text
-        text.push_back(c);
-    }
-
-    // tokenize last text then EOF
-    tokenize_text(text, status);
-    status.tokens.emplace_back(token_t::symbol_t::END_OF_FILE);
-}
-
-} // namespace lexer_details
-} // namespace core
-} // namespace docgen
diff --git a/src/core/lexer_trie.hpp b/src/core/lexer_trie.hpp
deleted file mode 100644
index 4bd9eb5..0000000
--- a/src/core/lexer_trie.hpp
+++ /dev/null
@@ -1,163 +0,0 @@
-#pragma once
-#include <core/symbol.hpp>
-#include <core/token.hpp>
-#include <core/status.hpp>
-#include <core/trie.hpp>
-
-namespace docgen {
-namespace core {
-
-struct Lexer
-{
-    using symbol_t = Symbol;
-    using token_t = Token<symbol_t>;
-    using status_t = Status<token_t>;
-
-    Lexer();
-
-    void process(char c);
-    std::optional<token_t> next_token();
-
-private:
-
-    bool is_backtracking() const;
-    void set_backtracking();
-    void reset_backtracking();
-    void backtrack(char c);
-
-    enum class State : bool {
-        backtrack,
-        non_backtrack
-    };
-
-    Trie<symbol_t> trie_;
-    std::string text_;
-    std::string buf_;
-    State state_ = State::non_backtrack;
-    status_t status_;
-};
-
-///////////////////////////////////
-// Lexer Implementation
-///////////////////////////////////
-
-Lexer::Lexer()
-    : trie_({
-            {"\n", Symbol::NEWLINE},
-            {";", Symbol::SEMICOLON},
-            {" ", Symbol::WHITESPACE},
-            {"\t", Symbol::WHITESPACE},
-            {"\v", Symbol::WHITESPACE},
-            {"\r", Symbol::WHITESPACE},
-            {"\f", Symbol::WHITESPACE},
-            {"*", Symbol::STAR},
-            {"{", Symbol::OPEN_BRACE},
-            {"}", Symbol::CLOSE_BRACE},
-            {"///", Symbol::BEGIN_SLINE_COMMENT},
-            {"/*!", Symbol::BEGIN_SBLOCK_COMMENT},
-            {"//", Symbol::BEGIN_NLINE_COMMENT},
-            {"/*", Symbol::BEGIN_NBLOCK_COMMENT},
-            {"*/", Symbol::END_BLOCK_COMMENT},
-            {"@sdesc", Symbol::SDESC}
-        })
-{
-    // TODO: reserve space for status_.tokens?
-}
-
-inline void Lexer::process(char c)
-{
-    // if current state is accepting
-    if (trie_.is_accept()) {
-        if (!this->is_backtracking()) {
-            this->set_backtracking();
-        }
-        // ignore contents in buffer up until now
-        // this optimization can be done because we look for longest match
-        buf_.clear();
-    }
-
-    auto it = trie_.get_children().find(c);
-
-    // if transition exists
-    if (it != trie_.get_children().end()) {
-        buf_.push_back(c);
-        trie_.transition(c);
-        return;
-    }
-
-    // otherwise, no transition exists
-
-    // if not backtracking
-    if (!this->is_backtracking()) {
-        text_.append(buf_);
-        text_.push_back(c);
-        buf_.clear();
-        trie_.reset();
-        return;
-    }
-
-    // otherwise, currently backtracking
-    this->backtrack(c);
-}
-
-inline bool Lexer::is_backtracking() const
-{
-    return state_ == State::backtrack;
-}
-
-inline void Lexer::set_backtracking()
-{
-    state_ = State::backtrack;
-}
-
-inline void Lexer::reset_backtracking()
-{
-    state_ = State::non_backtrack;
-}
-
-inline void Lexer::backtrack(char c)
-{
-    // reset to non-backtracking
-    this->reset_backtracking();
-
-    // tokenize and clear text
-    if (!text_.empty()) {
-        status_.tokens.emplace(symbol_t::TEXT, std::move(text_));
-        text_.clear();
-    }
-
-    // tokenize symbol
-    for (uint32_t i = 0; i < buf_.size(); ++i) {
-        trie_.back_transition();
-    }
-    assert(trie_.is_accept());
-    auto opt_symbol = trie_.get_symbol();
-    assert(static_cast<bool>(opt_symbol));
-    status_.tokens.emplace(*opt_symbol);
-
-    // move and clear buf_ to temp
-    std::string reprocess_str(std::move(buf_));
-    buf_.clear();
-    reprocess_str.push_back(c);
-
-    // reset trie
-    trie_.reset();
-
-    // reprocess the rest
-    for (char c : reprocess_str) {
-        this->process(c);
-    }
-}
-
-inline std::optional<Lexer::token_t> Lexer::next_token()
-{
-    if (!status_.tokens.empty()) {
-        token_t token = std::move(status_.tokens.front());
-        status_.tokens.pop();
-        return token;
-    }
-    return {};
-}
-
-} // namespace core
-} // namespace docgen
diff --git a/src/core/symbol.hpp b/src/core/symbol.hpp
index b19e1f7..34c1969 100644
--- a/src/core/symbol.hpp
+++ b/src/core/symbol.hpp
@@ -12,6 +12,7 @@ enum class Symbol {
     NEWLINE,
     WHITESPACE,
     SEMICOLON,
+    HASHTAG,
     STAR,
     OPEN_BRACE,
     CLOSE_BRACE,
@@ -23,6 +24,9 @@ enum class Symbol {
     END_BLOCK_COMMENT,
     // special tags
     SDESC,
+    TPARAM,
+    PARAM,
+    RETURN,
     // default
     TEXT
 };

From 5f43218ecd2e358cec0541ed0d1a539db92a2a25 Mon Sep 17 00:00:00 2001
From: James Yang
Date: Fri, 17 Jan 2020 21:05:34 -0500
Subject: [PATCH 03/23] Update new lexer unittest

---
 test/core/lexer_base_fixture.hpp |   3 +-
 test/core/lexer_unittest.cpp     | 654 +++++++++++++++++++------------
 test/core/trie_unittest.cpp      | 102 +++++
 3 files changed, 517 insertions(+), 242 deletions(-)
 create mode 100644 test/core/trie_unittest.cpp

diff --git a/test/core/lexer_base_fixture.hpp b/test/core/lexer_base_fixture.hpp
index 79ea86a..b37b85f 100644
--- a/test/core/lexer_base_fixture.hpp
+++ b/test/core/lexer_base_fixture.hpp
@@ -1,5 +1,4 @@
 #pragma once
-#include "core/lexer_routines.hpp"
 #include <gtest/gtest.h>
 
 namespace docgen {
@@ -8,7 +7,7 @@ namespace core {
 struct lexer_base_fixture : ::testing::Test
 {
 protected:
-    using status_t = lexer_details::status_t;
+    using status_t = status_t;
     using token_t = lexer_details::token_t;
     using symbol_t = lexer_details::symbol_t;
diff --git a/test/core/lexer_unittest.cpp b/test/core/lexer_unittest.cpp
index aebfbaa..643c2f5 100644
--- a/test/core/lexer_unittest.cpp
+++ b/test/core/lexer_unittest.cpp
@@ -1,279 +1,453 @@
-#include "core/lexer.hpp"
-#include "lexer_base_fixture.hpp"
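+// Test-only hack: redefine private to public so the tests can inspect the
+// internals of the headers included below; must precede those includes.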
+#define private public
+
+#include <core/lexer.hpp>
+#include <gtest/gtest.h>
 
 namespace docgen {
 namespace core {
 
-struct lexer_fixture : lexer_base_fixture
+struct lexer_fixture : ::testing::Test
 {
 protected:
+    using status_t = Lexer::status_t;
+    using token_t = Lexer::token_t;
+    using symbol_t = Lexer::symbol_t;
+
+    Lexer lexer;
+
+    void setup_lexer(const char* content)
+    {
+        std::string str(content);
+        for (char c : str) {
+            lexer.process(c);
+        }
+        lexer.process(0);
+    }
+
+    void check_token(symbol_t actual_sym, symbol_t expected_sym,
+                     const std::string& actual_str, const std::string& expected_str)
+    {
+        EXPECT_EQ(actual_sym, expected_sym);
+        EXPECT_EQ(actual_str, expected_str);
+    }
 };
 
-TEST_F(lexer_fixture, process_no_comment)
+TEST_F(lexer_fixture, lexer)
 {
     static constexpr const char* content =
-        "#include <string>\n"
-        "\n"
-        "  // just a normal comment\n"
-        "\n"
+        "#include <string> // some comment\n"
+        "\n"
+        "void f();"
         ;
-    write_file(content);
-    Lexer lexer(file);
-    lexer.process();
-    const auto& tokens = lexer.get_tokens();
-
-    EXPECT_EQ(tokens.size(), static_cast<size_t>(5));
-
-    check_token(tokens[0].name, symbol_t::TEXT,
-                tokens[0].content, "#include <string>");
-    check_token(tokens[1].name, symbol_t::NEWLINE,
-                tokens[1].content, "");
-    check_token(tokens[2].name, symbol_t::NEWLINE,
-                tokens[2].content, "");
-    check_token(tokens[3].name, symbol_t::NEWLINE,
-                tokens[3].content, "");
-    check_token(tokens[4].name, symbol_t::END_OF_FILE,
-                tokens[4].content, "");
-}
+    setup_lexer(content);
 
+    auto token = *lexer.next_token();
+    check_token(token.name, symbol_t::HASHTAG,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "include");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "<string>");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::BEGIN_NLINE_COMMENT,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "some");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "comment");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::NEWLINE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::NEWLINE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "void");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "f()");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::SEMICOLON,
+                token.content, "");
+}
+
-TEST_F(lexer_fixture, process_one_line_comment)
-{
-    static constexpr const char* content =
-        "#include <string>\n"
-        "\n"
-        "  // just a normal comment\n"
-        "  /// a very special comment   \n"
-        "\n"
-        ;
-
-    write_file(content);
-    Lexer lexer(file);
-    lexer.process();
-    const auto& tokens = lexer.get_tokens();
-
-    EXPECT_EQ(tokens.size(), static_cast<size_t>(8));
-
-    check_token(tokens[0].name, symbol_t::TEXT,
-                tokens[0].content, "#include <string>");
-    check_token(tokens[1].name, symbol_t::NEWLINE,
-                tokens[1].content, "");
-    check_token(tokens[2].name, symbol_t::NEWLINE,
-                tokens[2].content, "");
-    check_token(tokens[3].name, symbol_t::BEGIN_LINE_COMMENT,
tokens[3].content, ""); - check_token(tokens[4].name, symbol_t::TEXT, - tokens[4].content, "a very special comment"); - EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); - check_token(tokens[5].name, symbol_t::NEWLINE, - tokens[5].content, ""); - check_token(tokens[6].name, symbol_t::NEWLINE, - tokens[6].content, ""); - check_token(tokens[7].name, symbol_t::BEGIN_LINE_COMMENT, - tokens[7].content, ""); - check_token(tokens[8].name, symbol_t::TEXT, - tokens[8].content, "another very special comment"); - EXPECT_EQ(tokens[8].leading_ws_count, static_cast(1)); - check_token(tokens[9].name, symbol_t::NEWLINE, - tokens[9].content, ""); - check_token(tokens[10].name, symbol_t::END_OF_FILE, - tokens[10].content, ""); -} + token = *lexer.next_token(); + check_token(token.name, symbol_t::TEXT, + token.content, ""); -TEST_F(lexer_fixture, process_one_block_comment) -{ - static constexpr const char* content = - "#include \n" - "\n" - " // just a normal comment\n" - " /*! a very special comment */\n" - "\n" - ; + token = *lexer.next_token(); + check_token(token.name, symbol_t::WHITESPACE, + token.content, ""); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::BEGIN_NLINE_COMMENT, + token.content, ""); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::WHITESPACE, + token.content, ""); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::TEXT, + token.content, "some"); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::WHITESPACE, + token.content, ""); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::TEXT, + token.content, "comment"); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::NEWLINE, + token.content, ""); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::NEWLINE, + token.content, ""); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::TEXT, + token.content, "void"); - write_file(content); - Lexer lexer(file); - lexer.process(); - const auto& tokens = lexer.get_tokens(); - - EXPECT_EQ(tokens.size(), static_cast(9)); - - check_token(tokens[0].name, symbol_t::TEXT, - tokens[0].content, "#include "); - check_token(tokens[1].name, symbol_t::NEWLINE, - tokens[1].content, ""); - check_token(tokens[2].name, symbol_t::NEWLINE, - tokens[2].content, ""); - check_token(tokens[3].name, symbol_t::BEGIN_BLOCK_COMMENT, - tokens[3].content, ""); - check_token(tokens[4].name, symbol_t::TEXT, - tokens[4].content, "a very special comment"); - EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); - check_token(tokens[5].name, symbol_t::END_BLOCK_COMMENT, - tokens[5].content, ""); - check_token(tokens[6].name, symbol_t::NEWLINE, - tokens[6].content, ""); - check_token(tokens[7].name, symbol_t::NEWLINE, - tokens[7].content, ""); - check_token(tokens[8].name, symbol_t::END_OF_FILE, - tokens[8].content, ""); + token = *lexer.next_token(); + check_token(token.name, symbol_t::WHITESPACE, + token.content, ""); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::TEXT, + token.content, "f()"); + + token = *lexer.next_token(); + check_token(token.name, symbol_t::SEMICOLON, + token.content, ""); } -TEST_F(lexer_fixture, process_two_block_comment) +TEST_F(lexer_fixture, process_no_comment) { static constexpr const char* content = "#include \n" "\n" " // just a normal comment\n" - " /*! a very special comment */\n" "\n" - " // just a normal comment\n" - " /*! 
-        "  /*! another very \n"
-        " * special comment   \n"
-        "*/"
-        "  /* just a normal comment\n */"
-        ;
-
-    write_file(content);
-    Lexer lexer(file);
-    lexer.process();
-    const auto& tokens = lexer.get_tokens();
-
-    EXPECT_EQ(tokens.size(), static_cast<size_t>(16));
-
-    check_token(tokens[0].name, symbol_t::TEXT,
-                tokens[0].content, "#include <string>");
-    check_token(tokens[1].name, symbol_t::NEWLINE,
-                tokens[1].content, "");
-    check_token(tokens[2].name, symbol_t::NEWLINE,
-                tokens[2].content, "");
-    check_token(tokens[3].name, symbol_t::BEGIN_BLOCK_COMMENT,
-                tokens[3].content, "");
-    check_token(tokens[4].name, symbol_t::TEXT,
-                tokens[4].content, "a very special comment");
-    EXPECT_EQ(tokens[4].leading_ws_count, static_cast<uint32_t>(1));
-    check_token(tokens[5].name, symbol_t::END_BLOCK_COMMENT,
-                tokens[5].content, "");
-    check_token(tokens[6].name, symbol_t::NEWLINE,
-                tokens[6].content, "");
-    check_token(tokens[7].name, symbol_t::NEWLINE,
-                tokens[7].content, "");
-    check_token(tokens[8].name, symbol_t::BEGIN_BLOCK_COMMENT,
-                tokens[8].content, "");
-    check_token(tokens[9].name, symbol_t::TEXT,
-                tokens[9].content, "another very");
-    EXPECT_EQ(tokens[9].leading_ws_count, static_cast<uint32_t>(1));
-    check_token(tokens[10].name, symbol_t::NEWLINE,
-                tokens[10].content, "");
-    check_token(tokens[11].name, symbol_t::STAR,
-                tokens[11].content, "");
-    check_token(tokens[12].name, symbol_t::TEXT,
-                tokens[12].content, "special comment");
-    EXPECT_EQ(tokens[12].leading_ws_count, static_cast<uint32_t>(1));
-    check_token(tokens[13].name, symbol_t::NEWLINE,
-                tokens[13].content, "");
-    check_token(tokens[14].name, symbol_t::END_BLOCK_COMMENT,
-                tokens[14].content, "");
-    check_token(tokens[15].name, symbol_t::END_OF_FILE,
-                tokens[15].content, "");
-}
-
-TEST_F(lexer_fixture, process_line_block_comment)
-{
-    static constexpr const char* content =
-        "#include <string>\n"
-        "\n"
-        "  // just a normal comment\n"
-        "  /// a very special comment */\n"
-        "\n"
-        "  // just a normal comment\n"
-        "  /*! another very \n"
-        " * special comment   \n"
-        "*/"
-        "  /* just a normal comment\n */"
-        ;
-
-    write_file(content);
-    Lexer lexer(file);
-    lexer.process();
-    const auto& tokens = lexer.get_tokens();
-
-    EXPECT_EQ(tokens.size(), static_cast<size_t>(16));
-
-    check_token(tokens[0].name, symbol_t::TEXT,
-                tokens[0].content, "#include <string>");
-    check_token(tokens[1].name, symbol_t::NEWLINE,
-                tokens[1].content, "");
-    check_token(tokens[2].name, symbol_t::NEWLINE,
-                tokens[2].content, "");
-    check_token(tokens[3].name, symbol_t::BEGIN_LINE_COMMENT,
-                tokens[3].content, "");
-    check_token(tokens[4].name, symbol_t::TEXT,
-                tokens[4].content, "a very special comment");
-    EXPECT_EQ(tokens[4].leading_ws_count, static_cast<uint32_t>(1));
-    check_token(tokens[5].name, symbol_t::END_BLOCK_COMMENT,
-                tokens[5].content, "");
-    check_token(tokens[6].name, symbol_t::NEWLINE,
-                tokens[6].content, "");
-    check_token(tokens[7].name, symbol_t::NEWLINE,
-                tokens[7].content, "");
-    check_token(tokens[8].name, symbol_t::BEGIN_BLOCK_COMMENT,
-                tokens[8].content, "");
-    check_token(tokens[9].name, symbol_t::TEXT,
-                tokens[9].content, "another very");
-    EXPECT_EQ(tokens[9].leading_ws_count, static_cast<uint32_t>(1));
-    check_token(tokens[10].name, symbol_t::NEWLINE,
-                tokens[10].content, "");
-    check_token(tokens[11].name, symbol_t::STAR,
-                tokens[11].content, "");
-    check_token(tokens[12].name, symbol_t::TEXT,
-                tokens[12].content, "special comment");
-    EXPECT_EQ(tokens[12].leading_ws_count, static_cast<uint32_t>(1));
-    check_token(tokens[13].name, symbol_t::NEWLINE,
-                tokens[13].content, "");
-    check_token(tokens[14].name, symbol_t::END_BLOCK_COMMENT,
-                tokens[14].content, "");
-    check_token(tokens[15].name, symbol_t::END_OF_FILE,
-                tokens[15].content, "");
-}
+TEST_F(lexer_fixture, process_no_comment)
+{
+    static constexpr const char* content =
+        "#include <string>\n"
+        "\n"
+        "  // just a normal comment\n"
+        "\n"
+        ;
+    setup_lexer(content);
+
+    auto token = *lexer.next_token();
+    check_token(token.name, symbol_t::HASHTAG,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "include");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "<string>");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::NEWLINE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::NEWLINE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::BEGIN_NLINE_COMMENT,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "just");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "a");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "normal");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "comment");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::NEWLINE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::NEWLINE,
+                token.content, "");
+}
+
+TEST_F(lexer_fixture, process_one_line_comment)
+{
+    static constexpr const char* content =
+        "// comment\n"
+        " /// special_comment \n"
+        ;
+    setup_lexer(content);
+
+    auto token = *lexer.next_token();
+    check_token(token.name, symbol_t::BEGIN_NLINE_COMMENT,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "comment");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::NEWLINE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::BEGIN_SLINE_COMMENT,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::TEXT,
+                token.content, "special_comment");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::WHITESPACE,
+                token.content, "");
+
+    token = *lexer.next_token();
+    check_token(token.name, symbol_t::NEWLINE,
+                token.content, "");
+
+}
+
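+// The legacy comment-processing tests below still target the old interface
+// (write_file/get_tokens/leading_ws_count); they are kept commented out
+// until they are ported to the new token-stream API.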
+// " // just a normal comment\n" +// ; +// +// write_file(content); +// Lexer lexer(file); +// lexer.process(); +// const auto& tokens = lexer.get_tokens(); +// +// EXPECT_EQ(tokens.size(), static_cast(11)); +// +// check_token(tokens[0].name, symbol_t::TEXT, +// tokens[0].content, "#include "); +// check_token(tokens[1].name, symbol_t::NEWLINE, +// tokens[1].content, ""); +// check_token(tokens[2].name, symbol_t::NEWLINE, +// tokens[2].content, ""); +// check_token(tokens[3].name, symbol_t::BEGIN_LINE_COMMENT, +// tokens[3].content, ""); +// check_token(tokens[4].name, symbol_t::TEXT, +// tokens[4].content, "a very special comment"); +// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); +// check_token(tokens[5].name, symbol_t::NEWLINE, +// tokens[5].content, ""); +// check_token(tokens[6].name, symbol_t::NEWLINE, +// tokens[6].content, ""); +// check_token(tokens[7].name, symbol_t::BEGIN_LINE_COMMENT, +// tokens[7].content, ""); +// check_token(tokens[8].name, symbol_t::TEXT, +// tokens[8].content, "another very special comment"); +// EXPECT_EQ(tokens[8].leading_ws_count, static_cast(1)); +// check_token(tokens[9].name, symbol_t::NEWLINE, +// tokens[9].content, ""); +// check_token(tokens[10].name, symbol_t::END_OF_FILE, +// tokens[10].content, ""); +//} +// +//TEST_F(lexer_fixture, process_one_block_comment) +//{ +// static constexpr const char* content = +// "#include \n" +// "\n" +// " // just a normal comment\n" +// " /*! a very special comment */\n" +// "\n" +// ; +// +// write_file(content); +// Lexer lexer(file); +// lexer.process(); +// const auto& tokens = lexer.get_tokens(); +// +// EXPECT_EQ(tokens.size(), static_cast(9)); +// +// check_token(tokens[0].name, symbol_t::TEXT, +// tokens[0].content, "#include "); +// check_token(tokens[1].name, symbol_t::NEWLINE, +// tokens[1].content, ""); +// check_token(tokens[2].name, symbol_t::NEWLINE, +// tokens[2].content, ""); +// check_token(tokens[3].name, symbol_t::BEGIN_BLOCK_COMMENT, +// tokens[3].content, ""); +// check_token(tokens[4].name, symbol_t::TEXT, +// tokens[4].content, "a very special comment"); +// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); +// check_token(tokens[5].name, symbol_t::END_BLOCK_COMMENT, +// tokens[5].content, ""); +// check_token(tokens[6].name, symbol_t::NEWLINE, +// tokens[6].content, ""); +// check_token(tokens[7].name, symbol_t::NEWLINE, +// tokens[7].content, ""); +// check_token(tokens[8].name, symbol_t::END_OF_FILE, +// tokens[8].content, ""); +//} +// +//TEST_F(lexer_fixture, process_two_block_comment) +//{ +// static constexpr const char* content = +// "#include \n" +// "\n" +// " // just a normal comment\n" +// " /*! a very special comment */\n" +// "\n" +// " // just a normal comment\n" +// " /*! 
+//        "  /*! another very \n"
+//        " * special comment   \n"
+//        "*/"
+//        "  /* just a normal comment\n */"
+//        ;
+//
+//    write_file(content);
+//    Lexer lexer(file);
+//    lexer.process();
+//    const auto& tokens = lexer.get_tokens();
+//
+//    EXPECT_EQ(tokens.size(), static_cast<size_t>(16));
+//
+//    check_token(tokens[0].name, symbol_t::TEXT,
+//                tokens[0].content, "#include <string>");
+//    check_token(tokens[1].name, symbol_t::NEWLINE,
+//                tokens[1].content, "");
+//    check_token(tokens[2].name, symbol_t::NEWLINE,
+//                tokens[2].content, "");
+//    check_token(tokens[3].name, symbol_t::BEGIN_BLOCK_COMMENT,
+//                tokens[3].content, "");
+//    check_token(tokens[4].name, symbol_t::TEXT,
+//                tokens[4].content, "a very special comment");
+//    EXPECT_EQ(tokens[4].leading_ws_count, static_cast<uint32_t>(1));
+//    check_token(tokens[5].name, symbol_t::END_BLOCK_COMMENT,
+//                tokens[5].content, "");
+//    check_token(tokens[6].name, symbol_t::NEWLINE,
+//                tokens[6].content, "");
+//    check_token(tokens[7].name, symbol_t::NEWLINE,
+//                tokens[7].content, "");
+//    check_token(tokens[8].name, symbol_t::BEGIN_BLOCK_COMMENT,
+//                tokens[8].content, "");
+//    check_token(tokens[9].name, symbol_t::TEXT,
+//                tokens[9].content, "another very");
+//    EXPECT_EQ(tokens[9].leading_ws_count, static_cast<uint32_t>(1));
+//    check_token(tokens[10].name, symbol_t::NEWLINE,
+//                tokens[10].content, "");
+//    check_token(tokens[11].name, symbol_t::STAR,
+//                tokens[11].content, "");
+//    check_token(tokens[12].name, symbol_t::TEXT,
+//                tokens[12].content, "special comment");
+//    EXPECT_EQ(tokens[12].leading_ws_count, static_cast<uint32_t>(1));
+//    check_token(tokens[13].name, symbol_t::NEWLINE,
+//                tokens[13].content, "");
+//    check_token(tokens[14].name, symbol_t::END_BLOCK_COMMENT,
+//                tokens[14].content, "");
+//    check_token(tokens[15].name, symbol_t::END_OF_FILE,
+//                tokens[15].content, "");
+//}
+//
+//TEST_F(lexer_fixture, process_line_block_comment)
+//{
+//    static constexpr const char* content =
+//        "#include <string>\n"
+//        "\n"
+//        "  // just a normal comment\n"
+//        "  /// a very special comment */\n"
+//        "\n"
+//        "  // just a normal comment\n"
+//        "  /*! another very \n"
+//        " * special comment   \n"
+//        "*/"
+//        "  /* just a normal comment\n */"
+//        ;
+//
+//    write_file(content);
+//    Lexer lexer(file);
+//    lexer.process();
+//    const auto& tokens = lexer.get_tokens();
+//
+//    EXPECT_EQ(tokens.size(), static_cast<size_t>(16));
+//
+//    check_token(tokens[0].name, symbol_t::TEXT,
+//                tokens[0].content, "#include <string>");
+//    check_token(tokens[1].name, symbol_t::NEWLINE,
+//                tokens[1].content, "");
+//    check_token(tokens[2].name, symbol_t::NEWLINE,
+//                tokens[2].content, "");
+//    check_token(tokens[3].name, symbol_t::BEGIN_LINE_COMMENT,
+//                tokens[3].content, "");
+//    check_token(tokens[4].name, symbol_t::TEXT,
+//                tokens[4].content, "a very special comment");
+//    EXPECT_EQ(tokens[4].leading_ws_count, static_cast<uint32_t>(1));
+//    check_token(tokens[5].name, symbol_t::END_BLOCK_COMMENT,
+//                tokens[5].content, "");
+//    check_token(tokens[6].name, symbol_t::NEWLINE,
+//                tokens[6].content, "");
+//    check_token(tokens[7].name, symbol_t::NEWLINE,
+//                tokens[7].content, "");
+//    check_token(tokens[8].name, symbol_t::BEGIN_BLOCK_COMMENT,
+//                tokens[8].content, "");
+//    check_token(tokens[9].name, symbol_t::TEXT,
+//                tokens[9].content, "another very");
+//    EXPECT_EQ(tokens[9].leading_ws_count, static_cast<uint32_t>(1));
+//    check_token(tokens[10].name, symbol_t::NEWLINE,
+//                tokens[10].content, "");
+//    check_token(tokens[11].name, symbol_t::STAR,
+//                tokens[11].content, "");
+//    check_token(tokens[12].name, symbol_t::TEXT,
+//                tokens[12].content, "special comment");
+//    EXPECT_EQ(tokens[12].leading_ws_count, static_cast<uint32_t>(1));
+//    check_token(tokens[13].name, symbol_t::NEWLINE,
+//                tokens[13].content, "");
+//    check_token(tokens[14].name, symbol_t::END_BLOCK_COMMENT,
+//                tokens[14].content, "");
+//    check_token(tokens[15].name, symbol_t::END_OF_FILE,
+//                tokens[15].content, "");
+//}
+
 } // namespace core
 } // namespace docgen
diff --git a/test/core/trie_unittest.cpp b/test/core/trie_unittest.cpp
new file mode 100644
index 0000000..43bf550
--- /dev/null
+++ b/test/core/trie_unittest.cpp
@@ -0,0 +1,102 @@
+#define private public
+
+#include <core/trie.hpp>
+#include <gtest/gtest.h>
+#include <iostream>
+
+namespace docgen {
+namespace core {
+
+enum class MockSymbol {
+    symbol1,
+    symbol2
+};
+
+struct trie_fixture : ::testing::Test
+{
+protected:
+    using symbol_t = MockSymbol;
+    using trie_t = Trie<symbol_t>;
+
+    void print_trie(const trie_t& trie)
+    {
+        print_trie(trie.root_);
+    }
+
+    void print_trie(const trie_t::TrieNode& node)
+    {
+        if (node.is_accept()) {
+            std::cout << "symbol: " << (int) *node.get_symbol() << std::endl;
+        }
+        std::cout << "\nsize: " << node.children_.size() << std::endl;
+        for (auto it = node.children_.begin(); it != node.children_.end(); ++it) {
+            std::cout << it->first << "--" << std::endl;
+            print_trie(it->second);
+            std::cout << "--" << std::endl;
+        }
+    }
+};
+
+TEST_F(trie_fixture, trie_ctor)
+{
+    trie_t trie({
+            {"adf", symbol_t::symbol1},
+            {"asdf", symbol_t::symbol2},
+            {"bscdf", symbol_t::symbol1}
+        });
+
+    auto symbol = trie.get_symbol();
+    EXPECT_FALSE((bool) symbol);
+
+    //trie.transition('a');
+    //EXPECT_FALSE((bool) trie.get_symbol());
+    //trie.transition('d');
+    //EXPECT_FALSE((bool) trie.get_symbol());
+    //trie.transition('b');
+    //EXPECT_FALSE((bool) trie.get_symbol());
+
+    trie.transition('a');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('d');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('f');
+    EXPECT_TRUE((bool) trie.get_symbol());
+
+    trie.back_transition();
+    trie.back_transition();
+    trie.back_transition();
+
+    trie.transition('a');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('d');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('f');
+    EXPECT_TRUE((bool) trie.get_symbol());
+
+    trie.reset();
+
+    trie.transition('a');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('s');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('d');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('f');
+    EXPECT_TRUE((bool) trie.get_symbol());
+
+    trie.reset();
+
+    trie.transition('b');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('s');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('c');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('d');
+    EXPECT_FALSE((bool) trie.get_symbol());
+    trie.transition('f');
+    EXPECT_TRUE((bool) trie.get_symbol());
+}
+
+} // namespace core
+} // namespace docgen

From 10269ca3b6dbfa3aa4b0a67b4d7966a185a24a59 Mon Sep 17 00:00:00 2001
From: James Yang
Date: Fri, 17 Jan 2020 21:06:03 -0500
Subject: [PATCH 04/23] Add cmake changes to build new unittests (no more
 lexer_routines_unittest also)

---
 test/CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c8abf0f..20bf694 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -41,12 +41,14 @@ add_custom_command(
                    $<TARGET_FILE_DIR:io_unittests>/io_data)
 
 ######################################################
-# File IO Unit Tests
+# Core Unit Tests
 ######################################################
 
 add_executable(core_unittests
-    ${CMAKE_CURRENT_SOURCE_DIR}/core/lexer_routines_unittest.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/core/trie_unittest.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/core/lexer_unittest.cpp
 )
 
 create_test("core_unittests" core_unittests)
+
+

From 99431276efa47bdc3e3a2ac519a886919a5c494b Mon Sep 17 00:00:00 2001
From: James Yang
Date: Fri, 17 Jan 2020 23:31:37 -0500
Subject: [PATCH 05/23] Fix process when string termination given (changed to
 flush)

---
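The test fixture previously pushed a literal 0 through process() to force the
last token out; flush() now makes end-of-input explicit. Sketch of the
intended call pattern (illustrative only):

    Lexer lexer;
    for (char c : input) lexer.process(c);
    lexer.flush();                         // emit whatever is still buffered
    while (auto t = lexer.next_token()) {
        // consume *t
    }
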
 CMakeLists.txt                                    |   3 +
 src/CMakeLists.txt                                |   8 +
 src/core/{lexer.hpp => lexer/lexer.cpp}           | 104 +--
 src/core/lexer/lexer.hpp                          | 100 +++
 src/core/{ => lexer}/status.hpp                   |   3 +-
 src/core/{ => lexer}/trie.hpp                     |   9 +-
 src/core/symbol.hpp                               |   2 -
 src/core/token.hpp                                |   2 +-
 test/CMakeLists.txt                               |   6 +-
 test/core/{ => lexer}/lexer_base_fixture.hpp      |   7 -
 test/core/{ => lexer}/lexer_routines_unittest.cpp |   0
 test/core/lexer/lexer_unittest.cpp                | 620 ++++++++++++++++++
 test/core/lexer/trie_unittest.cpp                 | 336 ++++++++++
 test/core/lexer_unittest.cpp                      | 453 -------------
 test/core/trie_unittest.cpp                       | 102 ---
 15 files changed, 1103 insertions(+), 652 deletions(-)
 create mode 100644 src/CMakeLists.txt
 create mode 100644 src/core/lexer/lexer.hpp
 rename src/core/{lexer.hpp => lexer/lexer.cpp} (52%)
 rename src/core/{ => lexer}/status.hpp (86%)
 rename src/core/{ => lexer}/trie.hpp (96%)
 rename test/core/{ => lexer}/lexer_base_fixture.hpp (74%)
 rename test/core/{ => lexer}/lexer_routines_unittest.cpp (100%)
 create mode 100644 test/core/lexer/lexer_unittest.cpp
 create mode 100644 test/core/lexer/trie_unittest.cpp
 delete mode 100644 test/core/lexer_unittest.cpp
 delete mode 100644 test/core/trie_unittest.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ae5c08..c1208dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -57,5 +57,8 @@ find_package(nlohmann_json 3.2.0 REQUIRED)
 # add libs subdirectory
 add_subdirectory(${PROJECT_SOURCE_DIR}/libs ${PROJECT_BINARY_DIR}/libs)
 
+# add src subdirectory
+add_subdirectory(${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src)
+
 # add test subdirectory
 add_subdirectory(${PROJECT_SOURCE_DIR}/test ${PROJECT_BINARY_DIR}/test)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..96870d9
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Create object files for lexer
+add_library(LEXER_LIB_OBJECTS OBJECT
+    ${CMAKE_CURRENT_SOURCE_DIR}/core/lexer/lexer.cpp
+    )
+target_include_directories(LEXER_LIB_OBJECTS PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${ETERNAL_DIR}/include
+    )
diff --git a/src/core/lexer.hpp b/src/core/lexer/lexer.cpp
similarity index 52%
rename from src/core/lexer.hpp
rename to src/core/lexer/lexer.cpp
index 40706ab..f727e49 100644
--- a/src/core/lexer.hpp
+++ b/src/core/lexer/lexer.cpp
@@ -1,41 +1,8 @@
-#pragma once
-#include <core/symbol.hpp>
-#include <core/token.hpp>
-#include <core/status.hpp>
-#include <core/trie.hpp>
+#include <core/lexer/lexer.hpp>
 
 namespace docgen {
 namespace core {
-
-struct Lexer
-{
-    using symbol_t = Symbol;
-    using token_t = Token<symbol_t>;
-    using status_t = Status<token_t>;
-
-    Lexer();
-
-    void process(char c);
-    std::optional<token_t> next_token();
-
-private:
-
-    bool is_backtracking() const;
-    void set_backtracking();
-    void reset_backtracking();
-    void backtrack(char c);
-
-    enum class State : bool {
-        backtrack,
-        non_backtrack
-    };
-
-    Trie<symbol_t> trie_;
-    std::string text_;
-    std::string buf_;
-    State state_ = State::non_backtrack;
-    status_t status_;
-};
+namespace lexer {
 
 ///////////////////////////////////
 // Lexer Implementation
@@ -64,32 +31,11 @@ namespace core {
         {"@param", Symbol::PARAM},
         {"@return", Symbol::RETURN}
     })
-{
-    // TODO: reserve space for status_.tokens?
-}
+{}
 
-inline void Lexer::process(char c)
+void Lexer::process(char c)
 {
-    // if current state is accepting
-    if (trie_.is_accept()) {
-        if (!this->is_backtracking()) {
-            this->set_backtracking();
-        }
-        // ignore contents in buffer up until now
-        // this optimization can be done because we look for longest match
-        buf_.clear();
-    }
+    this->update_state();
 
     auto it = trie_.get_children().find(c);
 
@@ -104,61 +50,44 @@ void Lexer::process(char c)
     // otherwise, currently backtracking
     this->backtrack(c);
 }
 
-inline bool Lexer::is_backtracking() const
-{
-    return state_ == State::backtrack;
-}
-
-inline void Lexer::set_backtracking()
-{
-    state_ = State::backtrack;
-}
-
-inline void Lexer::reset_backtracking()
-{
-    state_ = State::non_backtrack;
-}
-
-inline void Lexer::backtrack(char c)
+void Lexer::backtrack(char c)
 {
-    // reset to non-backtracking
-    this->reset_backtracking();
-
-    // tokenize and clear text
-    if (!text_.empty()) {
-        status_.tokens.emplace(symbol_t::TEXT, std::move(text_));
-        text_.clear();
-    }
+    // tokenize text
+    this->tokenize_text();
 
     // tokenize symbol
     for (uint32_t i = 0; i < buf_.size(); ++i) {
         trie_.back_transition();
     }
     assert(trie_.is_accept());
     auto opt_symbol = trie_.get_symbol();
     assert(static_cast<bool>(opt_symbol));
     status_.tokens.emplace(*opt_symbol);
 
     // move and clear buf_ to temp string for reprocessing
     std::string reprocess_str(std::move(buf_));
-    buf_.clear();
     reprocess_str.push_back(c);
 
-    // reset trie
-    trie_.reset();
+    // reset
+    this->reset();
 
     // reprocess the rest
     for (char c : reprocess_str) {
         this->process(c);
     }
 }
 
-inline std::optional<Lexer::token_t> Lexer::next_token()
+void Lexer::flush()
 {
-    if (!status_.tokens.empty()) {
-        token_t token = std::move(status_.tokens.front());
-        status_.tokens.pop();
-        return token;
+    this->update_state();
+
+    if (this->is_backtracking()) {
+        return this->backtrack(0);
     }
-    return {};
+
+    // non-backtracking: no parent is an accepting node
+    // append buf_ to text_ and tokenize text_
+    // reset all other fields
+    text_.append(buf_);
+    this->tokenize_text();
+    this->reset();
 }
 
+} // namespace lexer
 } // namespace core
 } // namespace docgen
namespace docgen diff --git a/src/core/lexer/lexer.hpp b/src/core/lexer/lexer.hpp new file mode 100644 index 0000000..03b65fa --- /dev/null +++ b/src/core/lexer/lexer.hpp @@ -0,0 +1,100 @@ +#pragma once +#include +#include +#include +#include + +namespace docgen { +namespace core { +namespace lexer { + +struct Lexer +{ + using symbol_t = Symbol; + using token_t = Token; + using status_t = Status; + + Lexer(); + + void process(char c); + void flush(); + std::optional next_token(); + +private: + + void tokenize_text(); + bool is_backtracking() const; + void set_backtracking(); + void reset_backtracking(); + void backtrack(char c); + void update_state(); + void reset(); + + enum class State : bool { + backtrack, + non_backtrack + }; + + Trie trie_; + std::string text_; + std::string buf_; + State state_ = State::non_backtrack; + status_t status_; +}; + +inline void Lexer::tokenize_text() +{ + if (!text_.empty()) { + status_.tokens.emplace(symbol_t::TEXT, std::move(text_)); + } +} + +inline bool Lexer::is_backtracking() const +{ + return state_ == State::backtrack; +} + +inline void Lexer::set_backtracking() +{ + state_ = State::backtrack; +} + +inline void Lexer::reset_backtracking() +{ + state_ = State::non_backtrack; +} + +inline void Lexer::update_state() +{ + // if current state is accepting + if (trie_.is_accept()) { + if (!this->is_backtracking()) { + this->set_backtracking(); + } + // ignore contents in buffer up until now + // this optimization can be done because we look for longest match + buf_.clear(); + } +} + +inline std::optional Lexer::next_token() +{ + if (!status_.tokens.empty()) { + token_t token = std::move(status_.tokens.front()); + status_.tokens.pop(); + return token; + } + return {}; +} + +inline void Lexer::reset() +{ + text_.clear(); + buf_.clear(); + trie_.reset(); + reset_backtracking(); +} + +} // namespace lexer +} // namespace core +} // namespace docgen diff --git a/src/core/status.hpp b/src/core/lexer/status.hpp similarity index 86% rename from src/core/status.hpp rename to src/core/lexer/status.hpp index 9294ea7..e4af461 100644 --- a/src/core/status.hpp +++ b/src/core/lexer/status.hpp @@ -1,9 +1,9 @@ #pragma once #include -#include namespace docgen { namespace core { +namespace lexer { template struct Status @@ -14,5 +14,6 @@ struct Status token_arr_t tokens; }; +} // namespace lexer } // namespace core } // namespace docgen diff --git a/src/core/trie.hpp b/src/core/lexer/trie.hpp similarity index 96% rename from src/core/trie.hpp rename to src/core/lexer/trie.hpp index 71dbb5c..1fe79f0 100644 --- a/src/core/trie.hpp +++ b/src/core/lexer/trie.hpp @@ -8,14 +8,16 @@ namespace docgen { namespace core { +namespace lexer { template struct Trie { private: - using pair_t = std::pair; + struct TrieNode; // forward declaration public: + using pair_t = std::pair; // Constructs trie node from a list of pairs of string and symbol. // The string must be of type std::string_view and it must not be empty. 
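// A minimal usage sketch (annotation, not part of this diff) of the trie
// interface documented above, assuming a small caller-defined symbol enum:
//
//   enum class Sym { A, AB };
//   Trie<Sym> trie({{"a", Sym::A}, {"ab", Sym::AB}});
//   trie.transition('a');                 // follow the edge labeled 'a'
//   assert(trie.is_accept());             // "a" is a complete pattern
//   trie.transition('b');
//   assert(*trie.get_symbol() == Sym::AB);
//   trie.back_transition();               // climb one level, back to the "a" node
//   trie.reset();                         // jump back to the root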
@@ -32,7 +34,7 @@ struct Trie void transition(char c); void back_transition(); bool is_accept() const; - auto get_children(); + std::unordered_map& get_children(); bool is_reset() const; void reset(); const std::optional& get_symbol() const; @@ -159,7 +161,7 @@ Trie::is_accept() const } template -inline auto +inline std::unordered_map::TrieNode>& Trie::get_children() { return curr_node_.get().get_children(); @@ -197,5 +199,6 @@ Trie::get_symbol() const return curr_node_.get().get_symbol(); } +} // namespace lexer } // namespace core } // namespace docgen diff --git a/src/core/symbol.hpp b/src/core/symbol.hpp index 34c1969..0bb7772 100644 --- a/src/core/symbol.hpp +++ b/src/core/symbol.hpp @@ -1,7 +1,5 @@ #pragma once #include -#include -#include namespace docgen { namespace core { diff --git a/src/core/token.hpp b/src/core/token.hpp index cc02f37..4ef3798 100644 --- a/src/core/token.hpp +++ b/src/core/token.hpp @@ -1,6 +1,6 @@ #pragma once #include -#include "symbol.hpp" +#include namespace docgen { namespace core { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 20bf694..1ddf86a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -45,8 +45,10 @@ add_custom_command( ###################################################### add_executable(core_unittests - ${CMAKE_CURRENT_SOURCE_DIR}/core/trie_unittest.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core/lexer_unittest.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core/lexer/trie_unittest.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core/lexer/lexer_unittest.cpp + # Source dependency + $ ) create_test("core_unittests" core_unittests) diff --git a/test/core/lexer_base_fixture.hpp b/test/core/lexer/lexer_base_fixture.hpp similarity index 74% rename from test/core/lexer_base_fixture.hpp rename to test/core/lexer/lexer_base_fixture.hpp index b37b85f..46ba63a 100644 --- a/test/core/lexer_base_fixture.hpp +++ b/test/core/lexer/lexer_base_fixture.hpp @@ -30,13 +30,6 @@ struct lexer_base_fixture : ::testing::Test fwrite(content, sizeof(char), strlen(content), fp); fclose(fp); } - - void check_token(symbol_t actual_sym, symbol_t expected_sym, - const std::string& actual_str, const std::string& expected_str) - { - EXPECT_EQ(actual_sym, expected_sym); - EXPECT_EQ(actual_str, expected_str); - } }; } // namespace core diff --git a/test/core/lexer_routines_unittest.cpp b/test/core/lexer/lexer_routines_unittest.cpp similarity index 100% rename from test/core/lexer_routines_unittest.cpp rename to test/core/lexer/lexer_routines_unittest.cpp diff --git a/test/core/lexer/lexer_unittest.cpp b/test/core/lexer/lexer_unittest.cpp new file mode 100644 index 0000000..9fead88 --- /dev/null +++ b/test/core/lexer/lexer_unittest.cpp @@ -0,0 +1,620 @@ +#include +#include + +namespace docgen { +namespace core { +namespace lexer { + +struct lexer_fixture : ::testing::Test +{ +protected: + using status_t = typename Lexer::status_t; + using token_t = typename Lexer::token_t; + using symbol_t = typename Lexer::symbol_t; + + Lexer lexer; + std::optional token; + + void setup_lexer(const char* content) + { + std::string str(content); + for (char c : str) { + lexer.process(c); + } + lexer.flush(); + } +}; + +//////////////////////////////////////////////////////////////////// +// Individual Symbol TESTS +//////////////////////////////////////////////////////////////////// + +// NEWLINE +TEST_F(lexer_fixture, lexer_newline) +{ + static constexpr const char* content = + "somecrazy1492text\nmvn2b" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); 
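+ // every character scanned before the first trie match is folded into one
+ // TEXT token; symbol tokens such as NEWLINE below carry empty content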
+ EXPECT_EQ(token->content, "somecrazy1492text"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "mvn2b"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE ( ) +TEST_F(lexer_fixture, lexer_whitespace_space) +{ + static constexpr const char* content = + ",m.,m. abn" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, ",m.,m."); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abn"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (\t) +TEST_F(lexer_fixture, lexer_whitespace_t) +{ + static constexpr const char* content = + "h0f2n.1\t1234|" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "h0f2n.1"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "1234|"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (\v) +TEST_F(lexer_fixture, lexer_whitespace_v) +{ + static constexpr const char* content = + "hello!\v" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "hello!"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (\r) +TEST_F(lexer_fixture, lexer_whitespace_r) +{ + static constexpr const char* content = + "hello!\rwsdescorrld!!" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "hello!"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "wsdescorrld!!"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (\f) +TEST_F(lexer_fixture, lexer_whitespace_f) +{ + static constexpr const char* content = + "hello!\fwsdescorrld!!" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "hello!"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "wsdescorrld!!"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (;) +TEST_F(lexer_fixture, lexer_semicolon) +{ + static constexpr const char* content = + ";wsdescorrld!!" 
+ ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::SEMICOLON); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "wsdescorrld!!"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +//////////////////////////////////////////////////////////////////// +// Mix TESTS +//////////////////////////////////////////////////////////////////// + +TEST_F(lexer_fixture, lexer_test_1) +{ + static constexpr const char* content = + "#include // some comment\n" + "\n" + "void f();" + ; + + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::HASHTAG); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "include"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "some"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "void"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "f()"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::SEMICOLON); + EXPECT_EQ(token->content, ""); +} + +TEST_F(lexer_fixture, process_no_comment) +{ + static constexpr const char* content = + "#include \n" + "\n" + " // just a normal comment\n" + "\n" + ; + + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::HASHTAG); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "include"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + 
EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "just"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "a"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "normal"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); +} + +TEST_F(lexer_fixture, process_one_line_comment) +{ + static constexpr const char* content = + "// comment\n" + " /// special_comment \n" + ; + + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "special_comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); +} + +//TEST_F(lexer_fixture, process_two_line_comment) +//{ +// static constexpr const char* content = +// "#include \n" +// "\n" +// " // just a normal comment\n" +// " /// a very special comment \n" +// "\n" +// " // just a normal comment\n" +// " /// another very special comment \n" +// " // just a normal comment\n" +// ; +// +// write_file(content); +// Lexer lexer(file); +// lexer.process(); +// const auto& tokens = lexer.get_tokens(); +// +// EXPECT_EQ(tokens.size(), static_cast(11)); +// +// check_token(tokens[0]->name, symbol_t::TEXT, +// tokens[0]->content, "#include "); +// check_token(tokens[1]->name, symbol_t::NEWLINE, +// tokens[1]->content, ""); +// check_token(tokens[2]->name, symbol_t::NEWLINE, +// tokens[2]->content, ""); +// check_token(tokens[3]->name, symbol_t::BEGIN_LINE_COMMENT, +// tokens[3]->content, ""); +// check_token(tokens[4]->name, symbol_t::TEXT, +// tokens[4]->content, "a very special comment"); +// 
EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); +// check_token(tokens[5]->name, symbol_t::NEWLINE, +// tokens[5]->content, ""); +// check_token(tokens[6]->name, symbol_t::NEWLINE, +// tokens[6]->content, ""); +// check_token(tokens[7]->name, symbol_t::BEGIN_LINE_COMMENT, +// tokens[7]->content, ""); +// check_token(tokens[8]->name, symbol_t::TEXT, +// tokens[8]->content, "another very special comment"); +// EXPECT_EQ(tokens[8].leading_ws_count, static_cast(1)); +// check_token(tokens[9]->name, symbol_t::NEWLINE, +// tokens[9]->content, ""); +// check_token(tokens[10]->name, symbol_t::END_OF_FILE, +// tokens[10]->content, ""); +//} +// +//TEST_F(lexer_fixture, process_one_block_comment) +//{ +// static constexpr const char* content = +// "#include \n" +// "\n" +// " // just a normal comment\n" +// " /*! a very special comment */\n" +// "\n" +// ; +// +// write_file(content); +// Lexer lexer(file); +// lexer.process(); +// const auto& tokens = lexer.get_tokens(); +// +// EXPECT_EQ(tokens.size(), static_cast(9)); +// +// check_token(tokens[0]->name, symbol_t::TEXT, +// tokens[0]->content, "#include "); +// check_token(tokens[1]->name, symbol_t::NEWLINE, +// tokens[1]->content, ""); +// check_token(tokens[2]->name, symbol_t::NEWLINE, +// tokens[2]->content, ""); +// check_token(tokens[3]->name, symbol_t::BEGIN_BLOCK_COMMENT, +// tokens[3]->content, ""); +// check_token(tokens[4]->name, symbol_t::TEXT, +// tokens[4]->content, "a very special comment"); +// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); +// check_token(tokens[5]->name, symbol_t::END_BLOCK_COMMENT, +// tokens[5]->content, ""); +// check_token(tokens[6]->name, symbol_t::NEWLINE, +// tokens[6]->content, ""); +// check_token(tokens[7]->name, symbol_t::NEWLINE, +// tokens[7]->content, ""); +// check_token(tokens[8]->name, symbol_t::END_OF_FILE, +// tokens[8]->content, ""); +//} +// +//TEST_F(lexer_fixture, process_two_block_comment) +//{ +// static constexpr const char* content = +// "#include \n" +// "\n" +// " // just a normal comment\n" +// " /*! a very special comment */\n" +// "\n" +// " // just a normal comment\n" +// " /*! 
another very \n" +// " * special comment \n" +// "*/" +// " /* just a normal comment\n */" +// ; +// +// write_file(content); +// Lexer lexer(file); +// lexer.process(); +// const auto& tokens = lexer.get_tokens(); +// +// EXPECT_EQ(tokens.size(), static_cast(16)); +// +// check_token(tokens[0]->name, symbol_t::TEXT, +// tokens[0]->content, "#include "); +// check_token(tokens[1]->name, symbol_t::NEWLINE, +// tokens[1]->content, ""); +// check_token(tokens[2]->name, symbol_t::NEWLINE, +// tokens[2]->content, ""); +// check_token(tokens[3]->name, symbol_t::BEGIN_BLOCK_COMMENT, +// tokens[3]->content, ""); +// check_token(tokens[4]->name, symbol_t::TEXT, +// tokens[4]->content, "a very special comment"); +// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); +// check_token(tokens[5]->name, symbol_t::END_BLOCK_COMMENT, +// tokens[5]->content, ""); +// check_token(tokens[6]->name, symbol_t::NEWLINE, +// tokens[6]->content, ""); +// check_token(tokens[7]->name, symbol_t::NEWLINE, +// tokens[7]->content, ""); +// check_token(tokens[8]->name, symbol_t::BEGIN_BLOCK_COMMENT, +// tokens[8]->content, ""); +// check_token(tokens[9]->name, symbol_t::TEXT, +// tokens[9]->content, "another very"); +// EXPECT_EQ(tokens[9].leading_ws_count, static_cast(1)); +// check_token(tokens[10]->name, symbol_t::NEWLINE, +// tokens[10]->content, ""); +// check_token(tokens[11]->name, symbol_t::STAR, +// tokens[11]->content, ""); +// check_token(tokens[12]->name, symbol_t::TEXT, +// tokens[12]->content, "special comment"); +// EXPECT_EQ(tokens[12].leading_ws_count, static_cast(1)); +// check_token(tokens[13]->name, symbol_t::NEWLINE, +// tokens[13]->content, ""); +// check_token(tokens[14]->name, symbol_t::END_BLOCK_COMMENT, +// tokens[14]->content, ""); +// check_token(tokens[15]->name, symbol_t::END_OF_FILE, +// tokens[15]->content, ""); +//} +// +//TEST_F(lexer_fixture, process_line_block_comment) +//{ +// static constexpr const char* content = +// "#include \n" +// "\n" +// " // just a normal comment\n" +// " /// a very special comment */\n" +// "\n" +// " // just a normal comment\n" +// " /*! 
another very \n" +// " * special comment \n" +// "*/" +// " /* just a normal comment\n */" +// ; +// +// write_file(content); +// Lexer lexer(file); +// lexer.process(); +// const auto& tokens = lexer.get_tokens(); +// +// EXPECT_EQ(tokens.size(), static_cast(16)); +// +// check_token(tokens[0]->name, symbol_t::TEXT, +// tokens[0]->content, "#include "); +// check_token(tokens[1]->name, symbol_t::NEWLINE, +// tokens[1]->content, ""); +// check_token(tokens[2]->name, symbol_t::NEWLINE, +// tokens[2]->content, ""); +// check_token(tokens[3]->name, symbol_t::BEGIN_LINE_COMMENT, +// tokens[3]->content, ""); +// check_token(tokens[4]->name, symbol_t::TEXT, +// tokens[4]->content, "a very special comment"); +// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); +// check_token(tokens[5]->name, symbol_t::END_BLOCK_COMMENT, +// tokens[5]->content, ""); +// check_token(tokens[6]->name, symbol_t::NEWLINE, +// tokens[6]->content, ""); +// check_token(tokens[7]->name, symbol_t::NEWLINE, +// tokens[7]->content, ""); +// check_token(tokens[8]->name, symbol_t::BEGIN_BLOCK_COMMENT, +// tokens[8]->content, ""); +// check_token(tokens[9]->name, symbol_t::TEXT, +// tokens[9]->content, "another very"); +// EXPECT_EQ(tokens[9].leading_ws_count, static_cast(1)); +// check_token(tokens[10]->name, symbol_t::NEWLINE, +// tokens[10]->content, ""); +// check_token(tokens[11]->name, symbol_t::STAR, +// tokens[11]->content, ""); +// check_token(tokens[12]->name, symbol_t::TEXT, +// tokens[12]->content, "special comment"); +// EXPECT_EQ(tokens[12].leading_ws_count, static_cast(1)); +// check_token(tokens[13]->name, symbol_t::NEWLINE, +// tokens[13]->content, ""); +// check_token(tokens[14]->name, symbol_t::END_BLOCK_COMMENT, +// tokens[14]->content, ""); +// check_token(tokens[15]->name, symbol_t::END_OF_FILE, +// tokens[15]->content, ""); +//} + +} // namespace lexer +} // namespace core +} // namespace docgen diff --git a/test/core/lexer/trie_unittest.cpp b/test/core/lexer/trie_unittest.cpp new file mode 100644 index 0000000..206a6d7 --- /dev/null +++ b/test/core/lexer/trie_unittest.cpp @@ -0,0 +1,336 @@ +#include +#include + +namespace docgen { +namespace core { +namespace lexer { + +enum class MockSymbol { + symbol_0, + symbol_1, + symbol_2, + symbol_3, +}; + +struct trie_fixture : ::testing::Test +{ +protected: + using symbol_t = MockSymbol; + using trie_t = Trie; +}; + +//////////////////////////////////////////// +// State TESTS +//////////////////////////////////////////// + +TEST_F(trie_fixture, trie_root) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(trie.is_reset()); + + auto& children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('a'), children.end()); // found +} + +TEST_F(trie_fixture, trie_transition_child_a) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + trie.transition('a'); + + EXPECT_TRUE(trie.get_symbol()); + EXPECT_EQ(*trie.get_symbol(), symbol_t::symbol_0); + EXPECT_TRUE(trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + auto& children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(2)); + EXPECT_NE(children.find('b'), children.end()); // found + EXPECT_NE(children.find('c'), children.end()); // found +} + +TEST_F(trie_fixture, trie_transition_child_b) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + 
{"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + trie.transition('a'); + trie.transition('b'); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + auto& children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('c'), children.end()); // found +} + +TEST_F(trie_fixture, trie_transition_child_bc) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + trie.transition('a'); + trie.transition('b'); + trie.transition('c'); + + EXPECT_TRUE(trie.get_symbol()); + EXPECT_EQ(*trie.get_symbol(), symbol_t::symbol_1); + EXPECT_TRUE(trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + auto& children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(0)); +} + +TEST_F(trie_fixture, trie_transition_child_c) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + trie.transition('a'); + trie.transition('c'); + + EXPECT_TRUE(trie.get_symbol()); + EXPECT_EQ(*trie.get_symbol(), symbol_t::symbol_2); + EXPECT_TRUE(trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + auto& children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(0)); +} + +TEST_F(trie_fixture, trie_reset_root) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + trie.reset(); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(trie.is_reset()); + + auto& children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('a'), children.end()); // found +} + +TEST_F(trie_fixture, trie_reset_child_a) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + trie.transition('a'); + + trie.reset(); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(trie.is_reset()); + + auto& children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('a'), children.end()); // found +} + +TEST_F(trie_fixture, trie_reset_child_a_b) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + trie.transition('a'); + trie.transition('b'); + + trie.reset(); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(trie.is_reset()); + + auto& children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('a'), children.end()); // found +} + +TEST_F(trie_fixture, trie_back_transition_root) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + EXPECT_THROW(trie.back_transition(), exceptions::control_flow_error); +} + +TEST_F(trie_fixture, trie_back_transition_child_a) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, + }); + + trie.transition('a'); + trie.back_transition(); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(trie.is_reset()); + + auto& children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('a'), children.end()); // found +} + +TEST_F(trie_fixture, trie_back_transition_child_ab) +{ + trie_t trie({ + {"a", symbol_t::symbol_0}, + {"abc", symbol_t::symbol_1}, + {"ac", symbol_t::symbol_2}, 
+ }); + + trie.transition('a'); + trie.transition('b'); + + // back to child 'a' + trie.back_transition(); + + EXPECT_TRUE(trie.get_symbol()); + EXPECT_TRUE(trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + auto children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(2)); + EXPECT_NE(children.find('b'), children.end()); // found + EXPECT_NE(children.find('c'), children.end()); // found + + // back to root + trie.back_transition(); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(trie.is_reset()); + + children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('a'), children.end()); // found +} + +//////////////////////////////////////////// +// Structural Checks +//////////////////////////////////////////// + +TEST_F(trie_fixture, trie_off_by_one_prefix) +{ + trie_t trie({ + {"ab", symbol_t::symbol_1}, + {"bab", symbol_t::symbol_1}, + }); + + // check root + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(trie.is_reset()); + + auto children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(2)); + EXPECT_NE(children.find('a'), children.end()); // found + EXPECT_NE(children.find('b'), children.end()); // found + + // check child 'a' + trie.transition('a'); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('b'), children.end()); // found + + // check child 'a'->'b' + trie.transition('b'); + + EXPECT_TRUE(trie.get_symbol()); + EXPECT_TRUE(trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(0)); + + // up to child 'a' + trie.back_transition(); + + // up to root + trie.back_transition(); + + // check child 'b' + trie.transition('b'); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('a'), children.end()); // found + + // check child 'b'->'a' + trie.transition('a'); + + EXPECT_TRUE(!trie.get_symbol()); + EXPECT_TRUE(!trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(1)); + EXPECT_NE(children.find('b'), children.end()); // found + + // check child 'b'->'a'->'b' + trie.transition('b'); + + EXPECT_TRUE(trie.get_symbol()); + EXPECT_TRUE(trie.is_accept()); + EXPECT_TRUE(!trie.is_reset()); + + children = trie.get_children(); + EXPECT_EQ(children.size(), static_cast(0)); +} + +} // namespace lexer +} // namespace core +} // namespace docgen diff --git a/test/core/lexer_unittest.cpp b/test/core/lexer_unittest.cpp deleted file mode 100644 index 643c2f5..0000000 --- a/test/core/lexer_unittest.cpp +++ /dev/null @@ -1,453 +0,0 @@ -#define private public - -#include -#include - -namespace docgen { -namespace core { - -struct lexer_fixture : ::testing::Test -{ -protected: - using status_t = Lexer::status_t; - using token_t = Lexer::token_t; - using symbol_t = Lexer::symbol_t; - - Lexer lexer; - - void setup_lexer(const char* content) - { - std::string str(content); - for (char c : str) { - lexer.process(c); - } - lexer.process(0); - } - - void check_token(symbol_t actual_sym, symbol_t expected_sym, - const std::string& actual_str, const std::string& expected_str) - { - EXPECT_EQ(actual_sym, 
expected_sym); - EXPECT_EQ(actual_str, expected_str); - } -}; - -TEST_F(lexer_fixture, lexer) -{ - static constexpr const char* content = - "#include // some comment\n" - "\n" - "void f();" - ; - - setup_lexer(content); - - auto token = *lexer.next_token(); - check_token(token.name, symbol_t::HASHTAG, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "include"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::BEGIN_NLINE_COMMENT, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "some"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "comment"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::NEWLINE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::NEWLINE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "void"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "f()"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::SEMICOLON, - token.content, ""); -} - -TEST_F(lexer_fixture, process_no_comment) -{ - static constexpr const char* content = - "#include \n" - "\n" - " // just a normal comment\n" - "\n" - ; - - setup_lexer(content); - - auto token = *lexer.next_token(); - check_token(token.name, symbol_t::HASHTAG, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "include"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::NEWLINE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::NEWLINE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::BEGIN_NLINE_COMMENT, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "just"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "a"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, 
- token.content, "normal"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "comment"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::NEWLINE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::NEWLINE, - token.content, ""); -} - -TEST_F(lexer_fixture, process_one_line_comment) -{ - static constexpr const char* content = - "// comment\n" - " /// special_comment \n" - ; - - setup_lexer(content); - - auto token = *lexer.next_token(); - check_token(token.name, symbol_t::BEGIN_NLINE_COMMENT, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "comment"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::NEWLINE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::BEGIN_SLINE_COMMENT, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::TEXT, - token.content, "special_comment"); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::WHITESPACE, - token.content, ""); - - token = *lexer.next_token(); - check_token(token.name, symbol_t::NEWLINE, - token.content, ""); - - -} - -//TEST_F(lexer_fixture, process_two_line_comment) -//{ -// static constexpr const char* content = -// "#include \n" -// "\n" -// " // just a normal comment\n" -// " /// a very special comment \n" -// "\n" -// " // just a normal comment\n" -// " /// another very special comment \n" -// " // just a normal comment\n" -// ; -// -// write_file(content); -// Lexer lexer(file); -// lexer.process(); -// const auto& tokens = lexer.get_tokens(); -// -// EXPECT_EQ(tokens.size(), static_cast(11)); -// -// check_token(tokens[0].name, symbol_t::TEXT, -// tokens[0].content, "#include "); -// check_token(tokens[1].name, symbol_t::NEWLINE, -// tokens[1].content, ""); -// check_token(tokens[2].name, symbol_t::NEWLINE, -// tokens[2].content, ""); -// check_token(tokens[3].name, symbol_t::BEGIN_LINE_COMMENT, -// tokens[3].content, ""); -// check_token(tokens[4].name, symbol_t::TEXT, -// tokens[4].content, "a very special comment"); -// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); -// check_token(tokens[5].name, symbol_t::NEWLINE, -// tokens[5].content, ""); -// check_token(tokens[6].name, symbol_t::NEWLINE, -// tokens[6].content, ""); -// check_token(tokens[7].name, symbol_t::BEGIN_LINE_COMMENT, -// tokens[7].content, ""); -// check_token(tokens[8].name, symbol_t::TEXT, -// tokens[8].content, "another very special comment"); -// EXPECT_EQ(tokens[8].leading_ws_count, static_cast(1)); -// check_token(tokens[9].name, symbol_t::NEWLINE, -// tokens[9].content, ""); -// check_token(tokens[10].name, symbol_t::END_OF_FILE, -// tokens[10].content, ""); -//} -// -//TEST_F(lexer_fixture, process_one_block_comment) -//{ -// static constexpr const char* content = -// "#include \n" -// "\n" -// " // just a normal comment\n" -// " /*! 
a very special comment */\n" -// "\n" -// ; -// -// write_file(content); -// Lexer lexer(file); -// lexer.process(); -// const auto& tokens = lexer.get_tokens(); -// -// EXPECT_EQ(tokens.size(), static_cast(9)); -// -// check_token(tokens[0].name, symbol_t::TEXT, -// tokens[0].content, "#include "); -// check_token(tokens[1].name, symbol_t::NEWLINE, -// tokens[1].content, ""); -// check_token(tokens[2].name, symbol_t::NEWLINE, -// tokens[2].content, ""); -// check_token(tokens[3].name, symbol_t::BEGIN_BLOCK_COMMENT, -// tokens[3].content, ""); -// check_token(tokens[4].name, symbol_t::TEXT, -// tokens[4].content, "a very special comment"); -// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); -// check_token(tokens[5].name, symbol_t::END_BLOCK_COMMENT, -// tokens[5].content, ""); -// check_token(tokens[6].name, symbol_t::NEWLINE, -// tokens[6].content, ""); -// check_token(tokens[7].name, symbol_t::NEWLINE, -// tokens[7].content, ""); -// check_token(tokens[8].name, symbol_t::END_OF_FILE, -// tokens[8].content, ""); -//} -// -//TEST_F(lexer_fixture, process_two_block_comment) -//{ -// static constexpr const char* content = -// "#include \n" -// "\n" -// " // just a normal comment\n" -// " /*! a very special comment */\n" -// "\n" -// " // just a normal comment\n" -// " /*! another very \n" -// " * special comment \n" -// "*/" -// " /* just a normal comment\n */" -// ; -// -// write_file(content); -// Lexer lexer(file); -// lexer.process(); -// const auto& tokens = lexer.get_tokens(); -// -// EXPECT_EQ(tokens.size(), static_cast(16)); -// -// check_token(tokens[0].name, symbol_t::TEXT, -// tokens[0].content, "#include "); -// check_token(tokens[1].name, symbol_t::NEWLINE, -// tokens[1].content, ""); -// check_token(tokens[2].name, symbol_t::NEWLINE, -// tokens[2].content, ""); -// check_token(tokens[3].name, symbol_t::BEGIN_BLOCK_COMMENT, -// tokens[3].content, ""); -// check_token(tokens[4].name, symbol_t::TEXT, -// tokens[4].content, "a very special comment"); -// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); -// check_token(tokens[5].name, symbol_t::END_BLOCK_COMMENT, -// tokens[5].content, ""); -// check_token(tokens[6].name, symbol_t::NEWLINE, -// tokens[6].content, ""); -// check_token(tokens[7].name, symbol_t::NEWLINE, -// tokens[7].content, ""); -// check_token(tokens[8].name, symbol_t::BEGIN_BLOCK_COMMENT, -// tokens[8].content, ""); -// check_token(tokens[9].name, symbol_t::TEXT, -// tokens[9].content, "another very"); -// EXPECT_EQ(tokens[9].leading_ws_count, static_cast(1)); -// check_token(tokens[10].name, symbol_t::NEWLINE, -// tokens[10].content, ""); -// check_token(tokens[11].name, symbol_t::STAR, -// tokens[11].content, ""); -// check_token(tokens[12].name, symbol_t::TEXT, -// tokens[12].content, "special comment"); -// EXPECT_EQ(tokens[12].leading_ws_count, static_cast(1)); -// check_token(tokens[13].name, symbol_t::NEWLINE, -// tokens[13].content, ""); -// check_token(tokens[14].name, symbol_t::END_BLOCK_COMMENT, -// tokens[14].content, ""); -// check_token(tokens[15].name, symbol_t::END_OF_FILE, -// tokens[15].content, ""); -//} -// -//TEST_F(lexer_fixture, process_line_block_comment) -//{ -// static constexpr const char* content = -// "#include \n" -// "\n" -// " // just a normal comment\n" -// " /// a very special comment */\n" -// "\n" -// " // just a normal comment\n" -// " /*! 
another very \n" -// " * special comment \n" -// "*/" -// " /* just a normal comment\n */" -// ; -// -// write_file(content); -// Lexer lexer(file); -// lexer.process(); -// const auto& tokens = lexer.get_tokens(); -// -// EXPECT_EQ(tokens.size(), static_cast(16)); -// -// check_token(tokens[0].name, symbol_t::TEXT, -// tokens[0].content, "#include "); -// check_token(tokens[1].name, symbol_t::NEWLINE, -// tokens[1].content, ""); -// check_token(tokens[2].name, symbol_t::NEWLINE, -// tokens[2].content, ""); -// check_token(tokens[3].name, symbol_t::BEGIN_LINE_COMMENT, -// tokens[3].content, ""); -// check_token(tokens[4].name, symbol_t::TEXT, -// tokens[4].content, "a very special comment"); -// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); -// check_token(tokens[5].name, symbol_t::END_BLOCK_COMMENT, -// tokens[5].content, ""); -// check_token(tokens[6].name, symbol_t::NEWLINE, -// tokens[6].content, ""); -// check_token(tokens[7].name, symbol_t::NEWLINE, -// tokens[7].content, ""); -// check_token(tokens[8].name, symbol_t::BEGIN_BLOCK_COMMENT, -// tokens[8].content, ""); -// check_token(tokens[9].name, symbol_t::TEXT, -// tokens[9].content, "another very"); -// EXPECT_EQ(tokens[9].leading_ws_count, static_cast(1)); -// check_token(tokens[10].name, symbol_t::NEWLINE, -// tokens[10].content, ""); -// check_token(tokens[11].name, symbol_t::STAR, -// tokens[11].content, ""); -// check_token(tokens[12].name, symbol_t::TEXT, -// tokens[12].content, "special comment"); -// EXPECT_EQ(tokens[12].leading_ws_count, static_cast(1)); -// check_token(tokens[13].name, symbol_t::NEWLINE, -// tokens[13].content, ""); -// check_token(tokens[14].name, symbol_t::END_BLOCK_COMMENT, -// tokens[14].content, ""); -// check_token(tokens[15].name, symbol_t::END_OF_FILE, -// tokens[15].content, ""); -//} - -} // namespace core -} // namespace docgen diff --git a/test/core/trie_unittest.cpp b/test/core/trie_unittest.cpp deleted file mode 100644 index 43bf550..0000000 --- a/test/core/trie_unittest.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#define private public - -#include -#include -#include - -namespace docgen { -namespace core { - -enum class MockSymbol { - symbol1, - symbol2 -}; - -struct trie_fixture : ::testing::Test -{ -protected: - using symbol_t = MockSymbol; - using trie_t = Trie; - - void print_trie(const trie_t& trie) - { - print_trie(trie.root_); - } - - void print_trie(const trie_t::TrieNode& node) - { - if (node.is_accept()) { - std::cout << "symbol: " << (int) *node.get_symbol() << std::endl; - } - std::cout << "\nsize: " << node.children_.size() << std::endl; - for (auto it = node.children_.begin(); it != node.children_.end(); ++it) { - std::cout << it->first << "--" << std::endl;; - print_trie(it->second); - std::cout << "--" << std::endl; - } - } -}; - -TEST_F(trie_fixture, trie_ctor) -{ - trie_t trie({ - {"adf", symbol_t::symbol1}, - {"asdf", symbol_t::symbol2}, - {"bscdf", symbol_t::symbol1} - }); - - auto symbol = trie.get_symbol(); - EXPECT_FALSE((bool) symbol); - - //trie.transition('a'); - //EXPECT_FALSE((bool) trie.get_symbol()); - //trie.transition('d'); - //EXPECT_FALSE((bool) trie.get_symbol()); - //trie.transition('b'); - //EXPECT_FALSE((bool) trie.get_symbol()); - - trie.transition('a'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('d'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('f'); - EXPECT_TRUE((bool) trie.get_symbol()); - - trie.back_transition(); - trie.back_transition(); - trie.back_transition(); - - trie.transition('a'); - EXPECT_FALSE((bool) 
trie.get_symbol()); - trie.transition('d'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('f'); - EXPECT_TRUE((bool) trie.get_symbol()); - - trie.reset(); - - trie.transition('a'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('s'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('d'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('f'); - EXPECT_TRUE((bool) trie.get_symbol()); - - trie.reset(); - - trie.transition('b'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('s'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('c'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('d'); - EXPECT_FALSE((bool) trie.get_symbol()); - trie.transition('f'); - EXPECT_TRUE((bool) trie.get_symbol()); -} - -} // namespace core -} // namespace docgen From a18548da111c5a407e7e22a2668c536a5d78ddf6 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 00:06:36 -0500 Subject: [PATCH 06/23] Fix issue with non-backtrack behavior when at root vs. non-root --- src/core/lexer/lexer.cpp | 8 +- test/core/lexer/lexer_unittest.cpp | 224 +++++++++++++++++++++++++++++ 2 files changed, 230 insertions(+), 2 deletions(-) diff --git a/src/core/lexer/lexer.cpp b/src/core/lexer/lexer.cpp index f727e49..82ca5d6 100644 --- a/src/core/lexer/lexer.cpp +++ b/src/core/lexer/lexer.cpp @@ -50,11 +50,15 @@ void Lexer::process(char c) // if not backtracking if (!this->is_backtracking()) { + // if trie at root + if (trie_.is_reset()) { + text_.push_back(c); + return; + } text_.append(buf_); - text_.push_back(c); buf_.clear(); trie_.reset(); - return; + return this->process(c); } // otherwise, currently backtracking diff --git a/test/core/lexer/lexer_unittest.cpp b/test/core/lexer/lexer_unittest.cpp index 9fead88..0a1b465 100644 --- a/test/core/lexer/lexer_unittest.cpp +++ b/test/core/lexer/lexer_unittest.cpp @@ -196,10 +196,234 @@ TEST_F(lexer_fixture, lexer_semicolon) EXPECT_FALSE(static_cast(token)); } +// BEGIN_SLINE_COMMENT +TEST_F(lexer_fixture, lexer_begin_sline_comment) +{ + static constexpr const char* content = + "abc///" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// BEGIN_SBLOCK_COMMENT +TEST_F(lexer_fixture, lexer_begin_sblock_comment) +{ + static constexpr const char* content = + "abc/*!" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SBLOCK_COMMENT); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// BEGIN_NBLOCK_COMMENT +TEST_F(lexer_fixture, lexer_begin_nblock_comment) +{ + static constexpr const char* content = + "abc/**!" 
+ ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NBLOCK_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::STAR); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "!"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// END_BLOCK_COMMENT +TEST_F(lexer_fixture, lexer_end_block_comment_no_star) +{ + static constexpr const char* content = + "abc*/f" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::END_BLOCK_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "f"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +TEST_F(lexer_fixture, lexer_end_block_comment_star) +{ + static constexpr const char* content = + "abc**/f" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::STAR); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::END_BLOCK_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "f"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// SDESC +TEST_F(lexer_fixture, lexer_sdesc) +{ + static constexpr const char* content = + "ssdesc@@sdescf@sdesscf" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "ssdesc@"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::SDESC); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "f@sdesscf"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + //////////////////////////////////////////////////////////////////// // Mix TESTS //////////////////////////////////////////////////////////////////// +// line comment mix +TEST_F(lexer_fixture, lexer_line_comment_4) +{ + static constexpr const char* content = + "abc////" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "/"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// line comment mix +TEST_F(lexer_fixture, lexer_line_comment_5) +{ + static constexpr const char* content = + "abc/////" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + 
EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// line comment mix +TEST_F(lexer_fixture, lexer_line_comment_6) +{ + static constexpr const char* content = + "abc//////" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + TEST_F(lexer_fixture, lexer_test_1) { static constexpr const char* content = From c718ace66169183d0d8436f24584666a3cd6605e Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 00:17:34 -0500 Subject: [PATCH 07/23] Add more unittests and integration tests --- test/core/lexer/lexer_unittest.cpp | 367 +++++++++++++---------------- 1 file changed, 163 insertions(+), 204 deletions(-) diff --git a/test/core/lexer/lexer_unittest.cpp b/test/core/lexer/lexer_unittest.cpp index 0a1b465..294cf63 100644 --- a/test/core/lexer/lexer_unittest.cpp +++ b/test/core/lexer/lexer_unittest.cpp @@ -345,6 +345,56 @@ TEST_F(lexer_fixture, lexer_sdesc) EXPECT_FALSE(static_cast(token)); } +// TPARAM +TEST_F(lexer_fixture, lexer_tparam) +{ + static constexpr const char* content = + "ssdes@@@@@@tpaar@tpara@m@tparam@tpar" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "ssdes@@@@@@tpaar@tpara@m"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TPARAM); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "@tpar"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// RETURN +TEST_F(lexer_fixture, lexer_return) +{ + static constexpr const char* content = + "@re@@@@@@return@@@@@" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "@re@@@@@"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::RETURN); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "@@@@@"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + //////////////////////////////////////////////////////////////////// // Mix TESTS //////////////////////////////////////////////////////////////////// @@ -424,7 +474,7 @@ TEST_F(lexer_fixture, lexer_line_comment_6) EXPECT_FALSE(static_cast(token)); } -TEST_F(lexer_fixture, lexer_test_1) +TEST_F(lexer_fixture, lexer_test_1_no_special_comment) { static constexpr const char* content = "#include // some comment\n" @@ -497,9 +547,13 @@ TEST_F(lexer_fixture, lexer_test_1) token = lexer.next_token(); EXPECT_EQ(token->name, symbol_t::SEMICOLON); EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); } 
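// A minimal end-to-end driver sketch (annotation, not part of this diff)
// showing the call pattern every fixture above relies on: stream characters
// through process(), call flush() once the input is exhausted, then drain
// the token queue with next_token():
//
//   Lexer lexer;
//   for (char c : std::string("abc/////")) {
//       lexer.process(c);
//   }
//   lexer.flush();  // emit whatever is still buffered as TEXT or a symbol
//   while (auto token = lexer.next_token()) {
//       // yields TEXT("abc"), then BEGIN_SLINE_COMMENT for "///",
//       // then BEGIN_NLINE_COMMENT for the trailing "//" (longest match wins)
//   }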
-TEST_F(lexer_fixture, process_no_comment) +TEST_F(lexer_fixture, lexer_test_2_no_special_comment) { static constexpr const char* content = "#include \n" @@ -585,9 +639,13 @@ TEST_F(lexer_fixture, process_no_comment) token = lexer.next_token(); EXPECT_EQ(token->name, symbol_t::NEWLINE); EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); } -TEST_F(lexer_fixture, process_one_line_comment) +TEST_F(lexer_fixture, lexer_test_1_comment_mix) { static constexpr const char* content = "// comment\n" @@ -635,209 +693,110 @@ TEST_F(lexer_fixture, process_one_line_comment) token = lexer.next_token(); EXPECT_EQ(token->name, symbol_t::NEWLINE); EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); } -//TEST_F(lexer_fixture, process_two_line_comment) -//{ -// static constexpr const char* content = -// "#include \n" -// "\n" -// " // just a normal comment\n" -// " /// a very special comment \n" -// "\n" -// " // just a normal comment\n" -// " /// another very special comment \n" -// " // just a normal comment\n" -// ; -// -// write_file(content); -// Lexer lexer(file); -// lexer.process(); -// const auto& tokens = lexer.get_tokens(); -// -// EXPECT_EQ(tokens.size(), static_cast(11)); -// -// check_token(tokens[0]->name, symbol_t::TEXT, -// tokens[0]->content, "#include "); -// check_token(tokens[1]->name, symbol_t::NEWLINE, -// tokens[1]->content, ""); -// check_token(tokens[2]->name, symbol_t::NEWLINE, -// tokens[2]->content, ""); -// check_token(tokens[3]->name, symbol_t::BEGIN_LINE_COMMENT, -// tokens[3]->content, ""); -// check_token(tokens[4]->name, symbol_t::TEXT, -// tokens[4]->content, "a very special comment"); -// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); -// check_token(tokens[5]->name, symbol_t::NEWLINE, -// tokens[5]->content, ""); -// check_token(tokens[6]->name, symbol_t::NEWLINE, -// tokens[6]->content, ""); -// check_token(tokens[7]->name, symbol_t::BEGIN_LINE_COMMENT, -// tokens[7]->content, ""); -// check_token(tokens[8]->name, symbol_t::TEXT, -// tokens[8]->content, "another very special comment"); -// EXPECT_EQ(tokens[8].leading_ws_count, static_cast(1)); -// check_token(tokens[9]->name, symbol_t::NEWLINE, -// tokens[9]->content, ""); -// check_token(tokens[10]->name, symbol_t::END_OF_FILE, -// tokens[10]->content, ""); -//} -// -//TEST_F(lexer_fixture, process_one_block_comment) -//{ -// static constexpr const char* content = -// "#include \n" -// "\n" -// " // just a normal comment\n" -// " /*! 
a very special comment */\n" -// "\n" -// ; -// -// write_file(content); -// Lexer lexer(file); -// lexer.process(); -// const auto& tokens = lexer.get_tokens(); -// -// EXPECT_EQ(tokens.size(), static_cast(9)); -// -// check_token(tokens[0]->name, symbol_t::TEXT, -// tokens[0]->content, "#include "); -// check_token(tokens[1]->name, symbol_t::NEWLINE, -// tokens[1]->content, ""); -// check_token(tokens[2]->name, symbol_t::NEWLINE, -// tokens[2]->content, ""); -// check_token(tokens[3]->name, symbol_t::BEGIN_BLOCK_COMMENT, -// tokens[3]->content, ""); -// check_token(tokens[4]->name, symbol_t::TEXT, -// tokens[4]->content, "a very special comment"); -// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); -// check_token(tokens[5]->name, symbol_t::END_BLOCK_COMMENT, -// tokens[5]->content, ""); -// check_token(tokens[6]->name, symbol_t::NEWLINE, -// tokens[6]->content, ""); -// check_token(tokens[7]->name, symbol_t::NEWLINE, -// tokens[7]->content, ""); -// check_token(tokens[8]->name, symbol_t::END_OF_FILE, -// tokens[8]->content, ""); -//} -// -//TEST_F(lexer_fixture, process_two_block_comment) -//{ -// static constexpr const char* content = -// "#include \n" -// "\n" -// " // just a normal comment\n" -// " /*! a very special comment */\n" -// "\n" -// " // just a normal comment\n" -// " /*! another very \n" -// " * special comment \n" -// "*/" -// " /* just a normal comment\n */" -// ; -// -// write_file(content); -// Lexer lexer(file); -// lexer.process(); -// const auto& tokens = lexer.get_tokens(); -// -// EXPECT_EQ(tokens.size(), static_cast(16)); -// -// check_token(tokens[0]->name, symbol_t::TEXT, -// tokens[0]->content, "#include "); -// check_token(tokens[1]->name, symbol_t::NEWLINE, -// tokens[1]->content, ""); -// check_token(tokens[2]->name, symbol_t::NEWLINE, -// tokens[2]->content, ""); -// check_token(tokens[3]->name, symbol_t::BEGIN_BLOCK_COMMENT, -// tokens[3]->content, ""); -// check_token(tokens[4]->name, symbol_t::TEXT, -// tokens[4]->content, "a very special comment"); -// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); -// check_token(tokens[5]->name, symbol_t::END_BLOCK_COMMENT, -// tokens[5]->content, ""); -// check_token(tokens[6]->name, symbol_t::NEWLINE, -// tokens[6]->content, ""); -// check_token(tokens[7]->name, symbol_t::NEWLINE, -// tokens[7]->content, ""); -// check_token(tokens[8]->name, symbol_t::BEGIN_BLOCK_COMMENT, -// tokens[8]->content, ""); -// check_token(tokens[9]->name, symbol_t::TEXT, -// tokens[9]->content, "another very"); -// EXPECT_EQ(tokens[9].leading_ws_count, static_cast(1)); -// check_token(tokens[10]->name, symbol_t::NEWLINE, -// tokens[10]->content, ""); -// check_token(tokens[11]->name, symbol_t::STAR, -// tokens[11]->content, ""); -// check_token(tokens[12]->name, symbol_t::TEXT, -// tokens[12]->content, "special comment"); -// EXPECT_EQ(tokens[12].leading_ws_count, static_cast(1)); -// check_token(tokens[13]->name, symbol_t::NEWLINE, -// tokens[13]->content, ""); -// check_token(tokens[14]->name, symbol_t::END_BLOCK_COMMENT, -// tokens[14]->content, ""); -// check_token(tokens[15]->name, symbol_t::END_OF_FILE, -// tokens[15]->content, ""); -//} -// -//TEST_F(lexer_fixture, process_line_block_comment) -//{ -// static constexpr const char* content = -// "#include \n" -// "\n" -// " // just a normal comment\n" -// " /// a very special comment */\n" -// "\n" -// " // just a normal comment\n" -// " /*! 
another very \n" -// " * special comment \n" -// "*/" -// " /* just a normal comment\n */" -// ; -// -// write_file(content); -// Lexer lexer(file); -// lexer.process(); -// const auto& tokens = lexer.get_tokens(); -// -// EXPECT_EQ(tokens.size(), static_cast(16)); -// -// check_token(tokens[0]->name, symbol_t::TEXT, -// tokens[0]->content, "#include "); -// check_token(tokens[1]->name, symbol_t::NEWLINE, -// tokens[1]->content, ""); -// check_token(tokens[2]->name, symbol_t::NEWLINE, -// tokens[2]->content, ""); -// check_token(tokens[3]->name, symbol_t::BEGIN_LINE_COMMENT, -// tokens[3]->content, ""); -// check_token(tokens[4]->name, symbol_t::TEXT, -// tokens[4]->content, "a very special comment"); -// EXPECT_EQ(tokens[4].leading_ws_count, static_cast(1)); -// check_token(tokens[5]->name, symbol_t::END_BLOCK_COMMENT, -// tokens[5]->content, ""); -// check_token(tokens[6]->name, symbol_t::NEWLINE, -// tokens[6]->content, ""); -// check_token(tokens[7]->name, symbol_t::NEWLINE, -// tokens[7]->content, ""); -// check_token(tokens[8]->name, symbol_t::BEGIN_BLOCK_COMMENT, -// tokens[8]->content, ""); -// check_token(tokens[9]->name, symbol_t::TEXT, -// tokens[9]->content, "another very"); -// EXPECT_EQ(tokens[9].leading_ws_count, static_cast(1)); -// check_token(tokens[10]->name, symbol_t::NEWLINE, -// tokens[10]->content, ""); -// check_token(tokens[11]->name, symbol_t::STAR, -// tokens[11]->content, ""); -// check_token(tokens[12]->name, symbol_t::TEXT, -// tokens[12]->content, "special comment"); -// EXPECT_EQ(tokens[12].leading_ws_count, static_cast(1)); -// check_token(tokens[13]->name, symbol_t::NEWLINE, -// tokens[13]->content, ""); -// check_token(tokens[14]->name, symbol_t::END_BLOCK_COMMENT, -// tokens[14]->content, ""); -// check_token(tokens[15]->name, symbol_t::END_OF_FILE, -// tokens[15]->content, ""); -//} +TEST_F(lexer_fixture, lexer_test_1_tagname_comments) +{ + static constexpr const char* content = + "// @tparam normal comment\n" + "/// @sdescspecial comment \n" + "#define hehe\n" + ; + + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TPARAM); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "normal"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::SDESC); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "special"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = 
lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::HASHTAG); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "define"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "hehe"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} } // namespace lexer } // namespace core From 2498daf49a4b2f1a4ad7ce9b8a78118a3258fa74 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 00:24:36 -0500 Subject: [PATCH 08/23] Rename namespace lexer to lex and change CMake and directory structure --- src/CMakeLists.txt | 2 +- src/core/{lexer => lex}/lexer.cpp | 6 +- src/core/{lexer => lex}/lexer.hpp | 8 +- src/core/{lexer => lex}/status.hpp | 4 +- src/core/{lexer => lex}/trie.hpp | 4 +- test/CMakeLists.txt | 4 +- test/core/{lexer => lex}/lexer_unittest.cpp | 6 +- test/core/{lexer => lex}/trie_unittest.cpp | 6 +- test/core/lexer/lexer_base_fixture.hpp | 36 - test/core/lexer/lexer_routines_unittest.cpp | 784 -------------------- 10 files changed, 20 insertions(+), 840 deletions(-) rename src/core/{lexer => lex}/lexer.cpp (97%) rename src/core/{lexer => lex}/lexer.hpp (94%) rename src/core/{lexer => lex}/status.hpp (86%) rename src/core/{lexer => lex}/trie.hpp (99%) rename test/core/{lexer => lex}/lexer_unittest.cpp (99%) rename test/core/{lexer => lex}/trie_unittest.cpp (99%) delete mode 100644 test/core/lexer/lexer_base_fixture.hpp delete mode 100644 test/core/lexer/lexer_routines_unittest.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 96870d9..f95563a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ # Create object files for lexer add_library(LEXER_LIB_OBJECTS OBJECT - ${CMAKE_CURRENT_SOURCE_DIR}/core/lexer/lexer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core/lex/lexer.cpp ) target_include_directories(LEXER_LIB_OBJECTS PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/src/core/lexer/lexer.cpp b/src/core/lex/lexer.cpp similarity index 97% rename from src/core/lexer/lexer.cpp rename to src/core/lex/lexer.cpp index 82ca5d6..2cb7356 100644 --- a/src/core/lexer/lexer.cpp +++ b/src/core/lex/lexer.cpp @@ -1,8 +1,8 @@ -#include +#include namespace docgen { namespace core { -namespace lexer { +namespace lex { /////////////////////////////////// // Lexer Implementation @@ -108,6 +108,6 @@ void Lexer::flush() this->reset(); } -} // namespace lexer +} // namespace lex } // namespace core } // namespace docgen diff --git a/src/core/lexer/lexer.hpp b/src/core/lex/lexer.hpp similarity index 94% rename from src/core/lexer/lexer.hpp rename to src/core/lex/lexer.hpp index 03b65fa..053f75c 100644 --- a/src/core/lexer/lexer.hpp +++ b/src/core/lex/lexer.hpp @@ -1,12 +1,12 @@ #pragma once -#include -#include +#include +#include #include #include namespace docgen { namespace core { -namespace lexer { +namespace lex { struct Lexer { @@ -95,6 
+95,6 @@ inline void Lexer::reset() reset_backtracking(); } -} // namespace lexer +} // namespace lex } // namespace core } // namespace docgen diff --git a/src/core/lexer/status.hpp b/src/core/lex/status.hpp similarity index 86% rename from src/core/lexer/status.hpp rename to src/core/lex/status.hpp index e4af461..2af91f7 100644 --- a/src/core/lexer/status.hpp +++ b/src/core/lex/status.hpp @@ -3,7 +3,7 @@ namespace docgen { namespace core { -namespace lexer { +namespace lex { template struct Status @@ -14,6 +14,6 @@ struct Status token_arr_t tokens; }; -} // namespace lexer +} // namespace lex } // namespace core } // namespace docgen diff --git a/src/core/lexer/trie.hpp b/src/core/lex/trie.hpp similarity index 99% rename from src/core/lexer/trie.hpp rename to src/core/lex/trie.hpp index 1fe79f0..eabe4ce 100644 --- a/src/core/lexer/trie.hpp +++ b/src/core/lex/trie.hpp @@ -8,7 +8,7 @@ namespace docgen { namespace core { -namespace lexer { +namespace lex { template struct Trie @@ -199,6 +199,6 @@ Trie::get_symbol() const return curr_node_.get().get_symbol(); } -} // namespace lexer +} // namespace lex } // namespace core } // namespace docgen diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1ddf86a..d5ab8e4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -45,8 +45,8 @@ add_custom_command( ###################################################### add_executable(core_unittests - ${CMAKE_CURRENT_SOURCE_DIR}/core/lexer/trie_unittest.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core/lexer/lexer_unittest.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core/lex/trie_unittest.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core/lex/lexer_unittest.cpp # Source dependency $ ) diff --git a/test/core/lexer/lexer_unittest.cpp b/test/core/lex/lexer_unittest.cpp similarity index 99% rename from test/core/lexer/lexer_unittest.cpp rename to test/core/lex/lexer_unittest.cpp index 294cf63..e7483bb 100644 --- a/test/core/lexer/lexer_unittest.cpp +++ b/test/core/lex/lexer_unittest.cpp @@ -1,9 +1,9 @@ -#include +#include #include namespace docgen { namespace core { -namespace lexer { +namespace lex { struct lexer_fixture : ::testing::Test { @@ -798,6 +798,6 @@ TEST_F(lexer_fixture, lexer_test_1_tagname_comments) EXPECT_FALSE(static_cast(token)); } -} // namespace lexer +} // namespace lex } // namespace core } // namespace docgen diff --git a/test/core/lexer/trie_unittest.cpp b/test/core/lex/trie_unittest.cpp similarity index 99% rename from test/core/lexer/trie_unittest.cpp rename to test/core/lex/trie_unittest.cpp index 206a6d7..bc8d483 100644 --- a/test/core/lexer/trie_unittest.cpp +++ b/test/core/lex/trie_unittest.cpp @@ -1,9 +1,9 @@ -#include +#include #include namespace docgen { namespace core { -namespace lexer { +namespace lex { enum class MockSymbol { symbol_0, @@ -331,6 +331,6 @@ TEST_F(trie_fixture, trie_off_by_one_prefix) EXPECT_EQ(children.size(), static_cast(0)); } -} // namespace lexer +} // namespace lex } // namespace core } // namespace docgen diff --git a/test/core/lexer/lexer_base_fixture.hpp b/test/core/lexer/lexer_base_fixture.hpp deleted file mode 100644 index 46ba63a..0000000 --- a/test/core/lexer/lexer_base_fixture.hpp +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include - -namespace docgen { -namespace core { - -struct lexer_base_fixture : ::testing::Test -{ -protected: - using status_t = status_t; - using token_t = lexer_details::token_t; - using symbol_t = lexer_details::symbol_t; - - static constexpr const char* filename = ".lexer_routines_unittest.data.txt"; - static constexpr size_t buf_size = 
20; - FILE* file; - - lexer_base_fixture() - : file(fopen(filename, "r")) - {} - - ~lexer_base_fixture() - { - fclose(file); - } - - void write_file(const char* content) - { - FILE* fp = fopen(filename, "w"); - fwrite(content, sizeof(char), strlen(content), fp); - fclose(fp); - } -}; - -} // namespace core -} // namespace docgen diff --git a/test/core/lexer/lexer_routines_unittest.cpp b/test/core/lexer/lexer_routines_unittest.cpp deleted file mode 100644 index f32f0e3..0000000 --- a/test/core/lexer/lexer_routines_unittest.cpp +++ /dev/null @@ -1,784 +0,0 @@ -#include "lexer_base_fixture.hpp" - -namespace docgen { -namespace core { -namespace lexer_details { - -struct lexer_routines_fixture : lexer_base_fixture -{ -protected: - - template - static bool is_not(char x) - { - return x != c; - } - - void read(file_reader& reader, std::string& str) - { - int c = 0; - while ((c = reader.read()) != file_reader::termination) { - str.push_back(c); - } - } - - template - void ignore_until_test(const char* content, const char* expected_str, - char expected_last_char, Condition condition) - { - write_file(content); - file_reader reader(file); - std::string actual; - int last_char = ignore_until(reader, condition); - EXPECT_EQ(last_char, expected_last_char); - read(reader, actual); - EXPECT_EQ(actual, expected_str); - } - - template - void read_until_test(const char* content, const char* expected_str, - char expected_last_char, Condition condition) - { - write_file(content); - file_reader reader(file); - std::string actual; - int last_char = read_until(reader, condition, actual); - EXPECT_EQ(last_char, expected_last_char); - EXPECT_EQ(actual, expected_str); - } - - void trim_test(const char* content, const char* expected) - { - std::string actual(content); - trim(actual); - EXPECT_EQ(actual, expected); - } - - void tokenize_text_check(const std::string& actual, const token_t& token, - const char* expected) - { - check_token(token.name, symbol_t::TEXT, - token.content, expected); - EXPECT_EQ(actual.size(), static_cast(0)); - EXPECT_GT(actual.capacity(), DEFAULT_STRING_RESERVE_SIZE); - } - - void process_char_check(bool res, const status_t& status, - const std::string& actual, const char* expected, - symbol_t expected_symbol) - { - EXPECT_TRUE(res); - EXPECT_EQ(status.tokens.size(), static_cast(2)); - tokenize_text_check(actual, status.tokens[0], expected); - check_token(status.tokens[1].name, expected_symbol, - status.tokens[1].content, ""); - } - -}; - -//////////////////////////////////////////////////////////////////////// -// ignore_until TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, ignore_until_newline) -{ - static constexpr const char* content = - "to ignore here \ndo not ignore" - ; - static constexpr const char* expected = - "do not ignore"; - ignore_until_test(content, expected, '\n', is_not<'\n'>); -} - -TEST_F(lexer_routines_fixture, ignore_until_empty_content) -{ - static constexpr const char* content = - "" - ; - static constexpr const char* expected = - ""; - ignore_until_test(content, expected, file_reader::termination, is_not<'a'>); -} - -TEST_F(lexer_routines_fixture, ignore_until_first_char) -{ - static constexpr const char* content = - "hello" - ; - static constexpr const char* expected = - "ello"; - ignore_until_test(content, expected, 'h', is_not<'h'>); -} - -TEST_F(lexer_routines_fixture, ignore_until_last_char) -{ - static constexpr const char* content = - "hello" - ; - static constexpr const char* expected = - 
""; - ignore_until_test(content, expected, 'o', is_not<'o'>); -} - -//////////////////////////////////////////////////////////////////////// -// read_until TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, read_until_newline) -{ - static constexpr const char* content = - "very special comment \n not read here" - ; - static constexpr const char* expected = - "very special comment "; - read_until_test(content, expected, '\n', is_not<'\n'>); -} - -TEST_F(lexer_routines_fixture, read_until_two_newline) -{ - static constexpr const char* content = - "very special \ncomment \n" - ; - static constexpr const char* expected = - "very special "; - read_until_test(content, expected, '\n', is_not<'\n'>); -} - -TEST_F(lexer_routines_fixture, read_until_empty) -{ - static constexpr const char* content = - "" - ; - static constexpr const char* expected = - ""; - read_until_test(content, expected, file_reader::termination, is_not<'c'>); -} - -TEST_F(lexer_routines_fixture, read_until_first_char) -{ - static constexpr const char* content = - "very special \ncomment \n" - ; - static constexpr const char* expected = - ""; - read_until_test(content, expected, 'v', is_not<'v'>); -} - -TEST_F(lexer_routines_fixture, read_until_last_char) -{ - static constexpr const char* content = - "very special comment #" - ; - static constexpr const char* expected = - "very special comment "; - read_until_test(content, expected, '#', is_not<'#'>); -} - -//////////////////////////////////////////////////////////////////////// -// trim TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, trim_empty) -{ - static constexpr const char* content = - "" - ; - static constexpr const char* expected = - ""; - trim_test(content, expected); -} - -TEST_F(lexer_routines_fixture, trim_only_leading) -{ - static constexpr const char* content = - " \n\t hello\tworld!" 
- ; - static constexpr const char* expected = - "hello\tworld!"; - trim_test(content, expected); -} - -TEST_F(lexer_routines_fixture, trim_only_trailing) -{ - static constexpr const char* content = - "hello\tworld!\v\r\t\f \n\t " - ; - static constexpr const char* expected = - "hello\tworld!"; - trim_test(content, expected); -} - -TEST_F(lexer_routines_fixture, trim_leading_trailing) -{ - static constexpr const char* content = - "\n \r\t \f hello\tworld!\v\r\t\f \n\t " - ; - static constexpr const char* expected = - "hello\tworld!"; - trim_test(content, expected); -} - -//////////////////////////////////////////////////////////////////////// -// tokenize_text TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, tokenize_text_empty) -{ - static constexpr const char* content = - "" - ; - static constexpr const char* expected = - ""; - - std::string actual(content); - status_t status; - tokenize_text(actual, status); - EXPECT_EQ(status.tokens.size(), static_cast(0)); - EXPECT_EQ(actual, expected); -} - -TEST_F(lexer_routines_fixture, tokenize_text) -{ - static constexpr const char* content = - "\n \r\t \f hello\tworld!\v\r\t\f \n\t " - ; - static constexpr const char* expected = - "hello\tworld!"; - - std::string actual(content); - status_t status; - tokenize_text(actual, status); - tokenize_text_check(actual, status.tokens[0], expected); -} - -// this tests whether text is left in a valid state for the next processing -TEST_F(lexer_routines_fixture, tokenize_text_twice) -{ - static constexpr const char* content_1 = - "\n \r\t \f hello\tworld!\v\r\t\f \n\t " - ; - static constexpr const char* expected_1 = - "hello\tworld!"; - - static constexpr const char* content_2 = - "\n this is docgen!\v\f \n\t " - ; - static constexpr const char* expected_2 = - "this is docgen!"; - - std::string actual(content_1); - status_t status; - tokenize_text(actual, status); // actual cleared, status.tokens updated - - // check first token - tokenize_text_check(actual, status.tokens[0], expected_1); - - // push back content of content_2 - for (size_t i = 0; i < strlen(content_2); ++i) { - actual.push_back(content_2[i]); - } - - tokenize_text(actual, status); - - // only 2 tokens - EXPECT_EQ(status.tokens.size(), static_cast(2)); - - // check second token - tokenize_text_check(actual, status.tokens[1], expected_2); - // check content of first token to test if moving worked correctly - check_token(status.tokens[0].name, symbol_t::TEXT, - status.tokens[0].content, expected_1); -} - -//////////////////////////////////////////////////////////////////////// -// process_char TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, process_char_newline) -{ - static constexpr const char* content = - "\t some text " - ; - static constexpr const char* expected = - "some text"; - - std::string actual(content); - status_t status; - bool res = process_char('\n', actual, status); - process_char_check(res, status, actual, expected, symbol_t::NEWLINE); -} - -TEST_F(lexer_routines_fixture, process_char_semicolon) -{ - static constexpr const char* content = - "\v\t some text \r\v\f \v" - ; - static constexpr const char* expected = - "some text"; - - std::string actual(content); - status_t status; - bool res = process_char(';', actual, status); - process_char_check(res, status, actual, expected, symbol_t::SEMICOLON); -} - -TEST_F(lexer_routines_fixture, process_char_open_brace) -{ - static constexpr const char* 
content = - " \v some text \v" - ; - static constexpr const char* expected = - "some text"; - - std::string actual(content); - status_t status; - bool res = process_char('{', actual, status); - process_char_check(res, status, actual, expected, symbol_t::OPEN_BRACE); -} - -TEST_F(lexer_routines_fixture, process_char_close_brace) -{ - static constexpr const char* content = - " \v some text \v" - ; - static constexpr const char* expected = - "some text"; - - std::string actual(content); - status_t status; - bool res = process_char('}', actual, status); - process_char_check(res, status, actual, expected, symbol_t::CLOSE_BRACE); -} - -TEST_F(lexer_routines_fixture, process_char_default) -{ - static constexpr const char* content = - " \v some text \v" - ; - static constexpr const char* expected = content; - - std::string actual(content); - status_t status; - bool res = process_char('a', actual, status); - - EXPECT_FALSE(res); - EXPECT_EQ(actual, expected); - EXPECT_EQ(status.tokens.size(), static_cast(0)); -} - -//////////////////////////////////////////////////////////////////////// -// tokenize_tag_name TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, tokenize_tag_name_sdesc) -{ - static constexpr const char* content = - "sdesc\t " - ; - static constexpr const char* text_content = - "some text"; - - std::string text(text_content); - write_file(content); - file_reader reader(file); - status_t status; // context is none - tokenize_tag_name(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(2)); - check_token(status.tokens[0].name, symbol_t::TEXT, - status.tokens[0].content, text_content); - check_token(status.tokens[1].name, symbol_t::TAGNAME, - status.tokens[1].content, "sdesc"); - EXPECT_EQ(reader.peek(), '\t'); -} - -TEST_F(lexer_routines_fixture, tokenize_tag_name_param) -{ - static constexpr const char* content = - "param\n \t " - ; - static constexpr const char* text_content = - "some text"; - - std::string text(text_content); - write_file(content); - file_reader reader(file); - status_t status; // context is none - tokenize_tag_name(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(2)); - check_token(status.tokens[0].name, symbol_t::TEXT, - status.tokens[0].content, text_content); - check_token(status.tokens[1].name, symbol_t::TAGNAME, - status.tokens[1].content, "param"); - EXPECT_EQ(reader.peek(), '\n'); -} - -TEST_F(lexer_routines_fixture, tokenize_tag_name_tparam) -{ - static constexpr const char* content = - "tparam\n \t " - ; - static constexpr const char* text_content = - "some text"; - - std::string text(text_content); - write_file(content); - file_reader reader(file); - status_t status; // context is none - tokenize_tag_name(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(2)); - check_token(status.tokens[0].name, symbol_t::TEXT, - status.tokens[0].content, text_content); - check_token(status.tokens[1].name, symbol_t::TAGNAME, - status.tokens[1].content, "tparam"); - EXPECT_EQ(reader.peek(), '\n'); -} - -TEST_F(lexer_routines_fixture, tokenize_tag_name_invalid) -{ - static constexpr const char* content = - "tparram\n \t " - ; - static constexpr const char* text_content = - "some text"; - - std::string text(text_content); - write_file(content); - file_reader reader(file); - status_t status; // context is none - tokenize_tag_name(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(0)); - EXPECT_EQ(text, std::string(text_content) + 
"@tparram"); - EXPECT_EQ(reader.peek(), '\n'); -} - -TEST_F(lexer_routines_fixture, tokenize_tag_name_eof) -{ - static constexpr const char* content = - "tparam" - ; - static constexpr const char* text_content = - "some text"; - - std::string text(text_content); - write_file(content); - file_reader reader(file); - status_t status; // context is none - tokenize_tag_name(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(2)); - check_token(status.tokens[0].name, symbol_t::TEXT, - status.tokens[0].content, text_content); - check_token(status.tokens[1].name, symbol_t::TAGNAME, - status.tokens[1].content, "tparam"); - EXPECT_EQ(reader.peek(), static_cast(file_reader::termination)); -} - -//////////////////////////////////////////////////////////////////////// -// process_tag_name TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, process_tag_name_valid) -{ - static constexpr const char* content = - "param x\tsome int\n" - ; - static constexpr const char* text_content = - " some existing text... \n"; - static constexpr const char* expected_text = - "some existing text..."; - - write_file(content); - file_reader reader(file); - status_t status; - std::string text(text_content); - bool res = process_tag_name('@', text, reader, status); - - EXPECT_TRUE(res); - EXPECT_EQ(status.tokens.size(), static_cast(2)); - tokenize_text_check(text, status.tokens[0], expected_text); - check_token(status.tokens[1].name, symbol_t::TAGNAME, - status.tokens[1].content, "param"); -} - -TEST_F(lexer_routines_fixture, process_tag_name_invalid) -{ - static constexpr const char* content = - "xparam x\tsome int\n" - ; - static constexpr const char* text_content = - " some existing text... \n"; - - write_file(content); - file_reader reader(file); - status_t status; - std::string text(text_content); - bool res = process_tag_name('m', text, reader, status); - - EXPECT_FALSE(res); - EXPECT_EQ(status.tokens.size(), static_cast(0)); -} - -//////////////////////////////////////////////////////////////////////// -// process_line_comment TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, process_line_comment_valid) -{ - static constexpr const char* content = - "/ some special content...\n" - ; - static constexpr const char* text_content = - "\n some text... \t"; - static constexpr const char* expected_text = - "some text..."; - - write_file(content); - file_reader reader(file); - status_t status; - std::string text(text_content); - process_line_comment(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(2)); - tokenize_text_check(text, status.tokens[0], expected_text); - check_token(status.tokens[1].name, symbol_t::BEGIN_LINE_COMMENT, - status.tokens[1].content, ""); -} - -TEST_F(lexer_routines_fixture, process_line_comment_invalid_nospace) -{ - static constexpr const char* content = - "/some special content...\n" - ; - static constexpr const char* text_content = - "\n some text... \t"; - - write_file(content); - file_reader reader(file); - status_t status; - std::string text(text_content); - process_line_comment(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(0)); - EXPECT_EQ(text, text_content); // text unchanged -} - -TEST_F(lexer_routines_fixture, process_line_comment_invalid_noslash) -{ - static constexpr const char* content = - " some special content...\n" - ; - static constexpr const char* text_content = - "\n some text... 
\t"; - - write_file(content); - file_reader reader(file); - status_t status; - std::string text(text_content); - process_line_comment(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(0)); - EXPECT_EQ(text, text_content); // text unchanged -} - -//////////////////////////////////////////////////////////////////////// -// process_block_comment TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, process_block_comment_valid) -{ - static constexpr const char* content = - "! some special content...\n" - ; - static constexpr const char* text_content = - "\n some text... \t"; - static constexpr const char* expected_text = - "some text..."; - - write_file(content); - file_reader reader(file); - status_t status; - std::string text(text_content); - process_block_comment(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(2)); - tokenize_text_check(text, status.tokens[0], expected_text); - check_token(status.tokens[1].name, symbol_t::BEGIN_BLOCK_COMMENT, - status.tokens[1].content, ""); -} - -TEST_F(lexer_routines_fixture, process_block_comment_invalid_nospace) -{ - static constexpr const char* content = - "!some special content...\n" - ; - static constexpr const char* text_content = - "\n some text... \t"; - - write_file(content); - file_reader reader(file); - status_t status; - std::string text(text_content); - process_block_comment(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(0)); - EXPECT_EQ(text, text_content); // text unchanged -} - -TEST_F(lexer_routines_fixture, process_block_comment_invalid_noexclam) -{ - static constexpr const char* content = - " some special content...\n" - ; - static constexpr const char* text_content = - "\n some text... \t"; - - write_file(content); - file_reader reader(file); - status_t status; - std::string text(text_content); - process_block_comment(text, reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(0)); - EXPECT_EQ(text, text_content); // text unchanged -} - -//////////////////////////////////////////////////////////////////////// -// process_tags TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, process_string_invalid_comment) -{ - static constexpr const char* content = - "some content...\n " - ; - static constexpr const char* text_content = - " some text... "; - - write_file(content); - file_reader reader(file); - status_t status; // context is none - std::string text(text_content); - bool res = process_string('/', text, reader, status); - - EXPECT_TRUE(res); - EXPECT_EQ(status.tokens.size(), static_cast(0)); - EXPECT_EQ(text, std::string(text_content) + "/s"); -} - -TEST_F(lexer_routines_fixture, process_string_invalid_slash) -{ - static constexpr const char* content = - "some content...\n " - ; - static constexpr const char* text_content = - " some text... 
"; - - write_file(content); - file_reader reader(file); - status_t status; // context is none - std::string text(text_content); - bool res = process_string('x', text, reader, status); - - EXPECT_FALSE(res); - EXPECT_EQ(status.tokens.size(), static_cast(0)); - EXPECT_EQ(text, text_content); -} - -//////////////////////////////////////////////////////////////////////// -// process TESTS -//////////////////////////////////////////////////////////////////////// - -TEST_F(lexer_routines_fixture, process) -{ - static constexpr const char* content = - "#include // for json\n" - "// this is some comment to ignore\n" - "/* this is another comment to ignore \n*/" - "\n" - " /// description... @sdesc some short description\n" - " /*! @param x some int\n" - " * that we care about\n" - " */" - "inline f(int x);" - "struct A {const char* p = \"@param\"};" - ; - - write_file(content); - file_reader reader(file); - status_t status; // context is none - process(reader, status); - - EXPECT_EQ(status.tokens.size(), static_cast(27)); - check_token(status.tokens[0].name, symbol_t::TEXT, - status.tokens[0].content, "#include "); - check_token(status.tokens[1].name, symbol_t::NEWLINE, - status.tokens[1].content, ""); - check_token(status.tokens[2].name, symbol_t::BEGIN_LINE_COMMENT, - status.tokens[2].content, ""); - check_token(status.tokens[3].name, symbol_t::TEXT, - status.tokens[3].content, "description..."); - EXPECT_EQ(status.tokens[3].leading_ws_count, static_cast(1)); - - check_token(status.tokens[4].name, symbol_t::TAGNAME, - status.tokens[4].content, "sdesc"); - check_token(status.tokens[5].name, symbol_t::TEXT, - status.tokens[5].content, "some short description"); - EXPECT_EQ(status.tokens[5].leading_ws_count, static_cast(2)); - - check_token(status.tokens[6].name, symbol_t::NEWLINE, - status.tokens[6].content, ""); - check_token(status.tokens[7].name, symbol_t::BEGIN_BLOCK_COMMENT, - status.tokens[7].content, ""); - check_token(status.tokens[8].name, symbol_t::TAGNAME, - status.tokens[8].content, "param"); - check_token(status.tokens[9].name, symbol_t::TEXT, - status.tokens[9].content, "x some int"); - EXPECT_EQ(status.tokens[9].leading_ws_count, static_cast(1)); - - check_token(status.tokens[10].name, symbol_t::NEWLINE, - status.tokens[10].content, ""); - check_token(status.tokens[11].name, symbol_t::STAR, - status.tokens[11].content, ""); - check_token(status.tokens[12].name, symbol_t::TEXT, - status.tokens[12].content, "that we care about"); - EXPECT_EQ(status.tokens[12].leading_ws_count, static_cast(1)); - - check_token(status.tokens[13].name, symbol_t::NEWLINE, - status.tokens[13].content, ""); - check_token(status.tokens[14].name, symbol_t::END_BLOCK_COMMENT, - status.tokens[14].content, ""); - check_token(status.tokens[15].name, symbol_t::TEXT, - status.tokens[15].content, "inline f(int x)"); - check_token(status.tokens[16].name, symbol_t::SEMICOLON, - status.tokens[16].content, ""); - check_token(status.tokens[17].name, symbol_t::TEXT, - status.tokens[17].content, "struct A"); - check_token(status.tokens[18].name, symbol_t::OPEN_BRACE, - status.tokens[18].content, ""); - check_token(status.tokens[19].name, symbol_t::TEXT, - status.tokens[19].content, "const char"); - check_token(status.tokens[20].name, symbol_t::STAR, - status.tokens[20].content, ""); - check_token(status.tokens[21].name, symbol_t::TEXT, - status.tokens[21].content, "p = \""); - EXPECT_EQ(status.tokens[21].leading_ws_count, static_cast(1)); - - check_token(status.tokens[22].name, symbol_t::TAGNAME, - 
status.tokens[22].content, "param"); - check_token(status.tokens[23].name, symbol_t::TEXT, - status.tokens[23].content, "\""); - check_token(status.tokens[24].name, symbol_t::CLOSE_BRACE, - status.tokens[24].content, ""); - check_token(status.tokens[25].name, symbol_t::SEMICOLON, - status.tokens[25].content, ""); - check_token(status.tokens[26].name, symbol_t::END_OF_FILE, - status.tokens[26].content, ""); -} - -} // namespace lexer_details -} // namespace core -} // namespace docgen From 68b59ed3582739e1835066848124b3cdce20c288 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 15:08:01 -0500 Subject: [PATCH 09/23] Modify configuration to fix at release points for libs --- configure.sh | 25 +++++++++++++++++++++++-- libs/benchmark | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/configure.sh b/configure.sh index 3b5097a..417a613 100755 --- a/configure.sh +++ b/configure.sh @@ -1,14 +1,35 @@ -#!/bin/sh +#!/bin/bash + +# directory where current shell script resides +PROJECTDIR=$(dirname "$BASH_SOURCE") + +cd "$PROJECTDIR" # If setup.sh was called before if [ -d "libs/benchmark/googletest" ]; then rm -rf libs/benchmark fi +# Initialize submodules if needed +git submodule init # Update submodule if needed git submodule update --remote -# Setup google benchmark and googletest + +# Setup googletest git clone https://github.com/google/googletest.git libs/benchmark/googletest +# Set google test to specific release tag +cd libs/benchmark/googletest +git fetch --all --tags --prune +git checkout tags/release-1.10.0 -b release-1.10.0 +cd - + +# Setup googlebenchmark +cd libs/benchmark +git fetch --all --tags --prune +git checkout tags/v1.5.0 -b v1.5.0 +cd - + +# Build google benchmark cd libs/benchmark mkdir -p build && cd build diff --git a/libs/benchmark b/libs/benchmark index 5ce2429..090faec 160000 --- a/libs/benchmark +++ b/libs/benchmark @@ -1 +1 @@ -Subproject commit 5ce2429af7a8481581896afaa480552cc7584808 +Subproject commit 090faecb454fbd6e6e17a75ef8146acb037118d4 From b242364f4ce8b7c4c5a5a73e0919ce72b378219f Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 15:23:54 -0500 Subject: [PATCH 10/23] Use conan to manage nlohmann/json on linux --- .gitignore | 3 +++ conanfile.txt | 2 ++ 2 files changed, 5 insertions(+) create mode 100644 conanfile.txt diff --git a/.gitignore b/.gitignore index 25871d0..4234b19 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ node_modules build/ +conan* +!conanfile.txt +graph_info.json diff --git a/conanfile.txt b/conanfile.txt new file mode 100644 index 0000000..ca65909 --- /dev/null +++ b/conanfile.txt @@ -0,0 +1,2 @@ +[requires] +nlohmann_json/3.7.3 From e6627fb08ba3da76cf507ce0a8cdd7f8474a2401 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 15:30:55 -0500 Subject: [PATCH 11/23] Add configuration for nlohmann json on linux and mac --- configure.sh | 17 +++++++++++++++++ libs/benchmark | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/configure.sh b/configure.sh index 417a613..3e61983 100755 --- a/configure.sh +++ b/configure.sh @@ -5,6 +5,23 @@ PROJECTDIR=$(dirname "$BASH_SOURCE") cd "$PROJECTDIR" +# Install nlohmann/json +if [[ "$OSTYPE" == "linux-gnu" ]]; then + if [ $(command -v conan) == "" ]; then + echo "config fail: conan not installed" + exit 1 + fi + conan install . 
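    # conan install resolves nlohmann_json/3.7.3 from the conanfile.txt
    # at the repository root (added in the previous patch)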
+elif [[ "$OSTYPE" == "darwin"* ]]; then + if [ $(command -v brew) == "" ]; then + echo "config fail: brew not installed" + exit 1 + fi + brew install nlohmann-json +else + echo "config fail: unrecognizable OS" +fi + # If setup.sh was called before if [ -d "libs/benchmark/googletest" ]; then rm -rf libs/benchmark diff --git a/libs/benchmark b/libs/benchmark index 090faec..daff5fe 160000 --- a/libs/benchmark +++ b/libs/benchmark @@ -1 +1 @@ -Subproject commit 090faecb454fbd6e6e17a75ef8146acb037118d4 +Subproject commit daff5fead3fbe22c6fc58310ca3f49caf117f185 From 4a8a1da75136a9410460bb7a93d6c10586bd7d04 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 16:19:41 -0500 Subject: [PATCH 12/23] Move conan to libs and reconfigure --- .gitignore | 1 + CMakeLists.txt | 4 ++++ conanfile.txt | 2 -- configure.sh | 2 ++ libs/conanfile.txt | 5 +++++ 5 files changed, 12 insertions(+), 2 deletions(-) delete mode 100644 conanfile.txt create mode 100644 libs/conanfile.txt diff --git a/.gitignore b/.gitignore index 4234b19..06736e1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ build/ conan* !conanfile.txt graph_info.json +*Find*.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index c1208dd..697bac1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,10 @@ enable_testing() # Set C++17 standard for project target set(CMAKE_CXX_STANDARD 17) +# Set this such that dependency installation through conan can be found +set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/libs) +message("CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") + # Create DocgenConfigVersion.cmake which contains current project version # This is supposed to help with (major) version compatibility. include(CMakePackageConfigHelpers) diff --git a/conanfile.txt b/conanfile.txt deleted file mode 100644 index ca65909..0000000 --- a/conanfile.txt +++ /dev/null @@ -1,2 +0,0 @@ -[requires] -nlohmann_json/3.7.3 diff --git a/configure.sh b/configure.sh index 3e61983..a641f6c 100755 --- a/configure.sh +++ b/configure.sh @@ -11,7 +11,9 @@ if [[ "$OSTYPE" == "linux-gnu" ]]; then echo "config fail: conan not installed" exit 1 fi + cd libs conan install . 
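    # run from libs/ so conan's cmake_find_package generator drops its
    # Find*.cmake modules there, where CMakeLists.txt now points CMAKE_MODULE_PATH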
+ cd - elif [[ "$OSTYPE" == "darwin"* ]]; then if [ $(command -v brew) == "" ]; then echo "config fail: brew not installed" diff --git a/libs/conanfile.txt b/libs/conanfile.txt new file mode 100644 index 0000000..64b1192 --- /dev/null +++ b/libs/conanfile.txt @@ -0,0 +1,5 @@ +[requires] +nlohmann_json/3.7.3 + +[generators] +cmake_find_package From 58e882430e9ec9789c62ad59d3f7acefaf3d4fca Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 16:37:23 -0500 Subject: [PATCH 13/23] Add cstring header for strerror --- src/exceptions/exceptions.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/exceptions/exceptions.hpp b/src/exceptions/exceptions.hpp index b4fc786..8dbe010 100644 --- a/src/exceptions/exceptions.hpp +++ b/src/exceptions/exceptions.hpp @@ -2,6 +2,7 @@ #include #include +#include #include #include From a89bd2f89f52d3f30eb37830ff5e426871c89e7f Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 17:24:20 -0500 Subject: [PATCH 14/23] Reimplement TrieNode to have uniqueptr of children --- src/core/lex/trie.hpp | 31 +++++++----- test/core/lex/trie_unittest.cpp | 83 +++++++++++++-------------------- 2 files changed, 52 insertions(+), 62 deletions(-) diff --git a/src/core/lex/trie.hpp b/src/core/lex/trie.hpp index eabe4ce..2df5008 100644 --- a/src/core/lex/trie.hpp +++ b/src/core/lex/trie.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace docgen { @@ -34,7 +35,7 @@ struct Trie void transition(char c); void back_transition(); bool is_accept() const; - std::unordered_map& get_children(); + typename TrieNode::children_t& get_children(); bool is_reset() const; void reset(); const std::optional& get_symbol() const; @@ -43,6 +44,8 @@ struct Trie struct TrieNode { + using children_t = std::unordered_map>; + // Insert str from current node to update the trie structure. // The string str is read starting from idx. void insert(const std::pair&, size_t = 0); @@ -54,7 +57,7 @@ struct Trie // Symbol will be active if is_accept is true. 
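    // (insert() marks the node terminating a registered string as accepting
    // and stores that string's symbol there, so get_symbol() is meaningful
    // exactly when is_accept() is true.)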
const std::optional& get_symbol() const; - std::unordered_map& get_children(); + children_t& get_children(); std::optional> get_parent(); @@ -65,10 +68,10 @@ struct Trie non_accept }; - State state_ = State::non_accept; // indicates accepting node or not - std::optional symbol_; // symbol for accepting node - std::unordered_map children_; // current node's children - TrieNode* parent_ptr_; // current node's parent + State state_ = State::non_accept; // indicates accepting node or not + std::optional symbol_; // symbol for accepting node + children_t children_; // current node's children + TrieNode* parent_ptr_; // current node's parent }; TrieNode root_; // root of Trie @@ -92,9 +95,13 @@ Trie::TrieNode::insert(const pair_t& pair, size_t idx) } else { - auto& child = children_[str[idx]]; - child.parent_ptr_ = this; - child.insert(pair, idx + 1); + // if no child with str[idx] mapping + if (children_.find(str[idx]) == children_.end()) { + children_.emplace(str[idx], std::make_unique()); + } + auto& child = children_.at(str[idx]); + child->parent_ptr_ = this; + child->insert(pair, idx + 1); } } @@ -113,7 +120,7 @@ Trie::TrieNode::get_symbol() const } template -inline std::unordered_map::TrieNode>& +inline typename Trie::TrieNode::children_t& Trie::TrieNode::get_children() { return children_; @@ -150,7 +157,7 @@ template inline void Trie::transition(char c) { - curr_node_ = curr_node_.get().get_children().at(c); + curr_node_ = *(curr_node_.get().get_children().at(c)); } template @@ -161,7 +168,7 @@ Trie::is_accept() const } template -inline std::unordered_map::TrieNode>& +inline typename Trie::TrieNode::children_t& Trie::get_children() { return curr_node_.get().get_children(); diff --git a/test/core/lex/trie_unittest.cpp b/test/core/lex/trie_unittest.cpp index bc8d483..7fb1eb9 100644 --- a/test/core/lex/trie_unittest.cpp +++ b/test/core/lex/trie_unittest.cpp @@ -35,9 +35,8 @@ TEST_F(trie_fixture, trie_root) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(trie.is_reset()); - auto& children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('a'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('a'), trie.get_children().end()); // found } TEST_F(trie_fixture, trie_transition_child_a) @@ -55,10 +54,9 @@ TEST_F(trie_fixture, trie_transition_child_a) EXPECT_TRUE(trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - auto& children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(2)); - EXPECT_NE(children.find('b'), children.end()); // found - EXPECT_NE(children.find('c'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(2)); + EXPECT_NE(trie.get_children().find('b'), trie.get_children().end()); // found + EXPECT_NE(trie.get_children().find('c'), trie.get_children().end()); // found } TEST_F(trie_fixture, trie_transition_child_b) @@ -76,9 +74,8 @@ TEST_F(trie_fixture, trie_transition_child_b) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - auto& children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('c'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('c'), trie.get_children().end()); // found } TEST_F(trie_fixture, trie_transition_child_bc) @@ -98,8 +95,7 @@ TEST_F(trie_fixture, trie_transition_child_bc) EXPECT_TRUE(trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - auto& children = trie.get_children(); - 
EXPECT_EQ(children.size(), static_cast(0)); + EXPECT_EQ(trie.get_children().size(), static_cast(0)); } TEST_F(trie_fixture, trie_transition_child_c) @@ -118,8 +114,7 @@ TEST_F(trie_fixture, trie_transition_child_c) EXPECT_TRUE(trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - auto& children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(0)); + EXPECT_EQ(trie.get_children().size(), static_cast(0)); } TEST_F(trie_fixture, trie_reset_root) @@ -136,9 +131,8 @@ TEST_F(trie_fixture, trie_reset_root) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(trie.is_reset()); - auto& children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('a'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('a'), trie.get_children().end()); // found } TEST_F(trie_fixture, trie_reset_child_a) @@ -157,9 +151,8 @@ TEST_F(trie_fixture, trie_reset_child_a) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(trie.is_reset()); - auto& children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('a'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('a'), trie.get_children().end()); // found } TEST_F(trie_fixture, trie_reset_child_a_b) @@ -179,9 +172,8 @@ TEST_F(trie_fixture, trie_reset_child_a_b) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(trie.is_reset()); - auto& children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('a'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('a'), trie.get_children().end()); // found } TEST_F(trie_fixture, trie_back_transition_root) @@ -210,9 +202,8 @@ TEST_F(trie_fixture, trie_back_transition_child_a) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(trie.is_reset()); - auto& children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('a'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('a'), trie.get_children().end()); // found } TEST_F(trie_fixture, trie_back_transition_child_ab) @@ -233,10 +224,9 @@ TEST_F(trie_fixture, trie_back_transition_child_ab) EXPECT_TRUE(trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - auto children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(2)); - EXPECT_NE(children.find('b'), children.end()); // found - EXPECT_NE(children.find('c'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(2)); + EXPECT_NE(trie.get_children().find('b'), trie.get_children().end()); // found + EXPECT_NE(trie.get_children().find('c'), trie.get_children().end()); // found // back to root trie.back_transition(); @@ -245,9 +235,8 @@ TEST_F(trie_fixture, trie_back_transition_child_ab) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(trie.is_reset()); - children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('a'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('a'), trie.get_children().end()); // found } //////////////////////////////////////////// @@ -266,10 +255,9 @@ TEST_F(trie_fixture, trie_off_by_one_prefix) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(trie.is_reset()); - auto children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(2)); - 
EXPECT_NE(children.find('a'), children.end()); // found - EXPECT_NE(children.find('b'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(2)); + EXPECT_NE(trie.get_children().find('a'), trie.get_children().end()); // found + EXPECT_NE(trie.get_children().find('b'), trie.get_children().end()); // found // check child 'a' trie.transition('a'); @@ -278,9 +266,8 @@ TEST_F(trie_fixture, trie_off_by_one_prefix) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('b'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('b'), trie.get_children().end()); // found // check child 'a'->'b' trie.transition('b'); @@ -289,8 +276,7 @@ TEST_F(trie_fixture, trie_off_by_one_prefix) EXPECT_TRUE(trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(0)); + EXPECT_EQ(trie.get_children().size(), static_cast(0)); // up to child 'a' trie.back_transition(); @@ -305,9 +291,8 @@ TEST_F(trie_fixture, trie_off_by_one_prefix) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('a'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('a'), trie.get_children().end()); // found // check child 'b'->'a' trie.transition('a'); @@ -316,9 +301,8 @@ TEST_F(trie_fixture, trie_off_by_one_prefix) EXPECT_TRUE(!trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(1)); - EXPECT_NE(children.find('b'), children.end()); // found + EXPECT_EQ(trie.get_children().size(), static_cast(1)); + EXPECT_NE(trie.get_children().find('b'), trie.get_children().end()); // found // check child 'b'->'a'->'b' trie.transition('b'); @@ -327,8 +311,7 @@ TEST_F(trie_fixture, trie_off_by_one_prefix) EXPECT_TRUE(trie.is_accept()); EXPECT_TRUE(!trie.is_reset()); - children = trie.get_children(); - EXPECT_EQ(children.size(), static_cast(0)); + EXPECT_EQ(trie.get_children().size(), static_cast(0)); } } // namespace lex From 954f449a23dbe5c807b3424b67dbbf0dbd02d743 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 17:31:11 -0500 Subject: [PATCH 15/23] Remove io unittests for now --- test/CMakeLists.txt | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d5ab8e4..125e82f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -27,18 +27,18 @@ create_test("exceptions_unittests" exceptions_unittests) # File IO Unit Tests ###################################################### -add_executable(io_unittests - ${CMAKE_CURRENT_SOURCE_DIR}/io/file_reader_unittest.cpp - ) - -create_test("io_unittests" io_unittests) - -# copy data directory into where io_unittests executable ends up -add_custom_command( - TARGET io_unittests POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory - ${CMAKE_CURRENT_SOURCE_DIR}/io/io_data/ - $/io_data) +#add_executable(io_unittests +# ${CMAKE_CURRENT_SOURCE_DIR}/io/file_reader_unittest.cpp +# ) +# +#create_test("io_unittests" io_unittests) +# +## copy data directory into where io_unittests executable ends up +#add_custom_command( +# TARGET io_unittests POST_BUILD +# COMMAND ${CMAKE_COMMAND} -E 
copy_directory +# ${CMAKE_CURRENT_SOURCE_DIR}/io/io_data/ +# $/io_data) ###################################################### # Core Unit Tests From 12385f85301376c291c2ba8f0ecbaa9b4713f9a6 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 17:44:58 -0500 Subject: [PATCH 16/23] Update google benchmark --- libs/benchmark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/benchmark b/libs/benchmark index daff5fe..090faec 160000 --- a/libs/benchmark +++ b/libs/benchmark @@ -1 +1 @@ -Subproject commit daff5fead3fbe22c6fc58310ca3f49caf117f185 +Subproject commit 090faecb454fbd6e6e17a75ef8146acb037118d4 From 782126a063615221eb3e7c7e80e260f4537c517a Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 17:45:45 -0500 Subject: [PATCH 17/23] Update google benchmark again --- libs/benchmark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/benchmark b/libs/benchmark index 090faec..daff5fe 160000 --- a/libs/benchmark +++ b/libs/benchmark @@ -1 +1 @@ -Subproject commit 090faecb454fbd6e6e17a75ef8146acb037118d4 +Subproject commit daff5fead3fbe22c6fc58310ca3f49caf117f185 From 1456ef67fae0295a118359e609c07ff45e722efc Mon Sep 17 00:00:00 2001 From: James Yang Date: Sat, 18 Jan 2020 17:53:57 -0500 Subject: [PATCH 18/23] Update configure to recursively update submodule --- configure.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.sh b/configure.sh index a641f6c..363eecd 100755 --- a/configure.sh +++ b/configure.sh @@ -30,9 +30,9 @@ if [ -d "libs/benchmark/googletest" ]; then fi # Initialize submodules if needed -git submodule init +git submodule update --init # Update submodule if needed -git submodule update --remote +git submodule update --recursive --remote # Setup googletest git clone https://github.com/google/googletest.git libs/benchmark/googletest From 27f7d05b58129286a1d077aba16061a78ccdfea2 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sun, 19 Jan 2020 09:32:04 -0500 Subject: [PATCH 19/23] Add legacy files for benchmarking --- src/core/lex/legacy/lexer.hpp | 37 +++ src/core/lex/legacy/lexer_routines.hpp | 301 +++++++++++++++++++++++++ src/core/lex/legacy/status.hpp | 17 ++ src/core/lex/legacy/symbol.hpp | 44 ++++ src/core/{ => lex/legacy}/token.hpp | 10 +- 5 files changed, 407 insertions(+), 2 deletions(-) create mode 100644 src/core/lex/legacy/lexer.hpp create mode 100644 src/core/lex/legacy/lexer_routines.hpp create mode 100644 src/core/lex/legacy/status.hpp create mode 100644 src/core/lex/legacy/symbol.hpp rename src/core/{ => lex/legacy}/token.hpp (72%) diff --git a/src/core/lex/legacy/lexer.hpp b/src/core/lex/legacy/lexer.hpp new file mode 100644 index 0000000..56f647e --- /dev/null +++ b/src/core/lex/legacy/lexer.hpp @@ -0,0 +1,37 @@ +#pragma once +#include "lexer_routines.hpp" + +namespace docgen { +namespace core { + +struct Lexer +{ + using symbol_t = lexer_details::symbol_t; + using file_reader = lexer_details::file_reader; + using status_t = lexer_details::status_t; + + Lexer(FILE* file) + : reader_(file) + { + status_.tokens.reserve(DEFAULT_TOKEN_ARR_SIZE); + } + + void process() + { + lexer_details::process(reader_, status_); + } + + const status_t::token_arr_t& get_tokens() const + { + return status_.tokens; + } + +private: + static constexpr size_t DEFAULT_TOKEN_ARR_SIZE = 50; + + file_reader reader_; + status_t status_; // keeps track of last token value (enum) +}; + +} // namespace core +} // namespace docgen diff --git a/src/core/lex/legacy/lexer_routines.hpp 
b/src/core/lex/legacy/lexer_routines.hpp
new file mode 100644
index 0000000..d6e8dd0
--- /dev/null
+++ b/src/core/lex/legacy/lexer_routines.hpp
@@ -0,0 +1,301 @@
+#pragma once
+#include "core/lex/legacy/token.hpp"
+#include "core/lex/legacy/status.hpp"
+#include "core/lex/legacy/symbol.hpp"
+#include "io/file_reader.hpp"
+#include "core/tag_set.hpp"
+
+namespace docgen {
+namespace core {
+namespace lex {
+namespace legacy {
+
+static constexpr size_t DEFAULT_STRING_RESERVE_SIZE = 50;
+
+using file_reader = io::file_reader;
+using symbol_t = Symbol;
+using token_t = Token<symbol_t>;
+using status_t = Status<token_t>;
+
+// Reads and ignores chars until func(c) evaluates to false or reading terminates,
+// where c is the current char read.
+// Returns the last char read that terminated the function.
+template <class Termination>
+inline int ignore_until(file_reader& reader, Termination func)
+{
+    int c = 0;
+    while (((c = reader.read()) != file_reader::termination) && func(c));
+    return c;
+}
+
+// Reads and stores chars until func(c) evaluates to false or reading terminates,
+// where c is the current char read.
+// Returns the last char read that terminated the function.
+template <class Termination>
+inline int read_until(file_reader& reader, Termination func, std::string& line)
+{
+    int c = 0;
+    line.reserve(DEFAULT_STRING_RESERVE_SIZE);
+    while (((c = reader.read()) != file_reader::termination) && func(c)) {
+        line.push_back(c);
+    }
+    return c;
+}
+
+// Trims all leading and trailing whitespaces (one of " \t\n\v\f\r") from line.
+// Line is directly modified.
+// Returns leading whitespace count of original line.
+inline uint32_t trim(std::string& line)
+{
+    static constexpr const char* whitespaces = " \t\n\v\f\r";
+
+    // find first non-whitespace
+    const auto begin = line.find_first_not_of(whitespaces);
+
+    // find last non-whitespace
+    const auto end = line.find_last_not_of(whitespaces);
+
+    // If the substring is invalid, simply clear line and return the length of the string.
+    // By symmetry, begin and end will be npos if and only if the string only
+    // consists of whitespaces. In this case, the leading whitespace count is
+    // simply the length of the string.
+    if (begin == std::string::npos && end == std::string::npos) {
+        uint32_t leading_ws_count = line.size();
+        line.clear();
+        return leading_ws_count;
+    }
+
+    // otherwise, replace with substring
+    line = line.substr(begin, end - begin + 1);
+
+    return begin; // number of leading whitespaces
+}
+
+// Trims text, tokenizes it, clears it, and reserves DEFAULT_STRING_RESERVE_SIZE.
+// (Trimmed) text is only tokenized if it is non-empty.
+inline void tokenize_text(std::string& text, status_t& status)
+{
+    // trim whitespaces from text first
+    uint32_t leading_whitespace_count = trim(text);
+    // tokenize current TEXT only if it is non-empty
+    if (!text.empty()) {
+        status.tokens.emplace_back(symbol_t::TEXT, std::move(text), leading_whitespace_count);
+    }
+    // clear and reserve
+    text.clear();
+    text.reserve(DEFAULT_STRING_RESERVE_SIZE);
+}
+
+// If c is one of the single-char special tokens (see symbol.hpp),
+// then text is tokenized first, followed by the single-char special token.
+// The tokens are appended to status.tokens in this order.
+// Otherwise, no operations are performed.
+// Returns true if and only if a single-char special token was created.
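+// Example: with text == "ab", process_char('\n', text, status) appends
+// TEXT("ab") and then NEWLINE to status.tokens and returns true; for a
+// non-special c such as 'x', nothing happens and false is returned.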
+inline bool process_char(int c, std::string& text, status_t& status) +{ + switch (c) { + case '\n': + tokenize_text(text, status); + status.tokens.emplace_back(symbol_t::NEWLINE); + return true; + case ';': + tokenize_text(text, status); + status.tokens.emplace_back(symbol_t::SEMICOLON); + return true; + case '{': + tokenize_text(text, status); + status.tokens.emplace_back(symbol_t::OPEN_BRACE); + return true; + case '}': + tokenize_text(text, status); + status.tokens.emplace_back(symbol_t::CLOSE_BRACE); + return true; + default: + return false; + } +} + +// If tag name is not a valid one, assume it is simply text. +// It is expected that the caller immediately read "@" before calling. +inline void tokenize_tag_name(std::string& text, file_reader& reader, status_t& status) +{ + static constexpr const auto is_alpha = + [](char x) {return isalpha(x);}; + + // parse tag + std::string tagname; + int c = read_until(reader, is_alpha, tagname); + reader.back(c); + + // if valid tag, append text token then token with tag name + if (tag_set.find(tagname) != tag_set.end()) { + tokenize_text(text, status); + status.tokens.emplace_back(symbol_t::TAGNAME, std::move(tagname)); + } + + // otherwise, assume part of text: append "@" then tag name to text + else { + text.push_back('@'); + text.append(tagname); + } +} + +// If c is '@', try to tokenize tag name. +// Behavior is the same as tokenize_tag_name. +// Returns true if and only if c is '@'. +inline bool process_tag_name(int c, std::string& text, + file_reader& reader, status_t& status) +{ + if (c == '@') { + tokenize_tag_name(text, reader, status); + return true; + } + return false; +} + +// It is expected that caller has read the string "//" immediately before calling. +inline void process_line_comment(std::string& text, file_reader& reader, status_t& status) +{ + static constexpr const auto is_not_newline = + [](char x) {return x != '\n';}; + + int c = reader.read(); + + if (c == '/') { + c = reader.read(); + // valid triple-slash comment + if (isspace(c)) { + tokenize_text(text, status); + status.tokens.emplace_back(symbol_t::BEGIN_LINE_COMMENT); + reader.back(c); // in case it's a single-char token + } + // invalid triple-slash comment + else { + // no need to read back since c cannot be a whitespace and we ignore anyway + ignore_until(reader, is_not_newline); + } + } + + // invalid triple-slash comment + else { + reader.back(c); // the character just read may be '\n' + ignore_until(reader, is_not_newline); + } +} + +// It is expected that caller has read the string "/*" immediately before calling. +inline void process_block_comment(std::string& text, file_reader& reader, status_t& status) +{ + const auto is_not_end_block = + [&](char x) {return (x != '*') || (reader.peek() != '/');}; + + int c = reader.read(); + + if (c == '!') { + c = reader.read(); + // valid block comment: tokenize text then begin block comment symbol + if (isspace(c)) { + tokenize_text(text, status); + status.tokens.emplace_back(symbol_t::BEGIN_BLOCK_COMMENT); + reader.back(c); // may be special single-char token + } + // regular block comment: ignore text until end and stop tokenizing + else { + ignore_until(reader, is_not_end_block); + reader.read(); // read the '/' + } + } + + // regular block comment + else { + ignore_until(reader, is_not_end_block); // stops after reading '*' in "*/" + reader.read(); // read the '/' after + } +} + +// If c is not '/' or '*', then no operation done and returns false. 
+// If c is '/', and if it's a possible line comment ("//") then same as process_line_comment; +// if it's a possible block comment ("/*") then same as process_block_comment; +// otherwise, text is updated to include all characters read. +// +// If c is '*', and if it is the ending of a block comment ("*/"), text tokenized then END_BLOCK_COMMENT; +// otherwise, text tokenized then STAR. +// +// In any case, returns true if first char has been processed. +inline bool process_string(int c, std::string& text, + file_reader& reader, status_t& status) +{ + // possibly beginning of line or block comment + if (c == '/') { + c = reader.read(); + if (c == '/') { + process_line_comment(text, reader, status); + } + else if (c == '*') { + process_block_comment(text, reader, status); + } + else { + text.push_back('/'); + text.push_back(c); + } + return true; + } + + // possibly ending block comment or a star that can be ignored in the middle of a block comment + else if (c == '*') { + c = reader.read(); + if (c == '/') { + tokenize_text(text, status); + status.tokens.emplace_back(symbol_t::END_BLOCK_COMMENT); + } + else { + tokenize_text(text, status); + status.tokens.emplace_back(symbol_t::STAR); + reader.back(c); + } + return true; + } + + return false; +} + +inline void process(file_reader& reader, status_t& status) +{ + std::string text; + text.reserve(DEFAULT_STRING_RESERVE_SIZE); + int c = 0; + bool processed = false; + + while ((c = reader.read()) != file_reader::termination) { + + // process special single-char + processed = process_char(c, text, status); + if (processed) { + continue; + } + + // process tag name + processed = process_tag_name(c, text, reader, status); + if (processed) { + continue; + } + + // process string tokens + processed = process_string(c, text, reader, status); + if (processed) { + continue; + } + + // otherwise, no special symbol -> push to text + text.push_back(c); + } + + // tokenize last text then EOF + tokenize_text(text, status); + status.tokens.emplace_back(token_t::symbol_t::END_OF_FILE); +} + +} // namespace legacy +} // namespace lex +} // namespace core +} // namespace docgen diff --git a/src/core/lex/legacy/status.hpp b/src/core/lex/legacy/status.hpp new file mode 100644 index 0000000..d29dda4 --- /dev/null +++ b/src/core/lex/legacy/status.hpp @@ -0,0 +1,17 @@ +#pragma once +#include + +namespace docgen { +namespace core { + +template +struct Status +{ + using token_t = TokenType; + using token_arr_t = std::vector; + + token_arr_t tokens; +}; + +} // namespace core +} // namespace docgen diff --git a/src/core/lex/legacy/symbol.hpp b/src/core/lex/legacy/symbol.hpp new file mode 100644 index 0000000..4eb1126 --- /dev/null +++ b/src/core/lex/legacy/symbol.hpp @@ -0,0 +1,44 @@ +#pragma once +#include +#include +#include + +namespace docgen { +namespace core { +namespace lex { +namespace legacy { + +enum class Symbol { + // single-char tokens + END_OF_FILE, + NEWLINE, + SEMICOLON, + STAR, + OPEN_BRACE, + CLOSE_BRACE, + // string tokens + BEGIN_LINE_COMMENT, + BEGIN_BLOCK_COMMENT, + END_BLOCK_COMMENT, + // special tags + TAGNAME, + // default + TEXT +}; + +// Compile-time mapping of strings to corresponding symbol +static MAPBOX_ETERNAL_CONSTEXPR const auto symbol_map = + mapbox::eternal::map({ + {Symbol::SEMICOLON, ";"}, + {Symbol::STAR, "*"}, + {Symbol::OPEN_BRACE, "{"}, + {Symbol::CLOSE_BRACE, "}"}, + {Symbol::BEGIN_LINE_COMMENT, "///"}, + {Symbol::BEGIN_BLOCK_COMMENT, "/*!"}, + {Symbol::END_BLOCK_COMMENT, "*/"}, + }); + +} // namespace legacy +} // namespace 
lex +} // namespace core +} // namespace docgen diff --git a/src/core/token.hpp b/src/core/lex/legacy/token.hpp similarity index 72% rename from src/core/token.hpp rename to src/core/lex/legacy/token.hpp index 4ef3798..72edcaa 100644 --- a/src/core/token.hpp +++ b/src/core/lex/legacy/token.hpp @@ -1,18 +1,21 @@ #pragma once #include -#include +#include "core/symbol.hpp" namespace docgen { namespace core { +namespace lex { +namespace legacy { template struct Token { using symbol_t = SymbolType; - Token(symbol_t name, std::string&& content) + Token(symbol_t name, std::string&& content, uint32_t leading_ws_count=0) : name(name) , content(std::move(content)) + , leading_ws_count(leading_ws_count) {} Token(symbol_t name) @@ -24,6 +27,7 @@ struct Token symbol_t name; std::string content; + uint32_t leading_ws_count; }; template <> @@ -33,5 +37,7 @@ inline const char* Token::c_str() const symbol_map.at(name).c_str() : content.c_str(); } +} +} // namespace lex } // namespace core } // namespace docgen From 87b49ad76aa52b5102765d590a44398623b60fe8 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sun, 19 Jan 2020 09:35:33 -0500 Subject: [PATCH 20/23] Readd new token.hpp --- src/core/token.hpp | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 src/core/token.hpp diff --git a/src/core/token.hpp b/src/core/token.hpp new file mode 100644 index 0000000..4ef3798 --- /dev/null +++ b/src/core/token.hpp @@ -0,0 +1,37 @@ +#pragma once +#include +#include + +namespace docgen { +namespace core { + +template +struct Token +{ + using symbol_t = SymbolType; + + Token(symbol_t name, std::string&& content) + : name(name) + , content(std::move(content)) + {} + + Token(symbol_t name) + : Token(name, "") + {} + + // left undefined for SymbolType != Symbol + const char* c_str() const; + + symbol_t name; + std::string content; +}; + +template <> +inline const char* Token::c_str() const +{ + return (symbol_map.find(name) != symbol_map.end()) ? + symbol_map.at(name).c_str() : content.c_str(); +} + +} // namespace core +} // namespace docgen From 73210ceb2a9866b750feab8a525e2e59f22b7160 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sun, 19 Jan 2020 10:54:56 -0500 Subject: [PATCH 21/23] Modifying configuration and benchmark in cmake --- CMakeLists.txt | 15 +- benchmark/CMakeLists.txt | 24 + benchmark/core/lex/data/data_1.txt | 113 ++++ benchmark/core/lex/data/data_2.txt | 211 +++++++ benchmark/core/lex/data/data_3.txt | 803 +++++++++++++++++++++++++ benchmark/core/lex/lexer_benchmark.cpp | 40 ++ configure.sh | 15 +- src/CMakeLists.txt | 1 + test/CMakeLists.txt | 1 + 9 files changed, 1210 insertions(+), 13 deletions(-) create mode 100644 benchmark/CMakeLists.txt create mode 100644 benchmark/core/lex/data/data_1.txt create mode 100644 benchmark/core/lex/data/data_2.txt create mode 100644 benchmark/core/lex/data/data_3.txt create mode 100644 benchmark/core/lex/lexer_benchmark.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 697bac1..7944435 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,15 +6,12 @@ project("Docgen" VERSION 1.0.0 # This will perform memcheck include(CTest) -# This is to make this library portable to other machines. -# This will be used for install. -include(GNUInstallDirs) - # enables testing enable_testing() -# Set C++17 standard for project target -set(CMAKE_CXX_STANDARD 17) +# This is to make this library portable to other machines. +# This will be used for install. 
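+# (GNUInstallDirs defines the standard CMAKE_INSTALL_* path variables
+# that install() rules consume.)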
+include(GNUInstallDirs) # Set this such that dependency installation through conan can be found set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/libs) @@ -58,11 +55,17 @@ set(ETERNAL_DIR ${PROJECT_SOURCE_DIR}/libs/eternal) # find json library find_package(nlohmann_json 3.2.0 REQUIRED) +# find google benchmark +find_package(benchmark REQUIRED) + # add libs subdirectory add_subdirectory(${PROJECT_SOURCE_DIR}/libs ${PROJECT_BINARY_DIR}/libs) # add src subdirectory add_subdirectory(${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src) +# add benchmark subdirectory +add_subdirectory(${PROJECT_SOURCE_DIR}/benchmark ${PROJECT_BINARY_DIR}/benchmark) + # add test subdirectory add_subdirectory(${PROJECT_SOURCE_DIR}/test ${PROJECT_BINARY_DIR}/test) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 0000000..fe431a0 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,24 @@ +add_executable(lexer_benchmark + ${CMAKE_CURRENT_SOURCE_DIR}/core/lex/lexer_benchmark.cpp + # Source dependency + $ + ) +target_compile_features(lexer_benchmark PRIVATE cxx_std_17) +target_include_directories(lexer_benchmark PRIVATE + ${GBENCH_DIR}/include + ${PROJECT_SOURCE_DIR}/src + ${ETERNAL_DIR}/include + ) +target_link_libraries(lexer_benchmark PRIVATE + benchmark::benchmark + benchmark::benchmark_main + pthread + nlohmann_json::nlohmann_json + ) + +# copy data directory into where lexer_benchmark executable ends up +add_custom_command( + TARGET lexer_benchmark POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_CURRENT_SOURCE_DIR}/core/lex/data/ + $/data) diff --git a/benchmark/core/lex/data/data_1.txt b/benchmark/core/lex/data/data_1.txt new file mode 100644 index 0000000..2cb7356 --- /dev/null +++ b/benchmark/core/lex/data/data_1.txt @@ -0,0 +1,113 @@ +#include + +namespace docgen { +namespace core { +namespace lex { + +/////////////////////////////////// +// Lexer Implementation +/////////////////////////////////// + +Lexer::Lexer() + : trie_({ + {"\n", Symbol::NEWLINE}, + {" ", Symbol::WHITESPACE}, + {"\t", Symbol::WHITESPACE}, + {"\v", Symbol::WHITESPACE}, + {"\r", Symbol::WHITESPACE}, + {"\f", Symbol::WHITESPACE}, + {";", Symbol::SEMICOLON}, + {"#", Symbol::HASHTAG}, + {"*", Symbol::STAR}, + {"{", Symbol::OPEN_BRACE}, + {"}", Symbol::CLOSE_BRACE}, + {"///", Symbol::BEGIN_SLINE_COMMENT}, + {"/*!", Symbol::BEGIN_SBLOCK_COMMENT}, + {"//", Symbol::BEGIN_NLINE_COMMENT}, + {"/*", Symbol::BEGIN_NBLOCK_COMMENT}, + {"*/", Symbol::END_BLOCK_COMMENT}, + {"@sdesc", Symbol::SDESC}, + {"@tparam", Symbol::TPARAM}, + {"@param", Symbol::PARAM}, + {"@return", Symbol::RETURN} + }) +{} + +void Lexer::process(char c) +{ + this->update_state(); + + auto it = trie_.get_children().find(c); + + // if transition exists + if (it != trie_.get_children().end()) { + buf_.push_back(c); + trie_.transition(c); + return; + } + + // otherwise, no transition exists + + // if not backtracking + if (!this->is_backtracking()) { + // if trie at root + if (trie_.is_reset()) { + text_.push_back(c); + return; + } + text_.append(buf_); + buf_.clear(); + trie_.reset(); + return this->process(c); + } + + // otherwise, currently backtracking + this->backtrack(c); +} + +void Lexer::backtrack(char c) +{ + // tokenize text + this->tokenize_text(); + + // tokenize symbol + for (uint32_t i = 0; i < buf_.size(); ++i) { + trie_.back_transition(); + } + assert(trie_.is_accept()); + auto opt_symbol = trie_.get_symbol(); + assert(static_cast(opt_symbol)); + status_.tokens.emplace(*opt_symbol); + + // move and clear buf_ 
to temp string for reprocessing + std::string reprocess_str(std::move(buf_)); + reprocess_str.push_back(c); + + // reset + this->reset(); + + // reprocess the rest + for (char c : reprocess_str) { + this->process(c); + } +} + +void Lexer::flush() +{ + this->update_state(); + + if (this->is_backtracking()) { + return this->backtrack(0); + } + + // non-backtracking: no parent is an accepting node + // append buf_ to text_ and tokenize text_ + // reset all other fields + text_.append(buf_); + this->tokenize_text(); + this->reset(); +} + +} // namespace lex +} // namespace core +} // namespace docgen diff --git a/benchmark/core/lex/data/data_2.txt b/benchmark/core/lex/data/data_2.txt new file mode 100644 index 0000000..2df5008 --- /dev/null +++ b/benchmark/core/lex/data/data_2.txt @@ -0,0 +1,211 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace docgen { +namespace core { +namespace lex { + +template +struct Trie +{ +private: + struct TrieNode; // forward declaration + +public: + using pair_t = std::pair; + + // Constructs trie node from a list of pairs of string and symbol. + // The string must be of type std::string_view and it must not be empty. + // The symbol must be of type SymbolType. + Trie(const std::initializer_list&); + + // Delete compiler-generated copy/move ctor/assignment + // This ensures that Trie objects are only (default) constructible. + Trie(const Trie&) =delete; + Trie(Trie&&) =delete; + Trie& operator=(const Trie&) =delete; + Trie& operator=(Trie&&) =delete; + + void transition(char c); + void back_transition(); + bool is_accept() const; + typename TrieNode::children_t& get_children(); + bool is_reset() const; + void reset(); + const std::optional& get_symbol() const; + +private: + + struct TrieNode + { + using children_t = std::unordered_map>; + + // Insert str from current node to update the trie structure. + // The string str is read starting from idx. + void insert(const std::pair&, size_t = 0); + + // Returns if current node is an accepting state. + bool is_accept() const; + + // Returns the optional symbol associated with current node. + // Symbol will be active if is_accept is true. 
+ const std::optional& get_symbol() const; + + children_t& get_children(); + + std::optional> get_parent(); + + private: + + enum class State : bool { + accept, + non_accept + }; + + State state_ = State::non_accept; // indicates accepting node or not + std::optional symbol_; // symbol for accepting node + children_t children_; // current node's children + TrieNode* parent_ptr_; // current node's parent + }; + + TrieNode root_; // root of Trie + std::reference_wrapper curr_node_ = root_; // current node +}; + +//////////////////////////////////////////////////////////////// +// TrieNode Implementation +//////////////////////////////////////////////////////////////// + +template +inline void +Trie::TrieNode::insert(const pair_t& pair, size_t idx) +{ + const auto& str = std::get<0>(pair); + + // if string starting from idx is empty, then accepting state + if (str[idx] == '\0') { + state_ = State::accept; + symbol_ = std::get<1>(pair); + } + + else { + // if no child with str[idx] mapping + if (children_.find(str[idx]) == children_.end()) { + children_.emplace(str[idx], std::make_unique()); + } + auto& child = children_.at(str[idx]); + child->parent_ptr_ = this; + child->insert(pair, idx + 1); + } +} + +template +inline bool +Trie::TrieNode::is_accept() const +{ + return state_ == State::accept; +} + +template +inline const std::optional& +Trie::TrieNode::get_symbol() const +{ + return symbol_; +} + +template +inline typename Trie::TrieNode::children_t& +Trie::TrieNode::get_children() +{ + return children_; +} + +template +inline std::optional::TrieNode>> +Trie::TrieNode::get_parent() +{ + if (parent_ptr_) { + return *parent_ptr_; + } + return {}; +} + +//////////////////////////////////////////////////////////////// +// Trie Implementation +//////////////////////////////////////////////////////////////// + +template +inline +Trie::Trie(const std::initializer_list& pairs) + : root_() +{ + for (auto it = pairs.begin(); it != pairs.end(); ++it) { + if (it->first.empty()) { + throw exceptions::control_flow_error("strings must be non-empty"); + } + root_.insert(*it); + } +} + +template +inline void +Trie::transition(char c) +{ + curr_node_ = *(curr_node_.get().get_children().at(c)); +} + +template +inline bool +Trie::is_accept() const +{ + return curr_node_.get().is_accept(); +} + +template +inline typename Trie::TrieNode::children_t& +Trie::get_children() +{ + return curr_node_.get().get_children(); +} + +template +inline bool +Trie::is_reset() const +{ + return &(curr_node_.get()) == &root_; +} + +template +inline void +Trie::reset() +{ + curr_node_ = root_; +} + +template +inline void +Trie::back_transition() +{ + auto&& opt_parent = curr_node_.get().get_parent(); + if (!opt_parent) { + throw exceptions::control_flow_error("Attempt to back transition past the root"); + } + curr_node_ = *opt_parent; +} + +template +inline const std::optional& +Trie::get_symbol() const +{ + return curr_node_.get().get_symbol(); +} + +} // namespace lex +} // namespace core +} // namespace docgen diff --git a/benchmark/core/lex/data/data_3.txt b/benchmark/core/lex/data/data_3.txt new file mode 100644 index 0000000..e7483bb --- /dev/null +++ b/benchmark/core/lex/data/data_3.txt @@ -0,0 +1,803 @@ +#include +#include + +namespace docgen { +namespace core { +namespace lex { + +struct lexer_fixture : ::testing::Test +{ +protected: + using status_t = typename Lexer::status_t; + using token_t = typename Lexer::token_t; + using symbol_t = typename Lexer::symbol_t; + + Lexer lexer; + std::optional token; + + void 
setup_lexer(const char* content) + { + std::string str(content); + for (char c : str) { + lexer.process(c); + } + lexer.flush(); + } +}; + +//////////////////////////////////////////////////////////////////// +// Individual Symbol TESTS +//////////////////////////////////////////////////////////////////// + +// NEWLINE +TEST_F(lexer_fixture, lexer_newline) +{ + static constexpr const char* content = + "somecrazy1492text\nmvn2b" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "somecrazy1492text"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "mvn2b"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE ( ) +TEST_F(lexer_fixture, lexer_whitespace_space) +{ + static constexpr const char* content = + ",m.,m. abn" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, ",m.,m."); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abn"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (\t) +TEST_F(lexer_fixture, lexer_whitespace_t) +{ + static constexpr const char* content = + "h0f2n.1\t1234|" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "h0f2n.1"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "1234|"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (\v) +TEST_F(lexer_fixture, lexer_whitespace_v) +{ + static constexpr const char* content = + "hello!\v" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "hello!"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (\r) +TEST_F(lexer_fixture, lexer_whitespace_r) +{ + static constexpr const char* content = + "hello!\rwsdescorrld!!" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "hello!"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "wsdescorrld!!"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (\f) +TEST_F(lexer_fixture, lexer_whitespace_f) +{ + static constexpr const char* content = + "hello!\fwsdescorrld!!" 
+ ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "hello!"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "wsdescorrld!!"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// WHITESPACE (;) +TEST_F(lexer_fixture, lexer_semicolon) +{ + static constexpr const char* content = + ";wsdescorrld!!" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::SEMICOLON); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "wsdescorrld!!"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// BEGIN_SLINE_COMMENT +TEST_F(lexer_fixture, lexer_begin_sline_comment) +{ + static constexpr const char* content = + "abc///" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// BEGIN_SBLOCK_COMMENT +TEST_F(lexer_fixture, lexer_begin_sblock_comment) +{ + static constexpr const char* content = + "abc/*!" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SBLOCK_COMMENT); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// BEGIN_NBLOCK_COMMENT +TEST_F(lexer_fixture, lexer_begin_nblock_comment) +{ + static constexpr const char* content = + "abc/**!" 
+ ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NBLOCK_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::STAR); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "!"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// END_BLOCK_COMMENT +TEST_F(lexer_fixture, lexer_end_block_comment_no_star) +{ + static constexpr const char* content = + "abc*/f" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::END_BLOCK_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "f"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +TEST_F(lexer_fixture, lexer_end_block_comment_star) +{ + static constexpr const char* content = + "abc**/f" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::STAR); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::END_BLOCK_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "f"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// SDESC +TEST_F(lexer_fixture, lexer_sdesc) +{ + static constexpr const char* content = + "ssdesc@@sdescf@sdesscf" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "ssdesc@"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::SDESC); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "f@sdesscf"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// TPARAM +TEST_F(lexer_fixture, lexer_tparam) +{ + static constexpr const char* content = + "ssdes@@@@@@tpaar@tpara@m@tparam@tpar" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "ssdes@@@@@@tpaar@tpara@m"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TPARAM); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "@tpar"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// RETURN +TEST_F(lexer_fixture, lexer_return) +{ + static constexpr const char* content = + "@re@@@@@@return@@@@@" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "@re@@@@@"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::RETURN); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, 
symbol_t::TEXT); + EXPECT_EQ(token->content, "@@@@@"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +//////////////////////////////////////////////////////////////////// +// Mix TESTS +//////////////////////////////////////////////////////////////////// + +// line comment mix +TEST_F(lexer_fixture, lexer_line_comment_4) +{ + static constexpr const char* content = + "abc////" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "/"); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// line comment mix +TEST_F(lexer_fixture, lexer_line_comment_5) +{ + static constexpr const char* content = + "abc/////" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +// line comment mix +TEST_F(lexer_fixture, lexer_line_comment_6) +{ + static constexpr const char* content = + "abc//////" + ; + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "abc"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +TEST_F(lexer_fixture, lexer_test_1_no_special_comment) +{ + static constexpr const char* content = + "#include // some comment\n" + "\n" + "void f();" + ; + + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::HASHTAG); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "include"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "some"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, 
symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "void"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "f()"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::SEMICOLON); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +TEST_F(lexer_fixture, lexer_test_2_no_special_comment) +{ + static constexpr const char* content = + "#include \n" + "\n" + " // just a normal comment\n" + "\n" + ; + + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::HASHTAG); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "include"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "just"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "a"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "normal"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +TEST_F(lexer_fixture, lexer_test_1_comment_mix) +{ + static constexpr const char* content = + "// comment\n" + " /// special_comment \n" + ; + + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + 
EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "special_comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +TEST_F(lexer_fixture, lexer_test_1_tagname_comments) +{ + static constexpr const char* content = + "// @tparam normal comment\n" + "/// @sdescspecial comment \n" + "#define hehe\n" + ; + + setup_lexer(content); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_NLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TPARAM); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "normal"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::BEGIN_SLINE_COMMENT); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::SDESC); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "special"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "comment"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::HASHTAG); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "define"); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::WHITESPACE); + EXPECT_EQ(token->content, ""); + + token = lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::TEXT); + EXPECT_EQ(token->content, "hehe"); + + token = 
lexer.next_token(); + EXPECT_EQ(token->name, symbol_t::NEWLINE); + EXPECT_EQ(token->content, ""); + + // check that there are no more tokens + token = lexer.next_token(); + EXPECT_FALSE(static_cast(token)); +} + +} // namespace lex +} // namespace core +} // namespace docgen diff --git a/benchmark/core/lex/lexer_benchmark.cpp b/benchmark/core/lex/lexer_benchmark.cpp new file mode 100644 index 0000000..327e8e3 --- /dev/null +++ b/benchmark/core/lex/lexer_benchmark.cpp @@ -0,0 +1,40 @@ +#include +#include + +namespace docgen { +namespace core { +namespace lex { + +struct lexer_fixture : benchmark::Fixture +{ + static constexpr const char* data_1_path = "data/data_1.txt"; + static constexpr const char* data_2_path = "data/data_2.txt"; + static constexpr const char* data_3_path = "data/data_3.txt"; + + Lexer lexer; + + void SetUp(const ::benchmark::State& state) + { + } + + void TearDown(const ::benchmark::State& state) + { + } +}; + +BENCHMARK_F(lexer_fixture, data_1_test)(benchmark::State& st) +{ + std::unique_ptr file(fopen(data_1_path, "r"), + [](FILE* file) {fclose(file);}); + int c = 0; + for (auto _ : st) { + while ((c = getc(file.get())) != EOF) { + lexer.process(c); + } + } +} + +} // namespace lex +} // namespace core +} // namespace docgen + diff --git a/configure.sh b/configure.sh index 363eecd..bfe444f 100755 --- a/configure.sh +++ b/configure.sh @@ -1,7 +1,8 @@ #!/bin/bash -# directory where current shell script resides -PROJECTDIR=$(dirname "$BASH_SOURCE") +# relative directory where current shell script resides from where shell script was called +PROJECTDIR="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" +echo "Project directory: $PROJECTDIR" cd "$PROJECTDIR" @@ -48,13 +49,13 @@ git fetch --all --tags --prune git checkout tags/v1.5.0 -b v1.5.0 cd - -# Build google benchmark +# Build and install google benchmark locally cd libs/benchmark mkdir -p build && cd build - +cmake_flags="-DCMAKE_INSTALL_PREFIX=$PROJECTDIR/libs/benchmark/build" if [ $(command -v ninja) != "" ]; then - cmake ../ -GNinja + cmake ../ -GNinja $cmake_flags else - cmake ../ + cmake ../ $cmake_flags fi -cmake --build . -- -j12 +cmake --build . --target install -- -j12 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f95563a..4147592 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,6 +2,7 @@ add_library(LEXER_LIB_OBJECTS OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/core/lex/lexer.cpp ) +target_compile_features(LEXER_LIB_OBJECTS PRIVATE cxx_std_17) target_include_directories(LEXER_LIB_OBJECTS PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${ETERNAL_DIR}/include diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 125e82f..3f4594f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -3,6 +3,7 @@ # TEST_TARGET is added as a test and an executable named ${TEST_NAME} will be created. 
function(create_test TEST_NAME TEST_TARGET) target_compile_options(${TEST_TARGET} PRIVATE -g -Wall -Werror -Wextra) + target_compile_features(${TEST_TARGET} PRIVATE cxx_std_17) target_include_directories(${TEST_TARGET} PRIVATE ${GTEST_DIR}/include From 4cf249393f93e71d9797f27dcf924640b757405a Mon Sep 17 00:00:00 2001 From: James Yang Date: Sun, 19 Jan 2020 14:35:36 -0500 Subject: [PATCH 22/23] Finish benchmarking --- CMakeLists.txt | 2 +- benchmark/CMakeLists.txt | 1 + benchmark/core/lex/data/data_4.txt | 71 +++++++++++++++++++ benchmark/core/lex/lexer_base_fixture.hpp | 24 +++++++ benchmark/core/lex/lexer_benchmark.cpp | 60 +++++++++++----- benchmark/core/lex/lexer_legacy_benchmark.cpp | 59 +++++++++++++++ src/core/lex/legacy/lexer.hpp | 8 +-- 7 files changed, 204 insertions(+), 21 deletions(-) create mode 100644 benchmark/core/lex/data/data_4.txt create mode 100644 benchmark/core/lex/lexer_base_fixture.hpp create mode 100644 benchmark/core/lex/lexer_legacy_benchmark.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7944435..d0de76a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,7 +56,7 @@ set(ETERNAL_DIR ${PROJECT_SOURCE_DIR}/libs/eternal) find_package(nlohmann_json 3.2.0 REQUIRED) # find google benchmark -find_package(benchmark REQUIRED) +find_package(benchmark REQUIRED PATHS ${GBENCH_DIR}/build) # add libs subdirectory add_subdirectory(${PROJECT_SOURCE_DIR}/libs ${PROJECT_BINARY_DIR}/libs) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index fe431a0..d2822fb 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,5 +1,6 @@ add_executable(lexer_benchmark ${CMAKE_CURRENT_SOURCE_DIR}/core/lex/lexer_benchmark.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core/lex/lexer_legacy_benchmark.cpp # Source dependency $ ) diff --git a/benchmark/core/lex/data/data_4.txt b/benchmark/core/lex/data/data_4.txt new file mode 100644 index 0000000..374e703 --- /dev/null +++ b/benchmark/core/lex/data/data_4.txt @@ -0,0 +1,71 @@ +jfowfoeijiowjfo///jfoijeonnvmlalfjioejoifjoelkdkfnkejwaoijjfoj12893483u2rjjdkfj + + +; +{jfiojeojahl{jiofejiofkvnkelwojio}} +/// @sdesc some short description............ + + +jfoijeiojokvmjknfirojijjoejfiokjeofjojdjfdfoejoj +JFOIJEIOJOKVMJKNFIROJIJJOEJFIOKJEOFJOJDJFDFOEJOJ +iuhigarufejoafwrhsgijfkajnbvkmfpoefrigirjkfel;df,ldmbkglrf;e,d.v mcd +jfowfoeijiowjfo///jfoijeonnvmlalfjioejoifjoelkdkfnkejwaoijjfoj12893483u2rjjdkfj +jfowfoeijiowjfo///jfoijeonnvmlalfjioejoifjoelkdkfnkejwaoijjfoj12893483u2rjjdkfj +jfowfoeijiowjfo///jfoijeonnvmlalfjioejoifjoelkdkfnkejwaoijjfoj12893483u2rjjdkfj +jfowfoeijiowjfo///jfoijeonnvmlalfjioejoifjoelkdkfnkejwaoijjfoj12893483u2rjjdkfj +jfowfoeijiowjfo///jfoijeonnvmlalfjioejoifjoelkdkfnkejwaoijjfoj12893483u2rjjdkfj + + +; +{jfiojeojahl{jiofejiofkvnkelwojio}} +/// @sdesc some short description............ + + +jfoijeiojokvmjknfirojijjoejfiokjeofjojdjfdfoejoj +JFOIJEIOJOKVMJKNFIROJIJJOEJFIOKJEOFJOJDJFDFOEJOJ +iuhigarufejoafwrhsgijfkajnbvkmfpoefrigirjkfel;df,ldmbkglrf;e,d.v mcd + + +; +{jfiojeojahl{jiofejiofkvnkelwojio}} +/// @sdesc some short description............ + + +jfoijeiojokvmjknfirojijjoejfiokjeofjojdjfdfoejoj +JFOIJEIOJOKVMJKNFIROJIJJOEJFIOKJEOFJOJDJFDFOEJOJ +iuhigarufejoafwrhsgijfkajnbvkmfpoefrigirjkfel;df,ldmbkglrf;e,d.v mcd + + +; +{jfiojeojahl{jiofejiofkvnkelwojio}} +/// @sdesc some short description............ + +/*! 
!hfowojlk @tparam scdesc fjown @tparam x jeiofhjoeifjdkjofijsoiejfoijeojoirjiohignjknjfbnkjnvkfjoiejioj
+*
+*
+*/
+
+
+jfoijeiojokvmjknfirojijjoejfiokjeofjojdjfdfoejoj
+JFOIJEIOJOKVMJKNFIROJIJJOEJFIOKJEOFJOJDJFDFOEJOJ
+iuhigarufejoafwrhsgijfkajnbvkmfpoefrigirjkfel;df,ldmbkglrf;e,d.v mcd
+
+
+;
+{jfiojeojahl{jiofejiofkvnkelwojio}}
+/// @sdesc some short description............
+
+
+jfoijeiojokvmjknfirojijjoejfiokjeofjojdjfdfoejoj
+JFOIJEIOJOKVMJKNFIROJIJJOEJFIOKJEOFJOJDJFDFOEJOJ
+iuhigarufejoafwrhsgijfkajnbvkmfpoefrigirjkfel;df,ldmbkglrf;e,d.v mcd
+
+
+;
+{jfiojeojahl{jiofejiofkvnkelwojio}}
+/// @sdesc some short description............
+
+
+jfoijeiojokvmjknfirojijjoejfiokjeofjojdjfdfoejoj
+JFOIJEIOJOKVMJKNFIROJIJJOEJFIOKJEOFJOJDJFDFOEJOJ
+iuhigarufejoafwrhsgijfkajnbvkmfpoefrigirjkfel;df,ldmbkglrf;e,d.v mcd
diff --git a/benchmark/core/lex/lexer_base_fixture.hpp b/benchmark/core/lex/lexer_base_fixture.hpp
new file mode 100644
index 0000000..51631d6
--- /dev/null
+++ b/benchmark/core/lex/lexer_base_fixture.hpp
@@ -0,0 +1,24 @@
+#pragma once
+#include
+
+namespace docgen {
+namespace core {
+namespace lex {
+
+struct lexer_base_fixture : benchmark::Fixture
+{
+    static constexpr const char* data_1_path = "data/data_1.txt";
+    static constexpr const char* data_2_path = "data/data_2.txt";
+    static constexpr const char* data_3_path = "data/data_3.txt";
+    static constexpr const char* data_4_path = "data/data_4.txt";
+
+    void SetUp(const ::benchmark::State& state)
+    {}
+
+    void TearDown(const ::benchmark::State& state)
+    {}
+};
+
+} // namespace lex
+} // namespace core
+} // namespace docgen
diff --git a/benchmark/core/lex/lexer_benchmark.cpp b/benchmark/core/lex/lexer_benchmark.cpp
index 327e8e3..f2cc6d1 100644
--- a/benchmark/core/lex/lexer_benchmark.cpp
+++ b/benchmark/core/lex/lexer_benchmark.cpp
@@ -1,36 +1,64 @@
 #include
-#include
+#include "lexer_base_fixture.hpp"
 
 namespace docgen {
 namespace core {
 namespace lex {
 
-struct lexer_fixture : benchmark::Fixture
+struct lexer_fixture : lexer_base_fixture
 {
-    static constexpr const char* data_1_path = "data/data_1.txt";
-    static constexpr const char* data_2_path = "data/data_2.txt";
-    static constexpr const char* data_3_path = "data/data_3.txt";
-
     Lexer lexer;
+};
 
-    void SetUp(const ::benchmark::State& state)
-    {
+BENCHMARK_F(lexer_fixture, data_1_test)(benchmark::State& st)
+{
+    for (auto _ : st) {
+        std::unique_ptr file(fopen(data_1_path, "r"),
+                [](FILE* file) {fclose(file);});
+        int c = 0;
+        while ((c = fgetc(file.get())) != EOF) {
+            lexer.process(c);
+        }
+        benchmark::DoNotOptimize(lexer.next_token());
+    }
+}
+
+BENCHMARK_F(lexer_fixture, data_2_test)(benchmark::State& st)
+{
+    for (auto _ : st) {
+        std::unique_ptr file(fopen(data_2_path, "r"),
+                [](FILE* file) {fclose(file);});
+        int c = 0;
+        while ((c = fgetc(file.get())) != EOF) {
+            lexer.process(c);
+        }
+        benchmark::DoNotOptimize(lexer.next_token());
     }
+}
 
-    void TearDown(const ::benchmark::State& state)
-    {
+BENCHMARK_F(lexer_fixture, data_3_test)(benchmark::State& st)
+{
+    for (auto _ : st) {
+        std::unique_ptr file(fopen(data_3_path, "r"),
+                [](FILE* file) {fclose(file);});
+        int c = 0;
+        while ((c = fgetc(file.get())) != EOF) {
+            lexer.process(c);
+        }
+        benchmark::DoNotOptimize(lexer.next_token());
     }
-};
+}
 
-BENCHMARK_F(lexer_fixture, data_1_test)(benchmark::State& st)
+BENCHMARK_F(lexer_fixture, data_4_test)(benchmark::State& st)
 {
-    std::unique_ptr file(fopen(data_1_path, "r"),
-            [](FILE* file) {fclose(file);});
-    int c = 0;
     for (auto _ : st) {
-        while ((c = getc(file.get())) != EOF) {
+ std::unique_ptr file(fopen(data_4_path, "r"), + [](FILE* file) {fclose(file);}); + int c = 0; + while ((c = fgetc(file.get())) != EOF) { lexer.process(c); } + benchmark::DoNotOptimize(lexer.next_token()); } } diff --git a/benchmark/core/lex/lexer_legacy_benchmark.cpp b/benchmark/core/lex/lexer_legacy_benchmark.cpp new file mode 100644 index 0000000..34e9848 --- /dev/null +++ b/benchmark/core/lex/lexer_legacy_benchmark.cpp @@ -0,0 +1,59 @@ +#include +#include "lexer_base_fixture.hpp" + +namespace docgen { +namespace core { +namespace lex { +namespace legacy { + +struct lexer_legacy_fixture : lexer_base_fixture +{}; + +BENCHMARK_F(lexer_legacy_fixture, data_1_test)(benchmark::State& st) +{ + for (auto _ : st) { + FILE* file = fopen(data_1_path, "r"); + Lexer lexer(file); + lexer.process(); + benchmark::DoNotOptimize(lexer.get_tokens()[0]); + fclose(file); + } +} + +BENCHMARK_F(lexer_legacy_fixture, data_2_test)(benchmark::State& st) +{ + for (auto _ : st) { + FILE* file = fopen(data_2_path, "r"); + Lexer lexer(file); + lexer.process(); + benchmark::DoNotOptimize(lexer.get_tokens()[0]); + fclose(file); + } +} + +BENCHMARK_F(lexer_legacy_fixture, data_3_test)(benchmark::State& st) +{ + for (auto _ : st) { + FILE* file = fopen(data_3_path, "r"); + Lexer lexer(file); + lexer.process(); + benchmark::DoNotOptimize(lexer.get_tokens()[0]); + fclose(file); + } +} + +BENCHMARK_F(lexer_legacy_fixture, data_4_test)(benchmark::State& st) +{ + for (auto _ : st) { + FILE* file = fopen(data_4_path, "r"); + Lexer lexer(file); + lexer.process(); + benchmark::DoNotOptimize(lexer.get_tokens()[0]); + fclose(file); + } +} + +} // namespace legacy +} // namespace lex +} // namespace core +} // namespace docgen diff --git a/src/core/lex/legacy/lexer.hpp b/src/core/lex/legacy/lexer.hpp index 56f647e..ec5537e 100644 --- a/src/core/lex/legacy/lexer.hpp +++ b/src/core/lex/legacy/lexer.hpp @@ -6,9 +6,9 @@ namespace core { struct Lexer { - using symbol_t = lexer_details::symbol_t; - using file_reader = lexer_details::file_reader; - using status_t = lexer_details::status_t; + using symbol_t = lex::legacy::symbol_t; + using file_reader = lex::legacy::file_reader; + using status_t = lex::legacy::status_t; Lexer(FILE* file) : reader_(file) @@ -18,7 +18,7 @@ struct Lexer void process() { - lexer_details::process(reader_, status_); + lex::legacy::process(reader_, status_); } const status_t::token_arr_t& get_tokens() const From 75a459e011642bb03a45898ebff5194d703d4af2 Mon Sep 17 00:00:00 2001 From: James Yang Date: Sun, 19 Jan 2020 15:09:22 -0500 Subject: [PATCH 23/23] Add cmake command line args passable to configure --- configure.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/configure.sh b/configure.sh index bfe444f..132c372 100755 --- a/configure.sh +++ b/configure.sh @@ -1,5 +1,9 @@ #!/bin/bash +mode=$1 # debug/release mode +shift # shift command-line arguments + # the rest are cmake command-line arguments + # relative directory where current shell script resides from where shell script was called PROJECTDIR="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" echo "Project directory: $PROJECTDIR" @@ -54,8 +58,8 @@ cd libs/benchmark mkdir -p build && cd build cmake_flags="-DCMAKE_INSTALL_PREFIX=$PROJECTDIR/libs/benchmark/build" if [ $(command -v ninja) != "" ]; then - cmake ../ -GNinja $cmake_flags + cmake ../ -GNinja $cmake_flags "$@" else - cmake ../ $cmake_flags + cmake ../ $cmake_flags "$@" fi cmake --build . --target install -- -j12
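
For reference, the following is a minimal stand-alone driver for the trie-based lexer this series introduces. It is a sketch only: the include path is illustrative (the header first appears as src/core/lexer_trie.hpp in patch 01 and later moves under src/core/lex/), and the driver itself is hypothetical; only the process(char) / next_token() interface it relies on comes from the patches.

// Hypothetical usage sketch; not part of the patch series.
#include <cstdio>
#include "core/lexer_trie.hpp" // illustrative include path (as of patch 01)

int main(int argc, char** argv)
{
    if (argc < 2) {
        std::fprintf(stderr, "usage: lexer_driver <file>\n");
        return 1;
    }

    std::FILE* file = std::fopen(argv[1], "r");
    if (!file) {
        return 1;
    }

    docgen::core::Lexer lexer;
    int c = 0;
    while ((c = std::fgetc(file)) != EOF) {
        lexer.process(static_cast<char>(c)); // feed the lexer one character at a time
    }
    std::fclose(file);

    // Drain the token queue; next_token() returns an empty optional when exhausted.
    // Note: in the patch-01 lexer a symbol still buffered at end of input stays
    // pending, since an explicit flush step only appears later in the series.
    while (auto token = lexer.next_token()) {
        std::printf("%s\n", token->c_str());
    }
    return 0;
}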